-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathutils.py
More file actions
90 lines (72 loc) · 2.38 KB
/
utils.py
File metadata and controls
90 lines (72 loc) · 2.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import re
from urlparse import urlparse, urlunparse
from time import sleep
def get_content(source_doc_obj, requests_session):
    """Return the content of ``source_doc_obj``, downloading it if not cached.

    The file at ``source_doc_obj.file_name`` is used as an on-disk cache. If
    it can be opened, its bytes are returned. Otherwise the content is
    requested from ``source_doc_obj.url`` through ``requests_session``,
    written to ``source_doc_obj.file_name`` and returned.

    Args:
        source_doc_obj: Object exposing ``file_name`` (path of the cache
            file) and ``url`` (location to download from).
        requests_session: Object whose ``get(url, timeout=...)`` returns a
            response providing ``raise_for_status()`` and ``content``.

    Returns:
        The raw content, either as read from the cache file or as
        ``response.content``.

    Raises:
        IOError: If the cache file cannot be written after a download.
        Any exception raised by ``requests_session.get`` or
        ``response.raise_for_status()`` propagates unchanged.
    """
    try:
        # Binary mode keeps the cached copy byte-identical to
        # ``response.content`` (no newline translation, no decoding).
        with open(source_doc_obj.file_name, 'rb') as f:
            content = f.read()
    except IOError:
        # Only a failure to open/read the cache file means "not downloaded
        # yet"; anything else (e.g. KeyboardInterrupt) must not be swallowed.
        print(' No file found. Downloading from {}'.format(source_doc_obj.url))
        # Fixed pause before every request -- presumably to throttle the
        # crawl; confirm before changing.
        sleep(4)
        # timeout is a (connect, read) pair in seconds.
        response = requests_session.get(source_doc_obj.url, timeout=(3.05, 27))
        response.raise_for_status()
        content = response.content
        with open(source_doc_obj.file_name, 'wb') as f:
            f.write(content)
    return content
def extract_links(content, parent_url):
    """Return the links found in an HTML page as a list of dicts.

    The page is parsed with BeautifulSoup using the ``html5lib`` parser. The
    chamber is derived from ``parent_url`` (case-insensitively): a URL
    containing ``'senate'`` gives chamber ``'S'`` and anchors are taken from
    the element with ``id="list"``; otherwise a URL containing ``'house'``
    gives chamber ``'H'`` and anchors are taken from the ``sitebox`` element
    styled ``width:700px``, falling back to the element with ``id="right"``.
    When none of those elements exists, all anchors in ``<body>`` are used.

    Anchors without an ``href`` attribute and anchors whose ``href`` is
    exactly ``'/'`` are skipped.

    Args:
        content: HTML markup of the page.
        parent_url: URL the page was fetched from; supplies the scheme,
            netloc and base directory for the extracted links.

    Returns:
        A list of dicts, one per kept anchor, with the keys:
        ``scheme`` and ``netloc`` (from ``parent_url``); ``path`` (the
        link's own path with ``'./'`` replaced by ``'/'`` and surrounding
        whitespace stripped); ``params``, ``query`` and ``fragment`` (from
        the link's ``href``); ``url`` (the reassembled absolute URL);
        ``name`` (the stripped anchor text); and ``chamber``.

    Raises:
        ValueError: If ``parent_url`` contains neither ``'senate'`` nor
            ``'house'``, so no chamber can be determined.
    """
    soup = BeautifulSoup(content, 'html5lib')
    lowered_url = parent_url.lower()

    # Pick the element that holds the links for this chamber's page layout.
    if 'senate' in lowered_url:
        chamber = 'S'
        container = soup.find(id='list')
    elif 'house' in lowered_url:
        chamber = 'H'
        container = soup.find(attrs={'style': 'width:700px', 'class': 'sitebox'})
        if container is None:
            container = soup.find(id='right')
    else:
        raise ValueError(
            "Cannot determine chamber: expected 'senate' or 'house' in "
            "parent_url, got {!r}".format(parent_url)
        )
    if container is None:
        # Layout-specific element missing: fall back to the whole page.
        container = soup.find('body')
    found_links = container.find_all('a')

    # Everything derived from parent_url is the same for every link.
    parent = urlparse(parent_url)
    # Directory part of the parent path: everything up to and including the
    # last '/', or '' when the only '/' is the leading one.
    parent_path = ''.join(re.findall(r'.+/', parent.path))
    parent_path_lower = parent_path.lower()

    out_links = []
    for link in found_links:
        href = link.get('href')
        # Named anchors (<a name=...>) carry no href; '/' is the site root.
        if href is None or href == '/':
            continue

        target = urlparse(href)
        link_path = target.path.replace('./', '/').strip()
        # Prefix the parent directory unless the link path already contains it.
        if parent_path_lower not in link_path.lower():
            full_path = parent_path + link_path
        else:
            full_path = link_path

        out_links.append({
            'scheme': parent.scheme,
            'netloc': parent.netloc,
            # NOTE: 'path' is the link's own path; only 'url' uses full_path.
            'path': link_path,
            'params': target.params,
            'query': target.query,
            'fragment': target.fragment,
            'url': urlunparse((
                parent.scheme,
                parent.netloc,
                full_path,
                target.params,
                target.query,
                target.fragment,
            )),
            'name': link.text.strip(),
            'chamber': chamber,
        })
    return out_links