
Commit 8452914

Added chapter16
1 parent 638f1ee commit 8452914

9 files changed: 302 additions & 0 deletions


chapter16/multiprocess.py

Lines changed: 36 additions & 0 deletions
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import random

from multiprocessing import Process, Queue
import os
import time


def getLinks(bsObj, visited):
    print('Getting links in {}'.format(os.getpid()))
    links = bsObj.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))
    # Keep only links whose paths have not been recorded as visited
    return [link for link in links if link.attrs['href'] not in visited]

def scrape_article(path, queue):
    # The queue carries the shared list of visited paths: take it off,
    # record this path, and put it back for the other process
    visited = queue.get() if not queue.empty() else []
    visited.append(path)
    queue.put(visited)
    print("Process {} list is now: {}".format(os.getpid(), visited))
    html = urlopen('http://en.wikipedia.org{}'.format(path))
    time.sleep(5)
    bsObj = BeautifulSoup(html, 'html.parser')
    title = bsObj.find('h1').get_text()
    print('Scraping {} in process {}'.format(title, os.getpid()))
    links = getLinks(bsObj, visited)
    if len(links) > 0:
        newArticle = links[random.randint(0, len(links)-1)].attrs['href']
        print(newArticle)
        scrape_article(newArticle, queue)

processes = []
queue = Queue()
processes.append(Process(target=scrape_article, args=('/wiki/Kevin_Bacon', queue)))
processes.append(Process(target=scrape_article, args=('/wiki/Monty_Python', queue)))

for p in processes:
    p.start()

chapter16/multiprocess_example.py

Lines changed: 23 additions & 0 deletions
from multiprocessing import Process
import time


def print_time(threadName, delay, iterations):
    start = int(time.time())
    for i in range(0, iterations):
        time.sleep(delay)
        seconds_elapsed = str(int(time.time()) - start)
        print(threadName if threadName else seconds_elapsed)


processes = []
processes.append(Process(target=print_time, args=(None, 1, 100)))
processes.append(Process(target=print_time, args=("Fizz", 3, 33)))
processes.append(Process(target=print_time, args=("Buzz", 5, 20)))

for p in processes:
    p.start()

for p in processes:
    p.join()

print("Program complete")

chapter16/multiprocess_queue.py

Lines changed: 54 additions & 0 deletions
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import random
from multiprocessing import Process, Queue
import os
import time


def task_delegator(taskQueue, foundUrlsQueue):
    #Initialize with a task for each process
    visited = ['/wiki/Kevin_Bacon', '/wiki/Monty_Python']
    taskQueue.put('/wiki/Kevin_Bacon')
    taskQueue.put('/wiki/Monty_Python')

    while 1:
        #Check to see if there are new links in the foundUrlsQueue for processing
        if not foundUrlsQueue.empty():
            links = [link for link in foundUrlsQueue.get() if link not in visited]
            for link in links:
                #Record the link as visited so it is not queued twice, then add it to the taskQueue
                visited.append(link)
                taskQueue.put(link)

def get_links(bsObj):
    links = bsObj.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))
    return [link.attrs['href'] for link in links]

def scrape_article(taskQueue, foundUrlsQueue):
    while 1:
        while taskQueue.empty():
            #Sleep 100 ms while waiting for the task queue
            #This should be rare
            time.sleep(.1)
        path = taskQueue.get()
        html = urlopen('http://en.wikipedia.org{}'.format(path))
        time.sleep(5)
        bsObj = BeautifulSoup(html, 'html.parser')
        title = bsObj.find('h1').get_text()
        print('Scraping {} in process {}'.format(title, os.getpid()))
        links = get_links(bsObj)
        #Send these to the delegator for processing
        foundUrlsQueue.put(links)


processes = []
taskQueue = Queue()
foundUrlsQueue = Queue()
processes.append(Process(target=task_delegator, args=(taskQueue, foundUrlsQueue)))
processes.append(Process(target=scrape_article, args=(taskQueue, foundUrlsQueue)))
processes.append(Process(target=scrape_article, args=(taskQueue, foundUrlsQueue)))

for p in processes:
    p.start()
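
The delegator and scraper loops above run forever, so the parent script has no natural exit. A minimal shutdown sketch, assuming the processes list defined above, is to wait on the workers and terminate them on Ctrl-C:

try:
    for p in processes:
        p.join()
except KeyboardInterrupt:
    # Stop the still-running workers before exiting
    for p in processes:
        p.terminate()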

chapter16/multithreaded.py

Lines changed: 36 additions & 0 deletions
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import random

import _thread
import time

visited = []
def getLinks(thread_name, bsObj):
    print('Getting links in {}'.format(thread_name))
    links = bsObj.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))
    # Keep only links whose paths no thread has visited yet
    return [link for link in links if link.attrs['href'] not in visited]

def scrape_article(thread_name, path):
    visited.append(path)
    html = urlopen('http://en.wikipedia.org{}'.format(path))
    time.sleep(5)
    bsObj = BeautifulSoup(html, 'html.parser')
    title = bsObj.find('h1').get_text()
    print('Scraping {} in thread {}'.format(title, thread_name))
    links = getLinks(thread_name, bsObj)
    if len(links) > 0:
        newArticle = links[random.randint(0, len(links)-1)].attrs['href']
        print(newArticle)
        scrape_article(thread_name, newArticle)


try:
    _thread.start_new_thread(scrape_article, ('Thread 1', '/wiki/Kevin_Bacon',))
    _thread.start_new_thread(scrape_article, ('Thread 2', '/wiki/Monty_Python',))
except:
    print('Error: unable to start threads')

# Keep the main thread alive; _thread provides no join()
while 1:
    pass
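
Both scraping threads append to the module-level visited list with no synchronization. If that shared list ever needs protecting, a minimal sketch using the lock primitive that _thread already provides (record_visit is a hypothetical helper, not part of the script above):

import _thread

visited = []
visited_lock = _thread.allocate_lock()

def record_visit(path):
    # Only one thread at a time may append to the shared list
    with visited_lock:
        visited.append(path)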

chapter16/multithreaded_class.py

Lines changed: 38 additions & 0 deletions
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import random

import _thread
import time

visited = []
def getLinks(thread_name, bsObj):
    print('Getting links in {}'.format(thread_name))
    links = bsObj.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))
    # Keep only links whose paths no thread has visited yet
    return [link for link in links if link.attrs['href'] not in visited]

# Define a function for the thread
def scrape_article(thread_name, path):
    visited.append(path)
    html = urlopen('http://en.wikipedia.org{}'.format(path))
    time.sleep(5)
    bsObj = BeautifulSoup(html, 'html.parser')
    title = bsObj.find('h1').get_text()
    print('Scraping {} in thread {}'.format(title, thread_name))
    links = getLinks(thread_name, bsObj)
    if len(links) > 0:
        newArticle = links[random.randint(0, len(links)-1)].attrs['href']
        print(newArticle)
        scrape_article(thread_name, newArticle)


# Create two threads as follows
try:
    _thread.start_new_thread(scrape_article, ('Thread 1', '/wiki/Kevin_Bacon',))
    _thread.start_new_thread(scrape_article, ('Thread 2', '/wiki/Monty_Python',))
except:
    print('Error: unable to start threads')

# Keep the main thread alive; _thread provides no join()
while 1:
    pass

chapter16/multithreaded_example.py

Lines changed: 19 additions & 0 deletions
import _thread
import time


def print_time(threadName, delay, iterations):
    start = int(time.time())
    for i in range(0, iterations):
        time.sleep(delay)
        seconds_elapsed = str(int(time.time()) - start)
        print(threadName if threadName else seconds_elapsed)

try:
    _thread.start_new_thread(print_time, (None, 1, 100))
    _thread.start_new_thread(print_time, ("Fizz", 3, 33))
    _thread.start_new_thread(print_time, ("Buzz", 5, 20))
except:
    print("Error: unable to start thread")

# Keep the main thread alive while the worker threads run
while 1:
    pass

chapter16/multithreaded_queue.py

Lines changed: 54 additions & 0 deletions
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import random
import _thread
from queue import Queue
import time
import pymysql


def storage(queue):
    conn = pymysql.connect(host='127.0.0.1', unix_socket='/tmp/mysql.sock', user='root', passwd='', db='mysql', charset='utf8')
    cur = conn.cursor()
    cur.execute('USE wiki_threads')
    while 1:
        if not queue.empty():
            article = queue.get()
            cur.execute('SELECT * FROM pages WHERE path = %s', (article["path"],))
            if cur.rowcount == 0:
                print("Storing article {}".format(article["title"]))
                cur.execute('INSERT INTO pages (title, path) VALUES (%s, %s)', (article["title"], article["path"]))
                conn.commit()
            else:
                print("Article already exists: {}".format(article['title']))

visited = []
def getLinks(thread_name, bsObj):
    print('Getting links in {}'.format(thread_name))
    links = bsObj.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))
    # Keep only links whose paths no thread has visited yet
    return [link for link in links if link.attrs['href'] not in visited]

def scrape_article(thread_name, path, queue):
    visited.append(path)
    html = urlopen('http://en.wikipedia.org{}'.format(path))
    time.sleep(5)
    bsObj = BeautifulSoup(html, 'html.parser')
    title = bsObj.find('h1').get_text()
    print('Added {} for storage in thread {}'.format(title, thread_name))
    # Hand the scraped article off to the storage thread
    queue.put({"title":title, "path":path})
    links = getLinks(thread_name, bsObj)
    if len(links) > 0:
        newArticle = links[random.randint(0, len(links)-1)].attrs['href']
        scrape_article(thread_name, newArticle, queue)

queue = Queue()
try:
    _thread.start_new_thread(scrape_article, ('Thread 1', '/wiki/Kevin_Bacon', queue,))
    _thread.start_new_thread(scrape_article, ('Thread 2', '/wiki/Monty_Python', queue,))
    _thread.start_new_thread(storage, (queue,))
except:
    print('Error: unable to start threads')

# Keep the main thread alive while the worker threads run
while 1:
    pass
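
multithreaded_queue.py expects a local MySQL server with a wiki_threads database containing a pages table that has title and path columns. The column types are not given anywhere in these files, so this one-time setup script is only a plausible sketch:

import pymysql

# Assumed schema: only the database, table, and column names come from the script above
conn = pymysql.connect(host='127.0.0.1', unix_socket='/tmp/mysql.sock', user='root', passwd='', charset='utf8')
cur = conn.cursor()
cur.execute('CREATE DATABASE IF NOT EXISTS wiki_threads')
cur.execute('USE wiki_threads')
cur.execute('CREATE TABLE IF NOT EXISTS pages ('
            'id INT NOT NULL AUTO_INCREMENT, '
            'title VARCHAR(200), '
            'path VARCHAR(200), '
            'created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, '
            'PRIMARY KEY (id))')
cur.close()
conn.close()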

chapter16/threading_crawler.py

Lines changed: 28 additions & 0 deletions
import threading
import time


class Crawler(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)
        self.done = False

    def isDone(self):
        return self.done

    def run(self):
        time.sleep(5)
        self.done = True
        raise Exception('Something bad happened!')


t = Crawler()
t.start()

while True:
    time.sleep(1)
    if t.isDone():
        print('Done')
        break
    if not t.is_alive():
        # The thread died without finishing; start a replacement crawler
        t = Crawler()
        t.start()

chapter16/threading_example.py

Lines changed: 14 additions & 0 deletions
import threading
import time


def print_time(threadName, delay, iterations):
    start = int(time.time())
    for i in range(0, iterations):
        time.sleep(delay)
        seconds_elapsed = str(int(time.time()) - start)
        print('{} {}'.format(seconds_elapsed, threadName))

threading.Thread(target=print_time, args=('Fizz', 3, 33)).start()
threading.Thread(target=print_time, args=('Buzz', 5, 20)).start()
threading.Thread(target=print_time, args=('Counter', 1, 100)).start()
