Added chapter16

REMitchell · REMitchell · commit 84529143d42a · 2018-03-23T22:14:14.000-04:00
diff --git a/chapter16/multiprocess.py b/chapter16/multiprocess.py
@@ -0,0 +1,71 @@
+"""Crawl Wikipedia from two starting articles, one process per crawl.
+
+The processes share one list of visited paths. The list lives in a
+multiprocessing Queue that holds at most one item: a process takes the
+list out, reads or updates it, and puts it straight back. While the
+list is checked out the queue is empty, so the other process blocks in
+get() until it is returned.
+"""
+from urllib.request import urlopen
+from bs4 import BeautifulSoup
+import re
+import random
+
+from multiprocessing import Process, Queue
+import os
+import time
+
+
+def getLinks(bsObj, queue):
+    """Return the unvisited article links found on a parsed page.
+
+    bsObj -- BeautifulSoup object for the page
+    queue -- Queue holding the shared list of visited paths
+    """
+    print('Getting links in {}'.format(os.getpid()))
+    links = bsObj.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))
+    # Borrow the shared list and hand it back right away
+    visited = queue.get()
+    queue.put(visited)
+    # visited holds path strings, so compare hrefs rather than tags
+    return [link for link in links if link.attrs['href'] not in visited]
+
+
+def scrape_article(path, queue):
+    """Scrape the article at path, then keep following random links.
+
+    path -- article path on en.wikipedia.org, e.g. '/wiki/Kevin_Bacon'
+    queue -- Queue holding the shared list of visited paths
+    """
+    # Loop rather than recurse so a long crawl cannot hit the
+    # recursion limit
+    while path is not None:
+        visited = queue.get()
+        visited.append(path)
+        queue.put(visited)
+        print("Process {} list is now: {}".format(os.getpid(), visited))
+        html = urlopen('http://en.wikipedia.org{}'.format(path))
+        time.sleep(5)
+        bsObj = BeautifulSoup(html, 'html.parser')
+        title = bsObj.find('h1').get_text()
+        print('Scraping {} in process {}'.format(title, os.getpid()))
+        links = getLinks(bsObj, queue)
+        # Stop when the page has no unvisited links
+        path = None
+        if len(links) > 0:
+            path = links[random.randint(0, len(links)-1)].attrs['href']
+            print(path)
+
+
+# The guard keeps child processes from re-running this section when
+# they import the module under the spawn start method
+if __name__ == '__main__':
+    processes = []
+    queue = Queue()
+    # Seed the queue with the empty visited list
+    queue.put([])
+    processes.append(Process(target=scrape_article, args=('/wiki/Kevin_Bacon', queue,)))
+    processes.append(Process(target=scrape_article, args=('/wiki/Monty_Python', queue,)))
+
+    for p in processes:
+        p.start()
diff --git a/chapter16/multiprocess_example.py b/chapter16/multiprocess_example.py
@@ -0,0 +1,35 @@
+"""Run three timers in separate processes and wait for them to end."""
+from multiprocessing import Process
+import time
+
+
+def print_time(threadName, delay, iterations):
+    """Print one line every delay seconds, iterations times in total.
+
+    threadName -- label to print; when falsy, the whole seconds elapsed
+                  since the call started are printed instead
+    delay -- seconds to sleep before each line
+    iterations -- number of lines to print
+    """
+    start = int(time.time())
+    for _ in range(iterations):
+        time.sleep(delay)
+        seconds_elapsed = str(int(time.time()) - start)
+        print (threadName if threadName else seconds_elapsed)
+
+
+# The guard keeps child processes from re-running this section when
+# they import the module under the spawn start method
+if __name__ == '__main__':
+    processes = []
+    processes.append(Process(target=print_time, args=(None, 1, 100)))
+    processes.append(Process(target=print_time, args=("Fizz", 3, 33)))
+    processes.append(Process(target=print_time, args=("Buzz", 5, 20)))
+
+    for p in processes:
+        p.start()
+
+    for p in processes:
+        p.join()
+
+    print("Program complete")
diff --git a/chapter16/multiprocess_queue.py b/chapter16/multiprocess_queue.py
@@ -0,0 +1,70 @@
+"""Crawl Wikipedia with two scraper processes fed by a delegator.
+
+Scrapers take paths from taskQueue and send every list of links they
+find to foundUrlsQueue. The delegator forwards each path to taskQueue
+only the first time it sees it.
+"""
+from urllib.request import urlopen
+from bs4 import BeautifulSoup
+import re
+import random
+from multiprocessing import Process, Queue
+import os
+import time
+
+
+def task_delegator(taskQueue, foundUrlsQueue):
+    """Hand out every newly found path to the scrapers exactly once.
+
+    taskQueue -- Queue of paths waiting to be scraped
+    foundUrlsQueue -- Queue of link lists reported by the scrapers
+    """
+    #Initialize with a task for each process
+    visited = {'/wiki/Kevin_Bacon', '/wiki/Monty_Python'}
+    taskQueue.put('/wiki/Kevin_Bacon')
+    taskQueue.put('/wiki/Monty_Python')
+
+    while 1:
+        #get() blocks until a scraper reports links, so no polling
+        for link in foundUrlsQueue.get():
+            if link not in visited:
+                #Remember the link so it is never queued twice
+                visited.add(link)
+                taskQueue.put(link)
+
+def get_links(bsObj):
+    """Return the hrefs in the bodyContent div that start with /wiki/ and contain no colon."""
+    links = bsObj.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))
+    return [link.attrs['href'] for link in links]
+
+def scrape_article(taskQueue, foundUrlsQueue):
+    """Scrape paths from taskQueue forever and report the links found.
+
+    taskQueue -- Queue of paths waiting to be scraped
+    foundUrlsQueue -- Queue that receives each page's list of links
+    """
+    while 1:
+        #get() blocks until the delegator supplies a path
+        path = taskQueue.get()
+        html = urlopen('http://en.wikipedia.org{}'.format(path))
+        time.sleep(5)
+        bsObj = BeautifulSoup(html, 'html.parser')
+        title = bsObj.find('h1').get_text()
+        print('Scraping {} in process {}'.format(title, os.getpid()))
+        links = get_links(bsObj)
+        #Send these to the delegator for processing
+        foundUrlsQueue.put(links)
+
+
+#The guard keeps child processes from re-running this section when
+#they import the module under the spawn start method
+if __name__ == '__main__':
+    processes = []
+    taskQueue = Queue()
+    foundUrlsQueue = Queue()
+    processes.append(Process(target=task_delegator, args=(taskQueue, foundUrlsQueue,)))
+    processes.append(Process(target=scrape_article, args=(taskQueue, foundUrlsQueue,)))
+    processes.append(Process(target=scrape_article, args=(taskQueue, foundUrlsQueue,)))
+
+    for p in processes:
+        p.start()
diff --git a/chapter16/multithreaded.py b/chapter16/multithreaded.py
@@ -0,0 +1,55 @@
+"""Crawl Wikipedia from two starting articles, one thread per crawl."""
+from urllib.request import urlopen
+from bs4 import BeautifulSoup
+import re
+import random
+
+import _thread
+import time
+
+# Paths scraped so far, shared by both threads
+visited = []
+def getLinks(thread_name, bsObj):
+    """Return the unvisited article links found on a parsed page.
+
+    thread_name -- label used in the progress message
+    bsObj -- BeautifulSoup object for the page
+    """
+    print('Getting links in {}'.format(thread_name))
+    links = bsObj.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))
+    # visited holds path strings, so compare hrefs rather than tags
+    return [link for link in links if link.attrs['href'] not in visited]
+
+def scrape_article(thread_name, path):
+    """Scrape the article at path, then keep following random links.
+
+    thread_name -- label used in the progress messages
+    path -- article path on en.wikipedia.org, e.g. '/wiki/Kevin_Bacon'
+    """
+    # Loop rather than recurse so a long crawl cannot hit the
+    # recursion limit
+    while path is not None:
+        visited.append(path)
+        html = urlopen('http://en.wikipedia.org{}'.format(path))
+        time.sleep(5)
+        bsObj = BeautifulSoup(html, 'html.parser')
+        title = bsObj.find('h1').get_text()
+        print('Scraping {} in thread {}'.format(title, thread_name))
+        links = getLinks(thread_name, bsObj)
+        # Stop when the page has no unvisited links
+        path = None
+        if len(links) > 0:
+            path = links[random.randint(0, len(links)-1)].attrs['href']
+            print(path)
+
+
+try:
+    _thread.start_new_thread(scrape_article, ('Thread 1', '/wiki/Kevin_Bacon',))
+    _thread.start_new_thread(scrape_article, ('Thread 2', '/wiki/Monty_Python',))
+except RuntimeError:
+    print ('Error: unable to start threads')
+
+# Keep the main thread alive for the workers; sleeping instead of
+# spinning leaves the CPU to them
+while 1:
+    time.sleep(1)
diff --git a/chapter16/multithreaded_class.py b/chapter16/multithreaded_class.py
@@ -0,0 +1,57 @@
+"""Follow random Wikipedia links in two threads that share one visited list."""
+from urllib.request import urlopen
+from bs4 import BeautifulSoup
+import re
+import random
+
+import _thread
+import time
+
+# Every path a thread has started scraping
+visited = []
+def getLinks(thread_name, bsObj):
+    """Collect the article links on a page whose paths are not in visited.
+
+    thread_name -- name printed in the progress message
+    bsObj -- BeautifulSoup object for the page
+    """
+    print('Getting links in {}'.format(thread_name))
+    links = bsObj.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))
+    # Compare the href string; a tag object never equals a stored path
+    return [link for link in links if link.attrs['href'] not in visited]
+
+# Define a function for the thread
+def scrape_article(thread_name, path):
+    """Scrape path, pick a random unvisited link, and repeat.
+
+    thread_name -- name printed in the progress messages
+    path -- article path on en.wikipedia.org, e.g. '/wiki/Monty_Python'
+    """
+    # Iterating keeps the call stack flat however long the crawl
+    # runs
+    while path is not None:
+        visited.append(path)
+        html = urlopen('http://en.wikipedia.org{}'.format(path))
+        time.sleep(5)
+        bsObj = BeautifulSoup(html, 'html.parser')
+        title = bsObj.find('h1').get_text()
+        print('Scraping {} in thread {}'.format(title, thread_name))
+        links = getLinks(thread_name, bsObj)
+        # End the crawl when no unvisited link is left on the page
+        path = None
+        if len(links) > 0:
+            path = links[random.randint(0, len(links)-1)].attrs['href']
+            print(path)
+
+
+# Create two threads as follows
+try:
+    _thread.start_new_thread(scrape_article, ('Thread 1', '/wiki/Kevin_Bacon',))
+    _thread.start_new_thread(scrape_article, ('Thread 2', '/wiki/Monty_Python',))
+except RuntimeError:
+    print ('Error: unable to start threads')
+
+# Sleep in the main thread so it stays alive for the workers without
+# burning CPU in an empty loop
+while 1:
+    time.sleep(1)
diff --git a/chapter16/multithreaded_example.py b/chapter16/multithreaded_example.py
@@ -0,0 +1,29 @@
+"""Run three timers in separate threads using the _thread module."""
+import _thread
+import time
+
+def print_time(threadName, delay, iterations):
+    """Print one line every delay seconds, iterations times in total.
+
+    threadName -- label to print; when falsy, the whole seconds elapsed
+                  since the call started are printed instead
+    delay -- seconds to sleep before each line
+    iterations -- number of lines to print
+    """
+    start = int(time.time())
+    for _ in range(iterations):
+        time.sleep(delay)
+        seconds_elapsed = str(int(time.time()) - start)
+        print (threadName if threadName else seconds_elapsed)
+
+try:
+    _thread.start_new_thread(print_time, (None, 1, 100))
+    _thread.start_new_thread(print_time, ("Fizz", 3, 33))
+    _thread.start_new_thread(print_time, ("Buzz", 5, 20))
+except RuntimeError:
+    print ("Error: unable to start thread")
+
+# Keep the main thread alive for the timers; sleeping instead of
+# spinning leaves the CPU to them
+while 1:
+    time.sleep(1)
diff --git a/chapter16/multithreaded_queue.py b/chapter16/multithreaded_queue.py
@@ -0,0 +1,79 @@
+"""Crawl Wikipedia in two threads and store pages from a third."""
+from urllib.request import urlopen
+from bs4 import BeautifulSoup
+import re
+import random
+import _thread
+from queue import Queue
+import time
+import pymysql
+
+
+def storage(queue):
+    """Insert queued articles into the pages table, skipping known paths.
+
+    queue -- Queue of dicts with 'title' and 'path' keys
+    """
+    conn = pymysql.connect(host='127.0.0.1', unix_socket='/tmp/mysql.sock', user='root', passwd='', db='mysql', charset='utf8')
+    cur = conn.cursor()
+    cur.execute('USE wiki_threads')
+    while 1:
+        #get() blocks until a scraper queues an article, so no polling
+        article = queue.get()
+        #The trailing comma makes the query arguments a real tuple
+        cur.execute('SELECT * FROM pages WHERE path = %s', (article["path"],))
+        if cur.rowcount == 0:
+            print("Storing article {}".format(article["title"]))
+            cur.execute('INSERT INTO pages (title, path) VALUES (%s, %s)', (article["title"], article["path"]))
+            conn.commit()
+        else:
+            print("Article already exists: {}".format(article['title']))
+
+#Paths scraped so far, shared by both scraper threads
+visited = []
+def getLinks(thread_name, bsObj):
+    """Return the unvisited article links found on a parsed page.
+
+    thread_name -- label used in the progress message
+    bsObj -- BeautifulSoup object for the page
+    """
+    print('Getting links in {}'.format(thread_name))
+    links = bsObj.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))
+    #visited holds path strings, so compare hrefs rather than tags
+    return [link for link in links if link.attrs['href'] not in visited]
+
+def scrape_article(thread_name, path, queue):
+    """Scrape path, queue it for storage, then follow random links.
+
+    thread_name -- label used in the progress messages
+    path -- article path on en.wikipedia.org, e.g. '/wiki/Kevin_Bacon'
+    queue -- Queue that receives a title/path dict for every page
+    """
+    #Loop rather than recurse so a long crawl cannot hit the
+    #recursion limit
+    while path is not None:
+        visited.append(path)
+        html = urlopen('http://en.wikipedia.org{}'.format(path))
+        time.sleep(5)
+        bsObj = BeautifulSoup(html, 'html.parser')
+        title = bsObj.find('h1').get_text()
+        print('Added {} for storage in thread {}'.format(title, thread_name))
+        queue.put({"title":title, "path":path})
+        links = getLinks(thread_name, bsObj)
+        #Stop when the page has no unvisited links
+        path = None
+        if len(links) > 0:
+            path = links[random.randint(0, len(links)-1)].attrs['href']
+
+queue = Queue()
+try:
+    _thread.start_new_thread(scrape_article, ('Thread 1', '/wiki/Kevin_Bacon', queue,))
+    _thread.start_new_thread(scrape_article, ('Thread 2', '/wiki/Monty_Python', queue,))
+    _thread.start_new_thread(storage, (queue,))
+except RuntimeError:
+    print ('Error: unable to start threads')
+
+#Keep the main thread alive for the workers; sleeping instead of
+#spinning leaves the CPU to them
+while 1:
+    time.sleep(1)
diff --git a/chapter16/threading_crawler.py b/chapter16/threading_crawler.py
@@ -0,0 +1,34 @@
+"""Restart a crawler thread whenever it dies before finishing."""
+import threading
+import time
+
+class Crawler(threading.Thread):
+    """Thread that sleeps, marks itself done, then raises an exception."""
+    def __init__(self):
+        super().__init__()
+        self.done = False
+
+    def isDone(self):
+        """Return True once run() has set the done flag."""
+        return self.done
+
+    def run(self):
+        """Sleep five seconds, set the done flag, then raise."""
+        time.sleep(5)
+        self.done = True
+        raise Exception('Something bad happened!')
+
+t = Crawler()
+t.start()
+
+# Check once a second: stop when the crawler reports it is done,
+# replace it with a fresh one if it died first
+while True:
+    time.sleep(1)
+    if t.isDone():
+        print('Done')
+        break
+    # is_alive() is the supported name; isAlive() is gone in newer Python 3
+    if not t.is_alive():
+        t = Crawler()
+        t.start()
diff --git a/chapter16/threading_example.py b/chapter16/threading_example.py
@@ -0,0 +1,26 @@
+"""Run three timers in separate threads using the threading module."""
+import threading
+import time
+
+def print_time(threadName, delay, iterations):
+    """Print the elapsed seconds and threadName every delay seconds.
+
+    threadName -- label printed after the elapsed seconds
+    delay -- seconds to sleep before each line
+    iterations -- number of lines to print
+    """
+    start = int(time.time())
+    for _ in range(iterations):
+        time.sleep(delay)
+        seconds_elapsed = str(int(time.time()) - start)
+        print ('{} {}'.format(seconds_elapsed, threadName))
+
+# start() returns None, so build the Thread objects first and keep
+# them in a list instead of assigning the result of start()
+threads = [
+    threading.Thread(target=print_time, args=('Fizz', 3, 33)),
+    threading.Thread(target=print_time, args=('Buzz', 5, 20)),
+    threading.Thread(target=print_time, args=('Counter', 1, 100)),
+]
+for t in threads:
+    t.start()