1+ from urllib .request import urlopen
2+ from bs4 import BeautifulSoup
3+ import re
4+ import random
5+ import _thread
6+ from queue import Queue
7+ import time
8+ import pymysql
9+
10+
def storage(queue):
    """Consume scraped articles from ``queue`` and persist new ones to MySQL.

    Runs forever: each item taken from the queue is looked up in the
    ``pages`` table by its path and inserted only when no row exists yet.

    Args:
        queue: A ``queue.Queue`` whose items are dicts with the keys
            ``"title"`` and ``"path"``.
    """
    conn = pymysql.connect(host='127.0.0.1', unix_socket='/tmp/mysql.sock',
                           user='root', passwd='', db='mysql', charset='utf8')
    try:
        cur = conn.cursor()
        try:
            cur.execute('USE wiki_threads')
            while True:
                # Queue.get() blocks until an item is available, so the
                # thread sleeps instead of spinning on queue.empty().
                article = queue.get()
                # Query parameters are passed as a one-element tuple.
                cur.execute('SELECT * FROM pages WHERE path = %s',
                            (article["path"],))
                if cur.rowcount == 0:
                    print("Storing article {}".format(article["title"]))
                    cur.execute(
                        'INSERT INTO pages (title, path) VALUES (%s, %s)',
                        (article["title"], article["path"]))
                    conn.commit()
                else:
                    print("Article already exists: {}".format(article['title']))
        finally:
            cur.close()
    finally:
        # Release the connection if the loop ever terminates with an error.
        conn.close()
25+
# Paths of articles that have already been scraped; shared by all threads.
visited = []


def getLinks(thread_name, bsObj):
    """Return the article links on a page that have not been visited yet.

    Args:
        thread_name: Label of the calling thread, used only for logging.
        bsObj: Parsed page exposing ``find``/``find_all`` (a BeautifulSoup
            object in this script).

    Returns:
        A list of ``<a>`` tags inside the ``bodyContent`` div whose ``href``
        starts with ``/wiki/``, contains no colon, and is not in ``visited``.
        An empty list when the page has no ``bodyContent`` div.
    """
    print('Getting links in {}'.format(thread_name))
    content = bsObj.find('div', {'id': 'bodyContent'})
    if content is None:
        return []
    links = content.find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))
    # ``visited`` holds path strings, so compare each tag's href rather than
    # the tag object itself (a tag never equals a string).
    return [link for link in links if link.attrs['href'] not in visited]
31+
def scrape_article(thread_name, path, queue):
    """Crawl Wikipedia by following one random article link after another.

    Starting at ``path``, each page is fetched, its title is queued for
    storage, and a random link from ``getLinks`` becomes the next page.
    The crawl stops when a page yields no links.

    Args:
        thread_name: Label of the calling thread, used only for logging.
        path: Site-relative path of the first article, e.g. ``/wiki/Foo``.
        queue: Queue receiving ``{"title": ..., "path": ...}`` dicts.
    """
    # Iterate instead of recursing: a long crawl would otherwise add one
    # stack frame per article and eventually raise RecursionError.
    while True:
        visited.append(path)
        # The context manager closes the HTTP response once it is read.
        with urlopen('http://en.wikipedia.org{}'.format(path)) as response:
            html = response.read()
        # Pause between requests to avoid hammering the server.
        time.sleep(5)
        bsObj = BeautifulSoup(html, 'html.parser')
        title = bsObj.find('h1').get_text()
        print('Added {} for storage in thread {}'.format(title, thread_name))
        queue.put({"title": title, "path": path})
        links = getLinks(thread_name, bsObj)
        if not links:
            return
        path = random.choice(links).attrs['href']
44+
# Hand-off queue between the two scraper threads and the storage thread.
queue = Queue()
try:
    _thread.start_new_thread(scrape_article, ('Thread 1', '/wiki/Kevin_Bacon', queue,))
    _thread.start_new_thread(scrape_article, ('Thread 2', '/wiki/Monty_Python', queue,))
    _thread.start_new_thread(storage, (queue,))
except RuntimeError:
    # _thread.error is an alias of RuntimeError.
    print('Error: unable to start threads')

# Keep the main thread alive so the worker threads can keep running.
# Sleeping avoids a busy loop that would burn CPU and contend for the GIL.
while True:
    time.sleep(1)
0 commit comments