From 2feac8754934dc9e7d5e5055593dfedf7daebf33 Mon Sep 17 00:00:00 2001 From: Evgeniya Kuznetsova Date: Tue, 5 Dec 2017 13:14:10 -0700 Subject: [PATCH 01/50] 1) Added graceful failure, script can now be stopped with Ctrl+C, 2) Added a switch between soup and no_soup modes to global vars, 3) Added a regex filter for searching relevant URLs, new global var filter_regex, 4) Added a function that extracts visible text from page using Soup --- .../Crawler/crawlerExpand.py | 558 +++++++++++------- 1 file changed, 355 insertions(+), 203 deletions(-) diff --git a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py index 5cc39a5..f41ab03 100644 --- a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py +++ b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py @@ -7,6 +7,8 @@ import time import codecs import string +import shutil +import re try: from os import scandir, walk except ImportError: @@ -26,7 +28,13 @@ # depth to go for depth_to_go = int(sys.argv[3]) # directory name -directory = sys.argv[4] + "_" + curtime +directory = sys.argv[4] +target_dir = directory + "_" + curtime + + +filter_regex = re.compile(".*([Pp]rogram|[Aa]dmission).*") + +mode = "no_soup" # soup or no_soup # Checks if the url includes http at the front @@ -43,22 +51,61 @@ } ) -# Check if the directory exists -if not os.path.isdir(directory): - os.mkdir(directory) # if it doesnt then make it -os.chdir(directory) # then change directory to that folder - -# Create a log file in the folder that was just created -logging.basicConfig(filename=('_uniscraperlog_' + curtime + '.log'),level=logging.INFO) -# file to log empty requests into -empty_request_log = codecs.open("_empty_requests.txt", "w", "utf-8-sig") -# file to log planned urls into - URLs in the queue, that are planned to go to next (checked against visited) -planned_urls = codecs.open("_planned_urls.txt", "w", "utf-8-sig") -# file to log visited urls into - URLs that have been requested and have the html -visited_urls = codecs.open("_visited_urls.txt", "w", "utf-8-sig") -# file to log crawled urls into - URLs that crawler will "check" against to see if needs logging -crawled_urls = codecs.open("_crawled_urls.txt", "w", "utf-8-sig") - +#os.mkdir(target_dir) # make a timestampted folder +# Check if the original directory exists +if os.path.isdir(directory): + #shutil.copy(directory + "/_planned_urls.txt", target_dir) + #shutil.copy(directory + "/_empty_requests.txt", target_dir) + #shutil.copy(directory + "/_visited_urls.txt", target_dir) + #shutil.copy(directory + "/_crawled_urls.txt", target_dir) + shutil.copytree(directory, target_dir) + os.chdir(target_dir) # then change directory to that folder + + #count number visited + with open("_visited_urls.txt") as f: + for i, l in enumerate(f, start=1): + pass + page = i + + #create array of planned + with open("_planned_urls.txt") as f: + content = f.readlines() + #remove whitespace characters like `\n` at the end of each line + planned = content[page-1:] + plannedURLsArray = [x.strip() for x in planned] + + with open("_crawled_urls.txt") as f: + content = f.readlines() + #remove whitespace characters like `\n` at the end of each line + crawledURLsArray = [x.strip() for x in content] + + logging.basicConfig(filename=('_uniscraperlog_' + curtime + '.log'),level=logging.INFO) + # file to log empty requests into + empty_request_log = codecs.open("_empty_requests.txt", "a", "utf-8-sig") + # file to log planned urls into - URLs in the queue, that are planned to go to next (checked 
against visited) + planned_urls = codecs.open("_planned_urls.txt", "a", "utf-8-sig") + # file to log visited urls into - URLs that have been requested and have the html + visited_urls = codecs.open("_visited_urls.txt", "a", "utf-8-sig") + # file to log crawled urls into - URLs that crawler will "check" against to see if needs logging + crawled_urls = codecs.open("_crawled_urls.txt", "a", "utf-8-sig") + + +else: + os.mkdir(target_dir) # make a timestampted folder + os.chdir(target_dir) # then change directory to that folder + # Create a log file in the folder that was just created + logging.basicConfig(filename=('_uniscraperlog_' + curtime + '.log'),level=logging.INFO) + # file to log empty requests into + empty_request_log = codecs.open("_empty_requests.txt", "w", "utf-8-sig") + # file to log planned urls into - URLs in the queue, that are planned to go to next (checked against visited) + planned_urls = codecs.open("_planned_urls.txt", "w", "utf-8-sig") + plannedURLsArray = [] + # file to log visited urls into - URLs that have been requested and have the html + visited_urls = codecs.open("_visited_urls.txt", "w", "utf-8-sig") + # file to log crawled urls into - URLs that crawler will "check" against to see if needs logging + crawled_urls = codecs.open("_crawled_urls.txt", "w", "utf-8-sig") + crawledURLsArray = [] + page = 1 # Function that checks if the link provided is in the same domain as the seed def checkDomain(link): @@ -76,50 +123,33 @@ def request_url(url): visited_urls.write(url) visited_urls.write("\n") # Use requests module to get html from url as an object - source_code = requests.get(url, headers=headers) # variable = requests.get(url) - # Get source code of page as text - html = source_code.text - return html - - -# Function for manually cleaning up name -# Deprecated, we use format_filename instead now -def clean_name (name): - name = name.replace("\n", "") - name = name.replace("\r", "") - name = name.replace("\t", "") - name = name.replace("|", "") - name = name.replace(":", "") - name = name.replace("?", "") - name = name.replace("'", "") - - # "/" - # "\\" - # "*" - # "\"" - # "<" - # ">" - # "^" - # "!" - name = name.strip(' ') - return name - - + html = '' + try: + r = requests.get(url, headers=headers) + if r.ok: + if "text/html" in r.headers["content-type"]: + html = r.text + return html + return None + except KeyboardInterrupt: + print("\n\nScript interrupted by user. Shutting down.") + logging.info("Script interrupted by user") + shut_down() + except Exception: + logging.exception("Couldn\'t request " + url) + return None + # Function to create a filename out of a string # Called from create_name def format_filename(name): #Taken from: https://gist.github.com/seanh/93666 """Take a string and return a valid filename constructed from the string. Uses a whitelist approach: any characters not present in valid_chars are - removed. Also spaces are replaced with underscores. - - Note: this method may produce invalid filenames such as ``, `.` or `..` - When I use this method I prepend a date string like '2009_01_15_19_46_32_' - and append a file extension like '.txt', so I avoid the potential of using - an invalid filename.""" + removed. 
Also spaces are replaced with underscores.""" valid_chars = "-_() %s%s" % (string.ascii_letters, string.digits) filename = ''.join(c for c in name if c in valid_chars) # Remove spaces in filename + filename = filename.strip() filename = filename.replace(' ','_') return filename @@ -127,169 +157,269 @@ def format_filename(name): # Function for creating name # Use the title of the html page as the title of the text file # Called from main function -def create_name (soup): - try: - name = soup.title.string # removes all the unnecessary things from title - name = format_filename(name) +def create_name_from_html (html): + name_list = (html.partition("")[-1] + if name: + # removes invalid characters from title + name = format_filename(name) + '__' + str(time.time()) logging.info('Created name ' + name) - except: - name = "no_title_" # if no title provided give a no title with number title + else: + name = "no_title_" + str(time.time()) # if no title provided give a no title with a timestamp logging.warn('Failed to create a name, using \'' + name + '\' instead') return name - -# Function for creating file - +def create_name_from_soup (soup): + name = soup.title.string + if name: + # removes invalid characters from title + name = format_filename(name) + '__' + str(time.time()) + logging.info('Created name ' + name) + else: + name = "no_title_" + str(time.time()) # if no title provided give a no title with a timestamp + logging.warn('Failed to create a name, using \'' + name + '\' instead') + return name -# Function for saving links +#Function for deleting paired single or double quotes +def dequote(s): + """ + If a string has single or double quotes around it, remove them. + Make sure the pair of quotes match. + If a matching pair of quotes is not found, return the string unchanged. + """ + if (len(s)>= 2 and s[0] == s[-1]) and s.startswith(("'", '"')): + s = s[1:-1] + s = s.strip('"\'') + return s -# Main function. -# max_pages is the number of pages to crawl (given as the second argument) -def trade_spider(max_pages): +#Function that takes link, saves the contents to text file call href_split +def crawl(max_pages): logging.info("Crawling through domain '" + seed + "'") - page = 1 - # Array that holds the queue to be visited later - plannedURLsArray = [url] - # Logging the urls - planned_urls.write(url) - planned_urls.write("\n") - - # Gets the root of the url - url_split = url.split("://", 1) - # Array that holds urls that have been found. - # This is the array that all new URLs are checked against to prevent repeating. - # Record URL with both http and https prefixes - crawledURLsArray = ["http://" + url_split[1]] - crawledURLsArray.append("https://" + url_split[1]) - # Also log the same into the text file - crawled_urls.write("http://" + url_split[1] + "\n") - crawled_urls.write("https://" + url_split[1] + "\n") - - # Sets the depth already crawled to 0 - dsize = 0 - # Create an array of queue size on each level of the tree. Used to stop the crawler at a certain depth. - # Alas, it appears to be broken... 
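The per-level queue-size bookkeeping removed here never worked reliably. If depth limiting is wanted again later, one simpler option is to carry the depth along with each queued URL; a minimal standalone sketch of that idea, not part of the current code (`fetch_links` is a hypothetical callable returning absolute URLs):

```python
from collections import deque

def crawl_with_depth(seed_url, fetch_links, max_depth, max_pages):
    """Breadth-first crawl that keeps (url, depth) pairs in the queue,
    so the depth cut-off is checked per URL rather than per tree level."""
    queue = deque([(seed_url, 0)])
    seen = {seed_url}
    pages = 0
    while queue and pages < max_pages:
        url, depth = queue.popleft()
        pages += 1
        if depth >= max_depth:
            continue  # visit the page but do not queue its links
        for link in fetch_links(url):  # hypothetical link extractor
            if link not in seen:
                seen.add(link)
                queue.append((link, depth + 1))
    return seen
```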
- depth = [dsize] - # Checks if the crawler has gone over the max number of pages - # Also checks if the depth has gone over the max depth - # Also checks if there are still URLs in the queue - while page <= max_pages and dsize <= depth_to_go and len(plannedURLsArray) > 0: - # Empty html variable, just in case - html = '' - # Try to get the html of the URL - try: - html = request_url(plannedURLsArray[0]) - except: - logging.warn('Error while requesting an html response ' + plannedURLsArray[0]) - # Checks if html exists and is not empty - if html: - # Uses module to parse html into an obejct (a tree of nodes). Nodes are tags, attributes, ect. - # May need to be re-thought! Very memory heavy. ??? - soup = BeautifulSoup(html, 'html5lib') + if page == 1: + # Array that holds the queue to be visited later + plannedURLsArray.append(url) + # Logging the urls + planned_urls.write(url) + planned_urls.write("\n") + + # Gets the root of the url + url_split = url.split("://", 1) + # Array that holds urls that have been found. + # This is the array that all new URLs are checked against to prevent repeating. + # Record URL with both http and https prefixes + crawledURLsArray.append("http://" + url_split[1]) + crawledURLsArray.append("https://" + url_split[1]) + # Also log the same into the text file + crawled_urls.write("http://" + url_split[1] + "\n") + crawled_urls.write("https://" + url_split[1] + "\n") + + while page <= max_pages and len(plannedURLsArray) > 0: + process_current_link() + + + +def process_current_link (): + global page + + print(plannedURLsArray[0]) + # Try to get the html of the URL + html = request_url(plannedURLsArray[0]) + + if html: + #Soupify + soup = BeautifulSoup(html, 'html5lib') + + if mode=="no_soup": # Gets the name for the file to store the html text in - name = create_name(soup) - # Adds the .txt to the end of the name - name = "{0}.txt".format(name) + name = create_name_from_html(html) + #find and process all links + process_links_from_html(html, plannedURLsArray[0]) + else: + name = create_name_from_soup(soup) + process_links_from_soup(soup, plannedURLsArray[0]) + + # Adds the .txt to the end of the name + name = "{0}.txt".format(name) + # Find only visible text + visible_text = extract_text(soup) + + if visible_text: #save it as a text file try: - # Check if file with given name exists - if os.path.isfile(name): - # If exists, add timestamp to name to make it unique. 
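The naming path used above (create_name_from_html / create_name_from_soup plus format_filename) reduces to a whitelist filter with a timestamp suffix; a condensed standalone sketch of that behaviour, for reference only:

```python
import string
import time

def title_to_filename(title):
    """Whitelist-sanitize a page title and append a timestamp so names stay unique."""
    valid_chars = "-_() %s%s" % (string.ascii_letters, string.digits)
    cleaned = ''.join(c for c in (title or "") if c in valid_chars)
    cleaned = cleaned.strip().replace(' ', '_') or "no_title"
    return "{0}__{1}.txt".format(cleaned, time.time())

# e.g. title_to_filename("Programs | Admissions?") -> "Programs__Admissions__<timestamp>.txt"
```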
- name = name[:name.find(".")] + "_" + str(time.time()) + ".txt" - - # Open/create the file with that name + # Create and open the file with that name fo = codecs.open(name, "w", "utf-8-sig") # Write URL to that file fo.write("\n") # Append the html to the file - fo.write(html) + fo.write(visible_text) # Close the pipe to the file fo.close() # Log the creation of the file logging.info('Created file ' + name) - #print(plannedURLsArray[0]) - # Looks for tables with content (hopefully programs and courses) - # ACALOG-specific: find all tables with class "block_n2_and_content" - for table in soup.findAll('table', class_='block_n2_and_content'): - # Old code: look for all (links) in soup - #for link in soup.findAll('a', href=True): #Untab the lines below if you uncomment this - # Make sure to only collect from the site we want - for link in table.findAll('a', href=True): - # Collects the href string and stores the link as a tuple - # It stores the URL without a #thing and without an ending slash - new_link = (urllib.parse.urldefrag(link['href'])[0]).rstrip('/') - # ACALOG-specific: removes ACALOG print-friendly format descriptor - #new_link = new_link.rstrip('&print') - # Smart function for relative links on the page. Joins given path and current URL. - new_link = urllib.parse.urljoin(plannedURLsArray[0], new_link) - # Checks if the just found link is in the same domain - if checkDomain(new_link): - # if the link is not in crawledURLsArray then it appends it to urls and crawledURLsArray - if new_link not in crawledURLsArray: - # Ensures no jpg or pdfs are stored and that no mailto: links are stored. - if new_link.startswith("http") and '.pdf' not in new_link and '.jpg' not in new_link: - #???TODO: add checks for www.domain.com and https:// - # Adds new link to array - plannedURLsArray.append(new_link) - # Adds new link to queue file - planned_urls.write(new_link) - planned_urls.write("\n") - - # Remove the front of the URL (http or https) - http_split = new_link.split("://", 1) - # Add all possible link variations to file of URLs that have been looked at - # Adds new link to array - crawledURLsArray.append("http://" + http_split[1]) - # Adds new link to already looked at file - crawled_urls.write("http://" + http_split[1]) - crawled_urls.write("\n") - # Adds new link to array - crawledURLsArray.append("https://" + http_split[1]) - # Adds new link to already looked at file - crawled_urls.write("https://" + http_split[1]) - crawled_urls.write("\n") - except: - logging.warning("Can not encode file: " + plannedURLsArray[0]) - # Else: html does not exist or is empty. Log error + except KeyboardInterrupt: + print("\n\nScript interrupted by user. Shutting down.") + logging.info("Script interrupted by user") + shut_down() + except Exception: + logging.exception("Can not encode file: " + plannedURLsArray[0]) else: - logging.warning('Request for ' + url + ' returned empty html') - empty_request_log.write(url) - empty_request_log.write("\n") - # Prints to console. - # Update on what URL is being examined - print(plannedURLsArray[0]) - # Update on the depth it is at - print("depth:", dsize) - # Update on the total number of pages - print("iterations:", page, "pages") - print("\n") - # Deletes the currently looked at URL from the queue - plannedURLsArray.pop(0) - # SUPER BROKEN??? 
- # Supposed to check if given depth has been reached - # Should look at the plannedURLsArray instead of the crawled one - if page >= depth[dsize]: - depth.append(len(crawledURLsArray)) - dsize += 1 - - # Increment page count - page += 1 - # Checks the size of the folder. Prints the amount of data collected in GB to the console and log file - if page%100 == 0: - size_of_directory = get_tree_size(os.curdir) / 1000000000 - print("Size: ", round(size_of_directory, 5), "GB") - print('\n') - logging.info("Size: " + round(size_of_directory, 5) + "GB") - # Prints in the log file the length of time the crawler has been running in seconds - logging.info("Has been running for " + str(time.time() - start_time) + " seconds") - # Time delay in seconds to prevent crashing the server - time.sleep(.01) + print('No visible text in ' + url) + logging.warning('No visible text in ' + url) + # Else: html does not exist or is empty. Log error + else: + logging.warning('Request for ' + url + ' returned empty html') + empty_request_log.write(url) + empty_request_log.write("\n") + + # Update on the total number of pages + print("iterations:", page, "pages") + print("\n") + # Deletes the currently looked at URL from the queue + plannedURLsArray.pop(0) + + # Increment page count + page += 1 + # Checks the size of the folder. Prints the amount of data collected in GB to the console and log file + if page%10 == 0: + size_of_directory = get_tree_size(os.curdir) / 1000000000 + print("Size: ", str(round(size_of_directory, 5)), "GB") + print('\n') + logging.info("Size: " + str(round(size_of_directory, 5)) + "GB") + # Prints in the log file the length of time the crawler has been running in seconds + logging.info("Has been running for " + str(time.time() - start_time) + " seconds") + # Time delay in seconds to prevent crashing the server + time.sleep(.01) + + +#Function for splitting html into links +def href_split (html): + links = [] + if html.partition('')[0] + href = href.partition(' ')[0] + href = dequote(href) + links.append(href) + return links + + +#input is a soup element +def is_relevant_link_from_soup(link): + if link.find(string=filter_regex): + return True + return False + #return True #Uncomment to grab all links + +def process_links_from_soup (soup, cur_domain): + for lnk in soup.findAll('a', href=True): + if is_relevant_link_from_soup(lnk): + new_link = (urllib.parse.urldefrag(lnk['href'])[0]).rstrip('/') + new_link = urllib.parse.urljoin(cur_domain, new_link) + if checkDomain(new_link): + # if the link is not in crawledURLsArray then it appends it to urls and crawledURLsArray + if new_link not in crawledURLsArray: + # Ensures no jpg or pdfs are stored and that no mailto: links are stored. 
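Both link-handling paths (soup and no_soup) run the same normalization before a link is queued; a condensed standalone sketch of those steps (example.edu is a placeholder):

```python
import urllib.parse

def normalize_link(href, base_url):
    """Drop the #fragment, strip a trailing slash, then resolve the result
    against the page it was found on (mirrors the steps used before queueing)."""
    link = urllib.parse.urldefrag(href)[0].rstrip('/')
    return urllib.parse.urljoin(base_url, link)

def is_storable(link):
    """Skip mailto:, PDFs and images, matching the checks in the crawler."""
    return link.startswith("http") and '.pdf' not in link and '.jpg' not in link

# normalize_link("../admissions/#apply", "http://example.edu/programs/arts/")
#   -> "http://example.edu/programs/admissions"
```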
+ if new_link.startswith("http") and '.pdf' not in new_link and '.jpg' not in new_link: + #???TODO: add checks for www.domain.com and https:// + # Adds new link to array + plannedURLsArray.append(new_link) + # Adds new link to queue file + planned_urls.write(new_link) + planned_urls.write("\n") + + # Remove the front of the URL (http or https) + http_split = new_link.split("://", 1) + # Add all possible link variations to file of URLs that have been looked at + # Adds new link to array + crawledURLsArray.append("http://" + http_split[1]) + # Adds new link to already looked at file + crawled_urls.write("http://" + http_split[1]) + crawled_urls.write("\n") + # Adds new link to array + crawledURLsArray.append("https://" + http_split[1]) + # Adds new link to already looked at file + crawled_urls.write("https://" + http_split[1]) + crawled_urls.write("\n") + +def is_relevant_link_from_html(link): + if filter_regex.match(link): + return True + return False + #return True #Uncomment to grab all links + +#Take an array of links, run the split on each and add the results to the appropriate arrays and files +def process_links_from_html (html, cur_domain): + if html.partition('')[0] + href = href.partition(' ')[0] + href = dequote(href) + new_link = (urllib.parse.urldefrag(href)[0]).rstrip('/') + new_link = urllib.parse.urljoin(cur_domain, new_link) + if checkDomain(new_link): + # if the link is not in crawledURLsArray then it appends it to urls and crawledURLsArray + if new_link not in crawledURLsArray: + # Ensures no jpg or pdfs are stored and that no mailto: links are stored. + if new_link.startswith("http") and '.pdf' not in new_link and '.jpg' not in new_link: + #???TODO: add checks for www.domain.com and https:// + # Adds new link to array + plannedURLsArray.append(new_link) + # Adds new link to queue file + planned_urls.write(new_link) + planned_urls.write("\n") + + # Remove the front of the URL (http or https) + http_split = new_link.split("://", 1) + # Add all possible link variations to file of URLs that have been looked at + # Adds new link to array + crawledURLsArray.append("http://" + http_split[1]) + # Adds new link to already looked at file + crawled_urls.write("http://" + http_split[1]) + crawled_urls.write("\n") + # Adds new link to array + crawledURLsArray.append("https://" + http_split[1]) + # Adds new link to already looked at file + crawled_urls.write("https://" + http_split[1]) + crawled_urls.write("\n") + + + + + +#Function to extract text elements from an HTML and return them as an array of BeautifulSoup +def extract_text(soup): + data = soup.findAll(text=True) + result = filter(is_visible_html_element, data) + all_text = "" + for t in result: + if t.strip(): + all_text += t + "\n" + return all_text + +def is_visible_html_element(element): + if element.parent.name in ['style', 'script', '[document]', 'head', 'title']: + return False + elif re.match('', str(element.encode('utf-8'))): + return False + return True + + + + + # Return total size of files in given path and subdirs by going through the tree. # Recursive. 
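A quick way to sanity-check the extract_text / is_visible_html_element pair added above is to run the same filtering on a small inline page; a self-contained sketch (it duplicates the filter so it runs on its own, and uses the built-in html.parser rather than html5lib):

```python
from bs4 import BeautifulSoup

SAMPLE_HTML = """
<html><head><title>Programs</title><style>p {color: red;}</style></head>
<body><h1>Degree Programs</h1><script>var x = 1;</script>
<p>Admission requirements for the game design program.</p></body></html>
"""

def visible_text(html):
    """Return only the text nodes a reader would actually see on the page."""
    soup = BeautifulSoup(html, 'html.parser')
    hidden_parents = ['style', 'script', '[document]', 'head', 'title']
    parts = [t.strip() for t in soup.findAll(text=True)
             if t.parent.name not in hidden_parents and t.strip()]
    return "\n".join(parts)

print(visible_text(SAMPLE_HTML))
# Degree Programs
# Admission requirements for the game design program.
```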
@@ -303,18 +433,40 @@ def get_tree_size(path): total += entry.stat(follow_symlinks=False).st_size return total + +def shut_down(): + global start_time + global logging + global empty_request_log + global visited_urls + global planned_urls + global crawled_urls + + # Get the time that the command finished + end_time = time.time() + # Print overall time taken to console + print("Overall time: " + str((end_time - start_time))) + # Log overall time and save to main log file + logging.info("Overall time: " + str((end_time - start_time))) + # Close all the things/pipes to files + empty_request_log.close() + visited_urls.close() + planned_urls.close() + crawled_urls.close() + sys.exit() + + # Get the time that the command was run start_time = time.time() -# Call main function -trade_spider(iterate) -# Get the time that the command finished -end_time = time.time() -# Print overall time taken to console -print("Overall time: " + str((end_time - start_time))) -# Log overall time and save to main log file -logging.info("Overall time: " + str((end_time - start_time))) -# Close all the things/pipes to files -empty_request_log.close() -visited_urls.close() -planned_urls.close() -crawled_urls.close() \ No newline at end of file + +try: + # Call main function + crawl(iterate) + shut_down() +except KeyboardInterrupt: + print("\n\nScript interrupted by user. Shutting down.") + logging.info("Script interrupted by user") + shut_down() +except Exception: + logging.exception("Error while running script") + \ No newline at end of file From af75d039877c71fc54150a9f7681e84fff65170b Mon Sep 17 00:00:00 2001 From: Evgeniya Kuznetsova Date: Tue, 5 Dec 2017 13:48:35 -0700 Subject: [PATCH 02/50] Added comments --- .../Crawler/crawlerExpand.py | 91 +++++++++++-------- 1 file changed, 53 insertions(+), 38 deletions(-) diff --git a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py index f41ab03..e623fe1 100644 --- a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py +++ b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py @@ -31,9 +31,13 @@ directory = sys.argv[4] target_dir = directory + "_" + curtime - +# RegEx that is used to filter searches for URLs on any given page. 
+#Used in is_relevant_link_from_soup and is_relevant_link_from_html functions filter_regex = re.compile(".*([Pp]rogram|[Aa]dmission).*") +# Var to choose mode +# "soup" uses BeautifulSoup to assign a name to a page and to search the page for URLs +# "no_soup" uses a string search – splits the page into strings using "href=" as a partition limiter, then goes from there mode = "no_soup" # soup or no_soup @@ -51,34 +55,36 @@ } ) -#os.mkdir(target_dir) # make a timestampted folder -# Check if the original directory exists +# Checks if the directory with the given name already exists +# If it does, tries to continue a script run that was interrupted, using already existing lists of visited_urls and planned_urls +# If it doesn't, starts a new script run if os.path.isdir(directory): - #shutil.copy(directory + "/_planned_urls.txt", target_dir) - #shutil.copy(directory + "/_empty_requests.txt", target_dir) - #shutil.copy(directory + "/_visited_urls.txt", target_dir) - #shutil.copy(directory + "/_crawled_urls.txt", target_dir) + # Continuing a previous script run + + # Copy the contents of the existing directory to a new timestamped one shutil.copytree(directory, target_dir) os.chdir(target_dir) # then change directory to that folder - #count number visited + # Open the visited_urls text file and count the number of lines in it – that's how many pages the script visited throughout its previous runs with open("_visited_urls.txt") as f: for i, l in enumerate(f, start=1): pass page = i - - #create array of planned + + # Open the file with planned urls and add them to the array of planned urls with open("_planned_urls.txt") as f: content = f.readlines() #remove whitespace characters like `\n` at the end of each line planned = content[page-1:] plannedURLsArray = [x.strip() for x in planned] + # Open the file with crawled urls and add them to the array of crawled urls with open("_crawled_urls.txt") as f: content = f.readlines() #remove whitespace characters like `\n` at the end of each line crawledURLsArray = [x.strip() for x in content] + # Create a new log file logging.basicConfig(filename=('_uniscraperlog_' + curtime + '.log'),level=logging.INFO) # file to log empty requests into empty_request_log = codecs.open("_empty_requests.txt", "a", "utf-8-sig") @@ -91,6 +97,7 @@ else: + # Start a new script run os.mkdir(target_dir) # make a timestampted folder os.chdir(target_dir) # then change directory to that folder # Create a log file in the folder that was just created @@ -156,7 +163,9 @@ def format_filename(name): # Function for creating name # Use the title of the html page as the title of the text file -# Called from main function +# Called from process_current_link +# Uses string search to locate the tag +# Parameter html is a string def create_name_from_html (html): name_list = (html.partition("</title")[0]).split("<title") #grab part of html before tag +# Parameter soup is a soup object def create_name_from_soup (soup): name = soup.title.string if name: @@ -195,7 +209,8 @@ def dequote(s): return s -#Function that takes link, saves the contents to text file call href_split +# Function that takes link, saves the contents to text file call href_split +# Main function def crawl(max_pages): logging.info("Crawling through domain '" + seed + "'") @@ -221,7 +236,7 @@ def crawl(max_pages): process_current_link() - +# Function that grabs the first link in the list of planned urls, requests the page and processes it def process_current_link (): global page @@ -229,8 +244,9 @@ def process_current_link (): # Try 
to get the html of the URL html = request_url(plannedURLsArray[0]) - if html: - #Soupify + if html: #if the request returned an html + # Soupify + # For now it soupifies the link regardless of the mode, because it uses soup later to extract visible text from the page soup = BeautifulSoup(html, 'html5lib') if mode=="no_soup": @@ -286,44 +302,36 @@ def process_current_link (): # Increment page count page += 1 - # Checks the size of the folder. Prints the amount of data collected in GB to the console and log file - if page%10 == 0: - size_of_directory = get_tree_size(os.curdir) / 1000000000 - print("Size: ", str(round(size_of_directory, 5)), "GB") + # Every 50 pages checks the size of the folder. Prints the amount of data collected in MB to the console and log file + if page%50 == 0: + size_of_directory = get_tree_size(os.curdir) / 1000000 + print("Size: ", str(round(size_of_directory, 5)), "MB") print('\n') - logging.info("Size: " + str(round(size_of_directory, 5)) + "GB") + logging.info("Size: " + str(round(size_of_directory, 5)) + "MB") # Prints in the log file the length of time the crawler has been running in seconds logging.info("Has been running for " + str(time.time() - start_time) + " seconds") # Time delay in seconds to prevent crashing the server time.sleep(.01) -#Function for splitting html into links -def href_split (html): - links = [] - if html.partition('')[0] - href = href.partition(' ')[0] - href = dequote(href) - links.append(href) - return links - - -#input is a soup element +# checks that the text content of the link matches the filter_regex +# input parameter is a soup element!!! def is_relevant_link_from_soup(link): if link.find(string=filter_regex): return True return False #return True #Uncomment to grab all links +# takes soup of a page, finds all links on it +# for each link checks if it's relevant +# for each relevant link, saves it to the planned urls array (if it hasn't been crawled yet) +# and to the crawled urls array (so that we don't save it a second time later) def process_links_from_soup (soup, cur_domain): for lnk in soup.findAll('a', href=True): if is_relevant_link_from_soup(lnk): new_link = (urllib.parse.urldefrag(lnk['href'])[0]).rstrip('/') new_link = urllib.parse.urljoin(cur_domain, new_link) + # if the link is in our main domain if checkDomain(new_link): # if the link is not in crawledURLsArray then it appends it to urls and crawledURLsArray if new_link not in crawledURLsArray: @@ -350,6 +358,8 @@ def process_links_from_soup (soup, cur_domain): crawled_urls.write("https://" + http_split[1]) crawled_urls.write("\n") +# checks that the text content of the link matches the filter_regex +# input parameter is a string def is_relevant_link_from_html(link): if filter_regex.match(link): return True @@ -360,9 +370,9 @@ def is_relevant_link_from_html(link): def process_links_from_html (html, cur_domain): if html.partition('')[0] href = href.partition(' ')[0] @@ -399,7 +409,8 @@ def process_links_from_html (html, cur_domain): -#Function to extract text elements from an HTML and return them as an array of BeautifulSoup +# Function to extract text elements from an HTML and return them as an array of BeautifulSoup +# called from process_current_link def extract_text(soup): data = soup.findAll(text=True) result = filter(is_visible_html_element, data) @@ -409,6 +420,9 @@ def extract_text(soup): all_text += t + "\n" return all_text + +# check that the given soup element is a visible text element +# called from extract_text def is_visible_html_element(element): if 
element.parent.name in ['style', 'script', '[document]', 'head', 'title']: return False @@ -434,6 +448,7 @@ def get_tree_size(path): return total +# Shut down gracefully and log it def shut_down(): global start_time global logging From 420b753ac1607b502ec76b373dc9483f0a3ea49e Mon Sep 17 00:00:00 2001 From: Evgeniya Kuznetsova Date: Wed, 6 Dec 2017 14:51:28 -0700 Subject: [PATCH 03/50] Deleted the noBS file --- .../Crawler/crawlerNoBS.py | 360 ------------------ 1 file changed, 360 deletions(-) delete mode 100644 Search-Engine-and-Crawler/Crawler/crawlerNoBS.py diff --git a/Search-Engine-and-Crawler/Crawler/crawlerNoBS.py b/Search-Engine-and-Crawler/Crawler/crawlerNoBS.py deleted file mode 100644 index 4e64a10..0000000 --- a/Search-Engine-and-Crawler/Crawler/crawlerNoBS.py +++ /dev/null @@ -1,360 +0,0 @@ -import requests -import urllib.parse -import os.path -import sys -import tldextract -import time -import codecs -import string -import shutil -try: - from os import scandir, walk -except ImportError: - from scandir import scandir, walk -import logging - -#Pay attention to robots.txt - -# current time, used in the names of the folder and the logging file -curtime = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime()) - -# Arguments in order: url, total pages to look at, depth, first part of directory name -# url to start from -url = sys.argv[1] -# number of pages to iterate through -iterate = int(sys.argv[2]) -# depth to go for -depth_to_go = int(sys.argv[3]) -# directory name -directory = sys.argv[4] -target_dir = directory + "_" + curtime - - -# Checks if the url includes http at the front -if not url.startswith("http"): - url = "http://" + url -# Extracts the top level domain from the URL (eg. ualberta.ca, no slashes) -seed = tldextract.extract(url).domain - -# Set a header to pretend it's a browser -headers = requests.utils.default_headers() -headers.update ( - { - 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0', - } -) - -#os.mkdir(target_dir) # make a timestampted folder -# Check if the original directory exists -if os.path.isdir(directory): - #shutil.copy(directory + "/_planned_urls.txt", target_dir) - #shutil.copy(directory + "/_empty_requests.txt", target_dir) - #shutil.copy(directory + "/_visited_urls.txt", target_dir) - #shutil.copy(directory + "/_crawled_urls.txt", target_dir) - shutil.copytree(directory, target_dir) - os.chdir(target_dir) # then change directory to that folder - - #count number visited - with open("_visited_urls.txt") as f: - for i, l in enumerate(f, start=1): - pass - page = i - - #create array of planned - with open("_planned_urls.txt") as f: - content = f.readlines() - #remove whitespace characters like `\n` at the end of each line - planned = content[page-1:] - plannedURLsArray = [x.strip() for x in planned] - - with open("_crawled_urls.txt") as f: - content = f.readlines() - #remove whitespace characters like `\n` at the end of each line - crawledURLsArray = [x.strip() for x in content] - - logging.basicConfig(filename=('_uniscraperlog_' + curtime + '.log'),level=logging.INFO) - # file to log empty requests into - empty_request_log = codecs.open("_empty_requests.txt", "a", "utf-8-sig") - # file to log planned urls into - URLs in the queue, that are planned to go to next (checked against visited) - planned_urls = codecs.open("_planned_urls.txt", "a", "utf-8-sig") - # file to log visited urls into - URLs that have been requested and have the html - visited_urls = codecs.open("_visited_urls.txt", "a", "utf-8-sig") - # 
file to log crawled urls into - URLs that crawler will "check" against to see if needs logging - crawled_urls = codecs.open("_crawled_urls.txt", "a", "utf-8-sig") - - -else: - os.mkdir(target_dir) # make a timestampted folder - os.chdir(target_dir) # then change directory to that folder - # Create a log file in the folder that was just created - logging.basicConfig(filename=('_uniscraperlog_' + curtime + '.log'),level=logging.INFO) - # file to log empty requests into - empty_request_log = codecs.open("_empty_requests.txt", "w", "utf-8-sig") - # file to log planned urls into - URLs in the queue, that are planned to go to next (checked against visited) - planned_urls = codecs.open("_planned_urls.txt", "w", "utf-8-sig") - plannedURLsArray = [] - # file to log visited urls into - URLs that have been requested and have the html - visited_urls = codecs.open("_visited_urls.txt", "w", "utf-8-sig") - # file to log crawled urls into - URLs that crawler will "check" against to see if needs logging - crawled_urls = codecs.open("_crawled_urls.txt", "w", "utf-8-sig") - crawledURLsArray = [] - page = 1 - - -dsize = 0 -depth = [dsize] - -# Function that checks if the link provided is in the same domain as the seed -def checkDomain(link): - link_domain = tldextract.extract(link) - return (link_domain.domain == seed) - - -# Fuction for requesting url -# Given a URL, go to that url and get the html and return it -# Called from main function -def request_url(url): - global headers - # Log that this URL is being saved - logging.info('Requesting ' + url) - visited_urls.write(url) - visited_urls.write("\n") - # Use requests module to get html from url as an object - try: - source_code = requests.get(url, headers=headers, timeout=1) # variable = requests.get(url) - # Get source code of page as text - html = source_code.text - except Timeout as e: - logging.warn('Connection timed out ' + url) - html = '' - except: - logging.warn('Couldn\'t request ' + url) - html = '' - return html - - -# Function to create a filename out of a string -# Called from create_name -def format_filename(name): - #Taken from: https://gist.github.com/seanh/93666 - """Take a string and return a valid filename constructed from the string. - Uses a whitelist approach: any characters not present in valid_chars are - removed. Also spaces are replaced with underscores.""" - valid_chars = "-_() %s%s" % (string.ascii_letters, string.digits) - filename = ''.join(c for c in name if c in valid_chars) - # Remove spaces in filename - filename = filename.strip() - filename = filename.replace(' ','_') - return filename - - -# Function for creating name -# Use the title of the html page as the title of the text file -# Called from main function -def create_name (html): - name_list = (html.partition("")[-1] - if name: - # removes invalid characters from title - name = format_filename(name) - logging.info('Created name ' + name) - else: - name = "no_title_" + str(time.time()) # if no title provided give a no title with a timestamp - logging.warn('Failed to create a name, using \'' + name + '\' instead') - return name - - -#Function for deleting paired single or double quotes -def dequote(s): - """ - If a string has single or double quotes around it, remove them. - Make sure the pair of quotes match. - If a matching pair of quotes is not found, return the string unchanged. 
- """ - if (len(s)>= 2 and s[0] == s[-1]) and s.startswith(("'", '"')): - return s[1:-1] - return s - - -#Function that takes link, saves the contents to text file call href_split -def test_split(max_pages): - logging.info("Crawling through domain '" + seed + "'") - - if page == 1: - # Array that holds the queue to be visited later - plannedURLsArray.append(url) - # Logging the urls - planned_urls.write(url) - planned_urls.write("\n") - - # Gets the root of the url - url_split = url.split("://", 1) - # Array that holds urls that have been found. - # This is the array that all new URLs are checked against to prevent repeating. - # Record URL with both http and https prefixes - crawledURLsArray.append("http://" + url_split[1]) - crawledURLsArray.append("https://" + url_split[1]) - # Also log the same into the text file - crawled_urls.write("http://" + url_split[1] + "\n") - crawled_urls.write("https://" + url_split[1] + "\n") - - # Create an array of queue size on each level of the tree. Used to stop the crawler at a certain depth. - # Alas, it appears to be broken... - while page <= max_pages and dsize <= depth_to_go and len(plannedURLsArray) > 0: - save_current_link() - - - -def save_current_link (): - global dsize - global page - - html = '' - # Try to get the html of the URL - try: - html = request_url(plannedURLsArray[0]) - except: - logging.warn('Error while requesting an html response ' + plannedURLsArray[0]) - - if html: - # Gets the name for the file to store the html text in - name = create_name(html) - # Adds the .txt to the end of the name - name = "{0}.txt".format(name) - try: - # Check if file with given name exists - if os.path.isfile(name): - # If exists, add timestamp to name to make it unique. - name = name[:name.find(".")] + "_" + str(time.time()) + ".txt" - - # Open/create the file with that name - fo = codecs.open(name, "w", "utf-8-sig") - # Write URL to that file - fo.write("\n") - # Append the html to the file - fo.write(html) - # Close the pipe to the file - fo.close() - # Log the creation of the file - logging.info('Created file ' + name) - - #find and process all links - process_links(html, plannedURLsArray[0]) - except: - logging.warning("Can not encode file: " + plannedURLsArray[0]) - # Else: html does not exist or is empty. Log error - else: - logging.warning('Request for ' + url + ' returned empty html') - empty_request_log.write(url) - empty_request_log.write("\n") - - print(plannedURLsArray[0]) - # Update on the depth it is at - print("depth:", dsize) - # Update on the total number of pages - print("iterations:", page, "pages") - print("\n") - # Deletes the currently looked at URL from the queue - plannedURLsArray.pop(0) - # Check if given depth has been reached - if page >= depth[dsize]: - depth.append(page + len(plannedURLsArray)) - dsize += 1 - - # Increment page count - page += 1 - # Checks the size of the folder. 
Prints the amount of data collected in GB to the console and log file - if page%10 == 0: - size_of_directory = get_tree_size(os.curdir) / 1000000000 - print("Size: ", str(round(size_of_directory, 5)), "GB") - print('\n') - logging.info("Size: " + str(round(size_of_directory, 5)) + "GB") - # Prints in the log file the length of time the crawler has been running in seconds - logging.info("Has been running for " + str(time.time() - start_time) + " seconds") - # Time delay in seconds to prevent crashing the server - time.sleep(.01) - - -#Function for splitting html into links -def href_split (html): - links = [] - if html.partition('')[0] - href = href.partition(' ')[0] - href = dequote(href) - links.append(href) - return links - - -#Take an array of links, run the split on each and add the results to the appropriate arrays and files -def process_links(html, cur_domain): - if html.partition('')[0] - href = href.partition(' ')[0] - href = dequote(href) - new_link = (urllib.parse.urldefrag(href)[0]).rstrip('/') - new_link = urllib.parse.urljoin(cur_domain, new_link) - if checkDomain(new_link): - # if the link is not in crawledURLsArray then it appends it to urls and crawledURLsArray - if new_link not in crawledURLsArray: - # Ensures no jpg or pdfs are stored and that no mailto: links are stored. - if new_link.startswith("http") and '.pdf' not in new_link and '.jpg' not in new_link: - #???TODO: add checks for www.domain.com and https:// - # Adds new link to array - plannedURLsArray.append(new_link) - # Adds new link to queue file - planned_urls.write(new_link) - planned_urls.write("\n") - - # Remove the front of the URL (http or https) - http_split = new_link.split("://", 1) - # Add all possible link variations to file of URLs that have been looked at - # Adds new link to array - crawledURLsArray.append("http://" + http_split[1]) - # Adds new link to already looked at file - crawled_urls.write("http://" + http_split[1]) - crawled_urls.write("\n") - # Adds new link to array - crawledURLsArray.append("https://" + http_split[1]) - # Adds new link to already looked at file - crawled_urls.write("https://" + http_split[1]) - crawled_urls.write("\n") - - -# Return total size of files in given path and subdirs by going through the tree. -# Recursive. 
-# Called from main function -def get_tree_size(path): - total = 0 - for entry in scandir(path): - if entry.is_dir(follow_symlinks=False): - total += get_tree_size(entry.path) - else: - total += entry.stat(follow_symlinks=False).st_size - return total - -# Get the time that the command was run -start_time = time.time() -# Call main function -"""trade_spider(iterate)""" -test_split(iterate) -# Get the time that the command finished -end_time = time.time() -# Print overall time taken to console -print("Overall time: " + str((end_time - start_time))) -# Log overall time and save to main log file -logging.info("Overall time: " + str((end_time - start_time))) -# Close all the things/pipes to files -empty_request_log.close() -visited_urls.close() -planned_urls.close() -crawled_urls.close() \ No newline at end of file From 518460949fc6cd7bd3d0ad8a8de0ebf62c53cd59 Mon Sep 17 00:00:00 2001 From: Antony Oduor Date: Mon, 18 Dec 2017 23:09:06 -0700 Subject: [PATCH 04/50] Integrate Keyword Search and CSV Serialization Serializes CSV file with frequency of keywords found on page Signed-off-by: Antony Oduor --- .../Crawler/crawlerExpand.py | 66 +++++++++++++++++-- Search-Engine-and-Crawler/Crawler/utils.py | 60 +++++++++++++++++ 2 files changed, 119 insertions(+), 7 deletions(-) create mode 100644 Search-Engine-and-Crawler/Crawler/utils.py diff --git a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py index e623fe1..59c3208 100644 --- a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py +++ b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py @@ -2,6 +2,7 @@ from bs4 import BeautifulSoup import urllib.parse import os.path +import os import sys import tldextract import time @@ -9,17 +10,26 @@ import string import shutil import re +from collections import Counter try: from os import scandir, walk except ImportError: from scandir import scandir, walk import logging +from utils import get_file_content_as_list, count_keywords, write_csv + #Pay attention to robots.txt # current time, used in the names of the folder and the logging file curtime = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime()) +# this file should live in the same directory as the script +keywords_file = "keywords.txt" + +# the output file of all observed keyword frequencies +csv_file_name = "results.csv" + # Arguments in order: url, total pages to look at, depth, first part of directory name # url to start from url = sys.argv[1] @@ -64,13 +74,13 @@ # Copy the contents of the existing directory to a new timestamped one shutil.copytree(directory, target_dir) os.chdir(target_dir) # then change directory to that folder - + # Open the visited_urls text file and count the number of lines in it – that's how many pages the script visited throughout its previous runs with open("_visited_urls.txt") as f: for i, l in enumerate(f, start=1): pass page = i - + # Open the file with planned urls and add them to the array of planned urls with open("_planned_urls.txt") as f: content = f.readlines() @@ -97,9 +107,11 @@ else: + current_dir = os.getcwd() # Start a new script run os.mkdir(target_dir) # make a timestampted folder os.chdir(target_dir) # then change directory to that folder + shutil.copyfile(current_dir + "/" + keywords_file, keywords_file) # jump into working directory # Create a log file in the folder that was just created logging.basicConfig(filename=('_uniscraperlog_' + curtime + '.log'),level=logging.INFO) # file to log empty requests into @@ -134,7 +146,7 @@ def request_url(url): try: r = 
requests.get(url, headers=headers) if r.ok: - if "text/html" in r.headers["content-type"]: + if "text/html" in r.headers["content-type"]: html = r.text return html return None @@ -145,7 +157,7 @@ def request_url(url): except Exception: logging.exception("Couldn\'t request " + url) return None - + # Function to create a filename out of a string # Called from create_name def format_filename(name): @@ -243,12 +255,13 @@ def process_current_link (): print(plannedURLsArray[0]) # Try to get the html of the URL html = request_url(plannedURLsArray[0]) + current_url = plannedURLsArray[0] if html: #if the request returned an html # Soupify # For now it soupifies the link regardless of the mode, because it uses soup later to extract visible text from the page soup = BeautifulSoup(html, 'html5lib') - + if mode=="no_soup": # Gets the name for the file to store the html text in name = create_name_from_html(html) @@ -264,6 +277,46 @@ def process_current_link (): # Find only visible text visible_text = extract_text(soup) + from pprint import pprint + # read keywords from file into a list + keywords = get_file_content_as_list(keywords_file) + # make the keywords lowercase + keywords = [x.lower() for x in keywords] + # make keywords dictionary with zero frequency as value + all_keywords = dict((el,0) for el in keywords) + + visible_text_list = visible_text.splitlines() + visible_text_list = [x.lower() for x in visible_text_list] + + # counts keywords in page + found_count, found_keywords = count_keywords(visible_text_list, keywords) + + found_keywords_freq_dict = Counter(found_keywords) + + all_keywords_dict = Counter(all_keywords) + # combine both dicts to have uniform dictionary for all pages + all_keywords_dict.update(found_keywords_freq_dict) + # after merging, sort the resulting dictionary based on keys to make + # a tuples list that is always uniform for every page + sorted_keywords_list = sorted(all_keywords_dict.items()) + + # create a sorted dictionary list + final_csv_dict = [] + final_csv_dict.append({x:y for x,y in sorted_keywords_list}) + + # extract a sorted list of keywords to write as CSV headers + headers = [str(x) for x, y in sorted_keywords_list] + # prepend url header onto the keywords list + headers.insert(0, u'url') + headers.insert(1, u'frequency_sum') + logging.info(headers) + + # prepend the current URL onto the frequencies dict object + final_csv_dict[0]['frequency_sum']= sum(final_csv_dict[0].values()) + final_csv_dict[0]['url']= current_url + + write_csv(csv_file_name, headers, final_csv_dict) + if visible_text: #save it as a text file try: # Create and open the file with that name @@ -419,7 +472,7 @@ def extract_text(soup): if t.strip(): all_text += t + "\n" return all_text - + # check that the given soup element is a visible text element # called from extract_text @@ -484,4 +537,3 @@ def shut_down(): shut_down() except Exception: logging.exception("Error while running script") - \ No newline at end of file diff --git a/Search-Engine-and-Crawler/Crawler/utils.py b/Search-Engine-and-Crawler/Crawler/utils.py new file mode 100644 index 0000000..c4d672d --- /dev/null +++ b/Search-Engine-and-Crawler/Crawler/utils.py @@ -0,0 +1,60 @@ +import re +import csv +import logging +import os + + +def get_file_content_as_list(file_name): + """Give a filename, open and read the contents into a list + file_name - file to be opened + return list of words + """ + with open(file_name, 'r') as file_name_handle: + return file_name_handle.read().splitlines() + + +def count_keywords(list_of_tokens, 
list_of_target_words): + """Counts how many instances of the keywords were found + list_of_tokens - The list of words as haystack + list_of_target_words - The list of words as needle + return number of words, list of keywords found + + Inspiration: http://www.cademuir.eu/blog/2011/10/20/python-searching-for-a-string-within-a-list-list-comprehension/ + """ + num_target_words = 0 + matched_words = [] + for token in list_of_target_words: # Goes through the tokens in the list + regex = re.compile(".*({}).*".format(token)) + # found_what = [m.group(0) for l in list_of_target_words for m in [regex.search(l)] if m] + found_what = [m.group(1) for l in list_of_tokens for m in [regex.search(l)] if m] + if len(found_what) > 0: # For each one it checks if it is in the target list + num_target_words += 1 + matched_words.append(token) + return num_target_words, matched_words # Note that we are returning a tuple (2 values) + + +def write_csv(output_file, keywords_header, keywords_x_freqs): + """Write a CSV file in the format url, , , , ... + output_file - the name of created CSV file + keywords_header - list with all the keywords to create header row of CSV + keywords_x_freqs - dictionary list with keywords and frequencies + return boolean + """ + try: + if os.path.exists(output_file): + append_write = 'a' # append if already exists + else: + append_write = 'w' # make a new file if not + + with open(output_file, append_write) as f: + # Using dictionary keys as fieldnames for the CSV file header + writer = csv.DictWriter(f, keywords_header) + if append_write == 'w': + writer.writeheader() + + for d in keywords_x_freqs: + writer.writerow(d) + return True + except Exception as e: + logging.error('Something bad happend while writing CSV:' + str(e)) + return False From c55082f47a9f408ff9e9243323b17ddc32706c6b Mon Sep 17 00:00:00 2001 From: Antony Oduor Date: Mon, 18 Dec 2017 23:47:24 -0700 Subject: [PATCH 05/50] Add README File For Crawler Signed-off-by: Antony Oduor --- Search-Engine-and-Crawler/Crawler/README.md | 18 ++++++++++++++++++ .../Crawler/crawlerExpand.py | 1 - 2 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 Search-Engine-and-Crawler/Crawler/README.md diff --git a/Search-Engine-and-Crawler/Crawler/README.md b/Search-Engine-and-Crawler/Crawler/README.md new file mode 100644 index 0000000..6eff4b5 --- /dev/null +++ b/Search-Engine-and-Crawler/Crawler/README.md @@ -0,0 +1,18 @@ +## Setting up python environment +``` +virtualenv -p python3.5 env3.5 +``` + +## Requirements +``` +pip install requests +pip install bs4 +``` + +### keywords +Create a `keywords.txt` file on this directory with a list of keywords to look for. Each keyword is on a new line. 
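For context, the per-page flow that keywords.txt feeds is roughly: read the keywords, count matches against the page's visible text, then append one row per page to results.csv. A simplified standalone sketch of that flow (not a drop-in for utils.count_keywords, which also returns the matched words):

```python
import csv
import os
import re
from collections import Counter

def count_keyword_hits(text_lines, keywords):
    """Count how many visible-text lines mention each keyword (case-insensitive)."""
    hits = Counter({kw: 0 for kw in keywords})
    for kw in keywords:
        pattern = re.compile(re.escape(kw), re.IGNORECASE)
        hits[kw] = sum(1 for line in text_lines if pattern.search(line))
    return hits

def append_result_row(csv_path, url, hits):
    """Append one CSV row per page: url, frequency_sum, then one column per keyword."""
    fieldnames = ['url', 'frequency_sum'] + sorted(hits)
    row = dict(hits)
    row['url'] = url
    row['frequency_sum'] = sum(hits.values())
    new_file = not os.path.exists(csv_path)
    with open(csv_path, 'a', newline='') as f:
        writer = csv.DictWriter(f, fieldnames)
        if new_file:
            writer.writeheader()
        writer.writerow(row)

# hits = count_keyword_hits(["Game design program overview", "Admission details"],
#                           ["game", "admission", "animation"])
# append_result_row("results.csv", "http://example.edu/programs", hits)
```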
+ +### Sample usage +``` +python crawlerExpand.py [URL] 10 50 lefolder +``` diff --git a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py index 59c3208..6eb75ff 100644 --- a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py +++ b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py @@ -1,7 +1,6 @@ import requests from bs4 import BeautifulSoup import urllib.parse -import os.path import os import sys import tldextract From 8bfa8a80f459cbf4b069ebdff92ee10adf7beb52 Mon Sep 17 00:00:00 2001 From: Antony Oduor Date: Tue, 19 Dec 2017 11:22:36 -0700 Subject: [PATCH 06/50] Also Export a Sorted Frequency CSV Signed-off-by: Antony Oduor --- Search-Engine-and-Crawler/Crawler/README.md | 1 + Search-Engine-and-Crawler/Crawler/crawlerExpand.py | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/Search-Engine-and-Crawler/Crawler/README.md b/Search-Engine-and-Crawler/Crawler/README.md index 6eff4b5..5e01a11 100644 --- a/Search-Engine-and-Crawler/Crawler/README.md +++ b/Search-Engine-and-Crawler/Crawler/README.md @@ -7,6 +7,7 @@ virtualenv -p python3.5 env3.5 ``` pip install requests pip install bs4 +pip install pandas ``` ### keywords diff --git a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py index 6eb75ff..a56ccfb 100644 --- a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py +++ b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py @@ -10,6 +10,7 @@ import shutil import re from collections import Counter +import pandas as pd try: from os import scandir, walk except ImportError: @@ -509,6 +510,11 @@ def shut_down(): global planned_urls global crawled_urls + df = pd.read_csv(csv_file_name) + df = df.sort_values(['frequency_sum'], ascending=[0]) + sorted_csv_file_name = "results_sorted.csv" + df.to_csv(sorted_csv_file_name, index=False) + # Get the time that the command finished end_time = time.time() # Print overall time taken to console From b4b875dde15bb1b88c8d43535cc498d5d3297a89 Mon Sep 17 00:00:00 2001 From: Antony Oduor Date: Tue, 19 Dec 2017 22:41:49 -0700 Subject: [PATCH 07/50] Update README With Dependencies Signed-off-by: Antony Oduor --- Search-Engine-and-Crawler/Crawler/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Search-Engine-and-Crawler/Crawler/README.md b/Search-Engine-and-Crawler/Crawler/README.md index 5e01a11..bdb70df 100644 --- a/Search-Engine-and-Crawler/Crawler/README.md +++ b/Search-Engine-and-Crawler/Crawler/README.md @@ -7,6 +7,8 @@ virtualenv -p python3.5 env3.5 ``` pip install requests pip install bs4 +pip install tldextract +pip install html5 pip install pandas ``` From 740efc0210d306199e9e4a30cc307a47f2bb8624 Mon Sep 17 00:00:00 2001 From: VITA Lab ReFiG Date: Wed, 20 Dec 2017 18:15:42 -0700 Subject: [PATCH 08/50] 1) Fixed the issue with the recursive urljoin 2) Added a filter_title_regex variable that allows users to add a regex to search in the title of every page to decide whether every link on the page should be scraped (e.g. in case of pages that contain links to all academic programs at the university) 3) Added keyword files to the main repository 4) Expanded the regex currently in use to filter links 5) Added a system that allows the scraper to go one domain away from the seed (but not further) to account for university media centers that have a separate web domain. 
--- .../Crawler/crawlerExpand.py | 65 ++++++++++++------- .../Crawler/keywords.txt | 25 +++++++ .../Crawler/keywords_game.txt | 25 +++++++ 3 files changed, 92 insertions(+), 23 deletions(-) create mode 100644 Search-Engine-and-Crawler/Crawler/keywords.txt create mode 100644 Search-Engine-and-Crawler/Crawler/keywords_game.txt diff --git a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py index a56ccfb..9b7c25d 100644 --- a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py +++ b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py @@ -43,7 +43,8 @@ # RegEx that is used to filter searches for URLs on any given page. #Used in is_relevant_link_from_soup and is_relevant_link_from_html functions -filter_regex = re.compile(".*([Pp]rogram|[Aa]dmission).*") +filter_regex = re.compile(".*([Pp]rogram|[Aa]dmission|[Cc]ertificate|[Dd]egree|[Dd]iploma|[Ff]aculty|[Ss]chool|[Dd]epartment).*") +filter_title_regex = re.compile(".*[Pp]rograms.*") # Var to choose mode # "soup" uses BeautifulSoup to assign a name to a page and to search the page for URLs @@ -61,7 +62,7 @@ headers = requests.utils.default_headers() headers.update ( { - 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0', + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36', } ) @@ -127,9 +128,20 @@ page = 1 # Function that checks if the link provided is in the same domain as the seed -def checkDomain(link): - link_domain = tldextract.extract(link) - return (link_domain.domain == seed) +def checkDomain(new_link, cur_link): + new_link_domain = tldextract.extract(new_link).domain + # 1) check if new_link is in seed, if yes -> OK + if (new_link_domain == seed): + return True + # 2) check if cur_link is in seed (you came from the seed even if you're in a different domain now), if yes -> OK + cur_link_domain = tldextract.extract(cur_link).domain + if (cur_link_domain == seed): + return True + # 3) check if the new link is in the same domain as the cur link (you're still in the same domain, even though it's different from seed), if yes -> OK + if (new_link_domain == cur_link_domain): + return True + # otherwise, you're trying to leave a domain that's already not the seed, you should STOP + return False # Fuction for requesting url @@ -147,8 +159,7 @@ def request_url(url): r = requests.get(url, headers=headers) if r.ok: if "text/html" in r.headers["content-type"]: - html = r.text - return html + return r return None except KeyboardInterrupt: print("\n\nScript interrupted by user. 
Shutting down.") @@ -248,28 +259,33 @@ def crawl(max_pages): process_current_link() +def is_title_page_relevant(soup): + return True if soup.find('title', string=filter_title_regex) else False + # Function that grabs the first link in the list of planned urls, requests the page and processes it def process_current_link (): global page print(plannedURLsArray[0]) # Try to get the html of the URL - html = request_url(plannedURLsArray[0]) - current_url = plannedURLsArray[0] + r = request_url(plannedURLsArray[0]) - if html: #if the request returned an html + if r: #if the request returned an html + html = r.text + current_url = r.url # Soupify # For now it soupifies the link regardless of the mode, because it uses soup later to extract visible text from the page soup = BeautifulSoup(html, 'html5lib') + grab_all = is_title_page_relevant(soup) if mode=="no_soup": # Gets the name for the file to store the html text in name = create_name_from_html(html) #find and process all links - process_links_from_html(html, plannedURLsArray[0]) + process_links_from_html(html, current_url, grab_all) else: name = create_name_from_soup(soup) - process_links_from_soup(soup, plannedURLsArray[0]) + process_links_from_soup(soup, current_url, grab_all) # Adds the .txt to the end of the name name = "{0}.txt".format(name) @@ -309,7 +325,7 @@ def process_current_link (): # prepend url header onto the keywords list headers.insert(0, u'url') headers.insert(1, u'frequency_sum') - logging.info(headers) + #logging.info(headers) # prepend the current URL onto the frequencies dict object final_csv_dict[0]['frequency_sum']= sum(final_csv_dict[0].values()) @@ -323,7 +339,7 @@ def process_current_link (): fo = codecs.open(name, "w", "utf-8-sig") # Write URL to that file fo.write("\n") # Append the html to the file fo.write(visible_text) @@ -337,7 +353,7 @@ def process_current_link (): logging.info("Script interrupted by user") shut_down() except Exception: - logging.exception("Can not encode file: " + plannedURLsArray[0]) + logging.exception("Can not encode file: " + current_url) else: print('No visible text in ' + url) logging.warning('No visible text in ' + url) @@ -379,13 +395,15 @@ def is_relevant_link_from_soup(link): # for each link checks if it's relevant # for each relevant link, saves it to the planned urls array (if it hasn't been crawled yet) # and to the crawled urls array (so that we don't save it a second time later) -def process_links_from_soup (soup, cur_domain): +def process_links_from_soup (soup, cur_link, grab_all=False): + # check if the title of the current page matches the filter_title_regex for lnk in soup.findAll('a', href=True): - if is_relevant_link_from_soup(lnk): + # if not, check if the the link itself is relevant + if (grab_all or is_relevant_link_from_soup(lnk)): new_link = (urllib.parse.urldefrag(lnk['href'])[0]).rstrip('/') - new_link = urllib.parse.urljoin(cur_domain, new_link) + new_link = urllib.parse.urljoin(cur_link, new_link) # if the link is in our main domain - if checkDomain(new_link): + if checkDomain(new_link, cur_link): # if the link is not in crawledURLsArray then it appends it to urls and crawledURLsArray if new_link not in crawledURLsArray: # Ensures no jpg or pdfs are stored and that no mailto: links are stored. 
@@ -420,19 +438,20 @@ def is_relevant_link_from_html(link): #return True #Uncomment to grab all links #Take an array of links, run the split on each and add the results to the appropriate arrays and files -def process_links_from_html (html, cur_domain): +def process_links_from_html (html, cur_link, grab_all=False): + print("grabbing all: ", str(grab_all)) if html.partition('')[0] href = href.partition(' ')[0] href = dequote(href) new_link = (urllib.parse.urldefrag(href)[0]).rstrip('/') - new_link = urllib.parse.urljoin(cur_domain, new_link) - if checkDomain(new_link): + new_link = urllib.parse.urljoin(cur_link, new_link) + if checkDomain(new_link, cur_link): # if the link is not in crawledURLsArray then it appends it to urls and crawledURLsArray if new_link not in crawledURLsArray: # Ensures no jpg or pdfs are stored and that no mailto: links are stored. diff --git a/Search-Engine-and-Crawler/Crawler/keywords.txt b/Search-Engine-and-Crawler/Crawler/keywords.txt new file mode 100644 index 0000000..f6dea1b --- /dev/null +++ b/Search-Engine-and-Crawler/Crawler/keywords.txt @@ -0,0 +1,25 @@ +game +game studies +games studies +game design +game research +application +digital +media +communication +visual +level +animation +programming +illustration +interactive +mobile +development +design +3D +scripting +computational +entertainment +graphic +storytelling +virtual diff --git a/Search-Engine-and-Crawler/Crawler/keywords_game.txt b/Search-Engine-and-Crawler/Crawler/keywords_game.txt new file mode 100644 index 0000000..bb0203f --- /dev/null +++ b/Search-Engine-and-Crawler/Crawler/keywords_game.txt @@ -0,0 +1,25 @@ +Games +Games studies +game programming +game engine scripting +Game Design +game writing +game production +games and learning +game research +games and society +Game studies +game(s) studies +game analysis +game critique +gaming +game art +game AI +video games +computer games +console games +mobile games +web games +mobile game +serious games +critical games From cd2f431d0ae70f0877122c3d87aa8760a1b2e01e Mon Sep 17 00:00:00 2001 From: VITA Lab ReFiG Date: Wed, 20 Dec 2017 19:50:10 -0700 Subject: [PATCH 09/50] Added a list of popular domains we don't want to crawl --- Search-Engine-and-Crawler/Crawler/crawlerExpand.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py index 9b7c25d..3099f70 100644 --- a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py +++ b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py @@ -30,6 +30,10 @@ # the output file of all observed keyword frequencies csv_file_name = "results.csv" +# this should be a file input later but for now it's an array +# an array of popular domains that university websites link to but we don't want to crawl +ignore_domains = ["youtube", "facebook", "instagram", "twitter", "linkedin", "google", "pinterest", "snapchat"] + # Arguments in order: url, total pages to look at, depth, first part of directory name # url to start from url = sys.argv[1] @@ -130,6 +134,9 @@ # Function that checks if the link provided is in the same domain as the seed def checkDomain(new_link, cur_link): new_link_domain = tldextract.extract(new_link).domain + # 0) check whether new_link is in the list of popular domains that we don't want to crawl, if yes -> IGNORE IT + if new_link_domain in ignore_domains: + return False # 1) check if new_link is in seed, if yes -> OK if (new_link_domain == seed): return True @@ -545,6 +552,7 @@ def shut_down(): 
visited_urls.close() planned_urls.close() crawled_urls.close() + sys.exit() From de1b02ef77858d20e97f957b54b81912ddb23e89 Mon Sep 17 00:00:00 2001 From: Antony Oduor Date: Thu, 21 Dec 2017 12:28:47 -0700 Subject: [PATCH 10/50] Return Correct Number of Keywords Found Previously only returned one or zero, one meant that the word was found in the document but not how many in total. --- Search-Engine-and-Crawler/Crawler/crawlerExpand.py | 3 ++- Search-Engine-and-Crawler/Crawler/utils.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py index 3099f70..9114fa0 100644 --- a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py +++ b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py @@ -313,8 +313,9 @@ def process_current_link (): # counts keywords in page found_count, found_keywords = count_keywords(visible_text_list, keywords) + found_keywords_as_dict = dict((x, y) for x, y in found_keywords) - found_keywords_freq_dict = Counter(found_keywords) + found_keywords_freq_dict = Counter(found_keywords_as_dict) all_keywords_dict = Counter(all_keywords) # combine both dicts to have uniform dictionary for all pages diff --git a/Search-Engine-and-Crawler/Crawler/utils.py b/Search-Engine-and-Crawler/Crawler/utils.py index c4d672d..bed6f84 100644 --- a/Search-Engine-and-Crawler/Crawler/utils.py +++ b/Search-Engine-and-Crawler/Crawler/utils.py @@ -28,8 +28,8 @@ def count_keywords(list_of_tokens, list_of_target_words): # found_what = [m.group(0) for l in list_of_target_words for m in [regex.search(l)] if m] found_what = [m.group(1) for l in list_of_tokens for m in [regex.search(l)] if m] if len(found_what) > 0: # For each one it checks if it is in the target list - num_target_words += 1 - matched_words.append(token) + num_target_words = len(found_what) + matched_words.append((token, num_target_words)) return num_target_words, matched_words # Note that we are returning a tuple (2 values) From 9dda4156665996b66f6a4395087c6e7bba700fb6 Mon Sep 17 00:00:00 2001 From: Antony Oduor Date: Wed, 10 Jan 2018 13:07:04 -0700 Subject: [PATCH 11/50] Update Game Keywords, Add Error Handler For Failed http_split Also update the regex utility for matching keywords --- .../Crawler/crawlerExpand.py | 33 ++++++++++--------- .../Crawler/keywords_game.txt | 13 ++++++++ Search-Engine-and-Crawler/Crawler/utils.py | 4 ++- 3 files changed, 34 insertions(+), 16 deletions(-) diff --git a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py index 9114fa0..06069fa 100644 --- a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py +++ b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py @@ -25,7 +25,7 @@ curtime = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime()) # this file should live in the same directory as the script -keywords_file = "keywords.txt" +keywords_file = "keywords_game.txt" # the output file of all observed keyword frequencies csv_file_name = "results.csv" @@ -48,7 +48,7 @@ # RegEx that is used to filter searches for URLs on any given page. 
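The two fixes above (PATCH 10's per-keyword counts and PATCH 11's word-boundary regex) combine as shown in this rough sketch. It is not the exact count_keywords implementation, just an illustration of the intended behaviour: keywords match whole words only, each hit comes back as a (keyword, count) pair, and the pairs are folded into a Counter seeded with every keyword at zero.

```python
import re
from collections import Counter

def count_keywords(lines, keywords):
    matched = []
    total = 0
    for kw in keywords:
        # \b boundaries: "game" no longer matches inside "endgame"
        pattern = re.compile(r"\b{}\b".format(re.escape(kw)))
        hits = sum(len(pattern.findall(line)) for line in lines)
        if hits:
            matched.append((kw, hits))
            total += hits
    return total, matched

lines = ["the game design program", "endgame studies", "game jams and game art"]
keywords = ["game", "game design", "studies"]

found_count, found_keywords = count_keywords(lines, keywords)
page_counts = Counter(dict.fromkeys(keywords, 0))   # every keyword starts at 0
page_counts.update(Counter(dict(found_keywords)))   # add this page's hits
print(found_count, sorted(page_counts.items()))
# 5 [('game', 3), ('game design', 1), ('studies', 1)]
```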
#Used in is_relevant_link_from_soup and is_relevant_link_from_html functions filter_regex = re.compile(".*([Pp]rogram|[Aa]dmission|[Cc]ertificate|[Dd]egree|[Dd]iploma|[Ff]aculty|[Ss]chool|[Dd]epartment).*") -filter_title_regex = re.compile(".*[Pp]rograms.*") +filter_title_regex = re.compile(".*([Pp]rogram|[Aa]dmission|[Cc]ourse).*") # Var to choose mode # "soup" uses BeautifulSoup to assign a name to a page and to search the page for URLs @@ -471,19 +471,22 @@ def process_links_from_html (html, cur_link, grab_all=False): planned_urls.write(new_link) planned_urls.write("\n") - # Remove the front of the URL (http or https) - http_split = new_link.split("://", 1) - # Add all possible link variations to file of URLs that have been looked at - # Adds new link to array - crawledURLsArray.append("http://" + http_split[1]) - # Adds new link to already looked at file - crawled_urls.write("http://" + http_split[1]) - crawled_urls.write("\n") - # Adds new link to array - crawledURLsArray.append("https://" + http_split[1]) - # Adds new link to already looked at file - crawled_urls.write("https://" + http_split[1]) - crawled_urls.write("\n") + try: + # Remove the front of the URL (http or https) + http_split = new_link.split("://", 1) + # Add all possible link variations to file of URLs that have been looked at + # Adds new link to array + crawledURLsArray.append("http://" + http_split[1]) + # Adds new link to already looked at file + crawled_urls.write("http://" + http_split[1]) + crawled_urls.write("\n") + # Adds new link to array + crawledURLsArray.append("https://" + http_split[1]) + # Adds new link to already looked at file + crawled_urls.write("https://" + http_split[1]) + crawled_urls.write("\n") + except IndexError as e: + logging.info(str(e)) diff --git a/Search-Engine-and-Crawler/Crawler/keywords_game.txt b/Search-Engine-and-Crawler/Crawler/keywords_game.txt index bb0203f..d80a48a 100644 --- a/Search-Engine-and-Crawler/Crawler/keywords_game.txt +++ b/Search-Engine-and-Crawler/Crawler/keywords_game.txt @@ -1,4 +1,5 @@ Games +Game studies Games studies game programming game engine scripting @@ -23,3 +24,15 @@ web games mobile game serious games critical games +game +digital +media +communication +visual +animation +interactive +3D +entertainment +graphic +storytelling +virtual diff --git a/Search-Engine-and-Crawler/Crawler/utils.py b/Search-Engine-and-Crawler/Crawler/utils.py index bed6f84..020efa9 100644 --- a/Search-Engine-and-Crawler/Crawler/utils.py +++ b/Search-Engine-and-Crawler/Crawler/utils.py @@ -20,11 +20,13 @@ def count_keywords(list_of_tokens, list_of_target_words): return number of words, list of keywords found Inspiration: http://www.cademuir.eu/blog/2011/10/20/python-searching-for-a-string-within-a-list-list-comprehension/ + https://developmentality.wordpress.com/2011/09/22/python-gotcha-word-boundaries-in-regular-expressions/ """ num_target_words = 0 matched_words = [] for token in list_of_target_words: # Goes through the tokens in the list - regex = re.compile(".*({}).*".format(token)) + # regex = re.compile(".*({}).*".format(token)) # does match in-word substrings + regex = re.compile(".*(\\b{}\\b).*".format(token)) # match strictly whole words only # found_what = [m.group(0) for l in list_of_target_words for m in [regex.search(l)] if m] found_what = [m.group(1) for l in list_of_tokens for m in [regex.search(l)] if m] if len(found_what) > 0: # For each one it checks if it is in the target list From 82211e003a032eade9ccd914b6f7cf76536ef352 Mon Sep 17 00:00:00 2001 From: Antony 
Oduor Date: Fri, 12 Jan 2018 17:47:23 -0700 Subject: [PATCH 12/50] Update games keyword list --- Search-Engine-and-Crawler/Crawler/keywords_game.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Search-Engine-and-Crawler/Crawler/keywords_game.txt b/Search-Engine-and-Crawler/Crawler/keywords_game.txt index d80a48a..86a6636 100644 --- a/Search-Engine-and-Crawler/Crawler/keywords_game.txt +++ b/Search-Engine-and-Crawler/Crawler/keywords_game.txt @@ -16,7 +16,7 @@ game critique gaming game art game AI -video games +video game computer games console games mobile games @@ -27,7 +27,6 @@ critical games game digital media -communication visual animation interactive From a37745443363a08a1f2535e7e8cbf1e072332eb2 Mon Sep 17 00:00:00 2001 From: Antony Oduor Date: Fri, 12 Jan 2018 18:01:38 -0700 Subject: [PATCH 13/50] Remove All CSV Activities From Scraper This is to retain scraping as it was and implement another script to generate the CSV with keyword frequencies. The separation of concerns is to allow downloading pages once but able search mutliple times locally. Signed-off-by: Antony Oduor --- .../Crawler/crawlerExpand.py | 54 +------------------ 1 file changed, 1 insertion(+), 53 deletions(-) diff --git a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py index 06069fa..e11c3dc 100644 --- a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py +++ b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py @@ -9,16 +9,13 @@ import string import shutil import re -from collections import Counter -import pandas as pd + try: from os import scandir, walk except ImportError: from scandir import scandir, walk import logging -from utils import get_file_content_as_list, count_keywords, write_csv - #Pay attention to robots.txt # current time, used in the names of the folder and the logging file @@ -27,9 +24,6 @@ # this file should live in the same directory as the script keywords_file = "keywords_game.txt" -# the output file of all observed keyword frequencies -csv_file_name = "results.csv" - # this should be a file input later but for now it's an array # an array of popular domains that university websites link to but we don't want to crawl ignore_domains = ["youtube", "facebook", "instagram", "twitter", "linkedin", "google", "pinterest", "snapchat"] @@ -300,47 +294,6 @@ def process_current_link (): # Find only visible text visible_text = extract_text(soup) - from pprint import pprint - # read keywords from file into a list - keywords = get_file_content_as_list(keywords_file) - # make the keywords lowercase - keywords = [x.lower() for x in keywords] - # make keywords dictionary with zero frequency as value - all_keywords = dict((el,0) for el in keywords) - - visible_text_list = visible_text.splitlines() - visible_text_list = [x.lower() for x in visible_text_list] - - # counts keywords in page - found_count, found_keywords = count_keywords(visible_text_list, keywords) - found_keywords_as_dict = dict((x, y) for x, y in found_keywords) - - found_keywords_freq_dict = Counter(found_keywords_as_dict) - - all_keywords_dict = Counter(all_keywords) - # combine both dicts to have uniform dictionary for all pages - all_keywords_dict.update(found_keywords_freq_dict) - # after merging, sort the resulting dictionary based on keys to make - # a tuples list that is always uniform for every page - sorted_keywords_list = sorted(all_keywords_dict.items()) - - # create a sorted dictionary list - final_csv_dict = [] - final_csv_dict.append({x:y for x,y in 
sorted_keywords_list}) - - # extract a sorted list of keywords to write as CSV headers - headers = [str(x) for x, y in sorted_keywords_list] - # prepend url header onto the keywords list - headers.insert(0, u'url') - headers.insert(1, u'frequency_sum') - #logging.info(headers) - - # prepend the current URL onto the frequencies dict object - final_csv_dict[0]['frequency_sum']= sum(final_csv_dict[0].values()) - final_csv_dict[0]['url']= current_url - - write_csv(csv_file_name, headers, final_csv_dict) - if visible_text: #save it as a text file try: # Create and open the file with that name @@ -540,11 +493,6 @@ def shut_down(): global planned_urls global crawled_urls - df = pd.read_csv(csv_file_name) - df = df.sort_values(['frequency_sum'], ascending=[0]) - sorted_csv_file_name = "results_sorted.csv" - df.to_csv(sorted_csv_file_name, index=False) - # Get the time that the command finished end_time = time.time() # Print overall time taken to console From 5e4b05b43a0703ec40eb0877a43aeec996bc83f3 Mon Sep 17 00:00:00 2001 From: Antony Oduor Date: Fri, 12 Jan 2018 22:59:35 -0700 Subject: [PATCH 14/50] Remove page_url Markup in Text Content This is to make extracting URL faster from the saved files when consuming the local pages. --- Search-Engine-and-Crawler/Crawler/crawlerExpand.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py index e11c3dc..2af014f 100644 --- a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py +++ b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py @@ -299,9 +299,7 @@ def process_current_link (): # Create and open the file with that name fo = codecs.open(name, "w", "utf-8-sig") # Write URL to that file - fo.write("\n") + fo.write(current_url + "\n") # Append the html to the file fo.write(visible_text) # Close the pipe to the file From fa03f0a530a1b369b1cab13d8aff391f60e03426 Mon Sep 17 00:00:00 2001 From: Antony Oduor Date: Fri, 12 Jan 2018 23:23:12 -0700 Subject: [PATCH 15/50] Initial Keyword Frequency CSV Script This is command line tool run separately on the folder to search for keywords in scraped files. --- Search-Engine-and-Crawler/Crawler/utils.py | 167 ++++++++++++++++++++- 1 file changed, 162 insertions(+), 5 deletions(-) mode change 100644 => 100755 Search-Engine-and-Crawler/Crawler/utils.py diff --git a/Search-Engine-and-Crawler/Crawler/utils.py b/Search-Engine-and-Crawler/Crawler/utils.py old mode 100644 new mode 100755 index 020efa9..f3d654e --- a/Search-Engine-and-Crawler/Crawler/utils.py +++ b/Search-Engine-and-Crawler/Crawler/utils.py @@ -1,7 +1,152 @@ +#!/usr/bin/env python +import argparse +import sys import re import csv import logging import os +import glob +from collections import Counter +import pandas as pd +from tqdm import tqdm + + +logger = logging.getLogger(__name__) + + +def main(): + + parser = argparse.ArgumentParser( + description='Generate a sorted CSV file with keyword frequencies' + ' from scraped web pages.' 
+ ) + + parser.add_argument( + '-f', + '--folder', + dest='folder_name', + default=None, + required=True, + help='Name of directory with scraped pages (mandatory)' + ) + parser.add_argument( + '-k', + '--keywords_file', + dest='keywords_file', + default=None, + required=True, + help='File with keywords to search for in the directory (mandatory)' + ) + + args = parser.parse_args() + folder_name = args.folder_name + keywords_file = args.keywords_file + + current_working_dir = os.getcwd() # current directory we are standing on + # the output files of all observed keyword frequencies + csv_file_name = "{}_results.csv".format(folder_name) + sorted_csv_file_name = "{}_results_sorted.csv".format(folder_name) + + # with every run, remove any older result CSVs for the folder + try: + os.remove(csv_file_name) + os.remove(sorted_csv_file_name) + except FileNotFoundError as e: + pass + + # given the name of the folder, this gets all the saved page files as + # a list + all_txt_files = glob.glob( + os.path.join(current_working_dir, + "{}_*/*.*.txt".format(folder_name)), + recursive=False + ) + + # Not a good sign if list if empty... + if not all_txt_files: + logger.error("{}: Folder is empty or does not exist.". + format(folder_name)) + sys.exit() + + # read keywords from file into a list + keywords = get_file_content_as_list(keywords_file) + # make the keywords lowercase + keywords = [x.lower() for x in keywords] + # make keywords dictionary with zero frequency as value + all_keywords = dict((strip_weights(el)[0], 0) for el in keywords) + + pbar = tqdm(total=len(all_txt_files)) + tqdm.write("Found {} files to search. Please wait.". + format(len(all_txt_files))) + for idx, txt_file in enumerate(all_txt_files): + with open(txt_file) as fp: + visible_text_list = fp.readlines() + current_url = visible_text_list[0].strip().rstrip() + num_digits = len(str(len(all_txt_files))) + tqdm.write("{0:0{width}d}) Done! {1}". + format(idx+1, current_url, width=num_digits)) + + visible_text_list = [x.lower() for x in visible_text_list] + + # counts keywords in page + found_count, found_keywords = count_keywords( + visible_text_list, + keywords + ) + found_keywords_as_dict = dict((x, y) for x, y in found_keywords) + + found_keywords_freq_dict = Counter(found_keywords_as_dict) + + all_keywords_dict = Counter(all_keywords) + # combine both dicts to have uniform dictionary for all pages + all_keywords_dict.update(found_keywords_freq_dict) + # after merging, sort the resulting dictionary based on keys to + # make a tuples list that is always uniform for every page + sorted_keywords_list = sorted(all_keywords_dict.items()) + + # create a sorted dictionary list + final_csv_dict = [] + final_csv_dict.append({x: y for x, y in sorted_keywords_list}) + + # extract a sorted list of keywords to write as CSV headers + headers = [str(x) for x, y in sorted_keywords_list] + # prepend url header onto the keywords list + headers.insert(0, u'url') + headers.insert(1, u'frequency_sum') + # logger.info(headers) + + # prepend the current URL onto the frequencies dict object + final_csv_dict[0]['frequency_sum'] = sum(final_csv_dict[0].values()) + final_csv_dict[0]['url'] = current_url + + write_csv(csv_file_name, headers, final_csv_dict) + pbar.update(1) + + sort_csv(csv_file_name, sorted_csv_file_name) + pbar.close() + + +def sort_csv(csv_input, csv_output): + """Uses pandas to sort the CSV from the highest frequency + summation to the lowest. 
+ """ + df = pd.read_csv(csv_input) + df = df.sort_values(['frequency_sum'], ascending=[0]) + df.to_csv(csv_output, index=False) + + +def strip_weights(token): + """Extracts the weights from keywords from the file + Return keyword and assigned weight if any otherwise default weight one + """ + try: + weighted_token = token.split("|", 1)[0].strip() + token_weight = token.split("|", 1)[1] + except IndexError as e: # catch IndexError since no weight is observed + weighted_token = token.strip() + token_weight = 1 + + return weighted_token, token_weight def get_file_content_as_list(file_name): @@ -25,13 +170,15 @@ def count_keywords(list_of_tokens, list_of_target_words): num_target_words = 0 matched_words = [] for token in list_of_target_words: # Goes through the tokens in the list + weighted_token, token_weight = strip_weights(token) + # regex = re.compile(".*({}).*".format(token)) # does match in-word substrings - regex = re.compile(".*(\\b{}\\b).*".format(token)) # match strictly whole words only + regex = re.compile(".*(\\b{}\\b).*".format(weighted_token)) # match strictly whole words only # found_what = [m.group(0) for l in list_of_target_words for m in [regex.search(l)] if m] found_what = [m.group(1) for l in list_of_tokens for m in [regex.search(l)] if m] if len(found_what) > 0: # For each one it checks if it is in the target list - num_target_words = len(found_what) - matched_words.append((token, num_target_words)) + num_target_words = len(found_what)*int(token_weight) + matched_words.append((weighted_token, num_target_words)) return num_target_words, matched_words # Note that we are returning a tuple (2 values) @@ -48,7 +195,7 @@ def write_csv(output_file, keywords_header, keywords_x_freqs): else: append_write = 'w' # make a new file if not - with open(output_file, append_write) as f: + with open(output_file, append_write, encoding="utf-8") as f: # Using dictionary keys as fieldnames for the CSV file header writer = csv.DictWriter(f, keywords_header) if append_write == 'w': @@ -58,5 +205,15 @@ def write_csv(output_file, keywords_header, keywords_x_freqs): writer.writerow(d) return True except Exception as e: - logging.error('Something bad happend while writing CSV:' + str(e)) + logger.error('Something bad happend while writing CSV:' + str(e)) return False + +if __name__ == "__main__": + try: + main() + except KeyboardInterrupt as e: + logger.info("Script interrupted by user") + try: + sys.exit(0) + except SystemExit: + os._exit(0) From 71d7c9f98f10e0f9221449859c0fcc4708f57043 Mon Sep 17 00:00:00 2001 From: Antony Oduor Date: Fri, 12 Jan 2018 23:27:53 -0700 Subject: [PATCH 16/50] Rename utils to search --- Search-Engine-and-Crawler/Crawler/{utils.py => search.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Search-Engine-and-Crawler/Crawler/{utils.py => search.py} (100%) diff --git a/Search-Engine-and-Crawler/Crawler/utils.py b/Search-Engine-and-Crawler/Crawler/search.py similarity index 100% rename from Search-Engine-and-Crawler/Crawler/utils.py rename to Search-Engine-and-Crawler/Crawler/search.py From 25db775655527e5cf62f6f212e6e1c97af2c5200 Mon Sep 17 00:00:00 2001 From: Antony Oduor Date: Fri, 12 Jan 2018 23:46:46 -0700 Subject: [PATCH 17/50] Rearrange the Module to Allow Graceful Shutdown Add Keyboard interruption handling to sort results until that point --- Search-Engine-and-Crawler/Crawler/search.py | 65 +++++++++++---------- 1 file changed, 35 insertions(+), 30 deletions(-) diff --git a/Search-Engine-and-Crawler/Crawler/search.py 
b/Search-Engine-and-Crawler/Crawler/search.py index f3d654e..ca46d06 100755 --- a/Search-Engine-and-Crawler/Crawler/search.py +++ b/Search-Engine-and-Crawler/Crawler/search.py @@ -16,36 +16,7 @@ def main(): - parser = argparse.ArgumentParser( - description='Generate a sorted CSV file with keyword frequencies' - ' from scraped web pages.' - ) - - parser.add_argument( - '-f', - '--folder', - dest='folder_name', - default=None, - required=True, - help='Name of directory with scraped pages (mandatory)' - ) - parser.add_argument( - '-k', - '--keywords_file', - dest='keywords_file', - default=None, - required=True, - help='File with keywords to search for in the directory (mandatory)' - ) - - args = parser.parse_args() - folder_name = args.folder_name - keywords_file = args.keywords_file - current_working_dir = os.getcwd() # current directory we are standing on - # the output files of all observed keyword frequencies - csv_file_name = "{}_results.csv".format(folder_name) - sorted_csv_file_name = "{}_results_sorted.csv".format(folder_name) # with every run, remove any older result CSVs for the folder try: @@ -122,8 +93,8 @@ def main(): write_csv(csv_file_name, headers, final_csv_dict) pbar.update(1) - sort_csv(csv_file_name, sorted_csv_file_name) pbar.close() + sort_csv(csv_file_name, sorted_csv_file_name) def sort_csv(csv_input, csv_output): @@ -209,10 +180,44 @@ def write_csv(output_file, keywords_header, keywords_x_freqs): return False if __name__ == "__main__": + + parser = argparse.ArgumentParser( + description='Generate a sorted CSV file with keyword frequencies' + ' from scraped web pages.' + ) + + parser.add_argument( + '-f', + '--folder', + dest='folder_name', + default=None, + required=True, + help='Name of directory with scraped pages (mandatory)' + ) + parser.add_argument( + '-k', + '--keywords_file', + dest='keywords_file', + default=None, + required=True, + help='File with keywords to search for in the directory (mandatory)' + ) + + # these are module global variables and can be access by any function in + # this module + args = parser.parse_args() + folder_name = args.folder_name + keywords_file = args.keywords_file + + # the output files of all observed keyword frequencies + csv_file_name = "{}_results.csv".format(folder_name) + sorted_csv_file_name = "{}_results_sorted.csv".format(folder_name) + try: main() except KeyboardInterrupt as e: logger.info("Script interrupted by user") + sort_csv(csv_file_name, sorted_csv_file_name) try: sys.exit(0) except SystemExit: From c1c5def344fb619a161ae3f48856c5c29424b163 Mon Sep 17 00:00:00 2001 From: Antony Oduor Date: Fri, 12 Jan 2018 23:57:33 -0700 Subject: [PATCH 18/50] Update README to Include Search Script Usage Signed-off-by: Antony Oduor --- Search-Engine-and-Crawler/Crawler/README.md | 29 ++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/Search-Engine-and-Crawler/Crawler/README.md b/Search-Engine-and-Crawler/Crawler/README.md index bdb70df..5e171b9 100644 --- a/Search-Engine-and-Crawler/Crawler/README.md +++ b/Search-Engine-and-Crawler/Crawler/README.md @@ -10,12 +10,39 @@ pip install bs4 pip install tldextract pip install html5 pip install pandas +pip install tqdm ``` ### keywords Create a `keywords.txt` file on this directory with a list of keywords to look for. Each keyword is on a new line. -### Sample usage +## Organization + +There are two scripts. The `crawlerExpand.py` and `search.py`. 
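The shutdown behaviour PATCH 17 introduces boils down to the pattern sketched below: argument handling and file names live at module level, and a Ctrl+C still produces a sorted CSV from whatever was written before the interruption. File names here are demo placeholders, not the ones search.py derives from the folder name.

```python
import sys
import pandas as pd

def sort_csv(csv_input, csv_output):
    # Same idea as search.py's sort_csv: order rows by frequency_sum, highest first.
    df = pd.read_csv(csv_input)
    df.sort_values(["frequency_sum"], ascending=False).to_csv(csv_output, index=False)

def main():
    # Stand-in for the long search loop that appends one row per scraped page.
    pd.DataFrame(
        [{"url": "https://example.edu/a", "frequency_sum": 3},
         {"url": "https://example.edu/b", "frequency_sum": 9}]
    ).to_csv("demo_results.csv", index=False)
    sort_csv("demo_results.csv", "demo_results_sorted.csv")

if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl+C: sort whatever partial results exist, then exit cleanly.
        sort_csv("demo_results.csv", "demo_results_sorted.csv")
        sys.exit(0)
```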
+ + +### crawlerExpand.py + +This script collects pages from the given website and stores them locally on your +machine. + +#### Sample usage ``` python crawlerExpand.py [URL] 10 50 lefolder ``` + +### search.py + +This script allows you to search the pages you have collected above using keywords +and generates a Comma Separated Values (CSV) file with all the keywords found, +their frequency and sorted. + +#### Sample usage +``` +./search.py -f myuni -k keywords_game.txt +``` + +#### Help documentation +``` +./search -h +``` From d39131f1a98afe136f6c52ec27f8fba28dfc6e67 Mon Sep 17 00:00:00 2001 From: Antony Oduor Date: Fri, 12 Jan 2018 23:58:56 -0700 Subject: [PATCH 19/50] fixup! Update README to Include Search Script Usage --- Search-Engine-and-Crawler/Crawler/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Search-Engine-and-Crawler/Crawler/README.md b/Search-Engine-and-Crawler/Crawler/README.md index 5e171b9..729fa86 100644 --- a/Search-Engine-and-Crawler/Crawler/README.md +++ b/Search-Engine-and-Crawler/Crawler/README.md @@ -44,5 +44,5 @@ their frequency and sorted. #### Help documentation ``` -./search -h +./search.py -h ``` From 29ed8eb742b4dc1326a4f46182be6b4f66ec486c Mon Sep 17 00:00:00 2001 From: Antony Oduor Date: Sat, 13 Jan 2018 10:18:16 -0700 Subject: [PATCH 20/50] Allow Fuzzy or Absolute Folder Name to Search Into Putting the full folder name including the timestamp or just the prefix used in scraping. The latter method is recommended in searching across multiple folders scraped. e.g. myuni_2018-01-13-07-00-11 myuni_2018-01-13-12-23-04 will all be searched into. Signed-off-by: Antony Oduor --- Search-Engine-and-Crawler/Crawler/search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Search-Engine-and-Crawler/Crawler/search.py b/Search-Engine-and-Crawler/Crawler/search.py index ca46d06..c82765c 100755 --- a/Search-Engine-and-Crawler/Crawler/search.py +++ b/Search-Engine-and-Crawler/Crawler/search.py @@ -29,7 +29,7 @@ def main(): # a list all_txt_files = glob.glob( os.path.join(current_working_dir, - "{}_*/*.*.txt".format(folder_name)), + "{}*/*.*.txt".format(folder_name)), recursive=False ) From 29bdc2bfa3da636ce8c8a70cb702424595595f64 Mon Sep 17 00:00:00 2001 From: Evgeniya Kuznetsova Date: Sat, 13 Jan 2018 11:27:14 -0700 Subject: [PATCH 21/50] Removed going to domains other than the seed --- Search-Engine-and-Crawler/Crawler/crawlerExpand.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py index 2af014f..13b1602 100644 --- a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py +++ b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py @@ -128,12 +128,17 @@ # Function that checks if the link provided is in the same domain as the seed def checkDomain(new_link, cur_link): new_link_domain = tldextract.extract(new_link).domain + + """Decided to not do the can-go-one-domain-away-from-the-seed rule for now. Commented it out. 
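The fuzzy folder matching from PATCH 20 hinges on one glob pattern. A small sketch of how passing only the prefix picks up every timestamped scrape folder (the "myuni" prefix is the example from the commit message):

```python
import glob
import os

folder_name = "myuni"  # prefix only; a full folder name such as myuni_2018-01-13-07-00-11 also works
all_txt_files = glob.glob(
    os.path.join(os.getcwd(), "{}*/*.*.txt".format(folder_name)),
    recursive=False,
)
print(len(all_txt_files), "saved pages found")
```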
# 0) check whether new_link is in the list of popular domains that we don't want to crawl, if yes -> IGNORE IT if new_link_domain in ignore_domains: return False + """ # 1) check if new_link is in seed, if yes -> OK if (new_link_domain == seed): return True + + """ # 2) check if cur_link is in seed (you came from the seed even if you're in a different domain now), if yes -> OK cur_link_domain = tldextract.extract(cur_link).domain if (cur_link_domain == seed): @@ -142,6 +147,7 @@ def checkDomain(new_link, cur_link): if (new_link_domain == cur_link_domain): return True # otherwise, you're trying to leave a domain that's already not the seed, you should STOP + """ return False From 25893507822d06f6bb0a1b6de659fbddb16a6a9e Mon Sep 17 00:00:00 2001 From: Antony Oduor Date: Sat, 13 Jan 2018 18:54:07 -0700 Subject: [PATCH 22/50] Reduce CSV Writer Overhead for Faster Processing Now opens the CSV file once for appending instead of opening with each iteration as previously. Writeheader is also done before iteration starts removing condition previously to to check everytime. --- .../Crawler/keywords_game.txt | 1 + Search-Engine-and-Crawler/Crawler/search.py | 122 ++++++++---------- 2 files changed, 54 insertions(+), 69 deletions(-) diff --git a/Search-Engine-and-Crawler/Crawler/keywords_game.txt b/Search-Engine-and-Crawler/Crawler/keywords_game.txt index 86a6636..3f2d7dc 100644 --- a/Search-Engine-and-Crawler/Crawler/keywords_game.txt +++ b/Search-Engine-and-Crawler/Crawler/keywords_game.txt @@ -17,6 +17,7 @@ gaming game art game AI video game +video games|20 computer games console games mobile games diff --git a/Search-Engine-and-Crawler/Crawler/search.py b/Search-Engine-and-Crawler/Crawler/search.py index c82765c..9047fc3 100755 --- a/Search-Engine-and-Crawler/Crawler/search.py +++ b/Search-Engine-and-Crawler/Crawler/search.py @@ -45,53 +45,63 @@ def main(): keywords = [x.lower() for x in keywords] # make keywords dictionary with zero frequency as value all_keywords = dict((strip_weights(el)[0], 0) for el in keywords) + all_keywords_dict = Counter(all_keywords) + + sorted_keywords_list = sorted(all_keywords_dict.items()) + + # extract a sorted list of keywords to write as CSV headers + headers = [str(x) for x, y in sorted_keywords_list] + # prepend url header onto the keywords list + headers.insert(0, u'url') + headers.insert(1, u'frequency_sum') pbar = tqdm(total=len(all_txt_files)) tqdm.write("Found {} files to search. Please wait.". format(len(all_txt_files))) - for idx, txt_file in enumerate(all_txt_files): - with open(txt_file) as fp: - visible_text_list = fp.readlines() - current_url = visible_text_list[0].strip().rstrip() - num_digits = len(str(len(all_txt_files))) - tqdm.write("{0:0{width}d}) Done! {1}". 
- format(idx+1, current_url, width=num_digits)) - - visible_text_list = [x.lower() for x in visible_text_list] - - # counts keywords in page - found_count, found_keywords = count_keywords( - visible_text_list, - keywords - ) - found_keywords_as_dict = dict((x, y) for x, y in found_keywords) - - found_keywords_freq_dict = Counter(found_keywords_as_dict) - - all_keywords_dict = Counter(all_keywords) - # combine both dicts to have uniform dictionary for all pages - all_keywords_dict.update(found_keywords_freq_dict) - # after merging, sort the resulting dictionary based on keys to - # make a tuples list that is always uniform for every page - sorted_keywords_list = sorted(all_keywords_dict.items()) - - # create a sorted dictionary list - final_csv_dict = [] - final_csv_dict.append({x: y for x, y in sorted_keywords_list}) - - # extract a sorted list of keywords to write as CSV headers - headers = [str(x) for x, y in sorted_keywords_list] - # prepend url header onto the keywords list - headers.insert(0, u'url') - headers.insert(1, u'frequency_sum') - # logger.info(headers) - - # prepend the current URL onto the frequencies dict object - final_csv_dict[0]['frequency_sum'] = sum(final_csv_dict[0].values()) - final_csv_dict[0]['url'] = current_url - - write_csv(csv_file_name, headers, final_csv_dict) - pbar.update(1) + + with open(csv_file_name, 'a+', encoding="utf-8") as f: + # Using dictionary keys as fieldnames for the CSV file header + writer = csv.DictWriter(f, headers) + writer.writeheader() + + for idx, txt_file in enumerate(all_txt_files): + with open(txt_file) as fp: + visible_text_list = fp.readlines() + current_url = visible_text_list[0].strip().rstrip() + num_digits = len(str(len(all_txt_files))) + tqdm.write("{0:0{width}d}) Done! {1}". + format(idx+1, current_url, width=num_digits)) + + visible_text_list = [x.lower() for x in visible_text_list] + + # counts keywords in page + found_count, found_keywords = count_keywords( + visible_text_list, + keywords + ) + found_keywords_as_dict = dict((x, y) for x, y in found_keywords) + + found_keywords_freq_dict = Counter(found_keywords_as_dict) + + all_keywords_dict = Counter(all_keywords) + # combine both dicts to have uniform dictionary for all pages + all_keywords_dict.update(found_keywords_freq_dict) + # after merging, sort the resulting dictionary based on keys to + # make a tuples list that is always uniform for every page + sorted_keywords_list = sorted(all_keywords_dict.items()) + + # create a sorted dictionary list + final_csv_dict = [] + final_csv_dict.append({x: y for x, y in sorted_keywords_list}) + + # prepend the current URL onto the frequencies dict object + final_csv_dict[0]['frequency_sum'] = sum(final_csv_dict[0].values()) + final_csv_dict[0]['url'] = current_url + + for d in final_csv_dict: + writer.writerow(d) + + pbar.update(1) pbar.close() sort_csv(csv_file_name, sorted_csv_file_name) @@ -153,32 +163,6 @@ def count_keywords(list_of_tokens, list_of_target_words): return num_target_words, matched_words # Note that we are returning a tuple (2 values) -def write_csv(output_file, keywords_header, keywords_x_freqs): - """Write a CSV file in the format url, , , , ... 
- output_file - the name of created CSV file - keywords_header - list with all the keywords to create header row of CSV - keywords_x_freqs - dictionary list with keywords and frequencies - return boolean - """ - try: - if os.path.exists(output_file): - append_write = 'a' # append if already exists - else: - append_write = 'w' # make a new file if not - - with open(output_file, append_write, encoding="utf-8") as f: - # Using dictionary keys as fieldnames for the CSV file header - writer = csv.DictWriter(f, keywords_header) - if append_write == 'w': - writer.writeheader() - - for d in keywords_x_freqs: - writer.writerow(d) - return True - except Exception as e: - logger.error('Something bad happend while writing CSV:' + str(e)) - return False - if __name__ == "__main__": parser = argparse.ArgumentParser( From 51260f536a222ad442381f7735e4d95bf3b9bb98 Mon Sep 17 00:00:00 2001 From: Antony Oduor Date: Sat, 13 Jan 2018 23:47:36 -0700 Subject: [PATCH 23/50] Skip Writing Zero Frequency Pages on CSV Add logger file for search script. --- Search-Engine-and-Crawler/Crawler/README.md | 2 +- Search-Engine-and-Crawler/Crawler/search.py | 19 ++++++++++++++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/Search-Engine-and-Crawler/Crawler/README.md b/Search-Engine-and-Crawler/Crawler/README.md index 729fa86..f11bffb 100644 --- a/Search-Engine-and-Crawler/Crawler/README.md +++ b/Search-Engine-and-Crawler/Crawler/README.md @@ -28,7 +28,7 @@ machine. #### Sample usage ``` -python crawlerExpand.py [URL] 10 50 lefolder +python crawlerExpand.py [URL] 10 50 myuni ``` ### search.py diff --git a/Search-Engine-and-Crawler/Crawler/search.py b/Search-Engine-and-Crawler/Crawler/search.py index 9047fc3..f70c9fe 100755 --- a/Search-Engine-and-Crawler/Crawler/search.py +++ b/Search-Engine-and-Crawler/Crawler/search.py @@ -6,6 +6,7 @@ import logging import os import glob +import time from collections import Counter import pandas as pd from tqdm import tqdm @@ -13,6 +14,11 @@ logger = logging.getLogger(__name__) +# current time, used in the names of the folder and the logging file +curtime = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime()) +# Create a new log file +logging.basicConfig(filename=('_unisearchlog_' + curtime + '.log'),level=logging.DEBUG) + def main(): @@ -63,15 +69,17 @@ def main(): # Using dictionary keys as fieldnames for the CSV file header writer = csv.DictWriter(f, headers) writer.writeheader() + logger.info("CSV headers written") for idx, txt_file in enumerate(all_txt_files): with open(txt_file) as fp: visible_text_list = fp.readlines() current_url = visible_text_list[0].strip().rstrip() num_digits = len(str(len(all_txt_files))) - tqdm.write("{0:0{width}d}) Done! {1}". + tqdm.write("[{0:0{width}d}] {1}". 
format(idx+1, current_url, width=num_digits)) + logger.info("Working on: {}".format(current_url)) visible_text_list = [x.lower() for x in visible_text_list] # counts keywords in page @@ -79,6 +87,7 @@ def main(): visible_text_list, keywords ) + logger.info("Keywords found: {}".format(found_count)) found_keywords_as_dict = dict((x, y) for x, y in found_keywords) found_keywords_freq_dict = Counter(found_keywords_as_dict) @@ -86,6 +95,7 @@ def main(): all_keywords_dict = Counter(all_keywords) # combine both dicts to have uniform dictionary for all pages all_keywords_dict.update(found_keywords_freq_dict) + logger.info("Keywords search results merged!") # after merging, sort the resulting dictionary based on keys to # make a tuples list that is always uniform for every page sorted_keywords_list = sorted(all_keywords_dict.items()) @@ -93,13 +103,20 @@ def main(): # create a sorted dictionary list final_csv_dict = [] final_csv_dict.append({x: y for x, y in sorted_keywords_list}) + logger.info("Final dictionary appended!") # prepend the current URL onto the frequencies dict object final_csv_dict[0]['frequency_sum'] = sum(final_csv_dict[0].values()) final_csv_dict[0]['url'] = current_url + # ignore zero frequency_sum... + if final_csv_dict[0]['frequency_sum'] == 0: + pbar.update(1) + continue + for d in final_csv_dict: writer.writerow(d) + logger.info("Row written successfully!") pbar.update(1) From 6404d0f4fccff0f620e4cfcf67d85216b4efdd6d Mon Sep 17 00:00:00 2001 From: Antony Oduor Date: Sat, 13 Jan 2018 23:50:22 -0700 Subject: [PATCH 24/50] fixup! Skip Writing Zero Frequency Pages on CSV --- Search-Engine-and-Crawler/Crawler/search.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Search-Engine-and-Crawler/Crawler/search.py b/Search-Engine-and-Crawler/Crawler/search.py index f70c9fe..e7e2b84 100755 --- a/Search-Engine-and-Crawler/Crawler/search.py +++ b/Search-Engine-and-Crawler/Crawler/search.py @@ -106,11 +106,12 @@ def main(): logger.info("Final dictionary appended!") # prepend the current URL onto the frequencies dict object - final_csv_dict[0]['frequency_sum'] = sum(final_csv_dict[0].values()) + freq_sum = sum(final_csv_dict[0].values()) + final_csv_dict[0]['frequency_sum'] = freq_sum final_csv_dict[0]['url'] = current_url # ignore zero frequency_sum... 
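Condensed, the CSV-writing loop after PATCHes 22 to 24 works as in the sketch below: the file and DictWriter are created once, the header is written once, and pages whose keyword frequencies sum to zero are skipped. The headers and page data are made up for the example.

```python
import csv

headers = ["url", "frequency_sum", "animation", "game design"]
pages = [
    {"url": "https://example.edu/a", "animation": 0, "game design": 0},
    {"url": "https://example.edu/b", "animation": 2, "game design": 5},
]

with open("results.csv", "w", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, headers)
    writer.writeheader()                      # header written once, before the loop
    for row in pages:
        freq_sum = row["animation"] + row["game design"]
        if freq_sum == 0:
            continue                          # skip pages with no keyword hits
        row["frequency_sum"] = freq_sum
        writer.writerow(row)
```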
- if final_csv_dict[0]['frequency_sum'] == 0: + if freq_sum == 0: pbar.update(1) continue From 5b0752bff999097cfacf391ac336fb3c92a9e5c6 Mon Sep 17 00:00:00 2001 From: Antony Oduor Date: Sun, 14 Jan 2018 00:15:48 -0700 Subject: [PATCH 25/50] Catch Missing Title Error This was previously breaking crawling silently with AttributeError if title is 'NoneType' Signed-off-by: Antony Oduor --- Search-Engine-and-Crawler/Crawler/crawlerExpand.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py index 13b1602..6fdf146 100644 --- a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py +++ b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py @@ -215,14 +215,15 @@ def create_name_from_html (html): # Uses Beautiful Soup to locate the tag # Parameter soup is a soup object def create_name_from_soup (soup): - name = soup.title.string - if name: - # removes invalid characters from title + try: + name = soup.title.string + # removes invalid characters from title name = format_filename(name) + '__' + str(time.time()) logging.info('Created name ' + name) - else: + except AttributeError as e: name = "no_title_" + str(time.time()) # if no title provided give a no title with a timestamp logging.warn('Failed to create a name, using \'' + name + '\' instead') + logging.error(str(e)) return name From 36858245c93a0e0d525c22898875630cb0155538 Mon Sep 17 00:00:00 2001 From: Antony Oduor <aowino@gmail.com> Date: Sun, 14 Jan 2018 00:29:21 -0700 Subject: [PATCH 26/50] Catch 'None' Name Parameter for format_filename() --- .../Crawler/crawlerExpand.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py index 6fdf146..cfb6347 100644 --- a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py +++ b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py @@ -9,6 +9,7 @@ import string import shutil import re +import uuid try: from os import scandir, walk @@ -183,11 +184,15 @@ def format_filename(name): """Take a string and return a valid filename constructed from the string. Uses a whitelist approach: any characters not present in valid_chars are removed. Also spaces are replaced with underscores.""" - valid_chars = "-_() %s%s" % (string.ascii_letters, string.digits) - filename = ''.join(c for c in name if c in valid_chars) - # Remove spaces in filename - filename = filename.strip() - filename = filename.replace(' ','_') + try: + valid_chars = "-_() %s%s" % (string.ascii_letters, string.digits) + filename = ''.join(c for c in name if c in valid_chars) + # Remove spaces in filename + filename = filename.strip() + filename = filename.replace(' ','_') + except TypeError as e: + filename = uuid.uuid4() + logging.error("Got and error: {}".format(str(e))) return filename From eda72a47148972983dde77d95c6623baa2e00ea7 Mon Sep 17 00:00:00 2001 From: Antony Oduor <aowino@gmail.com> Date: Sun, 14 Jan 2018 13:46:18 -0700 Subject: [PATCH 27/50] fixup! 
Catch 'None' Name Parameter for format_filename() --- Search-Engine-and-Crawler/Crawler/crawlerExpand.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py index cfb6347..18e7100 100644 --- a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py +++ b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py @@ -191,7 +191,7 @@ def format_filename(name): filename = filename.strip() filename = filename.replace(' ','_') except TypeError as e: - filename = uuid.uuid4() + filename = str(uuid.uuid4()) logging.error("Got and error: {}".format(str(e))) return filename From 9b97e32fec074bdf76f75a89838cb8e60bfd7515 Mon Sep 17 00:00:00 2001 From: Antony Oduor <aowino@gmail.com> Date: Sun, 14 Jan 2018 22:28:28 -0700 Subject: [PATCH 28/50] Update Games Keywords File --- .../Crawler/keywords_game.txt | 33 ++++++++++--------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/Search-Engine-and-Crawler/Crawler/keywords_game.txt b/Search-Engine-and-Crawler/Crawler/keywords_game.txt index 3f2d7dc..7a71f70 100644 --- a/Search-Engine-and-Crawler/Crawler/keywords_game.txt +++ b/Search-Engine-and-Crawler/Crawler/keywords_game.txt @@ -1,35 +1,36 @@ -Games -Game studies -Games studies -game programming +Games|10 +Game studies|50 +Games studies|50 +game programming|10 game engine scripting -Game Design +Game Design|50 game writing game production games and learning game research +game development|50 games and society -Game studies -game(s) studies +Game studies|50 +game(s) studies|50 game analysis game critique -gaming +gaming|30 game art -game AI -video game -video games|20 -computer games +game AI|50 +video game|40 +video games|40 +computer games|50 console games -mobile games +mobile games|20 web games mobile game -serious games +serious games|50 critical games -game +game|20 digital media visual -animation +animation|50 interactive 3D entertainment From ec7804c4ace78806b04687f3211325aad22033fc Mon Sep 17 00:00:00 2001 From: Antony Oduor <aowino@gmail.com> Date: Sun, 14 Jan 2018 22:35:10 -0700 Subject: [PATCH 29/50] Minor Fixes to Indentation and Spaces --- Search-Engine-and-Crawler/Crawler/crawlerExpand.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py index 18e7100..c78e412 100644 --- a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py +++ b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py @@ -41,7 +41,7 @@ target_dir = directory + "_" + curtime # RegEx that is used to filter searches for URLs on any given page. -#Used in is_relevant_link_from_soup and is_relevant_link_from_html functions +# Used in is_relevant_link_from_soup and is_relevant_link_from_html functions filter_regex = re.compile(".*([Pp]rogram|[Aa]dmission|[Cc]ertificate|[Dd]egree|[Dd]iploma|[Ff]aculty|[Ss]chool|[Dd]epartment).*") filter_title_regex = re.compile(".*([Pp]rogram|[Aa]dmission|[Cc]ourse).*") @@ -129,7 +129,7 @@ # Function that checks if the link provided is in the same domain as the seed def checkDomain(new_link, cur_link): new_link_domain = tldextract.extract(new_link).domain - + """Decided to not do the can-go-one-domain-away-from-the-seed rule for now. Commented it out. 
# 0) check whether new_link is in the list of popular domains that we don't want to crawl, if yes -> IGNORE IT if new_link_domain in ignore_domains: @@ -138,7 +138,7 @@ def checkDomain(new_link, cur_link): # 1) check if new_link is in seed, if yes -> OK if (new_link_domain == seed): return True - + """ # 2) check if cur_link is in seed (you came from the seed even if you're in a different domain now), if yes -> OK cur_link_domain = tldextract.extract(cur_link).domain @@ -206,7 +206,7 @@ def create_name_from_html (html): name_part = name_list[-1] #grab part of html after <title name = name_part.split(">")[-1] if name: - # removes invalid characters from title + # removes invalid characters from title name = format_filename(name) + '__' + str(time.time()) logging.info('Created name ' + name) else: From 52b4ae3c7a013235675f88cee129d49b3bc8570d Mon Sep 17 00:00:00 2001 From: Antony Oduor <aowino@gmail.com> Date: Sun, 14 Jan 2018 22:37:22 -0700 Subject: [PATCH 30/50] New Text Extractor Function and Change BS4 Library The new method extracts texts in smaller chunks for the search script to parse faster. Also to reduce non-text(<!DATA, comments, JS, CSS) that sneaked out as text. --- .../Crawler/crawlerExpand.py | 22 ++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py index c78e412..bed060a 100644 --- a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py +++ b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py @@ -48,7 +48,7 @@ # Var to choose mode # "soup" uses BeautifulSoup to assign a name to a page and to search the page for URLs # "no_soup" uses a string search – splits the page into strings using "href=" as a partition limiter, then goes from there -mode = "no_soup" # soup or no_soup +mode = "soup" # soup or no_soup # Checks if the url includes http at the front @@ -288,7 +288,7 @@ def process_current_link (): current_url = r.url # Soupify # For now it soupifies the link regardless of the mode, because it uses soup later to extract visible text from the page - soup = BeautifulSoup(html, 'html5lib') + soup = BeautifulSoup(html, 'html.parser') grab_all = is_title_page_relevant(soup) if mode=="no_soup": @@ -452,12 +452,28 @@ def process_links_from_html (html, cur_link, grab_all=False): logging.info(str(e)) +def extract_text(soup): + """Extract text from HTML pages and Return normalized text + https://stackoverflow.com/questions/30565404/remove-all-style-scripts-and-html-tags-from-an-html-page + return string + """ + for script in soup(["script", "style"]): # remove all javascript and stylesheet code + script.extract() + # get text, the separator keeps the paragraphs their usual short + # https://stackoverflow.com/a/38861217 + text = soup.get_text(separator="\n") + # break into lines and remove leading and trailing space on each + lines = (line.strip() for line in text.splitlines()) + # break multi-headlines into a line each + chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) + # drop blank lines + return '\n'.join(chunk for chunk in chunks if chunk) # Function to extract text elements from an HTML and return them as an array of BeautifulSoup # called from process_current_link -def extract_text(soup): +def _extract_text(soup): data = soup.findAll(text=True) result = filter(is_visible_html_element, data) all_text = "" From 4fe6f0a24a9c3802e47ea2e8a4ac646fca420309 Mon Sep 17 00:00:00 2001 From: Antony Oduor <aowino@gmail.com> Date: Sun, 14 
Jan 2018 22:44:56 -0700 Subject: [PATCH 31/50] Exclude .mp3 files --- Search-Engine-and-Crawler/Crawler/crawlerExpand.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py index bed060a..4ecd892 100644 --- a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py +++ b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py @@ -378,7 +378,7 @@ def process_links_from_soup (soup, cur_link, grab_all=False): # if the link is not in crawledURLsArray then it appends it to urls and crawledURLsArray if new_link not in crawledURLsArray: # Ensures no jpg or pdfs are stored and that no mailto: links are stored. - if new_link.startswith("http") and '.pdf' not in new_link and '.jpg' not in new_link: + if new_link.startswith("http") and '.pdf' not in new_link and '.jpg' not in new_link and '.mp3' not in new_link: #???TODO: add checks for www.domain.com and https:// # Adds new link to array plannedURLsArray.append(new_link) @@ -426,7 +426,7 @@ def process_links_from_html (html, cur_link, grab_all=False): # if the link is not in crawledURLsArray then it appends it to urls and crawledURLsArray if new_link not in crawledURLsArray: # Ensures no jpg or pdfs are stored and that no mailto: links are stored. - if new_link.startswith("http") and '.pdf' not in new_link and '.jpg' not in new_link: + if new_link.startswith("http") and '.pdf' not in new_link and '.jpg' not in new_link and '.mp3' not in new_link: #???TODO: add checks for www.domain.com and https:// # Adds new link to array plannedURLsArray.append(new_link) From 3a07ff02d6e7c48ecda1f10c1c306d9cc92dabb3 Mon Sep 17 00:00:00 2001 From: Antony Oduor <aowino@gmail.com> Date: Mon, 15 Jan 2018 10:46:32 -0700 Subject: [PATCH 32/50] Add Option to Abandon Overly Long Keyword Searches Tame extremely large pages which take forever to search for keywords. Mostly due to too much content on the page. A new optional script argument has been added, patience that abdicates when it takes too long to process. A default setting is set to 30 seconds incase one is not provided by user. --- Search-Engine-and-Crawler/Crawler/search.py | 55 ++++++++++++++++++--- 1 file changed, 49 insertions(+), 6 deletions(-) diff --git a/Search-Engine-and-Crawler/Crawler/search.py b/Search-Engine-and-Crawler/Crawler/search.py index e7e2b84..fdbf000 100755 --- a/Search-Engine-and-Crawler/Crawler/search.py +++ b/Search-Engine-and-Crawler/Crawler/search.py @@ -1,5 +1,6 @@ #!/usr/bin/env python import argparse +import signal import sys import re import csv @@ -14,10 +15,24 @@ logger = logging.getLogger(__name__) + +class TookTooDamnLongException(Exception): # Custom exception class + pass + + +def toodamnlong_handler(signum, frame): # Custom signal handler + raise TookTooDamnLongException + + +# Change the behavior of SIGALRM +signal.signal(signal.SIGALRM, toodamnlong_handler) + # current time, used in the names of the folder and the logging file curtime = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime()) # Create a new log file -logging.basicConfig(filename=('_unisearchlog_' + curtime + '.log'),level=logging.DEBUG) +logging.basicConfig(filename=('_unisearchlog_' + curtime + '.log'), + level=logging.DEBUG + ) def main(): @@ -72,6 +87,9 @@ def main(): logger.info("CSV headers written") for idx, txt_file in enumerate(all_txt_files): + # Start the timer. + # Once [patience] seconds are over, a SIGALRM signal is sent. 
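The "patience" mechanism added in PATCH 32 is the classic SIGALRM timeout pattern. A minimal, runnable sketch is below (exception and handler names are illustrative, and SIGALRM is Unix-only, so this does not work on Windows):

```python
import signal
import time

class TookTooLong(Exception):
    pass

def handler(signum, frame):
    raise TookTooLong

signal.signal(signal.SIGALRM, handler)

patience = 2                 # seconds allowed per page
signal.alarm(patience)       # start the countdown
try:
    time.sleep(5)            # stand-in for a slow count_keywords() call
    print("finished in time")
except TookTooLong:
    print("took too long, skipping this page")
finally:
    signal.alarm(0)          # cancel any pending alarm
```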
+ signal.alarm(patience) with open(txt_file) as fp: visible_text_list = fp.readlines() current_url = visible_text_list[0].strip().rstrip() @@ -82,11 +100,26 @@ def main(): logger.info("Working on: {}".format(current_url)) visible_text_list = [x.lower() for x in visible_text_list] - # counts keywords in page - found_count, found_keywords = count_keywords( - visible_text_list, - keywords - ) + # This try/except loop ensures that + # you'll catch TookTooDamnLongException when it's sent. + # https://stackoverflow.com/questions/25027122/break-the-function-after-certain-time + try: + # counts keywords in page + found_count, found_keywords = count_keywords( + visible_text_list, + keywords + ) + except TookTooDamnLongException: + # TODO: Keep a record of pages that took forever to search + tqdm.write("[{0:0{width}d}] Aarrrgh! " + "TOOK TOO DAMN LONG TO SEARCH! {1}". + format(idx+1, current_url, width=num_digits)) + logger.warn("TTDL >>> {} <<<".format(current_url)) + pbar.update(1) + # continue the for loop if count_keywords takes more + # than [patience] seconds + continue + logger.info("Keywords found: {}".format(found_count)) found_keywords_as_dict = dict((x, y) for x, y in found_keywords) @@ -204,12 +237,22 @@ def count_keywords(list_of_tokens, list_of_target_words): required=True, help='File with keywords to search for in the directory (mandatory)' ) + parser.add_argument( + '-p', + '--patience', + dest='patience', + default=30, + required=False, + help="Number of seconds you can give per-page-search. Life is too" \ + " short to parse unabridged web pages. Default is 30. Bye" + ) # these are module global variables and can be access by any function in # this module args = parser.parse_args() folder_name = args.folder_name keywords_file = args.keywords_file + patience = int(args.patience) # the output files of all observed keyword frequencies csv_file_name = "{}_results.csv".format(folder_name) From b548f9c18a1059931188e30fc5ae4f88ed557ebe Mon Sep 17 00:00:00 2001 From: Antony Oduor <aowino@gmail.com> Date: Mon, 15 Jan 2018 10:52:39 -0700 Subject: [PATCH 33/50] Update Crawler Keywords With Grad and Undegrad --- Search-Engine-and-Crawler/Crawler/crawlerExpand.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py index 4ecd892..3919d1e 100644 --- a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py +++ b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py @@ -42,7 +42,7 @@ # RegEx that is used to filter searches for URLs on any given page. # Used in is_relevant_link_from_soup and is_relevant_link_from_html functions -filter_regex = re.compile(".*([Pp]rogram|[Aa]dmission|[Cc]ertificate|[Dd]egree|[Dd]iploma|[Ff]aculty|[Ss]chool|[Dd]epartment).*") +filter_regex = re.compile(".*([Pp]rogram|[Aa]dmission|[Cc]ertificate|[Dd]egree|[Dd]iploma|[Ff]aculty|[Ss]chool|[Dd]epartment|[Uu]ndergrad|[Gr]grad).*") filter_title_regex = re.compile(".*([Pp]rogram|[Aa]dmission|[Cc]ourse).*") # Var to choose mode From 93f4173c8f0a3039035c86982911bd39094f94fe Mon Sep 17 00:00:00 2001 From: Antony Oduor <aowino@gmail.com> Date: Mon, 15 Jan 2018 10:53:54 -0700 Subject: [PATCH 34/50] fixup! 
Update Crawler Keywords With Grad and Undegrad --- Search-Engine-and-Crawler/Crawler/crawlerExpand.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py index 3919d1e..78fe7e3 100644 --- a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py +++ b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py @@ -42,7 +42,7 @@ # RegEx that is used to filter searches for URLs on any given page. # Used in is_relevant_link_from_soup and is_relevant_link_from_html functions -filter_regex = re.compile(".*([Pp]rogram|[Aa]dmission|[Cc]ertificate|[Dd]egree|[Dd]iploma|[Ff]aculty|[Ss]chool|[Dd]epartment|[Uu]ndergrad|[Gr]grad).*") +filter_regex = re.compile(".*([Pp]rogram|[Aa]dmission|[Cc]ertificate|[Dd]egree|[Dd]iploma|[Ff]aculty|[Ss]chool|[Dd]epartment|[Uu]ndergrad|[Gr]rad).*") filter_title_regex = re.compile(".*([Pp]rogram|[Aa]dmission|[Cc]ourse).*") # Var to choose mode From 3dec30f1ae6d40227ef66125f96d2c3d1dc575b2 Mon Sep 17 00:00:00 2001 From: VITA Lab ReFiG <refig@ualberta.ca> Date: Tue, 16 Jan 2018 18:59:14 -0700 Subject: [PATCH 35/50] Fixed split error and memory (file type) error --- .../Crawler/crawlerExpand.py | 33 ++++++++++--------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py index 78fe7e3..8841787 100644 --- a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py +++ b/Search-Engine-and-Crawler/Crawler/crawlerExpand.py @@ -164,9 +164,10 @@ def request_url(url): # Use requests module to get html from url as an object html = '' try: - r = requests.get(url, headers=headers) - if r.ok: - if "text/html" in r.headers["content-type"]: + head = requests.head(url, headers=headers) + if head.ok and ("text/html" in head.headers["content-type"]): + r = requests.get(url, headers=headers) + if r.ok: return r return None except KeyboardInterrupt: @@ -378,7 +379,7 @@ def process_links_from_soup (soup, cur_link, grab_all=False): # if the link is not in crawledURLsArray then it appends it to urls and crawledURLsArray if new_link not in crawledURLsArray: # Ensures no jpg or pdfs are stored and that no mailto: links are stored. 
- if new_link.startswith("http") and '.pdf' not in new_link and '.jpg' not in new_link and '.mp3' not in new_link: + if new_link.startswith("http") and ('.pdf' not in new_link) and ('.jpg' not in new_link) and ('.mp3' not in new_link): #???TODO: add checks for www.domain.com and https:// # Adds new link to array plannedURLsArray.append(new_link) @@ -388,17 +389,19 @@ def process_links_from_soup (soup, cur_link, grab_all=False): # Remove the front of the URL (http or https) http_split = new_link.split("://", 1) - # Add all possible link variations to file of URLs that have been looked at - # Adds new link to array - crawledURLsArray.append("http://" + http_split[1]) - # Adds new link to already looked at file - crawled_urls.write("http://" + http_split[1]) - crawled_urls.write("\n") - # Adds new link to array - crawledURLsArray.append("https://" + http_split[1]) - # Adds new link to already looked at file - crawled_urls.write("https://" + http_split[1]) - crawled_urls.write("\n") + + if len(http_split)>1: + # Add all possible link variations to file of URLs that have been looked at + # Adds new link to array + crawledURLsArray.append("http://" + http_split[1]) + # Adds new link to already looked at file + crawled_urls.write("http://" + http_split[1]) + crawled_urls.write("\n") + # Adds new link to array + crawledURLsArray.append("https://" + http_split[1]) + # Adds new link to already looked at file + crawled_urls.write("https://" + http_split[1]) + crawled_urls.write("\n") # checks that the text content of the link matches the filter_regex # input parameter is a string From e775b4c4ef87ed637834db4afe865b3fd0af6bb4 Mon Sep 17 00:00:00 2001 From: Antony Oduor <aowino@gmail.com> Date: Wed, 17 Jan 2018 10:27:28 -0700 Subject: [PATCH 36/50] Move the Two Scripts into Root Directory To reduce the navigation currently required to reach to the scripts. 
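A brief aside on the split fix in PATCH 35 above: the new `len(http_split) > 1` guard keeps hrefs that begin with "http" but lack a "://" separator from raising an IndexError when the scheme is stripped. A minimal standalone sketch of that guarded step (the helper name and sample URLs are illustrative, not part of the repository):

```
def scheme_variants(new_link):
    """Return both the http:// and https:// forms of a link, or an empty
    list when the link has no "://" separator and therefore cannot be
    split into scheme and remainder."""
    http_split = new_link.split("://", 1)
    if len(http_split) > 1:
        return ["http://" + http_split[1], "https://" + http_split[1]]
    return []

print(scheme_variants("https://example.edu/admissions"))  # two variants
print(scheme_variants("http:/example.edu/admissions"))    # [] -> skipped, no crash
```

Recording both variants is what lets the crawler treat the http and https URLs of the same page as already crawled.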
--- .../crawlerExpand.py => crawlerExpand.py | 1104 ++++++++--------- .../Crawler/keywords.txt => keywords.txt | 0 .../keywords_game.txt => keywords_game.txt | 0 .../Crawler/search.py => search.py | 0 4 files changed, 552 insertions(+), 552 deletions(-) rename Search-Engine-and-Crawler/Crawler/crawlerExpand.py => crawlerExpand.py (97%) rename Search-Engine-and-Crawler/Crawler/keywords.txt => keywords.txt (100%) rename Search-Engine-and-Crawler/Crawler/keywords_game.txt => keywords_game.txt (100%) rename Search-Engine-and-Crawler/Crawler/search.py => search.py (100%) diff --git a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py b/crawlerExpand.py similarity index 97% rename from Search-Engine-and-Crawler/Crawler/crawlerExpand.py rename to crawlerExpand.py index 8841787..86959a3 100644 --- a/Search-Engine-and-Crawler/Crawler/crawlerExpand.py +++ b/crawlerExpand.py @@ -1,552 +1,552 @@ -import requests -from bs4 import BeautifulSoup -import urllib.parse -import os -import sys -import tldextract -import time -import codecs -import string -import shutil -import re -import uuid - -try: - from os import scandir, walk -except ImportError: - from scandir import scandir, walk -import logging - -#Pay attention to robots.txt - -# current time, used in the names of the folder and the logging file -curtime = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime()) - -# this file should live in the same directory as the script -keywords_file = "keywords_game.txt" - -# this should be a file input later but for now it's an array -# an array of popular domains that university websites link to but we don't want to crawl -ignore_domains = ["youtube", "facebook", "instagram", "twitter", "linkedin", "google", "pinterest", "snapchat"] - -# Arguments in order: url, total pages to look at, depth, first part of directory name -# url to start from -url = sys.argv[1] -# number of pages to iterate through -iterate = int(sys.argv[2]) -# depth to go for -depth_to_go = int(sys.argv[3]) -# directory name -directory = sys.argv[4] -target_dir = directory + "_" + curtime - -# RegEx that is used to filter searches for URLs on any given page. -# Used in is_relevant_link_from_soup and is_relevant_link_from_html functions -filter_regex = re.compile(".*([Pp]rogram|[Aa]dmission|[Cc]ertificate|[Dd]egree|[Dd]iploma|[Ff]aculty|[Ss]chool|[Dd]epartment|[Uu]ndergrad|[Gr]rad).*") -filter_title_regex = re.compile(".*([Pp]rogram|[Aa]dmission|[Cc]ourse).*") - -# Var to choose mode -# "soup" uses BeautifulSoup to assign a name to a page and to search the page for URLs -# "no_soup" uses a string search – splits the page into strings using "href=" as a partition limiter, then goes from there -mode = "soup" # soup or no_soup - - -# Checks if the url includes http at the front -if not url.startswith("http"): - url = "http://" + url -# Extracts the top level domain from the URL (eg. 
ualberta.ca, no slashes) -seed = tldextract.extract(url).domain - -# Set a header to pretend it's a browser -headers = requests.utils.default_headers() -headers.update ( - { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36', - } -) - -# Checks if the directory with the given name already exists -# If it does, tries to continue a script run that was interrupted, using already existing lists of visited_urls and planned_urls -# If it doesn't, starts a new script run -if os.path.isdir(directory): - # Continuing a previous script run - - # Copy the contents of the existing directory to a new timestamped one - shutil.copytree(directory, target_dir) - os.chdir(target_dir) # then change directory to that folder - - # Open the visited_urls text file and count the number of lines in it – that's how many pages the script visited throughout its previous runs - with open("_visited_urls.txt") as f: - for i, l in enumerate(f, start=1): - pass - page = i - - # Open the file with planned urls and add them to the array of planned urls - with open("_planned_urls.txt") as f: - content = f.readlines() - #remove whitespace characters like `\n` at the end of each line - planned = content[page-1:] - plannedURLsArray = [x.strip() for x in planned] - - # Open the file with crawled urls and add them to the array of crawled urls - with open("_crawled_urls.txt") as f: - content = f.readlines() - #remove whitespace characters like `\n` at the end of each line - crawledURLsArray = [x.strip() for x in content] - - # Create a new log file - logging.basicConfig(filename=('_uniscraperlog_' + curtime + '.log'),level=logging.INFO) - # file to log empty requests into - empty_request_log = codecs.open("_empty_requests.txt", "a", "utf-8-sig") - # file to log planned urls into - URLs in the queue, that are planned to go to next (checked against visited) - planned_urls = codecs.open("_planned_urls.txt", "a", "utf-8-sig") - # file to log visited urls into - URLs that have been requested and have the html - visited_urls = codecs.open("_visited_urls.txt", "a", "utf-8-sig") - # file to log crawled urls into - URLs that crawler will "check" against to see if needs logging - crawled_urls = codecs.open("_crawled_urls.txt", "a", "utf-8-sig") - - -else: - current_dir = os.getcwd() - # Start a new script run - os.mkdir(target_dir) # make a timestampted folder - os.chdir(target_dir) # then change directory to that folder - shutil.copyfile(current_dir + "/" + keywords_file, keywords_file) # jump into working directory - # Create a log file in the folder that was just created - logging.basicConfig(filename=('_uniscraperlog_' + curtime + '.log'),level=logging.INFO) - # file to log empty requests into - empty_request_log = codecs.open("_empty_requests.txt", "w", "utf-8-sig") - # file to log planned urls into - URLs in the queue, that are planned to go to next (checked against visited) - planned_urls = codecs.open("_planned_urls.txt", "w", "utf-8-sig") - plannedURLsArray = [] - # file to log visited urls into - URLs that have been requested and have the html - visited_urls = codecs.open("_visited_urls.txt", "w", "utf-8-sig") - # file to log crawled urls into - URLs that crawler will "check" against to see if needs logging - crawled_urls = codecs.open("_crawled_urls.txt", "w", "utf-8-sig") - crawledURLsArray = [] - page = 1 - -# Function that checks if the link provided is in the same domain as the seed -def checkDomain(new_link, cur_link): - new_link_domain = 
tldextract.extract(new_link).domain - - """Decided to not do the can-go-one-domain-away-from-the-seed rule for now. Commented it out. - # 0) check whether new_link is in the list of popular domains that we don't want to crawl, if yes -> IGNORE IT - if new_link_domain in ignore_domains: - return False - """ - # 1) check if new_link is in seed, if yes -> OK - if (new_link_domain == seed): - return True - - """ - # 2) check if cur_link is in seed (you came from the seed even if you're in a different domain now), if yes -> OK - cur_link_domain = tldextract.extract(cur_link).domain - if (cur_link_domain == seed): - return True - # 3) check if the new link is in the same domain as the cur link (you're still in the same domain, even though it's different from seed), if yes -> OK - if (new_link_domain == cur_link_domain): - return True - # otherwise, you're trying to leave a domain that's already not the seed, you should STOP - """ - return False - - -# Fuction for requesting url -# Given a URL, go to that url and get the html and return it -# Called from main function -def request_url(url): - global headers - # Log that this URL is being saved - logging.info('Requesting ' + url) - visited_urls.write(url) - visited_urls.write("\n") - # Use requests module to get html from url as an object - html = '' - try: - head = requests.head(url, headers=headers) - if head.ok and ("text/html" in head.headers["content-type"]): - r = requests.get(url, headers=headers) - if r.ok: - return r - return None - except KeyboardInterrupt: - print("\n\nScript interrupted by user. Shutting down.") - logging.info("Script interrupted by user") - shut_down() - except Exception: - logging.exception("Couldn\'t request " + url) - return None - -# Function to create a filename out of a string -# Called from create_name -def format_filename(name): - #Taken from: https://gist.github.com/seanh/93666 - """Take a string and return a valid filename constructed from the string. - Uses a whitelist approach: any characters not present in valid_chars are - removed. 
Also spaces are replaced with underscores.""" - try: - valid_chars = "-_() %s%s" % (string.ascii_letters, string.digits) - filename = ''.join(c for c in name if c in valid_chars) - # Remove spaces in filename - filename = filename.strip() - filename = filename.replace(' ','_') - except TypeError as e: - filename = str(uuid.uuid4()) - logging.error("Got and error: {}".format(str(e))) - return filename - - -# Function for creating name -# Use the title of the html page as the title of the text file -# Called from process_current_link -# Uses string search to locate the <title> tag -# Parameter html is a string -def create_name_from_html (html): - name_list = (html.partition("</title")[0]).split("<title") #grab part of html before ")[-1] - if name: - # removes invalid characters from title - name = format_filename(name) + '__' + str(time.time()) - logging.info('Created name ' + name) - else: - name = "no_title_" + str(time.time()) # if no title provided give a no title with a timestamp - logging.warn('Failed to create a name, using \'' + name + '\' instead') - return name - -# Function for creating name -# Use the title of the html page as the title of the text file -# Called from process_current_link -# Uses Beautiful Soup to locate the tag -# Parameter soup is a soup object -def create_name_from_soup (soup): - try: - name = soup.title.string - # removes invalid characters from title - name = format_filename(name) + '__' + str(time.time()) - logging.info('Created name ' + name) - except AttributeError as e: - name = "no_title_" + str(time.time()) # if no title provided give a no title with a timestamp - logging.warn('Failed to create a name, using \'' + name + '\' instead') - logging.error(str(e)) - return name - - -#Function for deleting paired single or double quotes -def dequote(s): - """ - If a string has single or double quotes around it, remove them. - Make sure the pair of quotes match. - If a matching pair of quotes is not found, return the string unchanged. - """ - if (len(s)>= 2 and s[0] == s[-1]) and s.startswith(("'", '"')): - s = s[1:-1] - s = s.strip('"\'') - return s - - -# Function that takes link, saves the contents to text file call href_split -# Main function -def crawl(max_pages): - logging.info("Crawling through domain '" + seed + "'") - - if page == 1: - # Array that holds the queue to be visited later - plannedURLsArray.append(url) - # Logging the urls - planned_urls.write(url) - planned_urls.write("\n") - - # Gets the root of the url - url_split = url.split("://", 1) - # Array that holds urls that have been found. - # This is the array that all new URLs are checked against to prevent repeating. 
- # Record URL with both http and https prefixes - crawledURLsArray.append("http://" + url_split[1]) - crawledURLsArray.append("https://" + url_split[1]) - # Also log the same into the text file - crawled_urls.write("http://" + url_split[1] + "\n") - crawled_urls.write("https://" + url_split[1] + "\n") - - while page <= max_pages and len(plannedURLsArray) > 0: - process_current_link() - - -def is_title_page_relevant(soup): - return True if soup.find('title', string=filter_title_regex) else False - -# Function that grabs the first link in the list of planned urls, requests the page and processes it -def process_current_link (): - global page - - print(plannedURLsArray[0]) - # Try to get the html of the URL - r = request_url(plannedURLsArray[0]) - - if r: #if the request returned an html - html = r.text - current_url = r.url - # Soupify - # For now it soupifies the link regardless of the mode, because it uses soup later to extract visible text from the page - soup = BeautifulSoup(html, 'html.parser') - grab_all = is_title_page_relevant(soup) - - if mode=="no_soup": - # Gets the name for the file to store the html text in - name = create_name_from_html(html) - #find and process all links - process_links_from_html(html, current_url, grab_all) - else: - name = create_name_from_soup(soup) - process_links_from_soup(soup, current_url, grab_all) - - # Adds the .txt to the end of the name - name = "{0}.txt".format(name) - - # Find only visible text - visible_text = extract_text(soup) - - if visible_text: #save it as a text file - try: - # Create and open the file with that name - fo = codecs.open(name, "w", "utf-8-sig") - # Write URL to that file - fo.write(current_url + "\n") - # Append the html to the file - fo.write(visible_text) - # Close the pipe to the file - fo.close() - # Log the creation of the file - logging.info('Created file ' + name) - - except KeyboardInterrupt: - print("\n\nScript interrupted by user. Shutting down.") - logging.info("Script interrupted by user") - shut_down() - except Exception: - logging.exception("Can not encode file: " + current_url) - else: - print('No visible text in ' + url) - logging.warning('No visible text in ' + url) - # Else: html does not exist or is empty. Log error - else: - logging.warning('Request for ' + url + ' returned empty html') - empty_request_log.write(url) - empty_request_log.write("\n") - - # Update on the total number of pages - print("iterations:", page, "pages") - print("\n") - # Deletes the currently looked at URL from the queue - plannedURLsArray.pop(0) - - # Increment page count - page += 1 - # Every 50 pages checks the size of the folder. Prints the amount of data collected in MB to the console and log file - if page%50 == 0: - size_of_directory = get_tree_size(os.curdir) / 1000000 - print("Size: ", str(round(size_of_directory, 5)), "MB") - print('\n') - logging.info("Size: " + str(round(size_of_directory, 5)) + "MB") - # Prints in the log file the length of time the crawler has been running in seconds - logging.info("Has been running for " + str(time.time() - start_time) + " seconds") - # Time delay in seconds to prevent crashing the server - time.sleep(.01) - - -# checks that the text content of the link matches the filter_regex -# input parameter is a soup element!!! 
-def is_relevant_link_from_soup(link): - if link.find(string=filter_regex): - return True - return False - #return True #Uncomment to grab all links - -# takes soup of a page, finds all links on it -# for each link checks if it's relevant -# for each relevant link, saves it to the planned urls array (if it hasn't been crawled yet) -# and to the crawled urls array (so that we don't save it a second time later) -def process_links_from_soup (soup, cur_link, grab_all=False): - # check if the title of the current page matches the filter_title_regex - for lnk in soup.findAll('a', href=True): - # if not, check if the the link itself is relevant - if (grab_all or is_relevant_link_from_soup(lnk)): - new_link = (urllib.parse.urldefrag(lnk['href'])[0]).rstrip('/') - new_link = urllib.parse.urljoin(cur_link, new_link) - # if the link is in our main domain - if checkDomain(new_link, cur_link): - # if the link is not in crawledURLsArray then it appends it to urls and crawledURLsArray - if new_link not in crawledURLsArray: - # Ensures no jpg or pdfs are stored and that no mailto: links are stored. - if new_link.startswith("http") and ('.pdf' not in new_link) and ('.jpg' not in new_link) and ('.mp3' not in new_link): - #???TODO: add checks for www.domain.com and https:// - # Adds new link to array - plannedURLsArray.append(new_link) - # Adds new link to queue file - planned_urls.write(new_link) - planned_urls.write("\n") - - # Remove the front of the URL (http or https) - http_split = new_link.split("://", 1) - - if len(http_split)>1: - # Add all possible link variations to file of URLs that have been looked at - # Adds new link to array - crawledURLsArray.append("http://" + http_split[1]) - # Adds new link to already looked at file - crawled_urls.write("http://" + http_split[1]) - crawled_urls.write("\n") - # Adds new link to array - crawledURLsArray.append("https://" + http_split[1]) - # Adds new link to already looked at file - crawled_urls.write("https://" + http_split[1]) - crawled_urls.write("\n") - -# checks that the text content of the link matches the filter_regex -# input parameter is a string -def is_relevant_link_from_html(link): - if filter_regex.match(link): - return True - return False - #return True #Uncomment to grab all links - -#Take an array of links, run the split on each and add the results to the appropriate arrays and files -def process_links_from_html (html, cur_link, grab_all=False): - print("grabbing all: ", str(grab_all)) - if html.partition('<body')[2]: - html = html.partition('<body')[2] - link_strings = html.split('href=') # split the page into sections using "href=" as a delimiter - for lnk in link_strings[1:]: - href = lnk.partition('</a')[0] # grab all text before the "</a" – this var now contains text after an href parameter and before a closing tag, and thus includes the text content of the link - if (grab_all or is_relevant_link_from_html(href)): - href = href.partition('>')[0] - href = href.partition(' ')[0] - href = dequote(href) - new_link = (urllib.parse.urldefrag(href)[0]).rstrip('/') - new_link = urllib.parse.urljoin(cur_link, new_link) - if checkDomain(new_link, cur_link): - # if the link is not in crawledURLsArray then it appends it to urls and crawledURLsArray - if new_link not in crawledURLsArray: - # Ensures no jpg or pdfs are stored and that no mailto: links are stored. 
- if new_link.startswith("http") and '.pdf' not in new_link and '.jpg' not in new_link and '.mp3' not in new_link: - #???TODO: add checks for www.domain.com and https:// - # Adds new link to array - plannedURLsArray.append(new_link) - # Adds new link to queue file - planned_urls.write(new_link) - planned_urls.write("\n") - - try: - # Remove the front of the URL (http or https) - http_split = new_link.split("://", 1) - # Add all possible link variations to file of URLs that have been looked at - # Adds new link to array - crawledURLsArray.append("http://" + http_split[1]) - # Adds new link to already looked at file - crawled_urls.write("http://" + http_split[1]) - crawled_urls.write("\n") - # Adds new link to array - crawledURLsArray.append("https://" + http_split[1]) - # Adds new link to already looked at file - crawled_urls.write("https://" + http_split[1]) - crawled_urls.write("\n") - except IndexError as e: - logging.info(str(e)) - - -def extract_text(soup): - """Extract text from HTML pages and Return normalized text - https://stackoverflow.com/questions/30565404/remove-all-style-scripts-and-html-tags-from-an-html-page - return string - """ - for script in soup(["script", "style"]): # remove all javascript and stylesheet code - script.extract() - # get text, the separator keeps the paragraphs their usual short - # https://stackoverflow.com/a/38861217 - text = soup.get_text(separator="\n") - # break into lines and remove leading and trailing space on each - lines = (line.strip() for line in text.splitlines()) - # break multi-headlines into a line each - chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) - # drop blank lines - return '\n'.join(chunk for chunk in chunks if chunk) - - - -# Function to extract text elements from an HTML and return them as an array of BeautifulSoup -# called from process_current_link -def _extract_text(soup): - data = soup.findAll(text=True) - result = filter(is_visible_html_element, data) - all_text = "" - for t in result: - if t.strip(): - all_text += t + "\n" - return all_text - - -# check that the given soup element is a visible text element -# called from extract_text -def is_visible_html_element(element): - if element.parent.name in ['style', 'script', '[document]', 'head', 'title']: - return False - elif re.match('<!--.*-->', str(element.encode('utf-8'))): - return False - return True - - - - - - -# Return total size of files in given path and subdirs by going through the tree. -# Recursive. 
-# Called from main function -def get_tree_size(path): - total = 0 - for entry in scandir(path): - if entry.is_dir(follow_symlinks=False): - total += get_tree_size(entry.path) - else: - total += entry.stat(follow_symlinks=False).st_size - return total - - -# Shut down gracefully and log it -def shut_down(): - global start_time - global logging - global empty_request_log - global visited_urls - global planned_urls - global crawled_urls - - # Get the time that the command finished - end_time = time.time() - # Print overall time taken to console - print("Overall time: " + str((end_time - start_time))) - # Log overall time and save to main log file - logging.info("Overall time: " + str((end_time - start_time))) - # Close all the things/pipes to files - empty_request_log.close() - visited_urls.close() - planned_urls.close() - crawled_urls.close() - - sys.exit() - - -# Get the time that the command was run -start_time = time.time() - -try: - # Call main function - crawl(iterate) - shut_down() -except KeyboardInterrupt: - print("\n\nScript interrupted by user. Shutting down.") - logging.info("Script interrupted by user") - shut_down() -except Exception: - logging.exception("Error while running script") +import requests +from bs4 import BeautifulSoup +import urllib.parse +import os +import sys +import tldextract +import time +import codecs +import string +import shutil +import re +import uuid + +try: + from os import scandir, walk +except ImportError: + from scandir import scandir, walk +import logging + +#Pay attention to robots.txt + +# current time, used in the names of the folder and the logging file +curtime = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime()) + +# this file should live in the same directory as the script +keywords_file = "keywords_game.txt" + +# this should be a file input later but for now it's an array +# an array of popular domains that university websites link to but we don't want to crawl +ignore_domains = ["youtube", "facebook", "instagram", "twitter", "linkedin", "google", "pinterest", "snapchat"] + +# Arguments in order: url, total pages to look at, depth, first part of directory name +# url to start from +url = sys.argv[1] +# number of pages to iterate through +iterate = int(sys.argv[2]) +# depth to go for +depth_to_go = int(sys.argv[3]) +# directory name +directory = sys.argv[4] +target_dir = directory + "_" + curtime + +# RegEx that is used to filter searches for URLs on any given page. +# Used in is_relevant_link_from_soup and is_relevant_link_from_html functions +filter_regex = re.compile(".*([Pp]rogram|[Aa]dmission|[Cc]ertificate|[Dd]egree|[Dd]iploma|[Ff]aculty|[Ss]chool|[Dd]epartment|[Uu]ndergrad|[Gr]rad).*") +filter_title_regex = re.compile(".*([Pp]rogram|[Aa]dmission|[Cc]ourse).*") + +# Var to choose mode +# "soup" uses BeautifulSoup to assign a name to a page and to search the page for URLs +# "no_soup" uses a string search – splits the page into strings using "href=" as a partition limiter, then goes from there +mode = "soup" # soup or no_soup + + +# Checks if the url includes http at the front +if not url.startswith("http"): + url = "http://" + url +# Extracts the top level domain from the URL (eg. 
ualberta.ca, no slashes) +seed = tldextract.extract(url).domain + +# Set a header to pretend it's a browser +headers = requests.utils.default_headers() +headers.update ( + { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36', + } +) + +# Checks if the directory with the given name already exists +# If it does, tries to continue a script run that was interrupted, using already existing lists of visited_urls and planned_urls +# If it doesn't, starts a new script run +if os.path.isdir(directory): + # Continuing a previous script run + + # Copy the contents of the existing directory to a new timestamped one + shutil.copytree(directory, target_dir) + os.chdir(target_dir) # then change directory to that folder + + # Open the visited_urls text file and count the number of lines in it – that's how many pages the script visited throughout its previous runs + with open("_visited_urls.txt") as f: + for i, l in enumerate(f, start=1): + pass + page = i + + # Open the file with planned urls and add them to the array of planned urls + with open("_planned_urls.txt") as f: + content = f.readlines() + #remove whitespace characters like `\n` at the end of each line + planned = content[page-1:] + plannedURLsArray = [x.strip() for x in planned] + + # Open the file with crawled urls and add them to the array of crawled urls + with open("_crawled_urls.txt") as f: + content = f.readlines() + #remove whitespace characters like `\n` at the end of each line + crawledURLsArray = [x.strip() for x in content] + + # Create a new log file + logging.basicConfig(filename=('_uniscraperlog_' + curtime + '.log'),level=logging.INFO) + # file to log empty requests into + empty_request_log = codecs.open("_empty_requests.txt", "a", "utf-8-sig") + # file to log planned urls into - URLs in the queue, that are planned to go to next (checked against visited) + planned_urls = codecs.open("_planned_urls.txt", "a", "utf-8-sig") + # file to log visited urls into - URLs that have been requested and have the html + visited_urls = codecs.open("_visited_urls.txt", "a", "utf-8-sig") + # file to log crawled urls into - URLs that crawler will "check" against to see if needs logging + crawled_urls = codecs.open("_crawled_urls.txt", "a", "utf-8-sig") + + +else: + current_dir = os.getcwd() + # Start a new script run + os.mkdir(target_dir) # make a timestampted folder + os.chdir(target_dir) # then change directory to that folder + shutil.copyfile(current_dir + "/" + keywords_file, keywords_file) # jump into working directory + # Create a log file in the folder that was just created + logging.basicConfig(filename=('_uniscraperlog_' + curtime + '.log'),level=logging.INFO) + # file to log empty requests into + empty_request_log = codecs.open("_empty_requests.txt", "w", "utf-8-sig") + # file to log planned urls into - URLs in the queue, that are planned to go to next (checked against visited) + planned_urls = codecs.open("_planned_urls.txt", "w", "utf-8-sig") + plannedURLsArray = [] + # file to log visited urls into - URLs that have been requested and have the html + visited_urls = codecs.open("_visited_urls.txt", "w", "utf-8-sig") + # file to log crawled urls into - URLs that crawler will "check" against to see if needs logging + crawled_urls = codecs.open("_crawled_urls.txt", "w", "utf-8-sig") + crawledURLsArray = [] + page = 1 + +# Function that checks if the link provided is in the same domain as the seed +def checkDomain(new_link, cur_link): + new_link_domain = 
tldextract.extract(new_link).domain + + """Decided to not do the can-go-one-domain-away-from-the-seed rule for now. Commented it out. + # 0) check whether new_link is in the list of popular domains that we don't want to crawl, if yes -> IGNORE IT + if new_link_domain in ignore_domains: + return False + """ + # 1) check if new_link is in seed, if yes -> OK + if (new_link_domain == seed): + return True + + """ + # 2) check if cur_link is in seed (you came from the seed even if you're in a different domain now), if yes -> OK + cur_link_domain = tldextract.extract(cur_link).domain + if (cur_link_domain == seed): + return True + # 3) check if the new link is in the same domain as the cur link (you're still in the same domain, even though it's different from seed), if yes -> OK + if (new_link_domain == cur_link_domain): + return True + # otherwise, you're trying to leave a domain that's already not the seed, you should STOP + """ + return False + + +# Fuction for requesting url +# Given a URL, go to that url and get the html and return it +# Called from main function +def request_url(url): + global headers + # Log that this URL is being saved + logging.info('Requesting ' + url) + visited_urls.write(url) + visited_urls.write("\n") + # Use requests module to get html from url as an object + html = '' + try: + head = requests.head(url, headers=headers) + if head.ok and ("text/html" in head.headers["content-type"]): + r = requests.get(url, headers=headers) + if r.ok: + return r + return None + except KeyboardInterrupt: + print("\n\nScript interrupted by user. Shutting down.") + logging.info("Script interrupted by user") + shut_down() + except Exception: + logging.exception("Couldn\'t request " + url) + return None + +# Function to create a filename out of a string +# Called from create_name +def format_filename(name): + #Taken from: https://gist.github.com/seanh/93666 + """Take a string and return a valid filename constructed from the string. + Uses a whitelist approach: any characters not present in valid_chars are + removed. 
Also spaces are replaced with underscores.""" + try: + valid_chars = "-_() %s%s" % (string.ascii_letters, string.digits) + filename = ''.join(c for c in name if c in valid_chars) + # Remove spaces in filename + filename = filename.strip() + filename = filename.replace(' ','_') + except TypeError as e: + filename = str(uuid.uuid4()) + logging.error("Got and error: {}".format(str(e))) + return filename + + +# Function for creating name +# Use the title of the html page as the title of the text file +# Called from process_current_link +# Uses string search to locate the <title> tag +# Parameter html is a string +def create_name_from_html (html): + name_list = (html.partition("</title")[0]).split("<title") #grab part of html before ")[-1] + if name: + # removes invalid characters from title + name = format_filename(name) + '__' + str(time.time()) + logging.info('Created name ' + name) + else: + name = "no_title_" + str(time.time()) # if no title provided give a no title with a timestamp + logging.warn('Failed to create a name, using \'' + name + '\' instead') + return name + +# Function for creating name +# Use the title of the html page as the title of the text file +# Called from process_current_link +# Uses Beautiful Soup to locate the tag +# Parameter soup is a soup object +def create_name_from_soup (soup): + try: + name = soup.title.string + # removes invalid characters from title + name = format_filename(name) + '__' + str(time.time()) + logging.info('Created name ' + name) + except AttributeError as e: + name = "no_title_" + str(time.time()) # if no title provided give a no title with a timestamp + logging.warn('Failed to create a name, using \'' + name + '\' instead') + logging.error(str(e)) + return name + + +#Function for deleting paired single or double quotes +def dequote(s): + """ + If a string has single or double quotes around it, remove them. + Make sure the pair of quotes match. + If a matching pair of quotes is not found, return the string unchanged. + """ + if (len(s)>= 2 and s[0] == s[-1]) and s.startswith(("'", '"')): + s = s[1:-1] + s = s.strip('"\'') + return s + + +# Function that takes link, saves the contents to text file call href_split +# Main function +def crawl(max_pages): + logging.info("Crawling through domain '" + seed + "'") + + if page == 1: + # Array that holds the queue to be visited later + plannedURLsArray.append(url) + # Logging the urls + planned_urls.write(url) + planned_urls.write("\n") + + # Gets the root of the url + url_split = url.split("://", 1) + # Array that holds urls that have been found. + # This is the array that all new URLs are checked against to prevent repeating. 
+ # Record URL with both http and https prefixes + crawledURLsArray.append("http://" + url_split[1]) + crawledURLsArray.append("https://" + url_split[1]) + # Also log the same into the text file + crawled_urls.write("http://" + url_split[1] + "\n") + crawled_urls.write("https://" + url_split[1] + "\n") + + while page <= max_pages and len(plannedURLsArray) > 0: + process_current_link() + + +def is_title_page_relevant(soup): + return True if soup.find('title', string=filter_title_regex) else False + +# Function that grabs the first link in the list of planned urls, requests the page and processes it +def process_current_link (): + global page + + print(plannedURLsArray[0]) + # Try to get the html of the URL + r = request_url(plannedURLsArray[0]) + + if r: #if the request returned an html + html = r.text + current_url = r.url + # Soupify + # For now it soupifies the link regardless of the mode, because it uses soup later to extract visible text from the page + soup = BeautifulSoup(html, 'html.parser') + grab_all = is_title_page_relevant(soup) + + if mode=="no_soup": + # Gets the name for the file to store the html text in + name = create_name_from_html(html) + #find and process all links + process_links_from_html(html, current_url, grab_all) + else: + name = create_name_from_soup(soup) + process_links_from_soup(soup, current_url, grab_all) + + # Adds the .txt to the end of the name + name = "{0}.txt".format(name) + + # Find only visible text + visible_text = extract_text(soup) + + if visible_text: #save it as a text file + try: + # Create and open the file with that name + fo = codecs.open(name, "w", "utf-8-sig") + # Write URL to that file + fo.write(current_url + "\n") + # Append the html to the file + fo.write(visible_text) + # Close the pipe to the file + fo.close() + # Log the creation of the file + logging.info('Created file ' + name) + + except KeyboardInterrupt: + print("\n\nScript interrupted by user. Shutting down.") + logging.info("Script interrupted by user") + shut_down() + except Exception: + logging.exception("Can not encode file: " + current_url) + else: + print('No visible text in ' + url) + logging.warning('No visible text in ' + url) + # Else: html does not exist or is empty. Log error + else: + logging.warning('Request for ' + url + ' returned empty html') + empty_request_log.write(url) + empty_request_log.write("\n") + + # Update on the total number of pages + print("iterations:", page, "pages") + print("\n") + # Deletes the currently looked at URL from the queue + plannedURLsArray.pop(0) + + # Increment page count + page += 1 + # Every 50 pages checks the size of the folder. Prints the amount of data collected in MB to the console and log file + if page%50 == 0: + size_of_directory = get_tree_size(os.curdir) / 1000000 + print("Size: ", str(round(size_of_directory, 5)), "MB") + print('\n') + logging.info("Size: " + str(round(size_of_directory, 5)) + "MB") + # Prints in the log file the length of time the crawler has been running in seconds + logging.info("Has been running for " + str(time.time() - start_time) + " seconds") + # Time delay in seconds to prevent crashing the server + time.sleep(.01) + + +# checks that the text content of the link matches the filter_regex +# input parameter is a soup element!!! 
+def is_relevant_link_from_soup(link): + if link.find(string=filter_regex): + return True + return False + #return True #Uncomment to grab all links + +# takes soup of a page, finds all links on it +# for each link checks if it's relevant +# for each relevant link, saves it to the planned urls array (if it hasn't been crawled yet) +# and to the crawled urls array (so that we don't save it a second time later) +def process_links_from_soup (soup, cur_link, grab_all=False): + # check if the title of the current page matches the filter_title_regex + for lnk in soup.findAll('a', href=True): + # if not, check if the the link itself is relevant + if (grab_all or is_relevant_link_from_soup(lnk)): + new_link = (urllib.parse.urldefrag(lnk['href'])[0]).rstrip('/') + new_link = urllib.parse.urljoin(cur_link, new_link) + # if the link is in our main domain + if checkDomain(new_link, cur_link): + # if the link is not in crawledURLsArray then it appends it to urls and crawledURLsArray + if new_link not in crawledURLsArray: + # Ensures no jpg or pdfs are stored and that no mailto: links are stored. + if new_link.startswith("http") and ('.pdf' not in new_link) and ('.jpg' not in new_link) and ('.mp3' not in new_link): + #???TODO: add checks for www.domain.com and https:// + # Adds new link to array + plannedURLsArray.append(new_link) + # Adds new link to queue file + planned_urls.write(new_link) + planned_urls.write("\n") + + # Remove the front of the URL (http or https) + http_split = new_link.split("://", 1) + + if len(http_split)>1: + # Add all possible link variations to file of URLs that have been looked at + # Adds new link to array + crawledURLsArray.append("http://" + http_split[1]) + # Adds new link to already looked at file + crawled_urls.write("http://" + http_split[1]) + crawled_urls.write("\n") + # Adds new link to array + crawledURLsArray.append("https://" + http_split[1]) + # Adds new link to already looked at file + crawled_urls.write("https://" + http_split[1]) + crawled_urls.write("\n") + +# checks that the text content of the link matches the filter_regex +# input parameter is a string +def is_relevant_link_from_html(link): + if filter_regex.match(link): + return True + return False + #return True #Uncomment to grab all links + +#Take an array of links, run the split on each and add the results to the appropriate arrays and files +def process_links_from_html (html, cur_link, grab_all=False): + print("grabbing all: ", str(grab_all)) + if html.partition('<body')[2]: + html = html.partition('<body')[2] + link_strings = html.split('href=') # split the page into sections using "href=" as a delimiter + for lnk in link_strings[1:]: + href = lnk.partition('</a')[0] # grab all text before the "</a" – this var now contains text after an href parameter and before a closing tag, and thus includes the text content of the link + if (grab_all or is_relevant_link_from_html(href)): + href = href.partition('>')[0] + href = href.partition(' ')[0] + href = dequote(href) + new_link = (urllib.parse.urldefrag(href)[0]).rstrip('/') + new_link = urllib.parse.urljoin(cur_link, new_link) + if checkDomain(new_link, cur_link): + # if the link is not in crawledURLsArray then it appends it to urls and crawledURLsArray + if new_link not in crawledURLsArray: + # Ensures no jpg or pdfs are stored and that no mailto: links are stored. 
+ if new_link.startswith("http") and '.pdf' not in new_link and '.jpg' not in new_link and '.mp3' not in new_link: + #???TODO: add checks for www.domain.com and https:// + # Adds new link to array + plannedURLsArray.append(new_link) + # Adds new link to queue file + planned_urls.write(new_link) + planned_urls.write("\n") + + try: + # Remove the front of the URL (http or https) + http_split = new_link.split("://", 1) + # Add all possible link variations to file of URLs that have been looked at + # Adds new link to array + crawledURLsArray.append("http://" + http_split[1]) + # Adds new link to already looked at file + crawled_urls.write("http://" + http_split[1]) + crawled_urls.write("\n") + # Adds new link to array + crawledURLsArray.append("https://" + http_split[1]) + # Adds new link to already looked at file + crawled_urls.write("https://" + http_split[1]) + crawled_urls.write("\n") + except IndexError as e: + logging.info(str(e)) + + +def extract_text(soup): + """Extract text from HTML pages and Return normalized text + https://stackoverflow.com/questions/30565404/remove-all-style-scripts-and-html-tags-from-an-html-page + return string + """ + for script in soup(["script", "style"]): # remove all javascript and stylesheet code + script.extract() + # get text, the separator keeps the paragraphs their usual short + # https://stackoverflow.com/a/38861217 + text = soup.get_text(separator="\n") + # break into lines and remove leading and trailing space on each + lines = (line.strip() for line in text.splitlines()) + # break multi-headlines into a line each + chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) + # drop blank lines + return '\n'.join(chunk for chunk in chunks if chunk) + + + +# Function to extract text elements from an HTML and return them as an array of BeautifulSoup +# called from process_current_link +def _extract_text(soup): + data = soup.findAll(text=True) + result = filter(is_visible_html_element, data) + all_text = "" + for t in result: + if t.strip(): + all_text += t + "\n" + return all_text + + +# check that the given soup element is a visible text element +# called from extract_text +def is_visible_html_element(element): + if element.parent.name in ['style', 'script', '[document]', 'head', 'title']: + return False + elif re.match('<!--.*-->', str(element.encode('utf-8'))): + return False + return True + + + + + + +# Return total size of files in given path and subdirs by going through the tree. +# Recursive. 
+# Called from main function +def get_tree_size(path): + total = 0 + for entry in scandir(path): + if entry.is_dir(follow_symlinks=False): + total += get_tree_size(entry.path) + else: + total += entry.stat(follow_symlinks=False).st_size + return total + + +# Shut down gracefully and log it +def shut_down(): + global start_time + global logging + global empty_request_log + global visited_urls + global planned_urls + global crawled_urls + + # Get the time that the command finished + end_time = time.time() + # Print overall time taken to console + print("Overall time: " + str((end_time - start_time))) + # Log overall time and save to main log file + logging.info("Overall time: " + str((end_time - start_time))) + # Close all the things/pipes to files + empty_request_log.close() + visited_urls.close() + planned_urls.close() + crawled_urls.close() + + sys.exit() + + +# Get the time that the command was run +start_time = time.time() + +try: + # Call main function + crawl(iterate) + shut_down() +except KeyboardInterrupt: + print("\n\nScript interrupted by user. Shutting down.") + logging.info("Script interrupted by user") + shut_down() +except Exception: + logging.exception("Error while running script") diff --git a/Search-Engine-and-Crawler/Crawler/keywords.txt b/keywords.txt similarity index 100% rename from Search-Engine-and-Crawler/Crawler/keywords.txt rename to keywords.txt diff --git a/Search-Engine-and-Crawler/Crawler/keywords_game.txt b/keywords_game.txt similarity index 100% rename from Search-Engine-and-Crawler/Crawler/keywords_game.txt rename to keywords_game.txt diff --git a/Search-Engine-and-Crawler/Crawler/search.py b/search.py similarity index 100% rename from Search-Engine-and-Crawler/Crawler/search.py rename to search.py From 4437ec161acc2b6eed43c956969d54318db5922f Mon Sep 17 00:00:00 2001 From: Antony Oduor <aowino@gmail.com> Date: Wed, 17 Jan 2018 10:29:20 -0700 Subject: [PATCH 37/50] Rename crawlerExpand to crawler --- crawlerExpand.py => crawler.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename crawlerExpand.py => crawler.py (100%) diff --git a/crawlerExpand.py b/crawler.py similarity index 100% rename from crawlerExpand.py rename to crawler.py From bb6ab81da1880a55a6916cb82701d197a3a685e4 Mon Sep 17 00:00:00 2001 From: Antony Oduor <antouboduor@gmail.com> Date: Wed, 17 Jan 2018 10:35:54 -0700 Subject: [PATCH 38/50] Update README with usage instructions --- README.md | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 8f82cd4..a86f047 100644 --- a/README.md +++ b/README.md @@ -5,4 +5,53 @@ The original crawler can be found in crawler.py crawlerExpand.py separates tasks into functions, implements logging, URL-cleaning etc. crawlerNoBS.py utilizes simple string searches instead of the BeautifulSoup library to find new links -The "Scraper" folder utilizes the same principles as the crawler but combines it with a string search on pages in a single web domain to output a result of searching for keywords instead of saving all pages it encounters. +~~The "Scraper" folder utilizes the same principles as the crawler but combines it with a string search on pages in a single web domain to output a result of searching for keywords instead of saving all pages it encounters.~~ The "Scraper" folder has migrated and the scripts have been moved to the root of the repository i.e. here. 
+ +## Setting up python environment +``` +virtualenv -p python3.5 env3.5 +``` + +## Requirements +``` +pip install requests +pip install bs4 +pip install tldextract +pip install html5 +pip install pandas +pip install tqdm +``` + +### keywords +Create a `keywords.txt` file on this directory with a list of keywords to look for. Each keyword is on a new line. + +## Organization + +There are two scripts. The `crawler.py` and `search.py`. + + +### crawlerExpand.py + +This script collects pages from the given website and stores them locally on your +machine. + +#### Sample usage +``` +python crawlerExpand.py [URL] 10 50 myuni +``` + +### search.py + +This script allows you to search the pages you have collected above using keywords +and generates a Comma Separated Values (CSV) file with all the keywords found, +their frequency and sorted. + +#### Sample usage +``` +./search.py -f myuni -k keywords_game.txt +``` + +#### Help documentation +``` +./search.py -h +``` From bfa0d1ce3f45b8cdfbecbd6f49132c9421006f17 Mon Sep 17 00:00:00 2001 From: Antony Oduor <aowino@gmail.com> Date: Wed, 17 Jan 2018 13:28:33 -0700 Subject: [PATCH 39/50] Add Env Line for Linux Cmd line --- crawler.py | 1 + 1 file changed, 1 insertion(+) mode change 100644 => 100755 crawler.py diff --git a/crawler.py b/crawler.py old mode 100644 new mode 100755 index 86959a3..ff4392b --- a/crawler.py +++ b/crawler.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python import requests from bs4 import BeautifulSoup import urllib.parse From 531eaa051672d55a24d3609714361d334cbc36ee Mon Sep 17 00:00:00 2001 From: Antony Oduor <aowino@gmail.com> Date: Wed, 17 Jan 2018 13:32:21 -0700 Subject: [PATCH 40/50] Remove SIGLARM Stuff Not Working in Windows Signed-off-by: Antony Oduor <aowino@gmail.com> --- search.py | 38 ++++++-------------------------------- 1 file changed, 6 insertions(+), 32 deletions(-) diff --git a/search.py b/search.py index fdbf000..60d6249 100755 --- a/search.py +++ b/search.py @@ -1,6 +1,5 @@ #!/usr/bin/env python import argparse -import signal import sys import re import csv @@ -15,18 +14,6 @@ logger = logging.getLogger(__name__) - -class TookTooDamnLongException(Exception): # Custom exception class - pass - - -def toodamnlong_handler(signum, frame): # Custom signal handler - raise TookTooDamnLongException - - -# Change the behavior of SIGALRM -signal.signal(signal.SIGALRM, toodamnlong_handler) - # current time, used in the names of the folder and the logging file curtime = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime()) # Create a new log file @@ -87,9 +74,7 @@ def main(): logger.info("CSV headers written") for idx, txt_file in enumerate(all_txt_files): - # Start the timer. - # Once [patience] seconds are over, a SIGALRM signal is sent. - signal.alarm(patience) + with open(txt_file) as fp: visible_text_list = fp.readlines() current_url = visible_text_list[0].strip().rstrip() @@ -103,22 +88,11 @@ def main(): # This try/except loop ensures that # you'll catch TookTooDamnLongException when it's sent. # https://stackoverflow.com/questions/25027122/break-the-function-after-certain-time - try: - # counts keywords in page - found_count, found_keywords = count_keywords( - visible_text_list, - keywords - ) - except TookTooDamnLongException: - # TODO: Keep a record of pages that took forever to search - tqdm.write("[{0:0{width}d}] Aarrrgh! " - "TOOK TOO DAMN LONG TO SEARCH! {1}". 
- format(idx+1, current_url, width=num_digits)) - logger.warn("TTDL >>> {} <<<".format(current_url)) - pbar.update(1) - # continue the for loop if count_keywords takes more - # than [patience] seconds - continue + # counts keywords in page + found_count, found_keywords = count_keywords( + visible_text_list, + keywords + ) logger.info("Keywords found: {}".format(found_count)) found_keywords_as_dict = dict((x, y) for x, y in found_keywords) From 6df66f2b04338d581b383e3bd01cdd6724736244 Mon Sep 17 00:00:00 2001 From: Antony Oduor <aowino@gmail.com> Date: Wed, 17 Jan 2018 13:34:45 -0700 Subject: [PATCH 41/50] Adding Support For Windows Machines --- search.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/search.py b/search.py index 60d6249..b9df5b1 100755 --- a/search.py +++ b/search.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*- import argparse import sys import re @@ -67,7 +68,7 @@ def main(): tqdm.write("Found {} files to search. Please wait.". format(len(all_txt_files))) - with open(csv_file_name, 'a+', encoding="utf-8") as f: + with open(csv_file_name, 'a+', encoding="utf-8-sig") as f: # Using dictionary keys as fieldnames for the CSV file header writer = csv.DictWriter(f, headers) writer.writeheader() @@ -75,7 +76,7 @@ def main(): for idx, txt_file in enumerate(all_txt_files): - with open(txt_file) as fp: + with open(txt_file, "r", encoding="utf-8-sig") as fp: visible_text_list = fp.readlines() current_url = visible_text_list[0].strip().rstrip() num_digits = len(str(len(all_txt_files))) From 727e272985f371daa7e4a0a1906b0c685f796791 Mon Sep 17 00:00:00 2001 From: Antony Oduor <aowino@gmail.com> Date: Wed, 17 Jan 2018 13:35:24 -0700 Subject: [PATCH 42/50] Add Packages Requirements File --- requirements.txt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..69f5660 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,15 @@ +beautifulsoup4==4.6.0 +bs4==0.0.1 +certifi==2017.11.5 +chardet==3.0.4 +html5lib==0.999999999 +idna==2.6 +requests==2.18.4 +requests-file==1.4.2 +six==1.11.0 +tldextract==2.2.0 +urllib3==1.22 +webencodings==0.5.1 +Whoosh==2.7.4 +pandas==0.21.1 +tqdm==4.19.5 From 23c2db5bf2faad60902e9ddbca6ecf8b2e683910 Mon Sep 17 00:00:00 2001 From: VITA Lab ReFiG <refig@ualberta.ca> Date: Thu, 18 Jan 2018 16:06:50 -0700 Subject: [PATCH 43/50] Check Link For Unwanted File Extensions Check headers on actual response, was previously requesting twice. First for headers and then page. Signed-off-by: VITA Lab ReFiG <refig@ualberta.ca> --- crawler.py | 130 +++++++++++++++++++++++++++++------------------------ 1 file changed, 71 insertions(+), 59 deletions(-) diff --git a/crawler.py b/crawler.py index ff4392b..4805793 100755 --- a/crawler.py +++ b/crawler.py @@ -43,7 +43,7 @@ # RegEx that is used to filter searches for URLs on any given page. 
# Used in is_relevant_link_from_soup and is_relevant_link_from_html functions -filter_regex = re.compile(".*([Pp]rogram|[Aa]dmission|[Cc]ertificate|[Dd]egree|[Dd]iploma|[Ff]aculty|[Ss]chool|[Dd]epartment|[Uu]ndergrad|[Gr]rad).*") +filter_regex = re.compile(".*([Pp]rogram|[Aa]dmission|[Cc]ertificate|[Dd]egree|[Dd]iploma|[Ff]aculty|[Ss]chool|[Dd]epartment|[Uu]ndergrad|[Gr]rad|[Ss]chool).*") filter_title_regex = re.compile(".*([Pp]rogram|[Aa]dmission|[Cc]ourse).*") # Var to choose mode @@ -165,10 +165,9 @@ def request_url(url): # Use requests module to get html from url as an object html = '' try: - head = requests.head(url, headers=headers) - if head.ok and ("text/html" in head.headers["content-type"]): - r = requests.get(url, headers=headers) - if r.ok: + r = requests.get(url, headers=headers) + if r.ok: + if "text/html" in r.headers["content-type"]: return r return None except KeyboardInterrupt: @@ -253,6 +252,7 @@ def crawl(max_pages): logging.info("Crawling through domain '" + seed + "'") if page == 1: + # Array that holds the queue to be visited later plannedURLsArray.append(url) # Logging the urls @@ -375,34 +375,35 @@ def process_links_from_soup (soup, cur_link, grab_all=False): if (grab_all or is_relevant_link_from_soup(lnk)): new_link = (urllib.parse.urldefrag(lnk['href'])[0]).rstrip('/') new_link = urllib.parse.urljoin(cur_link, new_link) - # if the link is in our main domain - if checkDomain(new_link, cur_link): - # if the link is not in crawledURLsArray then it appends it to urls and crawledURLsArray - if new_link not in crawledURLsArray: - # Ensures no jpg or pdfs are stored and that no mailto: links are stored. - if new_link.startswith("http") and ('.pdf' not in new_link) and ('.jpg' not in new_link) and ('.mp3' not in new_link): - #???TODO: add checks for www.domain.com and https:// - # Adds new link to array - plannedURLsArray.append(new_link) - # Adds new link to queue file - planned_urls.write(new_link) - planned_urls.write("\n") - - # Remove the front of the URL (http or https) - http_split = new_link.split("://", 1) - - if len(http_split)>1: - # Add all possible link variations to file of URLs that have been looked at - # Adds new link to array - crawledURLsArray.append("http://" + http_split[1]) - # Adds new link to already looked at file - crawled_urls.write("http://" + http_split[1]) - crawled_urls.write("\n") + if this_is_not_media(new_link): + # if the link is in our main domain + if checkDomain(new_link, cur_link): + # if the link is not in crawledURLsArray then it appends it to urls and crawledURLsArray + if new_link not in crawledURLsArray: + # Ensures no jpg or pdfs are stored and that no mailto: links are stored. 
+ if new_link.startswith("http") and ('.pdf' not in new_link) and ('.jpg' not in new_link) and ('.mp3' not in new_link): + #???TODO: add checks for www.domain.com and https:// # Adds new link to array - crawledURLsArray.append("https://" + http_split[1]) - # Adds new link to already looked at file - crawled_urls.write("https://" + http_split[1]) - crawled_urls.write("\n") + plannedURLsArray.append(new_link) + # Adds new link to queue file + planned_urls.write(new_link) + planned_urls.write("\n") + + # Remove the front of the URL (http or https) + http_split = new_link.split("://", 1) + + if len(http_split)>1: + # Add all possible link variations to file of URLs that have been looked at + # Adds new link to array + crawledURLsArray.append("http://" + http_split[1]) + # Adds new link to already looked at file + crawled_urls.write("http://" + http_split[1]) + crawled_urls.write("\n") + # Adds new link to array + crawledURLsArray.append("https://" + http_split[1]) + # Adds new link to already looked at file + crawled_urls.write("https://" + http_split[1]) + crawled_urls.write("\n") # checks that the text content of the link matches the filter_regex # input parameter is a string @@ -412,6 +413,16 @@ def is_relevant_link_from_html(link): return False #return True #Uncomment to grab all links +def this_is_not_media(new_link): + path = urllib.parse.urlparse(new_link).path + ext = os.path.splitext(path)[1] + unwanted = ['.mp3', '.mp4', '.doc', '.docx', '.pdf', '.jpg', '.jpg', '.css'] + if ext not in unwanted and new_link.startswith("http"): + return True + else: + return False + + #Take an array of links, run the split on each and add the results to the appropriate arrays and files def process_links_from_html (html, cur_link, grab_all=False): print("grabbing all: ", str(grab_all)) @@ -426,34 +437,35 @@ def process_links_from_html (html, cur_link, grab_all=False): href = dequote(href) new_link = (urllib.parse.urldefrag(href)[0]).rstrip('/') new_link = urllib.parse.urljoin(cur_link, new_link) - if checkDomain(new_link, cur_link): - # if the link is not in crawledURLsArray then it appends it to urls and crawledURLsArray - if new_link not in crawledURLsArray: - # Ensures no jpg or pdfs are stored and that no mailto: links are stored. - if new_link.startswith("http") and '.pdf' not in new_link and '.jpg' not in new_link and '.mp3' not in new_link: - #???TODO: add checks for www.domain.com and https:// - # Adds new link to array - plannedURLsArray.append(new_link) - # Adds new link to queue file - planned_urls.write(new_link) - planned_urls.write("\n") - - try: - # Remove the front of the URL (http or https) - http_split = new_link.split("://", 1) - # Add all possible link variations to file of URLs that have been looked at - # Adds new link to array - crawledURLsArray.append("http://" + http_split[1]) - # Adds new link to already looked at file - crawled_urls.write("http://" + http_split[1]) - crawled_urls.write("\n") + if this_is_not_media(new_link): + if checkDomain(new_link, cur_link): + # if the link is not in crawledURLsArray then it appends it to urls and crawledURLsArray + if new_link not in crawledURLsArray: + # Ensures no jpg or pdfs are stored and that no mailto: links are stored. 
+ if new_link.startswith("http") and '.pdf' not in new_link and '.jpg' not in new_link and '.mp3' not in new_link: + #???TODO: add checks for www.domain.com and https:// # Adds new link to array - crawledURLsArray.append("https://" + http_split[1]) - # Adds new link to already looked at file - crawled_urls.write("https://" + http_split[1]) - crawled_urls.write("\n") - except IndexError as e: - logging.info(str(e)) + plannedURLsArray.append(new_link) + # Adds new link to queue file + planned_urls.write(new_link) + planned_urls.write("\n") + + try: + # Remove the front of the URL (http or https) + http_split = new_link.split("://", 1) + # Add all possible link variations to file of URLs that have been looked at + # Adds new link to array + crawledURLsArray.append("http://" + http_split[1]) + # Adds new link to already looked at file + crawled_urls.write("http://" + http_split[1]) + crawled_urls.write("\n") + # Adds new link to array + crawledURLsArray.append("https://" + http_split[1]) + # Adds new link to already looked at file + crawled_urls.write("https://" + http_split[1]) + crawled_urls.write("\n") + except IndexError as e: + logging.info(str(e)) def extract_text(soup): From 905d0c63b3e82d41e185adf4e37084340df430e8 Mon Sep 17 00:00:00 2001 From: Antony Oduor <aowino@gmail.com> Date: Wed, 31 Jan 2018 07:00:17 -0700 Subject: [PATCH 44/50] Add Scraper Script for Crawling Multiple Websites Takes in a list of URLs from a text file and processes them Signed-off-by: Antony Oduor <aowino@gmail.com> --- README.md | 9 + requirements.txt | 3 + scraper.py | 540 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 552 insertions(+) create mode 100755 scraper.py diff --git a/README.md b/README.md index a86f047..2965f16 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,9 @@ pip install tldextract pip install html5 pip install pandas pip install tqdm +pip install grequests +pip install validators +pip install tld ``` ### keywords @@ -55,3 +58,9 @@ their frequency and sorted. ``` ./search.py -h ``` + + +#### Linux / MacOS / Windows Gotchas! + + +https://stackoverflow.com/questions/19425857/env-python-r-no-such-file-or-directory diff --git a/requirements.txt b/requirements.txt index 69f5660..96533d5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,3 +13,6 @@ webencodings==0.5.1 Whoosh==2.7.4 pandas==0.21.1 tqdm==4.19.5 +grequests=0.3.0 +validators=0.12.0 +tld==0.7.9 diff --git a/scraper.py b/scraper.py new file mode 100755 index 0000000..0ae8ed5 --- /dev/null +++ b/scraper.py @@ -0,0 +1,540 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import argparse +import sys +import re +import urllib.parse +import logging +import os +import time +import codecs +import requests +import string +from bs4 import BeautifulSoup +import tldextract +try: + from os import scandir, walk +except ImportError: + from scandir import scandir, walk +from tqdm import tqdm +import validators +import grequests +from tld import get_tld +from tld.utils import update_tld_names + +# update_tld_names() https://stackoverflow.com/a/22228140 +logger = logging.getLogger(__name__) + +# current time, used in the names of the folder and the logging file +curtime = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime()) +# Create a new log file +logging.basicConfig(filename=('_uniscraperlog_' + curtime + '.log'), + level=logging.DEBUG + ) + +# https://github.com/tqdm/tqdm/issues/481 +tqdm.monitor_interval = 0 + +# RegEx that is used to filter searches for URLs on any given page. 
+# Used in is_relevant_link_from_soup and is_relevant_link_from_html functions +filter_regex = re.compile(".*([Pp]rogram|[Aa]dmission|[Cc]ertificate|[Dd]egree|[Dd]iploma|[Ff]aculty|[Ss]chool|[Dd]epartment|[Uu]ndergrad|[Gr]rad|[Ss]chool).*") +filter_title_regex = re.compile(".*([Pp]rogram|[Aa]dmission|[Cc]ourse).*") + +def main(): + + current_working_dir = os.getcwd() # current directory we are standing on + + websites_list = get_file_content_as_list(websites_file) + + overall_prog = tqdm(total=len(websites_list), unit="website", desc="Overall") + for idx, website in enumerate(websites_list): + planned_urls_array = [] + crawled_urls_array = [] + + # Extracts the top level domain from the URL (eg. ualberta.ca, no slashes) + seed = tldextract.extract(website).domain + + pbar = {} + pbar[idx] = tqdm(total=max_pages, unit="page", desc=website) + if validators.url(website): + batch_website = "{}_{}".format(batch_name, get_tld(website)) + if not os.path.exists(batch_website): + os.mkdir(batch_website) + with ChDir(batch_website): + setup_crawler_files() + start_page = 1 + else: + with ChDir(batch_website): + start_page = get_start_page() + + with ChDir(batch_website): + crawl(seed, pbar[idx], start_page, planned_urls_array, crawled_urls_array, website, max_pages) + overall_prog.update(1) + +def crawl(seed, prog_upd, start_page, planned_urls_array, crawled_urls_array, website, max_pages): + """Function that takes link, saves the contents to text file call href_split + """ + logging.info("Crawling through domain '" + seed + "'") + tqdm.write("++++++++++Crawling through domain {}+++++++++++".format(seed)) + visited_urls, planned_urls, crawled_urls = setup_crawler_files() + + if start_page == 1: + + # Array that holds the queue to be visited later + planned_urls_array.append(website) + # Logging the urls + planned_urls.write(website) + planned_urls.write("\n") + + # Gets the root of the url + url_split = website.split("://", 1) + # Array that holds urls that have been found. + # This is the array that all new URLs are checked against to prevent repeating. 
+ # Record URL with both http and https prefixes + crawled_urls_array.append("http://" + url_split[1]) + crawled_urls_array.append("https://" + url_split[1]) + # Also log the same into the text file + crawled_urls.write("http://" + url_split[1] + "\n") + crawled_urls.write("https://" + url_split[1] + "\n") + + while start_page <= max_pages and len(planned_urls_array) > 0: + start_page = process_current_link(start_page, + prog_upd, + planned_urls_array[0], + seed, + visited_urls, + crawled_urls_array, + crawled_urls, + planned_urls_array, + planned_urls, + max_pages, + ) + prog_upd.update(1) + + # Deletes the currently looked at URL from the queue + planned_urls_array.pop(0) + + +def process_current_link(page, prog_upd, link, seed, visited_urls, crawled_urls_array, crawled_urls, planned_urls_array, planned_urls, max_pages): + """Function that grabs the first link in the + list of planned urls, requests the page and processes it + """ + empty_request_log = codecs.open("_empty_requests.txt", "w", "utf-8") + + # Try to get the html of the URL + r = request_url(link, visited_urls) + + grab_all = False + if r: # if the request returned an html + html = r.text + current_url = r.url + # Soupify + # For now it soupifies the link regardless of the mode, + # because it uses soup later to extract visible text from the page + soup = BeautifulSoup(html, 'html.parser') + grab_all = is_title_page_relevant(soup) + + # Gets the name for the file to store the html text in + name = create_name_from_html(html) + + # find and process all links + process_links_from_html(html, + prog_upd, + current_url, + seed, + crawled_urls_array, + crawled_urls, + planned_urls_array, + planned_urls, + grab_all, + ) + + # Adds the .txt to the end of the name + name = "{0}.txt".format(name) + + # Find only visible text + visible_text = extract_text(soup) + + if visible_text: # save it as a text file + try: + # Create and open the file with that name + fo = codecs.open(name, "w", "utf-8-sig") + # Write URL to that file + fo.write(current_url + "\n") + # Append the html to the file + fo.write(visible_text) + # Close the pipe to the file + fo.close() + # Log the creation of the file + logging.info('Created file ' + name) + + except KeyboardInterrupt: + tqdm.write("Script interrupted by user. Shutting down.") + logging.info("Script interrupted by user") + shut_down() + except Exception: + logging.exception("Can not encode file: " + current_url) + else: + tqdm.write("No visible text in {}".format(link)) + logging.warning('No visible text in ' + link) + # Else: html does not exist or is empty. Log error + else: + logging.warning('Request for ' + link + ' returned empty html') + empty_request_log.write(link) + empty_request_log.write("\n") + + # Update on the total number of pages + num_digits = len(str(max_pages)) + grab_blurb = "grabbing ALL links" if grab_all else "grabbing key links" + tqdm.write("[{0:0{width}d}]:[{1}] – {2}".format(page, grab_blurb.ljust(18), link, width=num_digits)) + + # Increment page count + page += 1 + # Every 50 pages checks the size of the folder. 
Prints the amount of data collected in MB to the console and log file + if page % 50 == 0: + size_of_directory = get_tree_size(os.curdir) / 1000000 + tqdm.write("Size: {} MB".format(str(round(size_of_directory, 5)))) + logging.info("Size: " + str(round(size_of_directory, 5)) + "MB") + # Time delay in seconds to prevent crashing the server + time.sleep(.01) + return page + +def get_tree_size(path): + """Return total size of files in given path and subdirs by going through the tree. + Recursive. + Called from main function + """ + total = 0 + for entry in scandir(path): + if entry.is_dir(follow_symlinks=False): + total += get_tree_size(entry.path) + else: + total += entry.stat(follow_symlinks=False).st_size + return total + +def extract_links_from_page(html_page): + return re.findall(r'<a href="(http[s]?://[^">]*)', html_page) + +def process_links_from_html(html, prog_upd, cur_link, seed, crawled_urls_array, crawled_urls, planned_urls_array, planned_urls, grab_all=False): + """Take an array of links, run the split on each and add the results + to the appropriate arrays and files + """ + links = [] + # tqdm.write("grabbing all {}".format(str(grab_all))) + + if html.partition('<body')[2]: + html = html.partition('<body')[2] + link_strings = html.split('href=') # split the page into sections using "href=" as a delimiter + for lnk in link_strings[1:]: + href = lnk.partition('</a')[0] # grab all text before the "</a" – this var now contains text after an href parameter and before a closing tag, and thus includes the text content of the link + if (grab_all or is_relevant_link_from_html(href)): + href = href.partition('>')[0] + href = href.partition(' ')[0] + href = dequote(href) + new_link = (urllib.parse.urldefrag(href)[0]).rstrip('/') + new_link = urllib.parse.urljoin(cur_link, new_link) + if this_is_not_media(new_link): + if check_domain(new_link, seed): + # if the link is not in crawledURLsArray then it appends it to urls and crawledURLsArray + if new_link not in crawled_urls_array: + # Ensures no jpg or pdfs are stored and that no mailto: links are stored. + if new_link.startswith("http") and '.pdf' not in new_link and '.jpg' not in new_link and '.mp3' not in new_link: + #???TODO: add checks for www.domain.com and https:// + # Adds new link to array + planned_urls_array.append(new_link) + # Adds new link to queue file + planned_urls.write(new_link) + planned_urls.write("\n") + + try: + # Remove the front of the URL (http or https) + http_split = new_link.split("://", 1) + # Add all possible link variations to file of URLs that have been looked at + # Adds new link to array + crawled_urls_array.append("http://" + http_split[1]) + # Adds new link to already looked at file + crawled_urls.write("http://" + http_split[1]) + crawled_urls.write("\n") + # Adds new link to array + crawled_urls_array.append("https://" + http_split[1]) + # Adds new link to already looked at file + crawled_urls.write("https://" + http_split[1]) + crawled_urls.write("\n") + except IndexError as e: + logging.info(str(e)) + + return + +def add_to_crawled_urls_list(new_link, crawled_urls_array, crawled_urls): + """if the link is not in crawled_urls_array then it + appends it to urls and crawled_urls_array + """ + if new_link not in crawled_urls_array: + # Ensures no jpg or pdfs are stored and that no mailto: links are stored. 
+ if new_link.startswith("http") and '.pdf' not in new_link and '.jpg' not in new_link and '.mp3' not in new_link: + #???TODO: add checks for www.domain.com and https:// + try: + # Remove the front of the URL (http or https) + http_split = new_link.split("://", 1) + # Add all possible link variations to file of URLs that have been looked at + # Adds new link to array + crawled_urls_array.append("http://" + http_split[1]) + # Adds new link to already looked at file + crawled_urls.write("http://" + http_split[1]) + crawled_urls.write("\n") + # Adds new link to array + crawled_urls_array.append("https://" + http_split[1]) + # Adds new link to already looked at file + crawled_urls.write("https://" + http_split[1]) + crawled_urls.write("\n") + except IndexError as e: + logging.info(str(e)) + +def add_to_planned_urls_list(new_link, planned_urls_array, planned_urls): + # Adds new link to array + planned_urls_array.append(new_link) + # Adds new link to queue file + planned_urls.write(new_link) + planned_urls.write("\n") + +def is_title_page_relevant(soup): + return True if soup.find('title', string=filter_title_regex) else False + +def this_is_not_media(new_link): + path = urllib.parse.urlparse(new_link).path + ext = os.path.splitext(path)[1] + unwanted = ['.mp3', '.mp4', '.doc', '.docx', '.pdf', '.jpg', '.jpg', '.css'] + if ext not in unwanted and new_link.startswith("http"): + return True + else: + return False + +def create_name_from_html (html): + """Function for creating name + Use the title of the html page as the title of the text file + Called from process_current_link + Uses string search to locate the <title> tag + Parameter html is a string + """ + name_list = (html.partition("</title")[0]).split("<title") #grab part of html before ")[-1] + if name: + # removes invalid characters from title + name = format_filename(name) + '__' + str(time.time()) + logging.info('Created name ' + name) + else: + name = "no_title_" + str(time.time()) # if no title provided give a no title with a timestamp + logging.warn('Failed to create a name, using \'' + name + '\' instead') + return name + +def format_filename(name): + #Taken from: https://gist.github.com/seanh/93666 + """Take a string and return a valid filename constructed from the string. + Uses a whitelist approach: any characters not present in valid_chars are + removed. Also spaces are replaced with underscores.""" + try: + valid_chars = "-_() %s%s" % (string.ascii_letters, string.digits) + filename = ''.join(c for c in name if c in valid_chars) + # Remove spaces in filename + filename = filename.strip() + filename = filename.replace(' ','_') + except TypeError as e: + filename = str(uuid.uuid4()) + logging.error("Got and error: {}".format(str(e))) + return filename + +def is_relevant_link_from_html(link): + """checks that the text content of the link matches the filter_regex + input parameter is a string + """ + if filter_regex.match(link): + return True + return False + #return True #Uncomment to grab all links + +def dequote(s): + """Function for deleting paired single or double quotes + If a string has single or double quotes around it, remove them. + Make sure the pair of quotes match. + If a matching pair of quotes is not found, return the string unchanged. 
+ """ + if (len(s)>= 2 and s[0] == s[-1]) and s.startswith(("'", '"')): + s = s[1:-1] + s = s.strip('"\'') + return s + +def extract_text(soup): + """Extract text from HTML pages and Return normalized text + https://stackoverflow.com/questions/30565404/remove-all-style-scripts-and-html-tags-from-an-html-page + return string + """ + for script in soup(["script", "style"]): # remove all javascript and stylesheet code + script.extract() + # get text, the separator keeps the paragraphs their usual short + # https://stackoverflow.com/a/38861217 + text = soup.get_text(separator="\n") + # break into lines and remove leading and trailing space on each + lines = (line.strip() for line in text.splitlines()) + # break multi-headlines into a line each + chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) + # drop blank lines + return '\n'.join(chunk for chunk in chunks if chunk) + +def request_url(url, visited_urls): + """Fuction for requesting url + Given a URL, go to that url and get the html and return it + Called from main function + """ + # Set a header to pretend it's a browser + headers = requests.utils.default_headers() + headers.update ( + { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36', + } + ) + + # Log that this URL is being saved + logging.info('Requesting ' + url) + visited_urls.write(url) + visited_urls.write("\n") + # Use requests module to get html from url as an object + html = '' + try: + r = requests.get(url, headers=headers) + if r.ok: + if "text/html" in r.headers["content-type"]: + return r + return None + except requests.exceptions.Timeout: + # Maybe set up for a retry, or continue in a retry loop + print("\nTook too long to get the page.") + logging.info("Took too long to get the page.") + except requests.exceptions.RequestException as e: + # catastrophic error. bail. + print("\nCannot get the page.") + logging.info("Cannot get the page.") + except KeyboardInterrupt: + print("\n\nScript interrupted by user. Shutting down.") + logging.info("Script interrupted by user") + shut_down() + except Exception: + logging.exception("Couldn\'t request " + url) + return None + +def get_start_page(): + """Open the visited_urls text file and count the number of lines + in it – that's how many pages the script visited + throughout its previous runs + """ + i = 1 + with open("_visited_urls.txt") as f: + for i, l in enumerate(f, start=1): + pass + page = i + return page + +class ChDir(object): + """ + Step into a directory context on which to operate on. 
+ """ + def __init__(self, path): + self.old_dir = os.getcwd() + self.new_dir = path + + def __enter__(self): + os.chdir(self.new_dir) + + def __exit__(self, *args): + os.chdir(self.old_dir) + +def get_file_content_as_list(file_name): + """Give a filename, open and read the contents into a list + file_name - file to be opened + return list of words + """ + with open(file_name, 'r') as file_name_handle: + return file_name_handle.read().splitlines() + +def setup_crawler_files(): + # Open the visited_urls text file + visited_handler = codecs.open("_visited_urls.txt", "a+", "utf-8") + + # Open the file with planned urls and add them to the array of planned urls + planned_handler = codecs.open("_planned_urls.txt", "a+", "utf-8") + + # Open the file with crawled urls and add them to the array of crawled urls + crawled_handler = codecs.open("_crawled_urls.txt", "a+", "utf-8") + + return visited_handler, planned_handler, crawled_handler + +def check_domain(new_link, seed): + """Function that checks if the link provided is in the + same domain as the seed + return: boolean + """ + new_link_domain = tldextract.extract(new_link).domain + if (new_link_domain == seed): + return True + return False + +# Shut down gracefully and log it +def shut_down(): + # TODO Close all the things/pipes to files + sys.exit() + +if __name__ == "__main__": + + parser = argparse.ArgumentParser( + description='Crawl and scrape a list of URLs for further searching.') + + parser.add_argument( + '-w', + '--websites', + dest='websites', + default=None, + required=True, + help='The file containing list of websites URLs (mandatory)' + ) + parser.add_argument( + '-b', + '--batch', + dest='batch', + default=None, + required=True, + help='Name for this batch of processing (mandatory)' + ) + parser.add_argument( + '-r', + '--resume', + dest='resume', + default=30, + required=False, + help="Check if the given batch exists and attempt to resume" \ + " if not complete." + ) + parser.add_argument( + '-m', + '--max_pages', + dest='max_pages', + default=10000, + required=False, + help="The maximum number of pages to crawl per website" + ) + + # these are module global variables and can be access by any function in + # this module + args = parser.parse_args() + websites_file = args.websites + batch_name = args.batch + resume_attempt = args.resume + max_pages = int(args.max_pages) + + try: + main() + except KeyboardInterrupt as e: + logger.info("Script interrupted by user") + try: + sys.exit(0) + except SystemExit: + os._exit(0) From 02d2896ac40ebc22cc2fa4f30a31768fe80de141 Mon Sep 17 00:00:00 2001 From: Antony Oduor Date: Mon, 5 Feb 2018 14:01:52 -0700 Subject: [PATCH 45/50] Rotate Scraper Logs From Becoming Bulky The universal scraper log file can get very big, very fast this changes enable log rotation and also move then within the relavant batch website folder. 
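As a rough illustration (a minimal, hypothetical sketch, not the exact
code in setup_rotating_log below; the file and logger names are
placeholders), the rotation settings used here boil down to:

    import logging
    import logging.handlers

    logger = logging.getLogger("uniscraper_example")  # placeholder name
    logger.setLevel(logging.DEBUG)
    handler = logging.handlers.RotatingFileHandler(
        "_uniscraperlog_example.log",  # placeholder path
        maxBytes=2097152,              # start a new file at ~2 MB
        backupCount=100,               # keep at most 100 rotated files
    )
    logger.addHandler(handler)
    logger.info("log output now rotates instead of growing one huge file")

With these limits a single website's logs should never take more than
roughly 200 MB on disk.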
--- scraper.py | 70 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 24 deletions(-) diff --git a/scraper.py b/scraper.py index 0ae8ed5..838ed4b 100755 --- a/scraper.py +++ b/scraper.py @@ -5,6 +5,7 @@ import re import urllib.parse import logging +import logging.handlers import os import time import codecs @@ -24,13 +25,7 @@ # update_tld_names() https://stackoverflow.com/a/22228140 logger = logging.getLogger(__name__) - -# current time, used in the names of the folder and the logging file -curtime = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime()) -# Create a new log file -logging.basicConfig(filename=('_uniscraperlog_' + curtime + '.log'), - level=logging.DEBUG - ) +logger.setLevel(logging.DEBUG) # https://github.com/tqdm/tqdm/issues/481 tqdm.monitor_interval = 0 @@ -67,14 +62,34 @@ def main(): with ChDir(batch_website): start_page = get_start_page() + setup_rotating_log(batch_website, seed) + with ChDir(batch_website): crawl(seed, pbar[idx], start_page, planned_urls_array, crawled_urls_array, website, max_pages) overall_prog.update(1) +def setup_rotating_log(batch_website, seed): + with ChDir(batch_website): + # current time, used in the names of the folder and the logging file + curtime = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime()) + logs_dir = "logs" + if not os.path.exists(logs_dir): + os.mkdir(logs_dir) + + log_file_name = '_uniscraperlog_{}_{}.log'.format(seed, curtime) + path_to_log_file = os.path.join(logs_dir, log_file_name) + # add a rotating logfile handler + handler = logging.handlers.RotatingFileHandler( + path_to_log_file, + maxBytes=2097152, # 2 MB + backupCount=100 + ) + logger.addHandler(handler) + def crawl(seed, prog_upd, start_page, planned_urls_array, crawled_urls_array, website, max_pages): """Function that takes link, saves the contents to text file call href_split """ - logging.info("Crawling through domain '" + seed + "'") + logger.info("Crawling through domain '" + seed + "'") tqdm.write("++++++++++Crawling through domain {}+++++++++++".format(seed)) visited_urls, planned_urls, crawled_urls = setup_crawler_files() @@ -166,20 +181,20 @@ def process_current_link(page, prog_upd, link, seed, visited_urls, crawled_urls_ # Close the pipe to the file fo.close() # Log the creation of the file - logging.info('Created file ' + name) + logger.info('Created file ' + name) except KeyboardInterrupt: tqdm.write("Script interrupted by user. Shutting down.") - logging.info("Script interrupted by user") + logger.info("Script interrupted by user") shut_down() except Exception: - logging.exception("Can not encode file: " + current_url) + logger.exception("Can not encode file: " + current_url) else: tqdm.write("No visible text in {}".format(link)) - logging.warning('No visible text in ' + link) + logger.warning('No visible text in ' + link) # Else: html does not exist or is empty. 
Log error else: - logging.warning('Request for ' + link + ' returned empty html') + logger.warning('Request for ' + link + ' returned empty html') empty_request_log.write(link) empty_request_log.write("\n") @@ -194,7 +209,7 @@ def process_current_link(page, prog_upd, link, seed, visited_urls, crawled_urls_ if page % 50 == 0: size_of_directory = get_tree_size(os.curdir) / 1000000 tqdm.write("Size: {} MB".format(str(round(size_of_directory, 5)))) - logging.info("Size: " + str(round(size_of_directory, 5)) + "MB") + logger.info("Size: " + str(round(size_of_directory, 5)) + "MB") # Time delay in seconds to prevent crashing the server time.sleep(.01) return page @@ -261,7 +276,7 @@ def process_links_from_html(html, prog_upd, cur_link, seed, crawled_urls_array, crawled_urls.write("https://" + http_split[1]) crawled_urls.write("\n") except IndexError as e: - logging.info(str(e)) + logger.info(str(e)) return @@ -288,7 +303,7 @@ def add_to_crawled_urls_list(new_link, crawled_urls_array, crawled_urls): crawled_urls.write("https://" + http_split[1]) crawled_urls.write("\n") except IndexError as e: - logging.info(str(e)) + logger.info(str(e)) def add_to_planned_urls_list(new_link, planned_urls_array, planned_urls): # Adds new link to array @@ -322,10 +337,10 @@ def create_name_from_html (html): if name: # removes invalid characters from title name = format_filename(name) + '__' + str(time.time()) - logging.info('Created name ' + name) + logger.info('Created name ' + name) else: name = "no_title_" + str(time.time()) # if no title provided give a no title with a timestamp - logging.warn('Failed to create a name, using \'' + name + '\' instead') + logger.warn('Failed to create a name, using \'' + name + '\' instead') return name def format_filename(name): @@ -341,7 +356,7 @@ def format_filename(name): filename = filename.replace(' ','_') except TypeError as e: filename = str(uuid.uuid4()) - logging.error("Got and error: {}".format(str(e))) + logger.error("Got and error: {}".format(str(e))) return filename def is_relevant_link_from_html(link): @@ -395,7 +410,7 @@ def request_url(url, visited_urls): ) # Log that this URL is being saved - logging.info('Requesting ' + url) + logger.info('Requesting ' + url) visited_urls.write(url) visited_urls.write("\n") # Use requests module to get html from url as an object @@ -409,19 +424,25 @@ def request_url(url, visited_urls): except requests.exceptions.Timeout: # Maybe set up for a retry, or continue in a retry loop print("\nTook too long to get the page.") - logging.info("Took too long to get the page.") + logger.info("Took too long to get the page.") except requests.exceptions.RequestException as e: # catastrophic error. bail. print("\nCannot get the page.") - logging.info("Cannot get the page.") + logger.info("Cannot get the page.") except KeyboardInterrupt: print("\n\nScript interrupted by user. 
Shutting down.") - logging.info("Script interrupted by user") + logger.info("Script interrupted by user") shut_down() except Exception: - logging.exception("Couldn\'t request " + url) + logger.exception("Couldn\'t request " + url) return None +def exception(request, exception): + print("Problem: {}: {}".format(request.url, exception)) + +def request_urls(urls_list): + results = grequests.map((grequests.get(u) for u in urls_list), exception_handler=exception, size=5) + def get_start_page(): """Open the visited_urls text file and count the number of lines in it – that's how many pages the script visited @@ -437,6 +458,7 @@ def get_start_page(): class ChDir(object): """ Step into a directory context on which to operate on. + https://pythonadventures.wordpress.com/2013/12/15/chdir-a-context-manager-for-switching-working-directories/ """ def __init__(self, path): self.old_dir = os.getcwd() From 18634ebbf7c0692122f14c786a865ef9fd9d9c61 Mon Sep 17 00:00:00 2001 From: VITA Lab ReFiG Date: Mon, 5 Feb 2018 16:30:36 -0700 Subject: [PATCH 46/50] Fix Encoding Issues Running on py -3.5 Env --- scraper.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scraper.py b/scraper.py index 838ed4b..73642cb 100755 --- a/scraper.py +++ b/scraper.py @@ -9,7 +9,6 @@ import os import time import codecs -import requests import string from bs4 import BeautifulSoup import tldextract @@ -22,6 +21,7 @@ import grequests from tld import get_tld from tld.utils import update_tld_names +import requests # update_tld_names() https://stackoverflow.com/a/22228140 logger = logging.getLogger(__name__) @@ -201,7 +201,7 @@ def process_current_link(page, prog_upd, link, seed, visited_urls, crawled_urls_ # Update on the total number of pages num_digits = len(str(max_pages)) grab_blurb = "grabbing ALL links" if grab_all else "grabbing key links" - tqdm.write("[{0:0{width}d}]:[{1}] – {2}".format(page, grab_blurb.ljust(18), link, width=num_digits)) + tqdm.write("[{0:0{width}d}]:[{1}] - {2}".format(page, grab_blurb.ljust(18), link, width=num_digits)) # Increment page count page += 1 @@ -420,6 +420,7 @@ def request_url(url, visited_urls): if r.ok: if "text/html" in r.headers["content-type"]: return r + logger.info(str(r)) return None except requests.exceptions.Timeout: # Maybe set up for a retry, or continue in a retry loop @@ -449,7 +450,7 @@ def get_start_page(): throughout its previous runs """ i = 1 - with open("_visited_urls.txt") as f: + with open("_visited_urls.txt", "r", encoding="utf-8-sig") as f: for i, l in enumerate(f, start=1): pass page = i From 1938190554b0801b477cb3c937f26cce5a6451da Mon Sep 17 00:00:00 2001 From: VITA Lab ReFiG Date: Tue, 6 Feb 2018 09:01:44 -0700 Subject: [PATCH 47/50] Fix UnicodeEncodeError for tqdm on Windows --- scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scraper.py b/scraper.py index 73642cb..f3aba90 100755 --- a/scraper.py +++ b/scraper.py @@ -50,7 +50,7 @@ def main(): seed = tldextract.extract(website).domain pbar = {} - pbar[idx] = tqdm(total=max_pages, unit="page", desc=website) + pbar[idx] = tqdm(total=max_pages, unit="page", desc=website, ascii=True) if validators.url(website): batch_website = "{}_{}".format(batch_name, get_tld(website)) if not os.path.exists(batch_website): From 0f01b79e6b16a50bb635fc4b94a3e41f3b4a5e26 Mon Sep 17 00:00:00 2001 From: VITA Lab ReFiG Date: Tue, 6 Feb 2018 14:36:36 -0700 Subject: [PATCH 48/50] Handle Unicode Errors on Windows tqdm write crashed on power shell printing URLs with unicode / special characters 
like \u2019

---
 scraper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scraper.py b/scraper.py
index f3aba90..1138e65 100755
--- a/scraper.py
+++ b/scraper.py
@@ -201,7 +201,7 @@ def process_current_link(page, prog_upd, link, seed, visited_urls, crawled_urls_
     # Update on the total number of pages
     num_digits = len(str(max_pages))
     grab_blurb = "grabbing ALL links" if grab_all else "grabbing key links"
-    tqdm.write("[{0:0{width}d}]:[{1}] - {2}".format(page, grab_blurb.ljust(18), link, width=num_digits))
+    tqdm.write("[{0:0{width}d}]:[{1}] - {2}".format(page, grab_blurb.ljust(18), link.encode("ascii", "ignore"), width=num_digits))

     # Increment page count
     page += 1

From 9bb1c129e2d551e5586dc6d673d7879107667eff Mon Sep 17 00:00:00 2001
From: VITA Lab ReFiG <refig@ualberta.ca>
Date: Fri, 9 Feb 2018 13:25:04 -0700
Subject: [PATCH 49/50] Close and Remove Log File Handler

Properly close and remove the log file handler before moving on to the
next website, so that log entries are no longer written to the
directory of a previously processed site. This ensures that logging
always follows the website currently being processed.

---
 scraper.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/scraper.py b/scraper.py
index 1138e65..11331fa 100755
--- a/scraper.py
+++ b/scraper.py
@@ -62,11 +62,13 @@ def main():
         with ChDir(batch_website):
             start_page = get_start_page()

-        setup_rotating_log(batch_website, seed)
+        handler = setup_rotating_log(batch_website, seed)

         with ChDir(batch_website):
             crawl(seed, pbar[idx], start_page, planned_urls_array, crawled_urls_array, website, max_pages)
         overall_prog.update(1)
+        handler.close()
+        logger.removeHandler(handler)

 def setup_rotating_log(batch_website, seed):
     with ChDir(batch_website):
@@ -85,6 +87,7 @@ def setup_rotating_log(batch_website, seed):
             backupCount=100
         )
     logger.addHandler(handler)
+    return handler

 def crawl(seed, prog_upd, start_page, planned_urls_array, crawled_urls_array, website, max_pages):
     """Function that takes link, saves the contents to text file call href_split

From 245d1d30f34dc1e3508f11969f83850d772415c5 Mon Sep 17 00:00:00 2001
From: Antony Oduor <aowino@gmail.com>
Date: Tue, 6 Mar 2018 18:40:50 -0700
Subject: [PATCH 50/50] Add Boost Terms to Refine Sorting Order

Boost terms are search keywords that are counted on the page and given
a standard weight (currently 10) in order to adjust the overall
relevancy ranking of a page.

For example, a scraped page containing "program", "undergrad",
"academic", etc. will get additional weighted points so that it is
ranked higher than a similar page from a non-relevant section of the
website, such as a blog.

A new file, boost_terms.txt, whose name is hard-coded in the script,
has been added. Edit the file to add new boost terms.
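As a rough sketch of the intended arithmetic (simplified and
hypothetical, not the exact code added to search.py, which also derives
a combined freq_boost_sum column):

    BOOST_WEIGHT = 10  # the standard weight mentioned above

    def boosted_score(keyword_hits, boost_term_hits, weight=BOOST_WEIGHT):
        """Return (frequency_sum, boost_sum, total) for one scraped page."""
        frequency_sum = sum(keyword_hits.values())
        boost_sum = sum(boost_term_hits.values()) * weight
        return frequency_sum, boost_sum, frequency_sum + boost_sum

    # A program page with 3 keyword hits plus boost terms "program" and
    # "undergrad" scores (3, 20, 23); a blog page with the same keyword
    # hits but no boost terms scores (3, 0, 3) and sorts below it.
    print(boosted_score({"admission": 3}, {"program": 1, "undergrad": 1}))
    print(boosted_score({"admission": 3}, {}))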
Signed-off-by: Antony Oduor --- boost_terms.txt | 10 ++++++++ search.py | 65 +++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 67 insertions(+), 8 deletions(-) create mode 100644 boost_terms.txt diff --git a/boost_terms.txt b/boost_terms.txt new file mode 100644 index 0000000..4ac5ccb --- /dev/null +++ b/boost_terms.txt @@ -0,0 +1,10 @@ +Program +Admission +Certificate +Degree +Diploma +Faculty +School +Department +Undergrad +Grad diff --git a/search.py b/search.py index b9df5b1..a7776e9 100755 --- a/search.py +++ b/search.py @@ -48,6 +48,11 @@ def main(): format(folder_name)) sys.exit() + # read boost terms from file into a list + boost_terms = get_file_content_as_list('boost_terms.txt') + # make the terms lowercase + boost_terms = [x.lower() for x in boost_terms] + # read keywords from file into a list keywords = get_file_content_as_list(keywords_file) # make the keywords lowercase @@ -62,7 +67,9 @@ def main(): headers = [str(x) for x, y in sorted_keywords_list] # prepend url header onto the keywords list headers.insert(0, u'url') - headers.insert(1, u'frequency_sum') + headers.insert(1, u'freq_boost_sum') + headers.insert(2, u'frequency_sum') + headers.insert(3, u'boost_sum') pbar = tqdm(total=len(all_txt_files)) tqdm.write("Found {} files to search. Please wait.". @@ -90,11 +97,17 @@ def main(): # you'll catch TookTooDamnLongException when it's sent. # https://stackoverflow.com/questions/25027122/break-the-function-after-certain-time # counts keywords in page - found_count, found_keywords = count_keywords( + found_count, found_keywords, broad_terms_sum = count_keywords( visible_text_list, + boost_terms, keywords ) + if broad_terms_sum < 1: + # if none of the terms exist, don't event bother + pbar.update(1) + continue + logger.info("Keywords found: {}".format(found_count)) found_keywords_as_dict = dict((x, y) for x, y in found_keywords) @@ -115,7 +128,10 @@ def main(): # prepend the current URL onto the frequencies dict object freq_sum = sum(final_csv_dict[0].values()) - final_csv_dict[0]['frequency_sum'] = freq_sum + boost_multiplied = broad_terms_sum*10 + final_csv_dict[0]['boost_sum'] = boost_multiplied + final_csv_dict[0]['frequency_sum'] = freq_sum+boost_multiplied + final_csv_dict[0]['freq_boost_sum'] = (freq_sum+boost_multiplied)+boost_multiplied final_csv_dict[0]['url'] = current_url # ignore zero frequency_sum... @@ -138,7 +154,13 @@ def sort_csv(csv_input, csv_output): summation to the lowest. """ df = pd.read_csv(csv_input) - df = df.sort_values(['frequency_sum'], ascending=[0]) + df = df.sort_values(['freq_boost_sum'], ascending=[0]) + + # remove duplicates + print(df.shape) + df.drop_duplicates(subset=['url'], keep='first', inplace=True) + print(df.shape) + df.to_csv(csv_output, index=False) @@ -165,16 +187,20 @@ def get_file_content_as_list(file_name): return file_name_handle.read().splitlines() -def count_keywords(list_of_tokens, list_of_target_words): +def count_keywords(list_of_tokens, list_of_boost_terms, list_of_target_words): """Counts how many instances of the keywords were found list_of_tokens - The list of words as haystack - list_of_target_words - The list of words as needle + boost_terms - The list of broader terms to check for after keywords search. + E.g. if words "program", "academic" appear then boost this page further. 
+ keywords - The list of words as needle return number of words, list of keywords found Inspiration: http://www.cademuir.eu/blog/2011/10/20/python-searching-for-a-string-within-a-list-list-comprehension/ https://developmentality.wordpress.com/2011/09/22/python-gotcha-word-boundaries-in-regular-expressions/ """ num_target_words = 0 + total_weights_sum = 0 + num_target_terms = 0 matched_words = [] for token in list_of_target_words: # Goes through the tokens in the list weighted_token, token_weight = strip_weights(token) @@ -185,8 +211,32 @@ def count_keywords(list_of_tokens, list_of_target_words): found_what = [m.group(1) for l in list_of_tokens for m in [regex.search(l)] if m] if len(found_what) > 0: # For each one it checks if it is in the target list num_target_words = len(found_what)*int(token_weight) + total_weights_sum = total_weights_sum + int(token_weight) matched_words.append((weighted_token, num_target_words)) - return num_target_words, matched_words # Note that we are returning a tuple (2 values) + + if total_weights_sum > len(found_what): # check that + num_target_terms, matched_terms = relevancy_boost(list_of_tokens, list_of_boost_terms) + # print(num_target_terms, matched_terms) + + return num_target_words, matched_words, num_target_terms # Note that we are returning a tuple (2 values) + + +def relevancy_boost(list_of_tokens, boost_terms): + num_target_words = 0 + total_terms_count = 0 + matched_words = [] + for term in boost_terms: # Goes through the tokens in the list + weighted_term, term_weight = strip_weights(term) + + # regex = re.compile(".*({}).*".format(token)) # does match in-word substrings + regex = re.compile(".*(\\b{}\\b).*".format(weighted_term)) # match strictly whole words only + # found_what = [m.group(0) for l in list_of_target_words for m in [regex.search(l)] if m] + found_what = [m.group(1) for l in list_of_tokens for m in [regex.search(l)] if m] + if len(found_what) > 0: # For each one it checks if it is in the target list + num_target_words = len(found_what)*int(term_weight) + total_terms_count = total_terms_count + len(found_what) + matched_words.append((weighted_term, num_target_words)) + return total_terms_count, matched_words # Note that we are returning a tuple (2 values) if __name__ == "__main__": @@ -195,7 +245,6 @@ def count_keywords(list_of_tokens, list_of_target_words): description='Generate a sorted CSV file with keyword frequencies' ' from scraped web pages.' ) - parser.add_argument( '-f', '--folder',