From e473b6ee6d9edadef546bb1ec82313b1abed548f Mon Sep 17 00:00:00 2001
From: dimits-exe <70580393+dimits-exe@users.noreply.github.com>
Date: Sun, 2 Jan 2022 18:43:01 +0200
Subject: [PATCH] Implement loading of testing data

---
 Exercise2/load_imdb.py | 141 ++++++++++++++++++++++++++++++-----------
 1 file changed, 104 insertions(+), 37 deletions(-)

diff --git a/Exercise2/load_imdb.py b/Exercise2/load_imdb.py
index b9e7e1f..67f7497 100644
--- a/Exercise2/load_imdb.py
+++ b/Exercise2/load_imdb.py
@@ -27,16 +27,14 @@ def get_training_data(dir_path, ignored_count, max_count, sample_size = 5000):
     """
     Reads text from the files in a directory and returns a dictionary with
     the M most frequent words and their wordinfo, ignoring the N most
     common ones.
-    
+
     Parameters:
         dir_path (str): The path to the directory with all the training data
         ignored_count (int): How many of the most common words will be excluded
         max_count (int): How many words will be included
         sample_size (int): The maximum numbers of files read
-
     Returns: (dict) A dictionary with [word (str)] : [word_info (WordInfo)]
     records
-
     Throws: ValueError: if the ignored and max counts are negative
     """
@@ -53,21 +51,18 @@ def get_training_data(dir_path, ignored_count, max_count, sample_size = 5000):
                      ';', ':', '?']

     print("Loading postive training data...")
-    _read_reviews(training_data, neg_dir_path, False, sample_size // 2,
+    _read_training_data(training_data, neg_dir_path, False, sample_size // 2,
                   ignored_chars)

     print("Loading negative training data...")
-    _read_reviews(training_data, pos_dir_path, True, sample_size // 2,
+    _read_training_data(training_data, pos_dir_path, True, sample_size // 2,
                   ignored_chars)

     print("Loading succesfull.")
-    
+
     if len(training_data) <= ignored_count:
         return training_data

-    # is this how Im supposed to write code in this cursed language? no, *this* is :sunglasses:
-    # TODO: remove this comment after getting flexed on by superior code. more :sunglasses:
-
     # sort all the frequencies to find the upper and lower threshold for words
     frequencies = [info.frequency for info in training_data.values()]
     sorted_frequencies = sorted(frequencies, reverse=True) # biggest to smallest
@@ -83,18 +78,34 @@ def get_training_data(dir_path, ignored_count, max_count, sample_size = 5000):
     check_threshold = lambda freq: lower_threshold <= freq <= upper_threshold
     return {k:v for k, v in training_data.items() if check_threshold(v.frequency)}

-def get_testing_data(directory_path):
+def get_testing_data(dir_path, sample_size = 5000):
     """
     Return a dictionary containing the contents of every data file.

     Parameters:
         directory_path (string): The path to the directory containing ALL
         the testing data files
-
     Returns: (dict) A dictionary with [review (string)] : [is_positive (bool)] records
     """
+    testing_data = dict()
+    neg_dir_path = os.path.join(dir_path, "neg")
+    pos_dir_path = os.path.join(dir_path, "pos")
+    ignored_chars = ['"', "'", '.', ',', '>', '<', '\\', '/', '-', '(', ')',
+                     ';', ':', '?']
+
+    print("Loading postive testing data...")
+    _read_testing_data(testing_data, neg_dir_path, False, sample_size // 2,
+                       ignored_chars)
+
+    print("Loading negative testing data...")
+    _read_testing_data(testing_data, pos_dir_path, True, sample_size // 2,
+                       ignored_chars)
+
+    print("Loading succesfull.")
+
+    return testing_data

-def _read_reviews(wordinfo_dict, dir_path, is_positive, sample_size, ignored_chars):
+def _read_training_data(wordinfo_dict, dir_path, is_positive, sample_size, ignored_chars):
     """
     Opens all files in a directory and appends [word] : [WordInfo]
     records in the provided dictionary.
@@ -110,7 +121,7 @@ def _read_reviews(wordinfo_dict, dir_path, is_positive, sample_size, ignored_cha
     files = os.listdir(dir_path)
     ignored_chars_pattern = re.compile("[%s\\d]" % (re.escape("".join(ignored_chars))))
     one_tenth_progress = min(len(files), sample_size) // 10
-    
+
     for count, file in enumerate(files):
         # check if file limit has been reached
         if count == sample_size:
@@ -128,16 +139,6 @@ def _read_reviews(wordinfo_dict, dir_path, is_positive, sample_size, ignored_cha
         clean_contents = ignored_chars_pattern.sub("", contents, 0)
         clean_contents = re.sub("\\s+", " ", clean_contents)

-        # for each text that changes look if everything is ok regarding ignored
-        # TODO: remove
-        """
-        print(contents)
-        print()
-        print(clean_contents)
-        input()
-        print()
-        """
-
         # update frequencies
         for word in clean_contents.split(" "):
             wordinfo = wordinfo_dict.setdefault(word, WordInfo(word))
@@ -149,28 +150,94 @@ def _read_reviews(wordinfo_dict, dir_path, is_positive, sample_size, ignored_cha
         if (count + 1) % one_tenth_progress == 0: # `count` is 0-indexed
             print("%d%% complete..." % ((count + 1) // one_tenth_progress * 10))

+def _read_testing_data(review_dict, dir_path, is_positive, sample_size, ignored_chars):
+    """
+    Opens all files in a directory and appends [review_contents (string)]
+    : [is_positive (bool)] records in the provided dictionary.
+
+    Parameters:
+        review_dict (dictionary): The dictionary to which the results of the file
+        scan will be added.
+        dir_path (str): The directory containing the training data
+        is_positive (bool): Indicates if the directory contains positive reviews
+        ignored_chars (list): The characters that will be removed from the text
+        of each file
+    """
+    files = os.listdir(dir_path)
+    ignored_chars_pattern = re.compile("[%s\\d]" % (re.escape("".join(ignored_chars))))
+    one_tenth_progress = min(len(files), sample_size) // 10
+
+    for count, file in enumerate(files):
+        # check if file limit has been reached
+        if count == sample_size:
+            break
+
+        # read file contents
+        try:
+            file_path = os.path.join(dir_path, file)
+            f_handle = open(file_path, mode='r', encoding= "utf8")
+            contents = f_handle.read().lower()
+        finally:
+            f_handle.close()
+
+        # clean contents
+        clean_contents = ignored_chars_pattern.sub("", contents, 0)
+        clean_contents = re.sub("\\s+", " ", clean_contents)
+
+        #update dict
+        review_dict[clean_contents] = True if is_positive else False
+
+        # show % progress
+        if (count + 1) % one_tenth_progress == 0: # `count` is 0-indexed
+            print("%d%% complete..." % ((count + 1) // one_tenth_progress * 10))
+
 def main():
     import time

-    path_to_file = os.path.dirname(os.path.abspath(__file__))
-    directory = os.path.join(path_to_file, "aclImdb", "train")
+    def test_train():
+        path_to_file = os.path.dirname(os.path.abspath(__file__))
+        directory = os.path.join(path_to_file, "aclImdb", "train")
+
+        try:
+            start = time.perf_counter()
+            results = get_training_data(directory, 5, 10, 10000)
+            end = time.perf_counter()
+
+        except os.error as err:
+            print(f"Process didn't complete normally due to: {err}")
+
+        else:
+            get_freq = lambda item: item[1].frequency
+            items_by_freq = sorted(results.items(), key = get_freq, reverse=True)
+
+            for word, wordinfo in items_by_freq:
+                print(word, wordinfo.frequency)
+
+            print(f"Process ended after {end-start} seconds.")
+
+    def test_test():
+        path_to_file = os.path.dirname(os.path.abspath(__file__))
+        directory = os.path.join(path_to_file, "aclImdb", "test")
+
+        try:
+            start = time.perf_counter()
+            results = get_testing_data(directory, 5000)
+            end = time.perf_counter()

-    try:
-        start = time.perf_counter()
-        results = get_training_data(directory, 5, 10, 10000)
-        end = time.perf_counter()
+        except os.error as err:
+            print(f"Process didn't complete normally due to: {err}")

-    except os.error as err:
-        print(f"Process didn't complete normally due to: {err}")
+        else:
+            for count, record in enumerate(results.items()):
+                print(record[0][:40] + "...", record[1])
+                if count == 500:
+                    break

-    else:
-        get_freq = lambda item: item[1].frequency
-        items_by_freq = sorted(results.items(), key = get_freq, reverse=True)
+            print(f"Process ended after {end-start} seconds.")

-        for word, wordinfo in items_by_freq:
-            print(word, wordinfo.frequency)
+    test_test()

-    print(f"Process ended after {end-start} seconds.")
+

 if __name__ == "__main__":
     main()