Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 104 additions & 37 deletions Exercise2/load_imdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,16 +27,14 @@ def get_training_data(dir_path, ignored_count, max_count, sample_size = 5000):
"""
Reads text from the files in a directory and returns a dictionary with the
M most frequent words and their wordinfo, ignoring the N most common ones.

Parameters:
dir_path (str): The path to the directory with all the training data
ignored_count (int): How many of the most common words will be excluded
max_count (int): How many words will be included
sample_size (int): The maximum numbers of files read

Returns:
(dict) A dictionary with [word (str)] : [word_info (WordInfo)] records

Throws:
ValueError: if the ignored and max counts are negative
"""
Expand All @@ -53,21 +51,18 @@ def get_training_data(dir_path, ignored_count, max_count, sample_size = 5000):
';', ':', '?']

print("Loading postive training data...")
_read_reviews(training_data, neg_dir_path, False, sample_size // 2,
_read_training_data(training_data, neg_dir_path, False, sample_size // 2,
ignored_chars)

print("Loading negative training data...")
_read_reviews(training_data, pos_dir_path, True, sample_size // 2,
_read_training_data(training_data, pos_dir_path, True, sample_size // 2,
ignored_chars)

print("Loading succesfull.")

if len(training_data) <= ignored_count:
return training_data

# is this how Im supposed to write code in this cursed language? no, *this* is :sunglasses:
# TODO: remove this comment after getting flexed on by superior code. more :sunglasses:

# sort all the frequencies to find the upper and lower threshold for words
frequencies = [info.frequency for info in training_data.values()]
sorted_frequencies = sorted(frequencies, reverse=True) # biggest to smallest
Expand All @@ -83,18 +78,34 @@ def get_training_data(dir_path, ignored_count, max_count, sample_size = 5000):
check_threshold = lambda freq: lower_threshold <= freq <= upper_threshold
return {k:v for k, v in training_data.items() if check_threshold(v.frequency)}

def get_testing_data(directory_path):
def get_testing_data(dir_path, sample_size = 5000):
    """
    Return a dictionary containing the contents of every data file.

    Parameters:
    dir_path (str): The path to the directory containing ALL the testing data files
    sample_size (int): The maximum number of files read, split evenly between
        the positive and negative directories

    Returns:
    (dict) A dictionary with [review (string)] : [is_positive (bool)] records
    """
    testing_data = dict()
    neg_dir_path = os.path.join(dir_path, "neg")
    pos_dir_path = os.path.join(dir_path, "pos")
    ignored_chars = ['"', "'", '.', ',', '>', '<', '\\', '/', '-', '(', ')',
                     ';', ':', '?']

    # The "neg" directory holds the negative reviews (is_positive=False);
    # the progress message now matches the directory actually being read.
    print("Loading negative testing data...")
    _read_testing_data(testing_data, neg_dir_path, False, sample_size // 2,
                       ignored_chars)

    print("Loading positive testing data...")
    _read_testing_data(testing_data, pos_dir_path, True, sample_size // 2,
                       ignored_chars)

    print("Loading successful.")

    return testing_data

def _read_reviews(wordinfo_dict, dir_path, is_positive, sample_size, ignored_chars):
def _read_training_data(wordinfo_dict, dir_path, is_positive, sample_size, ignored_chars):
"""
Opens all files in a directory and appends [word] : [WordInfo] records in
the provided dictionary.
Expand All @@ -110,7 +121,7 @@ def _read_reviews(wordinfo_dict, dir_path, is_positive, sample_size, ignored_cha
files = os.listdir(dir_path)
ignored_chars_pattern = re.compile("[%s\\d]" % (re.escape("".join(ignored_chars))))
one_tenth_progress = min(len(files), sample_size) // 10

for count, file in enumerate(files):
# check if file limit has been reached
if count == sample_size:
Expand All @@ -128,16 +139,6 @@ def _read_reviews(wordinfo_dict, dir_path, is_positive, sample_size, ignored_cha
clean_contents = ignored_chars_pattern.sub("", contents, 0)
clean_contents = re.sub("\\s+", " ", clean_contents)

# for each text that changes look if everything is ok regarding ignored
# TODO: remove
"""
print(contents)
print()
print(clean_contents)
input()
print()
"""

# update frequencies
for word in clean_contents.split(" "):
wordinfo = wordinfo_dict.setdefault(word, WordInfo(word))
Expand All @@ -149,28 +150,94 @@ def _read_reviews(wordinfo_dict, dir_path, is_positive, sample_size, ignored_cha
if (count + 1) % one_tenth_progress == 0: # `count` is 0-indexed
print("%d%% complete..." % ((count + 1) // one_tenth_progress * 10))

def _read_testing_data(review_dict, dir_path, is_positive, sample_size, ignored_chars):
"""
Opens all files in a directory and appends [review_contents (string)]
: [is_positive (bool)] records in the provided dictionary.

Parameters:
review_dict (dictionary): The dictionary to which the results of the file
scan will be added.
dir_path (str): The directory containing the training data
is_positive (bool): Indicates if the directory contains positive reviews
ignored_chars (list): The characters that will be removed from the text
of each file
"""
files = os.listdir(dir_path)
ignored_chars_pattern = re.compile("[%s\\d]" % (re.escape("".join(ignored_chars))))
one_tenth_progress = min(len(files), sample_size) // 10

for count, file in enumerate(files):
# check if file limit has been reached
if count == sample_size:
break

# read file contents
try:
file_path = os.path.join(dir_path, file)
f_handle = open(file_path, mode='r', encoding= "utf8")
contents = f_handle.read().lower()
finally:
f_handle.close()

# clean contents
clean_contents = ignored_chars_pattern.sub("", contents, 0)
clean_contents = re.sub("\\s+", " ", clean_contents)

#update dict
review_dict[clean_contents] = True if is_positive else False

# show % progress
if (count + 1) % one_tenth_progress == 0: # `count` is 0-indexed
print("%d%% complete..." % ((count + 1) // one_tenth_progress * 10))

def main():
    """
    Manual smoke tests for the loaders; expects an `aclImdb` dataset
    directory next to this file.
    """
    import time

    def test_train():
        # Exercise the training-data loader and print words by frequency.
        path_to_file = os.path.dirname(os.path.abspath(__file__))
        directory = os.path.join(path_to_file, "aclImdb", "train")

        try:
            start = time.perf_counter()
            results = get_training_data(directory, 5, 10, 10000)
            end = time.perf_counter()

        except os.error as err:
            print(f"Process didn't complete normally due to: {err}")

        else:
            get_freq = lambda item: item[1].frequency
            items_by_freq = sorted(results.items(), key = get_freq, reverse=True)

            for word, wordinfo in items_by_freq:
                print(word, wordinfo.frequency)

            print(f"Process ended after {end-start} seconds.")

    def test_test():
        # Exercise the testing-data loader and print a preview of the records.
        path_to_file = os.path.dirname(os.path.abspath(__file__))
        directory = os.path.join(path_to_file, "aclImdb", "test")

        try:
            start = time.perf_counter()
            results = get_testing_data(directory, 5000)
            end = time.perf_counter()

        except os.error as err:
            print(f"Process didn't complete normally due to: {err}")

        else:
            for count, record in enumerate(results.items()):
                print(record[0][:40] + "...", record[1])
                if count == 500:
                    break

            print(f"Process ended after {end-start} seconds.")

    test_test()


if __name__ == "__main__":
main()