Skip to content

Commit 49b8515

Browse files
Ryan Mitchell
authored and committed
Put ngrams in ordered dict
1 parent 1f93fba commit 49b8515

File tree

1 file changed

+13
-4
lines changed

1 file changed

+13
-4
lines changed

chapter7/2-clean2grams.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from bs4 import BeautifulSoup
33
import re
44
import string
5+
from collections import OrderedDict
56

67
def cleanInput(input):
78
input = re.sub('\n+', " ", input)
@@ -19,14 +20,22 @@ def cleanInput(input):
1920

2021
def getNgrams(input, n):
2122
input = cleanInput(input)
22-
output = []
23+
output = dict()
2324
for i in range(len(input)-n+1):
24-
output.append(input[i:i+n])
25+
newNGram = " ".join(input[i:i+n])
26+
if newNGram in output:
27+
output[newNGram] += 1
28+
else:
29+
output[newNGram] = 1
2530
return output
2631

2732
html = urlopen("http://en.wikipedia.org/wiki/Python_(programming_language)")
2833
bsObj = BeautifulSoup(html)
2934
content = bsObj.find("div", {"id":"mw-content-text"}).get_text()
35+
#ngrams = getNgrams(content, 2)
36+
#print(ngrams)
37+
#print("2-grams count is: "+str(len(ngrams)))
38+
3039
ngrams = getNgrams(content, 2)
31-
print(ngrams)
32-
print("2-grams count is: "+str(len(ngrams)))
40+
ngrams = OrderedDict(sorted(ngrams.items(), key=lambda t: t[1], reverse=True))
41+
print(ngrams)

0 commit comments

Comments
 (0)