1+ # -*- coding: utf-8 -*-
2+ """
3+ Created on Sun Sep 04 15:24:26 2016
4+
5+ @author: DIP
6+ """
7+
8+ from normalization import normalize_corpus , parse_document
9+ from utils import build_feature_matrix , low_rank_svd
10+ import numpy as np
11+
12+
# Short encyclopedic passage about elephants, used as the small demo corpus
# for the summarization walkthroughs in this script.  The text is assembled
# line by line; the empty first and last entries make the joined string begin
# and end with a newline.
toy_text = "\n".join((
    "",
    "Elephants are large mammals of the family Elephantidae",
    "and the order Proboscidea. Two species are traditionally recognised,",
    "the African elephant and the Asian elephant. Elephants are scattered",
    "throughout sub-Saharan Africa, South Asia, and Southeast Asia. Male",
    "African elephants are the largest extant terrestrial animals. All",
    "elephants have a long trunk used for many purposes,",
    "particularly breathing, lifting water and grasping objects. Their",
    "incisors grow into tusks, which can serve as weapons and as tools",
    "for moving objects and digging. Elephants' large ear flaps help",
    "to control their body temperature. Their pillar-like legs can",
    "carry their great weight. African elephants have larger ears",
    "and concave backs while Asian elephants have smaller ears",
    "and convex or level backs.",
    "",
))
28+
29+
30+ from gensim .summarization import summarize , keywords
31+
def text_summarization_gensim(text, summary_ratio=0.5):
    """Print a gensim-generated summary of ``text``, one sentence per line.

    Parameters
    ----------
    text : str
        The document to summarize, passed straight to gensim's ``summarize``.
    summary_ratio : float, optional
        Forwarded as ``ratio`` to ``summarize`` (0.5 by default).

    Returns
    -------
    None
        The summary is written to standard output only.
    """
    # ``split=True`` is requested so the result can be iterated sentence by
    # sentence instead of being handled as one block of text.
    for picked_sentence in summarize(text, split=True, ratio=summary_ratio):
        print(picked_sentence)
37+
# ---------------------------------------------------------------------------
# Demo 1: summarize the toy document with gensim.
# ---------------------------------------------------------------------------
docs = parse_document(toy_text)
text = ' '.join(docs)
text_summarization_gensim(text, summary_ratio=0.4)

# ---------------------------------------------------------------------------
# Demo 2: step-by-step LSA summarization of the toy document.
# ---------------------------------------------------------------------------
sentences = parse_document(toy_text)
norm_sentences = normalize_corpus(sentences, lemmatize=False)

total_sentences = len(norm_sentences)
print('%s %s' % ('Total Sentences in Document:', total_sentences))

# Size of the summary and number of singular values requested from the SVD.
num_sentences = 3
num_topics = 2

# NOTE(review): the matrix is built from the raw ``sentences`` even though
# ``norm_sentences`` was computed just above -- confirm this is intended.
vec, dt_matrix = build_feature_matrix(sentences, feature_type='frequency')

# Switch to a term-by-sentence layout and zero out any non-positive entries.
td_matrix = dt_matrix.transpose()
td_matrix = td_matrix.multiply(td_matrix > 0)

u, s, vt = low_rank_svd(td_matrix, singular_count=num_topics)

# Discard singular values smaller than half of the largest one.
sv_threshold = 0.5
min_sigma_value = sv_threshold * max(s)
s[s < min_sigma_value] = 0

# Per-sentence salience: sqrt of the sigma^2-weighted sum of squared vt entries.
salience_scores = np.sqrt(np.square(s).dot(np.square(vt)))
print(np.round(salience_scores, 2))

# Take the highest-scoring sentences, then restore document order.
top_sentence_indices = np.sort(salience_scores.argsort()[-num_sentences:])
print(top_sentence_indices)

for index in top_sentence_indices:
    print(sentences[index])
77+
def lsa_text_summarizer(documents, num_sentences=2,
                        num_topics=2, feature_type='frequency',
                        sv_threshold=0.5, source_sentences=None):
    """Print an extractive summary of ``documents`` chosen by LSA salience.

    A feature matrix is built from ``documents``, transposed, decomposed with
    ``low_rank_svd``, and every sentence receives the score
    ``sqrt(sum_k s[k]**2 * vt[k, j]**2)``.  The ``num_sentences`` best-scoring
    sentences are printed in their original document order.

    Parameters
    ----------
    documents : sequence of str
        Sentences used to build the feature matrix (typically normalized).
    num_sentences : int, optional
        Number of sentences to print.  Values <= 0 print nothing.
    num_topics : int, optional
        Forwarded to ``low_rank_svd`` as ``singular_count``.
    feature_type : str, optional
        Forwarded to ``build_feature_matrix``.
    sv_threshold : float, optional
        Singular values below ``sv_threshold * max(s)`` are zeroed before
        scoring.
    source_sentences : sequence of str, optional
        Sentences to print, aligned index-for-index with ``documents``
        (e.g. the un-normalized originals).  When omitted, the module-level
        ``sentences`` list is used if it exists, which is what earlier
        versions of this function always did; otherwise ``documents`` itself
        is printed.  Pass it explicitly to avoid depending on module state.

    Returns
    -------
    None
        The summary is written to standard output only.
    """
    if num_sentences <= 0:
        # ``array[-0:]`` is the whole array, so without this guard a request
        # for zero sentences would print every sentence.
        return

    if source_sentences is None:
        try:
            # Legacy behaviour: print from the module-level corpus.
            source_sentences = sentences
        except NameError:
            source_sentences = documents

    _, dt_matrix = build_feature_matrix(documents,
                                        feature_type=feature_type)

    # Term-by-sentence layout; multiplying by the positivity mask zeroes any
    # non-positive entries.
    td_matrix = dt_matrix.transpose()
    td_matrix = td_matrix.multiply(td_matrix > 0)

    _, s, vt = low_rank_svd(td_matrix, singular_count=num_topics)
    min_sigma_value = max(s) * sv_threshold
    s[s < min_sigma_value] = 0

    salience_scores = np.sqrt(np.dot(np.square(s), np.square(vt)))

    # Best-scoring sentences, re-sorted into document order for readability.
    top_sentence_indices = np.sort(salience_scores.argsort()[-num_sentences:])

    for index in top_sentence_indices:
        print(source_sentences[index])
98+
99+
100+
101+
102+ import networkx
103+
# ---------------------------------------------------------------------------
# Demo 3: step-by-step TextRank summarization of the toy document.
# ---------------------------------------------------------------------------
num_sentences = 3

vec, dt_matrix = build_feature_matrix(norm_sentences, feature_type='tfidf')

# Sentence-to-sentence similarity: product of the matrix with its transpose.
similarity_matrix = dt_matrix * dt_matrix.T
print(np.round(similarity_matrix.todense(), 2))

# Build a graph from the similarity matrix and draw it.
similarity_graph = networkx.from_scipy_sparse_matrix(similarity_matrix)
networkx.draw_networkx(similarity_graph)

# PageRank score per graph node, ordered best first as (score, node) pairs.
scores = networkx.pagerank(similarity_graph)
ranked_sentences = sorted(((node_score, node)
                           for node, node_score in scores.items()),
                          reverse=True)

# Keep the best ``num_sentences`` nodes and put them back in document order.
top_sentence_indices = sorted(ranked_sentences[rank][1]
                              for rank in range(num_sentences))
print(top_sentence_indices)

for index in top_sentence_indices:
    print(sentences[index])
129+
130+
def textrank_text_summarizer(documents, num_sentences=2,
                             feature_type='frequency',
                             source_sentences=None):
    """Print an extractive summary of ``documents`` chosen by TextRank.

    A feature matrix is built from ``documents``; its product with its own
    transpose is used as a sentence-similarity matrix, which is turned into a
    graph and scored with ``networkx.pagerank``.  The ``num_sentences``
    best-scoring sentences are printed in their original document order.

    Earlier versions ignored both ``documents`` and ``feature_type`` (they
    always read the module-level ``norm_sentences`` with ``'tfidf'``
    features); both arguments are now honoured.

    Parameters
    ----------
    documents : sequence of str
        Sentences used to build the feature matrix (typically normalized).
    num_sentences : int, optional
        Number of sentences to print.  Values <= 0 print nothing; values
        larger than the number of sentences print all of them.
    feature_type : str, optional
        Forwarded to ``build_feature_matrix``.
    source_sentences : sequence of str, optional
        Sentences to print, aligned index-for-index with ``documents``
        (e.g. the un-normalized originals).  When omitted, the module-level
        ``sentences`` list is used if it exists, which is what earlier
        versions of this function always did; otherwise ``documents`` itself
        is printed.  Pass it explicitly to avoid depending on module state.

    Returns
    -------
    None
        The summary is written to standard output only.
    """
    if num_sentences <= 0:
        # Nothing to print, so skip the feature extraction and ranking.
        return

    if source_sentences is None:
        try:
            # Legacy behaviour: print from the module-level corpus.
            source_sentences = sentences
        except NameError:
            source_sentences = documents

    _, dt_matrix = build_feature_matrix(documents,
                                        feature_type=feature_type)
    similarity_matrix = dt_matrix * dt_matrix.T

    # NOTE(review): ``from_scipy_sparse_matrix`` is, to my knowledge, absent
    # from networkx >= 3.0 -- confirm the pinned networkx version.
    similarity_graph = networkx.from_scipy_sparse_matrix(similarity_matrix)
    scores = networkx.pagerank(similarity_graph)

    # (score, node) pairs, best first.
    ranked_sentences = sorted(((score, index)
                               for index, score in scores.items()),
                              reverse=True)

    # Slicing (rather than indexing by position) cannot raise IndexError when
    # fewer than ``num_sentences`` sentences are available.
    top_sentence_indices = sorted(index for _, index
                                  in ranked_sentences[:num_sentences])

    for index in top_sentence_indices:
        print(source_sentences[index])
152+
153+
# Longer sample document (a description of the video game Skyrim) used to
# exercise the summarizers defined in this script.
# The phrase "discovered locations." originally sat after "appearance.",
# which fused two sentences and left "warp to previously" dangling; it is
# restored to the end of the fast-travel sentence here.
DOCUMENT = """
The Elder Scrolls V: Skyrim is an open world action role-playing video game
developed by Bethesda Game Studios and published by Bethesda Softworks.
It is the fifth installment in The Elder Scrolls series, following
The Elder Scrolls IV: Oblivion. Skyrim's main story revolves around
the player character and their effort to defeat Alduin the World-Eater,
a dragon who is prophesied to destroy the world.
The game is set two hundred years after the events of Oblivion
and takes place in the fictional province of Skyrim. The player completes quests
and develops the character by improving skills.
Skyrim continues the open world tradition of its predecessors by allowing the
player to travel anywhere in the game world at any time, and to
ignore or postpone the main storyline indefinitely. The player may freely roam
over the land of Skyrim, which is an open world environment consisting
of wilderness expanses, dungeons, cities, towns, fortresses and villages.
Players may navigate the game world more quickly by riding horses,
or by utilizing a fast-travel system which allows them to warp to previously
discovered locations.
Players have the option to develop their character. At the beginning of the game,
players create their character by selecting one of several races,
including humans, orcs, elves and anthropomorphic cat or lizard-like creatures,
and then customizing their character's appearance. Over the
course of the game, players improve their character's skills, which are numerical
representations of their ability in certain areas. There are eighteen skills
divided evenly among the three schools of combat, magic, and stealth.
Skyrim is the first entry in The Elder Scrolls to include Dragons in the game's
wilderness. Like other creatures, Dragons are generated randomly in the world
and will engage in combat.
"""
182+
183+
# ---------------------------------------------------------------------------
# Demo 4: run both custom summarizers on the longer DOCUMENT.
# Rebinding ``sentences`` / ``norm_sentences`` here matters: the summarizer
# functions above read the module-level ``sentences`` when printing.
# ---------------------------------------------------------------------------
sentences = parse_document(DOCUMENT)
norm_sentences = normalize_corpus(sentences, lemmatize=True)
print('%s %s' % ("Total Sentences:", len(norm_sentences)))

# LSA-based summary: three sentences, five singular values requested.
lsa_text_summarizer(
    norm_sentences,
    num_sentences=3,
    num_topics=5,
    feature_type='frequency',
    sv_threshold=0.5
)

# TextRank-based summary: three sentences.
textrank_text_summarizer(
    norm_sentences,
    num_sentences=3,
    feature_type='tfidf'
)
0 commit comments