@@ -24,6 +24,9 @@ def describe(node):
2424 return "%s#%s.%s" % (
2525 node .name , node .get ('id' , '' ), node .get ('class' ,'' ))
2626
27+ def _text (node ):
28+ return " " .join (node .findAll (text = True ))
29+
2730class Document :
2831 TEXT_LENGTH_THRESHOLD = 25
2932 RETRY_LENGTH = 250
@@ -63,6 +66,7 @@ def summary(self):
6366 else :
6467 if ruthless :
6568 ruthless = False
69+ self .debug ("ended up stripping too much - going for a safer parse" )
6670 # try again
6771 continue
6872 else :
@@ -125,7 +129,7 @@ def select_best_candidate(self, candidates):
125129
def get_link_density(self, elem):
    """Return the share of ``elem``'s text that sits inside ``<a>`` descendants.

    Both lengths are measured with the module-level ``_text`` helper so the
    numerator and the denominator count characters the same way; reading
    ``.text`` on the anchors would measure them by a different mechanism
    than the one used for the element itself.

    :param elem: node exposing ``findAll`` (presumably a BeautifulSoup tag;
        confirm against callers).
    :returns: ``link_length / text_length`` as a float, with the divisor
        clamped to at least 1.
    """
    link_length = sum(len(_text(anchor)) for anchor in elem.findAll("a"))
    text_length = len(_text(elem))
    # max(..., 1) keeps an element with no text from dividing by zero.
    return float(link_length) / max(text_length, 1)
130134
131135 def score_paragraphs (self , min_text_length ):
@@ -138,7 +142,7 @@ def score_paragraphs(self, min_text_length):
138142 parent_key = HashableElement (parent_node )
139143 grand_parent_key = HashableElement (grand_parent_node )
140144
141- inner_text = elem . string
145+ inner_text = _text ( elem )
142146
143147 # If this paragraph is less than 25 characters, don't even count it.
144148 if (not inner_text ) or len (inner_text ) < min_text_length :
@@ -160,7 +164,8 @@ def score_paragraphs(self, min_text_length):
160164 # Scale the final candidates score based on link density. Good content should have a
161165 # relatively small link density (5% or less) and be mostly unaffected by this operation.
162166 for elem , candidate in candidates .items ():
163- candidate ['content_score' ] = candidate ['content_score' ] * (1 - self .get_link_density (elem ))
167+ candidate ['content_score' ] *= (1 - self .get_link_density (elem ))
168+ self .debug ("candidate %s scored %s" % (describe (elem ), candidate ['content_score' ]))
164169
165170 return candidates
166171
@@ -201,7 +206,7 @@ def debug(self, *a):
201206
202207 def remove_unlikely_candidates (self ):
203208 for elem in self .html .findAll ():
204- s = "%s%s" % (elem .get ('class' , '' ), elem .get ('id' ))
209+ s = "%s%s" % (elem .get ('class' , '' ), elem .get ('id' , '' ))
205210 if REGEXES ['unlikelyCandidatesRe' ].search (s ) and (not REGEXES ['okMaybeItsACandidateRe' ].search (s )) and elem .name != 'body' :
206211 self .debug ("Removing unlikely candidate - %s" % (s ,))
207212 elem .extract ()
@@ -245,13 +250,13 @@ def sanitize(self, node, candidates):
245250 el .extract ()
246251 self .debug ("Conditionally cleaned %s with weight %s and content score %s because score + content score was less than zero." %
247252 (describe (el ), weight , content_score ))
248- elif len ((el . text or "" ).split ("," )) < 10 :
253+ elif len (_text (el ).split ("," )) < 10 :
249254 counts = {}
250255 for kind in ['p' , 'img' , 'li' , 'a' , 'embed' , 'input' ]:
251256 counts [kind ] = len (el .findAll (kind ))
252257 counts ["li" ] -= 100
253258
254- content_length = len (el . text or "" ) # Count the text length excluding any surrounding whitespace
259+ content_length = len (_text ( el ) ) # Count the text length excluding any surrounding whitespace
255260 link_density = self .get_link_density (el )
256261 to_remove = False
257262 reason = ""
0 commit comments