Skip to content

Commit 1d862a0

Browse files
author
gfxmonk
committed
Fixed a bug where only an element's immediate text was considered when computing weights, instead of all of its nested text.
1 parent 0eacd95 commit 1d862a0

File tree

1 file changed

+11
-6
lines changed

1 file changed

+11
-6
lines changed

readability/readability.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@ def describe(node):
2424
return "%s#%s.%s" % (
2525
node.name, node.get('id', ''), node.get('class',''))
2626

27+
def _text(node):
    """Return all text found beneath *node*, joined with single spaces.

    Every string yielded by ``node.findAll(text=True)`` is included, so text
    held by nested descendants counts as well as the node's own direct text.
    An empty result list yields the empty string.

    NOTE(review): presumably *node* is a BeautifulSoup tag -- confirm against
    the callers; only its ``findAll`` method is used here.
    """
    fragments = node.findAll(text=True)
    return " ".join(fragments)
29+
2730
class Document:
2831
TEXT_LENGTH_THRESHOLD = 25
2932
RETRY_LENGTH = 250
@@ -63,6 +66,7 @@ def summary(self):
6366
else:
6467
if ruthless:
6568
ruthless = False
69+
self.debug("ended up stripping too much - going for a safer parse")
6670
# try again
6771
continue
6872
else:
@@ -125,7 +129,7 @@ def select_best_candidate(self, candidates):
125129

126130
def get_link_density(self, elem):
    """Return the ratio of link text length to total text length for *elem*.

    Both lengths are measured with ``_text`` so that text nested inside
    descendants is counted the same way in the numerator (text under each
    ``<a>`` found by ``elem.findAll("a")``) and in the denominator (all text
    under *elem*).

    Returns a float; 0.0 when *elem* contains no links.
    """
    link_length = sum(len(_text(link)) for link in elem.findAll("a"))
    text_length = len(_text(elem))
    # max(..., 1) avoids a division by zero when elem has no text at all.
    return float(link_length) / max(text_length, 1)
130134

131135
def score_paragraphs(self, min_text_length):
@@ -138,7 +142,7 @@ def score_paragraphs(self, min_text_length):
138142
parent_key = HashableElement(parent_node)
139143
grand_parent_key = HashableElement(grand_parent_node)
140144

141-
inner_text = elem.string
145+
inner_text = _text(elem)
142146

143147
# If this paragraph is shorter than min_text_length characters, don't even count it.
144148
if (not inner_text) or len(inner_text) < min_text_length:
@@ -160,7 +164,8 @@ def score_paragraphs(self, min_text_length):
160164
# Scale each final candidate's score based on link density. Good content should have a
161165
# relatively small link density (5% or less) and be mostly unaffected by this operation.
162166
for elem, candidate in candidates.items():
163-
candidate['content_score'] = candidate['content_score'] * (1 - self.get_link_density(elem))
167+
candidate['content_score'] *= (1 - self.get_link_density(elem))
168+
self.debug("candidate %s scored %s" % (describe(elem), candidate['content_score']))
164169

165170
return candidates
166171

@@ -201,7 +206,7 @@ def debug(self, *a):
201206

202207
def remove_unlikely_candidates(self):
203208
for elem in self.html.findAll():
204-
s = "%s%s" % (elem.get('class', ''), elem.get('id'))
209+
s = "%s%s" % (elem.get('class', ''), elem.get('id', ''))
205210
if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.name != 'body':
206211
self.debug("Removing unlikely candidate - %s" % (s,))
207212
elem.extract()
@@ -245,13 +250,13 @@ def sanitize(self, node, candidates):
245250
el.extract()
246251
self.debug("Conditionally cleaned %s with weight %s and content score %s because score + content score was less than zero." %
247252
(describe(el), weight, content_score))
248-
elif len((el.text or "").split(",")) < 10:
253+
elif len(_text(el).split(",")) < 10:
249254
counts = {}
250255
for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
251256
counts[kind] = len(el.findAll(kind))
252257
counts["li"] -= 100
253258

254-
content_length = len(el.text or "") # Count the text length excluding any surrounding whitespace
259+
content_length = len(_text(el)) # Count the text length excluding any surrounding whitespace
255260
link_density = self.get_link_density(el)
256261
to_remove = False
257262
reason = ""

0 commit comments

Comments
 (0)