Skip to content

Commit 4b11501

Browse files
committed
Merge pull request scrapy#963 from tpeng/fix-xmliter-lxml
[MRG+1] support namespace prefix in xmliter_lxml
2 parents 9706119 + 82d138e commit 4b11501

File tree

2 files changed: 35 additions and 3 deletions.

scrapy/contrib_exp/iterators.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2,18 +2,18 @@
22
from scrapy.selector import Selector
33

44

5-
def xmliter_lxml(obj, nodename, namespace=None):
5+
def xmliter_lxml(obj, nodename, namespace=None, prefix='x'):
66
from lxml import etree
77
reader = _StreamReader(obj)
88
tag = '{%s}%s' % (namespace, nodename) if namespace else nodename
99
iterable = etree.iterparse(reader, tag=tag, encoding=reader.encoding)
10-
selxpath = '//' + ('x:%s' % nodename if namespace else nodename)
10+
selxpath = '//' + ('%s:%s' % (prefix, nodename) if namespace else nodename)
1111
for _, node in iterable:
1212
nodetext = etree.tostring(node)
1313
node.clear()
1414
xs = Selector(text=nodetext, type='xml')
1515
if namespace:
16-
xs.register_namespace('x', namespace)
16+
xs.register_namespace(prefix, namespace)
1717
yield xs.xpath(selxpath)[0]
1818

1919

tests/test_utils_iterators.py

Lines changed: 32 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -124,6 +124,38 @@ def test_xmliter_iterate_namespace(self):
124124
node = next(namespace_iter)
125125
self.assertEqual(node.xpath('text()').extract(), ['http://www.mydummycompany.com/images/item2.jpg'])
126126

127+
def test_xmliter_namespaces_prefix(self):
128+
body = """\
129+
<?xml version="1.0" encoding="UTF-8"?>
130+
<root>
131+
<h:table xmlns:h="http://www.w3.org/TR/html4/">
132+
<h:tr>
133+
<h:td>Apples</h:td>
134+
<h:td>Bananas</h:td>
135+
</h:tr>
136+
</h:table>
137+
138+
<f:table xmlns:f="http://www.w3schools.com/furniture">
139+
<f:name>African Coffee Table</f:name>
140+
<f:width>80</f:width>
141+
<f:length>120</f:length>
142+
</f:table>
143+
144+
</root>
145+
"""
146+
response = XmlResponse(url='http://mydummycompany.com', body=body)
147+
my_iter = self.xmliter(response, 'table', 'http://www.w3.org/TR/html4/', 'h')
148+
149+
node = next(my_iter)
150+
self.assertEqual(len(node.xpath('h:tr/h:td').extract()), 2)
151+
self.assertEqual(node.xpath('h:tr/h:td[1]/text()').extract(), ['Apples'])
152+
self.assertEqual(node.xpath('h:tr/h:td[2]/text()').extract(), ['Bananas'])
153+
154+
my_iter = self.xmliter(response, 'table', 'http://www.w3schools.com/furniture', 'f')
155+
156+
node = next(my_iter)
157+
self.assertEqual(node.xpath('f:name/text()').extract(), ['African Coffee Table'])
158+
127159

128160
class UtilsCsvTestCase(unittest.TestCase):
129161
sample_feeds_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'sample_data', 'feeds')

0 commit comments

Comments (0)