Skip to content

Commit 0a29b99

Browse files
committed
More scrapy
1 parent 7db6393 commit 0a29b99

File tree

5 files changed

+8
-1
lines changed

5 files changed

+8
-1
lines changed

v2/Chapter05_Scrapy/wikiSpider/wikiSpider/article.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -11,5 +11,8 @@ def start_requests(self):
1111
return [scrapy.Request(url=url, callback=self.parse) for url in urls]
1212

1313
def parse(self, response):
14+
url = response.url
1415
title = response.css('h1::text').extract_first()
16+
print('URL is: {}'.format(url))
1517
print('Title is: {}'.format(title))
18+

v2/Chapter05_Scrapy/wikiSpider/wikiSpider/articleItems.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@ class ArticleSpider(CrawlSpider):
1212

1313
def parse_items(self, response):
1414
article = Article()
15+
article['url'] = response.url
1516
article['title'] = response.css('h1::text').extract_first()
1617
article['text'] = response.xpath('//div[@id="mw-content-text"]//text()').extract()
1718
lastUpdated = response.css('li#footer-info-lastmod::text').extract_first()

v2/Chapter05_Scrapy/wikiSpider/wikiSpider/articles.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -8,10 +8,12 @@ class ArticleSpider(CrawlSpider):
88
rules = [Rule(LinkExtractor(allow=r'.*'), callback='parse_items', follow=True)]
99

1010
def parse_items(self, response):
11+
url = response.url
1112
title = response.css('h1::text').extract_first()
1213
text = response.xpath('//div[@id="mw-content-text"]//text()').extract()
1314
lastUpdated = response.css('li#footer-info-lastmod::text').extract_first()
1415
lastUpdated = lastUpdated.replace('This page was last edited on ', '')
16+
print('URL is: {}'.format(url))
1517
print('title is: {} '.format(title))
1618
print('text is: {}'.format(text))
1719
print('Last updated: {}'.format(lastUpdated))

v2/Chapter05_Scrapy/wikiSpider/wikiSpider/articlesMoreRules.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,7 @@ def parse_items(self, response, is_article):
1414
print(response.url)
1515
title = response.css('h1::text').extract_first()
1616
if is_article:
17+
url = response.url
1718
text = response.xpath('//div[@id="mw-content-text"]//text()').extract()
1819
lastUpdated = response.css('li#footer-info-lastmod::text').extract_first()
1920
lastUpdated = lastUpdated.replace('This page was last edited on ', '')

v2/Chapter05_Scrapy/wikiSpider/wikiSpider/items.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99

1010

1111
class Article(scrapy.Item):
12+
url = scrapy.Field()
1213
title = scrapy.Field()
1314
text = scrapy.Field()
1415
lastUpdated = scrapy.Field()
15-
pass

0 commit comments

Comments
 (0)