Skip to content

Commit fb9cc07

Browse files
committed
Added pipelines
1 parent 0a29b99 commit fb9cc07

File tree

2 files changed

+31
-9
lines changed

2 files changed

+31
-9
lines changed
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# NOTE: `scrapy.contrib.*` was deprecated in Scrapy 1.0 and removed in 1.6;
# the modern import paths are scrapy.linkextractors / scrapy.spiders.
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from wikiSpider.items import Article


class ArticleSpider(CrawlSpider):
    """Crawl Wikipedia starting from one seed page and yield Article items.

    Follows only article links (``/wiki/...`` paths containing no colon,
    which excludes namespaced pages such as ``File:`` or ``Talk:``) and
    extracts url, title, raw text fragments, and the last-edited footer
    line for each page visited.
    """

    name = 'articlePipelines'
    allowed_domains = ['wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life']
    rules = [
        # '(/wiki/)((?!:).)*$' — any /wiki/ path with no ':' after it.
        Rule(LinkExtractor(allow='(/wiki/)((?!:).)*$'),
             callback='parse_items', follow=True),
    ]

    def parse_items(self, response):
        """Build an Article item from a single article page.

        :param response: the scrapy Response for the crawled page.
        :returns: an Article with url, title, text (list of raw text
            fragments), and lastUpdated (footer string, or None if the
            selector matched nothing).
        """
        article = Article()
        article['url'] = response.url
        article['title'] = response.css('h1::text').extract_first()
        article['text'] = response.xpath(
            '//div[@id="mw-content-text"]//text()').extract()
        article['lastUpdated'] = response.css(
            'li#footer-info-lastmod::text').extract_first()
        return article
Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
1-
# -*- coding: utf-8 -*-
2-
3-
# Define your item pipelines here
4-
#
5-
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
6-
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7-
1+
from datetime import datetime
2+
from wikiSpider.items import Article
3+
from string import whitespace
84

95
class WikispiderPipeline(object):
    """Post-process scraped Article items.

    Parses the Wikipedia footer string in ``lastUpdated`` into a
    ``datetime`` and collapses the list of raw text fragments in
    ``text`` into one string.
    """

    def process_item(self, article, spider):
        """Normalize an Article item in place and return it.

        :param article: item with ``lastUpdated`` (footer string such as
            'This page was last edited on 26 January 2018, at 03:56.',
            or None) and ``text`` (list of text fragments).
        :param spider: the spider that produced the item (unused).
        :returns: the same item with ``lastUpdated`` as a datetime and
            ``text`` as a single joined string.
        """
        date_str = article['lastUpdated']
        # extract_first() can yield None when the footer selector matches
        # nothing; skip parsing rather than crash on .replace/.strip.
        if date_str is not None:
            date_str = date_str.replace(
                'This page was last edited on', '').strip()
            article['lastUpdated'] = datetime.strptime(
                date_str, '%d %B %Y, at %H:%M.')
        # Drop whitespace-only fragments. The original test
        # `line not in whitespace` was a *substring* check against
        # string.whitespace, which kept runs like '  ' or '\n\n';
        # str.strip() correctly identifies any all-whitespace fragment.
        article['text'] = ''.join(
            line for line in article['text'] if line.strip())
        return article

0 commit comments

Comments
 (0)