Skip to content

Commit 7db6393

Browse files
committed
More Scrapy stuff
1 parent 1f4e5c1 commit 7db6393

File tree

5 files changed

+77
-15
lines changed

5 files changed

+77
-15
lines changed
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
import scrapy


class ArticleSpider(scrapy.Spider):
    """Fetch a fixed list of Wikipedia pages and print each page's title."""

    name = 'article'

    def start_requests(self):
        """Seed the crawl: one Request per hard-coded Wikipedia URL,
        all routed to :meth:`parse`."""
        urls = [
            "http://en.wikipedia.org/wiki/Python_%28programming_language%29",
            "https://en.wikipedia.org/wiki/Functional_programming",
            "https://en.wikipedia.org/wiki/Monty_Python",
        ]
        return [scrapy.Request(url=url, callback=self.parse) for url in urls]

    def parse(self, response):
        """Print the first <h1> text of the fetched page."""
        title = response.css('h1::text').extract_first()
        print('Title is: {}'.format(title))
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# scrapy.contrib was deprecated in Scrapy 1.0 and later removed; the
# canonical import paths are scrapy.linkextractors and scrapy.spiders.
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from wikiSpider.items import Article


class ArticleSpider(CrawlSpider):
    """Crawl Wikipedia article pages and yield populated Article items."""

    name = 'articleItems'
    allowed_domains = ['wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life']
    # Follow only /wiki/ links containing no colon, which skips namespaced
    # pages such as Special:, File:, Talk: etc.
    rules = [
        Rule(LinkExtractor(allow='(/wiki/)((?!:).)*$'),
             callback='parse_items', follow=True),
    ]

    def parse_items(self, response):
        """Build an Article item from the page title, body text and the
        footer's last-modified line."""
        article = Article()
        article['title'] = response.css('h1::text').extract_first()
        article['text'] = response.xpath(
            '//div[@id="mw-content-text"]//text()').extract()
        lastUpdated = response.css(
            'li#footer-info-lastmod::text').extract_first()
        # extract_first() returns None when the selector matches nothing;
        # guard so a missing footer doesn't raise AttributeError.
        if lastUpdated is not None:
            lastUpdated = lastUpdated.replace(
                'This page was last edited on ', '')
        article['lastUpdated'] = lastUpdated
        return article
Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,17 @@
1-
# scrapy.contrib was deprecated in Scrapy 1.0 and later removed; the
# canonical import paths are scrapy.linkextractors and scrapy.spiders.
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ArticleSpider(CrawlSpider):
    """Crawl outward from one Wikipedia page, printing title, body text
    and last-modified date for every page reached."""

    name = 'articles'
    allowed_domains = ['wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life']
    # Follow every extracted link (bounded only by allowed_domains).
    rules = [Rule(LinkExtractor(allow=r'.*'),
                  callback='parse_items', follow=True)]

    def parse_items(self, response):
        """Print the page title, the article body text and the footer's
        last-modified line."""
        title = response.css('h1::text').extract_first()
        text = response.xpath('//div[@id="mw-content-text"]//text()').extract()
        lastUpdated = response.css(
            'li#footer-info-lastmod::text').extract_first()
        # extract_first() returns None when the selector matches nothing;
        # guard so a missing footer doesn't raise AttributeError.
        if lastUpdated is not None:
            lastUpdated = lastUpdated.replace(
                'This page was last edited on ', '')
        print('title is: {} '.format(title))
        print('text is: {}'.format(text))
        print('Last updated: {}'.format(lastUpdated))
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# scrapy.contrib was deprecated in Scrapy 1.0 and later removed; the
# canonical import paths are scrapy.linkextractors and scrapy.spiders.
# (cb_kwargs on Rule also requires Scrapy >= 1.7.)
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ArticleSpider(CrawlSpider):
    """Crawl Wikipedia, tagging each page as article / non-article via
    per-rule cb_kwargs, and print details only for articles."""

    name = 'articles'
    allowed_domains = ['wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life']
    rules = [
        # Article pages: /wiki/ paths with no colon (no namespace prefix).
        Rule(LinkExtractor(allow='^(/wiki/)((?!:).)*$'),
             callback='parse_items', follow=True,
             cb_kwargs={'is_article': True}),
        # Everything else: parse once, but do not follow further.
        Rule(LinkExtractor(allow='.*'),
             callback='parse_items',
             cb_kwargs={'is_article': False}),
    ]

    def parse_items(self, response, is_article):
        """Print title/text/last-updated for article pages; a short notice
        for everything else."""
        print(response.url)
        title = response.css('h1::text').extract_first()
        if is_article:
            text = response.xpath(
                '//div[@id="mw-content-text"]//text()').extract()
            lastUpdated = response.css(
                'li#footer-info-lastmod::text').extract_first()
            # extract_first() returns None when the selector matches nothing;
            # guard so a missing footer doesn't raise AttributeError.
            if lastUpdated is not None:
                lastUpdated = lastUpdated.replace(
                    'This page was last edited on ', '')
            # The original printed the title twice ('Title is'/'title is')
            # and never used lastUpdated — clearly a transcription slip.
            # Print the same trio as the sibling 'articles' spider.
            print('title is: {} '.format(title))
            print('text is: {}'.format(text))
            print('Last updated: {}'.format(lastUpdated))
        else:
            print('This is not an article: {}'.format(title))

v2/Chapter05_Scrapy/wikiSpider/wikiSpider/items.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@
88
import scrapy


class Article(scrapy.Item):
    """Item holding the scraped fields of one Wikipedia article."""

    # Page <h1> text.
    title = scrapy.Field()
    # Full list of text nodes from the article body.
    text = scrapy.Field()
    # Footer last-modified line with its boilerplate prefix stripped.
    lastUpdated = scrapy.Field()
    # NOTE: the scaffold's trailing `pass` was removed — it is redundant
    # once the class has a non-empty body.

0 commit comments

Comments
 (0)