Skip to content

Commit be095a5

Browse files
committed
Added the files
1 parent e7e9df5 commit be095a5

File tree

4 files changed

+43
-0
lines changed

4 files changed

+43
-0
lines changed

.extract/cache.db

72 KB
Binary file not shown.

.extract/webscraping.log

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
2014-05-08 18:35:34,513 INFO Downloading http://www.flipkart.com/adidas-printed-men-s-round-neck-t-shirt/p/itmdutywpyg8xgq8?pid=TSHDUTYVXDPGYDKZ&srno=b_1&ref=bf84fb87-616d-4615-a111-8210c8693606

extract.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
import sys
2+
import os
3+
import csv
4+
from urlparse import urlparse
5+
from webscraping import download , xpath
6+
7+
def extract(url):
    """Extract product info from a product page of a supported website.

    The supported sites are listed in ``webpage_xpath.csv``, which is looked
    up in the directory of this module.  Each data row holds a site keyword
    followed by three XPath expressions used for the product name, price and
    image.  The first data row whose keyword occurs in ``url`` is used.

    Args:
        url: Address of the product page to scrape.

    Returns:
        A dict with the keys ``'name'``, ``'price'`` and ``'image'`` holding
        whatever ``xpath.get`` returns for the downloaded page, or
        ``{'name': 'invalid url'}`` when no configured site matches ``url``.
    """
    # Resolved relative to this module so the function can be used inside an app.
    csv_path = os.path.join(os.path.dirname(__file__), 'webpage_xpath.csv')

    # The csv module wants a binary file on Python 2 and a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        handle = open(csv_path, 'r', newline='')
    else:
        handle = open(csv_path, 'rb')
    with handle:  # make sure the file is closed once the rows are read
        rows = list(csv.reader(handle))

    # The bundled CSV starts with a header row (website,xpath1,...); it is
    # not a site entry and must not be matched against the URL.
    for row in rows[1:]:
        # Skip blank or truncated rows, and empty keywords: an empty keyword
        # is a substring of every URL and would match anything.
        if len(row) < 4 or not row[0]:
            continue
        if row[0] not in url:
            continue

        name_xpath, price_xpath, image_xpath = row[1:4]

        # Only build the downloader once we know the site is supported.
        html = download.Download().get(url)
        return {
            'name': xpath.get(html, '%s//text()' % name_xpath),
            'price': xpath.get(html, '%s//text()' % price_xpath),
            'image': xpath.get(html, '%s' % image_xpath),
        }

    return {'name': 'invalid url'}
36+
37+
if __name__ == '__main__':
    # All command-line arguments are concatenated into one URL, so a URL
    # that the shell split into several arguments is put back together.
    url = "".join(sys.argv[1:])
    # Print the result; otherwise running the script shows nothing.
    print(extract(url))

webpage_xpath.csv

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
website,xpath1,xpath2,xpath3
2+
flipkart,"//h1[@itemprop=""name""]","//span[@class=""fk-font-verybig pprice fk-bold""]","//div[@class=""image-wrapper""]/img/@src"
3+
amazon,"//span[@id=""productTitle""]","//span[@id=""priceblock_ourprice""]","//div[@class=""imgTagWrapper""]/img/@src"

0 commit comments

Comments
 (0)