Skip to content

Commit fc97386

Browse files
DAFT-15: Allow an initial offset (TheJokersThief#17)
1 parent 1fae567 commit fc97386

File tree

4 files changed

+20
-16
lines changed

4 files changed

+20
-16
lines changed

daft_scraper/listing.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import re
33
from marshmallow import Schema, fields, INCLUDE, post_load
44
from marshmallow.utils import missing
5+
from urllib.parse import urljoin
56

67
from daft_scraper import Daft
78

@@ -98,7 +99,7 @@ def convert_bed_and_bath(self, value):
9899
return missing
99100

100101
def get_url(self, seo_friendly_path):
101-
return "".join([self.URL_BASE, seo_friendly_path])
102+
return urljoin(self.URL_BASE, seo_friendly_path)
102103

103104
@post_load
104105
def post_load(self, data, **kwargs):

daft_scraper/search/__init__.py

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,17 @@
22
import sys
33
from bs4 import BeautifulSoup
44
from enum import Enum
5-
from typing import List
5+
from typing import List, Callable
66

77
from daft_scraper import Daft
88
from daft_scraper.listing import Listing, ListingSchema
99
from daft_scraper.search.options import Option, PriceOption, SalePriceOption
1010

1111

12+
def empty_post_process_hook(listing: Listing, raw_data: dict) -> Listing:
13+
return listing
14+
15+
1216
class SearchType(Enum):
1317
RENT = "property-for-rent"
1418
SALE = "property-for-sale"
@@ -22,16 +26,16 @@ class DaftSearch():
2226
PAGE_SIZE = 20
2327
SALE_TYPES = [SearchType.SALE, SearchType.NEW_HOMES, SearchType.COMMERCIAL_SALE]
2428

25-
def __init__(self, search_type: SearchType):
29+
def __init__(self, search_type: SearchType, post_process_hook: Callable = empty_post_process_hook):
2630
self.search_type = search_type
31+
self.post_process_hook = post_process_hook
2732
self.site = Daft()
2833

29-
def search(self, query: List[Option], max_pages: int = sys.maxsize):
34+
def search(self, query: List[Option], max_pages: int = sys.maxsize, page_offset: int = 0):
3035
path = self._build_search_path()
3136

3237
# Convert options to their string form
3338
options = self._translate_options(query)
34-
listings = []
3539

3640
# If only one location is specified, it should be in the URL, not the params
3741
locations = options.get('location', [])
@@ -41,7 +45,7 @@ def search(self, query: List[Option], max_pages: int = sys.maxsize):
4145

4246
# Init pagination params
4347
options['pageSize'] = self.PAGE_SIZE
44-
options['from'] = 0
48+
options['from'] = self._calc_offset(page_offset)
4549

4650
# Fetch the first page and get pagination info
4751
page_data = self._get_page_data(path, options)
@@ -50,14 +54,12 @@ def search(self, query: List[Option], max_pages: int = sys.maxsize):
5054

5155
while current_page < min(totalPages, max_pages):
5256
listing_data = page_data['props']['pageProps']['listings']
53-
listings.extend(self._get_listings(listing_data))
57+
yield from self._get_listings(listing_data)
5458

5559
options['from'] = self._calc_offset(current_page)
5660
page_data = self._get_page_data(path, options)
5761
current_page = page_data['props']['pageProps']['paging']['currentPage']
5862

59-
return listings
60-
6163
def _build_search_path(self):
6264
"""Build the URL path for searches"""
6365
return "/".join([self.search_type.value, "ireland"])
@@ -85,10 +87,11 @@ def _get_page_data(self, path, params):
8587

8688
def _get_listings(self, listings: dict):
8789
"""Convert a dict of listings into marshalled objects"""
88-
return [
89-
Listing(ListingSchema().load(listing['listing']))
90-
for listing in listings
91-
]
90+
for listing in listings:
91+
yield self.post_process_hook(
92+
Listing(ListingSchema().load(listing['listing'])),
93+
listing
94+
)
9295

9396
def _calc_offset(self, current_page: int):
9497
"""Calculate the offset for pagination"""

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "daft-scraper"
3-
version = "1.2.7"
3+
version = "1.3.0"
44
description = "A webscraper for Daft.ie"
55
authors = ["Evan Smith <[email protected]>"]
66
license = "MIT"

tests/test_search.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ def test_search(self, *args):
3131
BedOption(1, 4),
3232
]
3333

34-
got = self.api.search(options)
34+
got = list(self.api.search(options))
3535
self.assertEqual(got[0].id, 1443907)
3636

3737
def test__translate_options(self):
@@ -83,7 +83,7 @@ def test__get_listings(self, *args):
8383
path = self.api._build_search_path()
8484
page_data = self.api._get_page_data(path, params={})
8585

86-
got = self.api._get_listings(page_data['props']['pageProps']['listings'])
86+
got = list(self.api._get_listings(page_data['props']['pageProps']['listings']))
8787
self.assertEqual(got[0].id, 1443907)
8888

8989
def test__calc_offset(self):

0 commit comments

Comments
 (0)