22import sys
33from bs4 import BeautifulSoup
44from enum import Enum
5- from typing import List
5+ from typing import List , Callable
66
77from daft_scraper import Daft
88from daft_scraper .listing import Listing , ListingSchema
99from daft_scraper .search .options import Option , PriceOption , SalePriceOption
1010
1111
def empty_post_process_hook(listing: Listing, raw_data: dict) -> Listing:
    """Default post-processing hook: hand the listing back untouched.

    Args:
        listing: The marshalled listing object.
        raw_data: Accepted so the signature matches the hook contract;
            it is not used here.

    Returns:
        The very same ``listing`` object that was passed in.
    """
    unchanged = listing
    return unchanged
14+
15+
1216class SearchType (Enum ):
1317 RENT = "property-for-rent"
1418 SALE = "property-for-sale"
@@ -22,16 +26,16 @@ class DaftSearch():
2226 PAGE_SIZE = 20
2327 SALE_TYPES = [SearchType .SALE , SearchType .NEW_HOMES , SearchType .COMMERCIAL_SALE ]
2428
def __init__(
    self,
    search_type: SearchType,
    post_process_hook: Callable[[Listing, dict], Listing] = empty_post_process_hook,
):
    """Set up a search for one search type.

    Args:
        search_type: The ``SearchType`` member to search with.
        post_process_hook: Callable invoked as ``hook(listing, raw_data)``
            for every listing; whatever it returns is what the search
            produces. Defaults to ``empty_post_process_hook``.

    Raises:
        TypeError: If ``post_process_hook`` is not callable.
    """
    # Listings are produced lazily, so without this check a bad hook would
    # only fail on first iteration, far away from the constructor call.
    if not callable(post_process_hook):
        raise TypeError(
            "post_process_hook must be callable, got "
            + type(post_process_hook).__name__
        )

    self.search_type = search_type
    self.post_process_hook = post_process_hook
    self.site = Daft()
2833
29- def search (self , query : List [Option ], max_pages : int = sys .maxsize ):
34+ def search (self , query : List [Option ], max_pages : int = sys .maxsize , page_offset : int = 0 ):
3035 path = self ._build_search_path ()
3136
3237 # Convert options to their string form
3338 options = self ._translate_options (query )
34- listings = []
3539
3640 # If only one location is specified, it should be in the URL, not the params
3741 locations = options .get ('location' , [])
@@ -41,7 +45,7 @@ def search(self, query: List[Option], max_pages: int = sys.maxsize):
4145
4246 # Init pagination params
4347 options ['pageSize' ] = self .PAGE_SIZE
44- options ['from' ] = 0
48+ options ['from' ] = self . _calc_offset ( page_offset )
4549
4650 # Fetch the first page and get pagination info
4751 page_data = self ._get_page_data (path , options )
@@ -50,14 +54,12 @@ def search(self, query: List[Option], max_pages: int = sys.maxsize):
5054
5155 while current_page < min (totalPages , max_pages ):
5256 listing_data = page_data ['props' ]['pageProps' ]['listings' ]
53- listings . extend ( self ._get_listings (listing_data ) )
57+ yield from self ._get_listings (listing_data )
5458
5559 options ['from' ] = self ._calc_offset (current_page )
5660 page_data = self ._get_page_data (path , options )
5761 current_page = page_data ['props' ]['pageProps' ]['paging' ]['currentPage' ]
5862
59- return listings
60-
6163 def _build_search_path (self ):
6264 """Build the URL path for searches"""
6365 return "/" .join ([self .search_type .value , "ireland" ])
@@ -85,10 +87,11 @@ def _get_page_data(self, path, params):
8587
def _get_listings(self, listings: List[dict]):
    """Lazily convert raw search results into post-processed listings.

    Args:
        listings: Iterable of raw result entries. Each entry is indexed
            with ``'listing'`` to obtain the data loaded through
            ``ListingSchema``.

    Yields:
        For each entry, in order, the return value of
        ``self.post_process_hook(listing_object, raw_entry)``.
    """
    # The schema does not depend on the entry, so build it once instead of
    # once per listing.
    schema = ListingSchema()
    for entry in listings:
        marshalled = Listing(schema.load(entry['listing']))
        # The hook receives the whole raw entry, not just entry['listing'].
        yield self.post_process_hook(marshalled, entry)
9295
9396 def _calc_offset (self , current_page : int ):
9497 """Calculate the offset for pagination"""
0 commit comments