22const fs = require ( "fs-extra" ) ;
33const PDFMerger = require ( "pdf-merger-js" ) ;
44const puppeteer = require ( "puppeteer" ) ;
5- const { outputPath, overwrite } = require ( "./config.json" ) ;
5+ const { outputPath, overwrite, maxParallel } = require ( "./config.json" ) ;
6+
/**
 * Pause for a fixed amount of time.
 *
 * @param {number} ms - Delay in milliseconds, handed directly to `setTimeout`.
 * @returns {Promise<void>} Promise that resolves with no value once the timer fires.
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
10+
/**
 * Open a new browser tab and apply the settings every scraping tab shares:
 * JavaScript enabled, a 90 second navigation timeout, the given cookies,
 * and "screen" media emulation.
 *
 * If any setup step fails, the freshly opened tab is closed before the error
 * is rethrown so a failed setup does not leave an orphaned tab behind.
 *
 * @param {object} browser - Browser handle exposing `newPage()`.
 * @param {Array<object>} cookies - Cookie objects, spread into `page.setCookie`.
 * @returns {Promise<object>} The configured page.
 * @throws Rethrows whatever error a setup step raised.
 */
async function newPage(browser, cookies) {
  const page = await browser.newPage();
  try {
    await page.setJavaScriptEnabled(true);
    await page.setDefaultNavigationTimeout(90000);
    await page.setCookie(...cookies);
    await page.emulateMedia("screen");
  } catch (err) {
    // Best-effort cleanup: a failure while closing must not mask the setup error.
    await Promise.resolve(page.close()).catch(() => {});
    throw err;
  }
  return page;
}
619
/**
 * Navigate `page` to `url` and keep retrying until a response with HTTP
 * status 200 is obtained.
 *
 * Retry policy (one 10 second pause before every retry):
 *   - a response with a non-200 status is retried with `page.reload()`;
 *   - a failed navigation (rejected promise) or an empty response is retried
 *     with a fresh `page.goto(url)`.
 *
 * The loop has no attempt limit: it only returns once a 200 response arrives.
 * Relies on the file-level `sleep(ms)` helper for the pauses.
 *
 * @param {object} page - Page handle exposing `goto(url, options)` and `reload()`.
 * @param {string} url - Address to load.
 * @returns {Promise<object>} The same `page`, now showing a 200 response.
 */
async function ensureGoTo(page, url) {
  const retryDelayMs = 10000;
  const navigate = () => page.goto(url, { waitUntil: "networkidle2" });
  const reload = () => page.reload();

  let attempt = navigate;
  for (;;) {
    let response = null;
    try {
      response = await attempt();
    } catch (err) {
      // Navigation errors are expected on a flaky connection; report and retry.
      console.warn(`Navigation to ${url} failed, retrying:`, err);
    }

    if (response && response.status() === 200) {
      return page;
    }

    await sleep(retryDelayMs);
    // A bad status means the page was reachable, so a reload is enough;
    // with no response at all, start over with a full navigation.
    attempt = response ? reload : navigate;
  }
}
1433
15- async function convertToPDF ( page , url , name , i , stylesheet ) {
34+ async function convertToPDF ( tab , url , name , i , stylesheet ) {
1635 let filename = `${ i } .pdf` ;
1736 let path = `${ outputPath } /${ name } /${ filename } ` ;
1837
@@ -21,55 +40,58 @@ async function convertToPDF(page, url, name, i, stylesheet) {
2140 }
2241
2342 await fs . ensureDir ( path . replace ( filename , "" ) ) ;
24- page = await ensureGoTo ( page , url ) ;
43+ let page = await ensureGoTo ( tab , url ) ;
2544 await page . addStyleTag ( { "content" : stylesheet } ) ;
2645
2746 let height = await page . evaluate ( ( ) => {
2847 let article = document . querySelector ( "#content article" ) || document . querySelector ( "#content" ) || document . body ;
29- return article . scrollHeight ;
48+ return 0.95 * article . scrollHeight ; // seems there is some extra percentage of extra length
3049 } ) ;
3150 await page . pdf ( { path, height, "printBackground" : true } ) ;
3251 return path ;
3352}
3453
/**
 * Download one guide: render each of its chapters to a PDF and merge them
 * into a single `${outputPath}/${title}.pdf`.
 *
 * When the merged file already exists and `overwrite` is off, the path is
 * logged and nothing else happens. Otherwise a dedicated tab is opened for
 * the guide, the chapter links are read from `#chapters a[data-section-id]`,
 * each chapter is converted in order (numbered from 1), and the parts are
 * merged. The tab is always closed afterwards, on success or failure, so
 * guides processed in parallel batches do not accumulate open tabs.
 *
 * Uses the file-level `outputPath`, `overwrite`, `fs`, `PDFMerger`,
 * `newPage`, `ensureGoTo` and `convertToPDF`.
 *
 * @param {{url: string, title: string}} guide - Guide landing URL and the title used for file names.
 * @param {object} browser - Browser handle passed on to `newPage`.
 * @param {Array<object>} cookies - Cookies passed on to `newPage`.
 * @param {string} stylesheet - CSS text passed on to `convertToPDF`.
 * @returns {Promise<void>}
 */
async function scrapeGuide(guide, browser, cookies, stylesheet) {
  const { url, title } = guide;
  const mergedPath = `${outputPath}/${title}.pdf`;
  if (!overwrite && (await fs.pathExists(mergedPath))) {
    console.log(mergedPath);
    return;
  }

  const merger = new PDFMerger();
  let page = await newPage(browser, cookies);
  try {
    page = await ensureGoTo(page, url);

    const chapterUrls = await page.evaluate(() =>
      [...document.querySelectorAll("#chapters a[data-section-id]")].map((e) => e.href)
    );
    for (const [index, chapterUrl] of chapterUrls.entries()) {
      const chapterPath = await convertToPDF(page, chapterUrl, title, index + 1, stylesheet);
      // `add` is asynchronous in current pdf-merger-js releases; awaiting is
      // harmless for versions where it is synchronous.
      await merger.add(chapterPath);
      console.log(chapterPath);
    }

    await merger.save(mergedPath);
    console.log(mergedPath);
  } finally {
    await page.close();
  }
}
76+
3577( async ( ) => {
3678 const stylesheet = await fs . readFile ( "stylesheet.css" , "utf8" ) ;
3779 const cookies = await fs . readJSON ( "cookies.json" ) ;
3880 const browser = await puppeteer . launch ( ) ;
39- const page = await browser . newPage ( ) ;
4081
41- await page . setJavaScriptEnabled ( true ) ;
42- await page . setDefaultNavigationTimeout ( 90000 ) ;
43- await page . setCookie ( ...cookies ) ;
44- await page . emulateMedia ( "screen" ) ;
82+ let page = await newPage ( browser , cookies ) ;
4583 page = await ensureGoTo ( page , "https://primagames.com/accounts/account/my_guides" ) ;
4684 let guides = await page . evaluate ( ( ) => [ ...document . querySelectorAll ( "a.cover" ) ] . map ( e => ( {
4785 "url" : e . href ,
4886 "title" : e . nextSiblings ( ".title" ) [ 0 ] . innerText . replace ( / [ ^ A - Z a - z 0 - 9 ] + / g, "" )
4987 } ) ) ) ;
88+ await page . close ( ) ;
89+ console . log ( `Found ${ guides . length } eGuides` ) ;
5090
51- for ( let guide of guides ) {
52- let merger = new PDFMerger ( ) ;
53- let { url, title } = guide ;
54- page = await ensureGoTo ( page , url ) ;
55-
56- let pages = await page . evaluate ( ( ) => [ ...document . querySelectorAll ( "#chapters a[data-section-id]" ) ] . map ( e => e . href ) ) ;
57- for ( let i = 1 ; i <= pages . length ; i ++ ) {
58- let path = await convertToPDF ( page , pages [ i - 1 ] , title , i , stylesheet ) ;
59- console . log ( path ) ;
60- merger . add ( path ) ;
61- }
62-
63- let savePath = `${ outputPath } /${ title } .pdf` ;
64- if ( ! overwrite && await fs . pathExists ( savePath ) ) {
65- console . log ( savePath ) ;
66- continue ;
67- }
68-
69- await merger . save ( savePath ) ;
70- console . log ( savePath ) ;
91+ for ( let i = 0 ; i < guides . length ; i += maxParallel ) {
92+ await Promise . all ( guides . slice ( i , i + maxParallel ) . map ( guide => scrapeGuide ( guide , browser , cookies , stylesheet ) ) ) ;
93+ console . log ( i ) ;
7194 }
72-
7395 await browser . close ( ) ;
7496 process . exit ( 0 ) ;
7597} ) ( ) . catch ( err => {
0 commit comments