Skip to content

Commit cc3a9d2

Browse files
committed
implementation of maxParallel option
1 parent c60321d commit cc3a9d2

File tree

1 file changed

+55
-33
lines changed

1 file changed

+55
-33
lines changed

index.js

Lines changed: 55 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,36 @@
22
const fs = require("fs-extra");
33
const PDFMerger = require("pdf-merger-js");
44
const puppeteer = require("puppeteer");
5-
const { outputPath, overwrite } = require("./config.json");
5+
const { outputPath, overwrite, maxParallel } = require("./config.json");
6+
7+
/**
 * Pause asynchronously for a fixed amount of time.
 *
 * @param {number} ms - Delay in milliseconds, handed straight to setTimeout.
 * @returns {Promise<void>} Promise that fulfils (with no value) once the timer fires.
 */
function sleep(ms) {
    return new Promise((done) => {
        setTimeout(done, ms);
    });
}
10+
11+
/**
 * Open a new tab on `browser` and configure it for scraping.
 *
 * The tab gets JavaScript enabled, a 90 second navigation timeout, the given
 * cookies, and "screen" media emulation.
 *
 * @param {object} browser - Object exposing an async `newPage()` (the Puppeteer browser).
 * @param {object[]} cookies - Cookie objects, spread into `page.setCookie`.
 * @returns {Promise<object>} The configured page.
 * @throws Rethrows whatever a setup call rejected with; the tab is closed first.
 */
async function newPage(browser, cookies) {
    const page = await browser.newPage();
    try {
        await page.setJavaScriptEnabled(true);
        await page.setDefaultNavigationTimeout(90000);
        await page.setCookie(...cookies);
        // NOTE(review): newer Puppeteer releases expose this as `emulateMediaType` —
        // confirm against the installed version before upgrading.
        await page.emulateMedia("screen");
    } catch (err) {
        // A half-configured tab is useless to the caller and would otherwise stay
        // open for the lifetime of the browser. A failure to close it is ignored on
        // purpose so the original setup error is the one that propagates.
        await page.close().catch(() => {});
        throw err;
    }
    return page;
}
619

720
/**
 * Navigate `page` to `url` and keep retrying until a response with HTTP
 * status 200 is obtained.
 *
 * Retry policy (10 second pause before every retry, no upper bound):
 *   - a response with a non-200 status -> reload the page;
 *   - no usable response (navigation rejected, or resolved with a falsy
 *     value) -> navigate to `url` again.
 *
 * @param {object} page - Page exposing `goto(url, options)` and `reload(options)`.
 * @param {string} url - Address to load.
 * @returns {Promise<object>} The same `page`, once it has loaded with status 200.
 */
async function ensureGoTo(page, url) {
    // Same wait condition for first load and reloads, so callers always see a
    // page whose network activity has settled.
    const navigation = { "waitUntil": "networkidle2" };
    const retryDelay = 10000;

    // Failures are retried rather than thrown, but they are reported so that an
    // endless retry loop is visible instead of looking like a hang.
    const reportFailure = (err) => {
        console.warn(`Navigation to ${url} failed, retrying:`, err);
        return false;
    };

    let response = await page.goto(url, navigation).catch(reportFailure);
    // Iterative retry: no promise chain that grows with every failed attempt.
    while (!response || response.status() !== 200) {
        await sleep(retryDelay);
        response = response
            ? await page.reload(navigation).catch(reportFailure)
            : await page.goto(url, navigation).catch(reportFailure);
    }
    return page;
}
1433

15-
async function convertToPDF(page, url, name, i, stylesheet) {
34+
async function convertToPDF(tab, url, name, i, stylesheet) {
1635
let filename = `${i}.pdf`;
1736
let path = `${outputPath}/${name}/${filename}`;
1837

@@ -21,55 +40,58 @@ async function convertToPDF(page, url, name, i, stylesheet) {
2140
}
2241

2342
await fs.ensureDir(path.replace(filename, ""));
24-
page = await ensureGoTo(page, url);
43+
let page = await ensureGoTo(tab, url);
2544
await page.addStyleTag({ "content": stylesheet });
2645

2746
let height = await page.evaluate(() => {
2847
let article = document.querySelector("#content article") || document.querySelector("#content") || document.body;
29-
return article.scrollHeight;
48+
return 0.95 * article.scrollHeight; // seems there is some extra percentage of extra length
3049
});
3150
await page.pdf({ path, height, "printBackground": true });
3251
return path;
3352
}
3453

54+
/**
 * Scrape one eGuide: render each chapter to its own PDF, then merge them into
 * `${outputPath}/${title}.pdf`.
 *
 * When `overwrite` is falsy and the merged file already exists, the guide is
 * skipped (its path is still logged) and no tab is opened.
 *
 * @param {{url: string, title: string}} guide - Guide landing page and the title used for file names.
 * @param {object} browser - Browser handed to `newPage` to open a dedicated tab.
 * @param {object[]} cookies - Cookies applied to the new tab.
 * @param {string} stylesheet - CSS text forwarded to `convertToPDF`.
 * @returns {Promise<void>}
 */
async function scrapeGuide(guide, browser, cookies, stylesheet) {
    const { url, title } = guide;
    const mergedPath = `${outputPath}/${title}.pdf`;
    if (!overwrite && await fs.pathExists(mergedPath)) {
        console.log(mergedPath);
        return;
    }

    const merger = new PDFMerger();
    const tab = await newPage(browser, cookies);
    try {
        const page = await ensureGoTo(tab, url);

        const chapterUrls = await page.evaluate(() => [...document.querySelectorAll("#chapters a[data-section-id]")].map(e => e.href));
        // Chapter files are numbered from 1, in the order the links appear.
        for (const [index, chapterUrl] of chapterUrls.entries()) {
            const chapterPath = await convertToPDF(page, chapterUrl, title, index + 1, stylesheet);
            // Awaited so the call is also correct if `add` returns a promise;
            // awaiting a plain value is a no-op.
            await merger.add(chapterPath);
            console.log(chapterPath);
        }

        await merger.save(mergedPath);
        console.log(mergedPath);
    } finally {
        // Every guide opens its own tab; without this they pile up in the
        // browser for the whole run. A close failure is only reported so it
        // cannot mask an error raised while scraping.
        await tab.close().catch((err) => {
            console.warn(`Could not close tab for ${title}:`, err);
        });
    }
}
76+
3577
(async() => {
3678
const stylesheet = await fs.readFile("stylesheet.css", "utf8");
3779
const cookies = await fs.readJSON("cookies.json");
3880
const browser = await puppeteer.launch();
39-
const page = await browser.newPage();
4081

41-
await page.setJavaScriptEnabled(true);
42-
await page.setDefaultNavigationTimeout(90000);
43-
await page.setCookie(...cookies);
44-
await page.emulateMedia("screen");
82+
let page = await newPage(browser, cookies);
4583
page = await ensureGoTo(page, "https://primagames.com/accounts/account/my_guides");
4684
let guides = await page.evaluate(() => [...document.querySelectorAll("a.cover")].map(e => ({
4785
"url": e.href,
4886
"title": e.nextSiblings(".title")[0].innerText.replace(/[^A-Za-z0-9 ]+/g, "")
4987
})));
88+
await page.close();
89+
console.log(`Found ${guides.length} eGuides`);
5090

51-
for (let guide of guides) {
52-
let merger = new PDFMerger();
53-
let { url, title } = guide;
54-
page = await ensureGoTo(page, url);
55-
56-
let pages = await page.evaluate(() => [...document.querySelectorAll("#chapters a[data-section-id]")].map(e => e.href));
57-
for (let i = 1; i <= pages.length; i++) {
58-
let path = await convertToPDF(page, pages[i - 1], title, i, stylesheet);
59-
console.log(path);
60-
merger.add(path);
61-
}
62-
63-
let savePath = `${outputPath}/${title}.pdf`;
64-
if (!overwrite && await fs.pathExists(savePath)) {
65-
console.log(savePath);
66-
continue;
67-
}
68-
69-
await merger.save(savePath);
70-
console.log(savePath);
91+
for (let i = 0; i < guides.length; i += maxParallel) {
92+
await Promise.all(guides.slice(i, i + maxParallel).map(guide => scrapeGuide(guide, browser, cookies, stylesheet)));
93+
console.log(i);
7194
}
72-
7395
await browser.close();
7496
process.exit(0);
7597
})().catch(err => {

0 commit comments

Comments
 (0)