This repository has been archived by the owner on Apr 4, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathloader.ts
125 lines (124 loc) · 4.14 KB
/
loader.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import puppeteer from 'puppeteer';
import { CacheHTML } from './cache';
import { Pool } from './pool';
import { debugLog } from './logging';
/**
 * Loads the rendered HTML of a URL through a pooled browser page.
 *
 * Results are stored in the HTML cache, and URLs that fail to load are
 * tracked: a first failure puts the URL in `badURLs`, a second failure moves
 * it to `badURLsConfirmed`, after which `spitHTML` rejects it immediately.
 */
export class Loader {
    /** Resource types that are allowed to load; every other type is aborted. */
    private static readonly allowedResourceTypes: readonly string[] = [
        "document",
        "script",
        "xhr",
        "fetch",
    ];

    /** Literal URL fragments; a request whose URL contains one is aborted. */
    private static readonly blockedURLFragments: readonly string[] = [
        "www.google-analytics.com",
        "/gtag/js",
        "gs.js",
        "analytics.js",
    ];

    /** URLs that have failed to load exactly once so far. */
    badURLs: Set<string>;
    /** URLs that have failed to load twice; these are rejected up front. */
    badURLsConfirmed: Set<string>;
    pool: Pool;
    cache: CacheHTML;
    /** Value passed as the `timeout` option of `page.goto`. */
    timeout: number;

    constructor(pool: Pool, cache: CacheHTML, timeout: number) {
        this.badURLs = new Set<string>();
        this.badURLsConfirmed = new Set<string>();
        this.pool = pool;
        this.cache = cache;
        this.timeout = timeout;
    }

    /**
     * Returns the HTML of `url`, from the cache when available, otherwise by
     * loading the page in a browser obtained from the pool.
     *
     * @param url - The URL to load.
     * @param wait - Wait mode understood by `Loader.buildWaitUntil`.
     * @returns The cached value, or the freshly loaded page content.
     * @throws If `url` is a confirmed bad URL, if `wait` is invalid, or if
     *   the page setup / navigation / content extraction fails (the original
     *   error is rethrown after the URL has been recorded as bad).
     */
    spitHTML = async (url: string, wait: string | undefined) => {
        // Check if it's in bad URLs
        if (this.badURLsConfirmed.has(url)) {
            throw Error(url + " is a confirmed bad URL");
        }
        // Check if it's in our cache
        if (this.cache.hasValue(url)) {
            debugLog.loader("cache has HTML for URL " + url)
            return this.cache.getValueHTML(url);
        }
        // Validate the wait mode before acquiring any resource: an invalid
        // parameter is a caller error and must neither count as a failure of
        // the URL nor leave a page open.
        const waitUntil = Loader.buildWaitUntil(wait);
        // Get a browser instance and create a page
        const browser = await this.pool.getBrowser();
        const page = await browser.createPage();
        let html: string;
        try {
            // Avoid all unnecessary HTTP requests
            page.on('request', req => {
                if (Loader.requestIsAllowed(req)) {
                    req.continue();
                } else {
                    req.abort();
                }
            });
            await Promise.all([
                page.setCacheEnabled(false),
                page.setRequestInterception(true),
            ]);
            // Load and wait for the page
            debugLog.loader("going to page " + url);
            await page.goto(url, {
                waitUntil,
                timeout: this.timeout,
            });
            html = await page.content();
        } catch (e) {
            this.recordBadURL(url);
            throw e;
        } finally {
            // Always release the page, on failure as well as on success.
            // Closing is not awaited, but a rejection is logged instead of
            // surfacing as an unhandled promise rejection.
            page.close().catch((err: unknown) => {
                debugLog.loader("failed to close page for URL " + url + ": " + String(err));
            });
        }
        // Cleaning up
        this.cache.reduceSize(); // async but no need to wait
        this.cache.cleanOld(); // async but no need to wait
        this.cache.setValue(url, html); // async but no need to wait
        debugLog.loader("spitting HTML of URL " + url);
        return html;
    }

    /**
     * Records a load failure for `url`: the first failure adds it to
     * `badURLs`, the second promotes it to `badURLsConfirmed`. Already
     * confirmed URLs are left untouched.
     */
    recordBadURL = (url: string) => {
        if (this.badURLsConfirmed.has(url)) {
            return;
        }
        if (this.badURLs.has(url)) {
            this.badURLs.delete(url);
            this.badURLsConfirmed.add(url);
        } else {
            this.badURLs.add(url);
        }
    }

    /**
     * Maps the `wait` request parameter to a puppeteer load event.
     *
     * `"load"` and `"domcontentloaded"` map to themselves, `"2"` and an
     * omitted value map to `"networkidle0"`, and `"3"` maps to
     * `"networkidle2"`.
     *
     * @throws If `wait` is any other value.
     */
    static buildWaitUntil = (wait: string | undefined) => {
        let waitUntil: puppeteer.LoadEvent;
        switch (wait) {
            case "load":
                waitUntil = "load";
                break;
            case "domcontentloaded":
                waitUntil = "domcontentloaded";
                break;
            case "2":
            case undefined:
                waitUntil = "networkidle0";
                break;
            case "3":
                waitUntil = "networkidle2";
                break;
            default:
                throw Error(`wait parameter ${wait} is invalid`);
        }
        return waitUntil;
    }

    /**
     * Decides whether an intercepted request may proceed.
     *
     * A request is allowed only when its resource type is in the allow-list
     * and its URL contains none of the blocked fragments.
     *
     * @returns `true` to let the request continue, `false` to abort it.
     */
    static requestIsAllowed = (req: puppeteer.Request) => {
        const url = req.url();
        if (!Loader.allowedResourceTypes.includes(req.resourceType())) {
            debugLog.loader("unallowed resource type for resource URL: " + url);
            return false;
        }
        // Plain substring test: the fragments are literal text, so a "."
        // must only match a dot and not act as a regex wildcard.
        const isBlocked = Loader.blockedURLFragments.some(fragment => url.includes(fragment));
        if (isBlocked) {
            debugLog.loader("blacklisted resource URL: " + url);
            return false;
        }
        return true;
    }
}