import { ScrapflyClient, ScrapeConfig, ScreenshotConfig, ExtractionConfig, log } from 'jsr:@scrapfly/scrapfly-sdk';
// You can enable debug logs to see more details
log.setLevel('DEBUG');
/* To start, you can always get your account information using the .account() method
 */
export async function getAccount(apiKey: string) {
    const client = new ScrapflyClient({ key: apiKey });
    const account = await client.account();
    console.log('account');
    console.log(account);
}
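/* Sketch: the account call is also a cheap way to verify an API key before
 * running the heavier examples; the error handling below is generic and does
 * not assume any SDK-specific error classes.
 */
export async function checkKey(apiKey: string) {
    const client = new ScrapflyClient({ key: apiKey });
    try {
        await client.account();
        console.log('API key is valid');
    } catch (error) {
        console.log('API key check failed:', error);
    }
}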
/* For a basic scrape the only required parameter is the URL
 */
export async function basicGet(apiKey: string) {
    const client = new ScrapflyClient({ key: apiKey });
    const scrape_result = await client.scrape(
        new ScrapeConfig({
            url: 'https://httpbin.dev/html',
            // Anti Scraping Protection bypass - enable this when scraping protected targets
            asp: true,
            // server side cache - great for repeated requests
            cache: true,
            cache_ttl: 3600, // in seconds
            // cache_clear: true, // you can always clear the cache explicitly!
        }),
    );
    // the scrape_result.result object contains all result details
    console.log('web log url:'); // you can check the web UI for request details:
    console.log(scrape_result.result.log_url);
    console.log('page content:');
    console.log(scrape_result.result.content);
    console.log('response headers:');
    console.log(scrape_result.result.response_headers);
    console.log('response cookies:');
    console.log(scrape_result.result.cookies);
}
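/* Sketch: the returned HTML can be parsed locally with any DOM library; this
 * example assumes the third-party deno_dom package (not part of the Scrapfly SDK).
 */
export async function basicGetParsed(apiKey: string) {
    const { DOMParser } = await import('https://deno.land/x/deno_dom/deno-dom-wasm.ts');
    const client = new ScrapflyClient({ key: apiKey });
    const scrape_result = await client.scrape(
        new ScrapeConfig({ url: 'https://httpbin.dev/html' }),
    );
    // parse the scraped HTML and pull out the first heading
    const doc = new DOMParser().parseFromString(scrape_result.result.content, 'text/html');
    console.log('first heading:');
    console.log(doc?.querySelector('h1')?.textContent);
}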
/* Enabling render_js runs the scrape in Scrapfly cloud browsers and unlocks
 * a bunch of other features like browser control, JS execution, screenshots, etc.
 */
export async function JSRender(apiKey: string) {
    const client = new ScrapflyClient({ key: apiKey });
    const scrape_result = await client.scrape(
        new ScrapeConfig({
            url: 'https://web-scraping.dev/product/1',
            // enable browsers:
            render_js: true,
            // this enables more options
            // you can wait for some element to appear:
            wait_for_selector: '.review',
            // you can wait explicitly for N milliseconds
            rendering_wait: 3000, // 3 seconds
            // you can control the browser through scenarios:
            // https://scrapfly.io/docs/scrape-api/javascript-scenario
            js_scenario: [
                { click: { selector: '#load-more-reviews' } },
                { wait: 2000 },
            ],
            // or even run any custom JS code!
            js: 'return document.querySelector(".review").innerText',
        }),
    );
    // the scrape_result.result object contains all result details:
    console.log('web log url:'); // you can check the web UI for request details:
    console.log(scrape_result.result.log_url);
    console.log('page content:');
    console.log(scrape_result.result.content.substring(0, 1000) + '...');
    console.log('browser data capture:');
    console.log(scrape_result.result.browser_data);
}
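/* Sketch: scenarios support more actions than click/wait; the form-style
 * example below assumes the "fill" and "wait_for_navigation" actions from the
 * scenario docs linked above, and the selectors/values are hypothetical.
 */
export const exampleLoginScenario = [
    { fill: { selector: 'input[name=username]', value: 'user123' } },
    { fill: { selector: 'input[name=password]', value: 'password' } },
    { click: { selector: 'button[type=submit]' } },
    { wait_for_navigation: { timeout: 5000 } },
];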
/* Use AI extraction capabilities with the web scraping API;
 * all Extraction API methods are supported, see the examples below for more
 */
export async function scrapeExtraction(apiKey: string) {
    const client = new ScrapflyClient({ key: apiKey });
    const scrape_result = await client.scrape(
        new ScrapeConfig({
            url: 'https://web-scraping.dev/product/1',
            // enable browsers:
            render_js: true,
            // use an LLM prompt for automatic parsing
            extraction_prompt: 'Extract the product specification in json format',
        }),
    );
    // access the extraction result
    console.log('extraction result:');
    console.log(scrape_result.result.extracted_data);
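    /* Sketch (an assumption, not part of the original example): when the
     * prompt requests JSON, extracted_data may arrive as a JSON string rather
     * than an object, so parsing defensively keeps the example robust.
     */
    const data = scrape_result.result.extracted_data;
    const parsed = typeof data === 'string' ? JSON.parse(data) : data;
    console.log('parsed extraction:');
    console.log(parsed);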
}
/* Scrapfly Extraction API offers LLM (Large Language Model) based extraction
 * This example demonstrates how to query HTML files with LLM prompts
 * https://scrapfly.io/docs/extraction-api/llm-prompt
 */
export async function extractionLLM(apiKey: string) {
    const client = new ScrapflyClient({ key: apiKey });
    // First, get HTML either from the Web Scraping API or your own storage
    const html = (await client.scrape(
        new ScrapeConfig({
            url: 'https://web-scraping.dev/product/1',
        }),
    )).result.content;
    // LLM Parsing - extract content using LLM queries
    const llm_result = await client.extract(
        new ExtractionConfig({
            // identify content type like text/html or text/markdown etc.
            content_type: 'text/html',
            body: html,
            // use any prompt
            extraction_prompt: 'get product price only',
        }),
    );
    console.log('llm extraction');
    console.log(llm_result);
    // You can also request the LLM to output specific formats like JSON or CSV
    const llm_format_result = await client.extract(
        new ExtractionConfig({
            content_type: 'text/html',
            body: html,
            // directly request the format in the prompt
            extraction_prompt: 'get product price and currency in JSON',
        }),
    );
    console.log('llm extraction in JSON');
    console.log(llm_format_result);
}
/* Scrapfly Extraction API offers the Auto Extract engine
 * which can extract common web objects like products, articles etc.
 * https://scrapfly.io/docs/extraction-api/automatic-ai
 */
export async function extractionAutoExtract(apiKey: string) {
    const client = new ScrapflyClient({ key: apiKey });
    // First, get HTML either from the Web Scraping API or your own storage
    const html = (await client.scrape(
        new ScrapeConfig({
            url: 'https://web-scraping.dev/product/1',
        }),
    )).result.content;
    // Auto Extract - extract common web objects using predefined models
    const product_result = await client.extract(
        new ExtractionConfig({
            // identify content type like text/html or text/markdown etc.
            content_type: 'text/html',
            body: html,
            // define model type: product, article etc.
            // see https://scrapfly.io/docs/extraction-api/automatic-ai#models
            extraction_model: 'product',
        }),
    );
    console.log('product auto extract');
    console.log(product_result);
}
/* Scrapfly Extraction API offers a Template extraction engine
 * Use JSON schemas to mark up extraction rules using XPath or CSS selectors
 * https://scrapfly.io/docs/extraction-api/rules-and-template
 */
export async function extractionTemplates(apiKey: string) {
    const client = new ScrapflyClient({ key: apiKey });
    // First, get HTML either from the Web Scraping API or your own storage
    const html = (await client.scrape(
        new ScrapeConfig({
            url: 'https://web-scraping.dev/reviews',
            render_js: true,
            wait_for_selector: '.review',
        }),
    )).result.content;
    // Define your template, see https://scrapfly.io/docs/extraction-api/rules-and-template
    const template = {
        source: 'html',
        selectors: [
            {
                name: 'date_posted',
                type: 'css',
                query: "[data-testid='review-date']::text",
                multiple: true,
                formatters: [
                    {
                        name: 'datetime',
                        args: { format: '%Y, %b %d — %A' },
                    },
                ],
            },
        ],
    };
    const template_result = await client.extract(
        new ExtractionConfig({
            body: html,
            content_type: 'text/html',
            // provide the template:
            extraction_ephemeral_template: template,
        }),
    );
    console.log('template extract');
    console.log(template_result);
}
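/* Sketch: a template can declare multiple selectors side by side; the second
 * field below is hypothetical (illustrative name and query only), following
 * the same JSON structure as the template above.
 */
export const exampleMultiFieldTemplate = {
    source: 'html',
    selectors: [
        { name: 'date_posted', type: 'css', query: "[data-testid='review-date']::text", multiple: true },
        // hypothetical second field - adjust the query to the target page markup
        { name: 'review_text', type: 'css', query: '.review > p::text', multiple: true },
    ],
};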
/* Scrapfly Screenshot API is made for super easy screenshot capture -
 * capture screenshots of full pages or specific sections
 * https://scrapfly.io/docs/screenshot-api/getting-started
 */
export async function screenshot(apiKey: string) {
    const client = new ScrapflyClient({ key: apiKey });
    const screenshot_result = await client.screenshot(
        new ScreenshotConfig({
            url: 'https://web-scraping.dev/product/1',
            // by default 1920x1080 is captured but resolution can take any value
            resolution: '540x1200', // for example - a tall smartphone viewport
            // to capture all visible parts use full page capture
            capture: 'fullpage',
            // you can also capture specific elements with css or xpath
            // wait_for_selector: '#reviews', // wait for reviews to load
            // capture: '#reviews', // capture only the reviews element
            // for pages that require scrolling to load elements (like endless paging) use
            auto_scroll: true,
        }),
    );
    console.log('captured screenshot:');
    console.log(screenshot_result);
    // use the shortcut to save screenshots to a file:
    await client.saveScreenshot(screenshot_result, 'screenshot');
    console.log('saved screenshot to ./screenshot.jpg');
}
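/* Sketch (an assumption about the result shape, not confirmed by this file):
 * if the screenshot result exposes the binary under .image as an ArrayBuffer,
 * the bytes could also be written manually instead of using saveScreenshot:
 */
// await Deno.writeFile('screenshot.jpg', new Uint8Array(screenshot_result.image));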
// CLI entry point
async function main(): Promise<void> {
    if (Deno.args.length < 2) {
        console.log(
            `Usage: deno run --allow-net --allow-write deno_examples.ts <functionName> <apiKey>\n` +
            `getAccount - Get account information\n` +
            `basicGet - Basic scrape\n` +
            `JSRender - Scrape with JS rendering\n` +
            `scrapeExtraction - Scrape with AI extraction\n` +
            `extractionLLM - Extract content using LLM queries\n` +
            `extractionAutoExtract - Extract common web objects using Auto Extract\n` +
            `extractionTemplates - Extract content using the Template engine\n` +
            `screenshot - Capture screenshots using the Screenshot API\n`,
        );
        return;
    }
    const [functionName, apiKey] = Deno.args;
    // Dynamically import the current module to look the function up by name
    const module = await import('./deno_examples.ts') as Record<string, unknown>;
    if (typeof module[functionName] === 'function') {
        await (module[functionName] as (key: string) => Promise<void>)(apiKey);
    } else {
        console.log(`Function ${functionName} not found.`);
    }
}
// Check if the script is being run directly
if (import.meta.main) {
    await main();
}
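// Example invocation (the key placeholder below is illustrative):
//   deno run --allow-net --allow-write deno_examples.ts basicGet YOUR-SCRAPFLY-API-KEY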