From 08cede5154832840a3e515cee613dc097d07619b Mon Sep 17 00:00:00 2001 From: Mazen Ramadan Date: Mon, 20 May 2024 09:29:09 +0300 Subject: [PATCH 1/3] add screenshot_flags and format api params support --- src/scrapeconfig.ts | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/scrapeconfig.ts b/src/scrapeconfig.ts index d7f2091..26bdf8d 100644 --- a/src/scrapeconfig.ts +++ b/src/scrapeconfig.ts @@ -24,6 +24,7 @@ export class ScrapeConfig { proxy_pool?: string = null; session?: string = null; tags: Set = new Set(); + format?: 'json' | 'text' | 'markdown' | 'clean_html' | 'raw' = 'raw'; //unchanged correlation_id?: string = null; cookies?: Rec = null; body?: string = null; @@ -34,6 +35,7 @@ export class ScrapeConfig { wait_for_selector?: string = null; session_sticky_proxy = false; screenshots?: Rec = null; + screenshot_flags?: string = null; webhook?: string = null; timeout?: number = null; // in milliseconds js_scenario?: Rec = null; @@ -60,6 +62,7 @@ export class ScrapeConfig { proxy_pool?: string; session?: string; tags?: Array; + format?: 'json' | 'text' | 'markdown' | 'clean_html' | 'raw'; correlation_id?: string; cookies?: Rec; body?: string; @@ -69,6 +72,7 @@ export class ScrapeConfig { rendering_wait?: number; wait_for_selector?: string; screenshots?: Rec; + screenshot_flags?: string; session_sticky_proxy?: boolean; webhook?: string; timeout?: number; // in milliseconds @@ -96,6 +100,7 @@ export class ScrapeConfig { this.proxy_pool = options.proxy_pool ?? this.proxy_pool; this.session = options.session ?? this.session; this.tags = new Set(options.tags) ?? this.tags; + this.format = options.format ?? this.format; this.correlation_id = options.correlation_id ?? this.correlation_id; this.cookies = options.cookies ? Object.fromEntries(Object.entries(options.cookies).map(([k, v]) => [k.toLowerCase(), v])) @@ -106,6 +111,7 @@ export class ScrapeConfig { this.rendering_wait = options.rendering_wait ?? 
this.rendering_wait; this.wait_for_selector = options.wait_for_selector ?? this.wait_for_selector; this.screenshots = options.screenshots ?? this.screenshots; + this.screenshot_flags = options.screenshot_flags ?? this.screenshot_flags; this.webhook = options.webhook ?? this.webhook; this.timeout = options.timeout ?? this.timeout; this.js_scenario = options.js_scenario ?? this.js_scenario; @@ -194,6 +200,13 @@ export class ScrapeConfig { Object.keys(this.screenshots).forEach((key) => { params[`screenshots[${key}]`] = this.screenshots[key]; }); + if (this.screenshot_flags) { + params.screenshot_flags = this.screenshot_flags; + } + } else { + if (this.screenshot_flags) { + log.warn('Params "screenshot_flags" is ignored. Works only if screenshots is enabled'); + } } if (this.auto_scroll !== null) { params.auto_scroll = this.auto_scroll; @@ -247,6 +260,9 @@ export class ScrapeConfig { if (this.tags.size > 0) { params.tags = Array.from(this.tags).join(','); } + if (this.format) { + params.format = this.format; + } if (this.correlation_id) { params.correlation_id = this.correlation_id; } From dd709bbc795cf6d8dd8dff208acc60b3ab76f5c1 Mon Sep 17 00:00:00 2001 From: Mazen Ramadan Date: Wed, 22 May 2024 18:02:00 +0300 Subject: [PATCH 2/3] update scrapeConfig tests and remove default format api param --- __tests__/scrapeconfig.test.ts | 26 ++++++++++++++++++++++++++ src/scrapeconfig.ts | 4 ++-- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/__tests__/scrapeconfig.test.ts b/__tests__/scrapeconfig.test.ts index f7e63cc..4c179ed 100644 --- a/__tests__/scrapeconfig.test.ts +++ b/__tests__/scrapeconfig.test.ts @@ -194,6 +194,21 @@ describe('url param generation', () => { 'screenshots[everything]': 'fullpage', }); }); + it('screenshot flags converted to params', () => { + const config = new ScrapeConfig({ + url: 'http://httpbin.dev/get', + screenshots: { everything: 'fullpage' }, + screenshot_flags: 
"load_images,dark_mode,block_banners,high_quality,print_media_format", + render_js: true, + }); + expect(config.toApiParams({ key: '1234' })).toEqual({ + key: '1234', + url: 'http://httpbin.dev/get', + 'screenshots[everything]': 'fullpage', + screenshot_flags: "load_images,dark_mode,block_banners,high_quality,print_media_format", + render_js: true, + }); + }); it('asp enables', () => { const config = new ScrapeConfig({ url: 'http://httpbin.dev/get', @@ -238,6 +253,17 @@ describe('url param generation', () => { tags: 'foo,bar,gaz', }); }); + it('format set', () => { + const config = new ScrapeConfig({ + url: 'http://httpbin.dev/get', + format: "markdown", + }); + expect(config.toApiParams({ key: '1234' })).toEqual({ + key: '1234', + url: 'http://httpbin.dev/get', + format: "markdown", + }); + }); it('debug sets', () => { const config = new ScrapeConfig({ url: 'http://httpbin.dev/get', diff --git a/src/scrapeconfig.ts b/src/scrapeconfig.ts index 26bdf8d..f1237f4 100644 --- a/src/scrapeconfig.ts +++ b/src/scrapeconfig.ts @@ -24,7 +24,7 @@ export class ScrapeConfig { proxy_pool?: string = null; session?: string = null; tags: Set = new Set(); - format?: 'json' | 'text' | 'markdown' | 'clean_html' | 'raw' = 'raw'; //unchanged + format?: string = null; // raw(unchanged) correlation_id?: string = null; cookies?: Rec = null; body?: string = null; @@ -62,7 +62,7 @@ export class ScrapeConfig { proxy_pool?: string; session?: string; tags?: Array; - format?: 'json' | 'text' | 'markdown' | 'clean_html' | 'raw'; + format?: string; correlation_id?: string; cookies?: Rec; body?: string; From 82153cab84ef90fb8a357587863f2a33b0170f20 Mon Sep 17 00:00:00 2001 From: Mazen Ramadan Date: Thu, 23 May 2024 13:55:29 +0300 Subject: [PATCH 3/3] update scrape config param types and add examples --- __tests__/scrapeconfig.test.ts | 8 ++++++- examples/scrape-as-markdown.js | 16 ++++++++++++++ examples/screenshot-with-image-loading.js | 27 +++++++++++++++++++++++ src/scrapeconfig.ts | 17 
++++++++------ 4 files changed, 60 insertions(+), 8 deletions(-) create mode 100644 examples/scrape-as-markdown.js create mode 100644 examples/screenshot-with-image-loading.js diff --git a/__tests__/scrapeconfig.test.ts b/__tests__/scrapeconfig.test.ts index 4c179ed..646d30f 100644 --- a/__tests__/scrapeconfig.test.ts +++ b/__tests__/scrapeconfig.test.ts @@ -198,7 +198,13 @@ describe('url param generation', () => { const config = new ScrapeConfig({ url: 'http://httpbin.dev/get', screenshots: { everything: 'fullpage' }, - screenshot_flags: "load_images,dark_mode,block_banners,high_quality,print_media_format", + screenshot_flags: [ + "load_images", + "dark_mode", + "block_banners", + "high_quality", + "print_media_format" + ], render_js: true, }); expect(config.toApiParams({ key: '1234' })).toEqual({ diff --git a/examples/scrape-as-markdown.js b/examples/scrape-as-markdown.js new file mode 100644 index 0000000..e48b56e --- /dev/null +++ b/examples/scrape-as-markdown.js @@ -0,0 +1,16 @@ +/* +This example shows how to scrape a page and get its content back as markdown using the format parameter in scrapfly +*/ +import { ScrapflyClient, ScrapeConfig } from 'scrapfly-sdk'; + +const key = 'YOUR SCRAPFLY KEY'; +const client = new ScrapflyClient({ key }); +const result = await client.scrape( + new ScrapeConfig({ + url: 'https://web-scraping.dev/products/', + // scrape the page data in markdown format, which is supported by LLMs. 
+ // null=raw(unchanged); other supported formats are: json, text, clean_html + format: "markdown" + }), +); +console.log(result.result.content); diff --git a/examples/screenshot-with-image-loading.js b/examples/screenshot-with-image-loading.js new file mode 100644 index 0000000..93193e4 --- /dev/null +++ b/examples/screenshot-with-image-loading.js @@ -0,0 +1,27 @@ +/* +This example shows how to capture page screenshots with images and additional configuration in scrapfly +*/ +import { ScrapflyClient, ScrapeConfig } from 'scrapfly-sdk'; + +const key = 'YOUR SCRAPFLY KEY'; +const client = new ScrapflyClient({ key }); +const result = await client.scrape( + new ScrapeConfig({ + url: 'https://web-scraping.dev/products/', + // enable headless browsers for screenshots + render_js: true, + // screenshots to capture; optionally, you can wait for the page to load before capturing (rendering_wait / wait_for_selector) + screenshots: { + everything: 'fullpage', + reviews: '#reviews', + }, + screenshot_flags: [ + "load_images", // Enable image rendering with the request, adds extra usage for the bandwidth consumed + "dark_mode", // Enable dark mode display + "block_banners", // Block cookie banners and overlays that cover the screen + "high_quality", // No compression on the output image + "print_media_format" // Render the page in print mode + ] + }), +); +console.log(result.result.screenshots); diff --git a/src/scrapeconfig.ts b/src/scrapeconfig.ts index f1237f4..e973eaa 100644 --- a/src/scrapeconfig.ts +++ b/src/scrapeconfig.ts @@ -3,10 +3,13 @@ import { log } from './logger.js'; import { Rec, HttpMethod } from './types.js'; import { ScrapeConfigError } from './errors.js'; +type ScreenshotFlags = "load_images" | "dark_mode" | "block_banners" | "high_quality" | "print_media_format"; +type Format = "raw" | "json" | "text" | "markdown" | "clean_html"; + export class ScrapeConfig { static PUBLIC_DATACENTER_POOL = 'public_datacenter_pool'; static PUBLIC_RESIDENTIAL_POOL = 'public_residential_pool'; - + url: string; retry = true; method: 
HttpMethod = 'GET'; @@ -24,7 +27,7 @@ export class ScrapeConfig { proxy_pool?: string = null; session?: string = null; tags: Set = new Set(); - format?: string = null; // raw(unchanged) + format?: Format = null; // raw(unchanged) correlation_id?: string = null; cookies?: Rec = null; body?: string = null; @@ -35,7 +38,7 @@ export class ScrapeConfig { wait_for_selector?: string = null; session_sticky_proxy = false; screenshots?: Rec = null; - screenshot_flags?: string = null; + screenshot_flags?: ScreenshotFlags[] = null; webhook?: string = null; timeout?: number = null; // in milliseconds js_scenario?: Rec = null; @@ -62,7 +65,7 @@ export class ScrapeConfig { proxy_pool?: string; session?: string; tags?: Array; - format?: string; + format?: Format; correlation_id?: string; cookies?: Rec; body?: string; @@ -72,7 +75,7 @@ export class ScrapeConfig { rendering_wait?: number; wait_for_selector?: string; screenshots?: Rec; - screenshot_flags?: string; + screenshot_flags?: ScreenshotFlags[]; session_sticky_proxy?: boolean; webhook?: string; timeout?: number; // in milliseconds @@ -201,7 +204,7 @@ export class ScrapeConfig { params[`screenshots[${key}]`] = this.screenshots[key]; }); if (this.screenshot_flags) { - params.screenshot_flags = this.screenshot_flags; + params.screenshot_flags = this.screenshot_flags.join(','); } } else { if (this.screenshot_flags) { @@ -261,7 +264,7 @@ export class ScrapeConfig { params.tags = Array.from(this.tags).join(','); } if (this.format) { - params.format = this.format; + params.format = this.format.valueOf(); } if (this.correlation_id) { params.correlation_id = this.correlation_id;