diff --git a/__tests__/scrapeconfig.test.ts b/__tests__/scrapeconfig.test.ts index f7e63cc..646d30f 100644 --- a/__tests__/scrapeconfig.test.ts +++ b/__tests__/scrapeconfig.test.ts @@ -194,6 +194,27 @@ describe('url param generation', () => { 'screenshots[everything]': 'fullpage', }); }); + it('screenshot flags converted to params', () => { + const config = new ScrapeConfig({ + url: 'http://httpbin.dev/get', + screenshots: { everything: 'fullpage' }, + screenshot_flags: [ + "load_images", + "dark_mode", + "block_banners", + "high_quality", + "print_media_format" + ], + render_js: true, + }); + expect(config.toApiParams({ key: '1234' })).toEqual({ + key: '1234', + url: 'http://httpbin.dev/get', + 'screenshots[everything]': 'fullpage', + screenshot_flags: "load_images,dark_mode,block_banners,high_quality,print_media_format", + render_js: true, + }); + }); it('asp enables', () => { const config = new ScrapeConfig({ url: 'http://httpbin.dev/get', @@ -238,6 +259,17 @@ describe('url param generation', () => { tags: 'foo,bar,gaz', }); }); + it('format set', () => { + const config = new ScrapeConfig({ + url: 'http://httpbin.dev/get', + format: "markdown", + }); + expect(config.toApiParams({ key: '1234' })).toEqual({ + key: '1234', + url: 'http://httpbin.dev/get', + format: "markdown", + }); + }); it('debug sets', () => { const config = new ScrapeConfig({ url: 'http://httpbin.dev/get', diff --git a/examples/scrape-as-markdown.js b/examples/scrape-as-markdown.js new file mode 100644 index 0000000..e48b56e --- /dev/null +++ b/examples/scrape-as-markdown.js @@ -0,0 +1,16 @@ +/* +This example shows how to scrape a page and receive its content in markdown format in scrapfly +*/ +import { ScrapflyClient, ScrapeConfig } from 'scrapfly-sdk'; + +const key = 'YOUR SCRAPFLY KEY'; +const client = new ScrapflyClient({ key }); +const result = await client.scrape( + new ScrapeConfig({ + url: 'https://web-scraping.dev/products/', + // scrape the page data as markdown format 
supported by LLMs. + // None=raw(unchanged), other supported formats are: json, text, clean_html + format: "markdown" + }), +); +console.log(result.result.content); diff --git a/examples/screenshot-with-image-loading.js b/examples/screenshot-with-image-loading.js new file mode 100644 index 0000000..93193e4 --- /dev/null +++ b/examples/screenshot-with-image-loading.js @@ -0,0 +1,27 @@ +/* +This example shows how to capture page screenshots with images and additional configuration in scrapfly +*/ +import { ScrapflyClient, ScrapeConfig } from 'scrapfly-sdk'; + +const key = 'YOUR SCRAPFLY KEY'; +const client = new ScrapflyClient({ key }); +const result = await client.scrape( + new ScrapeConfig({ + url: 'https://web-scraping.dev/products/', + // enable headless browsers for screenshots + render_js: true, + // optional: you can wait for page to load before capturing + screenshots: { + everything: 'fullpage', + reviews: '#reviews', + }, + screenshot_flags: [ + "load_images", // Enable image rendering with the request, adds extra usage for the bandwidth consumed + "dark_mode", // Enable dark mode display + "block_banners", // Block cookies banners and overlay that cover the screen + "high_quality", // No compression on the output image + "print_media_format" // Render the page in the print mode + ] + }), +); +console.log(result.result.screenshots); diff --git a/src/scrapeconfig.ts b/src/scrapeconfig.ts index d7f2091..e973eaa 100644 --- a/src/scrapeconfig.ts +++ b/src/scrapeconfig.ts @@ -3,10 +3,13 @@ import { log } from './logger.js'; import { Rec, HttpMethod } from './types.js'; import { ScrapeConfigError } from './errors.js'; +type ScreenshotFlags = "load_images" | "dark_mode" | "block_banners" | "high_quality" | "print_media_format"; +type Format = "raw" | "json" | "text" | "markdown" | "clean_html"; + export class ScrapeConfig { static PUBLIC_DATACENTER_POOL = 'public_datacenter_pool'; static PUBLIC_RESIDENTIAL_POOL = 'public_residential_pool'; - + url: string; retry = 
true; method: HttpMethod = 'GET'; @@ -24,6 +27,7 @@ export class ScrapeConfig { proxy_pool?: string = null; session?: string = null; tags: Set = new Set(); + format?: Format = null; // raw(unchanged) correlation_id?: string = null; cookies?: Rec = null; body?: string = null; @@ -34,6 +38,7 @@ export class ScrapeConfig { wait_for_selector?: string = null; session_sticky_proxy = false; screenshots?: Rec = null; + screenshot_flags?: ScreenshotFlags[] = null; webhook?: string = null; timeout?: number = null; // in milliseconds js_scenario?: Rec = null; @@ -60,6 +65,7 @@ export class ScrapeConfig { proxy_pool?: string; session?: string; tags?: Array; + format?: Format; correlation_id?: string; cookies?: Rec; body?: string; @@ -69,6 +75,7 @@ export class ScrapeConfig { rendering_wait?: number; wait_for_selector?: string; screenshots?: Rec; + screenshot_flags?: ScreenshotFlags[]; session_sticky_proxy?: boolean; webhook?: string; timeout?: number; // in milliseconds @@ -96,6 +103,7 @@ export class ScrapeConfig { this.proxy_pool = options.proxy_pool ?? this.proxy_pool; this.session = options.session ?? this.session; this.tags = new Set(options.tags) ?? this.tags; + this.format = options.format ?? this.format; this.correlation_id = options.correlation_id ?? this.correlation_id; this.cookies = options.cookies ? Object.fromEntries(Object.entries(options.cookies).map(([k, v]) => [k.toLowerCase(), v])) @@ -106,6 +114,7 @@ export class ScrapeConfig { this.rendering_wait = options.rendering_wait ?? this.rendering_wait; this.wait_for_selector = options.wait_for_selector ?? this.wait_for_selector; this.screenshots = options.screenshots ?? this.screenshots; + this.screenshot_flags = options.screenshot_flags ?? this.screenshot_flags; this.webhook = options.webhook ?? this.webhook; this.timeout = options.timeout ?? this.timeout; this.js_scenario = options.js_scenario ?? 
this.js_scenario; @@ -194,6 +203,13 @@ export class ScrapeConfig { Object.keys(this.screenshots).forEach((key) => { params[`screenshots[${key}]`] = this.screenshots[key]; }); + if (this.screenshot_flags) { + params.screenshot_flags = this.screenshot_flags.join(','); + } + } else { + if (this.screenshot_flags) { + log.warn('Params "screenshot_flags" is ignored. Works only if screenshots is enabled'); + } } if (this.auto_scroll !== null) { params.auto_scroll = this.auto_scroll; @@ -247,6 +263,9 @@ export class ScrapeConfig { if (this.tags.size > 0) { params.tags = Array.from(this.tags).join(','); } + if (this.format) { + params.format = this.format.valueOf(); + } if (this.correlation_id) { params.correlation_id = this.correlation_id; }