Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add screenshot_flags and format api params support #1

Merged
merged 3 commits into from
May 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions __tests__/scrapeconfig.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,27 @@ describe('url param generation', () => {
'screenshots[everything]': 'fullpage',
});
});
// screenshot_flags is given as an array and must be serialized into a single
// comma-separated `screenshot_flags` param, alongside the `screenshots[...]` entries.
it('screenshot flags converted to params', () => {
const config = new ScrapeConfig({
url: 'http://httpbin.dev/get',
screenshots: { everything: 'fullpage' },
screenshot_flags: [
"load_images",
"dark_mode",
"block_banners",
"high_quality",
"print_media_format"
],
render_js: true,
});
expect(config.toApiParams({ key: '1234' })).toEqual({
key: '1234',
url: 'http://httpbin.dev/get',
'screenshots[everything]': 'fullpage',
screenshot_flags: "load_images,dark_mode,block_banners,high_quality,print_media_format",
render_js: true,
});
});
it('asp enables', () => {
const config = new ScrapeConfig({
url: 'http://httpbin.dev/get',
Expand Down Expand Up @@ -238,6 +259,17 @@ describe('url param generation', () => {
tags: 'foo,bar,gaz',
});
});
// A configured `format` must be passed through unchanged as the `format` param.
it('format set', () => {
const config = new ScrapeConfig({
url: 'http://httpbin.dev/get',
format: "markdown",
});
expect(config.toApiParams({ key: '1234' })).toEqual({
key: '1234',
url: 'http://httpbin.dev/get',
format: "markdown",
});
});
it('debug sets', () => {
const config = new ScrapeConfig({
url: 'http://httpbin.dev/get',
Expand Down
16 changes: 16 additions & 0 deletions examples/scrape-as-markdown.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
/*
This example shows how to scrape a page and receive its content in markdown format in scrapfly
*/
import { ScrapflyClient, ScrapeConfig } from 'scrapfly-sdk';

const key = 'YOUR SCRAPFLY KEY';
const client = new ScrapflyClient({ key });
const result = await client.scrape(
new ScrapeConfig({
url: 'https://web-scraping.dev/products/',
// scrape the page data as markdown format supported by LLMs.
// leave format unset for raw (unchanged) content; other supported formats are: json, text, clean_html
format: "markdown"
}),
);
console.log(result.result.content);
27 changes: 27 additions & 0 deletions examples/screenshot-with-image-loading.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/*
This example shows how to capture page screenshots with images and additional configuration in scrapfly
*/
import { ScrapflyClient, ScrapeConfig } from 'scrapfly-sdk';

const key = 'YOUR SCRAPFLY KEY';
const client = new ScrapflyClient({ key });
const result = await client.scrape(
new ScrapeConfig({
url: 'https://web-scraping.dev/products/',
// enable headless browsers for screenshots
render_js: true,
// screenshots to capture, keyed by a name of your choice:
// 'fullpage' for the whole page, or (presumably) a CSS selector for a single element
screenshots: {
everything: 'fullpage',
reviews: '#reviews',
},
// optional screenshot flags; they are ignored unless screenshots is set
screenshot_flags: [
"load_images", // Enable image rendering with the request, adds extra usage for the bandwidth consumed
"dark_mode", // Enable dark mode display
"block_banners", // Block cookie banners and overlays that cover the screen
"high_quality", // No compression on the output image
"print_media_format" // Render the page in print mode
]
}),
);
console.log(result.result.screenshots);
21 changes: 20 additions & 1 deletion src/scrapeconfig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,13 @@ import { log } from './logger.js';
import { Rec, HttpMethod } from './types.js';
import { ScrapeConfigError } from './errors.js';

/**
 * Flags accepted by `ScrapeConfig.screenshot_flags`.
 * They are sent to the API as one comma-separated `screenshot_flags` param, and only
 * when `screenshots` is also set; otherwise a warning is logged and they are ignored.
 */
type ScreenshotFlags = "load_images" | "dark_mode" | "block_banners" | "high_quality" | "print_media_format";
/**
 * Values accepted by `ScrapeConfig.format`.
 * When `format` is left unset no `format` param is sent, which corresponds to raw (unchanged) content.
 */
type Format = "raw" | "json" | "text" | "markdown" | "clean_html";

export class ScrapeConfig {
static PUBLIC_DATACENTER_POOL = 'public_datacenter_pool';
static PUBLIC_RESIDENTIAL_POOL = 'public_residential_pool';

url: string;
retry = true;
method: HttpMethod = 'GET';
Expand All @@ -24,6 +27,7 @@ export class ScrapeConfig {
proxy_pool?: string = null;
session?: string = null;
tags: Set<string> = new Set<string>();
format?: Format = null; // raw(unchanged)
correlation_id?: string = null;
cookies?: Rec<string> = null;
body?: string = null;
Expand All @@ -34,6 +38,7 @@ export class ScrapeConfig {
wait_for_selector?: string = null;
session_sticky_proxy = false;
screenshots?: Rec<any> = null;
screenshot_flags?: ScreenshotFlags[] = null;
webhook?: string = null;
timeout?: number = null; // in milliseconds
js_scenario?: Rec<any> = null;
Expand All @@ -60,6 +65,7 @@ export class ScrapeConfig {
proxy_pool?: string;
session?: string;
tags?: Array<string>;
format?: Format;
correlation_id?: string;
cookies?: Rec<string>;
body?: string;
Expand All @@ -69,6 +75,7 @@ export class ScrapeConfig {
rendering_wait?: number;
wait_for_selector?: string;
screenshots?: Rec<any>;
screenshot_flags?: ScreenshotFlags[];
session_sticky_proxy?: boolean;
webhook?: string;
timeout?: number; // in milliseconds
Expand Down Expand Up @@ -96,6 +103,7 @@ export class ScrapeConfig {
this.proxy_pool = options.proxy_pool ?? this.proxy_pool;
this.session = options.session ?? this.session;
this.tags = new Set(options.tags) ?? this.tags;
this.format = options.format ?? this.format;
this.correlation_id = options.correlation_id ?? this.correlation_id;
this.cookies = options.cookies
? Object.fromEntries(Object.entries(options.cookies).map(([k, v]) => [k.toLowerCase(), v]))
Expand All @@ -106,6 +114,7 @@ export class ScrapeConfig {
this.rendering_wait = options.rendering_wait ?? this.rendering_wait;
this.wait_for_selector = options.wait_for_selector ?? this.wait_for_selector;
this.screenshots = options.screenshots ?? this.screenshots;
this.screenshot_flags = options.screenshot_flags ?? this.screenshot_flags;
this.webhook = options.webhook ?? this.webhook;
this.timeout = options.timeout ?? this.timeout;
this.js_scenario = options.js_scenario ?? this.js_scenario;
Expand Down Expand Up @@ -194,6 +203,13 @@ export class ScrapeConfig {
Object.keys(this.screenshots).forEach((key) => {
params[`screenshots[${key}]`] = this.screenshots[key];
});
if (this.screenshot_flags) {
params.screenshot_flags = this.screenshot_flags.join(',');
}
} else {
if (this.screenshot_flags) {
log.warn('Params "screenshot_flags" is ignored. Works only if screenshots is enabled');
}
}
if (this.auto_scroll !== null) {
params.auto_scroll = this.auto_scroll;
Expand Down Expand Up @@ -247,6 +263,9 @@ export class ScrapeConfig {
if (this.tags.size > 0) {
params.tags = Array.from(this.tags).join(',');
}
if (this.format) {
params.format = this.format.valueOf();
}
if (this.correlation_id) {
params.correlation_id = this.correlation_id;
}
Expand Down
Loading