Skip to content

Commit

Permalink
Merge pull request #1 from mazen-r/main
Browse files Browse the repository at this point in the history
Add screenshot_flags and format api params support
  • Loading branch information
Granitosaurus authored May 24, 2024
2 parents d39eae8 + 82153ca commit cb2ab48
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 1 deletion.
32 changes: 32 additions & 0 deletions __tests__/scrapeconfig.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,27 @@ describe('url param generation', () => {
'screenshots[everything]': 'fullpage',
});
});
it('screenshot flags converted to params', () => {
    // screenshot_flags should serialize to a single comma-joined api param
    const config = new ScrapeConfig({
        url: 'http://httpbin.dev/get',
        screenshots: { everything: 'fullpage' },
        screenshot_flags: ['load_images', 'dark_mode', 'block_banners', 'high_quality', 'print_media_format'],
        render_js: true,
    });
    const params = config.toApiParams({ key: '1234' });
    expect(params).toEqual({
        key: '1234',
        url: 'http://httpbin.dev/get',
        'screenshots[everything]': 'fullpage',
        screenshot_flags: 'load_images,dark_mode,block_banners,high_quality,print_media_format',
        render_js: true,
    });
});
it('asp enables', () => {
const config = new ScrapeConfig({
url: 'http://httpbin.dev/get',
Expand Down Expand Up @@ -238,6 +259,17 @@ describe('url param generation', () => {
tags: 'foo,bar,gaz',
});
});
it('format set', () => {
    // a configured format should pass through to the api params unchanged
    const config = new ScrapeConfig({ url: 'http://httpbin.dev/get', format: 'markdown' });
    const params = config.toApiParams({ key: '1234' });
    expect(params).toEqual({ key: '1234', url: 'http://httpbin.dev/get', format: 'markdown' });
});
it('debug sets', () => {
const config = new ScrapeConfig({
url: 'http://httpbin.dev/get',
Expand Down
16 changes: 16 additions & 0 deletions examples/scrape-as-markdown.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
/*
This example shows how to retrieve a page's content as markdown in scrapfly
*/
import { ScrapflyClient, ScrapeConfig } from 'scrapfly-sdk';

const key = 'YOUR SCRAPFLY KEY';
const client = new ScrapflyClient({ key });
const config = new ScrapeConfig({
    url: 'https://web-scraping.dev/products/',
    // "markdown" returns the page converted to markdown, a format well suited for LLMs.
    // Omitting format returns the raw (unchanged) page; other supported formats: json, text, clean_html
    format: 'markdown',
});
const result = await client.scrape(config);
console.log(result.result.content);
27 changes: 27 additions & 0 deletions examples/screenshot-with-image-loading.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/*
This example shows how to capture page screenshots with images and additional configuration in scrapfly
*/
import { ScrapflyClient, ScrapeConfig } from 'scrapfly-sdk';

const key = 'YOUR SCRAPFLY KEY';
const client = new ScrapflyClient({ key });
const config = new ScrapeConfig({
    url: 'https://web-scraping.dev/products/',
    // screenshots require a headless browser, so enable javascript rendering
    render_js: true,
    // capture named screenshots: the full page and a CSS-selected element
    screenshots: {
        everything: 'fullpage',
        reviews: '#reviews',
    },
    // optional flags that tune how the screenshots are rendered
    screenshot_flags: [
        'load_images', // Enable image rendering with the request, adds extra usage for the bandwidth consumed
        'dark_mode', // Enable dark mode display
        'block_banners', // Block cookies banners and overlay that cover the screen
        'high_quality', // No compression on the output image
        'print_media_format', // Render the page in the print mode
    ],
});
const result = await client.scrape(config);
console.log(result.result.screenshots);
21 changes: 20 additions & 1 deletion src/scrapeconfig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,13 @@ import { log } from './logger.js';
import { Rec, HttpMethod } from './types.js';
import { ScrapeConfigError } from './errors.js';

/** Optional rendering flags for the screenshot API param (joined with commas when sent). */
type ScreenshotFlags = "load_images" | "dark_mode" | "block_banners" | "high_quality" | "print_media_format";
/** Content format of the scrape result; "raw" means the page is returned unchanged. */
type Format = "raw" | "json" | "text" | "markdown" | "clean_html";

export class ScrapeConfig {
static PUBLIC_DATACENTER_POOL = 'public_datacenter_pool';
static PUBLIC_RESIDENTIAL_POOL = 'public_residential_pool';

url: string;
retry = true;
method: HttpMethod = 'GET';
Expand All @@ -24,6 +27,7 @@ export class ScrapeConfig {
proxy_pool?: string = null;
session?: string = null;
tags: Set<string> = new Set<string>();
format?: Format = null; // raw(unchanged)
correlation_id?: string = null;
cookies?: Rec<string> = null;
body?: string = null;
Expand All @@ -34,6 +38,7 @@ export class ScrapeConfig {
wait_for_selector?: string = null;
session_sticky_proxy = false;
screenshots?: Rec<any> = null;
screenshot_flags?: ScreenshotFlags[] = null;
webhook?: string = null;
timeout?: number = null; // in milliseconds
js_scenario?: Rec<any> = null;
Expand All @@ -60,6 +65,7 @@ export class ScrapeConfig {
proxy_pool?: string;
session?: string;
tags?: Array<string>;
format?: Format;
correlation_id?: string;
cookies?: Rec<string>;
body?: string;
Expand All @@ -69,6 +75,7 @@ export class ScrapeConfig {
rendering_wait?: number;
wait_for_selector?: string;
screenshots?: Rec<any>;
screenshot_flags?: ScreenshotFlags[];
session_sticky_proxy?: boolean;
webhook?: string;
timeout?: number; // in milliseconds
Expand Down Expand Up @@ -96,6 +103,7 @@ export class ScrapeConfig {
this.proxy_pool = options.proxy_pool ?? this.proxy_pool;
this.session = options.session ?? this.session;
this.tags = new Set(options.tags) ?? this.tags;
this.format = options.format ?? this.format;
this.correlation_id = options.correlation_id ?? this.correlation_id;
this.cookies = options.cookies
? Object.fromEntries(Object.entries(options.cookies).map(([k, v]) => [k.toLowerCase(), v]))
Expand All @@ -106,6 +114,7 @@ export class ScrapeConfig {
this.rendering_wait = options.rendering_wait ?? this.rendering_wait;
this.wait_for_selector = options.wait_for_selector ?? this.wait_for_selector;
this.screenshots = options.screenshots ?? this.screenshots;
this.screenshot_flags = options.screenshot_flags ?? this.screenshot_flags;
this.webhook = options.webhook ?? this.webhook;
this.timeout = options.timeout ?? this.timeout;
this.js_scenario = options.js_scenario ?? this.js_scenario;
Expand Down Expand Up @@ -194,6 +203,13 @@ export class ScrapeConfig {
Object.keys(this.screenshots).forEach((key) => {
params[`screenshots[${key}]`] = this.screenshots[key];
});
if (this.screenshot_flags) {
params.screenshot_flags = this.screenshot_flags.join(',');
}
} else {
if (this.screenshot_flags) {
log.warn('Params "screenshot_flags" is ignored. Works only if screenshots is enabled');
}
}
if (this.auto_scroll !== null) {
params.auto_scroll = this.auto_scroll;
Expand Down Expand Up @@ -247,6 +263,9 @@ export class ScrapeConfig {
if (this.tags.size > 0) {
params.tags = Array.from(this.tags).join(',');
}
if (this.format) {
params.format = this.format.valueOf();
}
if (this.correlation_id) {
params.correlation_id = this.correlation_id;
}
Expand Down

0 comments on commit cb2ab48

Please sign in to comment.