Skip to content

Commit

Permalink
add header normalization for header checks in screenshot API and web …
Browse files Browse the repository at this point in the history
…scraping API selector generator and format code
  • Loading branch information
Granitosaurus committed Aug 22, 2024
1 parent a09d6b9 commit 8a32f82
Show file tree
Hide file tree
Showing 8 changed files with 47 additions and 17 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ $ deno lint
# publish JSR:
$ deno publish
# build NPM package:
$ deno build-npm
$ deno task build-npm
# publish NPM:
$ cd npm && npm publish
```
3 changes: 2 additions & 1 deletion __tests__/client/screenshot.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import { assertEquals, assertRejects } from "https://deno.land/[email protected]/asser
import { stub } from "https://deno.land/std/testing/mock.ts";
import type { RequestOptions } from '../../src/utils.ts';
import { mockedStream, responseFactory } from '../utils.ts';
import { ScreenshotResult } from '../../src/result.ts';

Deno.test('screenshot: succeeds', async () => {
const KEY = '__API_KEY__';
Expand All @@ -23,7 +24,7 @@ Deno.test('screenshot: succeeds', async () => {
status: 200,
headers: {
'content-encoding': 'gzip',
'content-type': 'image/png',
'ContEnT-TyPe': 'image/png', // ensure case insensitivity
'x-scrapfly-upstream-http-code': '200',
'x-scrapfly-upstream-url': url,
},
Expand Down
8 changes: 8 additions & 0 deletions __tests__/result.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,14 @@ Deno.test('cheerio selector lazy loads and caches itself', async () => {
// cheerio.load is called exactly once - means it's cached
});


// Regression test: `.selector` must still work for the response stored in the
// `response_html_case_sensitive_headers.json` fixture.
// NOTE(review): judging by the fixture name, its response header keys are
// presumably not all lowercase (e.g. `Content-Type`) — confirm against the file.
Deno.test('cheerio selector loads with case sensitive headers', async () => {
// Load the recorded API response from disk and wrap it in a ScrapeResult.
const response = JSON.parse(await Deno.readTextFile('__tests__/data/response_html_case_sensitive_headers.json'));
const result = new ScrapeResult(response);
// The selector should be usable and return the text of the page's <h1>.
assertEquals(result.selector('h1').text(), 'Herman Melville - Moby-Dick');
});


Deno.test('throws ContentTypeError when accessing .selector on JSON data', async () => {
const responseJsonSuccess = JSON.parse(await Deno.readTextFile('__tests__/data/response_json_success.json'));
const result = new ScrapeResult(responseJsonSuccess);
Expand Down
2 changes: 1 addition & 1 deletion deno.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
},
"name": "@scrapfly/scrapfly-sdk",
"exports": "./src/main.ts",
"version": "0.6.4",
"version": "0.6.5",
"description": "SDK for Scrapfly.io API for web scraping, screenshotting and data extraction",
"tasks": {
"start": "deno run --allow-net --allow-read src/main.ts",
Expand Down
2 changes: 1 addition & 1 deletion examples/bun/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ Bun is a modern javascript runtime that can execute both javascript and typescri

These examples demonstrate TypeScript SDK usage with Bun. To run them, install the SDK from jsr.io, which distributes the TypeScript source files:

```
```bash
$ bunx jsr add @scrapfly/scrapfly-sdk
```

Expand Down
16 changes: 8 additions & 8 deletions src/client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -95,9 +95,9 @@ export class ScrapflyClient {
/**
* Handle clob and blob large objects
*/
async handleLargeObjects(result: any, format: "clob" | "blob"): Promise<ScrapeResult> {
async handleLargeObjects(result: any, format: 'clob' | 'blob'): Promise<ScrapeResult> {
let response: Response;

try {
const url = new URL(result.content);
const params = { key: this.key };
Expand All @@ -117,14 +117,14 @@ export class ScrapflyClient {
}

const content: string = await response.text();
result.content = content
result.content = content;
if (format === 'clob') {
result.format = 'text'
result.format = 'text';
}
if (format === 'blob') {
result.format = 'binary'
result.format = 'binary';
}
return result
return result;
}

/**
Expand Down Expand Up @@ -209,9 +209,9 @@ export class ScrapflyClient {
throw new errors.ApiHttpClientError(JSON.stringify(data));
}

const content_format = data.result.format
const content_format = data.result.format;
if (content_format === 'clob' || content_format === 'blob') {
data.result = await this.handleLargeObjects(data.result, content_format)
data.result = await this.handleLargeObjects(data.result, content_format);
}

const result = this.handleResponse(
Expand Down
14 changes: 9 additions & 5 deletions src/result.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import type { HttpMethod, Rec } from './types.ts';
import * as errors from './errors.ts';
import { cheerio } from './deps.ts';
import { normalizeHeaders } from './utils.ts';

export type ConfigData = {
url: string;
Expand Down Expand Up @@ -208,7 +209,8 @@ export class ScrapeResult {

get selector(): cheerio.CheerioAPI {
if (!this._selector) {
if (!this.result.response_headers['content-type'].includes('text/html')) {
const headers = normalizeHeaders(this.result.response_headers);
if (!headers['content-type'].includes('text/html')) {
throw new errors.ContentTypeError(
`Cannot use selector on non-html content-type, received: ${this.result.response_headers['content-type']}`,
);
Expand Down Expand Up @@ -287,20 +289,22 @@ export class ScreenshotResult {
}

private defineMetadata(response: Response): ScreenshotMetadata {
const contentType = response.headers.get('content-type');
const headers = normalizeHeaders(response.headers);
const contentType = headers['content-type'];
let extension_name = '';
if (contentType) {
extension_name = contentType.split('/')[1].split(';')[0];
}
return {
extension_name: extension_name,
upstream_status_code: parseInt(response.headers.get('X-Scrapfly-Upstream-Http-Code') || '200', 10),
upstream_url: response.headers.get('X-Scrapfly-Upstream-Url') || '',
upstream_status_code: parseInt(headers['x-scrapfly-upstream-http-code'] || '200', 10),
upstream_url: headers['x-scrapfly-upstream-url'] || '',
};
}

private decodeResponse(response: Response, data: ArrayBuffer): object | null {
if (response.headers.get('content-type') === 'json') {
const headers = normalizeHeaders(response.headers);
if (headers['content-type'] === 'json') {
return JSON.parse(new TextDecoder().decode(data));
}
return null;
Expand Down
17 changes: 17 additions & 0 deletions src/utils.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { log } from './logger.ts';
import type { Rec } from './types.ts';

export function urlsafe_b64encode(data: string): string {
const encoder = new TextEncoder();
Expand Down Expand Up @@ -53,3 +54,19 @@ export async function fetchRetry(

throw lastError;
}

/**
 * Returns a new plain object holding the given headers with every header
 * name converted to lower case, so callers can look headers up without
 * caring about the casing used by the sender.
 *
 * @param headers - Either a `Headers` instance or a plain name → value record.
 * @returns A fresh record keyed by lower-cased header names. The input is not
 * modified. If several input keys differ only by case, the value visited last
 * wins.
 */
export function normalizeHeaders(headers: Rec<string> | Headers): Rec<string> {
  const lowered: Rec<string> = {};
  const put = (name: string, value: string): void => {
    lowered[name.toLowerCase()] = value;
  };

  if (headers instanceof Headers) {
    headers.forEach((value, name) => put(name, value));
    return lowered;
  }

  for (const name of Object.keys(headers)) {
    put(name, headers[name]);
  }
  return lowered;
}

0 comments on commit 8a32f82

Please sign in to comment.