From 8a32f8283ba85ccbb2e02cbf8ef21528924683ba Mon Sep 17 00:00:00 2001 From: granitosaurus Date: Thu, 22 Aug 2024 16:33:42 +0700 Subject: [PATCH] add header normalization for header checks in screenshot API and web scraping API selector generator and format code --- README.md | 2 +- __tests__/client/screenshot.test.ts | 3 ++- __tests__/result.test.ts | 8 ++++++++ deno.json | 2 +- examples/bun/README.md | 2 +- src/client.ts | 16 ++++++++-------- src/result.ts | 14 +++++++++----- src/utils.ts | 17 +++++++++++++++++ 8 files changed, 47 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 6a12874..73699ba 100644 --- a/README.md +++ b/README.md @@ -99,7 +99,7 @@ $ deno lint # publish JSR: $ deno publish # build NPM package: -$ deno build-npm +$ deno task build-npm # publish NPM: $ cd npm && npm publish ``` \ No newline at end of file diff --git a/__tests__/client/screenshot.test.ts b/__tests__/client/screenshot.test.ts index 4638ba5..8c9d9fb 100644 --- a/__tests__/client/screenshot.test.ts +++ b/__tests__/client/screenshot.test.ts @@ -5,6 +5,7 @@ import { assertEquals, assertRejects } from "https://deno.land/std@0.224.0/asser import { stub } from "https://deno.land/std/testing/mock.ts"; import type { RequestOptions } from '../../src/utils.ts'; import { mockedStream, responseFactory } from '../utils.ts'; +import { ScreenshotResult } from '../../src/result.ts'; Deno.test('screenshot: succeeds', async () => { const KEY = '__API_KEY__'; @@ -23,7 +24,7 @@ Deno.test('screenshot: succeeds', async () => { status: 200, headers: { 'content-encoding': 'gzip', - 'content-type': 'image/png', + 'ContEnT-TyPe': 'image/png', // ensure case insensitivity 'x-scrapfly-upstream-http-code': '200', 'x-scrapfly-upstream-url': url, }, diff --git a/__tests__/result.test.ts b/__tests__/result.test.ts index 2f30ee7..77d1ee1 100644 --- a/__tests__/result.test.ts +++ b/__tests__/result.test.ts @@ -12,6 +12,14 @@ Deno.test('cheerio selector lazy loads and caches itself', async () => { // cheerio.load is called exactly once - means it's cached }); + +Deno.test('cheerio selector loads with case sensitive headers', async () => { + const response = JSON.parse(await Deno.readTextFile('__tests__/data/response_html_case_sensitive_headers.json')); + const result = new ScrapeResult(response); + assertEquals(result.selector('h1').text(), 'Herman Melville - Moby-Dick'); +}); + + Deno.test('throws ContentTypeError when accessing .selector on JSON data', async () => { const responseJsonSuccess = JSON.parse(await Deno.readTextFile('__tests__/data/response_json_success.json')); const result = new ScrapeResult(responseJsonSuccess); diff --git a/deno.json b/deno.json index 68bc02a..bc3d1ca 100644 --- a/deno.json +++ b/deno.json @@ -4,7 +4,7 @@ }, "name": "@scrapfly/scrapfly-sdk", "exports": "./src/main.ts", - "version": "0.6.4", + "version": "0.6.5", "description": "SDK for Scrapfly.io API for web scraping, screenshotting and data extraction", "tasks": { "start": "deno run --allow-net --allow-read src/main.ts", diff --git a/examples/bun/README.md b/examples/bun/README.md index 63deaca..38cdec8 100644 --- a/examples/bun/README.md +++ b/examples/bun/README.md @@ -4,7 +4,7 @@ Bun is a modern javascript runtime that can execute both javascript and typescri These examples demonstrate Typescript SDK usage with Bun and for that install the SDK using jsr.io which distributes Typescript files: -``` +```bash $ bunx jsr add @scrapfly/scrapfly-sdk ``` diff --git a/src/client.ts b/src/client.ts index 37dc83c..2acf8e8 100644 --- a/src/client.ts +++ b/src/client.ts @@ -95,9 +95,9 @@ export class ScrapflyClient { /** * Handle clob and blob large objects */ - async handleLargeObjects(result: any, format: "clob" | "blob"): Promise { + async handleLargeObjects(result: any, format: 'clob' | 'blob'): Promise { let response: Response; - + try { const url = new URL(result.content); const params = { key: this.key }; @@ -117,14 +117,14 @@ export class ScrapflyClient { } const content: string = await response.text(); - result.content = content + result.content = content; if (format === 'clob') { - result.format = 'text' + result.format = 'text'; } if (format === 'blob') { - result.format = 'binary' + result.format = 'binary'; } - return result + return result; } /** @@ -209,9 +209,9 @@ export class ScrapflyClient { throw new errors.ApiHttpClientError(JSON.stringify(data)); } - const content_format = data.result.format + const content_format = data.result.format; if (content_format === 'clob' || content_format === 'blob') { - data.result = await this.handleLargeObjects(data.result, content_format) + data.result = await this.handleLargeObjects(data.result, content_format); } const result = this.handleResponse( diff --git a/src/result.ts b/src/result.ts index 2500002..1029990 100644 --- a/src/result.ts +++ b/src/result.ts @@ -1,6 +1,7 @@ import type { HttpMethod, Rec } from './types.ts'; import * as errors from './errors.ts'; import { cheerio } from './deps.ts'; +import { normalizeHeaders } from './utils.ts'; export type ConfigData = { url: string; @@ -208,7 +209,8 @@ export class ScrapeResult { get selector(): cheerio.CheerioAPI { if (!this._selector) { - if (!this.result.response_headers['content-type'].includes('text/html')) { + const headers = normalizeHeaders(this.result.response_headers); + if (!headers['content-type'].includes('text/html')) { throw new errors.ContentTypeError( `Cannot use selector on non-html content-type, received: ${this.result.response_headers['content-type']}`, ); @@ -287,20 +289,22 @@ export class ScreenshotResult { } private defineMetadata(response: Response): ScreenshotMetadata { - const contentType = response.headers.get('content-type'); + const headers = normalizeHeaders(response.headers); + const contentType = headers['content-type']; let extension_name = ''; if (contentType) { extension_name = contentType.split('/')[1].split(';')[0]; } return { extension_name: extension_name, - upstream_status_code: parseInt(response.headers.get('X-Scrapfly-Upstream-Http-Code') || '200', 10), - upstream_url: response.headers.get('X-Scrapfly-Upstream-Url') || '', + upstream_status_code: parseInt(headers['x-scrapfly-upstream-http-code'] || '200', 10), + upstream_url: headers['x-scrapfly-upstream-url'] || '', }; } private decodeResponse(response: Response, data: ArrayBuffer): object | null { - if (response.headers.get('content-type') === 'json') { + const headers = normalizeHeaders(response.headers); + if (headers['content-type'] === 'json') { return JSON.parse(new TextDecoder().decode(data)); } return null; diff --git a/src/utils.ts b/src/utils.ts index 0f3068c..b308f26 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -1,4 +1,5 @@ import { log } from './logger.ts'; +import type { Rec } from './types.ts'; export function urlsafe_b64encode(data: string): string { const encoder = new TextEncoder(); @@ -53,3 +54,19 @@ export async function fetchRetry( throw lastError; } + +export function normalizeHeaders(headers: Rec | Headers): Rec { + const normalizedHeaders: Rec = {}; + + if (headers instanceof Headers) { + headers.forEach((value, key) => { + normalizedHeaders[key.toLowerCase()] = value; + }); + } else { + Object.keys(headers).forEach((key) => { + normalizedHeaders[key.toLowerCase()] = headers[key]; + }); + } + + return normalizedHeaders; +}