diff --git a/CHANGELOG.md b/CHANGELOG.md
index ac73b4e7..4ab0556d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,9 +1,8 @@
 # [1.5.0](https://github.com/BuilderIO/gpt-crawler/compare/v1.4.0...v1.5.0) (2024-07-05)
 
-
 ### Features
 
-* git clone depth limit in docker ([87767db](https://github.com/BuilderIO/gpt-crawler/commit/87767dbda99b3259d44ec2c02dceb3a59bb2ca3c))
+- git clone depth limit in docker ([87767db](https://github.com/BuilderIO/gpt-crawler/commit/87767dbda99b3259d44ec2c02dceb3a59bb2ca3c))
 
 
 # [1.4.0](https://github.com/BuilderIO/gpt-crawler/compare/v1.3.0...v1.4.0) (2024-01-15)
diff --git a/src/config.ts b/src/config.ts
index 787744ce..13178e47 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -26,6 +26,15 @@ export const configSchema = z.object({
    * @default ""
    */
   exclude: z.string().or(z.array(z.string())).optional(),
+  /**
+   * Set Crawlee strategy to check certain parts of the URLs found.
+   * @example "same-origin"
+   * @default "same-hostname"
+   * @see https://crawlee.dev/api/core/enum/EnqueueStrategy
+   */
+  crawlStrategy: z
+    .enum(["all", "same-origin", "same-hostname", "same-domain"])
+    .optional(),
   /**
    * Selector to grab the inner text from
    * @example ".docs-builder-container"
diff --git a/src/core.ts b/src/core.ts
index c996f2bb..02c15e16 100644
--- a/src/core.ts
+++ b/src/core.ts
@@ -96,7 +96,11 @@ export async function crawl(config: Config) {
         exclude:
           typeof config.exclude === "string"
             ? [config.exclude]
-            : config.exclude ?? [],
+            : (config.exclude ?? []),
+        strategy:
+          typeof config.crawlStrategy === "string"
+            ? config.crawlStrategy
+            : undefined,
       });
     },
     // Comment this option to scrape the full website.
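For context, here is how the new option might be used from a crawler config. This is a minimal sketch, not part of the diff: the surrounding fields (`url`, `match`, `maxPagesToCrawl`, `outputFileName`) and the root `config.ts` import path are assumed from gpt-crawler's documented example config; only `crawlStrategy` is new, and it is passed through to Crawlee's `enqueueLinks()` as its `strategy` value.

```ts
// config.ts at the project root (assumed layout) — sketch of using crawlStrategy.
import { Config } from "./src/config";

export const defaultConfig: Config = {
  url: "https://www.builder.io/c/docs/developers",
  match: "https://www.builder.io/c/docs/**",
  maxPagesToCrawl: 50,
  outputFileName: "output.json",
  // New option from this diff: one of "all" | "same-origin" | "same-hostname" | "same-domain",
  // per the zod enum above. Omitting it leaves `strategy` undefined, so Crawlee
  // falls back to its default ("same-hostname").
  crawlStrategy: "same-origin",
};
```

Using `"same-origin"` here restricts enqueued links to the same protocol, host, and port as the start URL, whereas `"same-domain"` would also follow subdomains and `"all"` would follow every discovered link.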