-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Showing
34 changed files
with
12,368 additions
and
21 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
# Specify the base Docker image. You can read more about | ||
# the available images at https://crawlee.dev/docs/guides/docker-images | ||
# You can also use any other image from Docker Hub. | ||
FROM apify/actor-node-playwright-chrome:22-1.46.0 AS builder | ||
|
||
# Copy just package.json and package-lock.json | ||
# to speed up the build using Docker layer cache. | ||
COPY --chown=myuser package*.json ./ | ||
|
||
# Install all dependencies (including dev dependencies, which the build | ||
# step below needs). Don't audit to speed up the installation. | ||
RUN npm install --include=dev --audit=false | ||
|
||
# Next, copy the source files using the user set | ||
# in the base image. | ||
COPY --chown=myuser . ./ | ||
|
||
# Build the project. The output in ./dist is copied into the | ||
# final image below. | ||
RUN npm run build | ||
|
||
# Create final image | ||
FROM apify/actor-node-playwright-firefox:22-1.46.0 | ||
|
||
# Copy just package.json and package-lock.json | ||
# to speed up the build using Docker layer cache. | ||
COPY --chown=myuser package*.json ./ | ||
|
||
# Install NPM packages, skip optional and development dependencies to | ||
# keep the image small. Avoid logging too much and print the dependency | ||
# tree for debugging | ||
RUN npm --quiet set progress=false \ | ||
&& npm install --omit=dev --omit=optional \ | ||
&& echo "Installed NPM packages:" \ | ||
&& (npm list --omit=dev --all || true) \ | ||
&& echo "Node.js version:" \ | ||
&& node --version \ | ||
&& echo "NPM version:" \ | ||
&& npm --version \ | ||
&& rm -r ~/.npm | ||
|
||
# Remove the existing firefox installation. The `:?` guard makes the build | ||
# fail if PLAYWRIGHT_BROWSERS_PATH is unset or empty, instead of letting | ||
# the glob expand to `/*`. | ||
RUN rm -rf "${PLAYWRIGHT_BROWSERS_PATH:?}"/* | ||
|
||
# Install the Firefox browser build via the playwright CLI | ||
RUN npx playwright install firefox | ||
# Symlink the firefox binary to the root folder in order to bypass the versioning and resulting browser launch crashes. | ||
RUN ln -s ${PLAYWRIGHT_BROWSERS_PATH}/firefox-*/firefox/firefox ${PLAYWRIGHT_BROWSERS_PATH}/ | ||
|
||
# Overrides the dynamic library used by Firefox to determine trusted root certificates with p11-kit-trust.so, which loads the system certificates. | ||
# NOTE: the p11-kit-trust.so path below is specific to x86_64 images. | ||
RUN rm $PLAYWRIGHT_BROWSERS_PATH/firefox-*/firefox/libnssckbi.so | ||
RUN ln -s /usr/lib/x86_64-linux-gnu/pkcs11/p11-kit-trust.so $(ls -d $PLAYWRIGHT_BROWSERS_PATH/firefox-*)/firefox/libnssckbi.so | ||
|
||
# Copy built JS files from builder image | ||
COPY --from=builder --chown=myuser /home/myuser/dist ./dist | ||
|
||
# Next, copy the remaining files and directories with the source code. | ||
# Since we do this after NPM install, quick build will be really fast | ||
# for most source file changes. | ||
COPY --chown=myuser . ./ | ||
|
||
# Run the image. | ||
CMD npm run start:prod --silent |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
{ | ||
"actorSpecification": 1, | ||
"name": "rag-web-browser", | ||
"title": "RAG Web browser", | ||
"description": "Web browser for retrieval augmented generation workflows. Retrieve and return website content from the top Google Search Results Pages", | ||
"version": "0.1", | ||
"input": "./input_schema.json", | ||
"dockerfile": "./Dockerfile", | ||
"storages": { | ||
"dataset": { | ||
"actorSpecification": 1, | ||
"title": "RAG Web browser", | ||
"description": "To see all scraped properties, export the whole dataset or select All fields instead of Overview", | ||
"views": { | ||
"overview": { | ||
"title": "Overview", | ||
"description": "Selected fields from the dataset", | ||
"transformation": { | ||
"fields": [ | ||
"metadata.url", | ||
"metadata.title", | ||
"text" | ||
], | ||
"flatten": ["metadata"] | ||
}, | ||
"display": { | ||
"component": "table", | ||
"properties": { | ||
"metadata.url": { | ||
"label": "Page URL", | ||
"format": "text" | ||
}, | ||
"metadata.title": { | ||
"label": "Page Title", | ||
"format": "text" | ||
}, | ||
"text": { | ||
"label": "Extracted text", | ||
"format": "text" | ||
} | ||
} | ||
} | ||
}, | ||
"googleSearchResults": { | ||
"title": "Google Search Results", | ||
"description": "Title, Description and URL of the Google Search Results", | ||
"transformation": { | ||
"fields": [ | ||
"googleSearchResult.description", | ||
"googleSearchResult.title", | ||
"googleSearchResult.url" | ||
], | ||
"flatten": ["googleSearchResult"] | ||
}, | ||
"display": { | ||
"component": "table", | ||
"properties": { | ||
"googleSearchResult.description": { | ||
"label": "Description", | ||
"format": "text" | ||
}, | ||
"googleSearchResult.title": { | ||
"label": "Title", | ||
"format": "text" | ||
}, | ||
"googleSearchResult.url": { | ||
"label": "URL", | ||
"format": "text" | ||
} | ||
} | ||
} | ||
} | ||
} | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
{ | ||
"title": "RAG Web Browser", | ||
"description": "RAG Web Browser for retrieval augmented generation workflows. Retrieve and return website content from the top Google Search Results Pages", | ||
"type": "object", | ||
"schemaVersion": 1, | ||
"properties": { | ||
"query": { | ||
"title": "Search term(s)", | ||
"type": "string", | ||
"description": "Use regular search words or enter Google Search URLs. You can also apply [advanced Google search techniques](https://blog.apify.com/how-to-scrape-google-like-a-pro/), such as <code>AI site:twitter.com</code> or <code>javascript OR python</code>", | ||
"prefill": "apify rag browser", | ||
"editor": "textarea", | ||
"pattern": "[^\\s]+" | ||
}, | ||
"maxResults": { | ||
"title": "Number of top search results to return from Google. Only organic results are returned and counted", | ||
"type": "integer", | ||
"description": "The number of top organic search results to return and scrape text from", | ||
"prefill": 3, | ||
"minimum": 1, | ||
"maximum": 50 | ||
}, | ||
"outputFormats": { | ||
"title": "Output formats", | ||
"type": "array", | ||
"description": "Select the desired output formats for the retrieved content", | ||
"editor": "select", | ||
"default": ["text"], | ||
"items": { | ||
"type": "string", | ||
"enum": ["text", "markdown", "html"], | ||
"enumTitles": ["Plain text", "Markdown", "HTML"] | ||
} | ||
}, | ||
"requestTimeoutSecs": { | ||
"title": "Request timeout in seconds", | ||
"type": "integer", | ||
"description": "The maximum time (in seconds) allowed for the request. If the request exceeds this time, it will be marked as failed and only already finished results will be returned", | ||
"minimum": 1, | ||
"maximum": 600, | ||
"default": 60 | ||
}, | ||
"proxyGroupSearch": { | ||
"title": "Search Proxy Group", | ||
"type": "string", | ||
"description": "Select the proxy group for loading search results", | ||
"editor": "select", | ||
"default": "GOOGLE_SERP", | ||
"enum": ["GOOGLE_SERP", "SHADER"], | ||
"sectionCaption": "Google Search Settings" | ||
}, | ||
"maxRequestRetriesSearch": { | ||
"title": "Maximum number of retries for Google search request on network / server errors", | ||
"type": "integer", | ||
"description": "The maximum number of times the Google search crawler will retry the request on network, proxy or server errors. If the (n+1)-th request still fails, the crawler will mark this request as failed.", | ||
"minimum": 0, | ||
"maximum": 3, | ||
"default": 1 | ||
}, | ||
"proxyConfiguration": { | ||
"title": "Crawler: Proxy configuration", | ||
"type": "object", | ||
"description": "Enables loading the websites from IP addresses in specific geographies and to circumvent blocking.", | ||
"default": { | ||
"useApifyProxy": true | ||
}, | ||
"prefill": { | ||
"useApifyProxy": true | ||
}, | ||
"editor": "proxy", | ||
"sectionCaption": "Content Crawler Settings" | ||
}, | ||
"initialConcurrency": { | ||
"title": "Initial concurrency", | ||
"type": "integer", | ||
"description": "Initial number of Playwright browsers running in parallel. The system scales this value based on CPU and memory usage.", | ||
"minimum": 0, | ||
"maximum": 50, | ||
"default": 5 | ||
}, | ||
"minConcurrency": { | ||
"title": "Minimal concurrency", | ||
"type": "integer", | ||
"description": "Minimum number of Playwright browsers running in parallel. Useful for defining a base level of parallelism.", | ||
"minimum": 1, | ||
"maximum": 50, | ||
"default": 3 | ||
}, | ||
"maxConcurrency": { | ||
"title": "Maximal concurrency", | ||
"type": "integer", | ||
"description": "Maximum number of browsers or clients running in parallel to avoid overloading target websites.", | ||
"minimum": 1, | ||
"maximum": 50, | ||
"default": 10 | ||
}, | ||
"maxRequestRetries": { | ||
"title": "Maximum number of retries for Playwright content crawler", | ||
"type": "integer", | ||
"description": "Maximum number of retry attempts on network, proxy, or server errors. If the (n+1)-th request fails, it will be marked as failed.", | ||
"minimum": 0, | ||
"maximum": 3, | ||
"default": 1 | ||
}, | ||
"requestTimeoutContentCrawlSecs": { | ||
"title": "Request timeout for content crawling", | ||
"type": "integer", | ||
"description": "Timeout (in seconds) for making requests for each search result, including fetching and processing its content.\n\nThe value must be smaller than the 'Request timeout in seconds' setting.", | ||
"minimum": 1, | ||
"maximum": 60, | ||
"default": 30 | ||
}, | ||
"dynamicContentWaitSecs": { | ||
"title": "Wait for dynamic content (seconds)", | ||
"type": "integer", | ||
"description": "Maximum time (in seconds) to wait for dynamic content to load. The crawler processes the page once this time elapses or when the network becomes idle.", | ||
"default": 10 | ||
}, | ||
"removeCookieWarnings": { | ||
"title": "Remove cookie warnings", | ||
"type": "boolean", | ||
"description": "If enabled, removes cookie consent dialogs to improve text extraction accuracy. Note that this will impact latency.", | ||
"default": true | ||
}, | ||
"debugMode": { | ||
"title": "Debug mode (stores debugging information in dataset)", | ||
"type": "boolean", | ||
"description": "If enabled, the Actor will store debugging information in the dataset's debug field", | ||
"default": false, | ||
"sectionCaption": "Debug Settings" | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# configurations | ||
.idea | ||
|
||
# crawlee and apify storage folders | ||
apify_storage | ||
crawlee_storage | ||
storage | ||
|
||
# installed files | ||
node_modules | ||
|
||
# git folder | ||
.git | ||
|
||
# build output: the final image takes dist from the builder stage, so a | ||
# local dist must not be copied over it by the later `COPY . ./` | ||
dist | ||
|
||
# data | ||
data |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
root = true | ||
|
||
[*] | ||
indent_style = space | ||
indent_size = 4 | ||
charset = utf-8 | ||
trim_trailing_whitespace = true | ||
insert_final_newline = true | ||
end_of_line = lf | ||
max_line_length = 120 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
{ | ||
"root": true, | ||
"env": { | ||
"browser": true, | ||
"es2020": true, | ||
"node": true | ||
}, | ||
"extends": [ | ||
"@apify/eslint-config-ts" | ||
], | ||
"parserOptions": { | ||
"project": "./tsconfig.json", | ||
"ecmaVersion": 2020 | ||
}, | ||
"ignorePatterns": [ | ||
"node_modules", | ||
"dist", | ||
"**/*.d.ts" | ||
], | ||
"plugins": ["import"], | ||
"rules": { | ||
"import/order": [ | ||
"error", | ||
{ | ||
"groups": [ | ||
["builtin", "external"], | ||
"internal", | ||
["parent", "sibling", "index"] | ||
], | ||
"newlines-between": "always", | ||
"alphabetize": { | ||
"order": "asc", | ||
"caseInsensitive": true | ||
} | ||
} | ||
], | ||
"max-len": ["error", { "code": 120, "ignoreUrls": true, "ignoreStrings": true, "ignoreTemplateLiterals": true }] | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# This file tells Git which files shouldn't be added to source control | ||
|
||
.DS_Store | ||
.idea | ||
dist | ||
node_modules | ||
apify_storage | ||
storage | ||
|
||
# Added by Apify CLI | ||
.venv |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
This changelog summarizes all changes to the RAG Web Browser. | ||
|
||
### 2024-09-24 | ||
|
||
🚀 Features | ||
- Updated README.md to include tips on improving latency | ||
- Set initialConcurrency to 5 | ||
- Set minConcurrency to 3 | ||
- Set logLevel to INFO | ||
|
||
### 2024-09-20 | ||
|
||
🐛 Bug Fixes | ||
- Fix response format when crawler fails | ||
|
||
### 2024-09-24 | ||
|
||
🚀 Features | ||
- Add ability to create new crawlers using query parameters | ||
- Update Dockerfile to node version 22 | ||
|
||
🐛 Bug Fixes | ||
- Fix playwright key creation | ||
|
||
### 2024-09-11 | ||
|
||
🚀 Features | ||
- Initial version of the RAG Web Browser |
Oops, something went wrong.