RAG Web Browser implementation (#1)
* Actor implementation
jirispilka authored Oct 2, 2024
1 parent 75c474f commit 932a411
Showing 34 changed files with 12,368 additions and 21 deletions.
62 changes: 62 additions & 0 deletions .actor/Dockerfile
@@ -0,0 +1,62 @@
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-playwright-chrome:22-1.46.0 AS builder

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./

# Install all dependencies. Don't audit to speed up the installation.
RUN npm install --include=dev --audit=false

# Next, copy the source files using the user set
# in the base image.
COPY --chown=myuser . ./

# Build the project (compiles the TypeScript sources into the dist directory).
RUN npm run build

# Create final image
FROM apify/actor-node-playwright-firefox:22-1.46.0

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
&& npm install --omit=dev --omit=optional \
&& echo "Installed NPM packages:" \
&& (npm list --omit=dev --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version \
&& rm -r ~/.npm

# Remove the existing firefox installation
RUN rm -rf ${PLAYWRIGHT_BROWSERS_PATH}/*

# Install all required playwright dependencies for firefox
RUN npx playwright install firefox
# Symlink the Firefox binary to the root of PLAYWRIGHT_BROWSERS_PATH to bypass version-specific paths and the resulting browser launch crashes.
RUN ln -s ${PLAYWRIGHT_BROWSERS_PATH}/firefox-*/firefox/firefox ${PLAYWRIGHT_BROWSERS_PATH}/

# Replace the dynamic library that Firefox uses to determine trusted root certificates with p11-kit-trust.so, which loads the system certificates.
RUN rm $PLAYWRIGHT_BROWSERS_PATH/firefox-*/firefox/libnssckbi.so
RUN ln -s /usr/lib/x86_64-linux-gnu/pkcs11/p11-kit-trust.so $(ls -d $PLAYWRIGHT_BROWSERS_PATH/firefox-*)/firefox/libnssckbi.so

# Copy built JS files from builder image
COPY --from=builder --chown=myuser /home/myuser/dist ./dist

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick builds will be really fast
# for most source file changes.
COPY --chown=myuser . ./

# Run the image.
CMD npm run start:prod --silent
76 changes: 76 additions & 0 deletions .actor/actor.json
@@ -0,0 +1,76 @@
{
"actorSpecification": 1,
"name": "rag-web-browser",
"title": "RAG Web browser",
"description": "Web browser for a retrieval augmented generation workflows. Retrieve and return website content from the top Google Search Results Pages",
"version": "0.1",
"input": "./input_schema.json",
"dockerfile": "./Dockerfile",
"storages": {
"dataset": {
"actorSpecification": 1,
"title": "RAG Web browser",
"description": "Too see all scraped properties, export the whole dataset or select All fields instead of Overview",
"views": {
"overview": {
"title": "Overview",
"description": "Selected fields from the dataset",
"transformation": {
"fields": [
"metadata.url",
"metadata.title",
"text"
],
"flatten": ["metadata"]
},
"display": {
"component": "table",
"properties": {
"metadata.url": {
"label": "Page URL",
"format": "text"
},
"metadata.title": {
"label": "Page Title",
"format": "text"
},
"text": {
"label": "Extracted text",
"format": "text"
}
}
}
},
"googleSearchResults": {
"title": "Google Search Results",
"description": "Title, Description and URL of the Google Search Results",
"transformation": {
"fields": [
"googleSearchResult.description",
"googleSearchResult.title",
"googleSearchResult.url"
],
"flatten": ["googleSearchResult"]
},
"display": {
"component": "table",
"properties": {
"googleSearchResult.description": {
"label": "Description",
"format": "text"
},
"googleSearchResult.title": {
"label": "Title",
"format": "text"
},
"googleSearchResult.url": {
"label": "URL",
"format": "text"
}
}
}
}
}
}
}
}
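
For orientation, the Overview and Google Search Results views above imply the following shape of a dataset item. This is only a TypeScript sketch inferred from the view transformations and the available output formats; the interface name and the optionality of the fields are assumptions, not part of the Actor's source.

// Hypothetical shape of one dataset item, inferred from the dataset views defined above.
interface RagWebBrowserResult {
    // Organic Google Search result that led to this page (googleSearchResults view)
    googleSearchResult?: {
        title: string;
        description: string;
        url: string;
    };
    // Metadata of the crawled page (Overview view)
    metadata: {
        url: string;
        title: string;
    };
    // Extracted content; which of these fields is present depends on the selected outputFormats
    text?: string;
    markdown?: string;
    html?: string;
}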
133 changes: 133 additions & 0 deletions .actor/input_schema.json
@@ -0,0 +1,133 @@
{
"title": "RAG Web Browser",
"description": "RAG Web Browser for a retrieval augmented generation workflows. Retrieve and return website content from the top Google Search Results Pages",
"type": "object",
"schemaVersion": 1,
"properties": {
"query": {
"title": "Search term(s)",
"type": "string",
"description": "Use regular search words or enter Google Search URLs. You can also apply [advanced Google search techniques](https://blog.apify.com/how-to-scrape-google-like-a-pro/), such as <code>AI site:twitter.com</code> or <code>javascript OR python</code>",
"prefill": "apify rag browser",
"editor": "textarea",
"pattern": "[^\\s]+"
},
"maxResults": {
"title": "Number of top search results to return from Google. Only organic results are returned and counted",
"type": "integer",
"description": "The number of top organic search results to return and scrape text from",
"prefill": 3,
"minimum": 1,
"maximum": 50
},
"outputFormats": {
"title": "Output formats",
"type": "array",
"description": "Select the desired output formats for the retrieved content",
"editor": "select",
"default": ["text"],
"items": {
"type": "string",
"enum": ["text", "markdown", "html"],
"enumTitles": ["Plain text", "Markdown", "HTML"]
}
},
"requestTimeoutSecs": {
"title": "Request timeout in seconds",
"type": "integer",
"description": "The maximum time (in seconds) allowed for request. If the request exceeds this time, it will be marked as failed and only already finished results will be returned",
"minimum": 1,
"maximum": 600,
"default": 60
},
"proxyGroupSearch": {
"title": "Search Proxy Group",
"type": "string",
"description": "Select the proxy group for loading search results",
"editor": "select",
"default": "GOOGLE_SERP",
"enum": ["GOOGLE_SERP", "SHADER"],
"sectionCaption": "Google Search Settings"
},
"maxRequestRetriesSearch": {
"title": "Maximum number of retries for Google search request on network / server errors",
"type": "integer",
"description": "The maximum number of times the Google search crawler will retry the request on network, proxy or server errors. If the (n+1)-th request still fails, the crawler will mark this request as failed.",
"minimum": 0,
"maximum": 3,
"default": 1
},
"proxyConfiguration": {
"title": "Crawler: Proxy configuration",
"type": "object",
"description": "Enables loading the websites from IP addresses in specific geographies and to circumvent blocking.",
"default": {
"useApifyProxy": true
},
"prefill": {
"useApifyProxy": true
},
"editor": "proxy",
"sectionCaption": "Content Crawler Settings"
},
"initialConcurrency": {
"title": "Initial concurrency",
"type": "integer",
"description": "Initial number of Playwright browsers running in parallel. The system scales this value based on CPU and memory usage.",
"minimum": 0,
"maximum": 50,
"default": 5
},
"minConcurrency": {
"title": "Minimal concurrency",
"type": "integer",
"description": "Minimum number of Playwright browsers running in parallel. Useful for defining a base level of parallelism.",
"minimum": 1,
"maximum": 50,
"default": 3
},
"maxConcurrency": {
"title": "Maximal concurrency",
"type": "integer",
"description": "Maximum number of browsers or clients running in parallel to avoid overloading target websites.",
"minimum": 1,
"maximum": 50,
"default": 10
},
"maxRequestRetries": {
"title": "Maximum number of retries for Playwright content crawler",
"type": "integer",
"description": "Maximum number of retry attempts on network, proxy, or server errors. If the (n+1)-th request fails, it will be marked as failed.",
"minimum": 0,
"maximum": 3,
"default": 1
},
"requestTimeoutContentCrawlSecs": {
"title": "Request timeout for content crawling",
"type": "integer",
"description": "Timeout (in seconds) for making requests for each search result, including fetching and processing its content.\n\nThe value must be smaller than the 'Request timeout in seconds' setting.",
"minimum": 1,
"maximum": 60,
"default": 30
},
"dynamicContentWaitSecs": {
"title": "Wait for dynamic content (seconds)",
"type": "integer",
"description": "Maximum time (in seconds) to wait for dynamic content to load. The crawler processes the page once this time elapses or when the network becomes idle.",
"default": 10
},
"removeCookieWarnings": {
"title": "Remove cookie warnings",
"type": "boolean",
"description": "If enabled, removes cookie consent dialogs to improve text extraction accuracy. Note that this will impact latency.",
"default": true
},
"debugMode": {
"title": "Debug mode (stores debugging information in dataset)",
"type": "boolean",
"description": "If enabled, the Actor will store debugging information in the dataset's debug field",
"default": false,
"sectionCaption": "Debug Settings"
}
}
}
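
To illustrate how the input schema above is used, here is a minimal sketch of calling the Actor from Node.js with the apify-client package. The Actor ID "apify/rag-web-browser" and reading the token from an environment variable are assumptions made for this example; the input field names come directly from the schema.

import { ApifyClient } from 'apify-client';

// Token handling is an assumption; use whatever credential mechanism fits your setup.
const client = new ApifyClient({ token: process.env.APIFY_TOKEN });

// Start the Actor with an input matching the schema above and wait for the run to finish.
const run = await client.actor('apify/rag-web-browser').call({
    query: 'apify rag browser',
    maxResults: 3,
    outputFormats: ['text', 'markdown'],
    requestTimeoutSecs: 60,
});

// Read the scraped pages from the run's default dataset.
const { items } = await client.dataset(run.defaultDatasetId).listItems();
console.log(items);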
16 changes: 16 additions & 0 deletions .dockerignore
@@ -0,0 +1,16 @@
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git

# data
data
10 changes: 10 additions & 0 deletions .editorconfig
@@ -0,0 +1,10 @@
root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
max_line_length = 120
39 changes: 39 additions & 0 deletions .eslintrc
@@ -0,0 +1,39 @@
{
"root": true,
"env": {
"browser": true,
"es2020": true,
"node": true
},
"extends": [
"@apify/eslint-config-ts"
],
"parserOptions": {
"project": "./tsconfig.json",
"ecmaVersion": 2020
},
"ignorePatterns": [
"node_modules",
"dist",
"**/*.d.ts"
],
"plugins": ["import"],
"rules": {
"import/order": [
"error",
{
"groups": [
["builtin", "external"],
"internal",
["parent", "sibling", "index"]
],
"newlines-between": "always",
"alphabetize": {
"order": "asc",
"caseInsensitive": true
}
}
],
"max-len": ["error", { "code": 120, "ignoreUrls": true, "ignoreStrings": true, "ignoreTemplateLiterals": true }]
}
}
11 changes: 11 additions & 0 deletions .gitignore
@@ -0,0 +1,11 @@
# This file tells Git which files shouldn't be added to source control

.DS_Store
.idea
dist
node_modules
apify_storage
storage

# Added by Apify CLI
.venv
28 changes: 28 additions & 0 deletions CHANGELOG.md
@@ -0,0 +1,28 @@
This changelog summarizes all changes to the RAG Web Browser.

### 2024-09-24

🚀 Features
- Updated README.md to include tips on improving latency
- Set initialConcurrency to 5
- Set minConcurrency to 3
- Set logLevel to INFO

### 2024-09-20

🐛 Bug Fixes
- Fix response format when crawler fails

### 2024-09-24

🚀 Features
- Add ability to create new crawlers using query parameters
- Update Dockerfile to node version 22

🐛 Bug Fixes
- Fix playwright key creation

### 2024-09-11

🚀 Features
- Initial version of the RAG Web Browser