RAG Web Browser implementation (#1)
* Actor implementation
jirispilka authored Oct 2, 2024
1 parent 75c474f commit 932a411
Showing 34 changed files with 12,368 additions and 21 deletions.
62 changes: 62 additions & 0 deletions .actor/Dockerfile
@@ -0,0 +1,62 @@
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-playwright-chrome:22-1.46.0 AS builder

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./

# Install all dependencies. Don't audit to speed up the installation.
RUN npm install --include=dev --audit=false

# Next, copy the source files using the user set
# in the base image.
COPY --chown=myuser . ./

# Build the project (compiles the TypeScript sources into the dist directory).
RUN npm run build

# Create final image
FROM apify/actor-node-playwright-firefox:22-1.46.0

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
&& npm install --omit=dev --omit=optional \
&& echo "Installed NPM packages:" \
&& (npm list --omit=dev --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version \
&& rm -r ~/.npm

# Remove the existing firefox installation
RUN rm -rf ${PLAYWRIGHT_BROWSERS_PATH}/*

# Install all required playwright dependencies for firefox
RUN npx playwright install firefox
# Symlink the Firefox binary to the root of PLAYWRIGHT_BROWSERS_PATH to bypass version-specific paths and the resulting browser launch crashes.
RUN ln -s ${PLAYWRIGHT_BROWSERS_PATH}/firefox-*/firefox/firefox ${PLAYWRIGHT_BROWSERS_PATH}/

# Replace the dynamic library that Firefox uses to determine trusted root certificates with p11-kit-trust.so, which loads the system certificates.
RUN rm $PLAYWRIGHT_BROWSERS_PATH/firefox-*/firefox/libnssckbi.so
RUN ln -s /usr/lib/x86_64-linux-gnu/pkcs11/p11-kit-trust.so $(ls -d $PLAYWRIGHT_BROWSERS_PATH/firefox-*)/firefox/libnssckbi.so

# Copy built JS files from builder image
COPY --from=builder --chown=myuser /home/myuser/dist ./dist

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick builds will be really fast
# for most source file changes.
COPY --chown=myuser . ./

# Run the image.
CMD npm run start:prod --silent
76 changes: 76 additions & 0 deletions .actor/actor.json
@@ -0,0 +1,76 @@
{
"actorSpecification": 1,
"name": "rag-web-browser",
"title": "RAG Web browser",
"description": "Web browser for a retrieval augmented generation workflows. Retrieve and return website content from the top Google Search Results Pages",
"version": "0.1",
"input": "./input_schema.json",
"dockerfile": "./Dockerfile",
"storages": {
"dataset": {
"actorSpecification": 1,
"title": "RAG Web browser",
"description": "Too see all scraped properties, export the whole dataset or select All fields instead of Overview",
"views": {
"overview": {
"title": "Overview",
"description": "Selected fields from the dataset",
"transformation": {
"fields": [
"metadata.url",
"metadata.title",
"text"
],
"flatten": ["metadata"]
},
"display": {
"component": "table",
"properties": {
"metadata.url": {
"label": "Page URL",
"format": "text"
},
"metadata.title": {
"label": "Page Title",
"format": "text"
},
"text": {
"label": "Extracted text",
"format": "text"
}
}
}
},
"googleSearchResults": {
"title": "Google Search Results",
"description": "Title, Description and URL of the Google Search Results",
"transformation": {
"fields": [
"googleSearchResult.description",
"googleSearchResult.title",
"googleSearchResult.url"
],
"flatten": ["googleSearchResult"]
},
"display": {
"component": "table",
"properties": {
"googleSearchResult.description": {
"label": "Description",
"format": "text"
},
"googleSearchResult.title": {
"label": "Title",
"format": "text"
},
"googleSearchResult.url": {
"label": "URL",
"format": "text"
}
}
}
}
}
}
}
}
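
For orientation, the Overview and Google Search Results views above imply the following shape of a dataset item. This is only a TypeScript sketch inferred from the view transformations and the available output formats; the interface name and the optionality of the fields are assumptions, not part of the Actor's source.

// Hypothetical shape of one dataset item, inferred from the dataset views defined above.
interface RagWebBrowserResult {
    // Organic Google Search result that led to this page (googleSearchResults view)
    googleSearchResult?: {
        title: string;
        description: string;
        url: string;
    };
    // Metadata of the crawled page (Overview view)
    metadata: {
        url: string;
        title: string;
    };
    // Extracted content; which of these fields is present depends on the selected outputFormats
    text?: string;
    markdown?: string;
    html?: string;
}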
133 changes: 133 additions & 0 deletions .actor/input_schema.json
@@ -0,0 +1,133 @@
{
"title": "RAG Web Browser",
"description": "RAG Web Browser for a retrieval augmented generation workflows. Retrieve and return website content from the top Google Search Results Pages",
"type": "object",
"schemaVersion": 1,
"properties": {
"query": {
"title": "Search term(s)",
"type": "string",
"description": "Use regular search words or enter Google Search URLs. You can also apply [advanced Google search techniques](https://blog.apify.com/how-to-scrape-google-like-a-pro/), such as <code>AI site:twitter.com</code> or <code>javascript OR python</code>",
"prefill": "apify rag browser",
"editor": "textarea",
"pattern": "[^\\s]+"
},
"maxResults": {
"title": "Number of top search results to return from Google. Only organic results are returned and counted",
"type": "integer",
"description": "The number of top organic search results to return and scrape text from",
"prefill": 3,
"minimum": 1,
"maximum": 50
},
"outputFormats": {
"title": "Output formats",
"type": "array",
"description": "Select the desired output formats for the retrieved content",
"editor": "select",
"default": ["text"],
"items": {
"type": "string",
"enum": ["text", "markdown", "html"],
"enumTitles": ["Plain text", "Markdown", "HTML"]
}
},
"requestTimeoutSecs": {
"title": "Request timeout in seconds",
"type": "integer",
"description": "The maximum time (in seconds) allowed for request. If the request exceeds this time, it will be marked as failed and only already finished results will be returned",
"minimum": 1,
"maximum": 600,
"default": 60
},
"proxyGroupSearch": {
"title": "Search Proxy Group",
"type": "string",
"description": "Select the proxy group for loading search results",
"editor": "select",
"default": "GOOGLE_SERP",
"enum": ["GOOGLE_SERP", "SHADER"],
"sectionCaption": "Google Search Settings"
},
"maxRequestRetriesSearch": {
"title": "Maximum number of retries for Google search request on network / server errors",
"type": "integer",
"description": "The maximum number of times the Google search crawler will retry the request on network, proxy or server errors. If the (n+1)-th request still fails, the crawler will mark this request as failed.",
"minimum": 0,
"maximum": 3,
"default": 1
},
"proxyConfiguration": {
"title": "Crawler: Proxy configuration",
"type": "object",
"description": "Enables loading the websites from IP addresses in specific geographies and to circumvent blocking.",
"default": {
"useApifyProxy": true
},
"prefill": {
"useApifyProxy": true
},
"editor": "proxy",
"sectionCaption": "Content Crawler Settings"
},
"initialConcurrency": {
"title": "Initial concurrency",
"type": "integer",
"description": "Initial number of Playwright browsers running in parallel. The system scales this value based on CPU and memory usage.",
"minimum": 0,
"maximum": 50,
"default": 5
},
"minConcurrency": {
"title": "Minimal concurrency",
"type": "integer",
"description": "Minimum number of Playwright browsers running in parallel. Useful for defining a base level of parallelism.",
"minimum": 1,
"maximum": 50,
"default": 3
},
"maxConcurrency": {
"title": "Maximal concurrency",
"type": "integer",
"description": "Maximum number of browsers or clients running in parallel to avoid overloading target websites.",
"minimum": 1,
"maximum": 50,
"default": 10
},
"maxRequestRetries": {
"title": "Maximum number of retries for Playwright content crawler",
"type": "integer",
"description": "Maximum number of retry attempts on network, proxy, or server errors. If the (n+1)-th request fails, it will be marked as failed.",
"minimum": 0,
"maximum": 3,
"default": 1
},
"requestTimeoutContentCrawlSecs": {
"title": "Request timeout for content crawling",
"type": "integer",
"description": "Timeout (in seconds) for making requests for each search result, including fetching and processing its content.\n\nThe value must be smaller than the 'Request timeout in seconds' setting.",
"minimum": 1,
"maximum": 60,
"default": 30
},
"dynamicContentWaitSecs": {
"title": "Wait for dynamic content (seconds)",
"type": "integer",
"description": "Maximum time (in seconds) to wait for dynamic content to load. The crawler processes the page once this time elapses or when the network becomes idle.",
"default": 10
},
"removeCookieWarnings": {
"title": "Remove cookie warnings",
"type": "boolean",
"description": "If enabled, removes cookie consent dialogs to improve text extraction accuracy. Note that this will impact latency.",
"default": true
},
"debugMode": {
"title": "Debug mode (stores debugging information in dataset)",
"type": "boolean",
"description": "If enabled, the Actor will store debugging information in the dataset's debug field",
"default": false,
"sectionCaption": "Debug Settings"
}
}
}
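
To illustrate how the input schema above is used, here is a minimal sketch of calling the Actor from Node.js with the apify-client package. The Actor ID "apify/rag-web-browser" and reading the token from an environment variable are assumptions made for this example; the input field names come directly from the schema.

import { ApifyClient } from 'apify-client';

// Token handling is an assumption; use whatever credential mechanism fits your setup.
const client = new ApifyClient({ token: process.env.APIFY_TOKEN });

// Start the Actor with an input matching the schema above and wait for the run to finish.
const run = await client.actor('apify/rag-web-browser').call({
    query: 'apify rag browser',
    maxResults: 3,
    outputFormats: ['text', 'markdown'],
    requestTimeoutSecs: 60,
});

// Read the scraped pages from the run's default dataset.
const { items } = await client.dataset(run.defaultDatasetId).listItems();
console.log(items);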
16 changes: 16 additions & 0 deletions .dockerignore
@@ -0,0 +1,16 @@
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git

# data
data
10 changes: 10 additions & 0 deletions .editorconfig
@@ -0,0 +1,10 @@
root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
max_line_length = 120
39 changes: 39 additions & 0 deletions .eslintrc
@@ -0,0 +1,39 @@
{
"root": true,
"env": {
"browser": true,
"es2020": true,
"node": true
},
"extends": [
"@apify/eslint-config-ts"
],
"parserOptions": {
"project": "./tsconfig.json",
"ecmaVersion": 2020
},
"ignorePatterns": [
"node_modules",
"dist",
"**/*.d.ts"
],
"plugins": ["import"],
"rules": {
"import/order": [
"error",
{
"groups": [
["builtin", "external"],
"internal",
["parent", "sibling", "index"]
],
"newlines-between": "always",
"alphabetize": {
"order": "asc",
"caseInsensitive": true
}
}
],
"max-len": ["error", { "code": 120, "ignoreUrls": true, "ignoreStrings": true, "ignoreTemplateLiterals": true }]
}
}
11 changes: 11 additions & 0 deletions .gitignore
@@ -0,0 +1,11 @@
# This file tells Git which files shouldn't be added to source control

.DS_Store
.idea
dist
node_modules
apify_storage
storage

# Added by Apify CLI
.venv
28 changes: 28 additions & 0 deletions CHANGELOG.md
@@ -0,0 +1,28 @@
This changelog summarizes all changes to the RAG Web Browser.

### 2024-09-24

🚀 Features
- Updated README.md to include tips on improving latency
- Set initialConcurrency to 5
- Set minConcurrency to 3
- Set logLevel to INFO

### 2024-09-20

🐛 Bug Fixes
- Fix response format when crawler fails

### 2024-09-24

🚀 Features
- Add ability to create new crawlers using query parameters
- Update Dockerfile to node version 22

🐛 Bug Fixes
- Fix playwright key creation

### 2024-09-11

🚀 Features
- Initial version of the RAG Web Browser