RAG Web Browser implementation #1

Merged
merged 27 commits on Oct 2, 2024
Changes from 9 commits
Commits
183a544
Initial working version
jirispilka Aug 16, 2024
df0db55
Organize imports
jirispilka Aug 16, 2024
c248d3c
Fixed issue with HTML parsing
jirispilka Aug 16, 2024
dba1d34
Fixed imports
jirispilka Aug 16, 2024
ca67d19
Fix playwright config
jirispilka Aug 16, 2024
aad1b4b
Add proxy and fix google search URL
jirispilka Aug 16, 2024
b706092
Add query to input schema
jirispilka Aug 16, 2024
a6ae01e
Move crawler options to processInput function
jirispilka Aug 19, 2024
2d698be
Feat: Add standby mode (query parameters are the same as input in the…
jirispilka Aug 22, 2024
d5a66d4
Feat: handle PR review comments, fix actor running in normal mode (#6)
jirispilka Aug 28, 2024
01d296a
Feat: refactor response handling and add debugMode with timeMeasures …
jirispilka Aug 30, 2024
f769492
Update README.md (#8)
jirispilka Aug 30, 2024
1ed5d14
Docs: update readme (#9)
jirispilka Sep 2, 2024
d14bf41
Update performance_measures.md
jirispilka Sep 2, 2024
1cfe48c
Update performance_measures.md
jirispilka Sep 2, 2024
90c7bd0
docs: update readme latency (#10)
jirispilka Sep 4, 2024
8a787e6
Add information about limitation w.r.t. different configuration in a …
jirispilka Sep 4, 2024
11fa1b0
Feat: add google title, description, improve Readme (#11)
jirispilka Sep 11, 2024
e375551
Add CHANGELOG.md
jirispilka Sep 11, 2024
232456a
feat: Disable debugMode (#12)
jirispilka Sep 11, 2024
dbe6c63
Provide better defaults search query in input_schema.json
jirispilka Sep 11, 2024
6059d0b
Update README.md
jirispilka Sep 12, 2024
4bb8c22
Feat: create crawlers on the fly (#14)
jirispilka Sep 16, 2024
9bc2f2b
Update CHANGELOG.md
jirispilka Sep 16, 2024
7eb9bb1
Fix response format when crawler fails (#15)
jirispilka Sep 20, 2024
edc4bea
docs: update readme min concurrency (#16)
jirispilka Sep 24, 2024
7bc3ebc
Fix: update readme with information about Google country (#17)
jirispilka Oct 2, 2024
53 changes: 53 additions & 0 deletions .actor/Dockerfile
@@ -0,0 +1,53 @@
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-playwright-chrome:18-1.40.0 AS builder

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./

# Install all dependencies. Don't audit to speed up the installation.
RUN npm install --include=dev --audit=false

# Next, copy the source files using the user set
# in the base image.
COPY --chown=myuser . ./

# Build the project (dependencies were already installed above).
RUN npm run build

# Create final image
FROM apify/actor-node-playwright-firefox:18-1.40.0

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./

# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
&& npm install --omit=dev --omit=optional \
&& echo "Installed NPM packages:" \
&& (npm list --omit=dev --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version \
&& rm -r ~/.npm

# Copy built JS files from builder image
COPY --from=builder --chown=myuser /home/myuser/dist ./dist

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY --chown=myuser . ./

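# Replace Firefox's bundled NSS certificate store (libnssckbi.so) with the system
# p11-kit trust module, presumably so the browser trusts certificates installed in
# the system store (e.g. when traffic is intercepted by a proxy).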
RUN rm $PLAYWRIGHT_BROWSERS_PATH/firefox-*/firefox/libnssckbi.so
RUN ln -s /usr/lib/x86_64-linux-gnu/pkcs11/p11-kit-trust.so $(ls -d $PLAYWRIGHT_BROWSERS_PATH/firefox-*)/firefox/libnssckbi.so

# Run the image.
CMD npm run start:prod --silent
63 changes: 63 additions & 0 deletions .actor/actor.json
@@ -0,0 +1,63 @@
{
    "actorSpecification": 1,
    "name": "serp-content-crawler",
    "title": "SERP Content Crawler",
    "description": "Retrieve website content from the top Google Search Results Pages (SERPs)",
    "version": "0.0",
    "meta": {
        "templateId": "ts-crawlee-cheerio"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile",
    "storage": {
        "dataset": {
            "actorSpecification": 1,
            "title": "SERP Content Crawler Dataset",
            "description": "",
            "views": {
                "default": {
                    "title": "Text",
                    "description": "View of URLs of web pages and their content as simple plain text.",
                    "transformation": {
                        "fields": [
                            "url",
                            "text"
                        ]
                    },
                    "display": {
                        "component": "table",
                        "properties": {
                            "url": {
                                "label": "Webpage URL"
                            },
                            "text": {
                                "label": "Extracted text"
                            }
                        }
                    }
                },
                "markdown": {
                    "title": "Markdown",
                    "description": "View of URLs of web pages and their content as Markdown with formatting.",
                    "transformation": {
                        "fields": [
                            "url",
                            "markdown"
                        ]
                    },
                    "display": {
                        "component": "table",
                        "properties": {
                            "url": {
                                "label": "Webpage URL"
                            },
                            "markdown": {
                                "label": "Extracted Markdown"
                            }
                        }
                    }
                }
            }
        }
    }
}
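The two views above only project fields from each dataset record; a minimal sketch of the item shape they imply (the field names come from the view definitions, the interface itself is not part of this PR):

```typescript
// Shape of a dataset item consistent with the "default" (Text) and "markdown" views.
interface DatasetItem {
    url: string;      // shown as "Webpage URL"
    text: string;     // shown as "Extracted text"
    markdown: string; // shown as "Extracted Markdown"
}
```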
35 changes: 35 additions & 0 deletions .actor/input_schema.json
@@ -0,0 +1,35 @@
{
    "title": "SERP Content Crawler",
    "description": "Retrieve website content from the top Google Search Results Pages (SERPs)",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "queries": {
            "title": "Search term(s)",
            "type": "string",
            "description": "Use regular search words or enter Google Search URLs. You can also apply [advanced Google search techniques](https://blog.apify.com/how-to-scrape-google-like-a-pro/), such as <code>AI site:twitter.com</code> or <code>javascript OR python</code>.",
            "prefill": "apify\nllm",
            "editor": "textarea",
            "pattern": "[^\\s]+"
        },
        "maxResults": {
            "title": "Max results to return",
            "type": "integer",
            "description": "",
            "prefill": 3,
            "minimum": 1
        },
        "proxyConfiguration": {
            "title": "Crawler: Proxy configuration",
            "type": "object",
            "description": "Enables loading the websites from IP addresses in specific geographies and helps circumvent blocking.",
            "default": {
                "useApifyProxy": true
            },
            "prefill": {
                "useApifyProxy": true
            },
            "editor": "proxy"
        }
    }
}
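For context, a minimal sketch (not part of this PR) of how an LLM application might call the Actor with the input fields defined above, using the `apify-client` package; the Actor ID, token handling, and relative file names are assumptions:

```typescript
import { ApifyClient } from 'apify-client';

// Placeholder token and Actor ID – replace with your own values.
const client = new ApifyClient({ token: process.env.APIFY_TOKEN });

async function main(): Promise<void> {
    // Input fields mirror .actor/input_schema.json above.
    const run = await client.actor('<username>/serp-content-crawler').call({
        queries: 'apify\nllm',
        maxResults: 3,
        proxyConfiguration: { useApifyProxy: true },
    });

    // Each item is expected to expose the url/text/markdown fields used by
    // the dataset views defined in .actor/actor.json.
    const { items } = await client.dataset(run.defaultDatasetId).listItems();
    console.log(items.map((item) => item.url));
}

main().catch(console.error);
```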
13 changes: 13 additions & 0 deletions .dockerignore
@@ -0,0 +1,13 @@
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git
9 changes: 9 additions & 0 deletions .editorconfig
@@ -0,0 +1,9 @@
root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
38 changes: 38 additions & 0 deletions .eslintrc
@@ -0,0 +1,38 @@
{
    "root": true,
    "env": {
        "browser": true,
        "es2020": true,
        "node": true
    },
    "extends": [
        "@apify/eslint-config-ts"
    ],
    "parserOptions": {
        "project": "./tsconfig.json",
        "ecmaVersion": 2020
    },
    "ignorePatterns": [
        "node_modules",
        "dist",
        "**/*.d.ts"
    ],
    "plugins": ["import"],
    "rules": {
        "import/order": [
            "error",
            {
                "groups": [
                    ["builtin", "external"],
                    "internal",
                    ["parent", "sibling", "index"]
                ],
                "newlines-between": "always",
                "alphabetize": {
                    "order": "asc",
                    "caseInsensitive": true
                }
            }
        ]
    }
}
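As a rough illustration of what the `import/order` rule above enforces (the module names below are hypothetical, not taken from this PR):

```typescript
// Built-in and external modules form the first group, alphabetized case-insensitively.
import { CheerioCrawler } from 'crawlee';
import { setTimeout } from 'node:timers/promises';

// Parent, sibling and index modules form the last group, separated by a blank line.
import { processInput } from './input.js';
import { createResponse } from './responses.js';
```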
11 changes: 11 additions & 0 deletions .gitignore
@@ -0,0 +1,11 @@
# This file tells Git which files shouldn't be added to source control

.DS_Store
.idea
dist
node_modules
apify_storage
storage

# Added by Apify CLI
.venv
38 changes: 15 additions & 23 deletions README.md
@@ -1,28 +1,20 @@
-# Solution template
-This repository serves as a template for creating repositories for new solutions. Using a template makes it easier to create new repos with pre-defined contents and ensures consistency between the repositories.
+## Fast Google Search Result Content Crawler

-## How to use this template
+This Actor retrieves website content from the top Google Search Results Pages (SERPs).
+Given a search query, it fetches the first page of Google search results, then crawls the top sites to extract text content.
+It is capable of extracting content from JavaScript-enabled websites and can bypass anti-scraping protections.
+The extracted web content is saved as plain text or markdown.
+This Actor is ideal for adding up-to-date Google search knowledge to your LLM applications.

-1. Click the Use this template button in the top right corner.
-2. Choose a name for the repository. It should be in the format `vendor`-`customer`-`solution`-`...`.
-   - **Examples:**
-     - apify-thorn-facebook-scraper
-     - topmonks-microsoft-google-scraper
-     - topmonks-microsoft-google-data-processor
-     - devbros-apple-some-codename-scraper
-3. Make sure the repo is **private**.
-4. Create the repo.
+This Actor is a combination of two more powerful Apify Actors:
+- [Google Search Results Scraper](https://apify.com/apify/google-search-scraper)
+- [Website Content Crawler](https://apify.com/apify/website-content-crawler)

-**Once you have the repo created:**
+#### Looking to scrape Google Search Results?
+- Check out the [Google Search Results Scraper](https://apify.com/apify/google-search-scraper) actor.

-1. Go to Settings -> Manage Access -> Invite teams or people.
-2. Add the **Apify Team** as **admin**. If the solution will be delivered by a partner, add their team as **admin** too.
-4. Edit this README and fill in the details in the template below. If a field cannot be filled, write **N/A**.
-5. Finally, delete this guide from the Readme, so that only the newly added details will remain.
-6. You're done! Thanks for using the template!
+#### Need to extract content from a list of URLs?
+- Explore the [Website Content Crawler](https://apify.com/apify/website-content-crawler) actor.

-# vendor-customer-solution
-
-**Kanban link:** Add link to the Apify Kanban card.
-
-**Issue link:** Add link to the issue created in Delivery Issue Tracker or some other tracking issue.
+Browsing Tool
+- https://community.openai.com/t/new-assistants-browse-with-bing-ability/479383/27