community[minor]: Add Scrapfly Loader community integration (#22036)
Added [Scrapfly](https://scrapfly.io/) Web Loader integration. Scrapfly is a web scraping API that allows extracting web page data into accessible markdown or text datasets.

- __Description__: Added Scrapfly web loader for retrieving web page data as markdown or text.
- __Dependencies__: scrapfly-sdk
- __Twitter__: @thealchemi1st

---------

Co-authored-by: Bagatur <[email protected]>
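In brief, the loader wraps the Scrapfly scrape API and returns LangChain `Document` objects whose `page_content` holds the scraped markdown (or text) and whose metadata carries the source URL. A condensed sketch of the call pattern, based on the notebook and implementation in this diff (the API key value is a placeholder):

```python
from langchain_community.document_loaders import ScrapflyLoader

# Placeholder key; real keys come from https://www.scrapfly.io/
loader = ScrapflyLoader(["https://web-scraping.dev/products"], api_key="scp-...")

documents = loader.load()
print(documents[0].metadata)            # {'url': 'https://web-scraping.dev/products'}
print(documents[0].page_content[:200])  # scraped page rendered as markdown
```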
Showing 4 changed files with 182 additions and 0 deletions.
@@ -0,0 +1,107 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## ScrapFly\n",
    "[ScrapFly](https://scrapfly.io/) is a web scraping API with headless browser capabilities, proxies, and anti-bot bypass. It allows for extracting web page data into accessible LLM markdown or text."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Installation\n",
    "Install the ScrapFly Python SDK and the required LangChain packages using pip:\n",
    "```shell\n",
    "pip install scrapfly-sdk langchain langchain-community\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Usage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_community.document_loaders import ScrapflyLoader\n",
    "\n",
    "scrapfly_loader = ScrapflyLoader(\n",
    "    [\"https://web-scraping.dev/products\"],\n",
    "    api_key=\"Your ScrapFly API key\",  # Get your API key from https://www.scrapfly.io/\n",
    "    continue_on_failure=True,  # Ignore unprocessable web pages and log their exceptions\n",
    ")\n",
    "\n",
    "# Load documents from URLs as markdown\n",
    "documents = scrapfly_loader.load()\n",
    "print(documents)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The ScrapflyLoader also allows passing a ScrapeConfig object for customizing the scrape request. See the documentation for the full feature details and their API params: https://scrapfly.io/docs/scrape-api/getting-started"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_community.document_loaders import ScrapflyLoader\n",
    "\n",
    "scrapfly_scrape_config = {\n",
    "    \"asp\": True,  # Bypass scraping blocking and antibot solutions, like Cloudflare\n",
    "    \"render_js\": True,  # Enable JavaScript rendering with a cloud headless browser\n",
    "    \"proxy_pool\": \"public_residential_pool\",  # Select a proxy pool (datacenter or residential)\n",
    "    \"country\": \"us\",  # Select a proxy location\n",
    "    \"auto_scroll\": True,  # Auto scroll the page\n",
    "    \"js\": \"\",  # Execute custom JavaScript code by the headless browser\n",
    "}\n",
    "\n",
    "scrapfly_loader = ScrapflyLoader(\n",
    "    [\"https://web-scraping.dev/products\"],\n",
    "    api_key=\"Your ScrapFly API key\",  # Get your API key from https://www.scrapfly.io/\n",
    "    continue_on_failure=True,  # Ignore unprocessable web pages and log their exceptions\n",
    "    scrape_config=scrapfly_scrape_config,  # Pass the scrape_config object\n",
    "    scrape_format=\"markdown\",  # The scrape result format, either `markdown` (default) or `text`\n",
    ")\n",
    "\n",
    "# Load documents from URLs as markdown\n",
    "documents = scrapfly_loader.load()\n",
    "print(documents)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
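The notebook passes the API key explicitly; the implementation below also reads it from the SCRAPFLY_API_KEY environment variable when `api_key` is omitted. A minimal sketch assuming that variable is exported:

```python
import os

from langchain_community.document_loaders import ScrapflyLoader

# Assumes SCRAPFLY_API_KEY is set; the loader falls back to it
# when no api_key argument is given.
assert "SCRAPFLY_API_KEY" in os.environ, "export SCRAPFLY_API_KEY first"

loader = ScrapflyLoader(["https://web-scraping.dev/products"])
documents = loader.load()
print(f"Loaded {len(documents)} document(s)")
```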
libs/community/langchain_community/document_loaders/scrapfly.py (69 additions, 0 deletions)
@@ -0,0 +1,69 @@
"""Scrapfly Web Reader."""
import logging
from typing import Iterator, List, Literal, Optional

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document
from langchain_core.utils import get_from_env

logger = logging.getLogger(__file__)


class ScrapflyLoader(BaseLoader):
    """Turn a URL into LLM-accessible markdown with `Scrapfly.io`.

    For further details, visit: https://scrapfly.io/docs/sdk/python
    """

    def __init__(
        self,
        urls: List[str],
        *,
        api_key: Optional[str] = None,
        scrape_format: Literal["markdown", "text"] = "markdown",
        scrape_config: Optional[dict] = None,
        continue_on_failure: bool = True,
    ) -> None:
        """Initialize client.

        Args:
            urls: List of URLs to scrape.
            api_key: The Scrapfly API key. If not specified, the SCRAPFLY_API_KEY
                environment variable must be set.
            scrape_format: Scrape result format, one of "markdown" or "text".
            scrape_config: Dictionary of ScrapFly scrape config object.
            continue_on_failure: Whether to continue if scraping a URL fails.
        """
        try:
            from scrapfly import ScrapflyClient
        except ImportError:
            raise ImportError(
                "`scrapfly` package not found, please run `pip install scrapfly-sdk`"
            )
        if not urls:
            raise ValueError("URLs must be provided.")
        api_key = api_key or get_from_env("api_key", "SCRAPFLY_API_KEY")
        self.scrapfly = ScrapflyClient(key=api_key)
        self.urls = urls
        self.scrape_format = scrape_format
        self.scrape_config = scrape_config
        self.continue_on_failure = continue_on_failure

    def lazy_load(self) -> Iterator[Document]:
        # Imported lazily so the module can be imported without scrapfly-sdk.
        from scrapfly import ScrapeConfig

        scrape_config = self.scrape_config if self.scrape_config is not None else {}
        for url in self.urls:
            try:
                response = self.scrapfly.scrape(
                    ScrapeConfig(url, format=self.scrape_format, **scrape_config)
                )
                yield Document(
                    page_content=response.scrape_result["content"],
                    metadata={"url": url},
                )
            except Exception as e:
                if self.continue_on_failure:
                    logger.error(f"Error fetching data from {url}, exception: {e}")
                else:
                    raise e
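Since `lazy_load` yields one `Document` per URL, results can be streamed rather than collected up front, and with `continue_on_failure=True` a failing URL is logged and skipped. A quick sketch of that pattern (the URLs and key are placeholders):

```python
from langchain_community.document_loaders import ScrapflyLoader

loader = ScrapflyLoader(
    ["https://web-scraping.dev/products", "https://web-scraping.dev/reviews"],
    api_key="scp-...",  # placeholder key
    continue_on_failure=True,  # log and skip failed scrapes instead of raising
)

# Stream documents one at a time; a failed scrape yields nothing for that URL.
for doc in loader.lazy_load():
    print(doc.metadata["url"], len(doc.page_content))
```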