Skip to content

Commit

Permalink
Better content cleaner output parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
SuperMuel committed Jan 29, 2025
1 parent 4a7cf57 commit 3a93741
Show file tree
Hide file tree
Showing 5 changed files with 326 additions and 13 deletions.
97 changes: 92 additions & 5 deletions analyzer/src/util.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,95 @@
import asyncio

from datetime import datetime, timezone
import logging
import aiohttp
from pydantic import HttpUrl
from shared.models import Article
from beanie import PydanticObjectId
from pydantic import HttpUrl, ValidationError
from pydantic_core import Url
from shared.content_fetching_models import (
ArticleContentCleanerOutput,
ContentFetchingResult,
UrlToMarkdownConversion,
)
from shared.models import Article, SearchProvider
from shared.region import Region


logger = logging.getLogger(__name__)


# Sentinel distinguishing "argument omitted" from an explicit ``None``.
# The previous default was a single module-level ContentFetchingResult
# instance — a mutable pydantic model shared across every call, so a test
# mutating one article's fetching result would bleed into other articles.
_UNSET = object()


def create_test_article(
    workspace_id: PydanticObjectId = PydanticObjectId("507f1f77bcf86cd799439011"),
    title: str = "Test Article Title",
    url: Url = Url("https://example.com/test-article"),
    body: str = "This is a test article body with some content for testing purposes.",
    found_at: datetime = datetime(2024, 1, 1, tzinfo=timezone.utc),
    date: datetime = datetime(2024, 1, 1, tzinfo=timezone.utc),
    region: Region = Region.FRANCE,
    image: Url = Url("https://example.com/test-image.jpg"),
    source: str = "Test Source",
    content: str = "Full content of the test article goes here.",
    ingestion_run_id: PydanticObjectId = PydanticObjectId("507f1f77bcf86cd799439012"),
    vector_indexed: bool = False,
    provider: SearchProvider = "serperdev",
    content_fetching_result: ContentFetchingResult | None = _UNSET,  # type: ignore[assignment]
    content_cleaning_error: str | None = None,
) -> Article:
    """Build an ``Article`` pre-filled with deterministic test data.

    Every field can be overridden via keyword arguments; omitted fields fall
    back to fixed, repeatable defaults so tests stay deterministic.

    When ``content_fetching_result`` is omitted, a *fresh* default
    ``ContentFetchingResult`` (with an ``og:image`` metadata entry) is built
    per call, so instances are never shared between articles. Passing
    ``None`` explicitly still produces an article without a fetching result.
    """
    if content_fetching_result is _UNSET:
        content_fetching_result = ContentFetchingResult(
            url=Url("https://example.com/test-article"),
            content_cleaner_output=ArticleContentCleanerOutput(
                title="Test Article Title",
                cleaned_article_content="Full content of the test article goes here.",
            ),
            url_to_markdown_conversion=UrlToMarkdownConversion(
                url=Url("https://example.com/test-article"),
                markdown="# Test Article\n\nTest content",
                extraction_method="firecrawl",
                metadata={
                    "og:image": "https://example.com/og-image.jpg",
                    "og:title": "Test Article Title",
                },
            ),
        )
    return Article(
        workspace_id=workspace_id,
        title=title,
        url=url,
        body=body,
        found_at=found_at,
        date=date,
        region=region,
        image=image,
        source=source,
        content=content,
        ingestion_run_id=ingestion_run_id,
        vector_indexed=vector_indexed,
        provider=provider,
        content_fetching_result=content_fetching_result,
        content_cleaning_error=content_cleaning_error,
    )


def try_get_firecrawl_image(article: Article) -> Url | None:
    """Return the ``og:image`` URL found in the article's Firecrawl metadata.

    Returns ``None`` when the article has no content-fetching result, the
    conversion carries no metadata, no ``og:image`` entry exists, or the
    entry cannot be parsed as a URL.
    """
    if not article.content_fetching_result:
        return None

    conversion = article.content_fetching_result.url_to_markdown_conversion
    if not conversion.metadata:
        return None

    image_url = conversion.metadata.get("og:image")
    if not image_url:
        return None

    try:
        return Url(image_url)
    except ValidationError:
        # Metadata scraped from third-party pages is untrusted; a malformed
        # URL is expected occasionally and simply means "no usable image".
        # Lazy %-style args avoid formatting when the log level is disabled.
        logger.error("Error while parsing image URL: %s", image_url)
        return None


async def get_first_valid_image(articles: list[Article]) -> HttpUrl | None:
"""
Asynchronously retrieves the first valid image URL from a list of articles.
Expand All @@ -26,14 +108,19 @@ async def get_first_valid_image(articles: list[Article]) -> HttpUrl | None:

async with aiohttp.ClientSession(timeout=timeout) as session:
for article in articles:
if not article.image:
# first try to get the image we got from Firecrawl since it's often better quality.
image_url = try_get_firecrawl_image(article) or article.image

if not image_url:
continue

try:
async with session.head(str(article.image)) as response:
async with session.head(str(image_url)) as response:
if response.status == 200:
content_type = response.headers.get("Content-Type", "")
if content_type.startswith("image/"):
return article.image
return image_url

except aiohttp.ClientError:
continue
except asyncio.TimeoutError:
Expand Down
70 changes: 70 additions & 0 deletions analyzer/tests/test_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import pytest
from make_it_sync import make_sync
from mongomock_motor import AsyncMongoMockClient
from pydantic_core import Url
from src.util import create_test_article, try_get_firecrawl_image

from shared.content_fetching_models import (
ArticleContentCleanerOutput,
ContentFetchingResult,
UrlToMarkdownConversion,
)
from shared.db import my_init_beanie


@pytest.fixture(autouse=True)
def my_fixture():
    # Initialize Beanie against an in-memory mongomock client so every test
    # in this module runs with a fresh, isolated fake database.
    client = AsyncMongoMockClient()
    # my_init_beanie is async; make_sync lets us invoke it from a sync fixture.
    make_sync(my_init_beanie)(client)
    yield


def test_try_get_firecrawl_image():
    # Case 1: the default test article carries a valid og:image in its
    # Firecrawl metadata, so the helper should surface it as a parsed Url.
    article_with_image = create_test_article()
    extracted = try_get_firecrawl_image(article_with_image)
    assert isinstance(extracted, Url)
    assert str(extracted) == "https://example.com/og-image.jpg"

    # Case 2: no content-fetching result at all -> nothing to extract.
    article_without_fetch = create_test_article(content_fetching_result=None)
    assert try_get_firecrawl_image(article_without_fetch) is None

    # Case 3: a fetching result whose conversion metadata is empty.
    article_empty_metadata = create_test_article(
        content_fetching_result=ContentFetchingResult(
            url=Url("https://example.com/test-article"),
            content_cleaner_output=ArticleContentCleanerOutput(
                title="Test Article Title",
                cleaned_article_content="Test content",
            ),
            url_to_markdown_conversion=UrlToMarkdownConversion(
                url=Url("https://example.com/test-article"),
                markdown="# Test Article\n\nTest content",
                extraction_method="firecrawl",
                metadata={},
            ),
        ),
    )
    assert try_get_firecrawl_image(article_empty_metadata) is None

    # Case 4: og:image present but unparsable -> treated as missing.
    article_bad_url = create_test_article(
        content_fetching_result=ContentFetchingResult(
            url=Url("https://example.com/test-article"),
            content_cleaner_output=ArticleContentCleanerOutput(
                title="Test Article Title",
                cleaned_article_content="Test content",
            ),
            url_to_markdown_conversion=UrlToMarkdownConversion(
                url=Url("https://example.com/test-article"),
                markdown="# Test Article\n\nTest content",
                extraction_method="firecrawl",
                metadata={"og:image": "not_a_valid_url"},
            ),
        ),
    )
    assert try_get_firecrawl_image(article_bad_url) is None
87 changes: 80 additions & 7 deletions ingester/src/content_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
from langchain import hub
from langchain.chat_models import init_chat_model
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.runnables import Runnable
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import Runnable, RunnableLambda

from shared.content_fetching_models import ArticleContentCleanerOutput
from src.ingester_settings import ingester_settings
Expand All @@ -19,21 +20,93 @@ class ArticleContentCleaner:
Cleans the markdown content of an article using an LLM.
"""

def __init__(self, llm: BaseChatModel | None = None):
    def __init__(
        self,
        llm: BaseChatModel | None = None,
        retry_llm: BaseChatModel | None = None,
    ):
        """Initialize the cleaner and build its LangChain chain.

        Args:
            llm: Primary chat model used to clean article content. Defaults
                to the model named by ``ingester_settings.CONTENT_CLEANER_MODEL``.
            retry_llm: Model intended for retry attempts. Defaults to
                ``ingester_settings.CONTENT_CLEANER_RETRY_MODEL``.
                NOTE(review): stored but not referenced in the visible code —
                confirm it is wired into the retry path elsewhere.
        """
        self.llm = llm or init_chat_model(ingester_settings.CONTENT_CLEANER_MODEL)
        self.retry_llm = retry_llm or init_chat_model(
            ingester_settings.CONTENT_CLEANER_RETRY_MODEL
        )
        self.chain = self._create_chain()

@staticmethod
def _parse_str_output(completion: str) -> ArticleContentCleanerOutput:
"""
Parse the LLM output which should be in XML-style format.
The LLM output should be in one of these formats:
For successful cleaning:
```
<title>The title of the article</title>
<content>
The markdown formatted content of the article
</content>
```
For errors:
```
<error>The error message</error>
```
"""
import re

INVALID_OUTPUT_ERROR_MESSAGE = """The output format is invalid.
If you successfully extracted and cleaned the article, present the result in the following format:
```
<title>The title of the article</title>
<content>
The markdown formatted content of the article
</content>
```
If you encountered any issues that prevented you from cleaning the article, present an error message in the following format:
```
<error>The error message</error>
```
"""

completion = completion.strip()

# Check for error first
error_match = re.search(
r"<error>(.*?)</error>", completion, re.IGNORECASE | re.DOTALL
)
if error_match:
return ArticleContentCleanerOutput(error=error_match.group(1).strip())

# Extract title and content
title_match = re.search(r"<title>(.*?)</title>", completion, re.DOTALL)
if not title_match:
raise ValueError(INVALID_OUTPUT_ERROR_MESSAGE)
title = title_match.group(1).strip()

content_match = re.search(r"<content>(.*?)</content>", completion, re.DOTALL)
if not content_match:
raise ValueError(INVALID_OUTPUT_ERROR_MESSAGE)

content = content_match.group(1).strip()

return ArticleContentCleanerOutput(
title=title,
cleaned_article_content=content,
)

def _create_chain(self) -> ArticleContentCleanerChain:
"""
Creates the LangChain chain for article content cleaning.
"""
prompt = hub.pull(ingester_settings.ARTICLE_CONTENT_CLEANER_PROMPT_REF)

structured_llm = self.llm.with_structured_output(ArticleContentCleanerOutput)

return (prompt | structured_llm).with_config(
run_name="article_content_cleaner_chain"
)
return (
prompt
| self.llm
| StrOutputParser()
| RunnableLambda(self._parse_str_output)
).with_config(run_name="article_content_cleaner_chain")

def get_chain(self) -> ArticleContentCleanerChain:
"""
Expand Down
2 changes: 1 addition & 1 deletion ingester/src/ingester_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ class IngesterSettings(BaseSettings):
# Content Cleaner settings
CONTENT_CLEANER_MODEL: str = "gpt-4o-mini"
ARTICLE_CONTENT_CLEANER_PROMPT_REF: str = (
"clean-article-content" # Reference to Langsmith Hub
"clean-article-content-no-structured-output" # Reference to Langsmith Hub
)
FIRECRAWL_API_KEY: SecretStr = Field(default=...)

Expand Down
83 changes: 83 additions & 0 deletions ingester/tests/test_content_cleaner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import pytest
from shared.content_fetching_models import ArticleContentCleanerOutput
from src.content_cleaner import ArticleContentCleaner


def test_parse_output_success():
    # Call the staticmethod on the class directly: instantiating
    # ArticleContentCleaner would initialize chat models and pull a prompt
    # from LangSmith Hub — network/credentials a unit test must not need.
    test_completion = """<title>Test Title</title>
<content>
This is the content
With multiple lines
And more content
</content>"""

    result = ArticleContentCleaner._parse_str_output(test_completion)

    assert isinstance(result, ArticleContentCleanerOutput)
    assert result.title == "Test Title"
    assert (
        result.cleaned_article_content
        == "This is the content\nWith multiple lines\nAnd more content"
    )
    assert result.error is None


def test_parse_output_error():
    # Static call — no cleaner instance (whose __init__ hits the network).
    test_completion = "<error>Invalid content format</error>"

    result = ArticleContentCleaner._parse_str_output(test_completion)

    assert isinstance(result, ArticleContentCleanerOutput)
    assert result.error == "Invalid content format"
    assert result.title is None
    assert result.cleaned_article_content is None


def test_parse_output_with_whitespace():
    # Static call — no cleaner instance (whose __init__ hits the network).
    # Leading/trailing whitespace inside and around the tags must be stripped.
    test_completion = """
<title>
Test Title
</title>
<content>
This is the content
With multiple lines
And more content
</content>
"""

    result = ArticleContentCleaner._parse_str_output(test_completion)

    assert isinstance(result, ArticleContentCleanerOutput)
    assert result.title == "Test Title"
    assert (
        result.cleaned_article_content
        == "This is the content\nWith multiple lines\nAnd more content"
    )
    assert result.error is None


def test_parse_output_invalid_format():
    # Static call — no cleaner instance (whose __init__ hits the network).
    # Output with neither <error> nor <title>/<content> must raise.
    with pytest.raises(ValueError):
        ArticleContentCleaner._parse_str_output("Invalid format")


def test_parse_output_no_title():
    # Static call — no cleaner instance (whose __init__ hits the network).
    # A <content> block without a <title> is an invalid completion.
    with pytest.raises(ValueError):
        ArticleContentCleaner._parse_str_output(
            "<content>This is the content</content>"
        )


def test_parse_output_no_content():
    # Static call — no cleaner instance (whose __init__ hits the network).
    # A <title> block without <content> is an invalid completion.
    with pytest.raises(ValueError):
        ArticleContentCleaner._parse_str_output("<title>Test Title</title>")

0 comments on commit 3a93741

Please sign in to comment.