Skip to content

Commit

Permalink
Better content cleaner output parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
SuperMuel committed Jan 29, 2025
1 parent 4a7cf57 commit 3a93741
Show file tree
Hide file tree
Showing 5 changed files with 326 additions and 13 deletions.
97 changes: 92 additions & 5 deletions analyzer/src/util.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,95 @@
import asyncio

from datetime import datetime, timezone
import logging
import aiohttp
from pydantic import HttpUrl
from shared.models import Article
from beanie import PydanticObjectId
from pydantic import HttpUrl, ValidationError
from pydantic_core import Url
from shared.content_fetching_models import (
ArticleContentCleanerOutput,
ContentFetchingResult,
UrlToMarkdownConversion,
)
from shared.models import Article, SearchProvider
from shared.region import Region


logger = logging.getLogger(__name__)


# Sentinel distinguishing "argument omitted" from an explicit ``None``.
# The previous default was a single module-level ContentFetchingResult
# instance — a mutable pydantic model shared across every call, so a test
# mutating one article's fetching result would bleed into other articles.
_UNSET = object()


def create_test_article(
    workspace_id: PydanticObjectId = PydanticObjectId("507f1f77bcf86cd799439011"),
    title: str = "Test Article Title",
    url: Url = Url("https://example.com/test-article"),
    body: str = "This is a test article body with some content for testing purposes.",
    found_at: datetime = datetime(2024, 1, 1, tzinfo=timezone.utc),
    date: datetime = datetime(2024, 1, 1, tzinfo=timezone.utc),
    region: Region = Region.FRANCE,
    image: Url = Url("https://example.com/test-image.jpg"),
    source: str = "Test Source",
    content: str = "Full content of the test article goes here.",
    ingestion_run_id: PydanticObjectId = PydanticObjectId("507f1f77bcf86cd799439012"),
    vector_indexed: bool = False,
    provider: SearchProvider = "serperdev",
    content_fetching_result: ContentFetchingResult | None = _UNSET,  # type: ignore[assignment]
    content_cleaning_error: str | None = None,
) -> Article:
    """Build an ``Article`` pre-filled with deterministic test data.

    Every field can be overridden via keyword arguments; omitted fields fall
    back to fixed, repeatable defaults so tests stay deterministic.

    When ``content_fetching_result`` is omitted, a *fresh* default
    ``ContentFetchingResult`` (with an ``og:image`` metadata entry) is built
    per call, so instances are never shared between articles. Passing
    ``None`` explicitly still produces an article without a fetching result.
    """
    if content_fetching_result is _UNSET:
        content_fetching_result = ContentFetchingResult(
            url=Url("https://example.com/test-article"),
            content_cleaner_output=ArticleContentCleanerOutput(
                title="Test Article Title",
                cleaned_article_content="Full content of the test article goes here.",
            ),
            url_to_markdown_conversion=UrlToMarkdownConversion(
                url=Url("https://example.com/test-article"),
                markdown="# Test Article\n\nTest content",
                extraction_method="firecrawl",
                metadata={
                    "og:image": "https://example.com/og-image.jpg",
                    "og:title": "Test Article Title",
                },
            ),
        )
    return Article(
        workspace_id=workspace_id,
        title=title,
        url=url,
        body=body,
        found_at=found_at,
        date=date,
        region=region,
        image=image,
        source=source,
        content=content,
        ingestion_run_id=ingestion_run_id,
        vector_indexed=vector_indexed,
        provider=provider,
        content_fetching_result=content_fetching_result,
        content_cleaning_error=content_cleaning_error,
    )


def try_get_firecrawl_image(article: Article) -> Url | None:
    """Return the ``og:image`` URL found in the article's Firecrawl metadata.

    Returns ``None`` when the article has no content-fetching result, the
    conversion carries no metadata, no ``og:image`` entry exists, or the
    entry cannot be parsed as a URL.
    """
    if not article.content_fetching_result:
        return None

    conversion = article.content_fetching_result.url_to_markdown_conversion
    if not conversion.metadata:
        return None

    image_url = conversion.metadata.get("og:image")
    if not image_url:
        return None

    try:
        return Url(image_url)
    except ValidationError:
        # Metadata scraped from third-party pages is untrusted; a malformed
        # URL is expected occasionally and simply means "no usable image".
        # Lazy %-style args avoid formatting when the log level is disabled.
        logger.error("Error while parsing image URL: %s", image_url)
        return None


async def get_first_valid_image(articles: list[Article]) -> HttpUrl | None:
"""
Asynchronously retrieves the first valid image URL from a list of articles.
Expand All @@ -26,14 +108,19 @@ async def get_first_valid_image(articles: list[Article]) -> HttpUrl | None:

async with aiohttp.ClientSession(timeout=timeout) as session:
for article in articles:
if not article.image:
# first try to get the image we got from Firecrawl since it's often better quality.
image_url = try_get_firecrawl_image(article) or article.image

if not image_url:
continue

try:
async with session.head(str(article.image)) as response:
async with session.head(str(image_url)) as response:
if response.status == 200:
content_type = response.headers.get("Content-Type", "")
if content_type.startswith("image/"):
return article.image
return image_url

except aiohttp.ClientError:
continue
except asyncio.TimeoutError:
Expand Down
70 changes: 70 additions & 0 deletions analyzer/tests/test_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import pytest
from make_it_sync import make_sync
from mongomock_motor import AsyncMongoMockClient
from pydantic_core import Url
from src.util import create_test_article, try_get_firecrawl_image

from shared.content_fetching_models import (
ArticleContentCleanerOutput,
ContentFetchingResult,
UrlToMarkdownConversion,
)
from shared.db import my_init_beanie


@pytest.fixture(autouse=True)
def my_fixture():
    # Initialize Beanie against an in-memory mongomock client so every test
    # in this module runs with a fresh, isolated fake database.
    client = AsyncMongoMockClient()
    # my_init_beanie is async; make_sync lets us invoke it from a sync fixture.
    make_sync(my_init_beanie)(client)
    yield


def test_try_get_firecrawl_image():
    # Case 1: the default test article carries a valid og:image in its
    # Firecrawl metadata, so the helper should surface it as a parsed Url.
    article_with_image = create_test_article()
    extracted = try_get_firecrawl_image(article_with_image)
    assert isinstance(extracted, Url)
    assert str(extracted) == "https://example.com/og-image.jpg"

    # Case 2: no content-fetching result at all -> nothing to extract.
    article_without_fetch = create_test_article(content_fetching_result=None)
    assert try_get_firecrawl_image(article_without_fetch) is None

    # Case 3: a fetching result whose conversion metadata is empty.
    article_empty_metadata = create_test_article(
        content_fetching_result=ContentFetchingResult(
            url=Url("https://example.com/test-article"),
            content_cleaner_output=ArticleContentCleanerOutput(
                title="Test Article Title",
                cleaned_article_content="Test content",
            ),
            url_to_markdown_conversion=UrlToMarkdownConversion(
                url=Url("https://example.com/test-article"),
                markdown="# Test Article\n\nTest content",
                extraction_method="firecrawl",
                metadata={},
            ),
        ),
    )
    assert try_get_firecrawl_image(article_empty_metadata) is None

    # Case 4: og:image present but unparsable -> treated as missing.
    article_bad_url = create_test_article(
        content_fetching_result=ContentFetchingResult(
            url=Url("https://example.com/test-article"),
            content_cleaner_output=ArticleContentCleanerOutput(
                title="Test Article Title",
                cleaned_article_content="Test content",
            ),
            url_to_markdown_conversion=UrlToMarkdownConversion(
                url=Url("https://example.com/test-article"),
                markdown="# Test Article\n\nTest content",
                extraction_method="firecrawl",
                metadata={"og:image": "not_a_valid_url"},
            ),
        ),
    )
    assert try_get_firecrawl_image(article_bad_url) is None
87 changes: 80 additions & 7 deletions ingester/src/content_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
from langchain import hub
from langchain.chat_models import init_chat_model
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.runnables import Runnable
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import Runnable, RunnableLambda

from shared.content_fetching_models import ArticleContentCleanerOutput
from src.ingester_settings import ingester_settings
Expand All @@ -19,21 +20,93 @@ class ArticleContentCleaner:
Cleans the markdown content of an article using an LLM.
"""

def __init__(self, llm: BaseChatModel | None = None):
    def __init__(
        self,
        llm: BaseChatModel | None = None,
        retry_llm: BaseChatModel | None = None,
    ):
        """Initialize the cleaner and build its LangChain chain.

        Args:
            llm: Primary chat model used to clean article content. Defaults
                to the model named by ``ingester_settings.CONTENT_CLEANER_MODEL``.
            retry_llm: Model intended for retry attempts. Defaults to
                ``ingester_settings.CONTENT_CLEANER_RETRY_MODEL``.
                NOTE(review): stored but not referenced in the visible code —
                confirm it is wired into the retry path elsewhere.
        """
        self.llm = llm or init_chat_model(ingester_settings.CONTENT_CLEANER_MODEL)
        self.retry_llm = retry_llm or init_chat_model(
            ingester_settings.CONTENT_CLEANER_RETRY_MODEL
        )
        self.chain = self._create_chain()

@staticmethod
def _parse_str_output(completion: str) -> ArticleContentCleanerOutput:
"""
Parse the LLM output which should be in XML-style format.
The LLM output should be in one of these formats:
For successful cleaning:
```
<title>The title of the article</title>
<content>
The markdown formatted content of the article
</content>
```
For errors:
```
<error>The error message</error>
```
"""
import re

INVALID_OUTPUT_ERROR_MESSAGE = """The output format is invalid.
If you successfully extracted and cleaned the article, present the result in the following format:
```
<title>The title of the article</title>
<content>
The markdown formatted content of the article
</content>
```
If you encountered any issues that prevented you from cleaning the article, present an error message in the following format:
```
<error>The error message</error>
```
"""

completion = completion.strip()

# Check for error first
error_match = re.search(
r"<error>(.*?)</error>", completion, re.IGNORECASE | re.DOTALL
)
if error_match:
return ArticleContentCleanerOutput(error=error_match.group(1).strip())

# Extract title and content
title_match = re.search(r"<title>(.*?)</title>", completion, re.DOTALL)
if not title_match:
raise ValueError(INVALID_OUTPUT_ERROR_MESSAGE)
title = title_match.group(1).strip()

content_match = re.search(r"<content>(.*?)</content>", completion, re.DOTALL)
if not content_match:
raise ValueError(INVALID_OUTPUT_ERROR_MESSAGE)

content = content_match.group(1).strip()

return ArticleContentCleanerOutput(
title=title,
cleaned_article_content=content,
)

def _create_chain(self) -> ArticleContentCleanerChain:
"""
Creates the LangChain chain for article content cleaning.
"""
prompt = hub.pull(ingester_settings.ARTICLE_CONTENT_CLEANER_PROMPT_REF)

structured_llm = self.llm.with_structured_output(ArticleContentCleanerOutput)

return (prompt | structured_llm).with_config(
run_name="article_content_cleaner_chain"
)
return (
prompt
| self.llm
| StrOutputParser()
| RunnableLambda(self._parse_str_output)
).with_config(run_name="article_content_cleaner_chain")

def get_chain(self) -> ArticleContentCleanerChain:
"""
Expand Down
2 changes: 1 addition & 1 deletion ingester/src/ingester_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ class IngesterSettings(BaseSettings):
# Content Cleaner settings
CONTENT_CLEANER_MODEL: str = "gpt-4o-mini"
ARTICLE_CONTENT_CLEANER_PROMPT_REF: str = (
"clean-article-content" # Reference to Langsmith Hub
"clean-article-content-no-structured-output" # Reference to Langsmith Hub
)
FIRECRAWL_API_KEY: SecretStr = Field(default=...)

Expand Down
83 changes: 83 additions & 0 deletions ingester/tests/test_content_cleaner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import pytest
from shared.content_fetching_models import ArticleContentCleanerOutput
from src.content_cleaner import ArticleContentCleaner


def test_parse_output_success():
    # Call the staticmethod on the class directly: instantiating
    # ArticleContentCleaner would initialize chat models and pull a prompt
    # from LangSmith Hub — network/credentials a unit test must not need.
    test_completion = """<title>Test Title</title>
<content>
This is the content
With multiple lines
And more content
</content>"""

    result = ArticleContentCleaner._parse_str_output(test_completion)

    assert isinstance(result, ArticleContentCleanerOutput)
    assert result.title == "Test Title"
    assert (
        result.cleaned_article_content
        == "This is the content\nWith multiple lines\nAnd more content"
    )
    assert result.error is None


def test_parse_output_error():
    # Static call — no cleaner instance (whose __init__ hits the network).
    test_completion = "<error>Invalid content format</error>"

    result = ArticleContentCleaner._parse_str_output(test_completion)

    assert isinstance(result, ArticleContentCleanerOutput)
    assert result.error == "Invalid content format"
    assert result.title is None
    assert result.cleaned_article_content is None


def test_parse_output_with_whitespace():
    # Static call — no cleaner instance (whose __init__ hits the network).
    # Leading/trailing whitespace inside and around the tags must be stripped.
    test_completion = """
<title>
Test Title
</title>
<content>
This is the content
With multiple lines
And more content
</content>
"""

    result = ArticleContentCleaner._parse_str_output(test_completion)

    assert isinstance(result, ArticleContentCleanerOutput)
    assert result.title == "Test Title"
    assert (
        result.cleaned_article_content
        == "This is the content\nWith multiple lines\nAnd more content"
    )
    assert result.error is None


def test_parse_output_invalid_format():
    # Static call — no cleaner instance (whose __init__ hits the network).
    # Output with neither <error> nor <title>/<content> must raise.
    with pytest.raises(ValueError):
        ArticleContentCleaner._parse_str_output("Invalid format")


def test_parse_output_no_title():
    # Static call — no cleaner instance (whose __init__ hits the network).
    # A <content> block without a <title> is an invalid completion.
    with pytest.raises(ValueError):
        ArticleContentCleaner._parse_str_output(
            "<content>This is the content</content>"
        )


def test_parse_output_no_content():
    # Static call — no cleaner instance (whose __init__ hits the network).
    # A <title> block without <content> is an invalid completion.
    with pytest.raises(ValueError):
        ArticleContentCleaner._parse_str_output("<title>Test Title</title>")

0 comments on commit 3a93741

Please sign in to comment.