-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Better content cleaner output parsing
- Loading branch information
Showing 5 changed files with 326 additions and 13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
import pytest | ||
from make_it_sync import make_sync | ||
from mongomock_motor import AsyncMongoMockClient | ||
from pydantic_core import Url | ||
from src.util import create_test_article, try_get_firecrawl_image | ||
|
||
from shared.content_fetching_models import ( | ||
ArticleContentCleanerOutput, | ||
ContentFetchingResult, | ||
UrlToMarkdownConversion, | ||
) | ||
from shared.db import my_init_beanie | ||
|
||
|
||
@pytest.fixture(autouse=True)
def my_fixture():
    """Prepare the database layer before each test in this module.

    The fixture is ``autouse``, so pytest applies it without tests having
    to request it. It creates an ``AsyncMongoMockClient`` and passes it to
    ``my_init_beanie`` through a synchronous wrapper.
    """
    mock_client = AsyncMongoMockClient()
    # make_sync lets my_init_beanie be called without awaiting
    # (presumably it is a coroutine function -- confirm in shared.db).
    init_beanie_sync = make_sync(my_init_beanie)
    init_beanie_sync(mock_client)
    yield
|
||
|
||
def test_try_get_firecrawl_image():
    """Check ``try_get_firecrawl_image`` against four article shapes.

    Covers: the default test article (expects the og-image URL), an article
    with no content fetching result, an article whose conversion metadata is
    empty, and an article whose ``og:image`` value is not a valid URL. The
    last three are all expected to produce ``None``.
    """

    def article_with_metadata(metadata):
        # Cases 3 and 4 differ only in the conversion metadata, so the rest
        # of the article is built here once.
        fetching_result = ContentFetchingResult(
            url=Url("https://example.com/test-article"),
            content_cleaner_output=ArticleContentCleanerOutput(
                title="Test Article Title",
                cleaned_article_content="Test content",
            ),
            url_to_markdown_conversion=UrlToMarkdownConversion(
                url=Url("https://example.com/test-article"),
                markdown="# Test Article\n\nTest content",
                extraction_method="firecrawl",
                metadata=metadata,
            ),
        )
        return create_test_article(content_fetching_result=fetching_result)

    # Test case 1: default article, which is expected to expose an og:image
    default_image = try_get_firecrawl_image(create_test_article())
    assert isinstance(default_image, Url)
    assert str(default_image) == "https://example.com/og-image.jpg"

    # Test case 2: article without content_fetching_result
    unfetched_article = create_test_article(content_fetching_result=None)
    assert try_get_firecrawl_image(unfetched_article) is None

    # Test case 3: article with empty metadata
    empty_metadata_article = article_with_metadata({})
    assert try_get_firecrawl_image(empty_metadata_article) is None

    # Test case 4: article whose og:image is not a valid URL
    bad_url_article = article_with_metadata({"og:image": "not_a_valid_url"})
    assert try_get_firecrawl_image(bad_url_article) is None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
import pytest | ||
from shared.content_fetching_models import ArticleContentCleanerOutput | ||
from src.content_cleaner import ArticleContentCleaner | ||
|
||
|
||
def test_parse_output_success():
    """A completion with both <title> and <content> parses without error."""
    raw_completion = """<title>Test Title</title>
<content>
This is the content
With multiple lines
And more content
</content>"""
    expected_content = "This is the content\nWith multiple lines\nAnd more content"

    parsed = ArticleContentCleaner()._parse_str_output(raw_completion)

    assert isinstance(parsed, ArticleContentCleanerOutput)
    assert parsed.title == "Test Title"
    assert parsed.cleaned_article_content == expected_content
    assert parsed.error is None
|
||
|
||
def test_parse_output_error():
    """An <error> completion sets ``error`` and leaves title/content as None."""
    raw_completion = "<error>Invalid content format</error>"

    parsed = ArticleContentCleaner()._parse_str_output(raw_completion)

    assert isinstance(parsed, ArticleContentCleanerOutput)
    assert parsed.error == "Invalid content format"
    # Neither of the success-path fields should be populated.
    assert parsed.title is None
    assert parsed.cleaned_article_content is None
|
||
|
||
def test_parse_output_with_whitespace():
    """Whitespace around tags and their inner text is stripped by the parser."""
    raw_completion = """
<title>
Test Title
</title>
<content>
This is the content
With multiple lines
And more content
</content>
"""
    expected_content = "This is the content\nWith multiple lines\nAnd more content"

    parsed = ArticleContentCleaner()._parse_str_output(raw_completion)

    assert isinstance(parsed, ArticleContentCleanerOutput)
    assert parsed.title == "Test Title"
    assert parsed.cleaned_article_content == expected_content
    assert parsed.error is None
|
||
|
||
def test_parse_output_invalid_format():
    """Text containing none of the expected tags raises ``ValueError``."""
    untagged_completion = "Invalid format"
    parser = ArticleContentCleaner()

    with pytest.raises(ValueError):
        parser._parse_str_output(untagged_completion)
|
||
|
||
def test_parse_output_no_title():
    """A completion with <content> but no <title> raises ``ValueError``."""
    content_only_completion = "<content>This is the content</content>"
    parser = ArticleContentCleaner()

    with pytest.raises(ValueError):
        parser._parse_str_output(content_only_completion)
|
||
|
||
def test_parse_output_no_content():
    """A completion with <title> but no <content> raises ``ValueError``."""
    title_only_completion = "<title>Test Title</title>"
    parser = ArticleContentCleaner()

    with pytest.raises(ValueError):
        parser._parse_str_output(title_only_completion)