Skip to content

Commit

Permalink
feature/add-optional-kwargs: Added optional kwargs
Browse files Browse the repository at this point in the history
Most of the init arguments of the ``HTML2Text`` class are hardcoded from
constants and can be changed only through the CLI, not through library
usage. This adds the ability to pass them as kwargs through the
``html2text`` function call or the class init.

Please note that this commit contains syntax that ``mypy`` does not
recognize, although it is valid Python. See: python/mypy#5719
  • Loading branch information
Tomasz Grining committed Dec 11, 2019
1 parent 2d2c702 commit a6d9f9a
Show file tree
Hide file tree
Showing 6 changed files with 59 additions and 35 deletions.
2 changes: 1 addition & 1 deletion docs/how_it_works.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ Used to provide various configuration settings to the converter. They are as fol
- INLINE_LINKS for formatting images and links
- PROTECT_LINKS protect from line breaks
- GOOGLE_LIST_INDENT number of pixels to indent nested lists
- IGNORE_ANCHORS
- IGNORE_LINKS
- IGNORE_IMAGES
- IMAGES_AS_HTML always generate HTML tags for images; preserves `height`, `width`, `alt` if possible.
- IMAGES_TO_ALT
Expand Down
2 changes: 1 addition & 1 deletion docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ simple indications of their function.
- INLINE_LINKS for formatting images and links
- PROTECT_LINKS protect from line breaks
- GOOGLE_LIST_INDENT number of pixels to indent nested lists
- IGNORE_ANCHORS
- IGNORE_LINKS
- IGNORE_IMAGES
- IMAGES_AS_HTML always generate HTML tags for images; preserves `height`, `width`, `alt` if possible.
- IMAGES_TO_ALT
Expand Down
74 changes: 43 additions & 31 deletions html2text/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,39 @@


class HTML2Text(html.parser.HTMLParser):
init_params = [
"bypass_tables",
"close_quote",
"default_image_alt",
"escape_snob",
"google_list_indent",
"ignore_emphasis",
"ignore_images",
"ignore_links",
"ignore_tables",
"images_as_html",
"images_to_alt",
"images_with_size",
"inline_links",
"links_each_paragraph",
"mark_code",
"open_quote",
"pad_tables",
"protect_links",
"single_line_break",
"skip_internal_links",
"unicode_snob",
"use_automatic_links",
"wrap_links",
"wrap_list_items",
]

def __init__(
self,
out: Optional[OutCallback] = None,
baseurl: str = "",
bodywidth: int = config.BODY_WIDTH,
**kwargs
) -> None:
"""
Input parameters:
Expand All @@ -52,37 +80,16 @@ def __init__(
self.split_next_td = False
self.td_count = 0
self.table_start = False
self.unicode_snob = config.UNICODE_SNOB # covered in cli
self.escape_snob = config.ESCAPE_SNOB # covered in cli
self.links_each_paragraph = config.LINKS_EACH_PARAGRAPH
self.body_width = bodywidth # covered in cli
self.skip_internal_links = config.SKIP_INTERNAL_LINKS # covered in cli
self.inline_links = config.INLINE_LINKS # covered in cli
self.protect_links = config.PROTECT_LINKS # covered in cli
self.google_list_indent = config.GOOGLE_LIST_INDENT # covered in cli
self.ignore_links = config.IGNORE_ANCHORS # covered in cli
self.ignore_images = config.IGNORE_IMAGES # covered in cli
self.images_as_html = config.IMAGES_AS_HTML # covered in cli
self.images_to_alt = config.IMAGES_TO_ALT # covered in cli
self.images_with_size = config.IMAGES_WITH_SIZE # covered in cli
self.ignore_emphasis = config.IGNORE_EMPHASIS # covered in cli
self.bypass_tables = config.BYPASS_TABLES # covered in cli
self.ignore_tables = config.IGNORE_TABLES # covered in cli
self.google_doc = False # covered in cli
self.ul_item_mark = "*" # covered in cli
self.emphasis_mark = "_" # covered in cli
self.google_doc = False
self.ul_item_mark = "*"
self.emphasis_mark = "_"
self.strong_mark = "**"
self.single_line_break = config.SINGLE_LINE_BREAK # covered in cli
self.use_automatic_links = config.USE_AUTOMATIC_LINKS # covered in cli
self.hide_strikethrough = False # covered in cli
self.mark_code = config.MARK_CODE
self.wrap_list_items = config.WRAP_LIST_ITEMS # covered in cli
self.wrap_links = config.WRAP_LINKS # covered in cli
self.pad_tables = config.PAD_TABLES # covered in cli
self.default_image_alt = config.DEFAULT_IMAGE_ALT # covered in cli
self.hide_strikethrough = False
self.tag_callback = None
self.open_quote = config.OPEN_QUOTE # covered in cli
self.close_quote = config.CLOSE_QUOTE # covered in cli
self.body_width = bodywidth

for param in self.init_params:
setattr(self, param, kwargs.get(param, getattr(config, param.upper())))

if out is None:
self.out = self.outtextf
Expand Down Expand Up @@ -939,9 +946,14 @@ def optwrap(self, text: str) -> str:
return result


def html2text(html: str, baseurl: str = "", bodywidth: Optional[int] = None) -> str:
def html2text(
html: str,
baseurl: str = "",
bodywidth: Optional[int] = None,
**kwargs: Optional[OutCallback]
) -> str:
if bodywidth is None:
bodywidth = config.BODY_WIDTH
h = HTML2Text(baseurl=baseurl, bodywidth=bodywidth)
h = HTML2Text(baseurl=baseurl, bodywidth=bodywidth, **kwargs)

return h.handle(html)
2 changes: 1 addition & 1 deletion html2text/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ class bcolors:
"--ignore-links",
dest="ignore_links",
action="store_true",
default=config.IGNORE_ANCHORS,
default=config.IGNORE_LINKS,
help="don't include any formatting for links",
)
p.add_argument(
Expand Down
2 changes: 1 addition & 1 deletion html2text/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
# Values Google and others may use to indicate bold text
BOLD_TEXT_STYLE_VALUES = ("bold", "700", "800", "900")

IGNORE_ANCHORS = False
IGNORE_LINKS = False
IGNORE_IMAGES = False
IMAGES_AS_HTML = False
IMAGES_TO_ALT = False
Expand Down
12 changes: 12 additions & 0 deletions test/test_html2text.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,3 +226,15 @@ def _skip_certain_tags(h2t, tag, attrs, start):
"some <i>italics</i> too."
)
assert ret == ("this is a txt and this is a with text and some _italics_ too.\n\n")


def test_kwargs_in_class():
    """A keyword argument given to the ``HTML2Text`` constructor is stored
    on the instance under the same attribute name."""
    converter = html2text.HTML2Text(wrap_links=False)
    # Identity check: the attribute must be the literal ``False`` passed in.
    assert converter.wrap_links is False


def test_kwargs_in_function():
    """Keyword arguments given to ``html2text()`` influence the output:
    the same long link renders differently for each ``wrap_links`` value."""
    # An anchor whose href repeats "foo-bar/" ten times to make a long URL.
    test_data = "".join(["<a href='http://foo.com/", "foo-bar/" * 10, "'>Foo</a>"])
    wrapped, unwrapped = (
        html2text.html2text(test_data, wrap_links=flag) for flag in (True, False)
    )
    assert wrapped != unwrapped

0 comments on commit a6d9f9a

Please sign in to comment.