Skip to content

Commit

Permalink
fix: remove cyclic imports (#458)
Browse files Browse the repository at this point in the history
* fix: remove cyclic imports

* fix import

* lint code
  • Loading branch information
adbar authored Dec 14, 2023
1 parent 05a73de commit b341138
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 6 deletions.
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ def get_long_description():
"charset_normalizer >= 3.2.0; python_version >= '3.7'",
"courlan >= 0.9.5",
"htmldate >= 1.6.0",
"importlib_metadata; python_version < '3.8'",
"justext >= 3.0.0",
"lxml >= 4.9.3 ; platform_system != 'Darwin'",
"lxml == 4.9.2 ; platform_system == 'Darwin'",
Expand Down
15 changes: 11 additions & 4 deletions trafilatura/downloads.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,20 @@
from courlan import UrlStore
from courlan.network import redirection_test

-from . import __version__
+try: # Python 3.8+
+    from importlib.metadata import version
+except ImportError:
+    from importlib_metadata import version


from .settings import DEFAULT_CONFIG
from .utils import (URL_BLACKLIST_REGEX, decode_response, make_chunks,
uniquify_list)


+LOGGER = logging.getLogger(__name__)
+PKG_VERSION = version("trafilatura")

NUM_CONNECTIONS = 50
MAX_REDIRECTS = 2

Expand All @@ -42,11 +51,9 @@
RETRY_STRATEGY = None

DEFAULT_HEADERS = urllib3.util.make_headers(accept_encoding=True)
-USER_AGENT = 'trafilatura/' + __version__ + ' (+https://github.com/adbar/trafilatura)'
+USER_AGENT = 'trafilatura/' + PKG_VERSION + ' (+https://github.com/adbar/trafilatura)'
DEFAULT_HEADERS['User-Agent'] = USER_AGENT

-LOGGER = logging.getLogger(__name__)

RawResponse = namedtuple('RawResponse', ['data', 'status', 'url'])


Expand Down
13 changes: 11 additions & 2 deletions trafilatura/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,28 @@

import logging
import lzma

from html import unescape
from json import dumps as json_dumps
from pathlib import Path
from pickle import load as load_pickle

+try: # Python 3.8+
+    from importlib.metadata import version
+except ImportError:
+    from importlib_metadata import version

from lxml.etree import (Element, RelaxNG, SubElement, XMLParser, fromstring,
tostring)

-from . import __version__

from .filters import text_chars_test
from .utils import sanitize, sanitize_tree


LOGGER = logging.getLogger(__name__)
+PKG_VERSION = version("trafilatura")

# validation
TEI_SCHEMA = str(Path(__file__).parent / 'data/tei-schema-pickle.lzma')
TEI_VALID_TAGS = {'ab', 'body', 'cell', 'code', 'del', 'div', 'graphic', 'head', 'hi', \
Expand Down Expand Up @@ -395,7 +404,7 @@ def write_fullheader(teidoc, docmeta):
tags_list.text = ','.join(docmeta.tags)
encodingdesc = SubElement(header, 'encodingDesc')
appinfo = SubElement(encodingdesc, 'appInfo')
-application = SubElement(appinfo, 'application', version=__version__, ident='Trafilatura')
+application = SubElement(appinfo, 'application', version=PKG_VERSION, ident='Trafilatura')
label = SubElement(application, 'label')
label.text = 'Trafilatura'
pointer = SubElement(application, 'ptr', target='https://github.com/adbar/trafilatura')
Expand Down

0 comments on commit b341138

Please sign in to comment.