-
-
Notifications
You must be signed in to change notification settings - Fork 285
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 8ece7fb
Showing
12 changed files
with
370 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
[run] | ||
source = textract | ||
|
||
omit = | ||
tests/* | ||
setup.py | ||
|
||
[report] | ||
exclude_lines = | ||
if __name__ == .__main__.: | ||
except .*etree.*: | ||
except ImportError: | ||
except .*requests.exceptions.*: | ||
except UnboundLocalError: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# Compiled python modules. | ||
*.pyc | ||
|
||
# logs | ||
*.log | ||
*.txt | ||
|
||
# packaging | ||
dist/ | ||
build/ | ||
*.egg-info/ | ||
|
||
# tests | ||
.tox/ | ||
.cache/ | ||
.coverage |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
sudo: false | ||
language: python | ||
dist: xenial | ||
|
||
matrix: | ||
include: | ||
- python: "3.4" | ||
env: TOXENV=py34 | ||
- python: "3.5" | ||
env: TOXENV=py35 | ||
- python: "3.6" | ||
env: TOXENV=py36 | ||
- python: "3.7" | ||
env: TOXENV=py37 | ||
- python: "3.8-dev" | ||
env: TOXENV=py38 | ||
allow_failures: | ||
- python: "3.8-dev" | ||
env: TOXENV=py38 | ||
# fast_finish: true | ||
|
||
before_install: | ||
- pip install lxml | ||
- pip install codecov tox | ||
|
||
install: | ||
- pip install . | ||
|
||
script: | ||
- tox | ||
# - pytest | ||
# - coverage run --source textract tests/unit_tests.py | ||
|
||
after_success: | ||
- codecov | ||
|
||
notifications: | ||
email: | ||
on_success: never | ||
on_failure: change | ||
|
||
#cache: | ||
# pip: true |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
include README.rst LICENSE tox.ini | ||
include tests/unit_tests.py | ||
graft tests/cache/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
textract: ... | ||
============================================== | ||
|
||
.. image:: https://img.shields.io/pypi/v/textract.svg | ||
:target: https://pypi.python.org/pypi/textract | ||
|
||
.. image:: https://img.shields.io/pypi/l/textract.svg | ||
:target: https://pypi.python.org/pypi/textract | ||
|
||
.. image:: https://img.shields.io/pypi/pyversions/textract.svg | ||
:target: https://pypi.python.org/pypi/textract | ||
|
||
.. image:: https://img.shields.io/travis/adbar/textract.svg | ||
:target: https://travis-ci.org/adbar/textract | ||
|
||
.. image:: https://img.shields.io/codecov/c/github/adbar/textract.svg | ||
:target: https://codecov.io/gh/adbar/textract | ||
|
||
|
||
Description here. | ||
|
||
|
||
.. contents:: **Contents** | ||
:backlinks: none | ||
|
||
|
||
Features | ||
-------- | ||
|
||
|
||
Installation | ||
------------ | ||
|
||
|
||
With Python | ||
----------- | ||
|
||
|
||
Additional information | ||
---------------------- | ||
|
||
Context | ||
~~~~~~~ | ||
|
||
This module is part of a set of methods for deriving metadata from web documents in order to build text corpora for computational-linguistic and NLP analysis. For more information: | ||
|
||
- Barbaresi, Adrien. "`Efficient construction of metadata-enhanced web corpora <https://hal.archives-ouvertes.fr/hal-01348706/document>`_", Proceedings of the `10th Web as Corpus Workshop (WAC-X) <https://www.sigwac.org.uk/wiki/WAC-X>`_, 2016. | ||
|
||
Kudos to... | ||
~~~~~~~~~~~ | ||
|
||
- `lxml <http://lxml.de/>`_ | ||
|
||
|
||
Contact | ||
~~~~~~~ | ||
|
||
Pull requests are welcome. | ||
|
||
See my `contact page <http://adrien.barbaresi.eu/contact.html>`_ for additional details. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
""" | ||
Seamlessly extract the date of web pages based on header or body. | ||
http://github.com/adbar/textract | ||
""" | ||
|
||
from codecs import open # python2 | ||
import os | ||
from setuptools import setup # find_packages, | ||
|
||
#try: | ||
# from setuptools import setup | ||
#except ImportError: | ||
# from distutils.core import setup | ||
|
||
|
||
here = os.path.abspath(os.path.dirname(__file__)) | ||
packages = ['textract'] | ||
|
||
|
||
def readme():
    """Return the text of README.rst located next to this setup script.

    The result is passed to ``setup()`` as ``long_description``.
    """
    # io.open accepts an explicit encoding on both Python 2 and 3 and is the
    # supported replacement for the legacy codecs.open wrapper.
    import io
    with io.open(os.path.join(here, 'README.rst'), 'r', encoding='utf-8') as readmefile:
        return readmefile.read()
|
||
setup( | ||
name='textract', | ||
version='0.0.1', | ||
description='', | ||
long_description=readme(), | ||
classifiers=[ | ||
# As from http://pypi.python.org/pypi?%3Aaction=list_classifiers | ||
'Development Status :: 3 - Alpha', | ||
#'Development Status :: 4 - Beta', | ||
#'Development Status :: 5 - Production/Stable', | ||
#'Development Status :: 6 - Mature', | ||
#'Development Status :: 7 - Inactive', | ||
'Intended Audience :: Developers', | ||
'Intended Audience :: Information Technology', | ||
'Intended Audience :: Science/Research', | ||
'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)', | ||
'Programming Language :: Python', | ||
'Programming Language :: Python :: 3', | ||
'Programming Language :: Python :: 3.4', | ||
'Programming Language :: Python :: 3.5', | ||
'Programming Language :: Python :: 3.6', | ||
'Programming Language :: Python :: 3.7', | ||
'Topic :: Software Development :: Libraries :: Python Modules', | ||
'Topic :: Scientific/Engineering', | ||
'Topic :: Scientific/Engineering :: Information Analysis', | ||
'Topic :: Text Processing :: Linguistic', | ||
'Topic :: Text Processing :: Markup :: HTML', | ||
], | ||
keywords=['entity-extraction', 'html-extraction', 'html-parsing', 'metadata-extraction', 'webarchives', 'web-scraping'], | ||
url='http://github.com/adbar/textract', | ||
author='Adrien Barbaresi', | ||
author_email='[email protected]', | ||
license='GPLv3+', | ||
packages=packages, | ||
include_package_data=True, | ||
install_requires=[ | ||
'lxml == 4.3.0', # CPython parser issue with version 4.3.1 | ||
# 'requests >= 2.19.0', | ||
], | ||
# python_requires='>=3', | ||
#entry_points = { | ||
# 'console_scripts': ['htmldate=htmldate.cli:main'], | ||
#}, | ||
# platforms='any', | ||
tests_require=['pytest', 'tox'], | ||
zip_safe=False, | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
from __future__ import absolute_import, division, print_function, unicode_literals | ||
|
||
|
||
import os | ||
|
||
|
||
import textract | ||
import pytest # unittest? | ||
|
||
|
||
|
||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
# -*- coding: utf-8 -*- | ||
""" | ||
Unit tests for the textract library. | ||
""" | ||
|
||
import logging | ||
import os | ||
import sys | ||
# https://docs.pytest.org/en/latest/ | ||
|
||
import textract | ||
|
||
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) | ||
|
||
|
||
MOCK_PAGES = { \ | ||
'http://blog.python.org/2016/12/python-360-is-now-available.html': 'blog.python.org.html', \ | ||
} | ||
# | ||
|
||
|
||
TEST_DIR = os.path.abspath(os.path.dirname(__file__)) | ||
|
||
def load_mock_page(url):
    '''Load the cached HTML sample registered for a mocked URL.

    The URL is looked up in MOCK_PAGES to get a file name, which is read
    from the ``cache`` directory next to this test module.

    Raises KeyError if the URL has no entry in MOCK_PAGES.
    '''
    # Decode explicitly instead of relying on the platform's default locale
    # encoding; assumes the cached samples are stored as UTF-8 -- TODO confirm.
    import io
    with io.open(os.path.join(TEST_DIR, 'cache', MOCK_PAGES[url]), 'r', encoding='utf-8') as inputf:
        return inputf.read()
|
||
|
||
if __name__ == '__main__':
    # load_mock_page() requires a URL argument; when run as a script,
    # check that every registered mock page can be read from the cache.
    for mock_url in MOCK_PAGES:
        load_mock_page(mock_url)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
# -*- coding: utf-8 -*- | ||
""" | ||
Extract the date of web pages, or web archeology in practice. | ||
""" | ||
|
||
## meta | ||
__title__ = 'textract' | ||
__author__ = 'Adrien Barbaresi' | ||
__license__ = 'GNU GPL v3' | ||
__copyright__ = 'Copyright 2019, Adrien Barbaresi' | ||
__version__ = '0.0.1' | ||
|
||
## imports | ||
from .core import * | ||
|
||
## logging best practices | ||
# http://docs.python-guide.org/en/latest/writing/logging/ | ||
# https://github.com/requests/requests/blob/master/requests/__init__.py | ||
|
||
import logging
# NullHandler has been part of the standard library since Python 2.7 / 3.1,
# so no hand-written fallback class is needed. The name is still imported
# into this module so existing references to it keep working.
from logging import NullHandler

# Attach a no-op handler to the package logger: the library stays silent
# unless the application using it configures logging itself.
logging.getLogger(__name__).addHandler(NullHandler())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
# -*- coding: utf-8 -*- | ||
""" | ||
Module bundling all functions needed to determine the date of HTML strings or LXML trees. | ||
""" | ||
|
||
## This file is available from https://github.com/adbar/textract | ||
## under GNU GPL v3 license | ||
|
||
# compatibility | ||
from __future__ import absolute_import, division, print_function, unicode_literals | ||
|
||
# from future import standard_library | ||
# standard_library.install_aliases() | ||
|
||
# standard | ||
import datetime | ||
import logging | ||
import re | ||
import time | ||
|
||
# from codecs import open | ||
from collections import Counter | ||
|
||
try: | ||
from cStringIO import StringIO # Python 2 | ||
except ImportError: | ||
from io import StringIO # Python 3 | ||
|
||
# third-party | ||
from lxml import etree, html | ||
from lxml.html.clean import Cleaner | ||
|
||
|
||
## TODO: | ||
# ... | ||
|
||
|
||
|
||
## INIT | ||
logger = logging.getLogger(__name__) | ||
|
||
EXPRESSIONS = [ | ||
"//*[contains(@class, 'date') or contains(@class, 'Date') or contains(@class, 'datum') or contains(@class, 'Datum')]", | ||
] | ||
# "//*[contains(@class, 'fa-clock-o')]", | ||
|
||
|
||
cleaner = Cleaner() | ||
cleaner.comments = True | ||
cleaner.embedded = True | ||
cleaner.forms = False | ||
cleaner.frames = True | ||
cleaner.javascript = False | ||
cleaner.links = False | ||
cleaner.meta = False | ||
cleaner.page_structure = True | ||
cleaner.processing_instructions = True | ||
cleaner.remove_unknown_tags = False | ||
cleaner.safe_attrs_only = False | ||
cleaner.scripts = False | ||
cleaner.style = False | ||
cleaner.kill_tags = ['audio', 'canvas', 'label', 'map', 'math', 'object', 'picture', 'rdf', 'svg', 'table', 'video'] | ||
# 'embed', 'figure', 'img', |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
[tox] | ||
envlist = | ||
py34, py35, py36, py37, jython, pypy, pypy3, flake8 | ||
skip_missing_interpreters = | ||
true | ||
|
||
[testenv] | ||
commands=py.test --cov textract {posargs} | ||
# py.test --cov-report term-missing --cov=myproj tests/ | ||
deps= | ||
pytest | ||
pytest-cov | ||
|
||
;[testenv:flake8] | ||
;basepython = python3.5 # python3.4 | ||
;deps = | ||
; flake8 | ||
;commands = | ||
; flake8 htmldate tests --max-line-length=120 | ||
|
||
[pytest] | ||
python_files = tests/*test*.py | ||
norecursedirs = .tox |