initial commit

adbar · Apr 4, 2019 · 8ece7fb · 8ece7fb
commit 8ece7fb
Show file tree

Hide file tree

Showing 12 changed files with 370 additions and 0 deletions.
diff --git a/.coveragerc b/.coveragerc
@@ -0,0 +1,14 @@
+[run]
+source = textract
+
+omit =
+    tests/*
+    setup.py
+
+[report]
+exclude_lines =
+    if __name__ == .__main__.:
+    except .*etree.*:
+    except ImportError:
+    except .*requests.exceptions.*:
+    except UnboundLocalError:
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,16 @@
+# Compiled python modules.
+*.pyc
+
+# logs
+*.log
+*.txt
+
+# packaging
+dist/
+build/
+*.egg-info/
+
+# tests
+.tox/
+.cache/
+.coverage
diff --git a/.travis.yml b/.travis.yml
@@ -0,0 +1,43 @@
+sudo: false
+language: python
+dist: xenial
+
+matrix:
+  include:
+    - python: "3.4"
+      env: TOXENV=py34
+    - python: "3.5"
+      env: TOXENV=py35
+    - python: "3.6"
+      env: TOXENV=py36
+    - python: "3.7"
+      env: TOXENV=py37
+    - python: "3.8-dev"
+      env: TOXENV=py38
+  allow_failures:
+    - python: "3.8-dev"
+      env: TOXENV=py38
+  # fast_finish: true
+
+before_install:
+  - pip install lxml
+  - pip install codecov tox
+
+install:
+  - pip install .
+
+script:
+  - tox
+  # - pytest
+  # - coverage run --source textract tests/unit_tests.py
+
+after_success:
+  - codecov
+
+notifications:
+  email:
+    on_success: never
+    on_failure: change
+
+#cache:
+#  pip: true
diff --git a/LICENSE b/LICENSE
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,3 @@
+include README.rst LICENSE tox.ini
+include tests/unit_tests.py
+graft tests/cache/
diff --git a/README.rst b/README.rst
@@ -0,0 +1,60 @@
+textract: ...
+==============================================
+
+.. image:: https://img.shields.io/pypi/v/textract.svg
+    :target: https://pypi.python.org/pypi/textract
+
+.. image:: https://img.shields.io/pypi/l/textract.svg
+    :target: https://pypi.python.org/pypi/textract
+
+.. image:: https://img.shields.io/pypi/pyversions/textract.svg
+    :target: https://pypi.python.org/pypi/textract
+
+.. image:: https://img.shields.io/travis/adbar/textract.svg
+    :target: https://travis-ci.org/adbar/textract
+
+.. image:: https://img.shields.io/codecov/c/github/adbar/textract.svg
+    :target: https://codecov.io/gh/adbar/textract
+
+
+Description here.
+
+
+.. contents:: **Contents**
+    :backlinks: none
+
+
+Features
+--------
+
+
+Installation
+------------
+
+
+With Python
+-----------
+
+
+Additional information
+----------------------
+
+Context
+~~~~~~~
+
+This module is part of a set of methods to derive metadata from web documents in order to build text corpora for computational linguistics and NLP analysis. For more information:
+
+-  Barbaresi, Adrien. "`Efficient construction of metadata-enhanced web corpora <https://hal.archives-ouvertes.fr/hal-01348706/document>`_", Proceedings of the `10th Web as Corpus Workshop (WAC-X) <https://www.sigwac.org.uk/wiki/WAC-X>`_, 2016.
+
+Kudos to...
+~~~~~~~~~~~
+
+-  `lxml <http://lxml.de/>`_
+
+
+Contact
+~~~~~~~
+
+Pull requests are welcome.
+
+See my `contact page <http://adrien.barbaresi.eu/contact.html>`_ for additional details.
diff --git a/setup.py b/setup.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Seamlessly extract the date of web pages based on header or body.
+http://github.com/adbar/textract
+"""
+
+from codecs import open # python2
+import os
+from setuptools import setup # find_packages,
+
+#try:
+#    from setuptools import setup
+#except ImportError:
+#    from distutils.core import setup
+
+
+here = os.path.abspath(os.path.dirname(__file__))
+packages = ['textract']
+
+
+def readme():
+    with open(os.path.join(here, 'README.rst'), 'r', 'utf-8') as readmefile:
+        return readmefile.read()
+
+setup(
+    name='textract',
+    version='0.0.1',
+    description='',
+    long_description=readme(),
+    classifiers=[
+        # As from http://pypi.python.org/pypi?%3Aaction=list_classifiers
+        'Development Status :: 3 - Alpha',
+        #'Development Status :: 4 - Beta',
+        #'Development Status :: 5 - Production/Stable',
+        #'Development Status :: 6 - Mature',
+        #'Development Status :: 7 - Inactive',
+        'Intended Audience :: Developers',
+        'Intended Audience :: Information Technology',
+        'Intended Audience :: Science/Research',
+        'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)',
+        'Programming Language :: Python',
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.4',
+        'Programming Language :: Python :: 3.5',
+        'Programming Language :: Python :: 3.6',
+        'Programming Language :: Python :: 3.7',
+        'Topic :: Software Development :: Libraries :: Python Modules',
+        'Topic :: Scientific/Engineering',
+        'Topic :: Scientific/Engineering :: Information Analysis',
+        'Topic :: Text Processing :: Linguistic',
+        'Topic :: Text Processing :: Markup :: HTML',
+    ],
+    keywords=['entity-extraction', 'html-extraction', 'html-parsing', 'metadata-extraction',  'webarchives', 'web-scraping'],
+    url='http://github.com/adbar/textract',
+    author='Adrien Barbaresi',
+    author_email='[email protected]',
+    license='GPLv3+',
+    packages=packages,
+    include_package_data=True,
+    install_requires=[
+        'lxml == 4.3.0', # CPython parser issue with version 4.3.1
+        # 'requests >= 2.19.0',
+    ],
+    # python_requires='>=3',
+    #entry_points = {
+    #    'console_scripts': ['htmldate=htmldate.cli:main'],
+    #},
+    # platforms='any',
+    tests_require=['pytest', 'tox'],
+    zip_safe=False,
+)
diff --git a/tests/__init__.py b/tests/__init__.py
@@ -0,0 +1,17 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+
+import os
+
+
+import textract
+import pytest # unittest?
+
+
+
+
+
+
+
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
@@ -0,0 +1,32 @@
+# -*- coding: utf-8 -*-
+"""
+Unit tests for the textract library.
+"""
+
+import logging
+import os
+import sys
+# https://docs.pytest.org/en/latest/
+
+import textract
+
+logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+
+
+MOCK_PAGES = { \
+'http://blog.python.org/2016/12/python-360-is-now-available.html': 'blog.python.org.html', \
+}
+# 
+
+
+TEST_DIR = os.path.abspath(os.path.dirname(__file__))
+
+def load_mock_page(url):
+    '''load mock page from samples'''
+    with open(os.path.join(TEST_DIR, 'cache', MOCK_PAGES[url]), 'r') as inputf:
+        htmlstring = inputf.read()
+    return htmlstring
+
+
+if __name__ == '__main__':
+    load_mock_page()
diff --git a/textract/__init__.py b/textract/__init__.py
@@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+"""
+Extract the date of web pages, or web archeology in practice.
+"""
+
+## meta
+__title__ = 'textract'
+__author__ = 'Adrien Barbaresi'
+__license__ = 'GNU GPL v3'
+__copyright__ = 'Copyright 2019, Adrien Barbaresi'
+__version__ = '0.0.1'
+
+## imports
+from .core import *
+
+## logging best practices
+# http://docs.python-guide.org/en/latest/writing/logging/
+# https://github.com/requests/requests/blob/master/requests/__init__.py
+
+import logging
+try:  # Python 2.7+
+    from logging import NullHandler
+except ImportError:
+    class NullHandler(logging.Handler):
+        def emit(self, record):
+            pass
+logging.getLogger(__name__).addHandler(NullHandler())
diff --git a/textract/core.py b/textract/core.py
@@ -0,0 +1,63 @@
+# -*- coding: utf-8 -*-
+"""
+Module bundling all functions needed to determine the date of HTML strings or LXML trees.
+"""
+
+## This file is available from https://github.com/adbar/textract
+## under GNU GPL v3 license
+
+# compatibility
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+# from future import standard_library
+# standard_library.install_aliases()
+
+# standard
+import datetime
+import logging
+import re
+import time
+
+# from codecs import open
+from collections import Counter
+
+try:
+    from cStringIO import StringIO # Python 2
+except ImportError:
+    from io import StringIO # Python 3
+
+# third-party
+from lxml import etree, html
+from lxml.html.clean import Cleaner
+
+
+## TODO:
+# ...
+
+
+
+## INIT
+logger = logging.getLogger(__name__)
+
+EXPRESSIONS = [
+    "//*[contains(@class, 'date') or contains(@class, 'Date') or contains(@class, 'datum') or contains(@class, 'Datum')]",
+]
+# "//*[contains(@class, 'fa-clock-o')]",
+
+
+cleaner = Cleaner()
+cleaner.comments = True
+cleaner.embedded = True
+cleaner.forms = False
+cleaner.frames = True
+cleaner.javascript = False
+cleaner.links = False
+cleaner.meta = False
+cleaner.page_structure = True
+cleaner.processing_instructions = True
+cleaner.remove_unknown_tags = False
+cleaner.safe_attrs_only = False
+cleaner.scripts = False
+cleaner.style = False
+cleaner.kill_tags = ['audio', 'canvas', 'label', 'map', 'math', 'object', 'picture', 'rdf', 'svg', 'table', 'video']
+# 'embed', 'figure', 'img',
diff --git a/tox.ini b/tox.ini
@@ -0,0 +1,23 @@
+[tox]
+envlist =
+    py34, py35, py36, py37, jython, pypy, pypy3, flake8
+skip_missing_interpreters =
+    true
+
+[testenv]
+# Measure coverage of the package shipped by this repository (textract),
+# matching the "source" setting in .coveragerc.
+commands=py.test --cov textract {posargs}
+# py.test --cov-report term-missing --cov=myproj tests/
+deps=
+    pytest
+    pytest-cov
+
+;[testenv:flake8]
+;basepython = python3.5 # python3.4
+;deps =
+;    flake8
+;commands =
+;    flake8 htmldate tests --max-line-length=120
+
+[pytest]
+python_files = tests/*test*.py
+norecursedirs = .tox