Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar committed Apr 4, 2019
0 parents commit 8ece7fb
Show file tree
Hide file tree
Showing 12 changed files with 370 additions and 0 deletions.
14 changes: 14 additions & 0 deletions .coveragerc
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
[run]
source = textract

omit =
tests/*
setup.py

[report]
exclude_lines =
if __name__ == .__main__.:
except .*etree.*:
except ImportError:
except .*requests.exceptions.*:
except UnboundLocalError:
16 changes: 16 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Compiled python modules.
*.pyc

# logs
*.log
*.txt

# packaging
dist/
build/
*.egg-info/

# tests
.tox/
.cache/
.coverage
43 changes: 43 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
sudo: false
language: python
dist: xenial

matrix:
include:
- python: "3.4"
env: TOXENV=py34
- python: "3.5"
env: TOXENV=py35
- python: "3.6"
env: TOXENV=py36
- python: "3.7"
env: TOXENV=py37
- python: "3.8-dev"
env: TOXENV=py38
allow_failures:
- python: "3.8-dev"
env: TOXENV=py38
# fast_finish: true

before_install:
- pip install lxml
- pip install codecov tox

install:
- pip install .

script:
- tox
# - pytest
# - coverage run --source htmldate tests/unit_tests.py

after_success:
- codecov

notifications:
email:
on_success: never
on_failure: change

#cache:
# pip: true
Empty file added LICENSE
Empty file.
3 changes: 3 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
include README.rst LICENSE tox.ini
include tests/unit_tests.py
graft tests/cache/
60 changes: 60 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
textract: ...
==============================================

.. image:: https://img.shields.io/pypi/v/textract.svg
:target: https://pypi.python.org/pypi/textract

.. image:: https://img.shields.io/pypi/l/textract.svg
:target: https://pypi.python.org/pypi/textract

.. image:: https://img.shields.io/pypi/pyversions/textract.svg
:target: https://pypi.python.org/pypi/textract

.. image:: https://img.shields.io/travis/adbar/textract.svg
:target: https://travis-ci.org/adbar/textract

.. image:: https://img.shields.io/codecov/c/github/adbar/textract.svg
:target: https://codecov.io/gh/adbar/textract


Description here.


.. contents:: **Contents**
:backlinks: none


Features
--------


Installation
------------


With Python
-----------


Additional information
----------------------

Context
~~~~~~~

This module is part of a set of methods for deriving metadata from web documents in order to build text corpora for computational linguistics and NLP analysis. For more information:

- Barbaresi, Adrien. "`Efficient construction of metadata-enhanced web corpora <https://hal.archives-ouvertes.fr/hal-01348706/document>`_", Proceedings of the `10th Web as Corpus Workshop (WAC-X) <https://www.sigwac.org.uk/wiki/WAC-X>`_, 2016.

Kudos to...
~~~~~~~~~~~

- `lxml <http://lxml.de/>`_


Contact
~~~~~~~

Pull requests are welcome.

See my `contact page <http://adrien.barbaresi.eu/contact.html>`_ for additional details.
72 changes: 72 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Packaging script for the textract distribution.
http://github.com/adbar/textract
"""

# codecs.open accepts an encoding argument, which the Python 2 built-in
# open() lacks; note that this import shadows the built-in open() below.
from codecs import open # python2
import os
from setuptools import setup # find_packages,

#try:
# from setuptools import setup
#except ImportError:
# from distutils.core import setup


# Absolute path of the directory that contains this script.
here = os.path.abspath(os.path.dirname(__file__))
# Packages shipped in the distribution (passed to setup() as ``packages``).
packages = ['textract']


def readme():
    """Return the content of README.rst as text.

    The file is looked up in ``here`` (the directory of this script) and
    decoded as UTF-8.

    The encoding is passed by keyword so that the call is valid both for
    ``codecs.open`` (imported at the top of this file) and for the Python 3
    built-in ``open``, whose third positional parameter is ``buffering``
    rather than the encoding.
    """
    readme_path = os.path.join(here, 'README.rst')
    with open(readme_path, 'r', encoding='utf-8') as readmefile:
        return readmefile.read()

# Distribution metadata and build options handed to setuptools.
setup(
    name='textract',
    version='0.0.1',
    description='',  # TODO: still empty, add a one-line summary
    long_description=readme(),  # full text of README.rst
    classifiers=[
        # As from http://pypi.python.org/pypi?%3Aaction=list_classifiers
        'Development Status :: 3 - Alpha',
        #'Development Status :: 4 - Beta',
        #'Development Status :: 5 - Production/Stable',
        #'Development Status :: 6 - Mature',
        #'Development Status :: 7 - Inactive',
        'Intended Audience :: Developers',
        'Intended Audience :: Information Technology',
        'Intended Audience :: Science/Research',
        'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)',
        'Programming Language :: Python',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Topic :: Software Development :: Libraries :: Python Modules',
        'Topic :: Scientific/Engineering',
        'Topic :: Scientific/Engineering :: Information Analysis',
        'Topic :: Text Processing :: Linguistic',
        'Topic :: Text Processing :: Markup :: HTML',
    ],
    keywords=['entity-extraction', 'html-extraction', 'html-parsing', 'metadata-extraction', 'webarchives', 'web-scraping'],
    url='http://github.com/adbar/textract',
    author='Adrien Barbaresi',
    author_email='[email protected]',
    license='GPLv3+',
    packages=packages,  # the module-level list defined above
    include_package_data=True,
    install_requires=[
        # Exact pin on purpose, see the note on 4.3.1.
        'lxml == 4.3.0', # CPython parser issue with version 4.3.1
        # 'requests >= 2.19.0',
    ],
    # python_requires='>=3',
    # NOTE(review): the disabled entry point below still names "htmldate";
    # adjust it to this package before enabling it.
    #entry_points = {
    # 'console_scripts': ['htmldate=htmldate.cli:main'],
    #},
    # platforms='any',
    tests_require=['pytest', 'tox'],
    zip_safe=False,
)
17 changes: 17 additions & 0 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-

from __future__ import absolute_import, division, print_function, unicode_literals


import os


import textract
import pytest # unittest?







32 changes: 32 additions & 0 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# -*- coding: utf-8 -*-
"""
Unit tests for the textract library.
"""

import logging
import os
import sys
# https://docs.pytest.org/en/latest/

import textract

# Print log records of level DEBUG and above to stdout while the tests run.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)


# Maps the URL of each sample page to the file name of its saved copy in the
# ``cache`` directory located next to this module.
MOCK_PAGES = {
    'http://blog.python.org/2016/12/python-360-is-now-available.html': 'blog.python.org.html',
}


# Absolute path of the directory that contains this test module.
TEST_DIR = os.path.abspath(os.path.dirname(__file__))


def load_mock_page(url):
    """Return the saved HTML of a sample page as a string.

    Args:
        url: a key of ``MOCK_PAGES``.

    Returns:
        The content of the matching file in ``TEST_DIR/cache``.

    Raises:
        KeyError: if ``url`` is not listed in ``MOCK_PAGES``.
    """
    sample_path = os.path.join(TEST_DIR, 'cache', MOCK_PAGES[url])
    with open(sample_path, 'r') as inputf:
        return inputf.read()


if __name__ == '__main__':
    # load_mock_page() requires a URL: load every registered sample so that
    # running this file directly checks that all cached pages are readable.
    for mock_url in MOCK_PAGES:
        load_mock_page(mock_url)
27 changes: 27 additions & 0 deletions textract/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-
"""
Extract the date of web pages, or web archeology in practice.
"""

## meta
# Package metadata exposed as module attributes (e.g. ``textract.__version__``).
__title__ = 'textract'
__author__ = 'Adrien Barbaresi'
__license__ = 'GNU GPL v3'
__copyright__ = 'Copyright 2019, Adrien Barbaresi'
# Same value as ``version`` in setup.py; the two are maintained by hand.
__version__ = '0.0.1'

## imports
# Re-export the public names of the core module at package level.
from .core import *

## logging best practices
# http://docs.python-guide.org/en/latest/writing/logging/
# https://github.com/requests/requests/blob/master/requests/__init__.py
# Attach a handler that does nothing to the logger named after this package.

import logging

if hasattr(logging, 'NullHandler'):  # Python 2.7+
    NullHandler = logging.NullHandler
else:
    class NullHandler(logging.Handler):
        """Fallback handler that discards every record it receives."""

        def emit(self, record):
            """Ignore ``record``."""

logging.getLogger(__name__).addHandler(NullHandler())
63 changes: 63 additions & 0 deletions textract/core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# -*- coding: utf-8 -*-
"""
Module bundling all functions needed to determine the date of HTML strings or LXML trees.
"""

## This file is available from https://github.com/adbar/textract
## under GNU GPL v3 license

# compatibility
from __future__ import absolute_import, division, print_function, unicode_literals

# from future import standard_library
# standard_library.install_aliases()

# standard
import datetime
import logging
import re
import time

# from codecs import open
from collections import Counter

try:
from cStringIO import StringIO # Python 2
except ImportError:
from io import StringIO # Python 3

# third-party
from lxml import etree, html
from lxml.html.clean import Cleaner


## TODO:
# ...



## INIT
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# XPath expressions selecting every element whose ``class`` attribute contains
# 'date' or 'datum', in lower-case or capitalized form.
EXPRESSIONS = [
    "//*[contains(@class, 'date') or contains(@class, 'Date') or contains(@class, 'datum') or contains(@class, 'Datum')]",
]
# "//*[contains(@class, 'fa-clock-o')]",


# Shared lxml Cleaner instance, configured once at import time.
# Per the lxml.html.clean documentation, an option set to True removes the
# corresponding content and an option set to False leaves it in place.
cleaner = Cleaner()
cleaner.comments = True
cleaner.embedded = True
cleaner.forms = False
cleaner.frames = True
cleaner.javascript = False
cleaner.links = False
cleaner.meta = False
cleaner.page_structure = True
cleaner.processing_instructions = True
cleaner.remove_unknown_tags = False
cleaner.safe_attrs_only = False
cleaner.scripts = False
cleaner.style = False
# Per the lxml documentation, elements listed in kill_tags are dropped
# together with their whole content.
cleaner.kill_tags = ['audio', 'canvas', 'label', 'map', 'math', 'object', 'picture', 'rdf', 'svg', 'table', 'video']
# 'embed', 'figure', 'img',
23 changes: 23 additions & 0 deletions tox.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
[tox]
envlist =
py34, py35, py36, py37, jython, pypy, pypy3, flake8
skip_missing_interpreters =
true

[testenv]
# Measure coverage of the package shipped by this repository (textract),
# consistent with "source = textract" in .coveragerc.
commands=py.test --cov textract {posargs}
# py.test --cov-report term-missing --cov=myproj tests/
deps=
    pytest
    pytest-cov

;[testenv:flake8]
;basepython = python3.5 # python3.4
;deps =
; flake8
;commands =
; flake8 htmldate tests --max-line-length=120

[pytest]
python_files = tests/*test*.py
norecursedirs = .tox

0 comments on commit 8ece7fb

Please sign in to comment.