forked from ckan/ckanext-xloader
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request ckan#58 from qld-gov-au/develop
Develop to master - fix overly aggressive date parsing
- Loading branch information
Showing
6 changed files
with
95 additions
and
159 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,161 +1,71 @@ | ||
# -*- coding: utf-8 -*- | ||
import csv | ||
import datetime | ||
from decimal import Decimal, InvalidOperation | ||
from itertools import chain | ||
import re | ||
import six | ||
|
||
from ckan.plugins.toolkit import asbool | ||
from dateutil.parser import isoparser, parser | ||
from dateutil.parser import ParserError | ||
|
||
from tabulator import helpers | ||
from tabulator.parser import Parser | ||
from dateutil.parser import isoparser, parser, ParserError | ||
|
||
from ckan.plugins.toolkit import config | ||
|
||
# Maximum number of lines collected from the stream as the sample that is
# handed to the CSV dialect sniffer.
CSV_SAMPLE_LINES = 1000
# Cheap pre-filter used before attempting timestamp parsing: the value must
# start with 1-4 digits, followed by a separator (-, /, . or whitespace),
# a run of non-whitespace, another separator and more non-whitespace.
DATE_REGEX = re.compile(r'''^\d{1,4}[-/.\s]\S+[-/.\s]\S+''')
|
||
|
||
class XloaderCSVParser(Parser): | ||
"""Extends tabulator CSVParser to detect datetime and numeric values. | ||
class TypeConverter: | ||
""" Post-process table cells to convert strings into numbers and timestamps | ||
as desired. | ||
""" | ||
|
||
# Public | ||
|
||
    # Names of the csv dialect settings accepted as keyword options.
    # NOTE(review): presumably read by the tabulator Parser machinery to
    # decide which keyword arguments reach this parser -- confirm upstream.
    options = [
        'delimiter',
        'doublequote',
        'escapechar',
        'quotechar',
        'quoting',
        'skipinitialspace',
        'lineterminator'
    ]
|
||
def __init__(self, loader, force_parse=False, **options): | ||
super(XloaderCSVParser, self).__init__(loader, force_parse, **options) | ||
# Set attributes | ||
self.__loader = loader | ||
self.__options = options | ||
self.__force_parse = force_parse | ||
self.__extended_rows = None | ||
self.__encoding = None | ||
self.__dialect = None | ||
self.__chars = None | ||
|
||
@property | ||
def closed(self): | ||
return self.__chars is None or self.__chars.closed | ||
|
||
def open(self, source, encoding=None): | ||
# Close the character stream, if necessary, before reloading it. | ||
self.close() | ||
self.__chars = self.__loader.load(source, encoding=encoding) | ||
self.__encoding = getattr(self.__chars, 'encoding', encoding) | ||
if self.__encoding: | ||
self.__encoding.lower() | ||
self.reset() | ||
|
||
def close(self): | ||
if not self.closed: | ||
self.__chars.close() | ||
|
||
    def reset(self):
        """Reset the character stream and create a fresh row generator."""
        # NOTE(review): tabulator's reset_stream presumably rewinds the
        # stream to its start -- confirm against tabulator.helpers.
        helpers.reset_stream(self.__chars)
        # A new generator is needed so iteration starts over at row 1.
        self.__extended_rows = self.__iter_extended_rows()
|
||
    @property
    def encoding(self):
        """Encoding recorded by ``open``; ``None`` until a source is opened."""
        return self.__encoding
|
||
@property | ||
def dialect(self): | ||
if self.__dialect: | ||
dialect = { | ||
'delimiter': self.__dialect.delimiter, | ||
'doubleQuote': self.__dialect.doublequote, | ||
'lineTerminator': self.__dialect.lineterminator, | ||
'quoteChar': self.__dialect.quotechar, | ||
'skipInitialSpace': self.__dialect.skipinitialspace, | ||
} | ||
if self.__dialect.escapechar is not None: | ||
dialect['escapeChar'] = self.__dialect.escapechar | ||
return dialect | ||
|
||
    @property
    def extended_rows(self):
        """Row generator created by the last ``reset``; ``None`` before that."""
        return self.__extended_rows
|
||
# Private | ||
|
||
def __iter_extended_rows(self): | ||
|
||
def type_value(value): | ||
"""Returns numeric values as Decimal(). Uses dateutil to parse | ||
date values. Otherwise, returns values as it receives them | ||
(strings). | ||
""" | ||
if value in ('', None): | ||
return '' | ||
|
||
try: | ||
return Decimal(value) | ||
except InvalidOperation: | ||
pass | ||
|
||
try: | ||
i = isoparser() | ||
return i.isoparse(value) | ||
except ValueError: | ||
pass | ||
|
||
try: | ||
p = parser() | ||
yearfirst = asbool(config.get( | ||
'ckanext.xloader.parse_dates_yearfirst', False)) | ||
dayfirst = asbool(config.get( | ||
'ckanext.xloader.parse_dates_dayfirst', False)) | ||
return p.parse(value, yearfirst=yearfirst, dayfirst=dayfirst) | ||
except ParserError: | ||
pass | ||
|
||
return value | ||
|
||
sample, dialect = self.__prepare_dialect(self.__chars) | ||
items = csv.reader(chain(sample, self.__chars), dialect=dialect) | ||
for row_number, item in enumerate(items, start=1): | ||
values = [] | ||
for value in item: | ||
value = type_value(value) | ||
values.append(value) | ||
yield row_number, None, list(values) | ||
|
||
def __prepare_dialect(self, stream): | ||
|
||
# Get sample | ||
sample = [] | ||
while True: | ||
try: | ||
sample.append(next(stream)) | ||
except StopIteration: | ||
break | ||
if len(sample) >= CSV_SAMPLE_LINES: | ||
break | ||
|
||
# Get dialect | ||
    def __init__(self, types=None):
        """Create a converter.

        :param types: optional sequence indexed by column position giving the
            target type of each column (``Decimal``, ``datetime.datetime``,
            or any other value to leave the column untouched). When empty or
            ``None`` the type of every cell is auto-detected.
        """
        self.types = types
|
||
def convert_types(self, extended_rows): | ||
""" Try converting cells to numbers or timestamps if applicable. | ||
If a list of types was supplied, use that. | ||
If not, then try converting each column to numeric first, | ||
then to a timestamp. If both fail, just keep it as a string. | ||
""" | ||
for row_number, headers, row in extended_rows: | ||
for cell_index, cell_value in enumerate(row): | ||
if cell_value is None: | ||
row[cell_index] = '' | ||
if not cell_value: | ||
continue | ||
cell_type = self.types[cell_index] if self.types else None | ||
if cell_type in [Decimal, None]: | ||
converted_value = to_number(cell_value) | ||
if converted_value: | ||
row[cell_index] = converted_value | ||
continue | ||
if cell_type in [datetime.datetime, None]: | ||
converted_value = to_timestamp(cell_value) | ||
if converted_value: | ||
row[cell_index] = converted_value | ||
yield (row_number, headers, row) | ||
|
||
|
||
def to_number(value):
    """Return ``value`` as a ``Decimal``, or ``None`` if that is not possible.

    Only string input is considered; any other type yields ``None``, as does
    a string that ``Decimal`` rejects.
    """
    if isinstance(value, six.string_types):
        try:
            return Decimal(value)
        except InvalidOperation:
            pass
    return None
|
||
|
||
def to_timestamp(value): | ||
if not isinstance(value, six.string_types) or not DATE_REGEX.search(value): | ||
return None | ||
try: | ||
i = isoparser() | ||
return i.isoparse(value) | ||
except ValueError: | ||
try: | ||
separator = '' | ||
delimiter = self.__options.get('delimiter', ',\t;|') | ||
dialect = csv.Sniffer().sniff(separator.join(sample), delimiter) | ||
if not dialect.escapechar: | ||
dialect.doublequote = True | ||
except csv.Error: | ||
class dialect(csv.excel): | ||
pass | ||
for key, value in self.__options.items(): | ||
setattr(dialect, key, value) | ||
# https://github.com/frictionlessdata/FrictionlessDarwinCore/issues/1 | ||
if getattr(dialect, 'quotechar', None) == '': | ||
setattr(dialect, 'quoting', csv.QUOTE_NONE) | ||
|
||
self.__dialect = dialect | ||
return sample, dialect | ||
p = parser() | ||
yearfirst = asbool(config.get('ckanext.xloader.parse_dates_yearfirst', False)) | ||
dayfirst = asbool(config.get('ckanext.xloader.parse_dates_dayfirst', False)) | ||
return p.parse(value, yearfirst=yearfirst, dayfirst=dayfirst) | ||
except ParserError: | ||
return None |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
Title,Postal postcode,Latitude,Longitude,Mon am,Mon pm,Last updated | ||
Adavale,4474,-25.9092582,144.5975769,8:00,16:00,19/07/2018 | ||
Aramac,4726,-22.971298,145.241481,9:00-13:00,14:00-16:45,17/07/2018 | ||
Barcaldine,4725,-23.55327901,145.289156,9:00-12:30,13:30-16:30,20/07/2018 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters