Skip to content

Commit

Permalink
Prioritize numbers next to currencies
Browse files Browse the repository at this point in the history
  • Loading branch information
Gallaecio committed May 22, 2019
1 parent 4d9c393 commit 42a3bcf
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 18 deletions.
68 changes: 53 additions & 15 deletions price_parser/parser.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
import re
import string
from typing import Callable, Optional, Pattern, List, Tuple
from typing import Callable, Match, Optional, Pattern, List, Tuple
from decimal import Decimal, InvalidOperation

import attr
Expand Down Expand Up @@ -36,11 +36,11 @@ def fromstring(cls, price: Optional[str],
``price`` string, it could be **preferred** over a value extracted
from ``currency_hint`` string.
"""
amount_text = extract_price_text(price) if price is not None else None
currency, source = _extract_currency_symbol(price, currency_hint)
amount_text = extract_price_text(price, currency if source == price else None) if price is not None else None
amount_num = parse_number(amount_text) if amount_text is not None else None
currency = extract_currency_symbol(price, currency_hint)
if currency is not None:
currency = currency.strip()
currency = currency.group(0).strip()
return Price(
amount=amount_num,
currency=currency,
Expand Down Expand Up @@ -120,11 +120,11 @@ def or_regex(symbols: List[str]) -> Pattern:
_search_unsafe_currency = or_regex(OTHER_CURRENCY_SYMBOLS).search


def extract_currency_symbol(price: Optional[str],
currency_hint: Optional[str]) -> Optional[str]:
def _extract_currency_symbol(price: Optional[str], currency_hint: Optional[str]) -> Tuple[Optional[Match], Optional[str]]:
"""
Guess currency symbol from extracted price and currency strings.
Return an empty string if symbol is not found.
Guess the currency symbol from extracted price and currency strings.
Return a (`match object`_, source_string) tuple with the symbol found and
the string where it was found, or (None, None) if no symbol is found.
"""
methods: List[Tuple[Callable, Optional[str]]] = [
(_search_safe_currency, price),
Expand All @@ -142,17 +142,32 @@ def extract_currency_symbol(price: Optional[str],
for meth, attr in methods:
m = meth(attr) if attr else None
if m:
return m.group(0)
return m, attr

return None, None


def extract_currency_symbol(price: Optional[str],
                            currency_hint: Optional[str]) -> Optional[str]:
    """
    Guess the currency symbol from the price and currency hint strings.

    Return the matched symbol text exactly as it was found, or None when
    no symbol is found.
    """
    # The private helper returns a (match, source_string) tuple; only the
    # match is needed here, so the source string is discarded.
    found, _source = _extract_currency_symbol(price, currency_hint)
    return found.group(0) if found else None


def extract_price_text(price: str) -> Optional[str]:
def extract_price_text(price: str, currency_match: Optional[Match] = None) -> Optional[str]:
"""
Extract text of a price from a string which contains price and
maybe some other text. If multiple price-looking substrings are present,
the first is returned (FIXME: it is better to return a number
which is near a currency symbol).
maybe some other text.
If a match object of the currency within the `price` string is provided,
amounts before or after the matched currency substring are prioritized.
Otherwise, if multiple price-looking substrings are present, the first is
returned.
>>> extract_price_text("price: $12.99")
'12.99'
Expand Down Expand Up @@ -189,16 +204,39 @@ def extract_price_text(price: str) -> Optional[str]:
""", price, re.VERBOSE)
if m:
return m.group(0).replace(' ', '')

def number_from_match(m):
return m.group(1).strip(',.').strip()

if currency_match is not None:

m = re.search(r"""
(\d[\d\s.,]*) # number, probably with thousand separators
\s*$ # only match right before the currency symbol
""", price[:currency_match.start(0)], re.VERBOSE)
if m:
return number_from_match(m)

m = re.search(r"""
^\s* # only match right after the currency symbol
(\d[\d\s.,]*) # number, probably with thousand separators
\s* # skip whitespace
(?:[^%\d]|$) # capture next symbol - it shouldn't be %
""", price[currency_match.end(0):], re.VERBOSE)
if m:
return number_from_match(m)

m = re.search(r"""
(\d[\d\s.,]*) # number, probably with thousand separators
\s* # skip whitespace
(?:[^%\d]|$) # capture next symbol - it shouldn't be %
""", price, re.VERBOSE)

if m:
return m.group(1).strip(',.').strip()
return number_from_match(m)

if 'free' in price.lower():
return '0'

return None


Expand Down
6 changes: 3 additions & 3 deletions tests/test_price_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -618,7 +618,7 @@ def __eq__(self, other):
Example('€', '€ 139.00',
'€', '139.00', 139),
Example('There are 163 products.', 'From 26 to 50 €',
'€', '26', 26),
'€', '50', 50),
Example('Pris NOK 1 999,00', '139,00',
'NOK', '139,00', 139),
Example('/sqft', '1.52',
Expand Down Expand Up @@ -1901,13 +1901,13 @@ def __eq__(self, other):
'CHF', '19.90', 19.90),
Example('', '530,42 Zł',
'Zł', '530,42', 530.42),
Example('3 Ausgaben für nur 14,85 EUR', '3 Ausgaben für nur 14,85 EUR',
'EUR', '14,85', 14.85),
]


PRICE_PARSING_EXAMPLES_XFAIL = [
# amount is picked as a price
Example('3 Ausgaben für nur 14,85 EUR', '3 Ausgaben für nur 14,85 EUR',
'EUR', '14,85', 14.85),
Example(None, 'Buy Now - 2 Litre Was $120.00 Now $60.00',
'$', '60.00', 60),
Example('Цена: уточняйте (мин. заказ: 1 )', 'Цена: уточняйте (мин. заказ: 1 )',
Expand Down

0 comments on commit 42a3bcf

Please sign in to comment.