-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcleancorp.py
154 lines (120 loc) · 4.92 KB
/
cleancorp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import re, string, collections, functools, unicodedata
from termdata import terms_by_country, terms_by_type, terms_by_industry
def build_term_set(input_dic):
    """
    Flatten a {key: [terms]} mapping into (key, term) pairs, longest term first.

    @input: {'key' : [term_list], }
    @return: [(key, term), ] sorted by descending term length; pairs whose
             terms have the same length keep the mapping's iteration order
    """
    pairs = [(key, term) for key, terms in input_dic.items() for term in terms]
    # Longest terms first so callers that walk the list try the most specific
    # term before any shorter term; the sort is stable.
    pairs.sort(key=lambda pair: len(pair[1]), reverse=True)
    return pairs
# (business type, abbreviation) pairs
SORTED_TYPES = build_term_set(terms_by_type)
# (country, abbreviation) pairs
SORTED_COUNTRIES = build_term_set(terms_by_country)
# (industry, term) pairs
SORTED_INDUSTRIES = build_term_set(terms_by_industry)

# Business-type and country abbreviations only (keys dropped), re-sorted so
# the longest abbreviation comes first across both tables
all_sorted = [*SORTED_TYPES, *SORTED_COUNTRIES]
all_suffix = [pair[1] for pair in all_sorted]
SORTED_SUFFIX = sorted(all_suffix, key=len, reverse=True)
# Every known term: the suffixes above followed by the industry terms
ALL_TERMS = [*SORTED_SUFFIX, *(pair[1] for pair in SORTED_INDUSTRIES)]
class CleanCorp:
    """
    Classify and clean a business name using known term tables.

    The derived attributes (clean_name, entity_type, country, industry) are
    computed on first access and then cached on the instance, so they do not
    follow later changes to original_name.
    """

    def __init__(self, business_name):
        self.original_name = business_name

    # functools.cached_property (Python 3.8+) stores the result on the
    # instance itself. The previous property + lru_cache stacking kept a
    # class-wide cache keyed on self, which held instances alive.
    @functools.cached_property
    def _sanitized_name(self):
        """ Sanitize the original string """
        # Commas (ASCII and full-width U+FF0C) act as word separators
        sanitized = self.original_name.replace(',', ' ').replace(u"\uFF0C", ' ')
        # Collapse every whitespace run to a single space and lowercase
        sanitized = " ".join(sanitized.split()).lower()
        # Drop the trailing run of characters that are neither word
        # characters nor '.' (the dot is kept for terms such as "inc.")
        return re.sub(r'[^\.\w]+$', '', sanitized, flags=re.UNICODE)

    @staticmethod
    def _has_term(padded_name, term):
        """
        True if term occurs in padded_name as a run of whole words.

        @input: padded_name is the single-space-separated name with one extra
                space on each side, so a term at the very start or end is
                delimited the same way as one in the middle
        """
        return bool(term) and f' {term} ' in padded_name

    def _match_terms(self, term_set):
        """
        Match terms found in business_name to corresponding keys
        @input: [(key, term), ]
        @return: [key, ] de-duplicated and sorted in descending order,
                 or None when no term matches
        """
        # Pad once, outside the loop; simple terms (ltd., a.g., ...) and
        # composite terms (pty ltd, s de rl, ...) are matched the same way,
        # on word boundaries, so "pty ltd" is not found inside "empty ltd".
        padded = f' {self._sanitized_name} '
        found_keys = {key for key, term in term_set
                      if self._has_term(padded, term)}
        return sorted(found_keys, reverse=True) or None

    def _remove_terms(self, term_list):
        """ Returns business_name without terms in term_list """
        padded = f' {self._sanitized_name} '
        for term in term_list:
            if not term:
                continue
            needle = f' {term} '
            # Only whole-word occurrences are removed, so stripping "co"
            # leaves "costco" intact. Adjacent occurrences share a delimiting
            # space, which one replace() pass cannot consume twice, hence the
            # loop; each pass shortens the string, so it terminates.
            while needle in padded:
                padded = padded.replace(needle, ' ')
        return " ".join(padded.split())

    @functools.cached_property
    def clean_name(self):
        """ Clean company terms (ltd, corp, ...) from the business_name """
        return self._remove_terms(SORTED_SUFFIX)

    @functools.cached_property
    def entity_type(self):
        """
        Probable type of business entity
        @return: [key, ] of matched SORTED_TYPES terms, the string 'Unknown',
                 or None
        """
        _type = self._match_terms(SORTED_TYPES)
        # if the entity type is not resolved but an industry term is found
        # the entity type is 'Unknown'
        if not _type and self.industry:
            _type = 'Unknown'
        return _type

    @functools.cached_property
    def country(self):
        """ Probable country of incorporation: [key, ] or None """
        return self._match_terms(SORTED_COUNTRIES)

    @functools.cached_property
    def industry(self):
        """ Probable industry: [key, ] or None """
        industry = self._match_terms(SORTED_INDUSTRIES)
        # If other industry terms are found, remove 'Unknown'
        if industry and len(industry) > 1 and 'Unknown' in industry:
            industry.remove('Unknown')
        return industry

    def is_company(self):
        """ True if entity_type resolved to anything, including 'Unknown' """
        return bool(self.entity_type)

    def as_dict(self):
        """ Returns a dictionary of all the attributes """
        return {
            'is_company': self.is_company(),
            'original_name': self.original_name,
            'clean_name': self.clean_name,
            'entity_type': self.entity_type,
            'industry': self.industry,
            'country': self.country,
        }

    def __repr__(self):
        """ Format the object's print: CleanCorp([original_name]) """
        return f'CleanCorp([{self.original_name}])'