# normalize_company.py
import re
from thefuzz import fuzz
from typing import Dict, List, Set
def load_company_mappings() -> Dict[str, str]:
    """Return the predefined variant -> canonical company name table.

    Keys are upper-case description variants and values are the canonical
    company names they stand for. A new dict is built on every call, so
    callers may mutate the result freely.
    """
    variant_to_canonical = (
        ("WM SUPERCENTER", "WALMART"),
        ("WALMART.COM", "WALMART"),
        ("WAL-MART", "WALMART"),
        ("WALMART WALMART", "WALMART"),
        ("WALMART 800", "WALMART"),
        ("SAMSCLUB", "SAMS CLUB"),
        ("HEB ONLINE", "HEB"),
        ("DD DOORDASH", "DOORDASH"),
        ("DOORDASH*", "DOORDASH"),
        ("IC* INSTACART", "INSTACART"),
        ("ATT*", "AT&T"),
        ("AMEX", "AMERICAN EXPRESS"),
        ("PP*", "PAYPAL"),
        ("SQ *", "SQUARE"),
        ("GRUBHUB*", "GRUBHUB"),
        ("UBER *", "UBER"),
        ("APPLE.COM", "APPLE"),
        ("GOOGLE *", "GOOGLE"),
    )
    # dict() keeps the pair order, so insertion order matches the table above.
    return dict(variant_to_canonical)
def should_exact_match(description: str) -> bool:
    """Report whether *description* is a transfer-style entry.

    The check is a case-insensitive substring search for any of the
    transfer markers below; such descriptions are meant to be matched
    exactly rather than fuzzily.
    """
    upper_description = description.upper()
    for marker in ('ZELLE', 'INTERNET TRANSFER', 'TRANSFER TO'):
        if marker in upper_description:
            return True
    return False
def initial_clean(name: str) -> str:
    """Upper-case a raw description and strip known noise from it.

    Falsy input yields an empty string. Transfer-style descriptions (see
    ``should_exact_match``) are only upper-cased and stripped. Everything
    else has the noise patterns below removed in order, whitespace runs
    collapsed to single spaces, and leading/trailing ``*`` and spaces
    trimmed.
    """
    if not name:
        return ""

    cleaned = name.upper().strip()

    # Transfer-type transactions keep their full text.
    if should_exact_match(name):
        return cleaned

    # Applied sequentially: later patterns see the output of earlier ones.
    noise_patterns = (
        r'\s+\d{6,}',
        r'#\d+',
        r'F\d{4,}',
        r'\*[A-Z0-9]+',
        r'\s+PMT\s*$',
        r'\s+RETRY\s+PYMT\s*$',
        r'ACH\s+PMT\s*$',
        r'AUTO\s+PYMT\s*$',
        r'\s+MOBILE\s+PMT\s*$',
        r'\s+ONLINE\s+PMT\s*$',
        r'\b\d{3}-\d{3}-\d{4}\b',
        r'\b\d{1,3}\s*[A-Z\s]+ST[A-Z\s]*\b',
        r'\s+\d{1,5}\s+[A-Z\s]+(?:STREET|ST|AVENUE|AVE|ROAD|RD|DRIVE|DR|LANE|LN|BLVD|PARKWAY|PKY|HWY)\b',
        r'(?<=\s)\d{5}(?:-\d{4})?(?=\s|$)',
        r'WWW\.[A-Z0-9.-]+\.[A-Z]{2,}',
        r'\.COM/?[A-Z]*\s*$',
        r',\s*[A-Z]{2},\s*US[A]?$',
        r',\s*[A-Z]{2}\s*$',
    )
    for noise in noise_patterns:
        cleaned = re.sub(noise, '', cleaned)

    # Collapse whitespace runs, then drop stray asterisks/spaces at the ends.
    return ' '.join(cleaned.split()).strip('* ')
def find_best_match(name: str, known_companies: Set[str], threshold: int = 85) -> str:
    """Return the known company most similar to *name*.

    Args:
        name: Cleaned company name to look up.
        known_companies: Candidate canonical names.
        threshold: Minimum ``fuzz.ratio`` score a candidate must reach.

    Returns:
        The candidate with the highest score at or above *threshold*, or
        *name* unchanged when no candidate qualifies. When several
        candidates tie on the highest score, the one that sorts first wins.
    """
    # An exact hit cannot be beaten, so skip scoring entirely.
    if name in known_companies:
        return name

    best_ratio = 0
    best_match = name
    # Iterate in sorted order: set iteration order for strings can differ
    # between runs, which would otherwise make tie-breaking unpredictable.
    for known in sorted(known_companies):
        ratio = fuzz.ratio(name, known)
        if ratio >= threshold and ratio > best_ratio:
            best_ratio = ratio
            best_match = known
    return best_match
def build_company_groups(descriptions: List[str]) -> Dict[str, str]:
    """Map every cleaned description to a canonical company name.

    Each description is cleaned with ``initial_clean``; empty results are
    dropped. The remaining unique names are resolved in two passes:

    1. Transfer-style names (``should_exact_match``) map to themselves, and
       names starting with a key from ``load_company_mappings`` map to that
       key's canonical value.
    2. The names left over are grouped by fuzzy similarity
       (``fuzz.ratio`` strictly greater than 85); every member of a group
       maps to the shortest name in it.

    Running the predefined mappings first means a name with a predefined
    mapping can never be absorbed into a fuzzy group, and iterating in
    sorted order makes the grouping reproducible between runs.

    Args:
        descriptions: Raw transaction descriptions.

    Returns:
        Dict from cleaned name to canonical name.
    """
    if not descriptions:
        return {}

    # Sorted so results do not depend on set iteration order.
    unique_companies = sorted({initial_clean(desc) for desc in descriptions} - {""})
    base_mappings = load_company_mappings()

    mapping: Dict[str, str] = {}
    fuzzy_candidates: List[str] = []

    # Pass 1: exact-match transfers and predefined prefix mappings.
    for name in unique_companies:
        if should_exact_match(name):
            mapping[name] = name
            continue
        predefined = next(
            (value for key, value in base_mappings.items() if name.startswith(key)),
            None,
        )
        if predefined is not None:
            mapping[name] = predefined
        else:
            fuzzy_candidates.append(name)

    # Pass 2: greedy fuzzy grouping of whatever is still unmapped.
    processed: Set[str] = set()
    for name in fuzzy_candidates:
        if name in processed:
            continue
        similar_group = [
            other
            for other in fuzzy_candidates
            if other not in processed and fuzz.ratio(name, other) > 85
        ]
        processed.update(similar_group)
        if similar_group:
            # Ties on length resolve to the name that sorts first.
            canonical = min(similar_group, key=len)
            for variant in similar_group:
                mapping[variant] = canonical
    return mapping
def normalize_company_with_fuzzy(name: str, company_groups: Dict[str, str]) -> str:
    """
    Resolve a raw company name to its canonical form.

    The name is first cleaned with ``initial_clean``. A cleaned name that
    is a key of *company_groups* resolves directly; otherwise the closest
    canonical value is chosen by ``find_best_match``.

    Args:
        name: Raw company name
        company_groups: Mapping of company name variants to canonical forms
    Returns:
        Normalized company name
    """
    key = initial_clean(name)

    # Unknown variant: fall back to fuzzy lookup among the canonical names.
    if key not in company_groups:
        canonical_names = set(company_groups.values())
        return find_best_match(key, canonical_names)

    return company_groups[key]
def create_normalizer(transactions: List[str]):
    """Build a one-argument normalizer bound to groups derived from *transactions*.

    The company groups are computed once here via ``build_company_groups``;
    the returned callable reuses them for every name it is given.
    """
    groups = build_company_groups(transactions)

    def normalizer(name: str) -> str:
        """Normalize *name* against the groups captured at creation time."""
        return normalize_company_with_fuzzy(name, groups)

    return normalizer
# Example usage:
def test_normalizer(transactions: List[str]):
    """Print the normalized form of a fixed set of Walmart descriptions.

    A normalizer is created from *transactions* and applied to each sample
    below; the original and normalized strings are written to stdout.
    """
    normalize = create_normalizer(transactions)
    walmart_samples = (
        "WALMART.COM 8009256278 702 SW 8TH ST BENTONVILLE, AR, US",
        "WALMART 800 BENTONVILLE",
        "WALMART WALMART.COM",
        "WAL-MART #2637",
        "WM SUPERCENTER #1129",
    )
    print("Testing normalizer with Walmart variations:")
    for raw in walmart_samples:
        result = normalize(raw)
        # One call, two lines: same output as two consecutive prints.
        print(f"\nOriginal: {raw}", f"Normalized: {result}", sep="\n")