-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsimilarity.py
203 lines (162 loc) · 5.04 KB
/
similarity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
import pandas as pd
from shapely import wkt
import sys
def parseArgs():
"""Parse the system arguments into a list.
Finds all the arguments in the form key=value and assign them to a list.
returns list
"""
args = sys.argv
arguments = {}
for arg in args:
if arg.find('=') != -1:
elem = arg.split('=')
arguments[elem[0]] = elem[1]
return arguments
def isNaN(value):
"""Checks if value is NaN.
returns boolean
"""
return value != value
def SameWKT(arg_by_ref):
"""Finds POIs with the same geometry.
It removes from dataset all POIs with the same geometry and places them in a new dataset.
arg_by_ref -- The array containing as first item the initial dataset
returns dataset -- The dataset containing POIs with the same geometry
"""
data = arg_by_ref[0]
unique = data.WKT.unique()
definition = list(data.columns.values)
P = pd.DataFrame(columns=definition)
D = pd.DataFrame(columns=definition)
for wkt in unique:
row = data.loc[data.WKT == wkt]
length = row.WKT.count()
if length == 1:
P = P.append(row)
else:
D = D.append(row)
arg_by_ref[0] = P
return D
def findMatches(arg_by_ref, checkForName=False):
"""Finds POIs with the same geometry in 2 datasets.
For each POI in the first dataset, check whether there is a corresponding POI in the 2nd one.
If it exists, move the POI from the second dataset to a resulting dataset B. In any case, the
POIs from the first dataset are going to be moved in the resulting dataset A.
arg_by_ref array -- The array containing the 2 datasets
checkForName boolean -- Whether to also check for same name
returns tuple -- The two resulting datasets
"""
dataA = arg_by_ref[0]
dataB = arg_by_ref[1]
definition = list(dataA.columns.values)
res_A = pd.DataFrame(columns=definition)
res_B = pd.DataFrame(columns=definition)
for index, poiA in dataA.iterrows():
wkt = poiA.WKT
if checkForName:
poiB = dataB.loc[(dataB.WKT == wkt) & (dataB[NAME] == poiA[NAME])]
else:
poiB = dataB.loc[dataB.WKT == wkt]
exists = (poiB.WKT.count() > 0)
if exists:
res_B = res_B.append(poiB)
dataB = dataB.drop(poiB.index)
res_A = res_A.append(poiA)
dataA = dataA.drop(index)
arg_by_ref[0] = dataA
arg_by_ref[1] = dataB
return (res_A, res_B)
def calculateScore(score, dataA, dataB, checkForName=False):
"""Calculates the score comparing the corresponding fields of 2 datasets.
score double -- The initial score
dataA dataframe -- The first dataset
dataB dataframe -- The second dataset
returns double -- The calculated score
"""
penalties = {
'WKT': 1.0,
NAME: 0.5,
ADDRESS: 0.3,
PHONE: 0.1
}
columns = list(dataA.columns.values)
for indexA, poiA in dataA.iterrows():
if (checkForName):
candIDates = dataB.loc[(dataB.WKT == poiA.WKT) & (dataB[NAME] == poiA[NAME])]
else:
candIDates = dataB.loc[dataB.WKT == poiA.WKT]
if candIDates.index.size == 0:
continue
poiB = candIDates.loc[candIDates.index[0]]
score += 1
for field in columns:
if (isNaN(poiA[field]) and isNaN(poiB[field])) or str(poiA[field]).lower() == str(poiB[field]).lower() or poiB[field] == '' or isNaN(poiB[field]):
continue
if field in penalties:
penalty = penalties[field]
else:
penalty = 0.1
# print('Penalty for', field, ':', penalty)
# print(poiA[field], poiB[field])
# print()
score -= penalty
return score
"""Main procedure
Arguments:
ID -- The name of the unique key of the datasets
fileA -- The full path of the first dataset
fileB -- The full path of the second dataset
name -- The 'name' field of the datasets
address -- The 'address' field of the datasets
phone -- The 'phone' field of the datasets
"""
args = parseArgs()
ID = args['id']
NAME = args['name']
ADDRESS = args['address']
PHONE = args['phone']
print(args['fileA'], '-', args['fileB'])
# exit()
MP = pd.read_csv(args['fileA'], index_col=ID)
SP = pd.read_csv(args['fileB'], index_col=ID)
geometry = MP.WKT.apply(wkt.loads)
geometry = geometry.apply(wkt.dumps, rounding_precision=5)
MP.WKT = geometry
geometry = SP.WKT.apply(wkt.loads)
geometry = geometry.apply(wkt.dumps, rounding_precision=5)
SP.WKT = geometry
numberOfPOIs = MP.index.size
wrapper = [MP]
MD = SameWKT(wrapper)
MP = wrapper[0]
wrapper = [SP]
SD = SameWKT(wrapper)
SP = wrapper[0]
# Initialize the score
score = 0.0
# Find matching geometries for the 2 datasets (single geometry)
wrapper = [MP, SP]
(MM, SM) = findMatches(wrapper)
# print(MM, SM)
MP = wrapper[0]
SP = wrapper[1]
# Add 1 point for each match
score += 1.0*SM.index.size
# Remove 0.5 point for each remaining unmatched POI
# print('Remove', 0.5*SP.index.size, 'points')
score -= 0.5*SP.index.size
# Examine the remaining features of the matched POIs
score = calculateScore(score, MM, SM)
# Same procedure for double geometries
wrapper = [MD, SD]
(MDM, SDM) = findMatches(wrapper, checkForName=True)
MD = wrapper[0]
SD = wrapper[1]
score += 1.0*SDM.index.size
score = calculateScore(score, MDM, SDM, checkForName=True)
# print('Remove', 0.5*SD.index.size, 'points')
score -= 0.5*SD.index.size
# Normalize the score
score = score/(2*numberOfPOIs)
print('Score:', round(score*100, 1))