-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHierarchical_clustering.py
92 lines (75 loc) · 3.27 KB
/
Hierarchical_clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# Ilkin Huseynli - 09.10.17
# Hierarchical_clustering
import numpy as np
def cluster(mat, sim_percent=0.7):
"""
Single Linkage Hierarchical Clustering for given similarity matrix
:param mat: numpy.ndarray , similarity matrix
:param sim_percent: threshold percentage for clustering
:return: returns list of labels. Each row demonstrates a cluster
"""
label = [] # keeps clustered ids. Each row is a cluster
sim_value = np.amax(mat) # largest similarity value of the list
threshold = sim_percent*sim_value # threshold is 70% percent of max value
count = mat.shape[0]
while sim_value >= threshold:
# row and column value(s) of the list
rows, cols = np.where(mat == sim_value)
# if number of largest value is more than one
for idx in range(len(rows)):
index_of_Cluster1 = 0
# print(rows[idx],cols[idx],">>",mat[rows[idx]][cols[idx]])
# True if first value added to the list
isFirstInList = False
isSecondInList = False
value1 = rows[idx]
value2 = cols[idx]
for line in label:
# value1 is already in label list
if value1 in line:
isFirstInList = True
# if value2 is in the same cluster with value 1
if value2 in line:
break # don't do anything
else:
index_of_Cluster2 = 0
# check if value2 is in another cluster
for other_line in label:
if value2 in other_line:
label[index_of_Cluster1] += label[index_of_Cluster2]
del label[index_of_Cluster2]
isSecondInList = True
break
index_of_Cluster2 += 1
# value2 is not in the the list
if not isSecondInList:
# append value2
label[index_of_Cluster1].append(value2)
index_of_Cluster1 += 1
# first value is not in the list but we must check if second in the list
if not isFirstInList:
index_of_Cluster2 = 0
isSecondInList = False
for line in label:
if value2 in line:
label[index_of_Cluster2].append(value1)
isSecondInList = True
break
index_of_Cluster2 += 1
# both values are not in the list
if not isFirstInList and not isSecondInList:
label.append([value1, value2])
# changing max value to prevent double check
mat[rows[idx]][cols[idx]] = -1.0
# print(rows[idx],cols[idx],">>",mat[rows[idx]][cols[idx]])
# next largest value of the list
sim_value = np.amax(mat)
# adding all remaining indexes as a 1 sample cluster
for i in range(0, count):
isAppended = False
for line in label:
if i in line:
isAppended = True
if not isAppended:
label.append([i])
return label