-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathshow_rfm.py
159 lines (129 loc) · 4.58 KB
/
show_rfm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# -*- coding: utf-8 -*-
"""
@author: gtwell
"""
import pandas as pd
import matplotlib.pyplot as plt
import copy
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import ipdb
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
from pylab import mpl
import argparse
# Route sans-serif text to the FangSong font so the Chinese axis labels,
# legends and titles used by the plotting functions in this file can render.
mpl.rcParams['font.sans-serif'] = ['FangSong']
# Draw negative tick values with an ASCII hyphen instead of the Unicode minus
# sign (presumably because the font above lacks that glyph -- TODO confirm).
mpl.rcParams['axes.unicode_minus'] = False
def load_dataset(dataPath):
    """Load the customer CSV and build the R/F/M training matrix.

    The file is read without a header row; its columns are named, in order:
    vip, L, firstrq, lastrq, R, totalbuytimes, lastyearbuytimes, totalamt,
    lastyearamt.  Rows with any missing value and rows whose ``firstrq``
    contains the letter ``N`` are discarded.  From what remains, the ``R``,
    ``lastyearbuytimes`` and ``totalamt`` columns of the rows labelled 0
    through 10000 are kept and shuffled.

    Args:
        dataPath: path (or anything ``pandas.read_csv`` accepts) of the CSV.

    Returns:
        A ``(scaled, original)`` tuple of DataFrames sharing one row order:
        ``scaled`` has columns ``r``, ``f``, ``m`` standardised column-wise
        by ``sklearn.preprocessing.scale``; ``original`` holds the same rows
        as numeric values under their source column names.
    """
    data = pd.read_csv(
        dataPath,
        names=['vip', 'L', 'firstrq', 'lastrq', 'R', 'totalbuytimes',
               'lastyearbuytimes', 'totalamt', 'lastyearamt'])

    # Drop missing values *before* the string filter: str.contains() yields
    # NaN for a missing cell, and a mask holding NaN cannot be negated or
    # used for boolean indexing.  astype(str) keeps the filter working even
    # when the column was parsed as a non-string dtype.
    data = data.dropna()
    has_marker = data['firstrq'].astype(str).str.contains(
        'N', regex=False, na=False)
    data = data[~has_marker].reset_index(drop=True)

    # .loc slices by label and includes the end label, so this keeps up to
    # 10001 rows (labels 0..10000) of the freshly reset index.
    rfm = data.loc[:10000, ['R', 'lastyearbuytimes', 'totalamt']]
    rfm = rfm.sample(frac=1).reset_index(drop=True)

    data_copy = rfm.apply(pd.to_numeric)
    # Standardise the numeric copy so the scaler never sees string columns.
    scaled = preprocessing.scale(data_copy, axis=0)
    new_data = pd.DataFrame(scaled, columns=['r', 'f', 'm'])

    print(data_copy.head(3))
    print(new_data.head(3))
    return new_data, data_copy
def train_kmeans(data, K):
    """Cluster ``data`` into ``K`` groups with scikit-learn's KMeans.

    Args:
        data: training samples the model is fitted on.
        K: number of clusters.

    Returns:
        A ``(cluster_centers, labels)`` tuple read from the fitted model's
        ``cluster_centers_`` and ``labels_`` attributes.
    """
    fitted = KMeans(n_clusters=K).fit(data)
    return fitted.cluster_centers_, fitted.labels_
def save_file(file_name, x_train, kmodel, counts_file='./counts.csv'):
    """Write the clustering result to two CSV files.

    Args:
        file_name: destination of the per-sample table, i.e. ``x_train``
            plus one extra column holding each row's cluster label.
        x_train: DataFrame of the samples the model was fitted on.
        kmodel: fitted model exposing ``labels_`` and ``cluster_centers_``.
        counts_file: destination of the per-cluster summary (one row per
            cluster centre plus the number of samples assigned to it).
            Defaults to ``./counts.csv``, the path that used to be
            hard-coded.
    """
    feature_names = list(x_train.columns)

    centers = pd.DataFrame(kmodel.cluster_centers_)
    # value_counts() orders by frequency; sort by cluster id so the counts
    # line up row-for-row with the centres.
    sizes = pd.Series(kmodel.labels_).value_counts().sort_index()
    summary = pd.concat([centers, sizes], axis=1)
    summary.columns = feature_names + ['类别数目']
    summary.to_csv(counts_file, index=False)

    # Re-index the labels like x_train so the concat pairs rows positionally.
    labels = pd.Series(kmodel.labels_, index=x_train.index)
    labelled = pd.concat([x_train, labels], axis=1)
    labelled.columns = feature_names + ['聚类类别']
    labelled.to_csv(file_name, index=False)
def plot_radar(centers, K):
    """Draw the first ``K`` cluster centres as overlaid radar charts.

    Args:
        centers: cluster centres as returned by ``train_kmeans``; indexed
            by cluster, each entry holding one value per feature.  The
            three spoke labels are fixed, so three features are expected.
        K: number of centres to draw.
    """
    # Use the ggplot drawing style.
    plt.style.use('ggplot')

    feature = ['距离上一次购买天数', '频率', '总金额']
    values = [np.asarray(centers[i]) for i in range(K)]
    N = len(values[0])

    # One spoke per feature, evenly dividing the circle.
    spoke_angles = np.linspace(0, 2 * np.pi, N, endpoint=False)
    # Repeat the first point at the end so every outline closes on itself.
    angles = np.concatenate((spoke_angles, [spoke_angles[0]]))
    values = [np.concatenate((v, [v[0]])) for v in values]

    fig = plt.figure()
    ax = fig.add_subplot(111, polar=True)
    for i, v in enumerate(values):
        # Outline of the cluster, then a translucent fill.
        ax.plot(angles, v, 'o-', linewidth=2, label='客户{}'.format(i))
        ax.fill(angles, v, alpha=0.25)

    # Label the spokes with the *un-closed* angles: the closed array has one
    # more entry than there are labels, and Matplotlib rejects a tick/label
    # count mismatch.
    ax.set_thetagrids(spoke_angles * 180 / np.pi, feature)
    plt.title('不同客户聚类')
    ax.grid(True)
    plt.legend(loc='best')
    plt.show()
def plot_3d(train, kmodel_label):
    """Show a 3-D scatter plot of the samples coloured by cluster label.

    Args:
        train: DataFrame whose first three columns are drawn on the x, y
            and z axes respectively.
        kmodel_label: predicted cluster label for each row of ``train``,
            in row order.
    """
    r = train.iloc[:, 0]
    f = train.iloc[:, 1]
    m = train.iloc[:, 2]
    # Carry train's index so the labels pair with the rows positionally.
    label = pd.Series(kmodel_label, index=train.index)

    print(r.head(5))
    print(f.head(5))
    print(m.head(5))
    print(label.head(5))

    fig = plt.figure()
    # Create the 3-D axes through the figure: a bare Axes3D(fig) is no longer
    # attached to the figure by current Matplotlib and would show nothing.
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(r, f, m, c=label)
    ax.set_xlabel('距离上一次购买天数')
    ax.set_ylabel('频率')
    ax.set_zlabel('总金额')
    ax.view_init(azim=235)
    plt.show()
def main():
    """Parse the command line, cluster the customers and show both plots."""
    # The text is passed as ``description``; given positionally it would be
    # taken as the program name and end up in the usage line.
    parser = argparse.ArgumentParser(
        description="kmeans to different customs")
    parser.add_argument('--input_file', type=str, default='./data.csv',
                        help="the data from database")
    parser.add_argument('--K', type=int, default=5,
                        help='the K in kmeans algorithm')
    args = parser.parse_args()

    train_data, train_data_orig = load_dataset(args.input_file)
    print(len(train_data))
    centers, kmodel_label = train_kmeans(train_data, args.K)
    np.set_printoptions(suppress=True)

    # Radar chart of the scaled centres, then a 3-D scatter of the unscaled
    # samples coloured by their cluster.
    plot_radar(centers, args.K)
    plot_3d(train_data_orig, kmodel_label)


if __name__ == '__main__':
    main()