发布于2019-08-07 12:53 阅读(2303) 评论(0) 点赞(3) 收藏(4)
描述过几天再写,先放代码
#!/usr/bin/python
# -*- coding: utf-8 -*-
import random
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
# Every parsed record: the numeric features followed by the class label.
list_all = []
# Records currently assigned to each of the three clusters; k_means() fills
# these in place and the reporting/plotting functions read them afterwards.
cluster1_set, cluster2_set, cluster3_set = [], [], []
# Feature-only rows (label stripped) that are handed to PCA.
list_data = []
def read_data_set(path='iris.dat'):
    """Parse the data set file and append its records to ``list_all``.

    Lines starting with ``@`` are header/description lines and are skipped,
    as are blank lines. Every remaining line is split on commas; all fields
    except the last are converted to ``float`` and the last field is kept as
    the class-label string.

    Args:
        path: File to read. Defaults to ``'iris.dat'`` in the current
            working directory, which is what the script has always used.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a feature field is not a valid float.
    """
    # The context manager guarantees the handle is closed even when a
    # malformed line makes float() raise half-way through the file.
    with open(path, 'r', encoding='utf-8') as data_file:
        for raw_line in data_file:
            line = raw_line.strip()
            # Skip blank lines (e.g. a trailing newline at end of file) and
            # the '@' description header.
            if not line or line.startswith('@'):
                continue
            # Split on the bare comma and strip each field so that both
            # "a, b" and "a,b" are accepted.
            fields = [field.strip() for field in line.split(',')]
            record = [float(value) for value in fields[:-1]]
            record.append(fields[-1])
            list_all.append(record)
# Compute the new centre point of a cluster for one iteration.
def aver_list(list_temp0, dims=4):
    """Return the per-feature mean of the records in ``list_temp0``.

    Only the first ``dims`` entries of each record are averaged, so a
    trailing class label (or any other extra column) is ignored.

    Args:
        list_temp0: Sequence of records; each record must have at least
            ``dims`` numeric leading entries.
        dims: Number of leading features to average. Defaults to 4, the
            value that was previously hard-coded.

    Returns:
        A list of ``dims`` means, one per feature.

    Raises:
        ValueError: If ``list_temp0`` is empty (the mean is undefined).
    """
    size = len(list_temp0)
    if size == 0:
        raise ValueError("aver_list() needs at least one record to average")
    # Accumulate record by record, in order, so the floating-point result
    # is the same as a plain running sum over each feature.
    totals = [0] * dims
    for record in list_temp0:
        for axis in range(dims):
            totals[axis] += record[axis]
    return [total / size for total in totals]
def _euclidean(point_a, point_b):
    """Return the Euclidean distance between two points.

    ``zip`` stops at the shorter argument, so a record that carries a
    trailing class label can be compared directly against a bare centre.
    """
    total = 0.0
    for left, right in zip(point_a, point_b):
        total += (left - right) ** 2
    return total ** 0.5


def k_means(max_iter=100000, tol=1e-12):
    """Cluster the records of ``list_all`` into three groups with k-means.

    The result is left in the module-level ``cluster1_set``,
    ``cluster2_set`` and ``cluster3_set`` lists, which are refilled in place
    on every pass. Progress and the final centres are printed.

    Args:
        max_iter: Iteration cap; the loop stops once the iteration counter
            exceeds this value.
        tol: Convergence threshold. Iteration stops when every centre moves
            by less than ``tol`` between two consecutive passes.

    Raises:
        ValueError: If ``list_all`` holds fewer than three records.
    """
    total = len(list_all)
    if total < 3:
        raise ValueError("k_means() needs at least three records in list_all")

    # Draw one initial centre from each third of the data. For the 150-row
    # Iris file these are exactly the index ranges 0-49, 50-99 and 100-149.
    third = total // 3
    centers = [
        list_all[random.randint(0, third - 1)][0:-1],
        list_all[random.randint(third, 2 * third - 1)][0:-1],
        list_all[random.randint(2 * third, total - 1)][0:-1],
    ]
    for center in centers:
        print(center)

    cluster_sets = (cluster1_set, cluster2_set, cluster3_set)
    count = 0
    while True:
        # Empty the clusters at the START of each pass. This also discards
        # members left over from an earlier k_means() call, which would
        # otherwise be counted twice in the first pass.
        for members in cluster_sets:
            members.clear()

        # Assign every record to its nearest centre; ties go to the
        # lowest-numbered cluster.
        for record in list_all:
            distances = [_euclidean(center, record) for center in centers]
            cluster_sets[distances.index(min(distances))].append(record)

        # A cluster that attracted no record keeps its previous centre
        # instead of dividing by zero.
        averages = [
            aver_list(members) if members else center
            for members, center in zip(cluster_sets, centers)
        ]

        # Stop once no centre moved by more than the tolerance.
        shifts = [
            _euclidean(center, average)
            for center, average in zip(centers, averages)
        ]
        if all(shift < tol for shift in shifts):
            break

        # The averages become the centres for the next pass.
        centers = averages
        count += 1
        print("第" + str(count) + "次迭代")
        for center in centers:
            print(center)
        # Safety net against a run that never converges.
        if count > max_iter:
            break

    print("--------------------------")
    for center in centers:
        print(center)
# Compute the clustering accuracy.
def _majority_count(members):
    """Return the size of the largest label group inside one cluster.

    Records are tallied as "Iris-setosa", "Iris-versicolor" or "anything
    else", based on the last element of each record.
    """
    setosa = 0
    versicolor = 0
    for record in members:
        if record[-1] == "Iris-setosa":
            setosa += 1
        elif record[-1] == "Iris-versicolor":
            versicolor += 1
    other = len(members) - setosa - versicolor
    return max(setosa, versicolor, other)


def sort_correctly():
    """Print the share of records that sit in their cluster's majority class.

    For each of ``cluster1_set``, ``cluster2_set`` and ``cluster3_set`` the
    most frequent label is treated as the cluster's class; records carrying
    that label count as correct. The total is divided by ``len(list_all)``
    and printed as a percentage with two decimals.

    Raises:
        ValueError: If ``list_all`` is empty, since the ratio is undefined.
    """
    if len(list_all) == 0:
        raise ValueError("sort_correctly() needs a non-empty list_all")
    correct = 0
    for members in (cluster1_set, cluster2_set, cluster3_set):
        correct += _majority_count(members)
    num_right = "%.2f%%" % ((correct / len(list_all)) * 100)
    print("聚类分析的正确率为" + str(num_right))
def dimension_reduction():
    """Project the feature columns of ``list_all`` onto two PCA components.

    The module-level ``list_data`` is rebuilt so that row ``i`` holds the
    features of ``list_all[i]`` (everything except the trailing label).

    Returns:
        The result of ``PCA(2).transform`` on those rows: one 2-component
        row per record, in the same order as ``list_all``.
    """
    # Replace the contents in place instead of appending: appending on
    # every call would duplicate the rows when the function runs twice and
    # break the one-to-one correspondence with list_all.
    list_data[:] = [record[0:-1] for record in list_all]
    pca = PCA(2)
    pca.fit(list_data)
    return pca.transform(list_data)
def print_scatter():
    """Draw the three clusters as a 2-D scatter plot of the PCA projection.

    Each record of ``list_all`` is looked up in ``cluster1_set``,
    ``cluster2_set`` and ``cluster3_set`` (in that order) and its projected
    coordinates are added to the first cluster that contains it. The
    clusters are drawn as red '+', green '*' and blue 's' markers.
    """
    low_data = dimension_reduction()
    # (members, colour, marker) for every cluster, in drawing order.
    groups = (
        (cluster1_set, 'red', '+'),
        (cluster2_set, 'green', '*'),
        (cluster3_set, 'blue', 's'),
    )
    xs = [[] for _ in groups]
    ys = [[] for _ in groups]
    for record, point in zip(list_all, low_data):
        for slot, group in enumerate(groups):
            if record in group[0]:
                xs[slot].append(point[0])
                ys[slot].append(point[1])
                # A record is plotted with the first matching cluster only.
                break
    for slot, (_, colour, shape) in enumerate(groups):
        plt.scatter(xs[slot], ys[slot], c=colour, marker=shape)
    plt.show()
if __name__ == "__main__":
    # Run the pipeline in order: load the data, cluster it, report the
    # accuracy, then plot the clusters.
    for stage in (read_data_set, k_means, sort_correctly, print_scatter):
        stage()
作者:追梦骚年
链接:https://www.pythonheidong.com/blog/article/11046/0d7c26ce8670165bf2d7/
来源:python黑洞网
任何形式的转载都请注明出处。如有侵权,一经发现,必将追究其法律责任。
昵称:
评论内容:(最多支持255个字符)
---无人问津也好,技不如人也罢,你都要试着安静下来,去做自己该做的事,而不是让内心的烦躁、焦虑,坏掉你本来就不多的热情和定力
Copyright © 2018-2021 python黑洞网 All Rights Reserved 版权所有,并保留所有权利。 京ICP备18063182号-1
投诉与举报,广告合作请联系vgs_info@163.com或QQ3083709327
免责声明:网站文章均由用户上传,仅供读者学习交流使用,禁止用作商业用途。若文章涉及色情、反动、侵权等违法信息,请向我们举报,一经核实我们会立即删除!