可能是上手最合适的Knn项目 源代码注释

可能是上手最合适的Knn项目
KNN(K-Nearest Neighbors,中文名"K近邻")适用于有标签(监督学习)、样本数量小于100K的非文本数据;它依据样本之间位置相近(而不是逻辑相近)来分类,预测结果是离散的类别。
其思想就是"近朱者赤,近墨者黑"。
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
# import matplotlib.pyplot as plt
from pylab import *
"""
http://scikit-learn.org/dev/modules/generated/sklearn.datasets.load_iris.html
"""
# Load the iris dataset: 3 classes x 50 samples x 4 features.
iris = datasets.load_iris()

# Use half of the data for training and the other half for testing.
# No random_state is passed, so the split (and therefore the accuracy
# printed below) changes from run to run.
train_data, test_data, train_label, test_label = train_test_split(
    iris.data, iris.target, train_size=0.5)

# Build the KNN classifier with its default settings; n_neighbors, weights,
# algorithm, etc. can be passed to the constructor (see the scikit-learn docs).
knn = KNeighborsClassifier()
knn.fit(train_data, train_label)     # feed the training half to the classifier
prediction = knn.predict(test_data)  # predict labels for the held-out half

# Show the predicted labels next to the true labels.
print('预测值', prediction)
print('实际值', test_label)

# Count the mismatches in a single pass over the paired labels instead of
# indexing both sequences by position and decrementing a counter.
total = len(prediction)
errors = sum(1 for predicted, actual in zip(prediction, test_label)
             if predicted != actual)
score = total - errors  # number of correct predictions
print('正确率', score / total, '错误个数', errors)
# Visualise the dataset itself; this part is independent of the prediction.
x_index = 0  # feature column plotted on the x axis
y_index = 3  # feature column plotted on the y axis
# Select the SimHei font so the Chinese labels below can be rendered.
mpl.rcParams['font.sans-serif'] = ['SimHei']
colors = ['red', 'orange', 'green']
names_sc = ['清风藤', '云芝', '锦葵']
# Sepal length, sepal width, petal length, petal width.
feature_names_sc = ['萼片长度', '萼片宽度', '花瓣长度', '花瓣宽度']
# The legend names are intended for setosa / versicolor / virginica, in order.
# NOTE(review): they read like literal translations of the species epithets
# rather than iris species names - confirm before reusing them.
for label, (name, color) in enumerate(zip(names_sc, colors)):
    # Boolean mask selecting the samples that belong to the current class.
    in_class = iris.target == label
    plt.scatter(iris.data[in_class, x_index],
                iris.data[in_class, y_index],
                label=name,
                c=color)
# Axis labels, legend and display are set once, after all classes are drawn.
plt.xlabel(feature_names_sc[x_index])
plt.ylabel(feature_names_sc[y_index])
plt.legend(loc='upper left')
plt.show()