鸢尾花、NBA数据集KNN分类

x33g5p2x  于2021-10-20 转载在 其他  
字(5.6k)|赞(0)|评价(0)|浏览(315)
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from collections import Counter

from sklearn.datasets import load_iris
plt.rcParams['font.sans-serif'] = ['STFangsong']
plt.rcParams['axes.unicode_minus'] = False
%config InlineBackend.figure_format = 'svg'
# Load the iris dataset.
iris = load_iris()
# The loaded dataset has several parts; part 1, iris.data, holds the
# sample features.
xtrain = iris.data
# Inspect the feature matrix dimensions (rows = samples, columns = features).
xtrain.shape
(150, 4)
# Part 2: iris.target holds the label that corresponds to each sample.
ylabel = iris.target
# Wrap the feature matrix in a DataFrame, using the dataset's own feature
# names as column names.
df = pd.DataFrame(xtrain,columns=iris.feature_names)
# Preview the first rows.
df.head()
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2
# Add the class labels to the DataFrame as a 'label' column.
df['label'] = iris.target
# Preview the first rows again, now including the label.
df.head()
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  label
0                5.1               3.5                1.4               0.2      0
1                4.9               3.0                1.4               0.2      0
2                4.7               3.2                1.3               0.2      0
3                4.6               3.1                1.5               0.2      0
4                5.0               3.6                1.4               0.2      0
# Define the unknown sample (four feature values) to be classified.
X = np.array([5,2.7,2.1,0.4])
# Scatter plot: pick any two features and draw the three flower classes as
# three separately coloured series.
# Equivalent per-class DataFrame form (this variant uses the two sepal columns):
# plt.scatter(df[df['label']==0]['sepal length (cm)'],df[df['label']==0]['sepal width (cm)'])
# plt.scatter(df[df['label']==1]['sepal length (cm)'],df[df['label']==1]['sepal width (cm)'])
# plt.scatter(df[df['label']==2]['sepal length (cm)'],df[df['label']==2]['sepal width (cm)'])
# Loop over the class ids 0, 1, 2; each pass plots feature columns 2 and 3
# of iris.data for the rows whose label equals that class id.
for i in range(3):
    plt.scatter(iris.data[ylabel==i,2],iris.data[ylabel==i,3])

# Distance between the unknown sample X and every known sample (the first
# four columns of df). Note: this is the *squared* Euclidean distance -- no
# square root is taken -- which still orders the neighbours the same way.
dist = np.sum((df.iloc[:,:4]-X)**2,axis =1)
# Store the distances on the main table df as a 'dist' column.
df['dist']= dist
# Preview the first rows, now including the distance.
df.head()
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  label  dist
0                5.1               3.5                1.4               0.2      0  1.18
1                4.9               3.0                1.4               0.2      0  0.63
2                4.7               3.2                1.3               0.2      0  1.02
3                4.6               3.1                1.5               0.2      0  0.72
4                5.0               3.6                1.4               0.2      0  1.34
# Sort by distance, keep the k = 3 nearest rows and take the most frequent
# label among them as the prediction.
# Series.mode() returns a Series of the modal value(s); the predicted label is
# its first *value* (.iloc[0]). Reading .index[0] instead would always yield
# 0, whatever the actual majority label is.
df.sort_values(by='dist')[:3]['label'].mode().iloc[0]
0
# A second unknown sample (four feature values).
X = np.array([5.3,2.7,6.4,1.3])
# Rebind the full iris feature matrix and label vector; presumably these are
# the inputs for the hand-written knn() defined next -- no call is shown here.
xtrain = iris.data
ylabel = iris.target
def knn(xtrain,ylabel,k,X):
    """Predict a value for X as the mean label of its k nearest samples.

    This is the regression flavour of KNN: the labels of the k training
    samples closest to X are averaged rather than put to a majority vote.
    Closeness is measured with the squared Euclidean distance, which ranks
    neighbours exactly like the Euclidean distance does.

    Args:
        xtrain: Indexable collection of training samples (for example a 2-D
            ndarray with one row per sample and one column per feature).
        ylabel: Labels aligned with xtrain (ylabel[i] belongs to xtrain[i]).
        k: Number of neighbours; must satisfy 1 <= k <= len(ylabel).
        X: Feature vector of the unknown sample.

    Returns:
        The mean (np.mean) of the labels of the k nearest samples.

    Raises:
        ValueError: If k is not in the range 1..len(ylabel).
    """
    if not 1 <= k <= len(ylabel):
        raise ValueError(
            "k must be between 1 and the number of samples, got %r" % (k,)
        )

    # Seed the candidate list with the first k samples as (distance, label).
    knn_list = [(np.sum((xtrain[i] - X)**2), ylabel[i]) for i in range(k)]

    # Walk the remaining samples; whenever one is strictly closer than the
    # current farthest candidate, it takes that candidate's slot.
    for i in range(k, len(ylabel)):
        dist = np.sum((xtrain[i] - X)**2)
        max_index = max(range(k), key=lambda j: knn_list[j][0])
        if knn_list[max_index][0] > dist:
            knn_list[max_index] = (dist, ylabel[i])

    # Use a distinct local name so the function name `knn` is not shadowed.
    neighbour_labels = [label for _, label in knn_list]
    return np.mean(neighbour_labels)
# Majority-vote demo on a fixed list of labels (a stand-in for the labels
# that would otherwise be collected from a neighbour list).
labels = [2, 2, 1, 7, 7, 7]
counts = Counter(labels)
# Order the (label, frequency) pairs by ascending frequency and take the last
# pair: its label is the most frequent one.
sorted(counts.items(), key=lambda pair: pair[1]).pop()[0]
7
def KNN(x_train,y_train,n_neighbours,X):
    """Return the mean label of the n_neighbours training samples nearest to X.

    Distances are Euclidean (L2 norm of the difference vector). The result is
    the average of the neighbours' labels, i.e. a regression-style output; a
    classifier would take the most frequent neighbour label instead.

    Args:
        x_train: Indexable collection of training samples.
        y_train: Labels aligned with x_train.
        n_neighbours: Number of nearest neighbours to keep.
        X: Feature vector of the unknown sample.

    Returns:
        np.mean of the labels of the kept neighbours.
    """
    def distance_to(sample):
        # Euclidean distance between the unknown sample and one training sample.
        return np.linalg.norm(X - sample, ord=2)

    # Start with the first n_neighbours samples as (distance, label) pairs.
    nearest = [
        (distance_to(x_train[idx]), y_train[idx])
        for idx in range(n_neighbours)
    ]

    # Visit every remaining sample; a strictly closer one takes the slot of
    # the current farthest candidate (first one in list order on ties).
    for idx in range(n_neighbours, len(x_train)):
        farthest = max(range(len(nearest)), key=lambda pos: nearest[pos][0])
        candidate = distance_to(x_train[idx])
        if candidate < nearest[farthest][0]:
            nearest[farthest] = (candidate, y_train[idx])

    # Average the labels of the surviving neighbours.
    return np.mean([label for _, label in nearest])

鸢尾花数据集KNN分类

1.1导入数据

x = iris.data   # features
y = iris.target   # labels

1.2 划分数据集

# 划分数据集
from sklearn.model_selection import train_test_split

# Split features and labels into train and test parts; test_size=0.3 requests
# a 30% test share and random_state=100 pins the random split.
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.3,random_state=100)
# xtrain: training-set features
# xtest:  test-set features
# ytrain: training-set labels
# ytest:  test-set labels

1.3 导入模型

from sklearn.neighbors import KNeighborsClassifier   # KNN分类
from sklearn.neighbors import KNeighborsRegressor   # KNN回归
# 1. Instantiate the model.
knn_clf = KNeighborsClassifier(n_neighbors=5,p=2)  # n_neighbors: number of neighbours; p=2: L2-norm distance
# 2. Fit it on the training set.
knn_clf.fit(xtrain,ytrain)
KNeighborsClassifier()
# 3. Predict labels for the test-set features.
knn_clf.predict(xtest)
array([2, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 2, 0, 0, 2, 1, 1, 1, 2, 2, 2, 0,
       2, 0, 1, 2, 1, 0, 1, 2, 1, 1, 1, 0, 0, 1, 0, 1, 2, 2, 0, 1, 2, 2,
       0])
# Check the accuracy on the test set.
knn_clf.score(xtest,ytest)
0.9777777777777777
# Learning curve: test-set score of a KNN classifier for each neighbour
# count k from 1 to 20.
labels = list(range(1,21))  # the k values, reused as the x axis of the plot
array = []  # score obtained for each k, in the same order as `labels`
for i in labels:
    knn_clf = KNeighborsClassifier(n_neighbors=i,p=2)  # n_neighbors: number of neighbours; p=2: L2-norm distance
    knn_clf.fit(xtrain,ytrain)
    # score() evaluates the fitted model on xtest itself, so a separate
    # predict() call whose result is thrown away is not needed.
    array.append(knn_clf.score(xtest,ytest))
plt.plot(labels,array)
plt.show()

# Build a classifier with 7 neighbours and fit it on the training split.
# NOTE(review): this rebinds the module-level name `knn`, which hides the
# hand-written knn() function defined earlier in this file -- confirm that
# the function is not needed after this point.
knn = KNeighborsClassifier(7).fit(xtrain,ytrain)
# Display the fitted estimator.
knn
KNeighborsClassifier(n_neighbors=7)

NBA球员数据KNN回归

# Load the NBA player spreadsheet into df (replacing the iris DataFrame).
# NOTE(review): this is an absolute, machine-specific path; the cell only
# runs where that exact file exists.
df = pd.read_excel('E:/QfPython/BI/files/NBA球员数据.xlsx')
# Display the table.
df
球员姓名位置身高体重年龄球龄上场次数场均时间进攻能力防守能力是否入选过全明星球队胜率球队市值球员薪金
0斯蒂芬-库里得分后卫1.91862977933.3831.93343468
1勒布朗-詹姆斯大前锋2.0311332137437.7536.14083329
2保罗-米尔萨普中锋2.0311232106933.9522.71273127
3戈登-海沃德小前锋2.031032767334.4525.38252973
4布雷克-格里芬中锋2.081142866134.0327.48862951
.............................................
392乔丹-麦克雷得分后卫1.98842613710.354.864050
393德隆-威廉姆斯控球后卫1.919133115016.086.640240
394詹姆斯-迈克尔-麦卡杜大前锋2.06104242528.783.17617
395达伦-希利亚德得分后卫1.9893241399.774.10607
396CJ-威尔考克斯得分后卫1.9688262224.881.49507

397 rows × 14 columns

# Remove, in place, five columns: the all-star flag, team win rate, team
# market value, position and player name.
df.drop(columns=['是否入选过全明星','球队胜率','球队市值','位置','球员姓名'],inplace = True)
# Display the remaining columns.
df

# Method 2 (alternative):
# df.select_dtypes(include='number')
身高体重年龄球龄上场次数场均时间进攻能力防守能力球员薪金
01.91862977933.3831.93343468
12.0311332137437.7536.14083329
22.0311232106933.9522.71273127
32.031032767334.4525.38252973
42.081142866134.0327.48862951
..............................
3921.98842613710.354.864050
3931.919133115016.086.640240
3942.06104242528.783.17617
3951.9893241399.774.10607
3961.9688262224.881.49507

397 rows × 9 columns

# KNN regression of player salary ('球员薪金') on the remaining columns.
# Feature table: everything except the target column.
array = df.drop(columns='球员薪金')
# Use every feature column exactly once, for however many rows df has.
# (Building the rows by hand listed '身高' twice, which double-weights height
# in the distance, and hard-coded the row count to 397.)
x = array.to_numpy()
y = df['球员薪金']
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.3)
# Salary is a numeric target, so fit a regressor (it averages the neighbours'
# salaries); a classifier would treat every distinct salary as its own class.
# The variable keeps its original name `knn_clf` for compatibility.
knn_clf = KNeighborsRegressor(n_neighbors=5,p=2)  # n_neighbors: number of neighbours; p=2: L2-norm distance
knn_clf.fit(xtrain,ytrain)
y_pre = knn_clf.predict(xtest)
# Plot predicted against true salaries over the test samples, in test order.
plt.plot(range(len(xtest)), y_pre, label = '预测值')
plt.plot(range(len(xtest)), ytest, label = '真实值')
plt.legend()
plt.show()

相关文章