SVM_Multi_class_classification
import numpy as np
# Synthetic 2-D data: three classes of 100 points each, drawn from a standard
# normal distribution (sigma = 1) and shifted to a per-class centre via
# broadcasting. The draw order (p, f, t) is kept so a seeded run produces the
# same samples as before.
# Class 0: centred at (5, 0).
p = np.random.randn(100, 2) + np.array([5.0, 0.0])
# Class 1: left at the origin (0, 0).
f = np.random.randn(100, 2)
# Class 2: centred at (3.5, 3.5).
t = np.random.randn(100, 2) + np.array([3.5, 3.5])
import pandas as pd
# Wrap each class's samples in a DataFrame with coordinate columns 'x' and 'y',
# and attach the integer class label as column 'z'.
# Class 0 (samples in p).
df_p = pd.DataFrame(p, columns=['x', 'y']).assign(z=0)
# Class 1 (samples in f).
df_f = pd.DataFrame(f, columns=['x', 'y']).assign(z=1)
# Class 2 (samples in t).
df_t = pd.DataFrame(t, columns=['x', 'y']).assign(z=2)
# Stack the three classes row-wise into one data set. Each frame keeps its own
# row index here, so index values repeat across classes.
res = pd.concat([df_p, df_f, df_t], axis=0)
res
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
 | x | y | z |
---|---|---|---|
0 | 6.078669 | -0.517253 | 0 |
1 | 5.302986 | 1.129988 | 0 |
2 | 5.009613 | 1.225857 | 0 |
3 | 3.758161 | -0.315786 | 0 |
4 | 5.360210 | -0.375185 | 0 |
5 | 5.710155 | -0.496786 | 0 |
6 | 4.579099 | -0.531689 | 0 |
7 | 2.649697 | 1.835499 | 0 |
8 | 5.276772 | -1.717779 | 0 |
9 | 4.168785 | -0.901316 | 0 |
10 | 3.744492 | -0.281852 | 0 |
11 | 4.795925 | -1.438646 | 0 |
12 | 4.395546 | 1.470029 | 0 |
13 | 6.504895 | 0.107717 | 0 |
14 | 4.669277 | -1.802486 | 0 |
15 | 5.177591 | -0.715611 | 0 |
16 | 6.100084 | -0.803093 | 0 |
17 | 4.173074 | 0.849082 | 0 |
18 | 4.646888 | 0.050525 | 0 |
19 | 5.725629 | -0.345751 | 0 |
20 | 3.898579 | 0.587148 | 0 |
21 | 6.102218 | -2.250628 | 0 |
22 | 4.779940 | 1.648752 | 0 |
23 | 2.945206 | -0.156092 | 0 |
24 | 3.804919 | -1.226393 | 0 |
25 | 3.148943 | -0.853685 | 0 |
26 | 4.480589 | 2.014021 | 0 |
27 | 5.560275 | 2.137762 | 0 |
28 | 6.887921 | 1.943966 | 0 |
29 | 6.227569 | 0.028383 | 0 |
... | ... | ... | ... |
70 | 4.143435 | 2.546719 | 2 |
71 | 2.604495 | 3.292901 | 2 |
72 | 4.121899 | 2.666432 | 2 |
73 | 4.044238 | 3.775474 | 2 |
74 | 2.603628 | 4.173138 | 2 |
75 | 3.475392 | 3.377459 | 2 |
76 | 2.986226 | 4.487069 | 2 |
77 | 3.582220 | 4.475310 | 2 |
78 | 2.436692 | 4.918058 | 2 |
79 | 4.917040 | 3.606541 | 2 |
80 | 3.148297 | 3.048453 | 2 |
81 | 4.473144 | 4.619293 | 2 |
82 | 5.154484 | 4.372903 | 2 |
83 | 3.707397 | 3.668351 | 2 |
84 | 4.442523 | 2.497338 | 2 |
85 | 2.101259 | 3.225132 | 2 |
86 | 3.787636 | 4.148101 | 2 |
87 | 3.319200 | 3.041185 | 2 |
88 | 3.416234 | 2.522239 | 2 |
89 | 3.406666 | 3.070693 | 2 |
90 | 4.142204 | 2.908948 | 2 |
91 | 4.955018 | 2.451665 | 2 |
92 | 4.249549 | 2.185492 | 2 |
93 | 5.728465 | 3.343337 | 2 |
94 | 3.241553 | 2.228639 | 2 |
95 | 2.238972 | 3.639555 | 2 |
96 | 2.075077 | 3.650759 | 2 |
97 | 3.908256 | 2.855201 | 2 |
98 | 3.304940 | 2.721721 | 2 |
99 | 3.352008 | 3.768798 | 2 |
300 rows × 3 columns
import matplotlib.pyplot as plt
# Scatter plot of the whole data set, coloured by class label 'z'.
ax = plt.gca()
ax.scatter(res['x'], res['y'], c=res['z'], cmap=plt.cm.Paired)
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('random data')
plt.show()
# The concatenated frame carries repeated index values (one run per class),
# so renumber the rows in place before splitting by index.
res.reset_index(drop=True, inplace=True)
# Rows whose index is a multiple of 4 form the test set; all other rows form
# the training set.
holdout_mask = res.index % 4 == 0
test = res[holdout_mask]
train = res[~holdout_mask]
from sklearn import svm
# Training features: the 'x' and 'y' coordinates of every training row.
X = train.loc[:, ['x', 'y']]
# Training targets: the class label column.
y = train.loc[:, 'z']
# Support-vector classifier with a linear kernel. C sets the margin
# trade-off: the larger C is, the narrower the margin.
clf = svm.SVC(C=1, kernel='linear')
# Fit the classifier on the training data.
clf.fit(X, y)
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
# Draw the training samples, coloured by class label.
plt.scatter(X['x'], X['y'], c=y, cmap=plt.cm.Paired)
# Overlay the classifier's predicted class regions on the same axes.
ax = plt.gca()
# Current axis limits define the extent of the evaluation grid.
xlim = ax.get_xlim()
ylim = ax.get_ylim()
# 300 evenly spaced values across each axis range.
xx = np.linspace(*xlim, 300)
yy = np.linspace(*ylim, 300)
# Grid coordinates; with 'ij' indexing the first axis follows xx and the
# second follows yy.
XX, YY = np.meshgrid(xx, yy, indexing='ij')
# Flatten the grid into an (n_points, 2) array of (x, y) samples.
xy = np.column_stack([XX.ravel(), YY.ravel()])
# Predicted class label for every grid point, reshaped back onto the grid.
Z = clf.predict(xy).reshape(XX.shape)
# Contour lines over the predicted labels outline the class boundaries.
ax.contour(XX, YY, Z, colors='k')
# Circle the support vectors found by the classifier.
support = clf.support_vectors_
ax.scatter(support[:, 0], support[:, 1], s=100, linewidth=1,
           facecolors='none', edgecolors='k')
plt.show()
# Classify three sample points lying on the x-axis.
sample_points = [[2, 0], [2.5, 0], [3, 0]]
clf.predict(sample_points)
array([1, 0, 0], dtype=int64)
# Mean accuracy of the classifier on the training set.
clf.score(X=X, y=y)
0.9866666666666667
# Mean accuracy of the classifier on the held-out test set. The labels are
# passed as a 1-D Series, matching how the training labels were passed to fit().
clf.score(test[['x', 'y']], test['z'])
0.96