利用 scikit-learn 实现 SVM(支持向量机)
import numpy as np
# Draw 100 two-dimensional points from a standard normal distribution
# (centre (0, 0), standard deviation 1) and shift them so they are centred
# on (2.5, 2.5); these form the positive class. The shift is one broadcast
# addition over the whole array instead of a row-by-row Python loop.
p = np.random.randn(100, 2) + 2.5
# Draw another 100 points from the same standard normal distribution and
# leave them centred on (0, 0); these form the negative class.
f = np.random.randn(100, 2)
import pandas as pd
# Wrap each NumPy sample array in a DataFrame with feature columns 'x' and
# 'y', then attach the class label 'z' and the plotting colour 'c':
#   positive class -> z = 1, colour red
#   negative class -> z = 0, colour blue
df_p = pd.DataFrame(p, columns=['x', 'y']).assign(z=1, c='red')
df_f = pd.DataFrame(f, columns=['x', 'y']).assign(z=0, c='blue')
# Stack the positive rows on top of the negative rows in one DataFrame.
res = pd.concat([df_p, df_f])
res
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
 | x | y | z | c |
---|---|---|---|---|
0 | 4.108224 | 3.848929 | 1 | red |
1 | 2.342511 | 2.953446 | 1 | red |
2 | 0.789860 | 2.732742 | 1 | red |
3 | 2.842918 | 3.590247 | 1 | red |
4 | 3.097725 | 3.576919 | 1 | red |
5 | 3.661894 | 3.350869 | 1 | red |
6 | 2.317196 | 2.281296 | 1 | red |
7 | 2.117827 | 1.821161 | 1 | red |
8 | 1.042247 | 2.022398 | 1 | red |
9 | 1.350079 | 4.057907 | 1 | red |
10 | 2.171275 | 3.204408 | 1 | red |
11 | 1.715535 | 3.081039 | 1 | red |
12 | 2.166927 | 2.049468 | 1 | red |
13 | 2.674912 | 1.479511 | 1 | red |
14 | 3.503797 | 1.912558 | 1 | red |
15 | 1.772712 | 0.220415 | 1 | red |
16 | 2.768057 | 2.251231 | 1 | red |
17 | 0.456425 | 1.681859 | 1 | red |
18 | 2.609217 | 3.171372 | 1 | red |
19 | 3.029760 | 2.948293 | 1 | red |
20 | 2.371976 | 3.085000 | 1 | red |
21 | 1.887890 | 3.481395 | 1 | red |
22 | 1.921902 | 2.226114 | 1 | red |
23 | 1.541674 | 0.752714 | 1 | red |
24 | 4.826169 | 2.502664 | 1 | red |
25 | 0.333282 | 1.014494 | 1 | red |
26 | 2.618868 | 2.003866 | 1 | red |
27 | 1.358006 | 1.559485 | 1 | red |
28 | 2.292783 | 1.349140 | 1 | red |
29 | 3.553238 | -0.243554 | 1 | red |
... | ... | ... | ... | ... |
70 | -0.073927 | -0.320674 | 0 | blue |
71 | -1.252395 | -1.242636 | 0 | blue |
72 | -0.656555 | 0.467408 | 0 | blue |
73 | 0.558789 | 1.602697 | 0 | blue |
74 | -0.242620 | 0.732277 | 0 | blue |
75 | 1.190387 | 0.718793 | 0 | blue |
76 | 0.887169 | 1.408336 | 0 | blue |
77 | 1.010919 | -0.007951 | 0 | blue |
78 | 0.939187 | -0.307889 | 0 | blue |
79 | 0.178746 | 1.282570 | 0 | blue |
80 | 0.270536 | -0.370093 | 0 | blue |
81 | 0.375603 | 1.449885 | 0 | blue |
82 | 2.155806 | 0.682002 | 0 | blue |
83 | -1.048934 | -0.018362 | 0 | blue |
84 | -0.123291 | -0.632417 | 0 | blue |
85 | -0.548397 | -0.417168 | 0 | blue |
86 | 1.197207 | 1.201911 | 0 | blue |
87 | -0.653190 | 1.413521 | 0 | blue |
88 | -0.025027 | -0.630535 | 0 | blue |
89 | -0.834319 | -0.213447 | 0 | blue |
90 | -0.261550 | 1.198524 | 0 | blue |
91 | 0.059145 | -0.478244 | 0 | blue |
92 | -1.061583 | 0.007730 | 0 | blue |
93 | 0.349603 | -0.490094 | 0 | blue |
94 | 1.070271 | -0.156812 | 0 | blue |
95 | -0.035461 | 0.849317 | 0 | blue |
96 | 0.189620 | 0.032218 | 0 | blue |
97 | -0.236485 | -0.734719 | 0 | blue |
98 | 0.810837 | 1.736817 | 0 | blue |
99 | -1.173467 | 0.862576 | 0 | blue |
200 rows × 4 columns
import matplotlib.pyplot as plt
# Pull out the coordinate and colour columns; the same three series are
# reused when the classification result is plotted.
x, y, c = (res[col] for col in ('x', 'y', 'c'))
# Scatter plot of the whole data set, each point drawn in its class colour.
plt.scatter(x, y, marker="o", color=c)
plt.title('random data')
plt.xlabel('x')
plt.ylabel('y')
plt.show()
# Renumber the rows from 0, because after the concatenation both halves
# carry the same index values.
res.reset_index(drop=True, inplace=True)
# Rows whose index is a multiple of 4 are held out as the test set; all
# remaining rows form the training set.
is_test_row = res.index % 4 == 0
test = res[is_test_row]
train = res[~is_test_row]
# Training features and training labels.
X = train[['x', 'y']]
Z = train['z']
from sklearn import svm
# Create a support vector classifier with a linear kernel.
clf = svm.SVC(kernel='linear')
# Fit the classifier on the training features X and training labels Z.
clf.fit(X, Z)
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
# Accuracy of the fitted classifier on the training set (X, Z).
clf.score(X, Z)
0.94
# Accuracy of the fitted classifier on the held-out test set.
clf.score(test[['x','y']],test['z'])
0.96
# ---------------- Prepare to draw the SVM classification boundary ----------------
# Bounding box of the data set, padded by 1 on every side.
x_min = res["x"].min() - 1
x_max = res["x"].max() + 1
y_min = res["y"].min() - 1
y_max = res["y"].max() + 1
# Coordinate matrices of a regular grid covering the box, spacing 0.2.
grid_step = 0.2
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, grid_step),
    np.arange(y_min, y_max, grid_step),
)
xx, yy
(array([[-3.45982214, -3.25982214, -3.05982214, ..., 5.34017786,
5.54017786, 5.74017786],
[-3.45982214, -3.25982214, -3.05982214, ..., 5.34017786,
5.54017786, 5.74017786],
[-3.45982214, -3.25982214, -3.05982214, ..., 5.34017786,
5.54017786, 5.74017786],
...,
[-3.45982214, -3.25982214, -3.05982214, ..., 5.34017786,
5.54017786, 5.74017786],
[-3.45982214, -3.25982214, -3.05982214, ..., 5.34017786,
5.54017786, 5.74017786],
[-3.45982214, -3.25982214, -3.05982214, ..., 5.34017786,
5.54017786, 5.74017786]]),
array([[-3.48142786, -3.48142786, -3.48142786, ..., -3.48142786,
-3.48142786, -3.48142786],
[-3.28142786, -3.28142786, -3.28142786, ..., -3.28142786,
-3.28142786, -3.28142786],
[-3.08142786, -3.08142786, -3.08142786, ..., -3.08142786,
-3.08142786, -3.08142786],
...,
[ 5.31857214, 5.31857214, 5.31857214, ..., 5.31857214,
5.31857214, 5.31857214],
[ 5.51857214, 5.51857214, 5.51857214, ..., 5.51857214,
5.51857214, 5.51857214],
[ 5.71857214, 5.71857214, 5.71857214, ..., 5.71857214,
5.71857214, 5.71857214]]))
# Flatten both grid coordinate matrices and pair them column-wise, so every
# row is one (x, y) grid point whose label is still unknown.
# NOTE: `gird` is a misspelling of "grid"; the name is kept because the
# prediction step reads it under this name.
gird=np.c_[xx.ravel(), yy.ravel()]
gird
array([[-3.45982214, -3.48142786],
[-3.25982214, -3.48142786],
[-3.05982214, -3.48142786],
...,
[ 5.34017786, 5.71857214],
[ 5.54017786, 5.71857214],
[ 5.74017786, 5.71857214]])
# Predict a class label for every unlabeled grid point.
label=clf.predict(gird)
label
array([0, 0, 0, ..., 1, 1, 1], dtype=int64)
# Give the flat predictions the same 2-D shape as the grid so they can be
# drawn as filled contours.
label_2 = np.reshape(label, xx.shape)
# Shade the grid by predicted class.
plt.contourf(xx, yy, label_2, alpha=0.5, cmap='hot')
# Overlay the original samples in their class colours.
plt.scatter(x, y, marker="o", color=c)
plt.title('result')
plt.xlabel('x')
plt.ylabel('y')
plt.show()