Implementing SVM with scikit-learn

import numpy as np
# Generate 100 samples from a 2-D standard normal distribution: center (0, 0), standard deviation σ = 1
p = np.random.randn(100, 2)
# Shift the center to (2.5, 2.5); these points form the positive class
for i in range(100):
    p[i][0] += 2.5
    p[i][1] += 2.5
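As an aside, the same shift can be done with NumPy broadcasting instead of an explicit Python loop; a minimal equivalent sketch (not part of the original code):

# Broadcasting adds the offset (2.5, 2.5) to every row in one step
p = np.random.randn(100, 2) + np.array([2.5, 2.5])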

# Generate another 100 samples: center (0, 0), standard deviation σ = 1; these form the negative class
f = np.random.randn(100, 2)
import pandas as pd

# Convert the NumPy array into a DataFrame
df_p = pd.DataFrame(p, columns=['x', 'y'])
# Add the label column z; the positive class gets label 1
df_p['z'] = 1
# Add the positive-class plotting colour: red
df_p['c'] = 'red'

# Convert the NumPy array into a DataFrame
df_f = pd.DataFrame(f, columns=['x', 'y'])
# Add the label column z; the negative class gets label 0
df_f['z'] = 0
# Add the negative-class plotting colour: blue
df_f['c'] = 'blue'

# Merge the positive and negative classes into a single DataFrame
res = pd.concat([df_p, df_f], axis=0)
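Note that pd.concat keeps each input frame's original index, so the row labels 0–99 appear twice in res; the index is reset further below before splitting. If preferred, the duplication can be avoided already at this step, as in this small alternative sketch:

# Alternative: let pandas renumber the merged rows 0..199 directly
res = pd.concat([df_p, df_f], axis=0, ignore_index=True)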
res
           x         y  z     c
0   4.108224  3.848929  1   red
1   2.342511  2.953446  1   red
2   0.789860  2.732742  1   red
3   2.842918  3.590247  1   red
4   3.097725  3.576919  1   red
..       ...       ... ..   ...
95 -0.035461  0.849317  0  blue
96  0.189620  0.032218  0  blue
97 -0.236485 -0.734719  0  blue
98  0.810837  1.736817  0  blue
99 -1.173467  0.862576  0  blue

200 rows × 4 columns

import matplotlib.pyplot as plt
x = res['x']
y = res['y']
c = res['c']

# Draw a scatter plot of the data set
plt.scatter(x, y, color=c, marker="o")

plt.xlabel('x')
plt.ylabel('y')
plt.title('random data')
plt.show()

[Figure: scatter plot of the generated data, positive class in red, negative class in blue]

# Reset the index of the merged data set, because the row indices are duplicated after concatenation
res.reset_index(inplace=True, drop=True)
# Take the rows whose index is a multiple of 4 as the test set
test = res[(res.index % 4 == 0)]
# Take the remaining rows as the training set
train = res[(res.index % 4 != 0)]
# Training-set features
X = train[['x', 'y']]
# Training-set labels
Z = train['z']
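Taking every fourth row as the test set gives a roughly 75/25 split without shuffling. scikit-learn's train_test_split offers a shuffled alternative; a small sketch on the same DataFrame (the names train_alt and test_alt are illustrative):

from sklearn.model_selection import train_test_split

# Shuffled 75/25 split; random_state fixes the shuffle for reproducibility
train_alt, test_alt = train_test_split(res, test_size=0.25, random_state=0)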
from sklearn import svm
# Create an SVC classifier with a linear kernel
clf = svm.SVC(kernel='linear')
# Train the classifier
clf.fit(X, Z)
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
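Because the kernel is linear, the fitted model exposes the separating hyperplane directly through its standard attributes; a short sketch:

# Weight vector w and bias b of the hyperplane w·x + b = 0
print(clf.coef_, clf.intercept_)
# Training points lying on or inside the margin
print(clf.support_vectors_.shape)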
# Accuracy on the training set
clf.score(X, Z)
0.94
# Accuracy on the test set
clf.score(test[['x', 'y']], test['z'])
0.96
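Accuracy can also be broken down per class with sklearn.metrics; a small sketch on the same test split:

from sklearn.metrics import classification_report, confusion_matrix

pred = clf.predict(test[['x', 'y']])
print(confusion_matrix(test['z'], pred))
print(classification_report(test['z'], pred))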
################################## Plotting the SVM decision boundary ##################################
# Find the bounds of the data set, padded by 1 on each side
x_min, x_max = res["x"].min() - 1, res["x"].max() + 1
y_min, y_max = res["y"].min() - 1, res["y"].max() + 1
# Build a grid of coordinates covering that range with step 0.2
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.2), np.arange(y_min, y_max, 0.2))
xx, yy
(array([[-3.45982214, -3.25982214, -3.05982214, ...,  5.34017786,
          5.54017786,  5.74017786],
        [-3.45982214, -3.25982214, -3.05982214, ...,  5.34017786,
          5.54017786,  5.74017786],
        [-3.45982214, -3.25982214, -3.05982214, ...,  5.34017786,
          5.54017786,  5.74017786],
        ...,
        [-3.45982214, -3.25982214, -3.05982214, ...,  5.34017786,
          5.54017786,  5.74017786],
        [-3.45982214, -3.25982214, -3.05982214, ...,  5.34017786,
          5.54017786,  5.74017786],
        [-3.45982214, -3.25982214, -3.05982214, ...,  5.34017786,
          5.54017786,  5.74017786]]),
 array([[-3.48142786, -3.48142786, -3.48142786, ..., -3.48142786,
         -3.48142786, -3.48142786],
        [-3.28142786, -3.28142786, -3.28142786, ..., -3.28142786,
         -3.28142786, -3.28142786],
        [-3.08142786, -3.08142786, -3.08142786, ..., -3.08142786,
         -3.08142786, -3.08142786],
        ...,
        [ 5.31857214,  5.31857214,  5.31857214, ...,  5.31857214,
          5.31857214,  5.31857214],
        [ 5.51857214,  5.51857214,  5.51857214, ...,  5.51857214,
          5.51857214,  5.51857214],
        [ 5.71857214,  5.71857214,  5.71857214, ...,  5.71857214,
          5.71857214,  5.71857214]]))
# Flatten the grid matrices into a list of (x, y) points with unknown labels
grid = np.c_[xx.ravel(), yy.ravel()]
grid
array([[-3.45982214, -3.48142786],
       [-3.25982214, -3.48142786],
       [-3.05982214, -3.48142786],
       ...,
       [ 5.34017786,  5.71857214],
       [ 5.54017786,  5.71857214],
       [ 5.74017786,  5.71857214]])
# Predict a label for every grid point
label = clf.predict(grid)
label
array([0, 0, 0, ..., 1, 1, 1], dtype=int64)
# Reshape the labels to xx.shape so they can be drawn as filled contours
label_2 = label.reshape(xx.shape)

# Draw the predicted regions as filled contours
plt.contourf(xx, yy, label_2, cmap='hot', alpha=0.5)
# Overlay the original data points
plt.scatter(x, y, color=c, marker="o")

plt.xlabel('x')
plt.ylabel('y')
plt.title('result')
plt.show()

[Figure: SVM decision regions with the original data points overlaid]
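Because the kernel is linear, the boundary in the plot above is the straight line w0*x + w1*y + b = 0. It can also be drawn explicitly from clf.coef_ and clf.intercept_, and contouring clf.decision_function shows the margin as well; a sketch building on the variables defined above:

# Solve w0*x + w1*y + b = 0 for y and draw the separating line
w = clf.coef_[0]
b = clf.intercept_[0]
line_x = np.linspace(x_min, x_max, 50)
line_y = -(w[0] * line_x + b) / w[1]
plt.plot(line_x, line_y, 'k--')

# The -1 and +1 levels of the decision function mark the margin
dist = clf.decision_function(grid).reshape(xx.shape)
plt.contour(xx, yy, dist, levels=[-1, 1], linestyles='--', colors='gray')

plt.scatter(x, y, color=c, marker="o")
plt.ylim(y_min, y_max)
plt.show()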

The code is hosted on GitHub:

https://github.com/matrix-yang/machine_learn_record