Implementing SVM with scikit-learn

import numpy as np
# Generate 100 samples from a 2-D standard normal distribution: center (0, 0), standard deviation σ = 1
p = np.random.randn(100, 2)
# Shift the center to (2.5, 2.5); these points form the positive class
for i in range(100):
    p[i][0] += 2.5
    p[i][1] += 2.5
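As an aside, the same shift can be done with NumPy broadcasting instead of an explicit Python loop; a minimal equivalent sketch (not part of the original code):

# Broadcasting adds the offset (2.5, 2.5) to every row in one step
p = np.random.randn(100, 2) + np.array([2.5, 2.5])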

# Generate another 100 samples: center (0, 0), standard deviation σ = 1; these form the negative class
f = np.random.randn(100, 2)
import pandas as pd

# Convert the NumPy array into a DataFrame
df_p = pd.DataFrame(p, columns=['x', 'y'])
# Add the label column z; the positive class gets label 1
df_p['z'] = 1
# Add the positive-class plotting colour: red
df_p['c'] = 'red'

# Convert the NumPy array into a DataFrame
df_f = pd.DataFrame(f, columns=['x', 'y'])
# Add the label column z; the negative class gets label 0
df_f['z'] = 0
# Add the negative-class plotting colour: blue
df_f['c'] = 'blue'

# Merge the positive and negative classes into a single DataFrame
res = pd.concat([df_p, df_f], axis=0)
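Note that pd.concat keeps each input frame's original index, so the row labels 0–99 appear twice in res; the index is reset further below before splitting. If preferred, the duplication can be avoided already at this step, as in this small alternative sketch:

# Alternative: let pandas renumber the merged rows 0..199 directly
res = pd.concat([df_p, df_f], axis=0, ignore_index=True)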
res
           x         y  z     c
0   4.108224  3.848929  1   red
1   2.342511  2.953446  1   red
2   0.789860  2.732742  1   red
3   2.842918  3.590247  1   red
4   3.097725  3.576919  1   red
..       ...       ... ..   ...
95 -0.035461  0.849317  0  blue
96  0.189620  0.032218  0  blue
97 -0.236485 -0.734719  0  blue
98  0.810837  1.736817  0  blue
99 -1.173467  0.862576  0  blue

200 rows × 4 columns

import matplotlib.pyplot as plt
x = res['x']
y = res['y']
c = res['c']

# Draw a scatter plot of the data set
plt.scatter(x, y, color=c, marker="o")

plt.xlabel('x')
plt.ylabel('y')
plt.title('random data')
plt.show()

[Figure: scatter plot of the generated data, positive class in red, negative class in blue]

# Reset the index of the merged data set, because the row indices are duplicated after concatenation
res.reset_index(inplace=True, drop=True)
# Take the rows whose index is a multiple of 4 as the test set
test = res[(res.index % 4 == 0)]
# Take the remaining rows as the training set
train = res[(res.index % 4 != 0)]
# Training-set features
X = train[['x', 'y']]
# Training-set labels
Z = train['z']
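Taking every fourth row as the test set gives a roughly 75/25 split without shuffling. scikit-learn's train_test_split offers a shuffled alternative; a small sketch on the same DataFrame (the names train_alt and test_alt are illustrative):

from sklearn.model_selection import train_test_split

# Shuffled 75/25 split; random_state fixes the shuffle for reproducibility
train_alt, test_alt = train_test_split(res, test_size=0.25, random_state=0)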
from sklearn import svm
# Create an SVC classifier with a linear kernel
clf = svm.SVC(kernel='linear')
# Train the classifier
clf.fit(X, Z)
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
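Because the kernel is linear, the fitted model exposes the separating hyperplane directly through its standard attributes; a short sketch:

# Weight vector w and bias b of the hyperplane w·x + b = 0
print(clf.coef_, clf.intercept_)
# Training points lying on or inside the margin
print(clf.support_vectors_.shape)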
# Accuracy on the training set
clf.score(X, Z)
0.94
# Accuracy on the test set
clf.score(test[['x', 'y']], test['z'])
0.96
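Accuracy can also be broken down per class with sklearn.metrics; a small sketch on the same test split:

from sklearn.metrics import classification_report, confusion_matrix

pred = clf.predict(test[['x', 'y']])
print(confusion_matrix(test['z'], pred))
print(classification_report(test['z'], pred))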
################################## Plotting the SVM decision boundary ##################################
# Find the bounds of the data set, padded by 1 on each side
x_min, x_max = res["x"].min() - 1, res["x"].max() + 1
y_min, y_max = res["y"].min() - 1, res["y"].max() + 1
# Build a grid of coordinates covering that range with step 0.2
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.2), np.arange(y_min, y_max, 0.2))
xx, yy
(array([[-3.45982214, -3.25982214, -3.05982214, ...,  5.34017786,
          5.54017786,  5.74017786],
        [-3.45982214, -3.25982214, -3.05982214, ...,  5.34017786,
          5.54017786,  5.74017786],
        [-3.45982214, -3.25982214, -3.05982214, ...,  5.34017786,
          5.54017786,  5.74017786],
        ...,
        [-3.45982214, -3.25982214, -3.05982214, ...,  5.34017786,
          5.54017786,  5.74017786],
        [-3.45982214, -3.25982214, -3.05982214, ...,  5.34017786,
          5.54017786,  5.74017786],
        [-3.45982214, -3.25982214, -3.05982214, ...,  5.34017786,
          5.54017786,  5.74017786]]),
 array([[-3.48142786, -3.48142786, -3.48142786, ..., -3.48142786,
         -3.48142786, -3.48142786],
        [-3.28142786, -3.28142786, -3.28142786, ..., -3.28142786,
         -3.28142786, -3.28142786],
        [-3.08142786, -3.08142786, -3.08142786, ..., -3.08142786,
         -3.08142786, -3.08142786],
        ...,
        [ 5.31857214,  5.31857214,  5.31857214, ...,  5.31857214,
          5.31857214,  5.31857214],
        [ 5.51857214,  5.51857214,  5.51857214, ...,  5.51857214,
          5.51857214,  5.51857214],
        [ 5.71857214,  5.71857214,  5.71857214, ...,  5.71857214,
          5.71857214,  5.71857214]]))
# Flatten the grid matrices into a list of (x, y) points with unknown labels
grid = np.c_[xx.ravel(), yy.ravel()]
grid
array([[-3.45982214, -3.48142786],
       [-3.25982214, -3.48142786],
       [-3.05982214, -3.48142786],
       ...,
       [ 5.34017786,  5.71857214],
       [ 5.54017786,  5.71857214],
       [ 5.74017786,  5.71857214]])
# Predict a label for every grid point
label = clf.predict(grid)
label
array([0, 0, 0, ..., 1, 1, 1], dtype=int64)
# Reshape the labels to xx.shape so they can be drawn as filled contours
label_2 = label.reshape(xx.shape)

# Draw the predicted regions as filled contours
plt.contourf(xx, yy, label_2, cmap='hot', alpha=0.5)
# Overlay the original data points
plt.scatter(x, y, color=c, marker="o")

plt.xlabel('x')
plt.ylabel('y')
plt.title('result')
plt.show()

[Figure: SVM decision regions with the original data points overlaid]
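Because the kernel is linear, the boundary in the plot above is the straight line w0*x + w1*y + b = 0. It can also be drawn explicitly from clf.coef_ and clf.intercept_, and contouring clf.decision_function shows the margin as well; a sketch building on the variables defined above:

# Solve w0*x + w1*y + b = 0 for y and draw the separating line
w = clf.coef_[0]
b = clf.intercept_[0]
line_x = np.linspace(x_min, x_max, 50)
line_y = -(w[0] * line_x + b) / w[1]
plt.plot(line_x, line_y, 'k--')

# The -1 and +1 levels of the decision function mark the margin
dist = clf.decision_function(grid).reshape(xx.shape)
plt.contour(xx, yy, dist, levels=[-1, 1], linestyles='--', colors='gray')

plt.scatter(x, y, color=c, marker="o")
plt.ylim(y_min, y_max)
plt.show()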

The code is hosted on GitHub:

https://github.com/matrix-yang/machine_learn_record