利用DBSCAN算法对双半月数据集进行聚类
1、生成数据集
class moon_data_class(object):
def __init__(self,N,d,r,w):
self.N=N
self.w=w
self.d=d
self.r=r
def sgn(self,x):
if(x>0):
return 1;
else:
return -1;
def sig(self,x):
return 1.0/(1+np.exp(x))
def dbmoon(self):
N1 = 10*self.N
N = self.N
r = self.r
w2 = self.w/2
d = self.d
done = True
data = np.empty(0)
while done:
#generate Rectangular data
tmp_x = 2*(r+w2)*(np.random.random([N1, 1])-0.5)
tmp_y = (r+w2)*np.random.random([N1, 1])
tmp = np.concatenate((tmp_x, tmp_y), axis=1)
tmp_ds = np.sqrt(tmp_x*tmp_x + tmp_y*tmp_y)
#generate double moon data ---upper
idx = np.logical_and(tmp_ds > (r-w2), tmp_ds < (r+w2))
idx = (idx.nonzero())[0]
if data.shape[0] == 0:
data = tmp.take(idx, axis=0)
else:
data = np.concatenate((data, tmp.take(idx, axis=0)), axis=0)
if data.shape[0] >= N:
done = False
#print (data)
db_moon = data[0:N, :]
#print (db_moon)
#generate double moon data ----down
data_t = np.empty([N, 2])
data_t[:, 0] = data[0:N, 0] + r
data_t[:, 1] = -data[0:N, 1] - d
db_moon = np.concatenate((db_moon, data_t), axis=0)
return db_moon
2、计算最佳半径
def epsilon(data,Minpts):
m,n = np.shape(data)
xMax = np.max(data)
xMin = np.min(data)
eps = ((np.prod(xMax-xMin) * Minpts * math.gamma(0.5*n+1))/(m*math.sqrt(math.pi **n)))**(1.0/n)
return eps
3、DBSCAN算法
def distance(data):
m, n = np.shape(data)
dis = np.mat(np.zeros((m, m)))
for i in range(m):
for j in range(i, m):
# 计算i和j之间的欧式距离
tmp = 0
for k in range(n):
tmp += (data[i, k] - data[j, k]) * (data[i, k] - data[j, k])
dis[i, j] = np.sqrt(tmp)
dis[j, i] = dis[i, j]
return dis
def find_eps(distance_D, eps):
ind = []
n = np.shape(distance_D)[1]
for j in range(n):
if distance_D[0, j] <= eps:
ind.append(j)
return ind
def dbscan(data, eps, MinPts):
m = np.shape(data)[0]
# 区分核心点1,边界点0和噪音点-1
types = np.mat(np.zeros((1, m)))
sub_class = np.mat(np.zeros((1, m)))
# 用于判断该点是否处理过,0表示未处理过
dealed = np.mat(np.zeros((m, 1)))
# 计算每个数据点之间的距离
dis = distance(data)
# 用于标记类别
number = 1
# 对每一个点进行处理
for i in range(m):
# 找到未处理的点
if dealed[i, 0] == 0:
# 找到第i个点到其他所有点的距离
D = dis[i, ]
# 找到半径eps内的所有点
ind = find_eps(D, eps)
# 区分点的类型
# 边界点
if len(ind) > 1 and len(ind) < MinPts + 1:
types[0, i] = 0
sub_class[0, i] = 0
# 噪音点
if len(ind) == 1:
types[0, i] = -1
sub_class[0, i] = -1
dealed[i, 0] = 1
# 核心点
if len(ind) >= MinPts + 1:
types[0, i] = 1
for x in ind:
sub_class[0, x] = number
# 判断核心点是否密度可达
while len(ind) > 0:
dealed[ind[0], 0] = 1
D = dis[ind[0], ]
tmp = ind[0]
del ind[0]
ind_1 = find_eps(D, eps)
if len(ind_1) > 1: # 处理非噪音点
for x1 in ind_1:
sub_class[0, x1] = number
if len(ind_1) >= MinPts + 1:
types[0, tmp] = 1
else:
types[0, tmp] = 0
for j in range(len(ind_1)):
if dealed[ind_1[j], 0] == 0:
dealed[ind_1[j], 0] = 1
ind.append(ind_1[j])
sub_class[0, ind_1[j]] = number
number += 1
# 最后处理所有未分类的点为噪音点
ind_2 = ((sub_class == 0).nonzero())[1]
for x in ind_2:
sub_class[0, x] = -1
types[0, x] = -1
return types, sub_class
4、运行结果
原图聚类结果
5 完整代码
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 15 21:33:41 2019
@author: ASUS
"""
import numpy as np
import pylab as pl
import random as rd
import math
import random
import matplotlib.pyplot as plt
import numpy as np
import mpl_toolkits.mplot3d
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
from scipy import *
from scipy.linalg import norm, pinv
from matplotlib import pyplot as plt
class moon_data_class(object):
def __init__(self,N,d,r,w):
self.N=N
self.w=w
self.d=d
self.r=r
def sgn(self,x):
if(x>0):
return 1;
else:
return -1;
def sig(self,x):
return 1.0/(1+np.exp(x))
def dbmoon(self):
N1 = 10*self.N
N = self.N
r = self.r
w2 = self.w/2
d = self.d
done = True
data = np.empty(0)
while done:
#generate Rectangular data
tmp_x = 2*(r+w2)*(np.random.random([N1, 1])-0.5)
tmp_y = (r+w2)*np.random.random([N1, 1])
tmp = np.concatenate((tmp_x, tmp_y), axis=1)
tmp_ds = np.sqrt(tmp_x*tmp_x + tmp_y*tmp_y)
#generate double moon data ---upper
idx = np.logical_and(tmp_ds > (r-w2), tmp_ds < (r+w2))
idx = (idx.nonzero())[0]
if data.shape[0] == 0:
data = tmp.take(idx, axis=0)
else:
data = np.concatenate((data, tmp.take(idx, axis=0)), axis=0)
if data.shape[0] >= N:
done = False
#print (data)
db_moon = data[0:N, :]
#print (db_moon)
#generate double moon data ----down
data_t = np.empty([N, 2])
data_t[:, 0] = data[0:N, 0] + r
data_t[:, 1] = -data[0:N, 1] - d
db_moon = np.concatenate((db_moon, data_t), axis=0)
return db_moon
def epsilon(data,Minpts):
m,n = np.shape(data)
xMax = np.max(data)
xMin = np.min(data)
eps = ((np.prod(xMax-xMin) * Minpts * math.gamma(0.5*n+1))/(m*math.sqrt(math.pi **n)))**(1.0/n)
return eps
def distance(data):
m, n = np.shape(data)
dis = np.mat(np.zeros((m, m)))
for i in range(m):
for j in range(i, m):
# 计算i和j之间的欧式距离
tmp = 0
for k in range(n):
tmp += (data[i, k] - data[j, k]) * (data[i, k] - data[j, k])
dis[i, j] = np.sqrt(tmp)
dis[j, i] = dis[i, j]
return dis
def find_eps(distance_D, eps):
ind = []
n = np.shape(distance_D)[1]
for j in range(n):
if distance_D[0, j] <= eps:
ind.append(j)
return ind
def dbscan(data, eps, MinPts):
m = np.shape(data)[0]
# 区分核心点1,边界点0和噪音点-1
types = np.mat(np.zeros((1, m)))
sub_class = np.mat(np.zeros((1, m)))
# 用于判断该点是否处理过,0表示未处理过
dealed = np.mat(np.zeros((m, 1)))
# 计算每个数据点之间的距离
dis = distance(data)
# 用于标记类别
number = 1
# 对每一个点进行处理
for i in range(m):
# 找到未处理的点
if dealed[i, 0] == 0:
# 找到第i个点到其他所有点的距离
D = dis[i, ]
# 找到半径eps内的所有点
ind = find_eps(D, eps)
# 区分点的类型
# 边界点
if len(ind) > 1 and len(ind) < MinPts + 1:
types[0, i] = 0
sub_class[0, i] = 0
# 噪音点
if len(ind) == 1:
types[0, i] = -1
sub_class[0, i] = -1
dealed[i, 0] = 1
# 核心点
if len(ind) >= MinPts + 1:
types[0, i] = 1
for x in ind:
sub_class[0, x] = number
# 判断核心点是否密度可达
while len(ind) > 0:
dealed[ind[0], 0] = 1
D = dis[ind[0], ]
tmp = ind[0]
del ind[0]
ind_1 = find_eps(D, eps)
if len(ind_1) > 1: # 处理非噪音点
for x1 in ind_1:
sub_class[0, x1] = number
if len(ind_1) >= MinPts + 1:
types[0, tmp] = 1
else:
types[0, tmp] = 0
for j in range(len(ind_1)):
if dealed[ind_1[j], 0] == 0:
dealed[ind_1[j], 0] = 1
ind.append(ind_1[j])
sub_class[0, ind_1[j]] = number
number += 1
# 最后处理所有未分类的点为噪音点
ind_2 = ((sub_class == 0).nonzero())[1]
for x in ind_2:
sub_class[0, x] = -1
types[0, x] = -1
return types, sub_class
if __name__ == "__main__":
step=0
color=['.r','.g','.b','.y']#颜色种类
dcolor=['*r','*g','*b','*y','or','og','ob','oy']#颜色种类
frames = []
N = 100
d = -5
r = 10
width = 1
MinPts= 50
data_source = moon_data_class(N, d, r, width)
data = data_source.dbmoon()
# x0 = [1 for x in range(1,401)]
input_cells = np.array([np.reshape(data[0:2*N, 0], len(data)), np.reshape(data[0:2*N, 1], len(data))]).transpose()
labels_pre = [[-1.] for y in range(1, 401)]
labels_pos = [[1. ] for y in range(1, 401)]
label=labels_pre+labels_pos
dataSet = np.mat(input_cells)
labels = np.mat(label)
eps = epsilon(data,MinPts)
types, sub_class = dbscan(data, eps, MinPts)
plt.plot(data[0:N, 0], data[0:N, 1], 'r*', data[N:2*N, 0], data[N:2*N, 1], 'r*')
plt.show()
sub_class = np.array(sub_class)
for i in range(200):
color_type = dcolor[int(sub_class[0][i])%8]
plt.plot(data[i, 0], data[i, 1],color_type)
# elif types[0][i]==2:
# plt.plot(data[i, 0], data[i, 1],'r*')
# elif types[0][i]==3:
# plt.plot(data[i, 0], data[i, 1],'b*')
# elif types[0][i]==4:
# plt.plot(data[i, 0], data[i, 1],'g*')
plt.show()