Logistic回归问题

1. sklearn中关于LogisticRegression官方API的学习

# -*- coding: UTF-8 -*-
import numpy as np # 快速操作结构数组的工具
import pandas as pd # 数据分析处理工具

# ## logistic回归
#
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression.set_params
#
# 在多分类问题中,若multi_class为ovr,则学习算法使用one-vs-rest模式,若multi_class为multinomial,则使用交叉熵损失
# LogisticRegression使用liblinear、newton-cg、lbfgs求解器来实现正则化的Logistic回归,可以处理密集和稀疏输入
# 使用包含64位浮点数的C有序数组或CSR矩阵以获得最佳性能; 任何其他输入格式将被转换(和复制)
#
# paras:
#       penalty:str、l1|l2,默认l2,指定正则化中使用的惩罚项,newton-cg、sag和lbfgs仅支持l2惩罚项
#       dual:bool,默认False,选择对偶形式还是原始形式求解,对偶形式只适用于惩罚项为l2且求解器为liblinear的情况;当样本数大于特征数时,建议设为False
#       tol:float,可选,指定迭代终止的误差范围
#       C:float,默认1.0,正则化系数的倒数,必须为正数,值越小正则化强度越大
#       fit_intercept:bool,默认True,是否存在截距,若存在要加到决策函数
#       intercept_scaling:float,默认1,仅在求解器为liblinear且fit_intercept为True时有用
#       class_weight:dict|balanced,默认None,class_weight参数用于标示分类模型中各种类型的权重,可以不输入,即不考虑权重,或者说所有类型的权重一样。
#                    如果选择输入的话,可以选择balanced让类库自己计算类型权重,或者我们自己输入各个类型的权重
#       random_state:int,RandomState instance|None,可选,默认None,随机数种子,仅在求解器为sag、liblinear时有用
#       max_iter:int,可选,算法收敛的最大迭代次数,仅在求解器为newton-cg、sag和lbfgs时才有用
#       multi_class:str,ovr|multinomial,默认ovr
#       verbose:int,默认0,对于liblinear和lbfgs求解器,将verbose设置为任何正数以表示详细程度
#       warm_start:bool,默认False,设置为True时,重用上一次调用的解决方案以适合初始化,否则,只需擦除以前的解决方案。对于liblinear解算器没用。
#       n_jobs:int,默认1,若multi_class='ovr',则为对类进行并行化时使用的CPU核心数。当solver设置为'liblinear'时,无论是否指定了multi_class,都将忽略此参数
#       solver:str,{'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'},logistic回归损失函数的优化算法,一共5种;0.22版本之前默认'liblinear',0.22版本起默认'lbfgs'
#
# attributes:
#       coef_:决策函数中各特征的系数
#       intercept_:决策函数中的截距(偏置)
#       n_iter_:各类别实际的迭代次数
#
# methods:
#       decision_function(X):预测样本的可信度得分
#       densify():将系数矩阵转换成密集阵列格式
#       fit(X, y[, sample_weight]):根据给定的数据拟合模型
#       get_params([deep]):获取参数
#       predict(X):预测样本X的类标签
#       predict_log_proba(X):估计对数概率
#       predict_proba(X):概率估计
#       score(X, y[, sample_weight]):返回给定数据和标签的平均准确度
#       set_params(**params):设置参数
#       sparsify():将系数矩阵转换成稀疏格式
# Three-class sample set. Each row is [x1, x2, label]; label is 0, 1 or 2.
# The rows are stored in label order: 20 samples per class.
data = [
    # --- class 0 ---
    [-2.68420713, 0.32660731, 0],
    [-2.71539062, -0.16955685, 0],
    [-2.88981954, -0.13734561, 0],
    [-2.7464372, -0.31112432, 0],
    [-2.72859298, 0.33392456, 0],
    [-2.27989736, 0.74778271, 0],
    [-2.82089068, -0.08210451, 0],
    [-2.62648199, 0.17040535, 0],
    [-2.88795857, -0.57079803, 0],
    [-2.67384469, -0.1066917, 0],
    [-2.50652679, 0.65193501, 0],
    [-2.61314272, 0.02152063, 0],
    [-2.78743398, -0.22774019, 0],
    [-3.22520045, -0.50327991, 0],
    [-2.64354322, 1.1861949, 0],
    [-2.38386932, 1.34475434, 0],
    [-2.6225262, 0.81808967, 0],
    [-2.64832273, 0.31913667, 0],
    [-2.19907796, 0.87924409, 0],
    [-2.58734619, 0.52047364, 0],
    # --- class 1 ---
    [1.28479459, 0.68543919, 1],
    [0.93241075, 0.31919809, 1],
    [1.46406132, 0.50418983, 1],
    [0.18096721, -0.82560394, 1],
    [1.08713449, 0.07539039, 1],
    [0.64043675, -0.41732348, 1],
    [1.09522371, 0.28389121, 1],
    [-0.75146714, -1.00110751, 1],
    [1.04329778, 0.22895691, 1],
    [-0.01019007, -0.72057487, 1],
    [-0.5110862, -1.26249195, 1],
    [0.51109806, -0.10228411, 1],
    [0.26233576, -0.5478933, 1],
    [0.98404455, -0.12436042, 1],
    [-0.174864, -0.25181557, 1],
    [0.92757294, 0.46823621, 1],
    [0.65959279, -0.35197629, 1],
    [0.23454059, -0.33192183, 1],
    [0.94236171, -0.54182226, 1],
    [0.0432464, -0.58148945, 1],
    # --- class 2 ---
    [2.53172698, -0.01184224, 2],
    [1.41407223, -0.57492506, 2],
    [2.61648461, 0.34193529, 2],
    [1.97081495, -0.18112569, 2],
    [2.34975798, -0.04188255, 2],
    [3.39687992, 0.54716805, 2],
    [0.51938325, -1.19135169, 2],
    [2.9320051, 0.35237701, 2],
    [2.31967279, -0.24554817, 2],
    [2.91813423, 0.78038063, 2],
    [1.66193495, 0.2420384, 2],
    [1.80234045, -0.21615461, 2],
    [2.16537886, 0.21528028, 2],
    [1.34459422, -0.77641543, 2],
    [1.5852673, -0.53930705, 2],
    [1.90474358, 0.11881899, 2],
    [1.94924878, 0.04073026, 2],
    [3.48876538, 1.17154454, 2],
    [3.79468686, 0.25326557, 2],
    [1.29832982, -0.76101394, 2],
]
# Two-class sample set. Each row is [x1, x2, label]; label is 0 or 1.
data1 = [
    [-0.017612, 14.053064, 0],
    [-1.395634, 4.662541, 1],
    [-0.752157, 6.538620, 0],
    [-1.322371, 7.152853, 0],
    [0.423363, 11.054677, 0],
    [0.406704, 7.067335, 1],
    [0.667394, 12.741452, 0],
    [-2.460150, 6.866805, 1],
    [0.569411, 9.548755, 0],
    [-0.026632, 10.427743, 0],
    [0.850433, 6.920334, 1],
    [1.347183, 13.175500, 0],
    [1.176813, 3.167020, 1],
    [-1.781871, 9.097953, 0],
    [-0.566606, 5.749003, 1],
    [0.931635, 1.589505, 1],
    [-0.024205, 6.151823, 1],
    [-0.036453, 2.690988, 1],
    [-0.196949, 0.444165, 1],
    [1.014459, 5.754399, 1],
]
# Build the design matrix X and the label vector y from the three-class
# sample set.  Plain ndarrays are used instead of np.matrix: the ``np.mat``
# alias was removed in NumPy 2.0 and recent scikit-learn releases raise a
# TypeError when an estimator is given np.matrix input.
dataMat = np.asarray(data, dtype=float)
y = dataMat[:, 2]      # class labels as a 1-D vector
b = np.ones(y.shape)   # all-ones column standing in for the bias term b
# NOTE: LogisticRegression() is created with its defaults below, which already
# fit an intercept, so this bias column is redundant.  It is kept so that the
# two features stay in columns 1 and 2 of X, as the plotting code expects.
X = np.column_stack((b, dataMat[:, 0:2]))


# Optional feature standardization (disabled).  Uncomment to remove the mean
# and scale the features before fitting; the original author notes this is
# meant to speed things up.
# import sklearn.preprocessing as preprocessing
# scaler = preprocessing.StandardScaler()
# X = scaler.fit_transform(X)
# print(X)

# ======== Logistic regression ========

from sklearn import metrics
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X, y)
print('逻辑回归模型:\n',model)
# Evaluate the fitted model on the training samples themselves.
predicted = model.predict(X)       # predicted class label per sample
answer = model.predict_proba(X)    # per-class probability per sample
print(answer)

import matplotlib.pyplot as plt

# Draw the decision regions and the samples.
# First build a grid over the (x1, x2) range, padded by 0.5 on each side,
# and predict the class at every grid point.
h = 0.02  # grid step
x1_min, x1_max = X[:, 1].min() - .5, X[:, 1].max() + .5
x2_min, x2_max = X[:, 2].min() - .5, X[:, 2].max() + .5
xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, h), np.arange(x2_min, x2_max, h))
testMat = np.c_[xx1.ravel(), xx2.ravel()]   # one (x1, x2) row per grid point
# Prepend the same all-ones bias column that the training matrix carries.
testMat = np.column_stack((np.ones((testMat.shape[0], 1)), testMat))
Z = model.predict(testMat)

# Colour each grid cell by its predicted class.
Z = Z.reshape(xx1.shape)
plt.pcolormesh(xx1, xx2, Z, cmap=plt.cm.Paired)


# Scatter plot: x1 on the horizontal axis, x2 on the vertical axis, colour
# encodes the class.  'x' markers are the true labels, '.' markers are the
# model's predictions for the same points.
plt.scatter(X[:, 1], X[:, 2], c=y, marker='x')          # true labels
plt.scatter(X[:, 1], X[:, 2], c=predicted, marker='.')  # predicted labels

# Axis labels.
plt.xlabel("x")
plt.ylabel("y")

# Show the figure.
plt.show()

输出
二分类的结果:第i列表示属于第i类的概率
Logistic回归问题

Logistic回归问题

多分类的结果:
Logistic回归问题

Logistic回归问题

参考:
python机器学习库sklearn——逻辑回归
官方API

sklearn关于算法的具体描述可见:
英文文档
中文文档