Logistic Regression in Python Machine Learning
30 Aug 2017

Logistic regression is one of the more common machine learning algorithms. Like the support vector machine (SVM), it is a classic binary classifier, but it has one advantage over SVM: it predicts not only whether an event will happen, but also the probability that it happens. Suppose, for example, you want to know whether Zhang San will buy product A. An SVM can only tell you that Zhang San will or will not buy; logistic regression can tell you that Zhang San has a 70% probability of buying product A, so you can set a risk threshold that fits your own business.
The mathematics behind logistic regression is already well explained on Baidu and Google, so it is not repeated here. This post focuses on implementing the algorithm in Python without the sklearn package. The code is therefore somewhat more involved, but it is also far more flexible, and it is suitable for developing large-scale data-analysis systems.
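For reference, the model being fitted is the standard logistic model (textbook form, not specific to this post): the probability of the positive class is

$$P(y=1 \mid x) = \sigma(w^\top x) = \frac{1}{1 + e^{-w^\top x}},$$

and a sample is labeled positive when this probability exceeds the chosen threshold.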
Readers who prefer sklearn can jump to: Main modules and basic usage of sklearn
Download the input data sets: training set train, test set test, prediction set predict.txt
· Algorithm implementation code
# python -version 3.5+
import numpy as np
from numpy import *

## Build the sigmoid function
def sigmoid(x):
    return .5 * (1 + np.tanh(.5 * x))
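# Quick sanity check (my addition, not from the original post): the tanh form
# above equals the textbook 1 / (1 + exp(-x)) but avoids overflow warnings
# for large negative inputs. sigmoid_naive is an illustrative helper.
def sigmoid_naive(x):
    return 1.0 / (1.0 + np.exp(-x))

assert np.allclose(sigmoid(np.array([-5.0, 0.0, 5.0])),
                   sigmoid_naive(np.array([-5.0, 0.0, 5.0])))
print(sigmoid(np.array([-1000.0])))  # [ 0.], no RuntimeWarning from exp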
## Train the logistic regression model
def trainLogRegres(train_x, train_y, opts):
    # Read the training-sample dimensions and optimization parameters
    numSamples, numFeatures = shape(train_x)
    alpha = opts['alpha']
    maxIter = opts['maxIter']
    weights = ones((numFeatures, 1))
    # Run the chosen gradient-based optimizer
    for k in range(maxIter):
        if opts['optimizeType'] == 'gradDescent':  # batch gradient descent
            output = sigmoid(train_x * weights)
            error = train_y - output
            weights = weights + alpha * train_x.transpose() * error
        elif opts['optimizeType'] == 'stocGradDescent':  # stochastic gradient descent
            for i in range(numSamples):
                output = sigmoid(train_x[i, :] * weights)
                error = train_y[i, 0] - output
                weights = weights + alpha * train_x[i, :].transpose() * error
        elif opts['optimizeType'] == 'smoothStocGradDescent':  # smoothed SGD: decaying rate, random sample order
            dataIndex = list(range(numSamples))
            for i in range(numSamples):
                alpha = 4.0 / (1.0 + k + i) + 0.01  # decaying learning rate
                randIndex = int(random.uniform(0, len(dataIndex)))
                sampleIndex = dataIndex[randIndex]  # visit each row once per pass (without replacement)
                output = sigmoid(train_x[sampleIndex, :] * weights)
                error = train_y[sampleIndex, 0] - output
                weights = weights + alpha * train_x[sampleIndex, :].transpose() * error
                del dataIndex[randIndex]
        else:
            raise ValueError('Unsupported optimizeType: %r' % opts['optimizeType'])
    return weights
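# Note (added commentary): all three branches apply the same gradient-ascent
# step on the log-likelihood of the training data; in batch matrix form,
#     w <- w + alpha * X^T (y - sigmoid(X w))
# The stochastic variants apply the same update one sample at a time, and the
# "smooth" variant additionally decays alpha and samples rows in random order.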
## Evaluate the logistic regression model
def testLogRegres(weights, test_x, test_y):
    numSamples, numFeatures = shape(test_x)
    matchCount = 0
    for i in range(numSamples):
        predict = sigmoid(test_x[i, :] * weights)[0, 0] > 0.5  # classification threshold 0.5
        if predict == bool(test_y[i, 0]):
            matchCount += 1
    accuracy = float(matchCount) / numSamples
    return accuracy
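# Optional sketch (my addition, not from the original post): the 0.5 cut-off
# above is hard-coded; to apply the business risk threshold mentioned in the
# introduction, expose it as a parameter. testLogRegresAt is a hypothetical helper.
def testLogRegresAt(weights, test_x, test_y, threshold):
    numSamples, numFeatures = shape(test_x)
    matchCount = 0
    for i in range(numSamples):
        predict = sigmoid(test_x[i, :] * weights)[0, 0] > threshold
        if predict == bool(test_y[i, 0]):
            matchCount += 1
    return float(matchCount) / numSamples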
## Predict with the logistic regression model
def preLogRegres(weights, pre_x):
    numSamples, numFeatures = shape(pre_x)
    outfile = open('D://predict_result.txt', 'w')  # write the probabilities to a file
    for i in range(numSamples):
        predict = sigmoid(pre_x[i, :] * weights)[0, 0]  # predicted probability
        predict_m = predict > 0.5  # class label at the 0.5 threshold
        outfile.write(str(predict) + '\n')
        print(str(predict_m) + ' ' + str(predict))  # print the label together with the probability
    outfile.close()
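If you want to sanity-check the functions above without downloading the data files, here is a minimal smoke test on synthetic data (my own sketch; the generated data are illustrative, not the post's data sets):

import numpy as np
from numpy import mat, ones
np.random.seed(0)
n = 200
X = np.random.randn(n, 2)
y = (X[:, 0] + X[:, 1] > 0).astype(float)   # linearly separable labels
sx = mat(np.hstack([ones((n, 1)), X]))      # prepend the bias column, as loadData does below
sy = mat(y).transpose()
w = trainLogRegres(sx, sy, {'alpha': 0.01, 'maxIter': 50, 'optimizeType': 'gradDescent'})
print(testLogRegres(w, sx, sy))             # should print a value close to 1.0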
· Driver and test code
## step 1: train the model
def loadData():  # load the training set
    train_x = []
    train_y = []
    with open('D://train.txt') as fileIn:
        for line in fileIn.readlines():
            lineArr = line.split()  # whitespace-separated fields
            train_x.append([1.0, float(lineArr[0]), float(lineArr[1])])  # prepend the bias term
            train_y.append(float(lineArr[-1]))
    return mat(train_x), mat(train_y).transpose()

train_x, train_y = loadData()
opts = {'alpha': 0.01, 'maxIter': 20, 'optimizeType': 'smoothStocGradDescent'}  # parameter settings
optimalWeights = trainLogRegres(train_x, train_y, opts)
## step 2: evaluate the model
def loadData1():  # load the test set
    test_x = []
    test_y = []
    with open('D://test.txt') as fileIn:
        for line in fileIn.readlines():
            lineArr = line.split()
            test_x.append([1.0, float(lineArr[0]), float(lineArr[1])])
            test_y.append(float(lineArr[-1]))
    return mat(test_x), mat(test_y).transpose()

test_x, test_y = loadData1()
accuracy = testLogRegres(optimalWeights, test_x, test_y)
print('Model accuracy: ' + str(accuracy))
print('================================')
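As an aside, the three optimizers can be compared on the same split with a quick loop (my own sketch, reusing the objects loaded above):

for method in ('gradDescent', 'stocGradDescent', 'smoothStocGradDescent'):
    w = trainLogRegres(train_x, train_y, {'alpha': 0.01, 'maxIter': 20, 'optimizeType': method})
    print(method, testLogRegres(w, test_x, test_y))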
## step 3: predict with the model
def loadData2():  # load the prediction set (features only, no labels)
    pre_x = []
    with open('D://predict.txt') as fileIn:
        for line in fileIn.readlines():
            lineArr = line.split()
            pre_x.append([1.0, float(lineArr[0]), float(lineArr[1])])
    return mat(pre_x)

pre_x = loadData2()
print('Model predictions:')
preLogRegres(optimalWeights, pre_x)
· Sample output
Model accuracy: 0.9215686274509803
================================
Model predictions:
True 0.969439134428
False 0.0121320687856
True 0.999891075052
False 0.0276136702724
True 0.948171859849
True 0.999981963204
True 0.960074174269
True 0.999639571954
True 0.999979133078
True 0.99509838304
True 0.999965399666
True 0.999948401765
False 0.00443630610109
···
Category: Machine Learning Algorithms