Python机器学习之逻辑回归

  逻辑回归(Logistic Regression)是比较常见的机器学习算法之一,与SVM支持向量机类似,属于典型的 二分类算法。但逻辑回归相较于SVM的优点在于,前者不仅能够预测某事件是否将会发生,还包含该事件发生的可能性。例如想要知道张三是否会购买产品A,SVM所预测的结果只能告诉你,张三会购买或者不会购买,但逻辑回归能告诉你,张三有70%的概率购买产品A,从而你可以根据自己的业务设置风险率阀值。
  关于逻辑回归的数学公式百度、Google已经有很好的解答,遂不再赘述。本博文主要介绍如何利用Python实现逻辑回归算法,该算法并没有引用sklearn包,因此具有一定的复杂度,但所实现的算法具有相当高的灵活性,适用于大规模数据分析系统的开发。
  偏好使用sklearn的朋友,请跳转:sklearn主要模块和基本使用
  输入数据集下载地址:训练集train,测试集test,预测集predict.txt

· 算法构建代码

# python -version 3.5+
import numpy as np
from numpy import *

## 构建sigmoid函数
def sigmoid(x):
    return .5 * (1 + np.tanh(.5 * x))

## 训练逻辑回归模型
def trainLogRegres(train_x, train_y, opts):
    # 构建训练样本
    numSamples, numFeatures = shape(train_x)
    alpha = opts['alpha']
    maxIter = opts['maxIter']
    weights = ones((numFeatures, 1))

    # 选择最优的梯度下降法
    for k in range(maxIter):
        if opts['optimizeType'] == 'gradDescent': # 梯度下降法
            output = sigmoid(train_x * weights)
            error = train_y - output
            weights = weights + alpha * train_x.transpose() * error
        elif opts['optimizeType'] == 'stocGradDescent': # 随机梯度下降法
            for i in range(numSamples):
                output = sigmoid(train_x[i, :] * weights)
                error = train_y[i, 0] - output
                weights = weights + alpha * train_x[i, :].transpose() * error
        elif opts['optimizeType'] == 'smoothStocGradDescent': # 平稳随机梯度下降法
            dataIndex = list(range(numSamples))
            for i in range(numSamples):
                alpha = 4.0 / (1.0 + k + i) + 0.01
                randIndex = int(random.uniform(0, len(dataIndex)))
                output = sigmoid(train_x[randIndex, :] * weights)
                error = train_y[randIndex, 0] - output
                weights = weights + alpha * train_x[randIndex, :].transpose() * error
                del (dataIndex[randIndex])
        else:
            raise NameError('Not support optimize method type!')
    return weights

## 检验逻辑回归模型
def testLogRegres(weights, test_x, test_y):
    numSamples, numFeatures = shape(test_x)
    matchCount = 0
    for i in range(numSamples):
        predict = sigmoid(test_x[i, :] * weights)[0, 0] > 0.5 #设置阀值为0.5
        if predict == bool(test_y[i, 0]):
            matchCount += 1
    accuracy = float(matchCount) / numSamples
    return accuracy

## 预测逻辑回归模型
def preLogRegres(weights ,pre_x):
    numSamples, numFeatures = shape(pre_x)
    outfile = open('D://predict_result.txt', 'w') # 将概率值输出到文件
    for i in range(numSamples):
        predict = sigmoid(pre_x[i, :] * weights)[0, 0]
        predict_m = sigmoid(pre_x[i, :] * weights)[0, 0] > 0.5 #是否阀值定为0.5
        outfile.write(str(predict)+'\n')
        print(str(predict_m) + '  ' + str(predict)) #同时输出是否及概率值
    outfile.close()

· 测试结果代码

## step 1: 模型训练
def loadData(): # 载入训练集
    train_x = []
    train_y = []
    fileIn = open('D://train.txt')
    for line in fileIn.readlines():
        lineArr = line.split() #设置分隔符
        train_x.append([1.0, float(lineArr[0]), float(lineArr[1])])
        train_y.append(float(lineArr[-1]))
    return mat(train_x), mat(train_y).transpose()
    fileIn.close()
train_x, train_y = loadData()
opts = {'alpha': 0.01, 'maxIter': 20, 'optimizeType': 'smoothStocGradDescent'}  # 参数设置
optimalWeights = trainLogRegres(train_x, train_y, opts)

## step 2: 模型检验
def loadData1():
    test_x = []
    test_y = []
    fileIn = open('D://test.txt')
    for line in fileIn.readlines():
        lineArr = line.split()
        test_x.append([1.0, float(lineArr[0]), float(lineArr[1])])
        test_y.append(float(lineArr[-1]))
    return mat(test_x), mat(test_y).transpose()
    fileIn.close()
test_x, test_y = loadData1()
accuracy = testLogRegres(optimalWeights, test_x, test_y)
print('模型预测精度:' + str(accuracy))
print('================================')

## step 3: 模型预测
def loadData2():
    pre_x = []
    fileIn = open('D://predict.txt')
    for line in fileIn.readlines():
        lineArr = line.split()
        pre_x.append([1.0, float(lineArr[0]), float(lineArr[1])])
    return mat(pre_x)
    fileIn.close()
pre_x = loadData2()
print('模型预测结果:')
predict = preLogRegres(optimalWeights, pre_x)

· 部分结果展示

模型预测精度:0.9215686274509803
================================
模型预测结果:
True  0.969439134428
False  0.0121320687856
True  0.999891075052
False  0.0276136702724
True  0.948171859849
True  0.999981963204
True  0.960074174269
True  0.999639571954
True  0.999979133078
True  0.99509838304
True  0.999965399666
True  0.999948401765
False  0.00443630610109
···

分类: 机器学习算法