Fully commented source code for single-neuron logistic classification (Logistic Regression)

test.py
# -*- coding: utf-8 -*-
from logistic_regression import LogisticRegressionClassifier
# 从文件读取数据
# 每行数据以\t隔开,最后一列为类标号
def load_data_set(datafile):
    """Load a tab-separated data file.

    Each non-empty line holds feature columns followed by a final
    class-label column (the sample files use 2 features + 1 label;
    the training set in the article uses 21 + 1).

    Args:
        datafile: path to the text file to read.

    Returns:
        (feature_data, label_data): a list of per-sample feature lists
        (floats) and a list of integer class labels.
    """
    feature_data = []
    label_data = []
    with open(datafile, 'r') as fr_file:
        for each_line in fr_file:
            # Skip blank lines (e.g. a trailing newline at EOF) so that
            # float('') does not raise ValueError.
            if not each_line.strip():
                continue
            one_line = each_line.split('\t')
            # All columns except the last are features for this sample.
            feature_data.append([float(v) for v in one_line[:-1]])
            # The last column is the label; strip() removes the newline
            # before the float -> int conversion.
            label_data.append(int(float(one_line[-1].strip())))
    return feature_data, label_data
def main():
    """Train a logistic classifier on the train file, evaluate on the test file."""
    # Paths of the training and test data files.
    train_file = r"data\train1.txt"
    test_file = r"data\test1.txt"
    # Feature lists (list of lists) and label lists for each split.
    train_x, train_y = load_data_set(train_file)
    test_x, test_y = load_data_set(test_file)
    # Echo the loaded data for inspection.
    for tag, payload in (('train_x = ', train_x),
                         ('train_y = ', train_y),
                         ('test_x = ', test_x),
                         ('test_y = ', test_y)):
        print(tag, payload)
    # Build the classifier and learn the weights.
    # alpha is the learning-rate step; max_cycles caps the iterations.
    classifier = LogisticRegressionClassifier()
    weigh = classifier.fit(train_x, train_y, alpha=0.1, max_cycles=100)
    # Score the held-out test data with the learned weights.
    classifier.predict(test_x, test_y, weigh)


# Script entry point.
if __name__ == "__main__":
    main()
logistic_regression.py
# -*- coding: utf-8 -*-
# !/usr/bin/python
import numpy as np
class LogisticRegressionClassifier():
    """Single-neuron logistic-regression classifier.

    Trained with full-batch gradient updates on the log-likelihood
    (the `label - prediction` error with a `+` update is gradient
    *ascent* on the likelihood, equivalently descent on the loss).
    """

    def __init__(self):
        # Placeholder kept for API compatibility; fit() receives alpha
        # directly and never stores it here.
        self._alpha = None

    def fit(self, train_x, train_y, alpha=0.01, max_cycles=100):
        """Train the model and return the learned weights.

        Args:
            train_x: feature data, m samples x n features (list of lists).
            train_y: class labels (0/1), length m.
            alpha: learning rate (step size).
            max_cycles: number of full-batch iterations.

        Returns:
            An (n, 1) weight column matrix.
        """
        # Swap in another optimizer here if gradient updates are not wanted.
        return self._grad_descent(train_x, train_y, alpha, max_cycles)

    def _grad_descent(self, feat_data, label_data, alpha, max_cycles):
        """Full-batch gradient optimization of the logistic weights."""
        data_mat = np.mat(feat_data)                # m x n feature matrix
        label_mat = np.mat(label_data).transpose()  # m x 1 label column
        m, n = np.shape(data_mat)
        # Start from all-ones weights, one per feature.
        weigh = np.ones((n, 1))
        for _ in range(max_cycles):
            # Forward pass: predicted probabilities, (m x n)(n x 1) = m x 1.
            hx = self._sigmoid(data_mat * weigh)
            # Prediction error per sample, m x 1.
            error = label_mat - hx
            # Batch update along the log-likelihood gradient X^T * error.
            weigh = weigh + alpha * data_mat.transpose() * error
        print('weigh = ', weigh)
        return weigh

    def predict(self, test_x, test_y, weigh):
        """Classify test_x with the learned weights and report the error rate.

        Args:
            test_x: feature data, m x n.
            test_y: true class labels (0/1), length m.
            weigh: (n, 1) weight matrix from fit().

        Returns:
            The misclassification rate in [0, 1].
        """
        data_mat = np.mat(test_x)                   # m x n
        label_mat = np.mat(test_y).transpose()      # m x 1
        hx = self._sigmoid(data_mat * weigh)        # m x 1 predicted probabilities
        print('hx = ', hx)
        m = len(hx)
        error = 0.0
        for i in range(m):
            # Threshold the probability at 0.5 (ties classified as 0).
            predicted = 1 if float(hx[i]) > 0.5 else 0
            print('\n', str(i + 1) + '-th sample ', int(label_mat[i]),
                  'is classfied as: ' + str(predicted), end='')
            # Count a miss whenever the prediction disagrees with the label.
            if predicted != int(label_mat[i]):
                error += 1.0
                print(" classify error.", end='')
        error_rate = error / m
        print('\n', "error rate is:", "%.4f" % error_rate)
        return error_rate

    def _sigmoid(self, fx):
        """Element-wise logistic function 1 / (1 + e^-x)."""
        return 1.0 / (1 + np.exp(-fx))
Training/test file data and format (tab-separated; last column is the class label):
2 1 1
3 2 1
1 2 0
1 3 0