(1)数据预处理
a.对文本数据进行贴标签处理,标签数据类似如下:
平素体质:健康状况:良,既往有“高血压病史”多年。#1
其中1表示患有高血压,0表示没有患有高血压。
然后进行分开,文本存储在一个文件,标签存储在一个文件,文本内容和标签行对行对应。
b.对文本文件的内容进行分词。
import jieba
# Read the labelled corpus and split it into two line-aligned files:
# one with the space-separated jieba tokens of each text, one with its label.
# Each input line is expected to look like "<text>#<label>".
# The output files are written as UTF-8 explicitly because they are read back
# as UTF-8 later; relying on the platform default encoding could corrupt them.
with open(u'/home/ubuntu/file/数据平衡分类', encoding='utf-8') as file, \
        open(u'/home/ubuntu/file/数据平衡无分类', 'w', encoding='utf-8') as filenoclass, \
        open(u'/home/ubuntu/file/数据平衡分类结果', 'w', encoding='utf-8') as fileclass:
    for lines in file:
        # Split on the LAST '#' so that a '#' inside the text cannot be
        # mistaken for the text/label separator.
        content, separator, label = lines.strip().rpartition('#')
        if not separator:
            # No label on this line (e.g. a blank line). Skip it in BOTH
            # outputs so text and labels stay aligned line by line.
            continue
        # Every token is followed by a single space, then the line is ended.
        filenoclass.write(''.join(seg + " " for seg in jieba.cut(content)))
        filenoclass.write('\n')
        fileclass.write(label + '\n')
(2)训练doc2vec得到文本向量
import gensim
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import train_test_split
# Open the tokenised corpus and the label file. Both handles stay open at
# module level; this section never closes them. `fileclass` is read later by
# getData().
file = open(u'/home/ubuntu/file/数据平衡无分类', encoding='utf-8')
fileclass=open(u'/home/ubuntu/file/数据平衡分类结果',encoding='utf-8')
# Wrap the corpus for gensim (per the gensim docs, TaggedLineDocument treats
# each line as one document tagged with its line number).
documents = gensim.models.doc2vec.TaggedLineDocument(file)
# Train the model at construction time with 100-dimensional vectors.
# NOTE(review): `size=` and `.docvecs` look like the pre-4.0 gensim names
# (`vector_size=` / `.dv` in 4.x) - confirm against the installed version.
model = gensim.models.Doc2Vec(documents, size=100, window=8, min_count=100, workers=8)
# Generate the text vectors (prints the vector stored under index 1 as a check)
print(model.docvecs[1])
(3)准备进行分类的数据
def getData():
    """Build the train/test split of document vectors and their labels.

    Reads one label per line from the module-level ``fileclass`` handle and
    one vector per document from the module-level ``model.docvecs``, then
    splits them 60/40 with a fixed random seed.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)``. The ``X_*`` values are
        DataFrames with one row per document (index ``'p0'``, ``'p1'``, ...)
        containing only the vector components; the ``y_*`` values are Series
        named ``'class0'`` holding the label strings.

    Raises:
        ValueError: if the number of labels differs from the number of
            document vectors.
    """
    # Rewind first: the handle is shared module state, so without this a
    # second call would read zero labels from the exhausted file.
    fileclass.seek(0)
    tigs = [tig.strip() for tig in fileclass]
    data_dict = {'p' + str(i): model.docvecs[i] for i in range(len(model.docvecs))}
    print(tigs)
    print(data_dict)
    # Columns are documents after construction; transpose to one row each.
    data = pd.DataFrame(data_dict).T
    # Keep the labels in their own Series instead of adding them as a column
    # of `data`: otherwise the target leaks into the feature matrix and the
    # classifier can simply read the answer.
    labels = pd.Series(tigs, index=data.index, name='class0')
    X_train1, X_test1, y_train1, y_test1 = train_test_split(
        data, labels, test_size=0.4, random_state=0)
    return X_train1, y_train1, X_test1, y_test1
(4)准备测试方法
def getRecognitionRate(testPre, testClass):
    """Return the fraction of predictions that equal the true labels.

    Args:
        testPre: sequence of predicted labels.
        testClass: true labels in the same order as ``testPre``; may be a
            list, an array or a pandas Series with any index.

    Returns:
        Accuracy as a float in [0, 1]; ``0.0`` when ``testPre`` is empty.

    Raises:
        IndexError: if ``testClass`` has fewer items than ``testPre``.
    """
    testNum = len(testPre)
    if testNum == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    # Materialise the labels so that indexing below is positional. Indexing a
    # pandas Series with an int is a lookup by index label, which misaligns or
    # raises KeyError once the rows have been shuffled by a train/test split.
    actual = list(testClass)
    rightNum = sum(1 for i, predicted in enumerate(testPre) if actual[i] == predicted)
    return float(rightNum) / float(testNum)
(5)进行模型训练
import gensim
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import train_test_split
# Open the tokenised corpus and the label file. Both handles stay open at
# module level; this section never closes them. `fileclass` is read later by
# getData().
file = open(u'/home/ubuntu/file/数据平衡无分类', encoding='utf-8')
fileclass=open(u'/home/ubuntu/file/数据平衡分类结果',encoding='utf-8')
# Wrap the corpus for gensim (per the gensim docs, TaggedLineDocument treats
# each line as one document tagged with its line number).
documents = gensim.models.doc2vec.TaggedLineDocument(file)
# Train the model at construction time with 100-dimensional vectors.
# NOTE(review): `size=` and `.docvecs` look like the pre-4.0 gensim names
# (`vector_size=` / `.dv` in 4.x) - confirm against the installed version.
model = gensim.models.Doc2Vec(documents, size=100, window=8, min_count=100, workers=8)
# Generate the text vectors (prints the vector stored under index 1 as a check)
print(model.docvecs[1])
# Predict with logistic regression
def LR():
    """Create an untrained logistic-regression classifier with default settings."""
    return LogisticRegression()
def getRecognitionRate(testPre, testClass):
    """Return the fraction of predictions that equal the true labels.

    Args:
        testPre: sequence of predicted labels.
        testClass: true labels in the same order as ``testPre``; may be a
            list, an array or a pandas Series with any index.

    Returns:
        Accuracy as a float in [0, 1]; ``0.0`` when ``testPre`` is empty.

    Raises:
        IndexError: if ``testClass`` has fewer items than ``testPre``.
    """
    testNum = len(testPre)
    if testNum == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    # Materialise the labels so that indexing below is positional. Indexing a
    # pandas Series with an int is a lookup by index label, which misaligns or
    # raises KeyError once the rows have been shuffled by a train/test split.
    actual = list(testClass)
    rightNum = sum(1 for i, predicted in enumerate(testPre) if actual[i] == predicted)
    return float(rightNum) / float(testNum)
def getData():
    """Build the train/test split of document vectors and their labels.

    Reads one label per line from the module-level ``fileclass`` handle and
    one vector per document from the module-level ``model.docvecs``, then
    splits them 60/40 with a fixed random seed.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)``. The ``X_*`` values are
        DataFrames with one row per document (index ``'p0'``, ``'p1'``, ...)
        containing only the vector components; the ``y_*`` values are Series
        named ``'class0'`` holding the label strings.

    Raises:
        ValueError: if the number of labels differs from the number of
            document vectors.
    """
    # Rewind first: the handle is shared module state, so without this a
    # second call would read zero labels from the exhausted file.
    fileclass.seek(0)
    tigs = [tig.strip() for tig in fileclass]
    data_dict = {'p' + str(i): model.docvecs[i] for i in range(len(model.docvecs))}
    print(tigs)
    print(data_dict)
    # Columns are documents after construction; transpose to one row each.
    data = pd.DataFrame(data_dict).T
    # Keep the labels in their own Series instead of adding them as a column
    # of `data`: otherwise the target leaks into the feature matrix and the
    # classifier can simply read the answer.
    labels = pd.Series(tigs, index=data.index, name='class0')
    X_train1, X_test1, y_train1, y_test1 = train_test_split(
        data, labels, test_size=0.4, random_state=0)
    return X_train1, y_train1, X_test1, y_test1
# Build the split, fit the classifier on the training part and report the
# share of correctly predicted labels on the held-out part.
T = getData()
trainMatrix, trainClass, testMatrix, testClass = T
clf_LR = LR()
clf_LR.fit(trainMatrix, trainClass)
testPredictions = clf_LR.predict(testMatrix)
print('Logistic Regression recognition rate: ', getRecognitionRate(testPredictions, testClass))
程序员有偿接单和程序指导加QQ:734564390