Python使用doc2vec和LR进行文本分类

(1)数据预处理
a.对文本数据进行贴标签处理,标签数据类似如下:

平素体质:健康状况:良,既往有“高血压病史”多年。#1

其中“#”后面的数字是标签:1表示患有高血压,0表示没有患有高血压。
然后将文本和标签分开存储:文本存储在一个文件,标签存储在另一个文件,两个文件的内容逐行一一对应。
b.对文本文件的内容进行分词。

import jieba

# Read the labelled corpus and generate the segmented sentences.
# Every input line has the form "<text>#<label>". The text is segmented with
# jieba and written (space-separated) to one file; the label is written to a
# second file on the same line number, so both outputs stay aligned.
#
# The outputs are written explicitly as UTF-8 because the training step
# re-opens them with encoding='utf-8'; relying on the locale default would
# corrupt the Chinese text on platforms whose default encoding differs.
with open(u'/home/ubuntu/file/数据平衡分类', encoding='utf-8') as file, \
        open(u'/home/ubuntu/file/数据平衡无分类', 'w', encoding='utf-8') as filenoclass, \
        open(u'/home/ubuntu/file/数据平衡分类结果', 'w', encoding='utf-8') as fileclass:
    for lineno, lines in enumerate(file, 1):
        record = lines.strip()
        if not record:
            # Blank lines carry neither text nor label; skipping them in
            # both outputs keeps the two files aligned.
            continue
        # Split on the LAST '#': the label is the trailing field, and the
        # free text itself may legitimately contain '#'.
        content, sep, label = record.rpartition('#')
        if not sep:
            raise ValueError('line %d has no "#<label>" suffix' % lineno)
        # Each token is followed by a single space, then the line is ended.
        filenoclass.write(''.join(seg + " " for seg in jieba.cut(content)))
        filenoclass.write('\n')
        fileclass.write(label + '\n')


(2)训练doc2vec得到文本向量

import gensim
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import train_test_split

# Open the segmented-text file and the label file written by the
# preprocessing step. Neither handle is closed here; `fileclass` is read
# later by getData().
file = open(u'/home/ubuntu/file/数据平衡无分类', encoding='utf-8')
fileclass=open(u'/home/ubuntu/file/数据平衡分类结果',encoding='utf-8')
# TaggedLineDocument treats every line of the file as one document
# (tagged by its line number, per the gensim documentation).
documents = gensim.models.doc2vec.TaggedLineDocument(file)
# NOTE(review): `size=` and `model.docvecs` look like the pre-4.0 gensim
# names (`vector_size=` / `model.dv` in gensim >= 4.0) - confirm against the
# installed gensim version before changing them.
model = gensim.models.Doc2Vec(documents, size=100, window=8, min_count=100, workers=8)
# Print the learned document vector stored under tag 1 as a quick sanity check.
print(model.docvecs[1])

(3)准备进行分类的数据

def getData():
    """Build the train/test split from the Doc2Vec vectors and the label file.

    Reads two module-level objects: ``fileclass`` (open label file, one label
    per line) and ``model`` (the trained Doc2Vec model). Row ``p<i>`` of the
    feature table is ``model.docvecs[i]``.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)`` from a 60/40 split with
        ``random_state=0``. The feature frames hold only the vector
        components; the labels are returned separately as the ``class0``
        series, so the classifier never sees the label as an input feature.

    Raises:
        ValueError: if the number of labels differs from the number of
            document vectors.
    """
    # Rewind the shared handle so a repeated call re-reads the labels instead
    # of getting an empty list from an already exhausted file.
    fileclass.seek(0)
    tigs = [tig.strip() for tig in fileclass]
    data_dict = {'p' + str(i): model.docvecs[i] for i in range(len(model.docvecs))}
    print(tigs)
    print(data_dict)
    # The dict is column-per-document; transpose to get one row per document.
    data = pd.DataFrame(data_dict).T
    # Keep the labels OUT of the feature frame: leaving a 'class0' column in
    # the matrix passed as X leaks the target into the features.
    labels = pd.Series(tigs, index=data.index, name='class0')
    X_train1, X_test1, y_train1, y_test1 = train_test_split(
        data, labels, test_size=0.4, random_state=0)
    return X_train1, y_train1, X_test1, y_test1

(4)准备测试方法

def getRecognitionRate(testPre, testClass):
    """Return the fraction of predictions that equal the true labels.

    Predictions and labels are paired by position. ``testClass`` may be any
    iterable, including a pandas Series whose index is not ``0..n-1`` (which
    is what ``train_test_split`` returns); indexing such a Series with
    ``testClass[i]`` would look up index *labels* and misalign or fail.

    Args:
        testPre: sized sequence of predicted labels.
        testClass: iterable of true labels, in the same order as ``testPre``.

    Returns:
        float: number of matching positions divided by ``len(testPre)``.

    Raises:
        ZeroDivisionError: if ``testPre`` is empty.
    """
    testNum = len(testPre)
    # zip() walks both inputs in order, independent of any pandas index.
    rightNum = sum(1 for pre, cls in zip(testPre, testClass) if cls == pre)
    return float(rightNum) / float(testNum)

(5)进行模型训练

import gensim
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import train_test_split

# Open the segmented-text file and the label file written by the
# preprocessing step. Neither handle is closed here; `fileclass` is read
# later by getData().
file = open(u'/home/ubuntu/file/数据平衡无分类', encoding='utf-8')
fileclass=open(u'/home/ubuntu/file/数据平衡分类结果',encoding='utf-8')
# TaggedLineDocument treats every line of the file as one document
# (tagged by its line number, per the gensim documentation).
documents = gensim.models.doc2vec.TaggedLineDocument(file)
# NOTE(review): `size=` and `model.docvecs` look like the pre-4.0 gensim
# names (`vector_size=` / `model.dv` in gensim >= 4.0) - confirm against the
# installed gensim version before changing them.
model = gensim.models.Doc2Vec(documents, size=100, window=8, min_count=100, workers=8)
# Print the learned document vector stored under tag 1 as a quick sanity check.
print(model.docvecs[1])
#使用逻辑回归进行预测
def LR():
    """Create and return an unfitted LogisticRegression with default settings."""
    return LogisticRegression()
def getRecognitionRate(testPre, testClass):
    """Return the fraction of predictions that equal the true labels.

    Predictions and labels are paired by position. ``testClass`` may be any
    iterable, including a pandas Series whose index is not ``0..n-1`` (which
    is what ``train_test_split`` returns); indexing such a Series with
    ``testClass[i]`` would look up index *labels* and misalign or fail.

    Args:
        testPre: sized sequence of predicted labels.
        testClass: iterable of true labels, in the same order as ``testPre``.

    Returns:
        float: number of matching positions divided by ``len(testPre)``.

    Raises:
        ZeroDivisionError: if ``testPre`` is empty.
    """
    testNum = len(testPre)
    # zip() walks both inputs in order, independent of any pandas index.
    rightNum = sum(1 for pre, cls in zip(testPre, testClass) if cls == pre)
    return float(rightNum) / float(testNum)
def getData():
    """Build the train/test split from the Doc2Vec vectors and the label file.

    Reads two module-level objects: ``fileclass`` (open label file, one label
    per line) and ``model`` (the trained Doc2Vec model). Row ``p<i>`` of the
    feature table is ``model.docvecs[i]``.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)`` from a 60/40 split with
        ``random_state=0``. The feature frames hold only the vector
        components; the labels are returned separately as the ``class0``
        series, so the classifier never sees the label as an input feature.

    Raises:
        ValueError: if the number of labels differs from the number of
            document vectors.
    """
    # Rewind the shared handle so a repeated call re-reads the labels instead
    # of getting an empty list from an already exhausted file.
    fileclass.seek(0)
    tigs = [tig.strip() for tig in fileclass]
    data_dict = {'p' + str(i): model.docvecs[i] for i in range(len(model.docvecs))}
    print(tigs)
    print(data_dict)
    # The dict is column-per-document; transpose to get one row per document.
    data = pd.DataFrame(data_dict).T
    # Keep the labels OUT of the feature frame: leaving a 'class0' column in
    # the matrix passed as X leaks the target into the features.
    labels = pd.Series(tigs, index=data.index, name='class0')
    X_train1, X_test1, y_train1, y_test1 = train_test_split(
        data, labels, test_size=0.4, random_state=0)
    return X_train1, y_train1, X_test1, y_test1
# Split the data, fit the logistic-regression classifier on the training
# part, and report the recognition rate on the held-out part.
T = getData()
trainMatrix, trainClass, testMatrix, testClass = T
clf_LR = LR()
clf_LR.fit(trainMatrix, trainClass)
testPre = clf_LR.predict(testMatrix)
print('Logistic Regression recognition rate: ', getRecognitionRate(testPre, testClass))

程序员有偿接单和程序指导加QQ:734564390

已标记关键词 清除标记
©️2020 CSDN 皮肤主题: Age of Ai 设计师:meimeiellie 返回首页