NLP计算文档相似度之TF-IDF

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Compute pairwise document similarity with scikit-learn TF-IDF.

Reads one space-tokenized document per line from TIAxmmc.txt, builds a
TF-IDF matrix, writes the non-zero weights per document to a text file,
and saves the full cosine-similarity matrix as CSV.
"""
import numpy
import os
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Example corpus layout (space-tokenized sentences, one document per line):
# corpus = ["我 来到 北京 清华大学", "我 他 来到 了 网易 杭研 大厦",
#           "小明 硕士 毕业 与 中国 科学院", "我 爱 北京 天安门"]

# Documents are separated by newlines.
with open(u'D:\python_noweightpathway\TIA\TIAxmmc.txt', 'r', encoding='utf8') as trainfile:
    corpus = trainfile.readlines()

# NOTE(fix): TfidfVectorizer already combines CountVectorizer and
# TfidfTransformer internally; the original code ran TfidfTransformer
# on top of TfidfVectorizer's output, applying the idf weighting twice.
# Use the vectorizer's output directly.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,  # max_features=n_features,
                                   stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(corpus)
# Vocabulary terms; NOTE: get_feature_names() was removed in sklearn >= 1.2,
# where get_feature_names_out() is the replacement.
word = tfidf_vectorizer.get_feature_names()
weight = tfidf.toarray()  # weight[i][j] = tf-idf of term j in document i

# Write "doc_id \t term_id:weight ..." (1-based ids), non-zero entries only.
with open("D:\python_noweightpathway\TIA\TIAsmilarity.txt", "w+") as f:
    for i in range(len(weight)):
        f.write(str(i + 1) + "\t")
        for j in range(len(word)):
            if weight[i][j] > 0:
                f.write(str(j + 1) + ":" + str(weight[i][j]) + " ")
        f.write("\n")
        print(i)  # progress indicator

# Rows of a TfidfVectorizer matrix are L2-normalized, so X @ X.T is the
# cosine-similarity matrix.
SimMatrix = (tfidf * tfidf.T).A
print(SimMatrix[1, 3])  # similarity between document index 1 and index 3

numpy.savetxt("D:\python_noweightpathway\TIA\SimMatrix.csv", SimMatrix, delimiter=",")  # persist similarity matrix

自己实现TF-IDF算法

# -*-coding:utf8-*-
"""Hand-rolled TF-IDF: tf = count / doc_len, idf = log(n_docs / (1 + df))."""
import pandas as pd
from numpy import *


def compute_tf_idf(words_list):
    """Compute a TF-IDF weight matrix for tokenized documents.

    Args:
        words_list: list of documents, each a list of token strings.

    Returns:
        pandas.DataFrame of shape (n_docs, n_terms) with sorted term
        columns; entry [doc, term] is tf * idf where tf = count/doc_len
        and idf = log(n_docs / (1 + document_frequency)).
    """
    # Sorted vocabulary for a deterministic column order (the original
    # built it from an unordered set).
    vocabulary = sorted({w for doc in words_list for w in doc})
    n_docs = len(words_list)
    tfidf = pd.DataFrame(0.0, index=range(n_docs), columns=vocabulary)
    for i, doc in enumerate(words_list):
        doc_len = len(doc)
        for w in doc:
            # FIX: term frequency is count/doc_len; the original added
            # doc_len per occurrence (count * doc_len).
            tfidf.loc[i, w] += 1.0 / doc_len
    # FIX: iterate the vocabulary, not words_list (the original looped
    # over the list of token lists, indexing columns with a list).
    for w in vocabulary:
        doc_freq = int((tfidf[w] != 0).sum())  # documents containing w
        tfidf[w] = tfidf[w] * log(n_docs / (1 + doc_freq))
    return tfidf


def main():
    """Read one space-tokenized document per line and print its TF-IDF."""
    with open("jiebaResult_stopWords.txt", "r", encoding="utf8") as file:
        words_list = [line.strip().split(" ") for line in file]
    print(compute_tf_idf(words_list))


if __name__ == "__main__":
    main()

算法理论学习:
TF-IDF算法理论学习
pandas中DataFrame学习

# -*-coding:utf8-*-
# Scratch pad for learning numpy array creation and pandas DataFrame basics.
import pandas as pd
from numpy import *

# A 3x4 array filled with zeros.
a = zeros([3, 4])
print(a)

# Count the documents that contain the target word, i.e. the non-zero entries.
a = [1, 2, 3, 0, 4]
nonzero_count = len([value for value in a if value != 0])
print(nonzero_count)

# Select a single DataFrame column and scale it in place.
d = {'col1': [1, 2], 'col2': [3, 4]}
df = pd.DataFrame(data=d)
df['col1'] = df['col1'] * 3
print(df)
相关推荐
©️2020 CSDN 皮肤主题: Age of Ai 设计师:meimeiellie 返回首页