一,安装pip3:
#sudo apt-get install python3-pip
二,安装jieba:
#sudo pip3 install jieba
三,安装sklearn:
#sudo pip3 install scikit-learn
四,安装sklearn依赖(numpy,scipy):
#sudo pip3 install numpy
#sudo pip3 install scipy
注:国内安装时可能出现time-out(超时)错误,解决办法是改用国内镜像源,如下:
#sudo pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple numpy
#sudo pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple scipy
五,简单实现分词并计算TF-IDF值:
<img src="http://images2015.cnblogs.com/blog/759962/201703/759962-20170323230728690-284211178.png">
#!/usr/bin/env python3
# coding=utf-8
"""Segment text files with jieba and compute per-document TF-IDF weights.

Pipeline:
  1. ``Tfi.fenci`` reads a raw text file, segments it with jieba and writes
     the space-separated tokens to ``res/<file>``.
  2. ``Tfi.cipin`` reads the segmented files back, computes TF-IDF with
     scikit-learn and writes one ``<word> <weight>`` line per vocabulary
     entry to ``fes/<file>``.
"""
import os
import re

import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


def _ensure_parent_dir(path):
    """Create the directory that will contain *path* if it does not exist."""
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)


class Tfi(object):
    """Word segmentation plus TF-IDF computation over a set of text files.

    Attributes:
        stop_list: tokens to drop during segmentation (empty by default;
            callers may fill it before calling ``fenci``).
    """

    def __init__(self):
        self.stop_list = []

    def fenci(self, file):
        """Segment *file* and write the tokens to ``res/<file>``.

        Only runs of ASCII letters, digits and CJK characters in the range
        U+4E00..U+9FA5 are kept; everything else (punctuation, whitespace)
        is discarded before segmentation. Tokens found in ``self.stop_list``
        are skipped. Each written token is followed by a single space.

        Args:
            file: path of the input text file, read as UTF-8. The same
                relative path is reused under ``res/`` for the output.
        """
        # Explicit UTF-8 so the result does not depend on the locale default.
        with open(file, 'r', encoding='utf-8') as fin:
            raw_text = fin.read()

        cleaned = ''.join(re.findall(u'[a-zA-Z0-9\u4e00-\u9fa5]+', raw_text))
        # cut_all=True is jieba's "full mode" segmentation.
        tokens = jieba.cut(cleaned, cut_all=True)
        line_res = ''.join(
            tok + ' ' for tok in tokens if tok not in self.stop_list
        )

        out_path = 'res/' + file
        _ensure_parent_dir(out_path)
        with open(out_path, 'w', encoding='utf-8') as fout:
            fout.write(line_res)

    def cipin(self, fil_list):
        """Compute TF-IDF for the segmented files and write it to ``fes/``.

        Each name in *fil_list* is read from ``res/<name>`` (as produced by
        ``fenci``) and treated as one document of the corpus. For every
        document a file ``fes/<name>`` is written with one line per
        vocabulary word: ``<word> <tf-idf weight>``.

        NOTE(review): ``CountVectorizer()`` is used with its defaults, whose
        documented token pattern only keeps tokens of two or more word
        characters, so single-character words would not appear in the
        output -- confirm this is intended for Chinese text.

        Args:
            fil_list: file names (relative to ``res/``) forming the corpus.
        """
        corpus = []
        for fil in fil_list:
            with open('res/' + fil, 'r', encoding='utf-8') as fin:
                corpus.append(fin.read())

        vectorizer = CountVectorizer()
        transformer = TfidfTransformer()
        tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))

        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement and fall back only on releases that predate it.
        if hasattr(vectorizer, 'get_feature_names_out'):
            words = vectorizer.get_feature_names_out()
        else:
            words = vectorizer.get_feature_names()

        # One dense row of weights per document, in corpus order.
        weights = tfidf.toarray()
        for fil, row in zip(fil_list, weights):
            out_path = 'fes/' + fil
            _ensure_parent_dir(out_path)
            with open(out_path, 'w', encoding='utf-8') as fout:
                fout.writelines(
                    word + ' ' + str(weight) + '\n'
                    for word, weight in zip(words, row)
                )


if __name__ == '__main__':
    first = Tfi()
    fil_list = ['inputtext']
    first.fenci('inputtext')
    first.cipin(fil_list)
原标题:Python3.5 数据处理
关键词:python