发布于2019-12-07 10:07 阅读(1418) 评论(0) 点赞(21) 收藏(3)
分别用 Word2Vec、TfidfVectorizer、HashingVectorizer 生成词向量,
再用“随机梯度下降(SGD)”、“感知机(Perceptron)”、
“朴素贝叶斯分类器(MultinomialNB)”、“PA被动感知算法(Passive-Aggressive)”
进行训练。
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rcParams
import pandas as pd
import jieba
# matplotlib does not render Chinese text out of the box, so switch the font
# family to SimHei, and keep the minus sign (-) displaying correctly while a
# Chinese font is active.
rcParams.update({
    "font.family": "SimHei",
    "axes.unicode_minus": False,
})
# Load the raw blog posts, naming the two columns columnid and content.
df = pd.read_excel("./data/blogs.xlsx", names=['columnid', 'content'])
# Tokens discarded after segmentation (punctuation, whitespace, ...).
stop_words = ['\n','','。',',','|','【','】',':',' ','...','/','.','_','+','=','[',']','-']


def _segment(text):
    """Segment *text* with jieba and return the kept tokens joined by spaces."""
    spaced = " ".join(jieba.cut(str(text)))
    kept = [token for token in spaced.split(' ') if token not in stop_words]
    return ' '.join(kept)


# Word segmentation: every document becomes a space-separated token string.
df['content'] = df['content'].map(_segment)
# Write the segmented corpus to disk.
df.to_excel('./data/blogs_participled.xlsx')
import pickle
# Reload the segmented corpus. The file was written with DataFrame.to_excel,
# so it has to be read with read_excel; read_csv cannot parse an .xlsx
# workbook.
df = pd.read_excel("./data/blogs_participled.xlsx")
# Cells that are empty in the workbook are read back as NaN; drop them so the
# join below only ever sees strings.
tt = ' '.join(df.content.dropna().astype(str))
cco = set(tt.split(' '))  # a set removes duplicate tokens
# Save the vocabulary.
with open('./data/blogswordslist.pickle', mode='wb') as f1:
    pickle.dump(cco, f1)
# Load the vocabulary back.
with open('./data/blogswordslist.pickle', mode='rb') as f2:
    a_list = pickle.load(f2)
# Sort the tokens so the vocabulary order (and with it the feature-column
# order of any vectorizer built from it) is identical on every run; the
# iteration order of a set of strings is not.
vocabulary = sorted(a_list)
from gensim.models import Word2Vec
# Build the training corpus for the word-embedding model: one token list per
# document.
df = pd.read_excel('./data/blogs_participled.xlsx')
# Empty cells are read back as NaN (a float, which has no .split), so drop
# them before tokenising.
sss = [doc.split(' ') for doc in df.content.dropna().astype(str)]
# 100-dimensional vectors, a context window of 5 words on each side (10 words
# in total), trained with skip-gram (sg=1).
try:
    model = Word2Vec(vector_size=100, window=5, workers=5, sg=1)
except TypeError:
    # gensim < 4.0 calls the dimensionality parameter `size`.
    model = Word2Vec(size=100, window=5, workers=5, sg=1)
model.build_vocab(sss)
# `model.epochs` is the supported name for the deprecated `model.iter`.
model.train(sss, total_examples=model.corpus_count, epochs=model.epochs)
model.save('./model/gensim_w2v_blog.model')  # persist the trained model
sss.clear()  # the corpus is no longer needed
from gensim.models import Word2Vec
new_model = Word2Vec.load('./model/gensim_w2v_blog.model')  # reload the saved model
sim_words = new_model.wv.most_similar(positive=['人'])  # words most similar to '人'
df = pd.read_excel('./data/blogs_participled.xlsx')

# Turn every document into one vector: the sum of the vectors of its tokens.
# Vectors are looked up through `.wv`; indexing the model object directly is
# deprecated.
word_vectors = new_model.wv
doc_ids = []
doc_vectors = []
for _, row in df.iterrows():
    content = row['content']
    if pd.isna(content):
        continue  # empty cell in the workbook
    # Skip only the tokens the model has no vector for. Letting the lookup
    # raise KeyError would throw away the whole document because of a single
    # unknown word.
    known = [word_vectors[tok] for tok in str(content).split(' ') if tok in word_vectors]
    if not known:
        continue  # nothing to sum for this document
    doc_ids.append(row['columnid'])
    doc_vectors.append(np.sum(known, axis=0, dtype=float))

# Build the frame in one step instead of enlarging it row by row.
res00 = pd.DataFrame({'columnid': doc_ids, 'content': doc_vectors},
                     columns=['columnid', 'content'])
i = len(res00)
print(i)  # number of documents that received a vector
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import cross_val_score,cross_validate
from sklearn.preprocessing import scale
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings(action='ignore')  # silence library warnings
# Classify using the Word2Vec document vectors.
y = res00.columnid
x = res00.content
# Standardise the features.
# NOTE(review): scaling before the split lets test-set statistics leak into
# training; consider fitting the scaler on the training split only.
x_std = scale(np.array(list(x)))
y = np.array(y, dtype=float)
X_train, X_test, y_train, y_test = train_test_split(x_std, y, test_size=0.3, random_state=0)
clfs = [
    SGDClassifier(max_iter=5, tol=1e-3),
    Perceptron(tol=1e-3),
    PassiveAggressiveClassifier(tol=1e-3),
]
for clf in clfs:
    # cross_val_score clones the estimator and refits it on every fold, so it
    # is run on the training split; running it on the test split would ignore
    # the fitted model and train on test data.
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    print("Accuracy: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    # Fit on the whole training split and score once on the untouched test split.
    clf.fit(X_train, y_train)
    print("Held-out accuracy: %0.3f" % clf.score(X_test, y_test))
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
df = pd.read_excel('./data/blogs_participled.xlsx')
# Term counts restricted to the saved vocabulary, then re-weighted by TF-IDF.
cv = CountVectorizer(stop_words=stop_words, vocabulary=vocabulary)
tfidf = TfidfTransformer()
# Missing documents are filled with the previous one, as the other vectorizer
# sections of this script do; the vectorizer rejects NaN input.
aae = tfidf.fit_transform(cv.fit_transform(df.content.ffill()))
y_np = np.array(df['columnid'], dtype=float)
X_train, X_test, y_train, y_test = train_test_split(aae, y_np, test_size=0.3, random_state=0)
clfs = {
    '随机梯度下降(SGD)': SGDClassifier(max_iter=5, tol=1e-3),
    '感知机(Perceptron)': Perceptron(tol=1e-3),
    '朴素贝叶斯分类器(MultinomialNB)': MultinomialNB(alpha=0.01),
    'PA被动感知算法(Passive-Aggressive)': PassiveAggressiveClassifier(tol=1e-3),
}
for cls_name, clf in clfs.items():
    # cross_val_score clones the estimator and refits it on every fold, so it
    # is run on the training split; running it on the test split would ignore
    # the fitted model and train on test data.
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    print("%s Accuracy: %0.3f (+/- %0.2f)" % (cls_name, scores.mean(), scores.std() * 2))
    # Fit on the whole training split and score once on the untouched test split.
    clf.fit(X_train, y_train)
    print("%s Held-out accuracy: %0.3f" % (cls_name, clf.score(X_test, y_test)))
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectors restricted to the saved vocabulary.
stop_words = ['\n','','。',',','|','【','】',':',' ','...','/','.','_','+','=','[',']','-']
vectorizer = TfidfVectorizer(stop_words=stop_words, vocabulary=vocabulary)
# Missing documents are filled with the previous one. Series.ffill() is the
# supported spelling of the deprecated fillna(method='ffill').
aav = vectorizer.fit_transform(df.content.ffill())
y_np = np.array(df['columnid'], dtype=float)
X_train, X_test, y_train, y_test = train_test_split(aav, y_np, test_size=0.3, random_state=0)
clfs = {
    '随机梯度下降(SGD)': SGDClassifier(max_iter=5, tol=1e-3),
    '感知机(Perceptron)': Perceptron(tol=1e-3),
    '朴素贝叶斯分类器(MultinomialNB)': MultinomialNB(alpha=0.01),
    'PA被动感知算法(Passive-Aggressive)': PassiveAggressiveClassifier(tol=1e-3),
}
for cls_name, clf in clfs.items():
    # cross_val_score clones the estimator and refits it on every fold, so it
    # is run on the training split; running it on the test split would ignore
    # the fitted model and train on test data.
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    print("%s Accuracy: %0.3f (+/- %0.2f)" % (cls_name, scores.mean(), scores.std() * 2))
    # Fit on the whole training split and score once on the untouched test split.
    clf.fit(X_train, y_train)
    print("%s Held-out accuracy: %0.3f" % (cls_name, clf.score(X_test, y_test)))
from sklearn.feature_extraction.text import HashingVectorizer
# Convert the collection of text documents into a matrix of token
# occurrences using feature hashing (2**10 columns).
stop_words = ['\n','','。',',','|','【','】',':',' ','(',')','(',')',',']
vectorizer = HashingVectorizer(n_features=2**10, stop_words=stop_words)
# Missing documents are filled with the previous one. Series.ffill() is the
# supported spelling of the deprecated fillna(method='ffill').
aav = vectorizer.fit_transform(df.content.ffill())
y_np = np.array(df['columnid'], dtype=float)
X_train, X_test, y_train, y_test = train_test_split(aav, y_np, test_size=0.3, random_state=0)
clfs = {
    '随机梯度下降(SGD)': SGDClassifier(max_iter=5, tol=1e-3),
    '感知机(Perceptron)': Perceptron(tol=1e-3),
    'PA被动感知算法(Passive-Aggressive)': PassiveAggressiveClassifier(tol=1e-3),
}
for cls_name, clf in clfs.items():
    # cross_val_score clones the estimator and refits it on every fold, so it
    # is run on the training split; running it on the test split would ignore
    # the fitted model and train on test data.
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    print("%s Accuracy: %0.3f (+/- %0.2f)" % (cls_name, scores.mean(), scores.std() * 2))
    # Fit on the whole training split and score once on the untouched test split.
    clf.fit(X_train, y_train)
    print("%s Held-out accuracy: %0.3f" % (cls_name, clf.score(X_test, y_test)))
# Save the fitted Passive-Aggressive classifier, looked up by name in `clfs`
# (there is no variable called `clfp`).
with open('./model/clfpa.model', mode='wb') as f1:
    pickle.dump(clfs['PA被动感知算法(Passive-Aggressive)'], f1)
# Load the model back.
# NOTE: pickle.load can execute arbitrary code; only load model files that
# were produced locally.
with open('./model/clfpa.model', mode='rb') as f2:
    clfpa = pickle.load(f2)
# Placeholder document to classify; replace it with the text under test.
ttii = '''*************************更换测试文档*************************************** '''
# Segment the document the same way the training corpus was segmented.
aa = [' '.join([x for x in " ".join(jieba.cut(str(ttii))).split(' ') if x not in stop_words])]
# Vectorise with the vectorizer used for training; transform() is enough
# because nothing has to be fitted for a new document.
ace = vectorizer.transform(aa)
print(clfpa.predict(ace))
作者:j878
链接:https://www.pythonheidong.com/blog/article/169921/9516f53977366d3423ae/
来源:python黑洞网
任何形式的转载都请注明出处,如有侵权 一经发现 必将追究其法律责任
昵称:
评论内容:(最多支持255个字符)
---无人问津也好,技不如人也罢,你都要试着安静下来,去做自己该做的事,而不是让内心的烦躁、焦虑,坏掉你本来就不多的热情和定力
Copyright © 2018-2021 python黑洞网 All Rights Reserved 版权所有,并保留所有权利。 京ICP备18063182号-1
投诉与举报,广告合作请联系vgs_info@163.com或QQ3083709327
免责声明:网站文章均由用户上传,仅供读者学习交流使用,禁止用做商业用途。若文章涉及色情,反动,侵权等违法信息,请向我们举报,一经核实我们会立即删除!