Training a Text Classifier

Published on 2019-12-07 10:07


Generate word vectors with Word2Vec, TfidfVectorizer, and HashingVectorizer in turn,
then train and compare four classifiers: stochastic gradient descent (SGDClassifier),
the perceptron (Perceptron), multinomial naive Bayes (MultinomialNB),
and the Passive-Aggressive algorithm (PassiveAggressiveClassifier).

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rcParams
import pandas as pd
import jieba
# matplotlib does not display Chinese by default, so configure it here.
# Use the SimHei font so Chinese characters render correctly.
rcParams["font.family"] = "SimHei"
# Keep the minus sign (-) rendering correctly under a Chinese font.
rcParams["axes.unicode_minus"] = False
df = pd.read_excel("./data/blogs.xlsx", names=['columnid','content'])
stop_words = ['\n','','。',',','|','【','】',':',' ','...','/','.','_','+','=','[',']','-']
# Tokenize each document with jieba and drop stop-word tokens
df.content = df.content.map(
    lambda aa: ' '.join([x for x in " ".join(jieba.cut(str(aa))).split(' ') if x not in stop_words]))
# Write the tokenized text back to disk
df.to_excel('./data/blogs_participled.xlsx')
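For a feel of what this step produces, here is a tiny sketch with a made-up sentence (the output shown is approximate; jieba's segmentation depends on its dictionary):

# Hypothetical example of the tokenize-and-filter step above.
sample = '机器学习是人工智能的一个分支。'
tokens = [x for x in ' '.join(jieba.cut(sample)).split(' ') if x not in stop_words]
print(tokens)  # roughly: ['机器', '学习', '是', '人工智能', '的', '一个', '分支']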
import pickle
df = pd.read_excel("./data/blogs_participled.xlsx")
tt = ' '.join([str(x) for x in df.content.dropna()])
cco = set(tt.split(' '))  # deduplicate via a set
# Save the vocabulary
with open('./data/blogswordslist.pickle', mode='wb') as f1:
    pickle.dump(cco, f1)
# Load the vocabulary
with open('./data/blogswordslist.pickle', mode='rb') as f2:
    a_list = pickle.load(f2)
vocabulary = list(a_list)
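One caveat: iterating a Python set is not guaranteed to give the same order across interpreter runs (string hashing is randomized), and this list later fixes the column order of the CountVectorizer/TfidfVectorizer features. Sorting is a cheap way to make that mapping reproducible:

# Optional: sort so the word-to-column mapping is stable across runs.
vocabulary = sorted(a_list)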
from gensim.models import Word2Vec
# Build the full word-vector space
df = pd.read_excel('./data/blogs_participled.xlsx')
sss = []
for x in df.content.values:
    sss.append(str(x).split(' '))
# 100-dimensional vectors, a window of 5 words on each side, and sg=1 (skip-gram)
model = Word2Vec(size=100, window=5, workers=5, sg=1)  # gensim 3.x API (size -> vector_size in gensim 4)
model.build_vocab(sss)
model.train(sss, total_examples=model.corpus_count, epochs=model.iter)  # model.iter is gensim 3.x; use model.epochs in 4.x
model.save('./model/gensim_w2v_blog.model')  # save the model
sss.clear()
from gensim.models import Word2Vec
new_model = Word2Vec.load('./model/gensim_w2v_blog.model')  # load the model
sim_words = new_model.wv.most_similar(positive=['人'])  # query words similar to '人'
df = pd.read_excel('./data/blogs_participled.xlsx')
res00 = pd.DataFrame(columns=('columnid', 'content'))
i = 0
for k, v in df.iterrows():
    # Convert each document to a single vector by summing its word vectors
    try:
        res00.loc[i] = [v['columnid'],
                        np.sum([new_model.wv[y] for y in v['content'].split(' ')], axis=0, dtype=float)]
        i += 1
    except (KeyError, TypeError):
        pass
print(i)
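Summing word vectors means longer documents get larger-magnitude vectors, which the later scale() call only partially compensates for. A common alternative (our suggestion, not the original code) is to average instead, which is length-invariant:

# Hypothetical variant: average the word vectors instead of summing,
# skipping out-of-vocabulary tokens rather than discarding the whole row.
vecs = [new_model.wv[y] for y in v['content'].split(' ') if y in new_model.wv]
doc_vec = np.mean(vecs, axis=0) if vecs else np.zeros(100)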
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import cross_val_score,cross_validate
from sklearn.preprocessing import scale
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings(action='ignore')  # suppress warnings
# Features: the Word2Vec document vectors built above
y = res00.columnid
x = res00.content
# Standardize x
x_std = scale(np.array(list(x)))
y = np.array(y, dtype=float)
X_train, X_test, y_train, y_test = train_test_split(x_std, y, test_size=0.3, random_state=0)

clfs = [SGDClassifier(max_iter=5, tol=1e-3),
        Perceptron(tol=1e-3),
        PassiveAggressiveClassifier(tol=1e-3)]

for clf in clfs:
    # clf = clf.partial_fit(X_train, y_train, classes=[1,2,3,4,5,6,7,8,9,10,11])  # incremental learning
    clf = clf.fit(X_train, y_train)
    scores = cross_val_score(clf, X_test, y_test, cv=5, scoring='accuracy')
    print("Accuracy: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
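A side note on the evaluation scheme (it applies to every loop below as well): each classifier is fitted on the training split, but cross_val_score then refits it on folds of the test split, so the printed accuracy never uses the training fit. A more conventional alternative (our suggestion, not the original author's) is to cross-validate on the training data and keep the test split for a single held-out score:

# Alternative evaluation sketch: 5-fold CV on the training split,
# plus one accuracy score on the untouched test split.
for clf in clfs:
    cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    clf.fit(X_train, y_train)
    print("CV accuracy: %0.3f (+/- %0.2f), held-out: %0.3f"
          % (cv_scores.mean(), cv_scores.std() * 2, clf.score(X_test, y_test)))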
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
df = pd.read_excel('./data/blogs_participled.xlsx')
cv = CountVectorizer(stop_words=stop_words, vocabulary=vocabulary)
tfidf = TfidfTransformer()
aae = tfidf.fit_transform(cv.fit_transform(df.content))
y_np = np.array(df['columnid'], dtype=float)
X_train, X_test, y_train, y_test = train_test_split(aae, y_np, test_size=0.3, random_state=0)

clfs = {
    'Stochastic gradient descent (SGD)': SGDClassifier(max_iter=5, tol=1e-3),
    'Perceptron': Perceptron(tol=1e-3),
    'Multinomial naive Bayes (MultinomialNB)': MultinomialNB(alpha=0.01),
    'Passive-Aggressive': PassiveAggressiveClassifier(tol=1e-3),
}

for cls_name, clf in clfs.items():
    # clf = clf.partial_fit(X_train, y_train, classes=[1,2,3,4,5,6,7,8,9,10,11])  # incremental learning
    clf = clf.fit(X_train, y_train)
    scores = cross_val_score(clf, X_test, y_test, cv=5, scoring='accuracy')
    print("%s Accuracy: %0.3f (+/- %0.2f)" % (cls_name, scores.mean(), scores.std() * 2))
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectors (TfidfVectorizer combines CountVectorizer and TfidfTransformer)
stop_words = ['\n','','。',',','|','【','】',':',' ','...','/','.','_','+','=','[',']','-']

vectorizer = TfidfVectorizer(stop_words=stop_words, vocabulary=vocabulary)
aav = vectorizer.fit_transform(df.content.fillna(axis=0, method='ffill'))  # forward-fill missing values
y_np = np.array(df['columnid'], dtype=float)
X_train, X_test, y_train, y_test = train_test_split(aav, y_np, test_size=0.3, random_state=0)

clfs = {
    'Stochastic gradient descent (SGD)': SGDClassifier(max_iter=5, tol=1e-3),
    'Perceptron': Perceptron(tol=1e-3),
    'Multinomial naive Bayes (MultinomialNB)': MultinomialNB(alpha=0.01),
    'Passive-Aggressive': PassiveAggressiveClassifier(tol=1e-3),
}

for cls_name, clf in clfs.items():
    # clf = clf.partial_fit(X_train, y_train, classes=[1,2,3,4,5,6,7,8,9,10,11])  # incremental learning
    clf = clf.fit(X_train, y_train)
    scores = cross_val_score(clf, X_test, y_test, cv=5, scoring='accuracy')
    print("%s Accuracy: %0.3f (+/- %0.2f)" % (cls_name, scores.mean(), scores.std() * 2))
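The commented-out partial_fit line in each training loop points at incremental (out-of-core) learning, which SGDClassifier, Perceptron, PassiveAggressiveClassifier, and MultinomialNB all support. A minimal sketch of how that might look (the batch size is illustrative; the full label set must be supplied on the first call):

# Hypothetical incremental-learning sketch: feed the training data
# in mini-batches; pass the class labels on the first call only.
clf = SGDClassifier(tol=1e-3)
classes = np.unique(y_train)
batch = 1000  # illustrative batch size
for i, start in enumerate(range(0, X_train.shape[0], batch)):
    clf.partial_fit(X_train[start:start + batch],
                    y_train[start:start + batch],
                    classes=classes if i == 0 else None)
print(clf.score(X_test, y_test))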
from sklearn.feature_extraction.text import HashingVectorizer
# Convert the document collection to a matrix of hashed token occurrences
stop_words = ['\n','','。',',','|','【','】',':',' ','(',')','(',')',',']
vectorizer = HashingVectorizer(n_features=2**10, stop_words=stop_words)
aav = vectorizer.fit_transform(df.content.fillna(axis=0, method='ffill'))  # forward-fill missing values
y_np = np.array(df['columnid'], dtype=float)
X_train, X_test, y_train, y_test = train_test_split(aav, y_np, test_size=0.3, random_state=0)

clfs = {
    'Stochastic gradient descent (SGD)': SGDClassifier(max_iter=5, tol=1e-3),
    'Perceptron': Perceptron(tol=1e-3),
    'Passive-Aggressive': PassiveAggressiveClassifier(tol=1e-3),
}

for cls_name, clf in clfs.items():
    # clf = clf.partial_fit(X_train, y_train, classes=[1,2,3,4,5,6,7,8,9,10,11])  # incremental learning
    clf = clf.fit(X_train, y_train)
    scores = cross_val_score(clf, X_test, y_test, cv=5, scoring='accuracy')
    print("%s Accuracy: %0.3f (+/- %0.2f)" % (cls_name, scores.mean(), scores.std() * 2))
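MultinomialNB is missing from this last comparison for a concrete reason: HashingVectorizer's default alternate_sign=True produces some negative feature values (to compensate for hash collisions), and MultinomialNB requires non-negative input. If naive Bayes is wanted on hashed features, one option (our addition, not in the original) is to disable the alternating sign:

# Non-negative hashed features so MultinomialNB can also be trained.
vectorizer_nn = HashingVectorizer(n_features=2**10, stop_words=stop_words,
                                  alternate_sign=False)
aav_nn = vectorizer_nn.fit_transform(df.content.fillna(axis=0, method='ffill'))
Xn_train, Xn_test, yn_train, yn_test = train_test_split(
    aav_nn, y_np, test_size=0.3, random_state=0)
nb = MultinomialNB(alpha=0.01).fit(Xn_train, yn_train)
print(nb.score(Xn_test, yn_test))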
# Save the model (clf is the last classifier fitted above)
with open('./model/clfpa.model', mode='wb') as f1:
    pickle.dump(clf, f1)
# Load the model
with open('./model/clfpa.model', mode='rb') as f2:
    clfpa = pickle.load(f2)

# Paste a new document here to try the classifier
ttii = '''*************************replace with a test document*************************************** '''
aa = [' '.join([x for x in " ".join(jieba.cut(str(ttii))).split(' ') if x not in stop_words])]
# HashingVectorizer is stateless, so transform() works directly on new text
ace = vectorizer.transform(aa)
print(clfpa.predict(ace))



Author: j878

Link: https://www.pythonheidong.com/blog/article/169921/9516f53977366d3423ae/

Source: python黑洞网

Please credit the source when reprinting in any form; infringement will be pursued legally.
