Serving BERT sentence vectors on GPU: fixing "Floating point exception" and "SystemError: error return without exception set"
I recently needed to put BERT into production and stumbled through a lot of pitfalls, some of which I couldn't even explain; fixing one problem often just dropped me into the next one. https://github.com/hanxiao/bert-as-service packages BERT as a standalone service, which is actually quite good, but the service-style deployment just isn't for me.
Then there is https://github.com/terrifyzhao/bert-utils, which exposes sentence vectors and similarity computation as plain callables. But, whether because of its yield usage, its queue, or something in the GPU/CUDA/cuDNN stack, it sometimes dies on a Linux GPU with "Floating point exception", and debugging on both Windows 10 and Linux raises "SystemError: error return without exception set". My guess is that these crashes come from touching a TensorFlow graph or session from a thread other than the one that created it, which is what the global-graph pattern below works around; either way, I didn't dare ship it.
After much deliberation I quietly fell back on the Keras version I've never been fond of. Since Google itself is pushing TensorFlow this way, I might as well follow the trend. The Keras implementation of BERT and GPT-2 at https://github.com/CyberZHG/keras-bert is actually quite good (it installs from PyPI with pip install keras-bert).
No more rambling, straight to the code:
Since this approach just calls Google's pretrained model with no fine-tuning, even a plain CPU can handle it without using much memory; it's just slower.
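If you want to keep TensorFlow off the GPU entirely, a common trick (my addition, not part of the code below) is to hide the CUDA devices before anything TensorFlow-related is imported:

# Hypothetical CPU-only variant: hide every CUDA device from TensorFlow.
# These lines must run before tensorflow/keras get imported.
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# ...then import and use KerasBertVector exactly as in the code below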
2.1 First, the model. You need Google's pretrained Chinese model; download it from the official repository, or from my Baidu pan link: https://pan.baidu.com/s/1I3vydhmFEQ9nuPG2fDou8Q extraction code: rket
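Once unzipped, the checkpoint directory contains the standard files of Google's BERT releases, which the configuration file in 2.4 expects under Data/:

chinese_L-12_H-768_A-12/
    bert_config.json
    bert_model.ckpt.data-00000-of-00001
    bert_model.ckpt.index
    bert_model.ckpt.meta
    vocab.txt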
2.2 Then the main code, extract_keras_bert_feature.py
# -*- coding: UTF-8 -*-
#!/usr/bin/python
# @time     :2019/5/8 20:04
# @author   :Mo
# @function :extract features from BERT with keras-bert

import codecs
import os

import keras.backend.tensorflow_backend as ktf_keras
import numpy as np
import tensorflow as tf
from keras.layers import Add
from keras.models import Model
from keras_bert import load_trained_model_from_checkpoint, Tokenizer

from FeatureProject.bert.layers_keras import NonMaskingLayer
from conf.feature_config import gpu_memory_fraction, config_name, ckpt_name, vocab_file, max_seq_len, layer_indexes

# Module-level globals, so the model can be called from Django, Flask, Tornado, etc.
graph = None
model = None


# GPU selection and memory-fraction setup
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = gpu_memory_fraction
sess = tf.Session(config=config)
ktf_keras.set_session(sess)


class KerasBertVector():
    def __init__(self):
        self.config_path, self.checkpoint_path, self.dict_path, self.max_seq_len = config_name, ckpt_name, vocab_file, max_seq_len
        # Keep graph and model global so web frameworks can call them from worker threads
        global graph
        graph = tf.get_default_graph()
        global model
        model = load_trained_model_from_checkpoint(self.config_path, self.checkpoint_path,
                                                   seq_len=self.max_seq_len)
        print(model.output)
        print(len(model.layers))
        # The loaded model has 104 Keras layers in total. The first 8 are the
        # token/segment/position embeddings; each of the 12 transformer blocks
        # then contributes 8 more (MultiHeadAttention, Dropout, Add,
        # LayerNormalization for attention, and the same four again for the
        # feed-forward sublayer). So layer_dict[i] is the index of the
        # LayerNormalization that closes block i, with layer_dict[0] = 7
        # being the embedding output.
        layer_dict = [7]
        layer_0 = 7
        for i in range(12):
            layer_0 = layer_0 + 8
            layer_dict.append(layer_0)
        # No index given: return the model's own output
        if len(layer_indexes) == 0:
            encoder_layer = model.output
        # A single index: take that block's output; if the index is invalid,
        # fall back to the second-to-last block
        elif len(layer_indexes) == 1:
            if layer_indexes[0] in [i + 1 for i in range(12)]:
                encoder_layer = model.get_layer(index=layer_dict[layer_indexes[0]]).output
            else:
                encoder_layer = model.get_layer(index=layer_dict[-2]).output
        # Otherwise collect every requested block's output and add them up;
        # the shape stays [batch_size, seq_len, 768]
        else:
            # layer_indexes must contain values in [1, 2, 3, ..., 12]
            all_layers = [model.get_layer(index=layer_dict[lay]).output if lay in [i + 1 for i in range(12)]
                          else model.get_layer(index=layer_dict[-1]).output  # invalid index: default to the last block
                          for lay in layer_indexes]
            print(layer_indexes)
            print(all_layers)
            all_layers_select = []
            for all_layers_one in all_layers:
                all_layers_select.append(all_layers_one)
            encoder_layer = Add()(all_layers_select)
            print(encoder_layer.shape)
        print("KerasBertEmbedding:")
        print(encoder_layer.shape)
        output_layer = NonMaskingLayer()(encoder_layer)
        model = Model(model.inputs, output_layer)
        # model.summary(120)
        # read the vocabulary and build the tokenizer
        self.token_dict = {}
        with codecs.open(self.dict_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                self.token_dict[token] = len(self.token_dict)

        self.tokenizer = Tokenizer(self.token_dict)

    def bert_encode(self, texts):
        # Preprocessing: token ids, masks and segment ids for every text
        input_ids = []
        input_masks = []
        input_type_ids = []
        for text in texts:
            print(text)
            tokens_text = self.tokenizer.tokenize(text)
            print('Tokens:', tokens_text)
            input_id, input_type_id = self.tokenizer.encode(first=text, max_len=self.max_seq_len)
            input_mask = [0 if ids == 0 else 1 for ids in input_id]
            input_ids.append(input_id)
            input_type_ids.append(input_type_id)
            input_masks.append(input_mask)

        input_ids = np.array(input_ids)
        input_masks = np.array(input_masks)
        input_type_ids = np.array(input_type_ids)

        # Enter the global graph, so Django/Flask/Tornado worker threads
        # can call predict without hitting a different default graph
        with graph.as_default():
            predicts = model.predict([input_ids, input_type_ids], batch_size=1)
        print(predicts.shape)
        for i, token in enumerate(tokens_text):  # tokens of the last input text
            print(token, [len(predicts[0][i].tolist())], predicts[0][i].tolist())

        # Masked mean pooling, same approach as
        # https://github.com/terrifyzhao/bert-utils/blob/master/graph.py
        mul_mask = lambda x, m: x * np.expand_dims(m, axis=-1)
        masked_reduce_mean = lambda x, m: np.sum(mul_mask(x, m), axis=1) / (np.sum(m, axis=1, keepdims=True) + 1e-9)

        pools = []
        for i in range(len(predicts)):
            pred = predicts[i]
            masks = input_masks.tolist()
            mask_np = np.array([masks[i]])
            pooled = masked_reduce_mean(pred, mask_np)
            pooled = pooled.tolist()
            pools.append(pooled[0])
        print('bert:', pools)
        return pools


if __name__ == "__main__":
    bert_vector = KerasBertVector()
    pooled = bert_vector.bert_encode(['你是谁呀', '小老弟'])
    print(pooled)
    while True:
        print("input:")
        ques = input()
        print(bert_vector.bert_encode([ques]))
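Since the whole point of the global graph and model is serving from a web framework, here is a minimal Flask wrapper as a sketch; Flask itself, the route, the port, and the JSON field names are my assumptions, not part of the original project:

# -*- coding: UTF-8 -*-
# Hypothetical serving sketch around KerasBertVector; adjust the import
# path to wherever extract_keras_bert_feature.py lives in your project.
from flask import Flask, jsonify, request

from extract_keras_bert_feature import KerasBertVector

app = Flask(__name__)
bert_vector = KerasBertVector()  # load the model once, at startup

@app.route('/encode', methods=['POST'])
def encode():
    texts = request.get_json(force=True).get('texts', [])
    # bert_encode enters the global graph itself, so calls from
    # Flask worker threads are safe
    return jsonify({'vectors': bert_vector.bert_encode(texts)})

if __name__ == '__main__':
    # threaded=True exercises exactly the multi-thread path that
    # used to trigger the crashes described above
    app.run(host='0.0.0.0', port=8080, threaded=True)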
2.3 Next, layers_keras.py
# -*- coding: UTF-8 -*-
#!/usr/bin/python
# @time     :2019/5/10 10:49
# @author   :Mo
# @function :helper layer so keras-bert block outputs (e.g. layer [-2]) can feed ordinary layers

from keras.engine import Layer


class NonMaskingLayer(Layer):
    """
    Fix for layers such as Conv1D that can't receive masked input,
    detail: https://github.com/keras-team/keras/issues/4978
    thanks to https://github.com/jacoxu
    """

    def __init__(self, **kwargs):
        self.supports_masking = True
        super(NonMaskingLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        pass

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask on to the next layers
        return None

    def call(self, x, mask=None):
        return x

    def compute_output_shape(self, input_shape):
        return input_shape
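To see what the layer actually does, here is a tiny sketch (my example, with made-up dimensions): an Embedding with mask_zero=True emits a mask that Conv1D would reject, and NonMaskingLayer swallows it in between:

# Minimal demonstration of NonMaskingLayer; dimensions are arbitrary.
from keras.layers import Conv1D, Embedding, Input
from keras.models import Model

from FeatureProject.bert.layers_keras import NonMaskingLayer

inp = Input(shape=(26,))
emb = Embedding(input_dim=1000, output_dim=16, mask_zero=True)(inp)  # emits a mask
unmasked = NonMaskingLayer()(emb)  # compute_mask returns None, so the mask is dropped
conv = Conv1D(filters=8, kernel_size=3)(unmasked)  # would raise on a masked input
model = Model(inp, conv)
model.summary()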
2.4 Finally, the configuration file, conf/feature_config.py
# -*- coding: UTF-8 -*-
#!/usr/bin/python
# @time     :2019/5/10 9:13
# @author   :Mo
# @function :paths and settings of FeatureProject

import os

# path of the BERT model
file_path = os.path.dirname(__file__)
file_path = file_path.replace('conf', '') + 'Data'
model_dir = os.path.join(file_path, 'chinese_L-12_H-768_A-12/')
config_name = os.path.join(model_dir, 'bert_config.json')
ckpt_name = os.path.join(model_dir, 'bert_model.ckpt')
vocab_file = os.path.join(model_dir, 'vocab.txt')
# fraction of GPU memory this process may use
gpu_memory_fraction = 0.2
# by default take the second-to-last layer's output as the sentence vector
layer_indexes = [-2]
# maximum sequence length; for short single texts, consider lowering it
max_seq_len = 26
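For reference, the way extract_keras_bert_feature.py reads this setting: an empty list returns the raw model output, a single value in 1..12 picks that transformer block, any other single value (such as the default [-2]) falls back to the second-to-last block, and a list of several values sums those blocks. For example, to sum the last four blocks instead (my illustration):

# sum the outputs of transformer blocks 9-12 instead of taking block 11 only
layer_indexes = [9, 10, 11, 12]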
Hope this helps!