Serving BERT sentence vectors on GPU: fixing "Floating point exception" and "SystemError: error return without exception set"
I recently needed to put BERT into production and stumbled through a lot of pitfalls, some of which I couldn't even explain; fixing one problem often just dropped me into the next one. https://github.com/hanxiao/bert-as-service packages BERT as a standalone service, which is actually quite good, but the service-style deployment just isn't for me.
Then there is https://github.com/terrifyzhao/bert-utils, which exposes sentence vectors and similarity computation as plain callables. But, whether because of its yield usage, its queue, or something in the GPU/CUDA/cuDNN stack, it sometimes dies on a Linux GPU with "Floating point exception", and debugging on both Windows 10 and Linux raises "SystemError: error return without exception set". My guess is that these crashes come from touching a TensorFlow graph or session from a thread other than the one that created it, which is what the global-graph pattern below works around; either way, I didn't dare ship it.
After much deliberation I quietly fell back on the Keras version I've never been fond of. Since Google itself is pushing TensorFlow this way, I might as well follow the trend. The Keras implementation of BERT and GPT-2 at https://github.com/CyberZHG/keras-bert is actually quite good (it installs from PyPI with pip install keras-bert).
No more rambling, straight to the code:
Since this approach just calls Google's pretrained model with no fine-tuning, even a plain CPU can handle it without using much memory; it's just slower.
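If you want to keep TensorFlow off the GPU entirely, a common trick (my addition, not part of the code below) is to hide the CUDA devices before anything TensorFlow-related is imported:

# Hypothetical CPU-only variant: hide every CUDA device from TensorFlow.
# These lines must run before tensorflow/keras get imported.
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# ...then import and use KerasBertVector exactly as in the code below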
2.1 First, the model. You need Google's pretrained Chinese model; download it from the official repository, or from my Baidu pan link: https://pan.baidu.com/s/1I3vydhmFEQ9nuPG2fDou8Q extraction code: rket
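Once unzipped, the checkpoint directory contains the standard files of Google's BERT releases, which the configuration file in 2.4 expects under Data/:

chinese_L-12_H-768_A-12/
    bert_config.json
    bert_model.ckpt.data-00000-of-00001
    bert_model.ckpt.index
    bert_model.ckpt.meta
    vocab.txt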
2.2 Then the main code, extract_keras_bert_feature.py
# -*- coding: UTF-8 -*-
#!/usr/bin/python
# @time     :2019/5/8 20:04
# @author   :Mo
# @function :extract features from BERT with keras-bert

import codecs
import os

import keras.backend.tensorflow_backend as ktf_keras
import numpy as np
import tensorflow as tf
from keras.layers import Add
from keras.models import Model
from keras_bert import load_trained_model_from_checkpoint, Tokenizer

from FeatureProject.bert.layers_keras import NonMaskingLayer
from conf.feature_config import gpu_memory_fraction, config_name, ckpt_name, vocab_file, max_seq_len, layer_indexes

# Module-level globals, so the model can be called from Django, Flask, Tornado, etc.
graph = None
model = None


# GPU selection and memory-fraction setup
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = gpu_memory_fraction
sess = tf.Session(config=config)
ktf_keras.set_session(sess)


class KerasBertVector():
    def __init__(self):
        self.config_path, self.checkpoint_path, self.dict_path, self.max_seq_len = config_name, ckpt_name, vocab_file, max_seq_len
        # Keep graph and model global so web frameworks can call them from worker threads
        global graph
        graph = tf.get_default_graph()
        global model
        model = load_trained_model_from_checkpoint(self.config_path, self.checkpoint_path,
                                                   seq_len=self.max_seq_len)
        print(model.output)
        print(len(model.layers))
        # The loaded model has 104 Keras layers in total. The first 8 are the
        # token/segment/position embeddings; each of the 12 transformer blocks
        # then contributes 8 more (MultiHeadAttention, Dropout, Add,
        # LayerNormalization for attention, and the same four again for the
        # feed-forward sublayer). So layer_dict[i] is the index of the
        # LayerNormalization that closes block i, with layer_dict[0] = 7
        # being the embedding output.
        layer_dict = [7]
        layer_0 = 7
        for i in range(12):
            layer_0 = layer_0 + 8
            layer_dict.append(layer_0)
        # No index given: return the model's own output
        if len(layer_indexes) == 0:
            encoder_layer = model.output
        # A single index: take that block's output; if the index is invalid,
        # fall back to the second-to-last block
        elif len(layer_indexes) == 1:
            if layer_indexes[0] in [i + 1 for i in range(12)]:
                encoder_layer = model.get_layer(index=layer_dict[layer_indexes[0]]).output
            else:
                encoder_layer = model.get_layer(index=layer_dict[-2]).output
        # Otherwise collect every requested block's output and add them up;
        # the shape stays [batch_size, seq_len, 768]
        else:
            # layer_indexes must contain values in [1, 2, 3, ..., 12]
            all_layers = [model.get_layer(index=layer_dict[lay]).output if lay in [i + 1 for i in range(12)]
                          else model.get_layer(index=layer_dict[-1]).output  # invalid index: default to the last block
                          for lay in layer_indexes]
            print(layer_indexes)
            print(all_layers)
            all_layers_select = []
            for all_layers_one in all_layers:
                all_layers_select.append(all_layers_one)
            encoder_layer = Add()(all_layers_select)
            print(encoder_layer.shape)
        print("KerasBertEmbedding:")
        print(encoder_layer.shape)
        output_layer = NonMaskingLayer()(encoder_layer)
        model = Model(model.inputs, output_layer)
        # model.summary(120)
        # read the vocabulary and build the tokenizer
        self.token_dict = {}
        with codecs.open(self.dict_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                self.token_dict[token] = len(self.token_dict)

        self.tokenizer = Tokenizer(self.token_dict)

    def bert_encode(self, texts):
        # Preprocessing: token ids, masks and segment ids for every text
        input_ids = []
        input_masks = []
        input_type_ids = []
        for text in texts:
            print(text)
            tokens_text = self.tokenizer.tokenize(text)
            print('Tokens:', tokens_text)
            input_id, input_type_id = self.tokenizer.encode(first=text, max_len=self.max_seq_len)
            input_mask = [0 if ids == 0 else 1 for ids in input_id]
            input_ids.append(input_id)
            input_type_ids.append(input_type_id)
            input_masks.append(input_mask)

        input_ids = np.array(input_ids)
        input_masks = np.array(input_masks)
        input_type_ids = np.array(input_type_ids)

        # Enter the global graph, so Django/Flask/Tornado worker threads
        # can call predict without hitting a different default graph
        with graph.as_default():
            predicts = model.predict([input_ids, input_type_ids], batch_size=1)
        print(predicts.shape)
        for i, token in enumerate(tokens_text):  # tokens of the last input text
            print(token, [len(predicts[0][i].tolist())], predicts[0][i].tolist())

        # Masked mean pooling, same approach as
        # https://github.com/terrifyzhao/bert-utils/blob/master/graph.py
        mul_mask = lambda x, m: x * np.expand_dims(m, axis=-1)
        masked_reduce_mean = lambda x, m: np.sum(mul_mask(x, m), axis=1) / (np.sum(m, axis=1, keepdims=True) + 1e-9)

        pools = []
        for i in range(len(predicts)):
            pred = predicts[i]
            masks = input_masks.tolist()
            mask_np = np.array([masks[i]])
            pooled = masked_reduce_mean(pred, mask_np)
            pooled = pooled.tolist()
            pools.append(pooled[0])
        print('bert:', pools)
        return pools


if __name__ == "__main__":
    bert_vector = KerasBertVector()
    pooled = bert_vector.bert_encode(['你是谁呀', '小老弟'])
    print(pooled)
    while True:
        print("input:")
        ques = input()
        print(bert_vector.bert_encode([ques]))
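Since the whole point of the global graph and model is serving from a web framework, here is a minimal Flask wrapper as a sketch; Flask itself, the route, the port, and the JSON field names are my assumptions, not part of the original project:

# -*- coding: UTF-8 -*-
# Hypothetical serving sketch around KerasBertVector; adjust the import
# path to wherever extract_keras_bert_feature.py lives in your project.
from flask import Flask, jsonify, request

from extract_keras_bert_feature import KerasBertVector

app = Flask(__name__)
bert_vector = KerasBertVector()  # load the model once, at startup

@app.route('/encode', methods=['POST'])
def encode():
    texts = request.get_json(force=True).get('texts', [])
    # bert_encode enters the global graph itself, so calls from
    # Flask worker threads are safe
    return jsonify({'vectors': bert_vector.bert_encode(texts)})

if __name__ == '__main__':
    # threaded=True exercises exactly the multi-thread path that
    # used to trigger the crashes described above
    app.run(host='0.0.0.0', port=8080, threaded=True)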
2.3 Next, layers_keras.py
# -*- coding: UTF-8 -*-
#!/usr/bin/python
# @time     :2019/5/10 10:49
# @author   :Mo
# @function :helper layer so keras-bert block outputs (e.g. layer [-2]) can feed ordinary layers

from keras.engine import Layer


class NonMaskingLayer(Layer):
    """
    Fix for layers such as Conv1D that can't receive masked input,
    detail: https://github.com/keras-team/keras/issues/4978
    thanks to https://github.com/jacoxu
    """

    def __init__(self, **kwargs):
        self.supports_masking = True
        super(NonMaskingLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        pass

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask on to the next layers
        return None

    def call(self, x, mask=None):
        return x

    def compute_output_shape(self, input_shape):
        return input_shape
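To see what the layer actually does, here is a tiny sketch (my example, with made-up dimensions): an Embedding with mask_zero=True emits a mask that Conv1D would reject, and NonMaskingLayer swallows it in between:

# Minimal demonstration of NonMaskingLayer; dimensions are arbitrary.
from keras.layers import Conv1D, Embedding, Input
from keras.models import Model

from FeatureProject.bert.layers_keras import NonMaskingLayer

inp = Input(shape=(26,))
emb = Embedding(input_dim=1000, output_dim=16, mask_zero=True)(inp)  # emits a mask
unmasked = NonMaskingLayer()(emb)  # compute_mask returns None, so the mask is dropped
conv = Conv1D(filters=8, kernel_size=3)(unmasked)  # would raise on a masked input
model = Model(inp, conv)
model.summary()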
2.4 Finally, the configuration file, conf/feature_config.py
# -*- coding: UTF-8 -*-
#!/usr/bin/python
# @time     :2019/5/10 9:13
# @author   :Mo
# @function :paths and settings of FeatureProject

import os

# path of the BERT model
file_path = os.path.dirname(__file__)
file_path = file_path.replace('conf', '') + 'Data'
model_dir = os.path.join(file_path, 'chinese_L-12_H-768_A-12/')
config_name = os.path.join(model_dir, 'bert_config.json')
ckpt_name = os.path.join(model_dir, 'bert_model.ckpt')
vocab_file = os.path.join(model_dir, 'vocab.txt')
# fraction of GPU memory this process may use
gpu_memory_fraction = 0.2
# by default take the second-to-last layer's output as the sentence vector
layer_indexes = [-2]
# maximum sequence length; for short single texts, consider lowering it
max_seq_len = 26
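For reference, the way extract_keras_bert_feature.py reads this setting: an empty list returns the raw model output, a single value in 1..12 picks that transformer block, any other single value (such as the default [-2]) falls back to the second-to-last block, and a list of several values sums those blocks. For example, to sum the last four blocks instead (my illustration):

# sum the outputs of transformer blocks 9-12 instead of taking block 11 only
layer_indexes = [9, 10, 11, 12]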
Hope this helps!