transformers_encoder.py · 134 lines (112 loc) · 5.24 KB
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date : 2020-08-29 01:51:01
# @Author : Kaiyan Zhang (minekaiyan@gmail.com)
# @Link : https://github.com/iseesaw
# @Version : 1.0.0
import numpy as np
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel
class TransformersEncoder:
    """Sentence-embedding encoder built on top of Transformers models."""

    def __init__(
            self,
            model_name_or_path='/users6/kyzhang/embeddings/bert/bert-base-chinese',
            max_length=128,
            batch_size=128):
        """Initialize the encoder.

        Args:
            model_name_or_path (str, optional): Local path of a Transformers model,
                or a model name to download from the HuggingFace hub.
            max_length (int, optional): Maximum sequence length. Defaults to 128.
            batch_size (int, optional): Batch size used for encoding. Defaults to 128.
        """
        print('initializing encoder')
        print('loading pretrained model')
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        self.model = AutoModel.from_pretrained(model_name_or_path)
        # GPU if available, otherwise CPU
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        print('using', self.device)
        self.model.to(self.device)
        self.model.eval()
        self.max_length = max_length
        self.batch_size = batch_size
        print('encoder initialized')

    def _assign_device(self, Tokenizer_output):
        """Move the tokenizer output tensors onto the target device."""
        tokens_tensor = Tokenizer_output['input_ids'].to(self.device)
        token_type_ids = Tokenizer_output['token_type_ids'].to(self.device)
        attention_mask = Tokenizer_output['attention_mask'].to(self.device)
        output = {
            'input_ids': tokens_tensor,
            'token_type_ids': token_type_ids,
            'attention_mask': attention_mask
        }
        return output
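
    # Note (added): this helper assumes a BERT-style tokenizer that returns
    # token_type_ids; tokenizers without that field would need this adjusted.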

    def _mean_pooling(self, model_output, attention_mask):
        """Mean pooling over the token embeddings.

        Args:
            model_output: Output of the Transformers model.
            attention_mask (torch.Tensor): Attention mask, (batch_size, seq_length).

        Returns:
            torch.Tensor: Sentence embeddings, (batch_size, hidden_size).
        """
        # (batch_size, seq_length, hidden_size)
        token_embeddings = model_output[0].cpu()
        # (batch_size, seq_length) => (batch_size, seq_length, hidden_size)
        input_mask_expanded = attention_mask.cpu().unsqueeze(-1).expand(
            token_embeddings.size()).float()
        # Only sum the non-padding token embeddings
        # (batch_size, seq_length, hidden_size) => (batch_size, hidden_size)
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp the denominator to avoid division by zero
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask
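
    # Illustrative note (added, not from the original file): with an attention
    # mask row of [1, 1, 0], only the first two token vectors contribute to
    # sum_embeddings, and dividing by sum_mask (= 2) yields their average;
    # the clamp(min=1e-9) only guards against an all-padding row causing 0/0.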

    def encode(self, sentences, show_progress_bar=False):
        """Encode sentences into sentence embeddings.

        Args:
            sentences (List[str]): Input sentences, (batch_size).

        Returns:
            torch.Tensor: Sentence embeddings, (batch_size, hidden_size).
        """
        # Tokenize and encode the sentences batch by batch
        dataloader = DataLoader(sentences,
                                batch_size=self.batch_size,
                                shuffle=False)
        dataloader = tqdm(dataloader) if show_progress_bar else dataloader
        sentence_embeddings: torch.Tensor = None
        # Compute token embeddings
        with torch.no_grad():
            # The model returns (sequence_output, pooled_output, (hidden_states), (attentions)).
            # sequence_output, (batch_size, sequence_length, hidden_size):
            #   hidden states at the output of the last layer of the model.
            # pooled_output, (batch_size, hidden_size):
            #   last-layer hidden state of the first token ([CLS]), further processed
            #   by a Linear layer and a Tanh activation; the Linear weights are trained
            #   on the next-sentence-prediction (classification) objective during pre-training.
            #   It is not a good summary of the semantic content of the input;
            #   averaging or pooling the hidden states over the whole sequence works better.
            for batch_sentences in dataloader:
                encoded_input = self.tokenizer(batch_sentences,
                                               padding=True,
                                               truncation=True,
                                               max_length=self.max_length,
                                               return_tensors='pt')
                encoded_input = self._assign_device(encoded_input)
                model_output = self.model(**encoded_input)
                # Perform pooling; in this case, mean pooling
                batch_embeddings = self._mean_pooling(
                    model_output, encoded_input['attention_mask'])
                sentence_embeddings = batch_embeddings if sentence_embeddings is None else torch.cat(
                    [sentence_embeddings, batch_embeddings], dim=0)
        return sentence_embeddings

if __name__ == '__main__':
    # pip install transformers==3.0.2 ([Optional] torch==1.6.0)
    # https://github.com/huggingface/transformers
    encoder = TransformersEncoder()
    # (batch_size, hidden_size)
    print(encoder.encode(['你好呀']))
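
    # --- Illustrative usage sketch (added; not part of the original script) ---
    # Assuming the pretrained weights load successfully, the mean-pooled
    # embeddings can be compared with cosine similarity. The example sentences
    # below are made up for illustration.
    import torch.nn.functional as F
    embs = encoder.encode(['今天天气很好', '今天天气不错', '我喜欢吃苹果'])
    # similarity of the first sentence to the other two, shape (2,)
    sims = F.cosine_similarity(embs[0].unsqueeze(0), embs[1:], dim=1)
    print(sims)  # the paraphrase pair is expected to score higher than the unrelated one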