transformers_encoder.py · 134 lines (112 loc) · 5.24 KB
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date : 2020-08-29 01:51:01
# @Author : Kaiyan Zhang (minekaiyan@gmail.com)
# @Link : https://github.com/iseesaw
# @Version : 1.0.0
import numpy as np
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel
class TransformersEncoder:
    """Sentence-embedding encoder built on top of Transformers models."""

    def __init__(
            self,
            model_name_or_path='/users6/kyzhang/embeddings/bert/bert-base-chinese',
            max_length=128,
            batch_size=128):
        """Initialize the encoder.

        Args:
            model_name_or_path (str, optional): Local path of a Transformers model,
                or a model name to download from the HuggingFace hub.
            max_length (int, optional): Maximum sequence length. Defaults to 128.
            batch_size (int, optional): Batch size used for encoding. Defaults to 128.
        """
        print('initializing encoder')
        print('loading pretrained model')
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        self.model = AutoModel.from_pretrained(model_name_or_path)
        # GPU if available, otherwise CPU
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        print('using', self.device)
        self.model.to(self.device)
        self.model.eval()
        self.max_length = max_length
        self.batch_size = batch_size
        print('encoder initialized')

    def _assign_device(self, Tokenizer_output):
        """Move the tokenizer output tensors onto the target device."""
        tokens_tensor = Tokenizer_output['input_ids'].to(self.device)
        token_type_ids = Tokenizer_output['token_type_ids'].to(self.device)
        attention_mask = Tokenizer_output['attention_mask'].to(self.device)
        output = {
            'input_ids': tokens_tensor,
            'token_type_ids': token_type_ids,
            'attention_mask': attention_mask
        }
        return output
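
    # Note (added): this helper assumes a BERT-style tokenizer that returns
    # token_type_ids; tokenizers without that field would need this adjusted.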

    def _mean_pooling(self, model_output, attention_mask):
        """Mean pooling over the token embeddings.

        Args:
            model_output: Output of the Transformers model.
            attention_mask (torch.Tensor): Attention mask, (batch_size, seq_length).

        Returns:
            torch.Tensor: Sentence embeddings, (batch_size, hidden_size).
        """
        # (batch_size, seq_length, hidden_size)
        token_embeddings = model_output[0].cpu()
        # (batch_size, seq_length) => (batch_size, seq_length, hidden_size)
        input_mask_expanded = attention_mask.cpu().unsqueeze(-1).expand(
            token_embeddings.size()).float()
        # Only sum the non-padding token embeddings
        # (batch_size, seq_length, hidden_size) => (batch_size, hidden_size)
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp the denominator to avoid division by zero
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask
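
    # Illustrative note (added, not from the original file): with an attention
    # mask row of [1, 1, 0], only the first two token vectors contribute to
    # sum_embeddings, and dividing by sum_mask (= 2) yields their average;
    # the clamp(min=1e-9) only guards against an all-padding row causing 0/0.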

    def encode(self, sentences, show_progress_bar=False):
        """Encode sentences into sentence embeddings.

        Args:
            sentences (List[str]): Input sentences, (batch_size).

        Returns:
            torch.Tensor: Sentence embeddings, (batch_size, hidden_size).
        """
        # Tokenize and encode the sentences batch by batch
        dataloader = DataLoader(sentences,
                                batch_size=self.batch_size,
                                shuffle=False)
        dataloader = tqdm(dataloader) if show_progress_bar else dataloader
        sentence_embeddings: torch.Tensor = None
        # Compute token embeddings
        with torch.no_grad():
            # The model returns (sequence_output, pooled_output, (hidden_states), (attentions)).
            # sequence_output, (batch_size, sequence_length, hidden_size):
            #   hidden states at the output of the last layer of the model.
            # pooled_output, (batch_size, hidden_size):
            #   last-layer hidden state of the first token ([CLS]), further processed
            #   by a Linear layer and a Tanh activation; the Linear weights are trained
            #   on the next-sentence-prediction (classification) objective during pre-training.
            #   It is not a good summary of the semantic content of the input;
            #   averaging or pooling the hidden states over the whole sequence works better.
            for batch_sentences in dataloader:
                encoded_input = self.tokenizer(batch_sentences,
                                               padding=True,
                                               truncation=True,
                                               max_length=self.max_length,
                                               return_tensors='pt')
                encoded_input = self._assign_device(encoded_input)
                model_output = self.model(**encoded_input)
                # Perform pooling; in this case, mean pooling
                batch_embeddings = self._mean_pooling(
                    model_output, encoded_input['attention_mask'])
                sentence_embeddings = batch_embeddings if sentence_embeddings is None else torch.cat(
                    [sentence_embeddings, batch_embeddings], dim=0)
        return sentence_embeddings

if __name__ == '__main__':
    # pip install transformers==3.0.2 ([Optional] torch==1.6.0)
    # https://github.com/huggingface/transformers
    encoder = TransformersEncoder()
    # (batch_size, hidden_size)
    print(encoder.encode(['你好呀']))
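
    # --- Illustrative usage sketch (added; not part of the original script) ---
    # Assuming the pretrained weights load successfully, the mean-pooled
    # embeddings can be compared with cosine similarity. The example sentences
    # below are made up for illustration.
    import torch.nn.functional as F
    embs = encoder.encode(['今天天气很好', '今天天气不错', '我喜欢吃苹果'])
    # similarity of the first sentence to the other two, shape (2,)
    sims = F.cosine_similarity(embs[0].unsqueeze(0), embs[1:], dim=1)
    print(sims)  # the paraphrase pair is expected to score higher than the unrelated one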