# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
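"""Generation prediction script for GLM-style models with PaddleNLP.

Supports optional LoRA adapters and tensor parallel inference. Example launch
commands (illustrative; adjust model paths and flags to your setup):

    # Single card
    python predict_generation.py --model_name_or_path THUDM/glm-large-chinese

    # Two-way tensor parallel
    python -m paddle.distributed.launch --gpus "0,1" predict_generation.py --model_name_or_path THUDM/glm-large-chinese
"""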
import argparse

import paddle
from paddle.distributed import fleet

from paddlenlp.peft import LoRAConfig, LoRAModel
from paddlenlp.transformers import (
    AutoConfig,
    AutoModelForConditionalGeneration,
    AutoTokenizer,
)


def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name_or_path", default="THUDM/glm-large-chinese", help="The directory of the model."
    )
parser.add_argument("--lora_path", default=None, help="The directory of LoRA parameters. Default to None")
    parser.add_argument(
        "--merge_tensor_parallel_path",
        default=None,
        help="The directory to save the model with merged tensor parallel parts.",
    )
parser.add_argument("--batch_size", type=int, default=2, help="The batch size of data.")
parser.add_argument("--src_length", type=int, default=200, help="The batch size of data.")
parser.add_argument("--tgt_length", type=int, default=20, help="The batch size of data.")
return parser.parse_args()


def batchfy_text(texts, batch_size):
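    """Split texts into consecutive batches of at most batch_size.

    For example, batchfy_text(["a", "b", "c"], 2) returns [["a", "b"], ["c"]].
    """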
batch_texts = []
batch_start = 0
while batch_start < len(texts):
batch_texts += [texts[batch_start : min(batch_start + batch_size, len(texts))]]
batch_start += batch_size
return batch_texts


class Predictor(object):
    def __init__(self, args):
self.tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
self.batch_size = args.batch_size
self.args = args
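        # The tensor parallel degree is taken from the number of launched
        # processes; a single-process run keeps it at 1 and skips fleet init.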
tensor_parallel_degree = paddle.distributed.get_world_size()
tensor_parallel_rank = 0
if tensor_parallel_degree > 1:
strategy = fleet.DistributedStrategy()
strategy.hybrid_configs = {
"dp_degree": 1,
"mp_degree": tensor_parallel_degree,
"pp_degree": 1,
"sharding_degree": 1,
}
fleet.init(is_collective=True, strategy=strategy)
hcg = fleet.get_hybrid_communicate_group()
tensor_parallel_rank = hcg.get_model_parallel_rank()
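        # Prefer the dtype recorded in the LoRA config so the base weights
        # match the adapter; otherwise fall back to the model config's dtype.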
if self.args.lora_path is not None:
lora_config = LoRAConfig.from_pretrained(self.args.lora_path)
dtype = lora_config.dtype
else:
config = AutoConfig.from_pretrained(args.model_name_or_path)
dtype = config.dtype if config.dtype is not None else "float32"
self.model = AutoModelForConditionalGeneration.from_pretrained(
args.model_name_or_path,
tensor_parallel_degree=tensor_parallel_degree,
tensor_parallel_rank=tensor_parallel_rank,
dtype=dtype,
)
if self.args.lora_path is not None:
self.model = LoRAModel.from_pretrained(self.model, self.args.lora_path)
self.model.eval()

    def preprocess(self, input_text):
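        # GLM generates text at the [gMASK] position, so append it to each prompt.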
input_text = [text.strip() + "[gMASK]" for text in input_text]
inputs = self.tokenizer(
input_text,
return_tensors="np",
add_special_tokens=True,
padding=True,
max_length=self.args.src_length,
truncation=True,
truncation_side="left",
)
inputs = self.tokenizer.build_inputs_for_generation(inputs, max_gen_length=self.args.tgt_length)
inputs_tensor = {}
for key, value in inputs.items():
inputs_tensor[key] = paddle.to_tensor(value)
return inputs_tensor

    def infer(self, inputs):
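        # decode_strategy="sampling" with top_k=1 is effectively greedy decoding.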
result = self.model.generate(
**inputs,
decode_strategy="sampling",
top_k=1,
max_length=self.args.tgt_length,
eos_token_id=self.tokenizer.eop_token_id,
pad_token_id=self.tokenizer.pad_token_id,
)
result = result[0]
return result

    def postprocess(self, infer_data):
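        # Decode generated token ids back to text, dropping special tokens.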
result = []
for x in infer_data.tolist():
res = self.tokenizer.decode(x, skip_special_tokens=True)
result.append(res)
out_dict = {"result": result}
return out_dict

    def predict(self, texts):
input_map = self.preprocess(texts)
infer_result = self.infer(input_map)
output = self.postprocess(infer_result)
return output


if __name__ == "__main__":
args = parse_arguments()
predictor = Predictor(args)
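    # Sample prompts for the Chinese GLM model: each provides an answer ("答案")
    # and a context ("上下文"), then asks the model to generate the matching
    # question ("问题").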
all_texts = [
"答案:年基准利率4.35%,上下文:从实际看,贷款的基本条件是: 一是中国大陆居民,年龄在60岁以下; 二是有稳定的住址和工作或经营地点; 三是有稳定的收入来源; 四是无不良信用记录,贷款用途不能作为炒股,赌博等行为; 五是具有完全民事行为能力。在已知答案的前提下,问题:",
"答案:U系列,上下文:U系列是最好的,采用国际顶尖技术(由格力自主研发)双级变频压缩机,提高压缩机运转效率,制冷制热能力更强劲;1赫兹变频技术,使空调相当于一个15 W电灯泡,更加节能省电;送风面积广,风力大;生态风,净化空气。非常不错,现在国美在做活动,可以了解一下。在已知答案的前提下,问题:",
]
batch_texts = batchfy_text(all_texts, args.batch_size)
    for texts in batch_texts:
outputs = predictor.predict(texts)
for text, result in zip(texts, outputs["result"]):
print("{}\n{}".format(text, result))
if args.merge_tensor_parallel_path is not None:
predictor.model.save_pretrained(
save_dir=args.merge_tensor_parallel_path,
merge_tensor_parallel=True,
)
predictor.tokenizer.save_pretrained(args.merge_tensor_parallel_path)