query_model.py
from utils import stub, volume, images


@stub.function()
def query_model_ctransformers(repo_id, model_file_name, question):
    """Load a local GGUF model with ctransformers and generate a response.

    Called via .local(), so it runs inside the container of the calling GPU
    function and inherits that function's image and volume configuration.
    """
    from ctransformers import AutoModelForCausalLM

    # Models are stored on the volume as /data/models/<repo_id with '/' -> '_'>/<file name>.
    model_file_path = repo_id.replace('/', '_') + '/' + model_file_name
    print(f"Loading model {model_file_name} into GPU...")
    model = AutoModelForCausalLM.from_pretrained(model_path_or_repo_id=f'/data/models/{model_file_path}',
                                                 gpu_layers=64,
                                                 context_length=6200,
                                                 local_files_only=True)
    print("Model successfully loaded.")
    print("Querying model...")
    response = model(question)
    print("Response generated.")
    return response
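

# --- Hedged sketch (not in the original file) --------------------------------
# The query functions in this module assume the model weights already live
# under /data/models on the shared volume (local_files_only=True). A helper
# along these lines could populate that layout; the function name, the timeout,
# and the availability of huggingface_hub in images['query_model'] are
# assumptions, not facts from this repo.
@stub.function(volumes={"/data": volume}, image=images['query_model'], timeout=600)
def download_gguf(repo_id, model_file_name):
    from huggingface_hub import hf_hub_download  # assumed to be installed in the image

    # Mirror the path convention used by query_model_ctransformers.
    local_dir = '/data/models/' + repo_id.replace('/', '_')
    hf_hub_download(repo_id=repo_id, filename=model_file_name, local_dir=local_dir)
    volume.commit()  # persist the downloaded file to the Modal volume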


@stub.function(volumes={"/data": volume}, image=images['query_model'], gpu="t4")
def query_mistral_7b_instruct_v0p2(question):
    """Answer a question with the quantized Mistral-7B-Instruct-v0.2 GGUF model."""
    repo_id = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
    model_file_name = "mistral-7b-instruct-v0.2.Q5_K_M.gguf"
    # .local() runs the loader inside this function's GPU container rather than
    # spawning a separate remote call.
    response = stub.registered_functions['query_model_ctransformers'].local(repo_id, model_file_name, question)
    return response


@stub.function(volumes={"/data": volume}, image=images['query_wittgenbot_ft'], gpu="t4")
def query_wittgenbot_ft(question):
    """Answer a question with the fine-tuned Wittgenbot-7B model, loaded in 4-bit."""
    import torch
    from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

    wittgenbot_model_id = 'descartesevildemon/Wittgenbot-7B'
    wittgenbot_file_path = '/data/models/' + wittgenbot_model_id.replace('/', '_')

    # 4-bit NF4 quantization with double quantization to fit the 7B model on a T4.
    bnb_config = BitsAndBytesConfig(load_in_4bit=True,
                                    bnb_4bit_quant_type='nf4',
                                    bnb_4bit_use_double_quant=True)

    print(f"Loading model {wittgenbot_model_id} into GPU...")
    model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=wittgenbot_file_path,
                                                 quantization_config=bnb_config,
                                                 torch_dtype=torch.bfloat16,
                                                 device_map='auto',
                                                 local_files_only=True)
    print("Model successfully loaded.")

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=wittgenbot_file_path)
    print("Tokenizer successfully loaded.")

    print("Querying model...")
    # Mistral-style instruction prompt.
    prompt_template = """
[INST]
You are a helpful chatbot assistant specialized in Ludwig Wittgenstein. Your task is to answer the following question about Ludwig Wittgenstein in a conversational, clear and coherent tone:
{question}
[/INST]
"""
    prompt = prompt_template.format(question=question)
    encodeds = tokenizer(prompt,
                         return_tensors="pt",
                         add_special_tokens=True)
    model_inputs = encodeds.to('cuda:0')
    generated_ids = model.generate(**model_inputs,
                                   max_new_tokens=1000,
                                   do_sample=True,
                                   top_p=0.9,
                                   temperature=0.7,
                                   pad_token_id=tokenizer.unk_token_id)
    # Decode and strip the echoed prompt so only the model's answer is returned.
    decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    response = decoded[len(prompt):].strip()
    print("Response generated.")
    return response
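

# --- Hedged usage sketch (not in the original file) ---------------------------
# A possible local entrypoint for exercising both query functions with
# `modal run query_model.py`. The entrypoint name and the default question are
# illustrative assumptions.
@stub.local_entrypoint()
def main(question: str = "What does Wittgenstein mean by a 'language game'?"):
    # .remote() executes each function in its own Modal GPU container.
    print(query_mistral_7b_instruct_v0p2.remote(question))
    print(query_wittgenbot_ft.remote(question))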