build_keyword.py
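
"""Build keyword files: load the test split of the configured dataset,
recover each sample's keyword BPE tokens and the words they form, and
write them to tokens.txt and words.txt (see build_file / main below)."""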
import argparse
import os
from typing import List

from omegaconf import DictConfig, OmegaConf
from rich.progress import track
from transformers import AutoTokenizer

from ld4pg.data import get_dataset
from ld4pg.data.controlnet_data_module import ControlNetKeywordDataModule

_WORD_START = 'Ġ'
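# Note: 'Ġ' is the word-start marker produced by byte-level BPE tokenizers
# (GPT-2/RoBERTa style): tokens that begin a new word carry a leading 'Ġ',
# while continuation pieces do not. For example, "quick brown foxes" may
# tokenize to ['quick', 'Ġbrown', 'Ġfox', 'es'] (the exact pieces depend on
# the tokenizer's vocabulary).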

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, default="conf/controlnet/qqp_chatgpt_mask_control_conv.yaml")
    parser.add_argument("--save_path", type=str, default="/home/zhuangzy/")
    args = parser.parse_args()
    return args

def build_dataset(cfg: DictConfig, evaluation=False):
    tokenizer = AutoTokenizer.from_pretrained(cfg.params.tokenizer)
    dataset = get_dataset(cfg.name)
    # In evaluation mode only the test split is wrapped; otherwise the
    # train/valid splits are wrapped and the test split is left empty.
    dataset_module = ControlNetKeywordDataModule(
        cfg=cfg.params,
        tokenizer=tokenizer,
        train_dataset=dataset[0] if not evaluation else None,
        valid_dataset=dataset[1] if not evaluation else None,
        test_dataset=dataset[2] if evaluation else None,
        inf_train_dataloader=False,
    )
    return dataset_module, tokenizer
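# A minimal usage sketch (assumes the QQP config shipped with the repo, and
# that the data module exposes a `test_dataset` attribute as used in main()):
#   cfg = OmegaConf.load("conf/controlnet/qqp_chatgpt_mask_control_conv.yaml")
#   module, tok = build_dataset(cfg.data, evaluation=True)
#   kw_ids = module.test_dataset.kw_ids  # keyword token ids per sample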

def get_keywords(dataset, tokenizer):
    tokens = []
    words = []
    for kw_ids, ids in track(zip(dataset.kw_ids, dataset.target['input_ids'])):
        kw_tokens = tokenizer.convert_ids_to_tokens(kw_ids)
        word_list = []
        token_list = []
        try:
            for idx, token in enumerate(kw_tokens):
                if token not in tokenizer.special_tokens_map.values():
                    # Strip trailing word-start markers, so whitespace-only
                    # tokens such as 'ĠĠ' collapse to ''.
                    token = token.rstrip(_WORD_START)
                    if token.startswith(_WORD_START):
                        # A leading marker starts a new word.
                        word_list.append(token.strip(_WORD_START))
                    elif idx >= 1 and kw_tokens[idx - 1] in tokenizer.special_tokens_map.values():
                        # The first token after a special token also starts a
                        # new word, even without a marker.
                        word_list.append(token)
                    else:
                        # Continuation piece: glue the BPE sub-token onto the
                        # previous word.
                        word_list[-1] = word_list[-1] + token
                    token_list.append(token)
        except Exception as e:
            # Dump the offending keyword tokens and the full target sequence
            # before re-raising, to ease debugging.
            print(kw_tokens)
            print(tokenizer.convert_ids_to_tokens(ids))
            print(e)
            raise e
        words.append(word_list)
        tokens.append(token_list)
    return tokens, words
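# Worked example (a sketch; exact pieces depend on the tokenizer): for
# kw_tokens = ['<s>', 'quick', 'Ġbrown', 'Ġfox', 'es', '</s>'] the loop yields
#   word_list  == ['quick', 'brown', 'foxes']
#   token_list == ['quick', 'Ġbrown', 'Ġfox', 'es']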

def build_file(tokens: List[List[str]], save_path, prefix: str):
    result = ""
    for token_list in tokens:
        result += " ".join(token_list) + "\n"
    with open(os.path.join(save_path, f"{prefix}.txt"), "w") as f:
        f.write(result)
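# Output format: one line per sample, entries joined by single spaces, e.g.
#   quick Ġbrown Ġfox es
# written to <save_path>/<prefix>.txt ("tokens" and "words" in main()).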

def main(opt: argparse.Namespace):
    cfg: DictConfig = OmegaConf.load(opt.config)
    dataset, tokenizer = build_dataset(cfg.data, evaluation=True)
    tokens, words = get_keywords(dataset.test_dataset, tokenizer)
    build_file(tokens, opt.save_path, "tokens")
    build_file(words, opt.save_path, "words")

if __name__ == '__main__':
    option = parse_args()
    main(option)
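# Usage sketch (the output path is illustrative):
#   python build_keyword.py \
#       --config conf/controlnet/qqp_chatgpt_mask_control_conv.yaml \
#       --save_path /path/to/output
# This writes tokens.txt and words.txt under --save_path.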