diff --git a/examples/chatbot.py b/examples/chatbot.py
index 7d835acdf..063293d5d 100644
--- a/examples/chatbot.py
+++ b/examples/chatbot.py
@@ -14,7 +14,7 @@
 from lmflow.pipeline.auto_pipeline import AutoPipeline
 from lmflow.models.auto_model import AutoModel
 from lmflow.args import ModelArguments, DatasetArguments, AutoArguments
-
+import torch.distributed as dist
 
 logging.disable(logging.ERROR)
 warnings.filterwarnings("ignore")
@@ -70,7 +70,8 @@ def main():
         f"#############################################################################\n"
         "\n"
     )
-    print(guide_message, end="")
+    if dist.get_rank() == 0:
+        print(guide_message, end="")
 
     # context = (
     #     "You are a helpful assistant who follows the given instructions"
@@ -80,8 +81,15 @@
 
     end_string = "\n\n"
     while True:
-        input_text = input("User >>> ")
-        if not input_text:
+        if dist.get_rank() == 0:
+            input_text = input("User >>> ")
+            dist.broadcast_object_list([input_text])
+        else:
+            recev_object = [None] * 1
+            dist.broadcast_object_list(recev_object)
+            input_text = recev_object[0]
+
+        if input_text == "exit":
             print("exit...")
             break
 
@@ -108,7 +116,8 @@ def main():
             index = response.index(end_string)
             response = response[:index + 1]
 
-        print("Bot: " + response, end="")
+        if dist.get_rank() == 0:
+            print("Bot: " + response, end="")
         context += response
         context = context[-model.get_max_length():]    # Memory of the bot
 
diff --git a/src/lmflow/models/hf_decoder_model.py b/src/lmflow/models/hf_decoder_model.py
index 8a68443fd..032e53621 100644
--- a/src/lmflow/models/hf_decoder_model.py
+++ b/src/lmflow/models/hf_decoder_model.py
@@ -48,6 +48,8 @@
 from lmflow.models.decoder_model import DecoderModel
 from lmflow.models.interfaces.tunable import Tunable
 
+from transformers.models.llama.modeling_llama import LlamaDecoderLayer
+
 
 class HFDecoderModel(DecoderModel, Tunable):
     r"""
@@ -179,7 +181,6 @@ def __init__(
             self.tune_strategy = tune_strategy
 
         elif tune_strategy == 'none':
-            dschf = HfDeepSpeedConfig(ds_config)
             self.backend_model = AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path)
             self.tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
             peft_model_id = model_args.lora_model_path
@@ -188,9 +189,12 @@ def __init__(
                     self.backend_model, peft_model_id
                 )
 
-            deepspeed.init_distributed()
-            self.ds_engine = deepspeed.initialize(model=self.backend_model, config_params=ds_config)[0]
-            self.ds_engine.module.eval()
+            self.ds_engine = deepspeed.init_inference(
+                self.backend_model,
+                mp_size=2,
+                dtype=torch.half,
+                injection_policy={LlamaDecoderLayer: ('self_attn.o_proj', 'mlp.down_proj')}
+            )
 
         elif tune_strategy == 'adapter':
             raise NotImplementedError('adapter tune strategy not implemented')
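
Note on the chatbot.py hunks above: every rank must reach the same dist.broadcast_object_list call so that the prompt typed on rank 0's console is shared with all other processes before generation starts. The following is a minimal, self-contained sketch of that pattern; the file name, the gloo backend, and the launch command are illustrative assumptions, not part of the patch.

    # broadcast_input_demo.py -- illustrative sketch only; file name and backend
    # are assumptions, not part of the LMFlow patch above.
    import torch.distributed as dist

    def read_user_input():
        """Rank 0 reads from stdin, then every rank receives the same string."""
        if dist.get_rank() == 0:
            obj = [input("User >>> ")]
            dist.broadcast_object_list(obj, src=0)   # send rank 0's object
        else:
            obj = [None]
            dist.broadcast_object_list(obj, src=0)   # filled in place with rank 0's object
        return obj[0]

    if __name__ == "__main__":
        dist.init_process_group(backend="gloo")      # the launcher sets RANK/WORLD_SIZE env vars
        text = read_user_input()
        print(f"rank {dist.get_rank()} received: {text!r}")

This sketch can be run with two processes via, for example, `torchrun --nproc_per_node=2 broadcast_input_demo.py`; the modified chatbot example itself would be launched through the DeepSpeed launcher (e.g. `deepspeed --num_gpus 2 examples/chatbot.py ...`) so that deepspeed.init_inference with mp_size=2 can shard the model across both GPUs.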