diff --git a/src/lmflow/args.py b/src/lmflow/args.py
index 480c6a8d..518957db 100644
--- a/src/lmflow/args.py
+++ b/src/lmflow/args.py
@@ -631,9 +631,6 @@ def __post_init__(self):
         if self.validation_file is not None:
             extension = self.validation_file.split(".")[-1]
             assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
-
-        if self.skip_dataset_check:
-            logger.warning("Skip dataset check is enabled. Make sure the datasets are in the correct format.")
 
 
 @dataclass
diff --git a/src/lmflow/models/hf_decoder_model.py b/src/lmflow/models/hf_decoder_model.py
index dd4fc9f9..0fd6da2f 100644
--- a/src/lmflow/models/hf_decoder_model.py
+++ b/src/lmflow/models/hf_decoder_model.py
@@ -234,7 +234,6 @@ def tokenize(
             "load_from_cache_file": not data_args.overwrite_cache,
             "desc": "Running tokenizer on dataset",
             "new_fingerprint": fingerprint,
-            "max_length": data_args.block_size,
         }
 
         if data_args.block_size < self.tokenizer.model_max_length:
diff --git a/src/lmflow/pipeline/finetuner.py b/src/lmflow/pipeline/finetuner.py
index 34eb948c..ee9b67c7 100644
--- a/src/lmflow/pipeline/finetuner.py
+++ b/src/lmflow/pipeline/finetuner.py
@@ -33,7 +33,7 @@
 import numpy as np
 
 import lmflow.optim.optimizers as optim
-from lmflow.args import OptimizerNames
+from lmflow.args import OptimizerNames, DatasetArguments, ModelArguments, FinetunerArguments
 from lmflow.datasets.dataset import Dataset
 from lmflow.pipeline.base_tuner import BaseTuner
 from lmflow.pipeline.utils.peft_trainer import PeftTrainer, PeftSavingCallback
@@ -64,7 +64,14 @@ class Finetuner(BaseTuner):
         Keyword arguments.
 
     """
 
-    def __init__(self, model_args, data_args, finetuner_args, *args, **kwargs):
+    def __init__(
+        self,
+        model_args: ModelArguments,
+        data_args: DatasetArguments,
+        finetuner_args: FinetunerArguments,
+        *args,
+        **kwargs
+    ):
         self.model_args = model_args
         self.data_args = data_args