You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
# Abort early when not running under bash: the rest of the script relies on
# bash-only features (process substitution, [[ ]]).  BASH_VERSION is set only
# by bash itself, so it is empty under sh/dash/zsh-as-sh.
if [ -z "${BASH_VERSION}" ]; then
  # Fix: diagnostics belong on stderr; the original redirected to stdout (>&1).
  echo "Please use bash to run this script." >&2
  exit 1
fi
# --- run configuration ------------------------------------------------------
ckpts_dir="/nas/shared/GAIR/ckpts"
output_dir="/cpfs01/shared/GAIR/GAIR_hdd/rzfan"

# Base checkpoint to fine-tune and where the SFT run writes its artifacts.
BASE_MODEL="/nas/shared/GAIR/ckpts/llama-3-chinese/8b"
OUTPUT_DIR="${output_dir}/TranslationDataDetection/llama3_Chinese_8b_GSM8K-ZH-train-1k_sft_answer_loss"

# Effective batch size = NUM_GPUS * BATCH_SIZE_PER_GPU * GRADIENT_ACC_STEPS.
NUM_GPUS=8
BATCH_SIZE_PER_GPU=16
TOTAL_BATCH_SIZE=128
GRADIENT_ACC_STEPS=$(( TOTAL_BATCH_SIZE / (NUM_GPUS * BATCH_SIZE_PER_GPU) ))
# Print the resolved run configuration so it is captured in the job log.
# Fix: the original used `echo -e` with backslash line-continuations, which is
# non-portable and accidentally emitted an extra trailing blank line; printf
# with one '%s' per field is explicit and robust.
print_run_config() {
  printf 'base model path: %s\n' "${BASE_MODEL}"
  printf 'GPU number: %s\n' "${NUM_GPUS}"
  printf 'batch size per GPU: %s\n' "${BATCH_SIZE_PER_GPU}"
  printf 'gradient accumulation steps: %s\n' "${GRADIENT_ACC_STEPS}"
  printf 'output path: %s\n\n' "${OUTPUT_DIR}"
}
print_run_config
# Create the output directory (including parents) before redirecting logs into it.
mkdir -p "${OUTPUT_DIR}"
# OUTPUT_DIR="$(cd "${OUTPUT_DIR}" &>/dev/null && pwd)"
# if [[ ! -f "${OUTPUT_DIR}/.gitignore" ]]; then
# echo '*' >"${OUTPUT_DIR}/.gitignore"
# fi
# Mirror stdout and stderr into log files while still showing them on the
# console: each stream is piped through its own tee via process substitution.
# NOTE(review): the ">&1" inside the first substitution is redundant (tee
# already writes to its own stdout); harmless, but could be dropped.
exec 1> >(tee "${OUTPUT_DIR}/stdout.log" >&1) 2> >(tee "${OUTPUT_DIR}/stderr.log" >&2)
# Hyperparameters follow the Tulu-2 paper setup; adjust to your own needs.
# For detailed parameter documentation run: python src/train_bash.py --help
# REMEMBER to adjust the batch size when using fewer than 8 GPUs.
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# Fixes vs. original: `--do_train True\` and `--report_to "wandb"\` had no
# space before the continuation backslash, which can fuse the value with the
# next line's flag; all variable expansions are now quoted.
# NOTE(review): --temperature is a generation-time parameter and looks out of
# place in an SFT training command — confirm it is actually consumed here.
deepspeed --include localhost:0,1,2,3,4,5,6,7 --master_port=9701 src/train.py \
  --deepspeed ds_config.json \
  --stage sft \
  --do_train True \
  --model_name_or_path "${BASE_MODEL}" \
  --dataset GSM8K_ZH_train \
  --template llama3 \
  --cutoff_len 2048 \
  --finetuning_type full \
  --temperature 0 \
  --output_dir "${OUTPUT_DIR}" \
  --overwrite_cache \
  --per_device_train_batch_size "${BATCH_SIZE_PER_GPU}" \
  --weight_decay 0.1 \
  --gradient_accumulation_steps "${GRADIENT_ACC_STEPS}" \
  --lr_scheduler_type linear \
  --warmup_ratio 0.1 \
  --logging_steps 1 \
  --save_steps 1000 \
  --learning_rate 2e-5 \
  --num_train_epochs 10 \
  --plot_loss \
  --report_to "wandb" \
  --bf16 True \
  --tf32 False \
  --flash_attn auto \
  --overwrite_output_dir \
  --train_on_prompt False \
  --max_samples 1000
# Canonicalize OUTPUT_DIR to an absolute path.  Fix: the original discarded
# the cd failure with &>/dev/null, leaving OUTPUT_DIR empty and silently
# writing "/.gitignore"; fail loudly instead.
resolved_dir="$(cd "${OUTPUT_DIR}" 2>/dev/null && pwd)" || {
  echo "error: cannot resolve output directory: ${OUTPUT_DIR}" >&2
  exit 1
}
OUTPUT_DIR="${resolved_dir}"
# Keep training artifacts out of version control.
if [[ ! -f "${OUTPUT_DIR}/.gitignore" ]]; then
  echo '*' >"${OUTPUT_DIR}/.gitignore"
fi
The error message is:
import peft
File "/opt/conda/lib/python3.10/site-packages/peft/__init__.py", line 22, in <module>
from .auto import (
File "/opt/conda/lib/python3.10/site-packages/peft/auto.py", line 32, in <module>
from .mapping import MODEL_TYPE_TO_PEFT_MODEL_MAPPING
File "/opt/conda/lib/python3.10/site-packages/peft/mapping.py", line 22, in <module>
from .mixed_model import PeftMixedModel
File "/opt/conda/lib/python3.10/site-packages/peft/mixed_model.py", line 26, in <module>
from peft.tuners.mixed import COMPATIBLE_TUNER_TYPES
File "/opt/conda/lib/python3.10/site-packages/peft/tuners/__init__.py", line 21, in <module>
from .lora import LoraConfig, LoraModel, LoftQConfig
File "/opt/conda/lib/python3.10/site-packages/peft/tuners/lora/__init__.py", line 20, in <module>
from .model import LoraModel
File "/opt/conda/lib/python3.10/site-packages/peft/tuners/lora/model.py", line 49, in <module>
from .awq import dispatch_awq
File "/opt/conda/lib/python3.10/site-packages/peft/tuners/lora/awq.py", line 26, in <module>
from awq.modules.linear import WQLinear_GEMM
File "/opt/conda/lib/python3.10/site-packages/awq/__init__.py", line 2, in <module>
from awq.models.auto import AutoAWQForCausalLM
File "/opt/conda/lib/python3.10/site-packages/awq/models/__init__.py", line 1, in <module>
from .mpt import MptAWQForCausalLM
File "/opt/conda/lib/python3.10/site-packages/awq/models/mpt.py", line 1, in <module>
from .base import BaseAWQForCausalLM
File "/opt/conda/lib/python3.10/site-packages/awq/models/base.py", line 46, in <module>
from awq.quantize.quantizer import AwqQuantizer
File "/opt/conda/lib/python3.10/site-packages/awq/quantize/quantizer.py", line 10, in <module>
from awq.quantize.scale import apply_scale, apply_clip
File "/opt/conda/lib/python3.10/site-packages/awq/quantize/scale.py", line 9, in <module>
from transformers.models.gemma.modeling_gemma import GemmaRMSNorm
ModuleNotFoundError: No module named 'transformers.models.gemma'
Expected behavior
Training is expected to run normally.
Others
No response
The text was updated successfully, but these errors were encountered:
Reminder
System Info
llamafactory
version: 0.9.1.dev0

Reproduction
I want to run full-parameter SFT fine-tuning on llama-3-chinese:
The error message is:
Expected behavior
Training is expected to run normally.
Others
No response
The text was updated successfully, but these errors were encountered: