class Trainer
""" Trainer is a simple but feature-complete training and eval loop for PyTorch, optimized for 🤗 Transformers.
[Trainer
] is optimized to work with the [PreTrainedModel
] provided by the library. You can still use your own models defined as torch.nn.Module
as long as they work the same way as the 🤗 Transformers models. """
Basic setup: args, seed, deepspeed
self.args = args
enable_full_determinism(self.args.seed) if self.args.full_determinism else set_seed(self.args.seed)
self.hp_name = None
self.deepspeed = None
self.is_in_train = False
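For reference, a minimal sketch of the same seeding logic outside the Trainer, using the public helpers from transformers.trainer_utils; the seed value and the full_determinism flag below are made-up stand-ins for args.seed / args.full_determinism:

from transformers.trainer_utils import enable_full_determinism, set_seed

seed, full_determinism = 42, False   # assumed example values
# set_seed seeds python, numpy and torch; enable_full_determinism additionally
# forces deterministic CUDA/cuDNN algorithms (slower, but fully reproducible).
enable_full_determinism(seed) if full_determinism else set_seed(seed)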
Accelerator setup
--> to be revisited and organized later
self.create_accelerator_and_postprocess()
Memory tracker setup (best done as early as possible)
self._memory_tracker = TrainerMemoryTracker(self.args.skip_memory_metrics)
self._memory_tracker.start()
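A hedged sketch of the tracker's lifecycle, assuming the trainer_utils API (start() and stop_and_update_metrics()); it is started this early so that the rest of __init__, including model loading, is covered by the init-stage measurement:

from transformers.trainer_utils import TrainerMemoryTracker

tracker = TrainerMemoryTracker(skip_memory_metrics=False)
tracker.start()                           # begin measuring the "init" stage
# ... load the model, build the rest of the trainer state ...
metrics = {}
tracker.stop_and_update_metrics(metrics)  # adds deltas such as init_mem_cpu_alloc_delta
print(metrics)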
Set the correct log level depending on the node/process
log_level = args.get_process_log_level()
logging.set_verbosity(log_level)
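For illustration, the same two calls driven by a hand-built TrainingArguments; log_level and log_level_replica are the real knobs, while the concrete values are just assumptions. The main process picks up log_level, replicas pick up log_level_replica:

from transformers import TrainingArguments
from transformers.utils import logging

args = TrainingArguments(output_dir="out", log_level="info", log_level_replica="warning")
logging.set_verbosity(args.get_process_log_level())  # INFO on the main process, WARNING on replicas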
force device and distributed setup init explicitly
args._setup_devices
Model setup
The Trainer needs either model or model_init; if both are given, model_init takes precedence and overwrites model.
If model is None but model_init is provided, the model is built via self.call_model_init(); model_init must accept either 0 or 1 arguments (the hyperparameter-search trial). ## verify
Check whether model.__class__.__name__ appears in MODEL_MAPPING_NAMES, the mapping backing the Hugging Face AutoClasses. ## model.__class__.__name__ is simply the class-name string of the model instance, see the example below
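To make that concrete, a tiny stand-alone example (TinyModel is hypothetical, only for illustration) showing that model.__class__.__name__ is just the class-name string the Trainer compares against MODEL_MAPPING_NAMES:

import torch.nn as nn

class TinyModel(nn.Module):          # hypothetical model, only for illustration
    pass

model = TinyModel()
print(model.__class__.__name__)      # "TinyModel"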
After the model is loaded, check whether it is model-parallel and whether it is quantized
Set self.is_model_parallel from the model's is_parallelizable and model_parallel attributes
Check the model's hf_device_map ## if the map spans more than one device (ignoring "cpu" and "disk"), the model is treated as parallel, as the snippet and example below show
devices = [device for device in set(model.hf_device_map.values()) if device not in ["cpu", "disk"]]
if len(devices) > 1:
    self.is_model_parallel = True
elif len(devices) == 1:
    self.is_model_parallel = self.args.device != torch.device(devices[0])
else:
    self.is_model_parallel = False
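A made-up hf_device_map (the kind produced by loading with device_map="auto") to show how this check plays out:

hf_device_map = {"model.embed_tokens": 0, "model.layers.0": 0, "model.layers.1": 1, "lm_head": "cpu"}
devices = [d for d in set(hf_device_map.values()) if d not in ["cpu", "disk"]]
print(sorted(devices))   # [0, 1] -> more than one accelerator device, so is_model_parallel = True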
Check whether the model is quantized without PEFT adapters loaded (i.e. a quantized base model)
_is_quantized_and_base_model = getattr(model, "is_quantized", False) and not getattr(
    model, "_hf_peft_config_loaded", False
)
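The getattr(obj, name, default) pattern above just probes for optional flags; a dummy object (hypothetical, for illustration) makes the behavior clear:

class DummyQuantizedModel:           # hypothetical stand-in for a bitsandbytes-quantized model
    is_quantized = True

m = DummyQuantizedModel()
print(getattr(m, "is_quantized", False))            # True  -- flag is present
print(getattr(m, "_hf_peft_config_loaded", False))  # False -- attribute missing, default returned
# -> _is_quantized_and_base_model would be True: quantized, but no PEFT adapters attached yet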
Filter out models that are both quantized and torch.compile()-d ## a compiled model is an OptimizedModule wrapper that exposes the original module as _orig_mod, and fine-tuning a quantized model with PEFT needs the non-compiled module, hence the error below
if _is_quantized_and_base_model and hasattr(model, "_orig_mod"):
    raise ValueError(
        "You cannot fine-tune quantized model with torch.compile(), "
        "make sure to pass a non-compiled model when fine-tuning a quantized model with PEFT"
    )
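Why _orig_mod: torch.compile() wraps an nn.Module in an OptimizedModule that keeps the original module under _orig_mod, which is exactly what the check above looks for. A small runnable demonstration (requires PyTorch 2.x):

import torch
import torch.nn as nn

model = nn.Linear(4, 4)
compiled = torch.compile(model)        # returns an OptimizedModule wrapper
print(hasattr(model, "_orig_mod"))     # False
print(hasattr(compiled, "_orig_mod"))  # True  -- how a compiled model is detected
print(compiled._orig_mod is model)     # True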
# one place to sort out whether to place the model on device or not
# postpone switching model to cuda when:
# 1. MP - since we are trying to fit a much bigger than 1 gpu model
# 2. fp16-enabled DeepSpeed loads the model in half the size and it doesn't need .to() anyway,
# and we only use deepspeed for training at the moment
# 3. full bf16 or fp16 eval - since the model needs to be cast to the right dtype first
# 4. FSDP - same as MP
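A hedged sketch (not a verbatim copy of the Trainer) of how these four cases translate into a single place-on-device decision; the flag names are modeled on TrainingArguments/Trainer attributes and the _Flags container is hypothetical:

from dataclasses import dataclass

@dataclass
class _Flags:                      # hypothetical container, only for this sketch
    is_model_parallel: bool = False
    is_deepspeed_enabled: bool = False
    is_fsdp_enabled: bool = False
    fp16_full_eval: bool = False
    bf16_full_eval: bool = False
    do_train: bool = True
    place_model_on_device: bool = True

def should_place_model_on_device(f: _Flags) -> bool:
    # Postpone model.to(device) in the four cases listed above.
    if (
        f.is_model_parallel                                              # 1. MP
        or f.is_deepspeed_enabled                                        # 2. DeepSpeed handles placement
        or ((f.fp16_full_eval or f.bf16_full_eval) and not f.do_train)   # 3. full half-precision eval
        or f.is_fsdp_enabled                                             # 4. FSDP shards the model later
    ):
        return False
    return f.place_model_on_device

print(should_place_model_on_device(_Flags()))                        # True
print(should_place_model_on_device(_Flags(is_model_parallel=True)))  # False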
TrainerState
"""A class containing the [`Trainer`] inner state that will be saved along the model and optimizer when checkpointing and passed to the [`TrainerCallback`]."""
Here, one "step" means one update step: with gradient_accumulation_steps=n, the forward and backward passes run over n batches before each optimizer step.
https://github.com/huggingface/transformers/blob/main/src/transformers/trainer.py#L1530
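To make "one step" concrete, a self-contained toy version of the accumulation pattern (a sketch, not the Trainer's actual inner loop): with gradient_accumulation_steps = 4, the optimizer steps once per 4 batches, and that single optimizer step is what gets counted as a step.

import torch
import torch.nn as nn

model = nn.Linear(8, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
gradient_accumulation_steps = 4
dataloader = [(torch.randn(2, 8), torch.randn(2, 1)) for _ in range(8)]  # 8 toy batches

global_step = 0
for i, (x, y) in enumerate(dataloader):
    loss = nn.functional.mse_loss(model(x), y) / gradient_accumulation_steps
    loss.backward()                                   # gradients accumulate across batches
    if (i + 1) % gradient_accumulation_steps == 0:
        optimizer.step()                              # one update step
        optimizer.zero_grad()
        global_step += 1

print(global_step)  # 2 update steps for 8 batches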