You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Hi ColPali team - thanks a lot for your work & code. When I fine-tune the ColQwen2 model with load_best_model_at_end=True, training fails at the end with the error FileNotFoundError: [Errno 2] No such file or directory: 'checkpoints-gpu0/checkpoint-1/pytorch_model.bin'
(What really frustrates me is that my code did work a few weeks ago with older versions of {colpali/transformers/etc.}, but I can no longer find that venv.)
===
My current code is:
# Fine-tuning script for ColQwen2 that evaluates and checkpoints every step and
# asks the Trainer to reload the best checkpoint when training finishes.
import os

# Select the GPU before torch is imported below, presumably so the restriction
# is already in place when CUDA is first initialised.
GPU_IDX = "0"
os.environ["CUDA_VISIBLE_DEVICES"] = GPU_IDX

from pathlib import Path

import torch
from colpali_engine.collators.visual_retriever_collator import VisualRetrieverCollator
from colpali_engine.loss import ColbertPairwiseCELoss
from colpali_engine.models import ColQwen2, ColQwen2Processor
from colpali_engine.trainer.contrastive_trainer import ContrastiveTrainer
from colpali_engine.utils.torch_utils import get_torch_device, tear_down_torch
from datasets import load_dataset, load_from_disk
from torch import nn
from transformers import BitsAndBytesConfig, TrainingArguments

# Directory reserved for the final model; it is only created here and is not
# written to anywhere in this script.
best_model_dir = Path(f"colqwen2-ft-gpu{GPU_IDX}")
best_model_dir.mkdir(exist_ok=True, parents=True)

device = get_torch_device("auto")

# 4-bit NF4 quantization config with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model_name = "vidore/colqwen2-v1.0"
model = ColQwen2.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map=device,
)

# Mark every parameter whose name contains "lora" as trainable; all other
# parameters keep whatever requires_grad value they were loaded with.
for name, param in model.named_parameters():
    if "lora" in name:
        param.requires_grad = True

processor = ColQwen2Processor.from_pretrained(model_name)
collator = VisualRetrieverCollator(processor=processor)

# 80/20 train/test split with a fixed seed; the eval split is then cut down to
# its first 5 rows and the train split is reshuffled with the same seed.
ds = load_from_disk('../colpali_fine_tuning_dataset').train_test_split(test_size=0.2, seed=42)
ds["test"] = ds["test"].select(range(5))
ds["train"] = ds["train"].shuffle(seed=42)

checkpoints_dir = Path(f"checkpoints-gpu{GPU_IDX}")
checkpoints_dir.mkdir(exist_ok=True, parents=True)

training_args = TrainingArguments(
    output_dir=str(checkpoints_dir),
    overwrite_output_dir=True,
    num_train_epochs=0.01,  # a small fraction of one epoch
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,
    gradient_accumulation_steps=4,
    gradient_checkpointing=False,
    eval_strategy="steps",
    # Save, log and evaluate on every step.
    save_steps=1,
    logging_steps=1,
    eval_steps=1,
    warmup_steps=1,
    learning_rate=5e-5,
    save_total_limit=1,
    # NOTE(review): the run reported with this script fails inside
    # Trainer._load_best_model with FileNotFoundError for
    # '<checkpoint>/pytorch_model.bin'. Presumably the saved checkpoint does
    # not contain full model weights under that file name; confirm by listing
    # the checkpoint directory before relying on this option.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss is better
)

trainer = ContrastiveTrainer(
    model=model,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    args=training_args,
    data_collator=collator,
    loss_func=ColbertPairwiseCELoss(),
    is_vision_model=True,
)

# Set after construction, as in the original script: tells the Trainer not to
# drop dataset columns, presumably because the collator needs the raw columns
# (TODO confirm against VisualRetrieverCollator).
trainer.args.remove_unused_columns = False

train_results = trainer.train()
And it throws errors:
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
Cell In[1], line 96
85 trainer = ContrastiveTrainer(
86 model=model,
87 train_dataset=ds["train"],
(...) 92 is_vision_model=True,
93 )
95 trainer.args.remove_unused_columns = False
---> 96 train_results = trainer.train()
File /venv/uv-colpali/lib/python3.11/site-packages/transformers/trainer.py:2164, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
2162 hf_hub_utils.enable_progress_bars()
2163 else:
-> 2164 return inner_training_loop(
2165 args=args,
2166 resume_from_checkpoint=resume_from_checkpoint,
2167 trial=trial,
2168 ignore_keys_for_eval=ignore_keys_for_eval,
2169 )
File /venv/uv-colpali/lib/python3.11/site-packages/transformers/trainer.py:2646, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
2643 elif is_sagemaker_mp_enabled():
2644 smp.barrier()
-> 2646 self._load_best_model()
2648 # add remaining tr_loss
2649 self._total_loss_scalar += tr_loss.item()
File /venv/uv-colpali/lib/python3.11/site-packages/transformers/trainer.py:2963, in Trainer._load_best_model(self)
2961 state_dict = safetensors.torch.load_file(best_safe_model_path, device="cpu")
2962 else:
-> 2963 state_dict = torch.load(
2964 best_model_path,
2965 map_location="cpu",
2966 **weights_only_kwarg,
2967 )
2969 # If the model is on the GPU, it still works!
2970 # workaround for FSDP bug https://github.com/pytorch/pytorch/issues/82963
2971 # which takes *args instead of **kwargs
2972 load_result = model.load_state_dict(state_dict, False)
File /venv/uv-colpali/lib/python3.11/site-packages/torch/serialization.py:1425, in load(f, map_location, pickle_module, weights_only, mmap, **pickle_load_args)
1422 if "encoding" not in pickle_load_args.keys():
1423 pickle_load_args["encoding"] = "utf-8"
-> 1425 with _open_file_like(f, "rb") as opened_file:
1426 if _is_zipfile(opened_file):
1427 # The zipfile reader is going to advance the current file position.
1428 # If we want to actually tail call to torch.jit.load, we need to
1429 # reset back to the original position.
1430 orig_position = opened_file.tell()
File /venv/uv-colpali/lib/python3.11/site-packages/torch/serialization.py:751, in _open_file_like(name_or_buffer, mode)
749 def _open_file_like(name_or_buffer, mode):
750 if _is_path(name_or_buffer):
--> 751 return _open_file(name_or_buffer, mode)
752 else:
753 if "w" in mode:
File /venv/uv-colpali/lib/python3.11/site-packages/torch/serialization.py:732, in _open_file.__init__(self, name, mode)
731 def __init__(self, name, mode):
--> 732 super().__init__(open(name, mode))
FileNotFoundError: [Errno 2] No such file or directory: 'checkpoints-gpu0/checkpoint-1/pytorch_model.bin'
Can you advise what's going wrong & am I missing something?
Thank you.
The text was updated successfully, but these errors were encountered:
Hi ColPali team - thanks a lot for your work & code. When I fine-tune the ColQwen2 model with
load_best_model_at_end=True
, training fails at the end with the error FileNotFoundError: [Errno 2] No such file or directory: 'checkpoints-gpu0/checkpoint-1/pytorch_model.bin'
(What really frustrates me is that my code did work a few weeks ago with older versions of {colpali/transformers/etc.}, but I can no longer find that venv.)
===
My current code is:
And it throws errors:
Can you advise what's going wrong & am I missing something?
Thank you.
The text was updated successfully, but these errors were encountered: