Skip to content

[bug?] No such file or directory: 'checkpoints/checkpoint-1/pytorch_model.bin' #256

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
groklab opened this issue Apr 23, 2025 · 1 comment

Comments

@groklab
Copy link

groklab commented Apr 23, 2025

Hi ColPali team - thanks a lot for your work & code. When I fine-tune the ColQwen model with load_best_model_at_end=True, it throws the error FileNotFoundError: [Errno 2] No such file or directory: 'checkpoints-gpu0/checkpoint-1/pytorch_model.bin'

(What really frustrates me is that my code did work a few weeks ago with older {colpali/transformers/etc.} versions, but I can no longer locate that venv.)

===

My current code is:

# Reproduction script: fine-tunes a 4-bit-quantized ColQwen2 model with
# ContrastiveTrainer, saving/evaluating every step and asking the trainer to
# reload the best checkpoint when training ends.
import os

# The environment variable is assigned before `torch` is imported below.
# NOTE(review): presumably so that only this GPU is visible to the process —
# confirm that the ordering is intentional.
GPU_IDX = "0"
os.environ["CUDA_VISIBLE_DEVICES"] = GPU_IDX

from pathlib import Path
import torch
from colpali_engine.collators.visual_retriever_collator import VisualRetrieverCollator
from colpali_engine.loss import ColbertPairwiseCELoss

from colpali_engine.models import ColQwen2, ColQwen2Processor

from colpali_engine.trainer.contrastive_trainer import ContrastiveTrainer
from colpali_engine.utils.torch_utils import get_torch_device, tear_down_torch
from datasets import load_dataset
from torch import nn
from transformers import BitsAndBytesConfig, TrainingArguments
# NOTE(review): `tear_down_torch`, `load_dataset` and `nn` are imported but
# never referenced in this snippet.

# Directory named after the GPU index; it is created here but not referenced
# again anywhere in this snippet.
best_model_dir = Path(f"colqwen2-ft-gpu{GPU_IDX}")
best_model_dir.mkdir(exist_ok=True, parents=True)

device = get_torch_device("auto")

# 4-bit quantization settings: "nf4" quant type with bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )


model_name = "vidore/colqwen2-v1.0"

# Load the pretrained model with the quantization config above, placed on the
# device resolved by get_torch_device.
model = ColQwen2.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,
        device_map=device,
    )


# Mark every parameter whose name contains "lora" as trainable; all other
# parameters keep whatever requires_grad value they were loaded with.
for name, param in model.named_parameters():
    if "lora" in name:
        param.requires_grad = True
        


processor = ColQwen2Processor.from_pretrained(model_name)
collator = VisualRetrieverCollator(processor=processor)

from datasets import load_from_disk

# Load the local dataset and split it 80/20 with a fixed seed; the test split
# is then cut down to its first 5 examples and the train split is shuffled.
ds = load_from_disk('../colpali_fine_tuning_dataset').train_test_split(test_size=0.2, seed=42)
ds["test"] = ds["test"].select(range(5))
ds["train"] = ds["train"].shuffle(seed=42)

# Checkpoint output directory; this is the directory that appears in the
# FileNotFoundError path quoted in the report.
checkpoints_dir = Path(f"checkpoints-gpu{GPU_IDX}")
checkpoints_dir.mkdir(exist_ok=True, parents=True)

# Save, evaluate and log every step, with a fractional epoch count
# (num_train_epochs=0.01).
training_args = TrainingArguments(
    output_dir=str(checkpoints_dir),
    overwrite_output_dir=True,
    num_train_epochs=0.01,
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,
    gradient_accumulation_steps=4,
    gradient_checkpointing=False,
    eval_strategy="steps",
    save_steps=1,
    logging_steps=1,
    eval_steps=1,
    warmup_steps=1,
    learning_rate=5e-5,
    save_total_limit=1,
    # This is the option the report associates with the failure: the traceback
    # ends in Trainer._load_best_model trying to open pytorch_model.bin.
    # NOTE(review): verify which weight files the checkpoint directory
    # actually contains — TODO confirm.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)


trainer = ContrastiveTrainer(
    model=model,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    args=training_args,
    data_collator=collator,
    loss_func=ColbertPairwiseCELoss(),
    is_vision_model=True,
)

# Overridden on the trainer's args after construction rather than passed to
# TrainingArguments above.
trainer.args.remove_unused_columns = False
# The reported FileNotFoundError is raised from inside this call.
train_results = trainer.train()

And it throws the following error:

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[1], line 96
     85 trainer = ContrastiveTrainer(
     86     model=model,
     87     train_dataset=ds["train"],
   (...)     92     is_vision_model=True,
     93 )
     95 trainer.args.remove_unused_columns = False
---> 96 train_results = trainer.train()

File /venv/uv-colpali/lib/python3.11/site-packages/transformers/trainer.py:2164, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   2162         hf_hub_utils.enable_progress_bars()
   2163 else:
-> 2164     return inner_training_loop(
   2165         args=args,
   2166         resume_from_checkpoint=resume_from_checkpoint,
   2167         trial=trial,
   2168         ignore_keys_for_eval=ignore_keys_for_eval,
   2169     )

File /venv/uv-colpali/lib/python3.11/site-packages/transformers/trainer.py:2646, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
   2643     elif is_sagemaker_mp_enabled():
   2644         smp.barrier()
-> 2646     self._load_best_model()
   2648 # add remaining tr_loss
   2649 self._total_loss_scalar += tr_loss.item()

File /venv/uv-colpali/lib/python3.11/site-packages/transformers/trainer.py:2963, in Trainer._load_best_model(self)
   2961     state_dict = safetensors.torch.load_file(best_safe_model_path, device="cpu")
   2962 else:
-> 2963     state_dict = torch.load(
   2964         best_model_path,
   2965         map_location="cpu",
   2966         **weights_only_kwarg,
   2967     )
   2969 # If the model is on the GPU, it still works!
   2970 # workaround for FSDP bug https://github.com/pytorch/pytorch/issues/82963
   2971 # which takes *args instead of **kwargs
   2972 load_result = model.load_state_dict(state_dict, False)

File /venv/uv-colpali/lib/python3.11/site-packages/torch/serialization.py:1425, in load(f, map_location, pickle_module, weights_only, mmap, **pickle_load_args)
   1422 if "encoding" not in pickle_load_args.keys():
   1423     pickle_load_args["encoding"] = "utf-8"
-> 1425 with _open_file_like(f, "rb") as opened_file:
   1426     if _is_zipfile(opened_file):
   1427         # The zipfile reader is going to advance the current file position.
   1428         # If we want to actually tail call to torch.jit.load, we need to
   1429         # reset back to the original position.
   1430         orig_position = opened_file.tell()

File /venv/uv-colpali/lib/python3.11/site-packages/torch/serialization.py:751, in _open_file_like(name_or_buffer, mode)
    749 def _open_file_like(name_or_buffer, mode):
    750     if _is_path(name_or_buffer):
--> 751         return _open_file(name_or_buffer, mode)
    752     else:
    753         if "w" in mode:

File /venv/uv-colpali/lib/python3.11/site-packages/torch/serialization.py:732, in _open_file.__init__(self, name, mode)
    731 def __init__(self, name, mode):
--> 732     super().__init__(open(name, mode))

FileNotFoundError: [Errno 2] No such file or directory: 'checkpoints-gpu0/checkpoint-1/pytorch_model.bin'

Can you advise what's going wrong, and whether I am missing something?

Thank you.

@ManuelFay
Copy link
Collaborator

hmmm - are you saving checkpoints there ?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants