
Source code for ignite.handlers.checkpoint

import collections.abc as collections
import numbers
import os
import tempfile
import warnings
from abc import ABCMeta, abstractmethod
from collections import namedtuple
from typing import Callable, Mapping, Optional, Union

import torch
import torch.nn as nn

import ignite.distributed as idist
from ignite.engine import Engine, Events

__all__ = ["Checkpoint", "DiskSaver", "ModelCheckpoint", "BaseSaveHandler"]


class BaseSaveHandler(metaclass=ABCMeta):
    """Base class for save handlers.

    Methods to override:

    - :meth:`~ignite.handlers.checkpoint.BaseSaveHandler.__call__`
    - :meth:`~ignite.handlers.checkpoint.BaseSaveHandler.remove`

    Note:
        In a derived class, please make sure that in a distributed configuration the overridden methods are called
        by a single process only. A distributed configuration on XLA devices should be treated slightly differently:
        to save a checkpoint with
        `xm.save() <https://pytorch.org/xla/release/1.5/index.html#torch_xla.core.xla_model.save>`_,
        all processes should pass into the function. Otherwise, the application gets stuck.
    """
    @abstractmethod
    def __call__(self, checkpoint: Mapping, filename: str, metadata: Optional[Mapping] = None) -> None:
        """Method to save `checkpoint` with `filename`. Additionally, a metadata dictionary is provided.

        Metadata contains:

        - `basename`: file prefix (if provided) with checkpoint name, e.g. `epoch_checkpoint`.
        - `score_name`: score name if provided, e.g. `val_acc`.
        - `priority`: checkpoint priority value (higher is better), e.g. `12` or `0.6554435`.

        Args:
            checkpoint (Mapping): checkpoint dictionary to save.
            filename (str): filename associated with checkpoint.
            metadata (Mapping, optional): metadata on checkpoint to save.
        """
        pass
    @abstractmethod
    def remove(self, filename: str) -> None:
        """Method to remove a saved checkpoint.

        Args:
            filename (str): filename associated with checkpoint.
        """
        pass
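
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the module): a minimal custom save handler
# that keeps checkpoints in memory instead of on disk. It relies only on the
# abstract API defined above (`__call__` and `remove`); the class name
# `InMemorySaver` is hypothetical.
class InMemorySaver(BaseSaveHandler):
    """Toy save handler storing checkpoints in a dict keyed by filename."""

    def __init__(self):
        self.storage = {}

    def __call__(self, checkpoint: Mapping, filename: str, metadata: Optional[Mapping] = None) -> None:
        # Keep a shallow copy of the checkpoint dict under its filename.
        self.storage[filename] = dict(checkpoint)

    def remove(self, filename: str) -> None:
        # Called by `Checkpoint` when more than `n_saved` checkpoints exist.
        del self.storage[filename]
# ---------------------------------------------------------------------------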
class Checkpoint:
    """Checkpoint handler can be used to periodically save and load objects which have attribute
    `state_dict`/`load_state_dict`. This class can use specific save handlers to store on a disk, a cloud
    storage, etc. The Checkpoint handler (if used with :class:`~ignite.handlers.DiskSaver`) also handles
    automatically moving data on TPU to CPU before writing the checkpoint.

    Args:
        to_save (Mapping): Dictionary with the objects to save. Objects should have implemented `state_dict` and
            `load_state_dict` methods. If it contains objects of type torch `DistributedDataParallel`_ or
            `DataParallel`_, their internal wrapped model is automatically saved (to avoid the additional key
            ``module.`` in the state dictionary).
        save_handler (callable or :class:`~ignite.handlers.checkpoint.BaseSaveHandler`): Method or callable class to
            use to save engine and other provided objects. Function receives two objects: checkpoint as a dictionary
            and filename. If `save_handler` is a callable class, it can inherit from
            :class:`~ignite.handlers.checkpoint.BaseSaveHandler` and optionally implement a `remove` method to keep
            a fixed number of saved checkpoints. If the engine's checkpoint needs to be saved on a disk,
            `save_handler` can be defined with :class:`~ignite.handlers.DiskSaver`.
        filename_prefix (str, optional): Prefix for the filename to which objects will be saved. See Note for
            details.
        score_function (callable, optional): If not None, it should be a function taking a single argument,
            an :class:`~ignite.engine.Engine` object, and returning a score (`float`). Objects with highest scores
            will be retained.
        score_name (str, optional): If `score_function` is not None, it is possible to store its value using
            `score_name`. See Note for more details.
        n_saved (int, optional): Number of objects that should be kept on disk. Older files will be removed. If set
            to `None`, all objects are kept.
        global_step_transform (callable, optional): global step transform function to output a desired global step.
            Input of the function is `(engine, event_name)`. Output of function should be an integer.
            Default is None, global_step based on attached engine. If provided, uses function output as global_step.
            To setup global step from another engine, please use :meth:`~ignite.handlers.global_step_from_engine`.
        archived (bool, optional): Deprecated argument as models saved by `torch.save` are already compressed.

    .. _DistributedDataParallel: https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel
    .. _DataParallel: https://pytorch.org/docs/stable/nn.html#torch.nn.DataParallel

    Note:
        This class stores a single file as a dictionary of provided objects to save.
        The filename has the following structure: `{filename_prefix}_{name}_{suffix}.{ext}` where

        - `filename_prefix` is the argument passed to the constructor,
        - `name` is the key in `to_save` if a single object is to be stored, otherwise `name` is "checkpoint",
        - `suffix` is composed as `{global_step}_{score_name}={score}`, where `global_step` is defined by the output
          of `global_step_transform` and `score` is defined by the output of `score_function`.

        By default, when none of `score_function`, `score_name`, `global_step_transform` is defined, the suffix is
        set up by the attached engine's current iteration. The filename will be
        `{filename_prefix}_{name}_{engine.state.iteration}.{ext}`.

        If a `score_function` is defined, but no `score_name`, then the suffix is defined by the provided score.
        The filename will be `{filename_prefix}_{name}_{global_step}_{score}.pt`.

        If both `score_function` and `score_name` are defined, then the filename will be
        `{filename_prefix}_{name}_{score_name}={score}.{ext}`. If `global_step_transform` is also provided, then
        the filename will be `{filename_prefix}_{name}_{global_step}_{score_name}={score}.{ext}`.

        For example, with `score_name="neg_val_loss"` and a `score_function` that returns `-loss` (as objects with
        highest scores will be retained), the saved filename will be
        `{filename_prefix}_{name}_neg_val_loss=-0.1234.pt`.

        To get the last stored filename, the handler exposes the attribute `last_checkpoint`:

        .. code-block:: python

            handler = Checkpoint(...)
            ...
            print(handler.last_checkpoint)
            > checkpoint_12345.pt

    Note:
        This class is distributed configuration-friendly: it is not required to instantiate the class in the rank 0
        process only. This class supports distributed configurations automatically and, if used with
        :class:`~ignite.handlers.DiskSaver`, the checkpoint is stored by the rank 0 process.

    .. warning::

        When running on TPUs, it should be run in all processes, otherwise the application can get stuck while
        saving the checkpoint.

        .. code-block:: python

            # Wrong:
            # if idist.get_rank() == 0:
            #     handler = Checkpoint(...)
            #     trainer.add_event_handler(Events.ITERATION_COMPLETED(every=1000), handler)

            # Correct:
            handler = Checkpoint(...)
            trainer.add_event_handler(Events.ITERATION_COMPLETED(every=1000), handler)

    Examples:
        Attach the handler to make checkpoints during training:

        .. code-block:: python

            from ignite.engine import Engine, Events
            from ignite.handlers import Checkpoint, DiskSaver

            trainer = ...
            model = ...
            optimizer = ...
            lr_scheduler = ...

            to_save = {'model': model, 'optimizer': optimizer, 'lr_scheduler': lr_scheduler, 'trainer': trainer}
            handler = Checkpoint(to_save, DiskSaver('/tmp/models', create_dir=True), n_saved=2)
            trainer.add_event_handler(Events.ITERATION_COMPLETED(every=1000), handler)

            trainer.run(data_loader, max_epochs=6)
            > ["checkpoint_7000.pt", "checkpoint_8000.pt", ]

        Attach the handler to an evaluator to save the best model during training according to a computed
        validation metric:

        .. code-block:: python

            from ignite.engine import Engine, Events
            from ignite.handlers import Checkpoint, DiskSaver, global_step_from_engine

            trainer = ...
            evaluator = ...
            # Setup Accuracy metric computation on evaluator
            # Run evaluation on epoch completed event
            # ...
            def score_function(engine):
                return engine.state.metrics['accuracy']

            to_save = {'model': model}
            handler = Checkpoint(to_save, DiskSaver('/tmp/models', create_dir=True), n_saved=2,
                                 filename_prefix='best', score_function=score_function, score_name="val_acc",
                                 global_step_transform=global_step_from_engine(trainer))

            evaluator.add_event_handler(Events.COMPLETED, handler)

            trainer.run(data_loader, max_epochs=10)
            > ["best_model_9_val_acc=0.77.pt", "best_model_10_val_acc=0.78.pt", ]
    """

    Item = namedtuple("Item", ["priority", "filename"])

    def __init__(
        self,
        to_save: Mapping,
        save_handler: Union[Callable, BaseSaveHandler],
        filename_prefix: str = "",
        score_function: Optional[Callable] = None,
        score_name: Optional[str] = None,
        n_saved: Optional[int] = 1,
        global_step_transform: Optional[Callable] = None,
        archived: bool = False,
    ):
        if to_save is not None:  # for compatibility with ModelCheckpoint
            if not isinstance(to_save, collections.Mapping):
                raise TypeError("Argument `to_save` should be a dictionary, but given {}".format(type(to_save)))

            if len(to_save) < 1:
                raise ValueError("No objects to checkpoint.")

            self._check_objects(to_save, "state_dict")

        if not (callable(save_handler) or isinstance(save_handler, BaseSaveHandler)):
            raise TypeError("Argument `save_handler` should be callable or inherit from BaseSaveHandler")

        if score_function is None and score_name is not None:
            raise ValueError("If `score_name` is provided, then `score_function` should be also provided.")

        if global_step_transform is not None and not callable(global_step_transform):
            raise TypeError(
                "global_step_transform should be a function, got {} instead.".format(type(global_step_transform))
            )

        if archived:
            warnings.warn("Argument archived is deprecated and will be removed in 0.5.0")

        self.to_save = to_save
        self._fname_prefix = filename_prefix + "_" if len(filename_prefix) > 0 else filename_prefix
        self.save_handler = save_handler
        self._score_function = score_function
        self._score_name = score_name
        self._n_saved = n_saved
        self._saved = []
        self._ext = ".pt"
        self.global_step_transform = global_step_transform

    @property
    def last_checkpoint(self) -> Optional[str]:
        if len(self._saved) < 1:
            return None
        return self._saved[-1].filename

    def _check_lt_n_saved(self, or_equal=False):
        if self._n_saved is None:
            return True
        return len(self._saved) < self._n_saved + int(or_equal)

    def __call__(self, engine: Engine) -> None:

        suffix = ""
        if self.global_step_transform is not None:
            global_step = self.global_step_transform(engine, engine.last_event_name)
            suffix = "{}".format(global_step)

        if self._score_function is not None:
            priority = self._score_function(engine)
            if not isinstance(priority, numbers.Number):
                raise ValueError("Output of score_function should be a number")
        else:
            priority = engine.state.get_event_attrib_value(Events.ITERATION_COMPLETED)

        if self._check_lt_n_saved() or self._saved[0].priority < priority:

            priority_str = (
                "{}".format(priority) if isinstance(priority, numbers.Integral) else "{:.4f}".format(priority)
            )

            if self._score_name is not None:
                if len(suffix) > 0:
                    suffix += "_"
                suffix = "{}{}={}".format(suffix, self._score_name, priority_str)
            elif self._score_function is not None:
                if len(suffix) > 0:
                    suffix += "_"
                suffix = "{}{}".format(suffix, priority_str)
            elif len(suffix) == 0:
                suffix = "{}".format(priority_str)

            checkpoint = self._setup_checkpoint()

            name = "checkpoint"
            if len(checkpoint) == 1:
                for k in checkpoint:
                    name = k
                checkpoint = checkpoint[name]

            filename = "{}{}_{}{}".format(self._fname_prefix, name, suffix, self._ext)

            # Do not store the same checkpoint twice
            if any(item.filename == filename for item in self._saved):
                return

            metadata = {
                "basename": "{}{}".format(self._fname_prefix, name),
                "score_name": self._score_name,
                "priority": priority,
            }

            try:
                self.save_handler(checkpoint, filename, metadata)
            except TypeError:
                # Fallback for plain callables that accept only (checkpoint, filename)
                self.save_handler(checkpoint, filename)

            self._saved.append(Checkpoint.Item(priority, filename))
            self._saved.sort(key=lambda item: item[0])

            if not self._check_lt_n_saved(or_equal=True):
                item = self._saved.pop(0)
                if isinstance(self.save_handler, BaseSaveHandler):
                    self.save_handler.remove(item.filename)

    def _setup_checkpoint(self) -> dict:
        checkpoint = {}
        for k, obj in self.to_save.items():
            if isinstance(obj, (nn.DataParallel, nn.parallel.DistributedDataParallel)):
                obj = obj.module
            checkpoint[k] = obj.state_dict()
        return checkpoint

    @staticmethod
    def _check_objects(objs: Mapping, attr: str) -> None:
        for k, obj in objs.items():
            if not hasattr(obj, attr):
                raise TypeError("Object {} should have `{}` method".format(type(obj), attr))
    @staticmethod
    def load_objects(to_load: Mapping, checkpoint: Mapping, **kwargs) -> None:
        """Helper method to apply `load_state_dict` on the objects from `to_load` using states from `checkpoint`.

        Examples:

        .. code-block:: python

            import torch
            from ignite.engine import Engine, Events
            from ignite.handlers import ModelCheckpoint, Checkpoint

            trainer = Engine(lambda engine, batch: None)

            handler = ModelCheckpoint('/tmp/models', 'myprefix', n_saved=None, create_dir=True)
            model = torch.nn.Linear(3, 3)
            optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
            to_save = {"weights": model, "optimizer": optimizer}

            trainer.add_event_handler(Events.EPOCH_COMPLETED(every=2), handler, to_save)
            trainer.run(torch.randn(10, 1), 5)

            to_load = to_save
            checkpoint_fp = "/tmp/models/myprefix_checkpoint_40.pt"
            checkpoint = torch.load(checkpoint_fp)
            Checkpoint.load_objects(to_load=to_load, checkpoint=checkpoint)

        Args:
            to_load (Mapping): a dictionary with objects, e.g. `{"model": model, "optimizer": optimizer, ...}`
            checkpoint (Mapping): a dictionary with state_dicts to load, e.g. `{"model": model_state_dict,
                "optimizer": opt_state_dict}`. If `to_load` contains a single key, then the checkpoint can directly
                contain the corresponding state_dict.
            **kwargs: Keyword arguments accepted for `nn.Module.load_state_dict()`. Passing `strict=False` enables
                the user to load part of the pretrained model (useful, for example, in Transfer Learning).
        """
        Checkpoint._check_objects(to_load, "load_state_dict")
        if not isinstance(checkpoint, collections.Mapping):
            raise TypeError("Argument checkpoint should be a dictionary, but given {}".format(type(checkpoint)))

        if len(kwargs) > 1 or any(k for k in kwargs.keys() if k not in ["strict"]):
            warnings.warn("kwargs contains keys other than strict and these will be ignored")

        is_state_dict_strict = kwargs.get("strict", True)
        if len(to_load) == 1:
            # single object and checkpoint is directly a state_dict
            key, obj = list(to_load.items())[0]
            if key not in checkpoint:
                obj.load_state_dict(checkpoint, strict=is_state_dict_strict)
                return

        # multiple objects to load
        for k, obj in to_load.items():
            if k not in checkpoint:
                raise ValueError("Object labeled by '{}' from `to_load` is not found in the checkpoint".format(k))
            if isinstance(obj, torch.nn.Module):
                obj.load_state_dict(checkpoint[k], strict=is_state_dict_strict)
            else:
                obj.load_state_dict(checkpoint[k])
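
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the module): using `Checkpoint` with a plain
# callable as `save_handler` (the two-argument form handled by the
# ``except TypeError`` fallback in `Checkpoint.__call__` above), then restoring
# the objects with `Checkpoint.load_objects`. The directory "/tmp/models", the
# dummy trainer and the function name are assumptions made for the example only.
def _example_checkpoint_with_callable_saver():
    os.makedirs("/tmp/models", exist_ok=True)

    trainer = Engine(lambda engine, batch: None)
    model = nn.Linear(3, 3)
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

    def save_handler(checkpoint, filename):
        # Plain callable: receives the checkpoint dict and the computed filename.
        torch.save(checkpoint, os.path.join("/tmp/models", filename))

    to_save = {"model": model, "optimizer": optimizer, "trainer": trainer}
    handler = Checkpoint(to_save, save_handler, n_saved=2)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, handler)
    trainer.run([0] * 10, max_epochs=2)

    # `last_checkpoint` holds only the filename here; `strict=False` is forwarded
    # to `nn.Module.load_state_dict` and allows partial loading (transfer learning).
    checkpoint = torch.load(os.path.join("/tmp/models", handler.last_checkpoint))
    Checkpoint.load_objects(to_load=to_save, checkpoint=checkpoint, strict=False)
# ---------------------------------------------------------------------------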
class DiskSaver(BaseSaveHandler):
    """Handler that saves input checkpoint on a disk.

    Args:
        dirname (str): Directory path where the checkpoint will be saved.
        atomic (bool, optional): if True, checkpoint is serialized to a temporary file, and then moved to final
            destination, so that files are guaranteed to not be damaged (for example if an exception occurs during
            saving).
        create_dir (bool, optional): if True, will create directory 'dirname' if it does not exist.
        require_empty (bool, optional): If True, will raise an exception if there are any files in the
            directory 'dirname'.
    """

    def __init__(
        self, dirname: str, atomic: bool = True, create_dir: bool = True, require_empty: bool = True,
    ):
        self.dirname = os.path.expanduser(dirname)
        self._atomic = atomic
        self._check_and_setup(dirname, create_dir, require_empty)

    @staticmethod
    @idist.one_rank_only()
    def _check_and_setup(dirname, create_dir, require_empty):
        if create_dir:
            if not os.path.exists(dirname):
                os.makedirs(dirname)
        # Ensure that dirname exists
        if not os.path.exists(dirname):
            raise ValueError("Directory path '{}' is not found".format(dirname))

        if require_empty:
            matched = [fname for fname in os.listdir(dirname) if fname.endswith(".pt")]
            if len(matched) > 0:
                raise ValueError(
                    "Files {} with extension '.pt' are already present "
                    "in the directory {}. If you want to use this "
                    "directory anyway, pass `require_empty=False`."
                    "".format(matched, dirname)
                )

    def __call__(self, checkpoint: Mapping, filename: str, metadata: Optional[Mapping] = None) -> None:
        path = os.path.join(self.dirname, filename)

        if idist.has_xla_support:
            self._save_xla(checkpoint, path)
        else:
            self._save_native(checkpoint, path)

    @idist.one_rank_only()
    def _save_native(self, checkpoint: Mapping, path: str):
        self._save_func(checkpoint, path, torch.save)

    def _save_xla(self, checkpoint: Mapping, path: str):
        import torch_xla.core.xla_model as xm

        # all TPU procs should enter here as xm.save internally performs a sync across devices
        self._save_func(checkpoint, path, xm.save, rank=idist.get_rank())

    def _save_func(self, checkpoint: Mapping, path: str, func: Callable, rank: int = 0):
        if not self._atomic:
            func(checkpoint, path)
        else:
            tmp_file = None
            tmp_name = None
            tmp = None
            if rank == 0:
                tmp = tempfile.NamedTemporaryFile(delete=False, dir=self.dirname)
                tmp_file = tmp.file
                tmp_name = tmp.name
            try:
                func(checkpoint, tmp_file)
            except BaseException:
                if tmp is not None:
                    tmp.close()
                    os.remove(tmp_name)
                raise
            else:
                if tmp is not None:
                    tmp.close()
                    os.rename(tmp.name, path)

    @idist.one_rank_only()
    def remove(self, filename: str) -> None:
        path = os.path.join(self.dirname, filename)
        os.remove(path)
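
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the module): `DiskSaver` implements the
# `BaseSaveHandler` protocol and can also be driven manually, outside of a
# `Checkpoint` handler. The directory "/tmp/checkpoints", the filename and the
# function name are assumptions made for the example only.
def _example_disk_saver():
    model = nn.Linear(3, 3)
    saver = DiskSaver("/tmp/checkpoints", create_dir=True, require_empty=False)

    # With `atomic=True` (default) the dict is first written to a temporary file
    # and then renamed, so a failure mid-save never leaves a corrupted file.
    saver({"model": model.state_dict()}, "manual_checkpoint.pt")

    # Removal goes through the same handler, which is how `Checkpoint` prunes
    # files beyond `n_saved`.
    saver.remove("manual_checkpoint.pt")
# ---------------------------------------------------------------------------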
class ModelCheckpoint(Checkpoint):
    """ModelCheckpoint handler can be used to periodically save objects to disk only. If checkpoints should be
    stored to another storage type, please consider :class:`~ignite.handlers.checkpoint.Checkpoint`.

    This handler expects two arguments:

    - an :class:`~ignite.engine.Engine` object
    - a `dict` mapping names (`str`) to objects that should be saved to disk.

    See Examples for further details.

    .. warning::

        Behaviour of this class has been changed since v0.3.0.

        Argument `save_as_state_dict` is deprecated and should not be used. It is considered as True.

        Argument `save_interval` is deprecated and should not be used. Please, use events filtering instead, e.g.
        :attr:`~ignite.engine.Events.ITERATION_STARTED(every=1000)`

        There is no longer an internal counter used to indicate the number of save actions. Its value,
        `step_number`, was visible in the filename, e.g. `{filename_prefix}_{name}_{step_number}.pt`. Now
        `step_number` is replaced by the current engine's epoch if `score_function` is specified, and by the
        current iteration otherwise.

        A single `pt` file is created instead of multiple files.

    Args:
        dirname (str): Directory path where objects will be saved.
        filename_prefix (str): Prefix for the filenames to which objects will be saved. See Notes of
            :class:`~ignite.handlers.Checkpoint` for more details.
        score_function (callable, optional): if not None, it should be a function taking a single argument, an
            :class:`~ignite.engine.Engine` object, and returning a score (`float`). Objects with highest scores
            will be retained.
        score_name (str, optional): if `score_function` is not None, it is possible to store its value using
            `score_name`. See Notes for more details.
        n_saved (int, optional): Number of objects that should be kept on disk. Older files will be removed. If set
            to `None`, all objects are kept.
        atomic (bool, optional): If True, objects are serialized to a temporary file, and then moved to final
            destination, so that files are guaranteed to not be damaged (for example if an exception occurs during
            saving).
        require_empty (bool, optional): If True, will raise an exception if there are any files starting with
            `filename_prefix` in the directory 'dirname'.
        create_dir (bool, optional): If True, will create directory 'dirname' if it does not exist.
        global_step_transform (callable, optional): global step transform function to output a desired global step.
            Input of the function is `(engine, event_name)`. Output of function should be an integer.
            Default is None, global_step based on attached engine. If provided, uses function output as global_step.
            To setup global step from another engine, please use :meth:`~ignite.handlers.global_step_from_engine`.
        archived (bool, optional): Deprecated argument as models saved by `torch.save` are already compressed.

    Examples:
        >>> import os
        >>> from ignite.engine import Engine, Events
        >>> from ignite.handlers import ModelCheckpoint
        >>> from torch import nn
        >>> trainer = Engine(lambda engine, batch: None)
        >>> handler = ModelCheckpoint('/tmp/models', 'myprefix', n_saved=2, create_dir=True)
        >>> model = nn.Linear(3, 3)
        >>> trainer.add_event_handler(Events.EPOCH_COMPLETED(every=2), handler, {'mymodel': model})
        >>> trainer.run([0], max_epochs=6)
        >>> os.listdir('/tmp/models')
        ['myprefix_mymodel_4.pt', 'myprefix_mymodel_6.pt']
        >>> handler.last_checkpoint
        '/tmp/models/myprefix_mymodel_6.pt'
    """

    def __init__(
        self,
        dirname: str,
        filename_prefix: str,
        save_interval: Optional[int] = None,
        score_function: Optional[Callable] = None,
        score_name: Optional[str] = None,
        n_saved: Union[int, None] = 1,
        atomic: bool = True,
        require_empty: bool = True,
        create_dir: bool = True,
        save_as_state_dict: bool = True,
        global_step_transform: Optional[Callable] = None,
        archived: bool = False,
    ):

        if not save_as_state_dict:
            raise ValueError(
                "Argument save_as_state_dict is deprecated and should be True. "
                "This argument will be removed in 0.5.0."
            )
        if save_interval is not None:
            msg = (
                "Argument save_interval is deprecated and should be None. This argument will be removed in 0.5.0. "
                "Please, use events filtering instead, e.g. Events.ITERATION_STARTED(every=1000)"
            )
            if save_interval == 1:
                # Do not break for old versions that used `save_interval=1`
                warnings.warn(msg)
            else:
                # No choice
                raise ValueError(msg)

        disk_saver = DiskSaver(dirname, atomic=atomic, create_dir=create_dir, require_empty=require_empty,)

        super(ModelCheckpoint, self).__init__(
            to_save=None,
            save_handler=disk_saver,
            filename_prefix=filename_prefix,
            score_function=score_function,
            score_name=score_name,
            n_saved=n_saved,
            global_step_transform=global_step_transform,
            archived=archived,
        )

    @property
    def last_checkpoint(self) -> Union[str, None]:
        if len(self._saved) < 1:
            return None

        return os.path.join(self.save_handler.dirname, self._saved[-1].filename)

    def __call__(self, engine: Engine, to_save: Mapping) -> None:

        if len(to_save) == 0:
            raise RuntimeError("No objects to checkpoint found.")

        self._check_objects(to_save, "state_dict")
        self.to_save = to_save
        super(ModelCheckpoint, self).__call__(engine)
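
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the module): reloading the objects written by
# a `ModelCheckpoint` handler. Unlike `Checkpoint.last_checkpoint`, the
# `ModelCheckpoint.last_checkpoint` property returns the full path, so it can be
# passed to `torch.load` directly. Directory, prefix and function name are
# assumptions made for the example only.
def _example_reload_model_checkpoint():
    trainer = Engine(lambda engine, batch: None)
    handler = ModelCheckpoint("/tmp/models", "myprefix", n_saved=2, create_dir=True, require_empty=False)
    model = nn.Linear(3, 3)
    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=2), handler, {"mymodel": model})
    trainer.run([0], max_epochs=6)

    # `to_load` has a single key, so the loaded file is applied directly as the
    # state_dict of that single object (see `Checkpoint.load_objects` above).
    checkpoint = torch.load(handler.last_checkpoint)
    Checkpoint.load_objects(to_load={"mymodel": model}, checkpoint=checkpoint)
# ---------------------------------------------------------------------------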

