Source code for torch.distributed.elastic.rendezvous.api
# Copyright (c) Facebook, Inc. and its affiliates.# All rights reserved.## This source code is licensed under the BSD-style license found in the# LICENSE file in the root directory of this source tree.fromabcimportABC,abstractmethodfromtypingimportAny,Callable,Dict,Optional,Tuplefromtorch.distributedimportStore
class RendezvousError(Exception):
    """Common base class for the rendezvous error types."""
class RendezvousClosedError(RendezvousError):
    """Signals that the rendezvous has been closed."""
class RendezvousTimeoutError(RendezvousError):
    """Signals that a rendezvous failed to complete on time."""
class RendezvousConnectionError(RendezvousError):
    """Signals that the connection to a rendezvous backend has failed."""
class RendezvousStateError(RendezvousError):
    """Signals that the state of a rendezvous is corrupt."""
class RendezvousHandler(ABC):
    """Abstract interface that a rendezvous implementation must provide.

    Note:
        Distributed Torch users normally **do not** need to implement their
        own ``RendezvousHandler``. An implementation based on C10d Store is
        already provided, and is recommended for most users.
    """

    @abstractmethod
    def get_backend(self) -> str:
        """Return the name of the rendezvous backend."""

    @abstractmethod
    def next_rendezvous(self) -> Tuple[Store, int, int]:
        """Enter the rendezvous barrier; this is the main entry point.

        The call blocks until one of the following happens: the rendezvous
        completes with the current process included in the formed worker
        group, a timeout occurs, or the rendezvous is marked closed.

        Returns:
            A tuple of :py:class:`torch.distributed.Store`, ``rank``, and
            ``world size``.

        Raises:
            RendezvousClosedError:
                The rendezvous is closed.
            RendezvousConnectionError:
                The connection to the rendezvous backend has failed.
            RendezvousStateError:
                The rendezvous state is corrupt.
            RendezvousTimeoutError:
                The rendezvous did not complete on time.
        """

    @abstractmethod
    def is_closed(self) -> bool:
        """Report whether the rendezvous has been closed.

        Once a rendezvous is closed, every later attempt to re-rendezvous
        within the same job will fail.

        ``is_closed()`` and :py:meth:`set_closed` only guarantee eventual
        propagation, so they should not be used for synchronization. The
        intent is that when at least one node decides the job is finished it
        closes the rendezvous, and the other nodes soon observe this and stop
        running as well.
        """

    @abstractmethod
    def set_closed(self):
        """Mark the rendezvous as closed."""

    @abstractmethod
    def num_nodes_waiting(self) -> int:
        """Return how many nodes arrived late at the rendezvous barrier.

        Late nodes are the ones that were not included in the current worker
        group. Callers should poll this method periodically to find out
        whether new nodes are waiting to join the job and, if so, admit them
        by calling :py:meth:`next_rendezvous()` (re-rendezvous).
        """

    @abstractmethod
    def get_run_id(self) -> str:
        """Return the run id of the rendezvous.

        The run id is a user-defined id that uniquely identifies an instance
        of a distributed application. It typically maps to a job id and lets
        nodes join the correct distributed application.
        """

    @abstractmethod
    def shutdown(self) -> bool:
        """Close all resources that were opened for the rendezvous.

        NOTE(review): the interface does not state what the returned ``bool``
        signifies - presumably whether the shutdown succeeded; confirm
        against the concrete handlers.

        Example::

            rdzv_handler = ...
            try:
                store, rank, world_size = rdzv_handler.next_rendezvous()
            finally:
                rdzv_handler.shutdown()
        """
class RendezvousParameters:
    """Hold the parameters used to construct a :py:class:`RendezvousHandler`.

    Args:
        backend:
            The name of the backend to use to handle the rendezvous.
        endpoint:
            The endpoint of the rendezvous, usually in form
            <hostname>[:<port>].
        run_id:
            The id of the rendezvous.
        min_nodes:
            The minimum number of nodes to admit to the rendezvous.
        max_nodes:
            The maximum number of nodes to admit to the rendezvous.
        local_addr:
            The address of the local node.
        **kwargs:
            Additional parameters for the specified backend. They are stored
            as-is in ``config`` and read back through :py:meth:`get`,
            :py:meth:`get_as_bool` and :py:meth:`get_as_int`.

    Raises:
        ValueError:
            ``backend`` is empty, ``min_nodes`` is less than one, or
            ``max_nodes`` is less than ``min_nodes``.
    """

    def __init__(
        self,
        backend: str,
        endpoint: str,
        run_id: str,
        min_nodes: int,
        max_nodes: int,
        local_addr: Optional[str] = None,
        **kwargs,
    ):
        if not backend:
            raise ValueError("The rendezvous backend name must be a non-empty string.")

        if min_nodes < 1:
            raise ValueError(
                f"The minimum number of rendezvous nodes ({min_nodes}) must be greater than zero."
            )
        if max_nodes < min_nodes:
            raise ValueError(
                f"The maximum number of rendezvous nodes ({max_nodes}) must be greater than or "
                f"equal to the minimum number of rendezvous nodes ({min_nodes})."
            )

        self.backend = backend
        self.endpoint = endpoint
        self.run_id = run_id
        self.min_nodes = min_nodes
        self.max_nodes = max_nodes
        # Backend-specific options, kept exactly as the caller passed them.
        self.config = kwargs
        self.local_addr = local_addr

    def get(self, key: str, default: Any = None) -> Any:
        """Return the value for ``key`` if ``key`` exists, else ``default``."""
        return self.config.get(key, default)

    def get_as_bool(self, key: str, default: Optional[bool] = None) -> Optional[bool]:
        """Return the value for ``key`` as a ``bool``.

        ``None`` and ``bool`` values are returned unchanged. The integers
        ``1`` and ``0`` and, case-insensitively, the strings ``"1"``,
        ``"true"``, ``"t"``, ``"yes"``, ``"y"`` / ``"0"``, ``"false"``,
        ``"f"``, ``"no"``, ``"n"`` are converted.

        Raises:
            ValueError:
                The value cannot be interpreted as a boolean.
        """
        value = self.get(key, default)
        # ``bool`` must be tested before ``int`` because it is an ``int``
        # subclass.
        if value is None or isinstance(value, bool):
            return value
        if isinstance(value, int):
            if value == 1:
                return True
            if value == 0:
                return False
        elif isinstance(value, str):
            lowered = value.lower()
            if lowered in ("1", "true", "t", "yes", "y"):
                return True
            if lowered in ("0", "false", "f", "no", "n"):
                return False
        raise ValueError(
            f"The rendezvous configuration option '{key}' does not represent a valid boolean value."
        )

    def get_as_int(self, key: str, default: Optional[int] = None) -> Optional[int]:
        """Return the value for ``key`` as an ``int``.

        ``None`` (a missing key with no default) is returned unchanged; any
        other value is converted with ``int()``.

        Raises:
            ValueError:
                The value cannot be converted to an integer.
        """
        value = self.get(key, default)
        if value is None:
            return value
        try:
            return int(value)
        except (TypeError, ValueError) as e:
            # ``int()`` raises ``TypeError`` (not ``ValueError``) for values
            # such as lists or dicts; report those the same way as malformed
            # strings so callers only need to handle ``ValueError``, just as
            # with ``get_as_bool``.
            raise ValueError(
                f"The rendezvous configuration option '{key}' does not represent a valid integer "
                "value."
            ) from e
class RendezvousHandlerRegistry:
    """Registry that maps backend names to :py:class:`RendezvousHandler` creators."""

    _registry: Dict[str, RendezvousHandlerCreator]

    def __init__(self) -> None:
        self._registry = {}

    def register(self, backend: str, creator: RendezvousHandlerCreator) -> None:
        """Register a new rendezvous backend.

        Registering the same ``creator`` again under the same name is
        accepted; registering a different one is rejected.

        Args:
            backend:
                The name of the backend.
            creator:
                The callback to invoke to construct the
                :py:class:`RendezvousHandler`.

        Raises:
            ValueError:
                ``backend`` is empty, or it is already registered with a
                different creator.
        """
        if not backend:
            raise ValueError("The rendezvous backend name must be a non-empty string.")

        # ``None`` stands for "nothing registered under this name yet".
        existing = self._registry.get(backend)
        if existing is not None and existing != creator:
            raise ValueError(
                f"The rendezvous backend '{backend}' cannot be registered with '{creator}' as it "
                f"is already registered with '{existing}'."
            )

        self._registry[backend] = creator

    def create_handler(self, params: RendezvousParameters) -> RendezvousHandler:
        """Create a new :py:class:`RendezvousHandler`.

        Looks up the creator registered for ``params.backend`` and invokes it
        with ``params``.

        Raises:
            ValueError:
                No creator is registered for ``params.backend``.
            RuntimeError:
                The created handler reports a backend name other than the
                requested one.
        """
        try:
            make_handler = self._registry[params.backend]
        except KeyError as e:
            raise ValueError(
                f"The rendezvous backend '{params.backend}' is not registered. Did you forget "
                f"to call `{self.register.__name__}`?"
            ) from e

        new_handler = make_handler(params)

        # Sanity check: the handler must identify itself as the requested backend.
        if new_handler.get_backend() == params.backend:
            return new_handler

        raise RuntimeError(
            f"The rendezvous backend '{new_handler.get_backend()}' does not match the requested "
            f"backend '{params.backend}'."
        )
# Default, module-level registry instance; launcher scripts use it to
# instantiate rendezvous handlers.
rendezvous_handler_registry = RendezvousHandlerRegistry()
Docs
Access comprehensive developer documentation for PyTorch
To analyze traffic and optimize your experience, we serve cookies on this site. By clicking or navigating, you agree to allow our usage of cookies. As the current maintainers of this site, Facebook’s Cookies Policy applies. Learn more, including about available controls: Cookies Policy.