Source code for torch.distributed.elastic.timer.api
# Copyright (c) Facebook, Inc. and its affiliates.# All rights reserved.## This source code is licensed under the BSD-style license found in the# LICENSE file in the root directory of this source tree.importabcimportloggingimportthreadingimporttimefromcontextlibimportcontextmanagerfrominspectimportgetframeinfo,stackfromtypingimportAny,Dict,List,Optional,Set__all__=['TimerRequest','TimerClient','RequestQueue','TimerServer','configure','expires']log=logging.getLogger(__name__)
class TimerRequest:
    """
    Data object representing a countdown timer acquisition and release
    that is used between the ``TimerClient`` and ``TimerServer``.
    A negative ``expiration_time`` should be interpreted as a "release"
    request.

    .. note:: the type of ``worker_id`` is implementation specific.
              It is whatever the TimerServer and TimerClient implementations
              use to uniquely identify a worker.

    :param worker_id: implementation-specific identifier of the worker
    :param scope_id: identifier of the timed scope on that worker
    :param expiration_time: time at which the timer expires; a negative
                            value denotes a release request
    """

    __slots__ = ["worker_id", "scope_id", "expiration_time"]

    def __init__(self, worker_id: Any, scope_id: str, expiration_time: float):
        self.worker_id = worker_id
        self.scope_id = scope_id
        self.expiration_time = expiration_time

    def __eq__(self, other):
        """
        Two requests are equal when all three fields are equal.

        Returns ``NotImplemented`` for foreign types so that Python can try
        the reflected comparison on ``other`` before falling back to
        ``False``.
        """
        if not isinstance(other, TimerRequest):
            return NotImplemented
        return (
            self.worker_id == other.worker_id
            and self.scope_id == other.scope_id
            and self.expiration_time == other.expiration_time
        )

    # Instances are mutable and compare by value, so they must not be hashable.
    # Defining ``__eq__`` already implies this; spelling it out documents intent.
    __hash__ = None  # type: ignore[assignment]

    def __repr__(self) -> str:
        """Readable representation, useful when requests show up in logs."""
        return (
            f"{type(self).__name__}("
            f"worker_id={self.worker_id!r}, "
            f"scope_id={self.scope_id!r}, "
            f"expiration_time={self.expiration_time!r})"
        )
class TimerClient(abc.ABC):
    """
    Client-side interface for countdown timers: implementations acquire
    and release timers by communicating with the TimerServer.
    """

    @abc.abstractmethod
    def acquire(self, scope_id: str, expiration_time: float) -> None:
        """
        Acquire a timer, identified by ``scope_id`` and expiring at
        ``expiration_time``, on behalf of the worker that owns this client.
        Implementations typically register the timer with the TimerServer.
        """

    @abc.abstractmethod
    def release(self, scope_id: str):
        """
        Release the timer held for ``scope_id`` on the worker this client
        represents. Once this returns, the countdown timer on that scope
        is no longer in effect.
        """
class RequestQueue(abc.ABC):
    """
    Consumer-side queue that holds timer acquisition/release requests.
    """

    @abc.abstractmethod
    def size(self) -> int:
        """
        Return the number of queued requests at the moment of the call.

        The queue may grow between this call and a subsequent ``get``, but
        it should not shrink until ``get`` is called. In other words, the
        following assertions should hold::

            size = q.size()
            res = q.get(size, timeout=0)
            assert size == len(res)

        -- or --

        ::

            size = q.size()
            res = q.get(size * 2, timeout=1)
            assert size <= len(res) <= size * 2
        """

    @abc.abstractmethod
    def get(self, size: int, timeout: float) -> List[TimerRequest]:
        """
        Fetch up to ``size`` timer requests, blocking for no more than
        ``timeout`` seconds.
        """
class TimerServer(abc.ABC):
    """
    Watches the active timers and expires them in a timely fashion.
    The server is the party responsible for reaping workers whose
    timers have expired.
    """

    def __init__(
        self, request_queue: RequestQueue, max_interval: float, daemon: bool = True
    ):
        """
        :param request_queue: Consumer ``RequestQueue``
        :param max_interval: max time (in seconds) to wait
                             for an item in the request_queue
        :param daemon: whether to run the watchdog thread as a daemon
        """
        super().__init__()
        self._request_queue = request_queue
        self._max_interval = max_interval
        self._daemon = daemon
        # Created by start(), cleared again by stop().
        self._watchdog_thread: Optional[threading.Thread] = None
        # Polled by the watchdog loop; set by stop().
        self._stop_signaled = False

    @abc.abstractmethod
    def register_timers(self, timer_requests: List[TimerRequest]) -> None:
        """
        Process the incoming timer requests and register them with the
        server. Each request is either an acquire-timer or a release-timer
        request; a negative ``expiration_time`` marks a release-timer
        request.
        """

    @abc.abstractmethod
    def clear_timers(self, worker_ids: Set[Any]) -> None:
        """
        Clear every timer belonging to the given ``worker_ids``.
        """

    @abc.abstractmethod
    def get_expired_timers(self, deadline: float) -> Dict[str, List[TimerRequest]]:
        """
        Return the expired timers grouped by worker_id. A timer counts as
        expired when its expiration_time is less than or equal to the
        provided deadline.
        """

    @abc.abstractmethod
    def _reap_worker(self, worker_id: Any) -> bool:
        """
        Reap the given worker. Return True when the worker was reaped
        successfully and False otherwise. If an uncaught exception escapes
        this method, the worker is considered reaped and all of its timers
        will be removed.
        """

    def _reap_worker_no_throw(self, worker_id: Any) -> bool:
        """
        Exception-safe wrapper around ``_reap_worker(worker_id)``: an
        uncaught exception is logged and the worker is treated as reaped.
        """
        try:
            reaped = self._reap_worker(worker_id)
        except Exception as e:
            log.error(
                "Uncaught exception thrown from _reap_worker(), "
                "check that the implementation correctly catches exceptions",
                exc_info=e,
            )
            return True
        return reaped

    def _watchdog_loop(self):
        """
        Body of the watchdog thread: run watchdog passes until a stop has
        been signaled, logging (not propagating) any failure of a pass.
        """
        while True:
            if self._stop_signaled:
                break
            try:
                self._run_watchdog()
            except Exception as e:
                log.error("Error running watchdog", exc_info=e)

    def _run_watchdog(self):
        """
        One watchdog pass: drain pending requests from the queue, register
        them, reap every worker that has expired timers, then clear the
        timers of the workers that were reaped.
        """
        queue = self._request_queue
        # Always ask for at least one request so that ``get`` is what
        # blocks (up to ``max_interval``) when the queue is empty.
        batch_size = max(1, queue.size())
        self.register_timers(queue.get(batch_size, self._max_interval))

        deadline = time.time()
        reaped = set()
        for worker_id, timers in self.get_expired_timers(deadline).items():
            scopes = self._get_scopes(timers)
            log.info(
                "Reaping worker_id=[%s]. Expired timers: %s", worker_id, scopes
            )
            if not self._reap_worker_no_throw(worker_id):
                log.error(
                    "Error reaping worker=[%s]. Will retry on next watchdog.",
                    worker_id,
                )
                continue
            log.info("Successfully reaped worker=[%s]", worker_id)
            reaped.add(worker_id)
        self.clear_timers(reaped)

    def _get_scopes(self, timer_requests):
        """Return the ``scope_id`` of each given timer request, in order."""
        scope_ids = []
        for request in timer_requests:
            scope_ids.append(request.scope_id)
        return scope_ids

    def start(self) -> None:
        """Create and start the watchdog thread."""
        log.info(
            "Starting %s... max_interval=%s, daemon=%s",
            type(self).__name__,
            self._max_interval,
            self._daemon,
        )
        watchdog = threading.Thread(target=self._watchdog_loop, daemon=self._daemon)
        self._watchdog_thread = watchdog
        log.info("Starting watchdog thread...")
        self._watchdog_thread.start()

    def stop(self) -> None:
        """
        Signal the watchdog loop to stop and wait up to ``max_interval``
        seconds for the watchdog thread, if there is one, to finish.
        """
        log.info("Stopping %s", type(self).__name__)
        self._stop_signaled = True
        if not self._watchdog_thread:
            log.info("No watchdog thread running, doing nothing")
            return
        log.info("Stopping watchdog thread...")
        self._watchdog_thread.join(self._max_interval)
        self._watchdog_thread = None
# Module-wide default timer client. It starts out unset (``None``), is assigned
# by ``configure()``, and is what ``expires()`` falls back to when the caller
# does not pass an explicit ``client``.
_timer_client = None
def configure(timer_client: TimerClient):
    """
    Install ``timer_client`` as the module-wide timer client.
    This must be called before ``expires`` is used without an explicit
    ``client`` argument.
    """
    global _timer_client

    client_type_name = type(timer_client).__name__
    _timer_client = timer_client
    log.info("Timer client configured to: %s", client_type_name)
@contextmanager
def expires(
    after: float, scope: Optional[str] = None, client: Optional[TimerClient] = None
):
    """
    Acquires a countdown timer that expires in ``after`` seconds from now,
    unless the code-block that it wraps is finished within the timeframe.
    When the timer expires, this worker is eligible to be reaped. The
    exact meaning of "reaped" depends on the client implementation. In
    most cases, reaping means to terminate the worker process.
    Note that the worker is NOT guaranteed to be reaped at exactly
    ``time.time() + after``, but rather the worker is "eligible" for being
    reaped and the ``TimerServer`` that the client talks to will ultimately
    make the decision when and how to reap the workers with expired timers.

    :param after: number of seconds from now until the timer expires
    :param scope: identifier of the timed scope; defaults to
                  ``"<caller filename>#<caller lineno>"``
    :param client: timer client to use; defaults to the client installed
                   with ``configure``
    :raises RuntimeError: if no ``client`` is given and none was configured

    Usage::

        torch.distributed.elastic.timer.configure(LocalTimerClient())
        with expires(after=10):
            torch.distributed.all_reduce(...)
    """
    if client is None:
        if _timer_client is None:
            raise RuntimeError("Configure timer client before using countdown timers.")
        client = _timer_client
    if scope is None:
        # grab the caller file + lineno
        # This generator body is executed from inside contextlib (by
        # ``__enter__`` or ``ExitStack.enter_context``), not directly by the
        # user's ``with`` statement, so the immediate parent frame belongs to
        # contextlib.py. Walk outwards past those frames; otherwise every
        # default scope would be the same "contextlib.py#N" string.
        contextlib_file = contextmanager.__code__.co_filename
        frame = stack()[1][0]
        while frame.f_back is not None and frame.f_code.co_filename == contextlib_file:
            frame = frame.f_back
        caller = getframeinfo(frame)
        scope = f"{caller.filename}#{caller.lineno}"
        # Drop frame references promptly to avoid reference cycles.
        del frame, caller
    expiration = time.time() + after
    client.acquire(scope, expiration)
    try:
        yield
    finally:
        # Always release, even when the wrapped block raises.
        client.release(scope)
Docs
Access comprehensive developer documentation for PyTorch
To analyze traffic and optimize your experience, we serve cookies on this site. By clicking or navigating, you agree to allow our usage of cookies. As the current maintainers of this site, Facebook’s Cookies Policy applies. Learn more, including about available controls: Cookies Policy.