Source code for torch.distributed.elastic.events.api
#!/usr/bin/env python3
# mypy: allow-untyped-defs

# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import json
from dataclasses import asdict, dataclass, field
from enum import Enum
from typing import Dict, Optional, Union


__all__ = ["EventSource", "Event", "NodeState", "RdzvEvent"]

# Type of a single value stored in ``Event.metadata``: a plain scalar
# (str, int, float, bool) or None.
EventMetadataValue = Union[str, int, float, bool, None]
class EventSource(str, Enum):
    """Known identifiers of the event producers.

    Members are also ``str`` instances, so each one compares equal to its
    string value (e.g. ``EventSource.AGENT == "AGENT"``).
    """

    AGENT = "AGENT"
    WORKER = "WORKER"
@dataclass
class Event:
    """
    The class represents the generic event that occurs during the torchelastic job execution.

    The event can be any kind of meaningful action.

    Args:
        name: event name.
        source: the event producer, e.g. agent or worker
        timestamp: timestamp in milliseconds when event occurred.
        metadata: additional data that is associated with the event.
    """

    name: str
    source: EventSource
    timestamp: int = 0
    metadata: Dict[str, EventMetadataValue] = field(default_factory=dict)

    def __str__(self):
        """Return the JSON form produced by :meth:`serialize`."""
        return self.serialize()

    @staticmethod
    def deserialize(data: Union[str, "Event"]) -> "Event":
        """Build an ``Event`` from its JSON string form.

        Args:
            data: either an ``Event`` (returned unchanged) or a JSON object
                string whose keys are the dataclass field names.

        Returns:
            The ``Event`` instance.

        Raises:
            TypeError: if ``data`` is neither an ``Event`` nor a ``str``.
            KeyError: if the ``"source"`` entry is missing or is not the
                name of an ``EventSource`` member.
        """
        if isinstance(data, Event):
            return data
        # Reject unsupported inputs explicitly instead of falling through to
        # an unbound ``data_dict`` below.
        if not isinstance(data, str):
            raise TypeError(
                f"Event.deserialize expects a str or Event, got {type(data).__name__}"
            )
        data_dict = json.loads(data)
        # The enum is looked up by member name.
        data_dict["source"] = EventSource[data_dict["source"]]
        return Event(**data_dict)

    def serialize(self) -> str:
        """Return this event as a JSON object string of its fields."""
        return json.dumps(asdict(self))
class NodeState(str, Enum):
    """The states that a node can be in rendezvous."""

    INIT = "INIT"
    RUNNING = "RUNNING"
    SUCCEEDED = "SUCCEEDED"
    FAILED = "FAILED"


@dataclass
class RdzvEvent:
    """
    Dataclass to represent any rendezvous event.

    Args:
        name: Event name. (E.g. Current action being performed)
        run_id: The run id of the rendezvous
        message: The message describing the event
        hostname: Hostname of the node
        pid: The process id of the node
        node_state: The state of the node (INIT, RUNNING, SUCCEEDED, FAILED)
        master_endpoint: The master endpoint for the rendezvous store, if known
        rank: The rank of the node, if known
        local_id: The local_id of the node, if defined in dynamic_rendezvous.py
        error_trace: Error stack trace, if this is an error event.
    """

    name: str
    run_id: str
    message: str
    hostname: str
    pid: int
    node_state: NodeState
    master_endpoint: str = ""
    rank: Optional[int] = None
    local_id: Optional[int] = None
    error_trace: str = ""

    def __str__(self):
        """Return the JSON form produced by :meth:`serialize`."""
        return self.serialize()

    @staticmethod
    def deserialize(data: Union[str, "RdzvEvent"]) -> "RdzvEvent":
        """Build an ``RdzvEvent`` from its JSON string form.

        Args:
            data: either an ``RdzvEvent`` (returned unchanged) or a JSON
                object string whose keys are the dataclass field names.

        Returns:
            The ``RdzvEvent`` instance.

        Raises:
            TypeError: if ``data`` is neither an ``RdzvEvent`` nor a ``str``.
            KeyError: if the ``"node_state"`` entry is missing or is not the
                name of a ``NodeState`` member.
        """
        if isinstance(data, RdzvEvent):
            return data
        # Reject unsupported inputs explicitly instead of falling through to
        # an unbound ``data_dict`` below.
        if not isinstance(data, str):
            raise TypeError(
                f"RdzvEvent.deserialize expects a str or RdzvEvent, got {type(data).__name__}"
            )
        data_dict = json.loads(data)
        # The enum is looked up by member name.
        data_dict["node_state"] = NodeState[data_dict["node_state"]]
        return RdzvEvent(**data_dict)

    def serialize(self) -> str:
        """Return this event as a JSON object string of its fields."""
        return json.dumps(asdict(self))
Docs
Access comprehensive developer documentation for PyTorch
To analyze traffic and optimize your experience, we serve cookies on this site. By clicking or navigating, you agree to allow our usage of cookies. As the current maintainers of this site, Facebook’s Cookies Policy applies. Learn more, including about available controls: Cookies Policy.