def is_available() -> bool:
    """
    Return ``True`` if the distributed package is available.

    Otherwise,
    ``torch.distributed`` does not expose any other APIs. Currently,
    ``torch.distributed`` is available on Linux, MacOS and Windows. Set
    ``USE_DISTRIBUTED=1`` to enable it when building PyTorch from source.
    Currently, the default value is ``USE_DISTRIBUTED=1`` for Linux and Windows,
    ``USE_DISTRIBUTED=0`` for MacOS.
    """
    return hasattr(torch._C, "_c10d_init")
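# Example (a minimal, hedged sketch): guard distributed-only code paths on
# ``is_available()`` before calling any other ``torch.distributed`` API. This
# assumes the process was launched by a tool such as ``torchrun`` that sets
# MASTER_ADDR/MASTER_PORT/RANK/WORLD_SIZE; the backend choice is illustrative.
#
#     import torch.distributed as dist
#
#     if dist.is_available():
#         dist.init_process_group(backend="gloo")
#         print(f"rank {dist.get_rank()} of {dist.get_world_size()}")
#         dist.destroy_process_group()
#     else:
#         print("torch.distributed is not available in this build")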
if is_available() and not torch._C._c10d_init():
    raise RuntimeError("Failed to initialize torch.distributed")

# Custom Runtime Errors thrown from the distributed package
DistError = torch._C._DistError
DistBackendError = torch._C._DistBackendError
DistNetworkError = torch._C._DistNetworkError
DistStoreError = torch._C._DistStoreError

if is_available():
    from torch._C._distributed_c10d import (
        Store,
        FileStore,
        TCPStore,
        ProcessGroup as ProcessGroup,
        Backend as _Backend,
        PrefixStore,
        Reducer,
        Logger,
        BuiltinCommHookType,
        GradBucket,
        Work as _Work,
        _DEFAULT_FIRST_BUCKET_BYTES,
        _register_comm_hook,
        _register_builtin_comm_hook,
        _broadcast_coalesced,
        _compute_bucket_assignment_by_size,
        _verify_params_across_processes,
        _test_python_store,
        DebugLevel,
        get_debug_level,
        set_debug_level,
        set_debug_level_from_env,
        _make_nccl_premul_sum,
        _ControlCollectives,
        _StoreCollectives,
    )

    class _DistributedPdb(pdb.Pdb):
        """
        Supports using PDB from inside a multiprocessing child process.

        Usage:
        _DistributedPdb().set_trace()
        """

        def interaction(self, *args, **kwargs):
            _stdin = sys.stdin
            try:
                sys.stdin = open('/dev/stdin')
                pdb.Pdb.interaction(self, *args, **kwargs)
            finally:
                sys.stdin = _stdin
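    # Example (a minimal, hedged sketch): the error classes above can be caught
    # to tell failure modes apart during initialization or collectives; the
    # backend and timeout are illustrative, not prescriptive.
    #
    #     from datetime import timedelta
    #     import torch.distributed as dist
    #
    #     try:
    #         dist.init_process_group(backend="gloo", timeout=timedelta(seconds=30))
    #     except dist.DistNetworkError:
    #         ...  # rendezvous/network failure (e.g. unreachable MASTER_ADDR)
    #     except dist.DistBackendError:
    #         ...  # backend-specific failure (e.g. Gloo/NCCL initialization)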
    def breakpoint(rank: int = 0):
        """
        Set a breakpoint, but only on a single rank.  All other ranks will wait for you to
        be done with the breakpoint before continuing.

        Args:
            rank (int): Which rank to break on.  Default: ``0``
        """
        if get_rank() == rank:
            pdb = _DistributedPdb()
            pdb.message(
                "\n!!! ATTENTION !!!\n\n"
                f"Type 'up' to get to the frame that called dist.breakpoint(rank={rank})\n"
            )
            pdb.set_trace()
        # If Meta/Python keys are in the TLS, we want to make sure that we ignore them
        # and hit the (default) CPU/CUDA implementation of barrier.
        meta_in_tls = torch._C._meta_in_tls_dispatch_include()
        guard = torch._C._DisableTorchDispatch()  # type: ignore[attr-defined]
        torch._C._set_meta_in_tls_dispatch_include(False)
        try:
            barrier()
        finally:
            torch._C._set_meta_in_tls_dispatch_include(meta_in_tls)
            del guard
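    # Example (a minimal, hedged sketch): drop into pdb on rank 0 while every
    # other rank blocks on the barrier inside ``breakpoint()``; the surrounding
    # training-loop names (``model``, ``batch``) are illustrative.
    #
    #     import torch
    #     import torch.distributed as dist
    #
    #     loss = model(batch).sum()
    #     if not torch.isfinite(loss):
    #         dist.breakpoint(rank=0)  # rank 0 gets the pdb prompt, others wait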
    if sys.platform != "win32":
        from torch._C._distributed_c10d import (
            HashStore,
            _round_robin_process_groups,
        )

    from .distributed_c10d import *  # noqa: F403

    # Variables prefixed with underscore are not auto imported
    # See the comment in `distributed_c10d.py` above `_backend` on why we expose
    # this.
    from .distributed_c10d import (
        _all_gather_base,
        _reduce_scatter_base,
        _create_process_group_wrapper,
        _rank_not_in_group,
        _coalescing_manager,
        _CoalescingManager,
        _get_process_group_name,
        get_node_local_rank,
    )

    from .rendezvous import (
        rendezvous,
        _create_store_from_options,
        register_rendezvous_handler,
    )

    from .remote_device import _remote_device
    from .device_mesh import init_device_mesh, DeviceMesh

    set_debug_level_from_env()

else:
    # This stub is sufficient to get
    #   python test/test_public_bindings.py -k test_correct_module_names
    # working even when USE_DISTRIBUTED=0.  Feel free to add more
    # stubs as necessary.
    # We cannot define stubs directly because they confuse pyre

    class _ProcessGroupStub:
        pass

    sys.modules["torch.distributed"].ProcessGroup = _ProcessGroupStub  # type: ignore[attr-defined]
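# Example (a minimal, hedged sketch): ``init_device_mesh``, re-exported above,
# describes a logical layout of ranks; the device type, shape, and dimension
# names are illustrative and assume 8 processes.
#
#     from torch.distributed.device_mesh import init_device_mesh
#
#     mesh = init_device_mesh("cuda", (2, 4), mesh_dim_names=("dp", "tp"))
#     tp_group = mesh.get_group("tp")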