Source code for torchx.specs
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
"""
This contains the TorchX AppDef and related component definitions. These are
used by components to define the apps which can then be launched via a TorchX
scheduler or pipeline adapter.
"""
import difflib
from typing import Callable, Dict, Optional
from torchx.specs.named_resources_aws import NAMED_RESOURCES as AWS_NAMED_RESOURCES
from torchx.specs.named_resources_generic import (
NAMED_RESOURCES as GENERIC_NAMED_RESOURCES,
)
from torchx.util.entrypoints import load_group
from .api import ( # noqa: F401 F403
ALL,
AppDef,
AppDryRunInfo,
AppHandle,
AppState,
AppStatus,
BindMount,
CfgVal,
DeviceMount,
get_type_name,
InvalidRunConfigException,
is_terminal,
macros,
MalformedAppHandleException,
MISSING,
NONE,
NULL_RESOURCE,
parse_app_handle,
ReplicaState,
ReplicaStatus,
Resource,
RetryPolicy,
Role,
RoleStatus,
runopt,
runopts,
UnknownAppException,
UnknownSchedulerException,
VolumeMount,
)
from .builders import make_app_handle, materialize_appdef, parse_mounts # noqa
# NOTE(review): presumably the number of MB in one GiB, since resource memory
# in this module is expressed in MB (``memMB``) — confirm against callers.
GiB: int = 1024
def _load_named_resources() -> Dict[str, Callable[[], Resource]]:
    """
    Build the mapping of named-resource name to its zero-argument factory.

    Factories are merged, in this order, from ``GENERIC_NAMED_RESOURCES``,
    ``AWS_NAMED_RESOURCES`` and whatever ``load_group`` returns for the
    ``torchx.named_resources`` group; on a name clash the later source wins.
    ``NULL`` and ``MISSING`` are bound last to factories that return
    ``NULL_RESOURCE``, so none of the sources above can override them.

    Returns:
        A new dict mapping each resource name to a callable producing a
        ``Resource``.
    """
    # Plugins are loaded before the merge; ``default={}`` is the fallback
    # value handed to ``load_group``.
    plugin_factories = load_group("torchx.named_resources", default={})

    factories: Dict[str, Callable[[], Resource]] = {
        **GENERIC_NAMED_RESOURCES,
        **AWS_NAMED_RESOURCES,
        **plugin_factories,
    }

    # Reserved names: applied after the merge so they always take precedence.
    factories.update(
        {
            "NULL": lambda: NULL_RESOURCE,
            "MISSING": lambda: NULL_RESOURCE,
        }
    )
    return factories
# Module-level registry (name -> zero-argument ``Resource`` factory), populated
# once when this module is imported by calling ``_load_named_resources()``.
_named_resource_factories: Dict[str, Callable[[], Resource]] = _load_named_resources()
class _NamedResourcesLibrary:
    """
    Lookup facade over the module-level ``_named_resource_factories`` registry.

    Supports ``library[name]`` (invokes the registered factory and returns its
    result) and ``name in library``. Iteration is explicitly rejected.
    """

    def __getitem__(self, key: str) -> Resource:
        """
        Return the resource produced by the factory registered under ``key``.

        Raises:
            KeyError: if ``key`` is not registered. The message suggests the
                closest registered name when ``difflib`` finds one, otherwise
                it lists every registered name.
        """
        if key in _named_resource_factories:
            # The factory is invoked on every lookup.
            return _named_resource_factories[key]()

        registered = list(_named_resource_factories.keys())
        closest = difflib.get_close_matches(key, registered, n=1)
        msg = (
            f"Did you mean `{closest[0]}`?"
            if closest
            else f"Registered named resources: {registered}"
        )
        raise KeyError(f"No named resource found for `{key}`. {msg}")

    def __contains__(self, key: str) -> bool:
        """Return whether a factory is registered under ``key``."""
        return key in _named_resource_factories

    def __iter__(self) -> None:
        """
        Always raise: iterating over the library is not supported.

        Raises:
            NotImplementedError: unconditionally.
        """
        # NOTE(review): without an explicit ``__iter__`` Python would fall back
        # to the legacy ``__getitem__``-based iteration protocol; presumably
        # this method exists to fail loudly instead — confirm intent.
        raise NotImplementedError("named resources doesn't support iterating")
# Shared module-level instance; ``get_named_resources`` resolves names through
# it, and it can be indexed directly as ``named_resources["<name>"]``.
named_resources: _NamedResourcesLibrary = _NamedResourcesLibrary()
def resource(
    cpu: Optional[int] = None,
    gpu: Optional[int] = None,
    memMB: Optional[int] = None,
    h: Optional[str] = None,
) -> Resource:
    """
    Convenience method to create a ``Resource`` object from either the
    raw resource specs (cpu, gpu, memMB) or the registered named resource (``h``).
    Note that the (cpu, gpu, memMB) is mutually exclusive with ``h``,
    with ``h`` taking precedence if specified.

    If ``h`` is specified (non-empty) then it is used to look up the
    resource specs from the list of registered named resources.
    See `registering named resource <https://pytorch.org/torchx/latest/advanced.html#registering-named-resources>`_.

    Otherwise a ``Resource`` object is created from the raw resource specs.
    Any raw spec that is ``None`` (or ``0``) falls back to its default:
    ``cpu=2``, ``gpu=0``, ``memMB=1024``.

    Example:

    .. code-block:: python

        resource(cpu=1)  # returns Resource(cpu=1, gpu=0, memMB=1024)
        resource(h="foobar")  # returns registered named resource "foobar"
        resource(cpu=1, h="foobar")  # returns registered named resource "foobar" (cpu=1 ignored)
        resource()  # returns default resource values
        resource(cpu=None, gpu=None, memMB=None)  # same as resource(): defaults are used

    Args:
        cpu: number of CPUs; defaults to 2 when not given.
        gpu: number of GPUs; defaults to 0 when not given.
        memMB: memory in MB; defaults to 1024 when not given.
        h: name of a registered named resource; when set, the raw specs
            above are ignored.

    Returns:
        The named resource registered under ``h`` if ``h`` is given, otherwise
        a ``Resource`` built from the raw specs and the defaults.

    Raises:
        KeyError: if ``h`` is given but no named resource is registered under
            that name.
    """
    if h:
        return get_named_resources(h)
    else:
        # could make these defaults customizable via entrypoint
        # not doing that now since its not a requested feature and may just over complicate things
        # keeping these defaults method local so that no one else takes a dep on it
        DEFAULT_CPU = 2
        DEFAULT_GPU = 0
        DEFAULT_MEM_MB = 1024

        # ``or`` (rather than an ``is None`` check) is kept on purpose to
        # preserve existing behavior: an explicit 0 is treated like "not set".
        return Resource(
            cpu=cpu or DEFAULT_CPU,
            gpu=gpu or DEFAULT_GPU,
            memMB=memMB or DEFAULT_MEM_MB,
        )
def get_named_resources(res: str) -> Resource:
    """
    Get resource object based on the string definition registered via entrypoints.txt.

    TorchX implements ``named_resource`` registration mechanism, which consists of
    the following steps:

    1. Create a module and define your resource retrieval function:

    .. code-block:: python

        # my_module.resources
        from torchx.specs import Resource

        def gpu_x_1() -> Resource:
            return Resource(cpu=2, memMB=64 * 1024, gpu=2)

    2. Register resource retrieval in the entrypoints section:

    ::

        [torchx.named_resources]
        gpu_x_1 = my_module.resources:gpu_x_1

    The ``gpu_x_1`` can be used as string argument to this function, or as a
    key into ``named_resources``:

    ::

        from torchx.specs import get_named_resources, named_resources

        resource = get_named_resources("gpu_x_1")
        resource = named_resources["gpu_x_1"]  # equivalent

    Args:
        res: name under which the resource was registered.

    Returns:
        The ``Resource`` produced by the factory registered under ``res``.

    Raises:
        KeyError: if no named resource is registered under ``res``.
    """
    return named_resources[res]