Source code for torchx.specs

#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-strict

"""
This contains the TorchX AppDef and related component definitions. These are
used by components to define the apps which can then be launched via a TorchX
scheduler or pipeline adapter.
"""
import difflib
from typing import Callable, Dict, Optional

from torchx.specs.api import (
    ALL,
    AppDef,
    AppDryRunInfo,
    AppHandle,
    AppState,
    AppStatus,
    BindMount,
    CfgVal,
    DeviceMount,
    get_type_name,
    InvalidRunConfigException,
    is_terminal,
    macros,
    MalformedAppHandleException,
    MISSING,
    NONE,
    NULL_RESOURCE,
    parse_app_handle,
    ReplicaState,
    ReplicaStatus,
    Resource,
    RetryPolicy,
    Role,
    RoleStatus,
    runopt,
    runopts,
    UnknownAppException,
    UnknownSchedulerException,
    VolumeMount,
)
from torchx.specs.builders import make_app_handle, materialize_appdef, parse_mounts

from torchx.specs.named_resources_aws import NAMED_RESOURCES as AWS_NAMED_RESOURCES
from torchx.specs.named_resources_generic import (
    NAMED_RESOURCES as GENERIC_NAMED_RESOURCES,
)
from torchx.util.entrypoints import load_group

# NOTE(review): the value 1024 suggests this is one GiB expressed in MB (the
# unit used by ``memMB`` below), e.g. ``memMB=8 * GiB`` — confirm against callers.
GiB: int = 1024


def _load_named_resources() -> Dict[str, Callable[[], Resource]]:
    """
    Build the table that maps a named resource to its zero-argument factory.

    Sources are layered in this order, with later ones overriding earlier ones
    on a name clash:

    1. ``GENERIC_NAMED_RESOURCES``
    2. ``AWS_NAMED_RESOURCES``
    3. whatever ``load_group("torchx.named_resources", default={})`` returns

    The names ``"NULL"`` and ``"MISSING"`` are always (re)bound last, so they
    resolve to ``NULL_RESOURCE`` regardless of what the sources above contain.

    Returns:
        A new dict of ``name -> factory`` where calling the factory yields a ``Resource``.
    """
    plugin_factories = load_group("torchx.named_resources", default={})

    factories: Dict[str, Callable[[], Resource]] = {}
    factories.update(GENERIC_NAMED_RESOURCES)
    factories.update(AWS_NAMED_RESOURCES)
    factories.update(plugin_factories)

    # reserved names: assigned after the merge so that nothing can shadow them
    factories["NULL"] = lambda: NULL_RESOURCE
    factories["MISSING"] = lambda: NULL_RESOURCE
    return factories


# Table of ``name -> zero-argument factory``, built once when this module is
# imported by calling ``_load_named_resources()``.
_named_resource_factories: Dict[str, Callable[[], Resource]] = _load_named_resources()


class _NamedResourcesLibrary:
    """
    Read-only, dict-like view over the module-level ``_named_resource_factories``.

    Supports lookup (``lib[name]``) and membership tests (``name in lib``).
    Iteration is explicitly disabled.
    """

    def __getitem__(self, key: str) -> Resource:
        """
        Look up the factory registered under ``key`` and return what it produces.

        The factory is invoked on every lookup.

        Raises:
            KeyError: if ``key`` is not registered. The message carries the
                closest registered name when ``difflib`` finds one, otherwise
                the full list of registered names.
        """
        if key not in _named_resource_factories:
            closest = difflib.get_close_matches(
                key,
                _named_resource_factories.keys(),
                n=1,
            )
            hint = (
                f"Did you mean `{closest[0]}`?"
                if closest
                else f"Registered named resources: {list(_named_resource_factories.keys())}"
            )
            raise KeyError(f"No named resource found for `{key}`. {hint}")

        return _named_resource_factories[key]()

    def __contains__(self, key: str) -> bool:
        """Return ``True`` if a factory is registered under ``key``."""
        return key in _named_resource_factories

    def __iter__(self) -> None:
        """
        Always raises ``NotImplementedError``.

        Defined explicitly so that iterating fails with a clear message rather
        than Python falling back to calling ``__getitem__`` with integer indices.
        """
        raise NotImplementedError("named resources doesn't support iterating")


# Module-level instance through which named resources are looked up,
# e.g. ``named_resources["gpu_x_1"]``.
named_resources: _NamedResourcesLibrary = _NamedResourcesLibrary()


def resource(
    cpu: Optional[int] = None,
    gpu: Optional[int] = None,
    memMB: Optional[int] = None,
    h: Optional[str] = None,
) -> Resource:
    """
    Convenience method to create a ``Resource`` object from either the
    raw resource specs (cpu, gpu, memMB) or the registered named resource (``h``).
    Note that (cpu, gpu, memMB) is mutually exclusive with ``h``,
    with ``h`` taking precedence if specified.

    If ``h`` is specified then it is used to look up the
    resource specs from the list of registered named resources.
    See `registering named resource <https://pytorch.org/torchx/latest/advanced.html#registering-named-resources>`_.

    Otherwise a ``Resource`` object is created from the raw resource specs,
    substituting a default for every spec that is not provided.

    Args:
        cpu: number of CPUs. Falls back to ``2`` when ``None`` or ``0``.
        gpu: number of GPUs. Falls back to ``0`` when ``None``.
        memMB: memory in MB. Falls back to ``1024`` when ``None`` or ``0``.
        h: name of a registered named resource. When non-empty, the other
            arguments are ignored.

    Returns:
        The named resource registered under ``h`` if ``h`` is given, otherwise
        a ``Resource`` built from ``cpu``, ``gpu`` and ``memMB``.

    Raises:
        KeyError: if ``h`` is given but no named resource is registered under that name.

    Example:

    .. code-block:: python

         resource(cpu=1) # returns Resource(cpu=1, gpu=0, memMB=1024)
         resource(h="foobar") # returns registered named resource "foobar"
         resource(cpu=1, h="foobar") # returns registered named resource "foobar" (cpu=1 ignored)
         resource() # returns default resource values
         resource(cpu=None, gpu=None, memMB=None) # same as resource()
    """

    if h:
        return get_named_resources(h)

    # could make these defaults customizable via entrypoint
    # not doing that now since its not a requested feature and may just over complicate things
    # keeping these defaults method local so that no one else takes a dep on it
    DEFAULT_CPU = 2
    DEFAULT_GPU = 0
    DEFAULT_MEM_MB = 1024

    # ``or`` (rather than an ``is None`` check) means an explicit 0 is also
    # replaced by the default; kept as-is so existing callers see no change
    return Resource(
        cpu=cpu or DEFAULT_CPU,
        gpu=gpu or DEFAULT_GPU,
        memMB=memMB or DEFAULT_MEM_MB,
    )


def get_named_resources(res: str) -> Resource:
    """
    Get resource object based on the string definition registered via entrypoints.txt.

    TorchX implements ``named_resource`` registration mechanism, which consists of
    the following steps:

    1. Create a module and define your resource retrieval function:

    .. code-block:: python

     # my_module.resources
     from torchx.specs import Resource

     def gpu_x_1() -> Resource:
         return Resource(cpu=2, memMB=64 * 1024, gpu=2)

    2. Register resource retrieval in the entrypoints section:

    ::

     [torchx.named_resources]
     gpu_x_1 = my_module.resources:gpu_x_1

    The ``gpu_x_1`` can be used as string argument to this function:

    ::

     from torchx.specs import get_named_resources
     resource = get_named_resources("gpu_x_1")

    which is equivalent to indexing the ``named_resources`` object directly:

    ::

     from torchx.specs import named_resources
     resource = named_resources["gpu_x_1"]

    Args:
        res: name under which the resource was registered.

    Returns:
        The ``Resource`` looked up from ``named_resources`` under ``res``.

    Raises:
        KeyError: if no named resource is registered under ``res``.
    """
    return named_resources[res]


# Public API of this module: the names re-exported from ``torchx.specs.api``
# and ``torchx.specs.builders`` plus the helpers defined in this file
# (``resource``, ``get_named_resources``, ``named_resources``).
# NOTE(review): ``GiB`` is defined in this file but not listed here — confirm
# whether that is intentional.
__all__ = [
    "AppDef",
    "AppDryRunInfo",
    "AppHandle",
    "AppState",
    "AppStatus",
    "BindMount",
    "CfgVal",
    "DeviceMount",
    "get_type_name",
    "is_terminal",
    "macros",
    "MISSING",
    "NONE",
    "NULL_RESOURCE",
    "parse_app_handle",
    "ReplicaState",
    "ReplicaStatus",
    "Resource",
    "RetryPolicy",
    "Role",
    "RoleStatus",
    "runopt",
    "runopts",
    "UnknownAppException",
    "UnknownSchedulerException",
    "InvalidRunConfigException",
    "MalformedAppHandleException",
    "VolumeMount",
    "resource",
    "get_named_resources",
    "named_resources",
    "make_app_handle",
    "materialize_appdef",
    "parse_mounts",
    "ALL",
]
Source code for torchx.specs

Docs

Tutorials

Resources