# Source code for torch.autograd

"""
torch.autograd provides classes and functions implementing automatic
differentiation of arbitrary scalar valued functions. It requires minimal
changes to the existing code - you only need to declare :class:`Tensor` s
for which gradients should be computed with the ``requires_grad=True`` keyword.
"""
import torch
import warnings

from .variable import Variable
from .function import Function, NestedIOFunction
from .gradcheck import gradcheck, gradgradcheck
from .grad_mode import no_grad, enable_grad, set_grad_enabled
from .anomaly_mode import detect_anomaly, set_detect_anomaly
from . import profiler

# Names exported by ``from torch.autograd import *``.
__all__ = [
    'Variable',
    'Function',
    'backward',
    'grad_mode',
]

def _make_grads(outputs, grads):
new_grads = []
for out, grad in zip(outputs, grads):
if isinstance(grad, torch.Tensor):
new_grads.append(grad)
elif grad is None:
if out.requires_grad:
if out.numel() != 1:
raise RuntimeError("grad can be implicitly created only for scalar outputs")
new_grads.append(torch.ones_like(out))
else:
new_grads.append(None)
else:
raise TypeError("gradients can be either Tensors or None, but got " +
type(grad).__name__)
return tuple(new_grads)

def backward(tensors, grad_tensors=None, retain_graph=None, create_graph=False, grad_variables=None):
    r"""Computes the sum of gradients of given tensors w.r.t. graph leaves.

    The graph is differentiated using the chain rule. If any of ``tensors``
    are non-scalar (i.e. their data has more than one element) and require
    gradient, then the Jacobian-vector product would be computed, in this
    case the function additionally requires specifying ``grad_tensors``.
    It should be a sequence of matching length, that contains the "vector"
    in the Jacobian-vector product, usually the gradient of the differentiated
    function w.r.t. corresponding tensors (``None`` is an acceptable value for
    all tensors that don't need gradient tensors).

    This function accumulates gradients in the leaves - you might need to zero
    them before calling it.

    Arguments:
        tensors (sequence of Tensor): Tensors of which the derivative will be
            computed. A single Tensor is also accepted and is treated as a
            one-element sequence.
        grad_tensors (sequence of (Tensor or None)): The "vector" in the
            Jacobian-vector product, usually gradients w.r.t. each element of
            corresponding tensors. ``None`` values can be specified for scalar
            Tensors or ones that don't require grad. If a ``None`` value would
            be acceptable for all grad_tensors, then this argument is optional.
            A single Tensor is also accepted and is treated as a one-element
            sequence.
        retain_graph (bool, optional): If ``False``, the graph used to compute
            the grad will be freed. Note that in nearly all cases setting this
            option to ``True`` is not needed and often can be worked around in
            a much more efficient way. Defaults to the value of
            ``create_graph``.
        create_graph (bool, optional): If ``True``, graph of the derivative will
            be constructed, allowing to compute higher order derivative
            products. Defaults to ``False``.
        grad_variables (sequence of (Tensor or None), optional): Deprecated
            alias of ``grad_tensors``. Passing it emits a warning; its value is
            used only when ``grad_tensors`` is not given.

    Raises:
        RuntimeError: if both ``grad_tensors`` and ``grad_variables`` are
            passed.
    """
    if grad_variables is not None:
        warnings.warn("'grad_variables' is deprecated. Use 'grad_tensors' instead.")
        if grad_tensors is not None:
            raise RuntimeError("'grad_tensors' and 'grad_variables' (deprecated) "
                               "arguments both passed to backward(). Please only "
                               "use 'grad_tensors'.")
        grad_tensors = grad_variables

    # Normalize a lone Tensor into a one-element tuple.
    tensors = (tensors,) if isinstance(tensors, torch.Tensor) else tuple(tensors)

    if grad_tensors is None:
        grad_tensors = [None] * len(tensors)
    elif isinstance(grad_tensors, torch.Tensor):
        grad_tensors = [grad_tensors]
    else:
        grad_tensors = list(grad_tensors)

    # Validate the seeds and fill in the ones that may be created implicitly.
    grad_tensors = _make_grads(tensors, grad_tensors)
    if retain_graph is None:
        retain_graph = create_graph

    Variable._execution_engine.run_backward(
        tensors, grad_tensors, retain_graph, create_graph,
        allow_unreachable=True)  # allow_unreachable flag

def grad(outputs, inputs, grad_outputs=None, retain_graph=None, create_graph=False,
         only_inputs=True, allow_unused=False):
    r"""Computes and returns the sum of gradients of outputs w.r.t. the inputs.

    ``grad_outputs`` should be a sequence of length matching ``outputs``
    containing the "vector" in Jacobian-vector product, usually the pre-computed
    gradients w.r.t. each of the outputs. If an output doesn't require grad,
    then the gradient can be ``None``.

    ``only_inputs`` is deprecated and ignored: passing ``False`` only emits a
    warning. To accumulate gradients for other parts of the graph, use
    ``torch.autograd.backward`` instead.

    Arguments:
        outputs (sequence of Tensor): outputs of the differentiated function.
            A single Tensor is also accepted and is treated as a one-element
            sequence.
        inputs (sequence of Tensor): Inputs w.r.t. which the gradient will be
            returned (and not accumulated into ``.grad``). A single Tensor is
            also accepted and is treated as a one-element sequence.
        grad_outputs (sequence of Tensor): The "vector" in the Jacobian-vector
            product. Usually gradients w.r.t. each output. ``None`` values can
            be specified for scalar Tensors or ones that don't require grad.
            If a ``None`` value would be acceptable for all grad_outputs, then
            this argument is optional. Default: ``None``.
        retain_graph (bool, optional): If ``False``, the graph used to compute
            the grad will be freed. Note that in nearly all cases setting this
            option to ``True`` is not needed and often can be worked around in
            a much more efficient way. Defaults to the value of
            ``create_graph``.
        create_graph (bool, optional): If ``True``, graph of the derivative will
            be constructed, allowing to compute higher order derivative
            products. Default: ``False``.
        only_inputs (bool, optional): Deprecated and ignored. Default: ``True``.
        allow_unused (bool, optional): If ``False``, specifying inputs that were
            not used when computing outputs (and therefore their grad is always
            zero) is an error. Defaults to ``False``.

    Returns:
        The value produced by the execution engine's ``run_backward`` call for
        the given ``inputs``.
    """
    if not only_inputs:
        warnings.warn("only_inputs argument is deprecated and is ignored now "
                      "(defaults to True). To accumulate gradient for other "
                      "parts of the graph, please use torch.autograd.backward.")

    # Normalize lone Tensors into one-element tuples.
    outputs = (outputs,) if isinstance(outputs, torch.Tensor) else tuple(outputs)
    inputs = (inputs,) if isinstance(inputs, torch.Tensor) else tuple(inputs)
    if grad_outputs is None:
        grad_outputs = [None] * len(outputs)
    elif isinstance(grad_outputs, torch.Tensor):
        grad_outputs = [grad_outputs]
    else:
        grad_outputs = list(grad_outputs)

    # Validate the seeds and fill in the ones that may be created implicitly.
    grad_outputs = _make_grads(outputs, grad_outputs)
    if retain_graph is None:
        retain_graph = create_graph

    return Variable._execution_engine.run_backward(
        outputs, grad_outputs, retain_graph, create_graph,
        inputs, allow_unused)

# This function is used by gradient checkpointing (a memory optimization).
# Currently, gradient checkpointing only supports imperative backward calls,
# i.e. torch.autograd.backward(); torch.autograd.grad() won't work. The reason
# is that torch.autograd.grad() only calculates the grads for the inputs that
# are passed by the user and not for anything else, e.g. model parameters like
# weights, bias etc. torch.autograd.backward(), however, actually computes the
# grads for the weights as well.
#
# This function returns whether checkpointing is valid (the backward pass was
# started by torch.autograd.backward) or not (it was started by
# torch.autograd.grad). The implementation works by maintaining a thread
# local variable in torch/csrc/autograd/engine.cpp which looks at the
# FunctionTask in the stack; before a FunctionTask is executed in
# evaluate_function, it checks whether the reentrant backward is imperative
# or not.
# See https://github.com/pytorch/pytorch/pull/4594 for more discussion/context
def _is_checkpoint_valid():
return Variable._execution_engine.is_checkpoint_valid()

def variable(*args, **kwargs):
    """Deprecated shim: warn, then forward every argument to ``torch.tensor``.

    Returns:
        Whatever ``torch.tensor(*args, **kwargs)`` returns.
    """
    deprecation_notice = "torch.autograd.variable(...) is deprecated, use torch.tensor(...) instead"
    warnings.warn(deprecation_notice)
    result = torch.tensor(*args, **kwargs)
    return result

# Runs at import time: call the C extension's autograd initializer and abort
# the import with a RuntimeError if it reports a falsy result.
if not torch._C._autograd_init():
    raise RuntimeError("autograd initialization failed")