# Source code for torch.nn.modules.loss

```
from torch.autograd import Variable
import torch
from .module import Module
from .container import Sequential
from .activation import LogSoftmax
from .. import functional as F
def _assert_no_grad(variable):
    """Check that ``variable`` is not flagged as requiring gradients.

    The criterions in this module do not differentiate with respect to their
    targets, so a target with ``requires_grad`` set is reported as an error.

    Args:
        variable: object exposing a ``requires_grad`` attribute.

    Raises:
        AssertionError: if ``variable.requires_grad`` is truthy.
    """
    message = (
        "nn criterions don't compute the gradient w.r.t. targets - please "
        "mark these variables as volatile or not requiring gradients"
    )
    assert not variable.requires_grad, message
class _Loss(Module):
    """Base class for criterions configured only by ``size_average``.

    ``forward`` looks up, on ``self._backend``, the attribute named after the
    concrete subclass, instantiates it with ``size_average`` and applies the
    result to ``(input, target)``.

    Args:
        size_average (bool, optional): stored on the instance and passed to
            the backend criterion. Default: True
    """

    def __init__(self, size_average=True):
        super(_Loss, self).__init__()
        self.size_average = size_average

    def forward(self, input, target):
        # Targets must not require gradients.
        _assert_no_grad(target)
        # The backend attribute shares its name with the concrete class.
        criterion_cls = getattr(self._backend, self.__class__.__name__)
        criterion = criterion_cls(self.size_average)
        return criterion(input, target)
class _WeightedLoss(_Loss):
    """Base class for criterions that also accept a ``weight`` tensor.

    The weight is registered as a module buffer named ``weight`` and is
    forwarded to the backend criterion as a keyword argument.

    Args:
        weight (optional): value registered as the ``weight`` buffer.
            Default: None
        size_average (bool, optional): passed on to ``_Loss``. Default: True
    """

    def __init__(self, weight=None, size_average=True):
        super(_WeightedLoss, self).__init__(size_average)
        self.register_buffer('weight', weight)

    def forward(self, input, target):
        # Targets must not require gradients.
        _assert_no_grad(target)
        # The backend attribute shares its name with the concrete class.
        criterion_cls = getattr(self._backend, self.__class__.__name__)
        criterion = criterion_cls(self.size_average, weight=self.weight)
        return criterion(input, target)
class L1Loss(_Loss):
    r"""Criterion measuring the mean absolute value of the element-wise
    difference between input `x` and target `y`:

    :math:`{loss}(x, y) = 1/n \sum |x_i - y_i|`

    `x` and `y` may have arbitrary shapes with a total of `n` elements each.

    The sum runs over all elements and is divided by `n`.

    The division by `n` is skipped when the constructor argument
    `size_average=False` is given.
    """
class NLLLoss(_WeightedLoss):
    r"""The negative log likelihood loss, useful for training a
    classification problem with `n` classes.

    If provided, the optional argument `weight` should be a 1D Tensor
    assigning a weight to each of the classes. This is particularly useful
    when the training set is unbalanced.

    The input given through a forward call is expected to contain
    log-probabilities of each class: it has to be a 2D Tensor of size
    `(minibatch, n)`.

    Log-probabilities are easily obtained by adding a `LogSoftmax` layer as
    the last layer of the network. `CrossEntropyLoss` may be used instead if
    adding an extra layer is not desired.

    The target that this loss expects is a class index
    `(0 to N-1, where N = number of classes)`.

    The loss can be described as::

        loss(x, class) = -x[class]

    or, when the `weight` argument is given, as::

        loss(x, class) = -weight[class] * x[class]

    Args:
        weight (Tensor, optional): a manual rescaling weight given to each
            class. If given, has to be a Tensor of size "nclasses"
        size_average (bool, optional): By default, the losses are averaged
            over observations for each minibatch. However, if the field
            size_average is set to False, the losses are instead summed for
            each minibatch.

    Shape:
        - Input: :math:`(N, C)` where `C = number of classes`
        - Target: :math:`(N)` where each value is `0 <= targets[i] <= C-1`

    Attributes:
        weight: the class-weights given as input to the constructor

    Examples::

        >>> m = nn.LogSoftmax()
        >>> loss = nn.NLLLoss()
        >>> # input is of size nBatch x nClasses = 3 x 5
        >>> input = autograd.Variable(torch.randn(3, 5), requires_grad=True)
        >>> # each element in target has to have 0 <= value < nclasses
        >>> target = autograd.Variable(torch.LongTensor([1, 0, 4]))
        >>> output = loss(m(input), target)
        >>> output.backward()
    """

    def forward(self, input, target):
        # Targets must not require gradients.
        _assert_no_grad(target)
        class_weight = self.weight
        average = self.size_average
        return F.nll_loss(input, target, class_weight, average)
class NLLLoss2d(_WeightedLoss):
    r"""Negative log likelihood loss for image inputs: the NLL loss is
    computed per-pixel.

    Args:
        weight (Tensor, optional): a manual rescaling weight given to each
            class. If given, has to be a 1D Tensor having as many elements
            as there are classes.
        size_average: By default, the losses are averaged over observations
            for each minibatch. However, if the field size_average is set to
            False, the losses are instead summed for each minibatch.
            Default: True

    Shape:
        - Input: :math:`(N, C, H, W)` where `C = number of classes`
        - Target: :math:`(N, H, W)` where each value is
          `0 <= targets[i] <= C-1`

    Examples:
        >>> m = nn.Conv2d(16, 32, (3, 3)).float()
        >>> loss = nn.NLLLoss2d()
        >>> # input is of size nBatch x nClasses x height x width
        >>> input = autograd.Variable(torch.randn(3, 16, 10, 10))
        >>> # each element in target has to have 0 <= value < nclasses
        >>> target = autograd.Variable(torch.LongTensor(3, 8, 8).random_(0, 4))
        >>> output = loss(m(input), target)
        >>> output.backward()
    """
class KLDivLoss(_WeightedLoss):
    r"""The `Kullback-Leibler divergence`_ Loss

    KL divergence is a useful distance measure for continuous distributions
    and is often useful when performing direct regression over the space of
    (discretely sampled) continuous output distributions.

    As with `NLLLoss`, the `input` given is expected to contain
    *log-probabilities*. Unlike `NLLLoss`, however, `input` is not
    restricted to a 2D Tensor, because the criterion is applied
    element-wise.

    This criterion expects a `target` `Tensor` of the same size as the
    `input` `Tensor`.

    The loss can be described as:

    .. math:: loss(x, target) = 1/n \sum(target_i * (log(target_i) - x_i))

    By default, the losses are averaged for each minibatch over observations
    **as well as** over dimensions. However, if the field
    `size_average` is set to `False`, the losses are instead summed.

    .. _Kullback-Leibler divergence:
        https://en.wikipedia.org/wiki/Kullback-Leibler_divergence
    """
class MSELoss(_Loss):
    r"""Criterion measuring the mean squared error between `n` elements in
    the input `x` and target `y`:

    :math:`{loss}(x, y) = 1/n \sum |x_i - y_i|^2`

    `x` and `y` may have arbitrary shapes with a total of `n` elements each.

    The sum runs over all elements and is divided by `n`.

    The division by `n` is skipped when the internal variable
    `size_average` is set to `False`.
    """
class BCELoss(_WeightedLoss):
    r"""Criterion measuring the Binary Cross Entropy between the target and
    the output:

    .. math:: loss(o, t) = - 1/n \sum_i (t[i] * log(o[i]) + (1 - t[i]) * log(1 - o[i]))

    or, when the weights argument is specified:

    .. math:: loss(o, t) = - 1/n \sum_i weights[i] * (t[i] * log(o[i]) + (1 - t[i]) * log(1 - o[i]))

    This is used for measuring the error of a reconstruction in, for
    example, an auto-encoder. Note that the targets `t[i]` should be numbers
    between 0 and 1.

    By default, the losses are averaged for each minibatch over observations
    *as well as* over dimensions. However, if the field `size_average` is
    set to `False`, the losses are instead summed.
    """
class BCEWithLogitsLoss(Module):
    r"""Combines a `Sigmoid` layer and the `BCELoss` in one single class.

    This version is more numerically stable than using a plain `Sigmoid`
    followed by a `BCELoss` because, by combining the operations into one
    layer, it takes advantage of the log-sum-exp trick for numerical
    stability.

    The Binary Cross Entropy between the target and the output logits (no
    sigmoid applied) is:

    .. math:: loss(o, t) = - 1/n \sum_i (t[i] * log(sigmoid(o[i])) + (1 - t[i]) * log(1 - sigmoid(o[i])))

    or, when the weights argument is specified:

    .. math:: loss(o, t) = - 1/n \sum_i weights[i] * (t[i] * log(sigmoid(o[i])) + (1 - t[i]) * log(1 - sigmoid(o[i])))

    This is used for measuring the error of a reconstruction in, for
    example, an auto-encoder. Note that the targets `t[i]` should be numbers
    between 0 and 1.

    By default, the losses are averaged for each minibatch over observations
    *as well as* over dimensions. However, if the field `size_average` is
    set to `False`, the losses are instead summed.
    """

    def __init__(self, weight=None, size_average=True):
        super(BCEWithLogitsLoss, self).__init__()
        self.register_buffer('weight', weight)
        self.size_average = size_average

    def forward(self, input, target):
        # Without a weight buffer, only the reduction flag is forwarded.
        if self.weight is None:
            return F.binary_cross_entropy_with_logits(
                input, target, size_average=self.size_average)
        # The weight buffer is wrapped in a Variable before being passed on.
        wrapped_weight = Variable(self.weight)
        return F.binary_cross_entropy_with_logits(
            input, target, wrapped_weight, self.size_average)
class HingeEmbeddingLoss(_Loss):
    r"""Measures the loss given an input `x`, which is a 2D mini-batch
    tensor, and labels `y`, a 1D tensor containing values (`1` or `-1`).

    This is usually used for measuring whether two inputs are similar or
    dissimilar, e.g. using the L1 pairwise distance, and is typically used
    for learning nonlinear embeddings or semi-supervised learning::

                         { x_i,                  if y_i ==  1
        loss(x, y) = 1/n {
                         { max(0, margin - x_i), if y_i == -1

    `x` and `y` may have arbitrary shapes with a total of `n` elements each.
    The sum runs over all elements and is divided by `n`.

    The division by `n` is skipped when the internal variable
    `size_average=False` is set.

    The `margin` has a default value of `1`, or can be set in the
    constructor.
    """

    def __init__(self, margin=1.0, size_average=True):
        super(HingeEmbeddingLoss, self).__init__()
        self.size_average = size_average
        self.margin = margin

    def forward(self, input, target):
        criterion = self._backend.HingeEmbeddingLoss(self.margin,
                                                     self.size_average)
        return criterion(input, target)
class MultiLabelMarginLoss(_Loss):
    r"""Criterion optimizing a multi-class multi-classification hinge loss
    (margin-based loss) between input `x` (a 2D mini-batch `Tensor`) and
    output `y` (which is a 2D `Tensor` of target class indices).

    For each sample in the mini-batch::

        loss(x, y) = sum_ij(max(0, 1 - (x[y[j]] - x[i]))) / x.size(0)

    where `i == 0` to `x.size(0)`, `j == 0` to `y.size(0)`,
    `y[j] != 0`, and `i != y[j]` for all `i` and `j`.

    `y` and `x` must have the same size.

    The criterion only considers the first non zero `y[j]` targets.

    This allows different samples to have a variable number of target
    classes.
    """
class SmoothL1Loss(_Loss):
    r"""Criterion that uses a squared term if the absolute element-wise
    error falls below 1 and an L1 term otherwise.

    It is less sensitive to outliers than the `MSELoss` and in some cases
    prevents exploding gradients (e.g. see the "Fast R-CNN" paper by Ross
    Girshick). Also known as the Huber loss::

                              { 0.5 * (x_i - y_i)^2, if |x_i - y_i| < 1
        loss(x, y) = 1/n \sum {
                              { |x_i - y_i| - 0.5,   otherwise

    `x` and `y` may have arbitrary shapes with a total of `n` elements each.
    The sum runs over all elements and is divided by `n`.

    The division by `n` is skipped when the internal variable
    `size_average` is set to `False`.
    """
class SoftMarginLoss(_Loss):
    r"""Criterion optimizing a two-class classification logistic loss
    between input `x` (a 2D mini-batch Tensor) and target `y` (which is a
    tensor containing either `1` or `-1`).

    ::

        loss(x, y) = sum_i (log(1 + exp(-y[i]*x[i]))) / x.nelement()

    The normalization by the number of elements in the input can be
    disabled by setting `self.size_average` to `False`.
    """
class CrossEntropyLoss(_WeightedLoss):
    r"""Combines `LogSoftMax` and `NLLLoss` in one single class.

    It is useful when training a classification problem with `n` classes.
    If provided, the optional argument `weight` should be a 1D `Tensor`
    assigning a weight to each of the classes. This is particularly useful
    when the training set is unbalanced.

    The `input` is expected to contain scores for each class and has to be
    a 2D `Tensor` of size `batch x n`.

    The `target` is expected to be a 1D tensor holding one class index
    (0 to nClasses-1) per entry.

    The loss can be described as::

        loss(x, class) = -log(exp(x[class]) / (\sum_j exp(x[j])))
                       = -x[class] + log(\sum_j exp(x[j]))

    or, when the `weight` argument is specified::

        loss(x, class) = weight[class] * (-x[class] + log(\sum_j exp(x[j])))

    The instance's `size_average` setting is forwarded to
    `F.cross_entropy`; the original description states that the losses are
    averaged across observations for each minibatch.

    Shape:
        - Input: :math:`(N, C)` where `C = number of classes`
        - Target: :math:`(N)` where each value is `0 <= targets[i] <= C-1`
    """

    def forward(self, input, target):
        # Targets must not require gradients.
        _assert_no_grad(target)
        class_weight = self.weight
        average = self.size_average
        return F.cross_entropy(input, target, class_weight, average)
class MultiLabelSoftMarginLoss(_WeightedLoss):
    r"""Criterion optimizing a multi-label one-versus-all loss based on
    max-entropy, between input `x` (a 2D mini-batch `Tensor`) and target `y`
    (a binary 2D `Tensor`). For each sample in the minibatch::

        loss(x, y) = - sum_i (y[i] * log( 1 / (1 + exp(-x[i])) )
                          + ( (1-y[i]) * log(exp(-x[i]) / (1 + exp(-x[i])) ) )

    where `i == 0` to `x.nElement()-1`, `y[i] in {0,1}`.
    `y` and `x` must have the same size.
    """

    def forward(self, input, target):
        # Inputs are squashed with a sigmoid before binary cross entropy.
        probabilities = torch.sigmoid(input)
        return F.binary_cross_entropy(probabilities, target,
                                      self.weight, self.size_average)
class CosineEmbeddingLoss(Module):
    r"""Criterion measuring the loss given two input tensors `x1`, `x2` and
    a `Tensor` label `y` with values 1 or -1.

    This is used for measuring whether two inputs are similar or
    dissimilar, using the cosine distance, and is typically used for
    learning nonlinear embeddings or semi-supervised learning.

    `margin` should be a number from `-1` to `1`; `0` to `0.5` is suggested.
    If `margin` is missing, the default value is `0`.

    The loss function for each sample is::

                     { 1 - cos(x1, x2),              if y ==  1
        loss(x, y) = {
                     { max(0, cos(x1, x2) - margin), if y == -1

    If the internal variable `size_average` is equal to `True`, the loss
    function averages the loss over the batch samples; if `size_average` is
    `False`, the loss function sums over the batch samples. By default,
    `size_average = True`.
    """

    def __init__(self, margin=0, size_average=True):
        super(CosineEmbeddingLoss, self).__init__()
        self.size_average = size_average
        self.margin = margin

    def forward(self, input1, input2, target):
        criterion = self._backend.CosineEmbeddingLoss(self.margin,
                                                      self.size_average)
        return criterion(input1, input2, target)
class MarginRankingLoss(Module):
    r"""Criterion measuring the loss given inputs `x1`, `x2`, two 1D
    mini-batch `Tensor`s, and a label 1D mini-batch tensor `y` with values
    (`1` or `-1`).

    If `y == 1` it is assumed that the first input should be ranked higher
    (have a larger value) than the second input, and vice-versa for
    `y == -1`.

    The loss function for each sample in the mini-batch is::

        loss(x, y) = max(0, -y * (x1 - x2) + margin)

    If the internal variable `size_average = True`, the loss function
    averages the loss over the batch samples; if `size_average = False`,
    the loss function sums over the batch samples.
    By default, `size_average` equals `True`.
    """

    def __init__(self, margin=0, size_average=True):
        super(MarginRankingLoss, self).__init__()
        self.size_average = size_average
        self.margin = margin

    def forward(self, input1, input2, target):
        criterion = self._backend.MarginRankingLoss(self.margin,
                                                    self.size_average)
        return criterion(input1, input2, target)
class MultiMarginLoss(Module):
    r"""Criterion optimizing a multi-class classification hinge loss
    (margin-based loss) between input `x` (a 2D mini-batch `Tensor`) and
    output `y` (which is a 1D tensor of target class indices,
    `0` <= `y` <= `x.size(1) - 1`).

    For each mini-batch sample::

        loss(x, y) = sum_i(max(0, (margin - x[y] + x[i]))^p) / x.size(0)

    where `i == 0` to `x.size(0)` and `i != y`.

    Optionally, you can give non-equal weighting on the classes by passing
    a 1D `weight` tensor into the constructor.

    The loss function then becomes::

        loss(x, y) = sum_i(max(0, w[y] * (margin - x[y] + x[i]))^p) / x.size(0)

    By default, the losses are averaged over observations for each
    minibatch. However, if the field `size_average` is set to `False`,
    the losses are instead summed.

    Args:
        p (int, optional): exponent of the hinge term; only `1` and `2` are
            accepted. Default: 1
        margin (optional): margin used in the hinge term. Default: 1
        weight (Tensor, optional): 1D per-class weights, registered as the
            `weight` buffer. Default: None
        size_average (bool, optional): average (True) or sum (False) the
            losses. Default: True

    Raises:
        ValueError: if `p` is neither 1 nor 2.
        AssertionError: if `weight` is given and is not 1-dimensional.
    """

    def __init__(self, p=1, margin=1, weight=None, size_average=True):
        super(MultiMarginLoss, self).__init__()
        if p != 1 and p != 2:
            raise ValueError("only p == 1 and p == 2 supported")
        assert weight is None or weight.dim() == 1
        self.p = p
        self.margin = margin
        self.size_average = size_average
        # Registered as a buffer, like the other weighted criterions in this
        # file, so the weight is tracked among the module's buffers instead
        # of being an untracked plain attribute.
        self.register_buffer('weight', weight)

    def forward(self, input, target):
        criterion = self._backend.MultiMarginLoss(self.size_average, self.p,
                                                  self.margin,
                                                  weight=self.weight)
        return criterion(input, target)
class TripletMarginLoss(Module):
    r"""Criterion measuring the triplet loss given input tensors x1, x2, x3
    and a margin with a value greater than 0.

    This is used for measuring a relative similarity between samples. A
    triplet is composed of `a`, `p` and `n`: anchor, positive example and
    negative example respectively. The shape of all input variables should
    be :math:`(N, D)`.

    The distance swap is described in detail in the paper `Learning shallow
    convolutional feature descriptors with triplet losses`_ by V. Balntas,
    E. Riba et al.

    .. math::
        L(a, p, n) = \frac{1}{N} \left( \sum_{i=1}^N \max \{d(a_i, p_i) - d(a_i, n_i) + {\rm margin}, 0\} \right)

    where :math:`d(x_i, y_i) = \| {\bf x}_i - {\bf y}_i \|_p`.

    Args:
        margin (float, optional): margin added to the distance difference.
            Default: 1.0
        p (int, optional): the norm degree. Default: 2
        eps (float, optional): forwarded unchanged to
            `F.triplet_margin_loss`. Default: 1e-6
        swap (bool, optional): enables the distance swap from the paper
            referenced above. Default: False

    Shape:
        - Input: :math:`(N, D)` where `D = vector dimension`, for each of
          the `anchor`, `positive` and `negative` arguments of `forward`

    Examples::

        >>> triplet_loss = nn.TripletMarginLoss(margin=1.0, p=2)
        >>> input1 = autograd.Variable(torch.randn(100, 128))
        >>> input2 = autograd.Variable(torch.randn(100, 128))
        >>> input3 = autograd.Variable(torch.randn(100, 128))
        >>> output = triplet_loss(input1, input2, input3)
        >>> output.backward()

    .. _Learning shallow convolutional feature descriptors with triplet losses:
        http://www.iis.ee.ic.ac.uk/%7Evbalnt/shallow_descr/TFeat_paper.pdf
    """

    def __init__(self, margin=1.0, p=2, eps=1e-6, swap=False):
        super(TripletMarginLoss, self).__init__()
        self.swap = swap
        self.eps = eps
        self.p = p
        self.margin = margin

    def forward(self, anchor, positive, negative):
        # Stored hyper-parameters, in the positional order the functional
        # form is called with: margin, p, eps, swap.
        settings = (self.margin, self.p, self.eps, self.swap)
        return F.triplet_margin_loss(anchor, positive, negative, *settings)
# TODO: L1HingeEmbeddingCriterion
# TODO: MSECriterion weight
# TODO: ClassSimplexCriterion
```