import torch

# Module-level imports needed by this snippet. In the PyTorch source tree these
# are relative imports inside torch.optim; absolute paths are used here so the
# excerpt stands on its own.
from torch.optim import _functional as F
from torch.optim.optimizer import Optimizer


class Adadelta(Optimizer):
    r"""Implements Adadelta algorithm.

    .. math::
       \begin{aligned}
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{input}      : \gamma \text{ (lr)}, \: \theta_0 \text{ (params)},
                \: f(\theta) \text{ (objective)}, \: \rho \text{ (decay)},
                \: \lambda \text{ (weight decay)}                                                \\
            &\textbf{initialize} : v_0 \leftarrow 0 \: \text{ (square avg)},
                \: u_0 \leftarrow 0 \: \text{ (accumulate variables)}                     \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do}                         \\
            &\hspace{5mm}g_t \leftarrow \nabla_{\theta} f_t (\theta_{t-1})                       \\
            &\hspace{5mm}\textbf{if} \: \lambda \neq 0                                           \\
            &\hspace{10mm} g_t \leftarrow g_t + \lambda \theta_{t-1}                             \\
            &\hspace{5mm} v_t \leftarrow v_{t-1} \rho + g^2_t (1 - \rho)                         \\
            &\hspace{5mm}\Delta x_t \leftarrow \frac{\sqrt{u_{t-1} + \epsilon}}
                {\sqrt{v_t + \epsilon}} g_t \hspace{21mm}                                        \\
            &\hspace{5mm} u_t \leftarrow u_{t-1} \rho + \Delta x^2_t (1 - \rho)                  \\
            &\hspace{5mm}\theta_t \leftarrow \theta_{t-1} - \gamma \Delta x_t                    \\
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
            &\bf{return} \: \theta_t                                                      \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
       \end{aligned}

    For further details regarding the algorithm we refer to
    `ADADELTA: An Adaptive Learning Rate Method`_.

    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        rho (float, optional): coefficient used for computing a running average
            of squared gradients (default: 0.9)
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-6)
        lr (float, optional): coefficient that scales delta before it is
            applied to the parameters (default: 1.0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)

    .. _ADADELTA\: An Adaptive Learning Rate Method:
        https://arxiv.org/abs/1212.5701
    """

    def __init__(self, params, lr=1.0, rho=0.9, eps=1e-6, weight_decay=0):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= rho <= 1.0:
            raise ValueError("Invalid rho value: {}".format(rho))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, rho=rho, eps=eps, weight_decay=weight_decay)
        super(Adadelta, self).__init__(params, defaults)
    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Args:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            params_with_grad = []
            grads = []
            square_avgs = []
            acc_deltas = []
            lr, rho, eps, weight_decay = group['lr'], group['rho'], group['eps'], group['weight_decay']

            for p in group['params']:
                if p.grad is None:
                    continue
                params_with_grad.append(p)
                if p.grad.is_sparse:
                    raise RuntimeError('Adadelta does not support sparse gradients')
                grads.append(p.grad)

                state = self.state[p]

                # Lazy state initialization
                if len(state) == 0:
                    state['step'] = 0
                    state['square_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    state['acc_delta'] = torch.zeros_like(p, memory_format=torch.preserve_format)

                square_avgs.append(state['square_avg'])
                acc_deltas.append(state['acc_delta'])

                state['step'] += 1

            F.adadelta(params_with_grad,
                       grads,
                       square_avgs,
                       acc_deltas,
                       lr=lr,
                       rho=rho,
                       eps=eps,
                       weight_decay=weight_decay)

        return loss
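A minimal usage sketch follows. The toy model, data, and loss below (model, inputs, targets, loss_fn) are illustrative assumptions, not part of this module; it shows both the plain step() call and the optional closure form that step() runs under torch.enable_grad().

import torch
import torch.nn as nn

# Hypothetical model and data, for illustration only.
model = nn.Linear(10, 2)
inputs, targets = torch.randn(8, 10), torch.randn(8, 2)
loss_fn = nn.MSELoss()

optimizer = torch.optim.Adadelta(model.parameters(), lr=1.0, rho=0.9,
                                 eps=1e-6, weight_decay=0)

# Plain step: compute the loss, backpropagate, then update the parameters.
optimizer.zero_grad()
loss_fn(model(inputs), targets).backward()
optimizer.step()

# Closure form: step() re-evaluates the model via the closure and returns
# the loss it produced.
def closure():
    optimizer.zero_grad()
    loss = loss_fn(model(inputs), targets)
    loss.backward()
    return loss

loss = optimizer.step(closure)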
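For clarity, the update from the docstring can also be written out on a single tensor with plain tensor operations. This is only a sketch of the math, not the code path the optimizer takes (step() delegates the actual work to F.adadelta over lists of tensors), and the tensor names are illustrative.

import torch

lr, rho, eps, weight_decay = 1.0, 0.9, 1e-6, 0.0

param = torch.randn(3)
grad = torch.randn(3)
square_avg = torch.zeros_like(param)   # v: running average of squared gradients
acc_delta = torch.zeros_like(param)    # u: running average of squared updates

if weight_decay != 0:
    grad = grad + weight_decay * param                       # g_t <- g_t + lambda * theta_{t-1}
square_avg = rho * square_avg + (1 - rho) * grad * grad      # v_t <- rho * v_{t-1} + (1 - rho) * g_t^2
delta = (acc_delta + eps).sqrt() / (square_avg + eps).sqrt() * grad   # Delta x_t
acc_delta = rho * acc_delta + (1 - rho) * delta * delta      # u_t <- rho * u_{t-1} + (1 - rho) * Delta x_t^2
param = param - lr * delta                                   # theta_t <- theta_{t-1} - gamma * Delta x_t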