import torch
from . import _functional as F
from .optimizer import Optimizer


class SparseAdam(Optimizer):
    r"""Implements a lazy version of the Adam algorithm suitable for sparse tensors.

    In this variant, only moments that show up in the gradient get updated, and
    only those portions of the gradient get applied to the parameters.

    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of the gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8):
        if not 0.0 < lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 < eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))

        params = list(params)

        # SparseAdam requires dense parameter tensors (only the *gradients* may be
        # sparse), so reject any parameter that is itself a sparse tensor.
        sparse_params = []
        for index, param in enumerate(params):
            if isinstance(param, dict):
                for d_index, d_param in enumerate(param.get("params", [])):
                    if d_param.is_sparse:
                        sparse_params.append([index, d_index])
            elif param.is_sparse:
                sparse_params.append(index)
        if sparse_params:
            raise ValueError(
                f"Sparse params at indices {sparse_params}: SparseAdam requires dense parameter tensors"
            )

        defaults = dict(lr=lr, betas=betas, eps=eps)
        super(SparseAdam, self).__init__(params, defaults)
    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Args:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            params_with_grad = []
            grads = []
            exp_avgs = []
            exp_avg_sqs = []
            state_steps = []
            eps = group['eps']
            lr = group['lr']
            beta1, beta2 = group['betas']

            for p in group['params']:
                if p.grad is not None:
                    params_with_grad.append(p)
                    if not p.grad.is_sparse:
                        raise RuntimeError('SparseAdam does not support dense gradients, please consider Adam instead')
                    grads.append(p.grad)

                    state = self.state[p]

                    # State initialization
                    if len(state) == 0:
                        state['step'] = 0
                        # Exponential moving average of gradient values
                        state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                        # Exponential moving average of squared gradient values
                        state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)

                    exp_avgs.append(state['exp_avg'])
                    exp_avg_sqs.append(state['exp_avg_sq'])

                    # Update the step count for this parameter and record it
                    state['step'] += 1
                    state_steps.append(state['step'])

            F.sparse_adam(params_with_grad,
                          grads,
                          exp_avgs,
                          exp_avg_sqs,
                          state_steps,
                          beta1=beta1,
                          beta2=beta2,
                          lr=lr,
                          eps=eps)

        return loss
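A minimal usage sketch follows, assuming the sparse gradients come from an embedding layer built with ``sparse=True``; the layer sizes, the batch of indices, and the toy objective are illustrative assumptions, not part of the module above.

# --- Usage sketch (illustrative, not part of the module source) ---
import torch
import torch.nn as nn

# An nn.Embedding created with sparse=True yields a sparse gradient for its
# weight on backward, which is what SparseAdam expects; the parameter itself
# remains a dense tensor, as required by __init__ above.
embedding = nn.Embedding(num_embeddings=1000, embedding_dim=16, sparse=True)
optimizer = torch.optim.SparseAdam(embedding.parameters(), lr=1e-3)

indices = torch.randint(0, 1000, (32,))   # made-up batch of lookup indices
optimizer.zero_grad()
loss = embedding(indices).pow(2).sum()    # toy objective, purely for illustration
loss.backward()                           # embedding.weight.grad is a sparse tensor
optimizer.step()                          # only the rows touched by `indices` have
                                          # their moments and values updated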