torch.Tensor.backward()
Calling backward() frees the computation graph of the loss (retain_graph defaults to False). Without this, the graph built for each micro-batch loss stays in memory and keeps accumulating. This is why, in gradient accumulation, we need to call backward() on every micro-batch loss rather than deferring it to the end.
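A minimal sketch of such a gradient-accumulation loop, assuming an illustrative model, optimizer, loss_fn, and accum_steps that are not from the original:

import torch
from torch import nn

model = nn.Linear(10, 1)                                   # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loss_fn = nn.MSELoss()
accum_steps = 4                                            # micro-batches per optimizer step

optimizer.zero_grad()
for step in range(8):
    x = torch.randn(16, 10)                                # micro-batch inputs
    y = torch.randn(16, 1)                                 # micro-batch targets
    loss = loss_fn(model(x), y) / accum_steps              # scale so accumulated grads average out
    # backward() here accumulates gradients into .grad and frees this
    # micro-batch's graph immediately, so graphs do not pile up in memory.
    loss.backward()
    if (step + 1) % accum_steps == 0:
        optimizer.step()
        optimizer.zero_grad()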
Custom autograd
import torch
from torch.autograd import Function

class CustomReLU(Function):
    @staticmethod
    def forward(ctx, input):
        # Save the input so backward() knows which positions were negative
        ctx.save_for_backward(input)
        return input.clamp(min=0)

    @staticmethod
    def backward(ctx, grad_output):
        input, = ctx.saved_tensors
        grad_input = grad_output.clone()
        grad_input[input < 0] = 0.1  # modified gradient: negative inputs get a constant 0.1 instead of 0
        return grad_input
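A quick usage sketch (assumed, not from the original) showing how the custom Function is invoked through .apply and what the modified backward returns:

x = torch.tensor([-1.0, 2.0, -3.0], requires_grad=True)
y = CustomReLU.apply(x)   # forward pass: tensor([0., 2., 0.])
y.sum().backward()
print(x.grad)             # tensor([0.1000, 1.0000, 0.1000]): negative positions receive 0.1, not 0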