Gradient accumulation enables training with a virtually larger batch size by accumulating gradients over several iterations before each weight update, which can improve training stability and model quality. It is useful for training large-scale models in NLP or vision, where memory constraints limit the batch size that fits on a device.
It is also good from a model-generalization standpoint, and it helps that the weights are not updated on every mini-batch.
Be careful, though: if the effective batch is made too large, training can get stuck in a local minimum.
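For a concrete sense of scale, here is a tiny sketch of the arithmetic. The linear learning-rate scaling shown is a common heuristic from large-batch training practice, not something gradient accumulation itself requires, and the numbers mirror the example below.

```python
# Effective batch size when accumulating gradients over several mini-batches.
per_step_batch_size = 32
accumulation_steps = 4
effective_batch_size = per_step_batch_size * accumulation_steps  # 128

# Common heuristic (an assumption, not a rule): scale the learning rate
# roughly linearly with the effective batch size, then re-tune.
base_lr = 0.001
scaled_lr = base_lr * accumulation_steps
print(effective_batch_size, scaled_lr)  # 128 0.004
```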
```python
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Dummy dataset: 1000 samples, 10 features, scalar regression target
x, y = torch.randn(1000, 10), torch.randn(1000, 1)
train_loader = DataLoader(TensorDataset(x, y), batch_size=32, shuffle=True)

# Simple model
model = nn.Linear(10, 1)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

accumulation_steps = 4  # weights are updated once every 4 mini-batches

for epoch in range(5):
    for i, (inputs, labels) in enumerate(train_loader):
        # Divide by accumulation_steps so the summed gradients approximate
        # the average over the effective (4x larger) batch
        loss = criterion(model(inputs), labels) / accumulation_steps
        loss.backward()  # gradients accumulate in .grad between optimizer steps
        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
    # Flush any leftover gradients if the number of batches is not a multiple
    # of accumulation_steps (not strictly needed here: 32 batches % 4 == 0)
    if (i + 1) % accumulation_steps != 0:
        optimizer.step()
        optimizer.zero_grad()
    # Report the last mini-batch's loss, undoing the accumulation scaling
    print(f"Epoch {epoch+1}, Loss: {loss.item() * accumulation_steps}")

print("Training completed")
```
For NLP, sequence-length normalization matters: when accumulated batches contain different numbers of tokens, dividing each batch's mean loss by accumulation_steps over-weights short batches, whereas normalizing by the total token count across the whole accumulation window keeps the gradient a true per-token average.
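Below is a minimal sketch of one way to do that token-count normalization. The tiny "language model", vocab_size, pad_id, and the random variable-length batches are all illustrative assumptions; only the pattern matters: sum the per-token losses, track the token count, and divide the accumulated gradients once per window.

```python
import torch
import torch.nn as nn
import torch.optim as optim

vocab_size, pad_id, accumulation_steps = 100, 0, 4
model = nn.Sequential(nn.Embedding(vocab_size, 32), nn.Linear(32, vocab_size))
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# Sum (not average) the per-token losses so we can normalize once per window
criterion = nn.CrossEntropyLoss(ignore_index=pad_id, reduction="sum")

def fake_batch():
    # Random token ids with a variable sequence length; id 0 is treated as padding
    seq_len = torch.randint(5, 30, (1,)).item()
    return torch.randint(1, vocab_size, (8, seq_len))

total_tokens = 0
optimizer.zero_grad()
for step in range(32):
    tokens = fake_batch()
    inputs, targets = tokens[:, :-1], tokens[:, 1:]
    logits = model(inputs)  # (batch, seq_len - 1, vocab_size)
    loss = criterion(logits.reshape(-1, vocab_size), targets.reshape(-1))
    loss.backward()  # gradients now hold the *summed* per-token loss
    total_tokens += (targets != pad_id).sum().item()

    if (step + 1) % accumulation_steps == 0:
        # Normalize by the real token count of the whole window, so short
        # batches are not over-weighted relative to long ones
        for p in model.parameters():
            if p.grad is not None:
                p.grad.div_(total_tokens)
        optimizer.step()
        optimizer.zero_grad()
        total_tokens = 0
```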