PyTorch Memory

Creator
Seonglae Cho
Created
2025 Feb 10 11:56
Edited
2025 Feb 10 12:16
allocated = torch.cuda.memory_allocated()
reserved = torch.cuda.memory_reserved()
print(f"Allocated: {allocated/1024**2:.2f} MB, Reserved: {reserved/1024**2:.2f} MB")
 
torch.cuda.empty_cache()
releases cached (reserved but unused) memory back to the GPU driver; it does not free memory still held by live tensors, so allocated memory stays the same.
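A minimal sketch of the difference (assuming a CUDA device is available): deleting the tensor lowers allocated memory, while reserved memory only shrinks after empty_cache().

import torch

def mb(n_bytes: int) -> float:
    return n_bytes / 1024**2

x = torch.randn(1024, 1024, device="cuda")  # ~4 MB fp32 tensor
print(f"allocated={mb(torch.cuda.memory_allocated()):.1f} MB, reserved={mb(torch.cuda.memory_reserved()):.1f} MB")

del x  # tensor freed -> allocated drops, block stays cached in the allocator
print(f"after del: allocated={mb(torch.cuda.memory_allocated()):.1f} MB, reserved={mb(torch.cuda.memory_reserved()):.1f} MB")

torch.cuda.empty_cache()  # cached blocks returned to the driver -> reserved drops
print(f"after empty_cache: allocated={mb(torch.cuda.memory_allocated()):.1f} MB, reserved={mb(torch.cuda.memory_reserved()):.1f} MB")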
 
print(torch.cuda.memory_summary(device=torch.cuda.current_device(), abbreviated=False))
  • CUDA OOMs: Count of out-of-memory errors on the GPU.
  • cudaMalloc retries: Number of times a cudaMalloc call failed and was retried after the allocator freed cached blocks.
  • Allocated memory:
    • Cur Usage: Currently allocated bytes by tensors.
    • Peak Usage: Maximum allocated bytes recorded.
    • Tot Alloc / Tot Freed: Total bytes allocated/freed over time.
    • from large/small pool: Breakdown for large vs. small block allocations.
  • Active memory: Memory currently held by active tensors (similar stats as Allocated memory).
  • Requested memory: Bytes actually requested by tensors; allocated memory can be slightly larger because the allocator rounds requests up to block sizes.
  • GPU reserved memory: Total GPU memory reserved by PyTorch’s caching allocator (includes both used and cached/free blocks).
  • Non-releasable memory: Memory that, due to fragmentation or caching, cannot be returned to the GPU immediately.
  • Allocations: Count of all allocation events (total and split into large/small pool allocations).
  • Active allocs: Count of currently active allocation events.
  • GPU reserved segments: Number of contiguous memory chunks reserved from the GPU (split into large and small segments).
  • Non-releasable allocs: Count of allocations that remain in memory and cannot be freed immediately.
  • Oversize allocations / Oversize GPU segments: Allocations (and their corresponding memory chunks) that were too large to fit in the regular caching pools and thus were allocated separately.
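The same counters are also available programmatically through torch.cuda.memory_stats(), which returns a flat dict keyed by stat name, pool, and metric. A short sketch (the exact key strings come from the current caching-allocator implementation and may change between releases):

import torch

stats = torch.cuda.memory_stats()
print(stats["allocated_bytes.all.current"])  # Cur Usage of Allocated memory
print(stats["allocated_bytes.all.peak"])     # Peak Usage of Allocated memory
print(stats["reserved_bytes.all.current"])   # GPU reserved memory
print(stats["num_alloc_retries"])            # cudaMalloc retries
print(stats["num_ooms"])                     # CUDA OOMs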
 
 

Memory snapshot

  • Active Memory Timeline
  • Allocator State History
  • Active Cached Segment Timeline
  • Allocator Settings
Allocator State History
import logging
import socket
from datetime import datetime

import torch

logger: logging.Logger = logging.getLogger(__name__)

MAX_NUM_OF_MEM_EVENTS_PER_SNAPSHOT: int = 100000
TIME_FORMAT_STR: str = "%b_%d_%H_%M_%S"

def start_record_memory_history() -> None:
    if not torch.cuda.is_available():
        logger.info("CUDA unavailable. Not recording memory history")
        return

    logger.info("Starting snapshot record_memory_history")
    torch.cuda.memory._record_memory_history(
        max_entries=MAX_NUM_OF_MEM_OF_MEM_EVENTS_PER_SNAPSHOT if False else MAX_NUM_OF_MEM_EVENTS_PER_SNAPSHOT
    )

def stop_record_memory_history() -> None:
    if not torch.cuda.is_available():
        logger.info("CUDA unavailable. Not recording memory history")
        return

    logger.info("Stopping snapshot record_memory_history")
    # Passing enabled=None disables further recording of allocator events.
    torch.cuda.memory._record_memory_history(enabled=None)

def export_memory_snapshot() -> None:
    if not torch.cuda.is_available():
        logger.info("CUDA unavailable. Not exporting memory snapshot")
        return

    # Prefix for file names.
    host_name = socket.gethostname()
    timestamp = datetime.now().strftime(TIME_FORMAT_STR)
    file_prefix = f"{host_name}_{timestamp}"

    try:
        logger.info(f"Saving snapshot to local file: {file_prefix}.pickle")
        torch.cuda.memory._dump_snapshot(f"{file_prefix}.pickle")
    except Exception as e:
        logger.error(f"Failed to capture memory snapshot {e}")
        return
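A usage sketch of the helpers above (train() is a hypothetical placeholder for whatever workload you want to inspect): record allocator history around the region of interest, dump the snapshot, then stop recording.

def main() -> None:
    start_record_memory_history()   # begin recording allocator events
    train()                         # hypothetical workload whose memory should be captured
    export_memory_snapshot()        # writes <host>_<timestamp>.pickle for the visualizer
    stop_record_memory_history()    # stop recording allocator events

if __name__ == "__main__":
    main()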

Memory snapshot with profiler

# (c) Meta Platforms, Inc. and affiliates.
import logging
import socket
from datetime import datetime, timedelta

import torch
from torch.autograd.profiler import record_function
from torchvision import models

logging.basicConfig(
    format="%(levelname)s:%(asctime)s %(message)s",
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger: logging.Logger = logging.getLogger(__name__)
logger.setLevel(level=logging.INFO)

TIME_FORMAT_STR: str = "%b_%d_%H_%M_%S"

def trace_handler(prof: torch.profiler.profile):
    # Prefix for file names.
    host_name = socket.gethostname()
    timestamp = datetime.now().strftime(TIME_FORMAT_STR)
    file_prefix = f"{host_name}_{timestamp}"

    # Construct the trace file.
    prof.export_chrome_trace(f"{file_prefix}.json.gz")

    # Construct the memory timeline file.
    prof.export_memory_timeline(f"{file_prefix}.html", device="cuda:0")

def run_resnet50(num_iters=5, device="cuda:0"):
    model = models.resnet50().to(device=device)
    inputs = torch.randn(1, 3, 224, 224, device=device)
    labels = torch.rand_like(model(inputs))
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)
    loss_fn = torch.nn.CrossEntropyLoss()

    with torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ],
        schedule=torch.profiler.schedule(wait=0, warmup=0, active=6, repeat=1),
        record_shapes=True,
        profile_memory=True,
        with_stack=True,
        on_trace_ready=trace_handler,
    ) as prof:
        for _ in range(num_iters):
            prof.step()
            with record_function("## forward ##"):
                pred = model(inputs)
            with record_function("## backward ##"):
                loss_fn(pred, labels).backward()
            with record_function("## optimizer ##"):
                optimizer.step()
                optimizer.zero_grad(set_to_none=True)

if __name__ == "__main__":
    # Warm up
    run_resnet50()
    # Run the resnet50 model
    run_resnet50()
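For a lighter-weight check between full profiler runs, the peak-memory counters can be reset and read around a single iteration. A minimal sketch (step() is a hypothetical stand-in for one forward/backward/optimizer iteration like the loop body above):

import torch

torch.cuda.reset_peak_memory_stats()        # zero the peak counters for the current device
step()                                      # hypothetical: one training iteration
peak = torch.cuda.max_memory_allocated()    # highest allocated bytes since the reset
print(f"Peak allocated during step: {peak/1024**2:.2f} MB")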
 
 

Memory exported visualizer (hosted)

The exported .pickle snapshot can be dragged and dropped into the hosted viewer at https://pytorch.org/memory_viz, which renders the Active Memory Timeline, Allocator State History, Active Cached Segment Timeline, and Allocator Settings views listed above. Snapshots can also be rendered to HTML offline with the _memory_viz script shipped with PyTorch.
 

Recommendations