Running the Jupyter Notebook on GPU¶
import torch

print(torch.__version__)              # PyTorch version
print(torch.version.cuda)             # CUDA version PyTorch was built against
print(torch.cuda.is_available())      # True if a CUDA-capable GPU is usable
print(torch.cuda.get_device_name(0))  # name of the first GPU
2.4.1
11.8
True
NVIDIA GeForce RTX 4090 Laptop GPU
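Beyond the version report, it is worth confirming that CUDA works end to end, not just that it is detected. A minimal sanity check (standard PyTorch calls, nothing specific to this notebook): allocate a tensor directly on the GPU and compute with it.

import torch

x = torch.randn(1000, 1000, device="cuda")  # allocate directly on the GPU
y = x @ x                                   # matrix multiply runs on the GPU
print(y.device)                             # cuda:0
print(f"{torch.cuda.memory_allocated() / 1024**2:.1f} MB allocated")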
GPUtil is a Python module for getting the GPU status from NVIDIA GPUs using nvidia-smi.¶
pip install GPUtil
Collecting GPUtil
  Downloading GPUtil-1.4.0.tar.gz (5.5 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: GPUtil
  Building wheel for GPUtil (setup.py): started
  Building wheel for GPUtil (setup.py): finished with status 'done'
  Created wheel for GPUtil: filename=GPUtil-1.4.0-py3-none-any.whl size=7401 sha256=232218d22e076a53c47b904dde4379881068093ebbfa96d86b248fb36e3f54d0
  Stored in directory: c:\users\abhis\appdata\local\pip\cache\wheels\ba\03\bb\7a97840eb54479b328672e15a536e49dc60da200fb21564d53
Successfully built GPUtil
Installing collected packages: GPUtil
Successfully installed GPUtil-1.4.0
Note: you may need to restart the kernel to use updated packages.
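Once installed, the quickest smoke test is GPUtil's built-in summary. The sketch below uses only GPUtil's documented helpers (showUtilization and getGPUs):

import GPUtil

GPUtil.showUtilization()  # one-line table: GPU id, load %, memory %

# Or read the fields programmatically
for gpu in GPUtil.getGPUs():
    print(gpu.id, gpu.name, f"{gpu.load * 100:.0f}% load, {gpu.memoryUsed:.0f}/{gpu.memoryTotal:.0f} MB")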
✅ GPU Stress Test & Benchmark Program (PyTorch)¶
This script:
Trains a simple CNN on a synthetic dataset (no downloads needed).
Runs on GPU (if available).
Prints training time per epoch and GPU memory usage (total throughput is derived in a follow-up cell below).
📈 What You'll See¶
Epoch-wise speed (in seconds).
Accuracy (the labels are random, so roughly 10% across 10 classes is expected).
Total GPU training time.
Memory usage and GPU load (optional with GPUtil).
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import time
import numpy as np
try:
    import GPUtil  # Optional, for GPU memory/load reporting
except ImportError:
    GPUtil = None
# ✅ Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# ✅ Create synthetic image dataset (28x28 like MNIST)
X = torch.randn(10000, 1, 28, 28)
y = torch.randint(0, 10, (10000,))
dataset = TensorDataset(X, y)
loader = DataLoader(dataset, batch_size=128, shuffle=True)
# ✅ Define a small CNN
class TestNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3)   # 28x28 -> 26x26
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3)  # 26x26 -> 24x24
        self.fc1 = nn.Linear(64 * 12 * 12, 128)        # 24x24 after 2x2 max-pool -> 12x12
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2)
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)
model = TestNet().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
# ✅ Train and time each epoch
epochs = 5
total_start = time.time()

for epoch in range(epochs):
    start = time.time()
    model.train()
    total_loss = 0
    correct = 0
    for inputs, labels in loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        pred = outputs.argmax(dim=1)
        correct += (pred == labels).sum().item()
    end = time.time()
    accuracy = correct / len(dataset)
    print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss:.2f} - Accuracy: {accuracy:.4f} - Time: {end - start:.2f}s")

total_end = time.time()
print(f"\n⏱️ Total Training Time: {total_end - total_start:.2f}s")
# ✅ (Optional) Show GPU memory stats
if GPUtil is not None:
    for gpu in GPUtil.getGPUs():
        print(f"\n📊 GPU Utilization Report for {gpu.name}")
        print(f"   Load       : {gpu.load * 100:.1f}%")
        print(f"   Memory Used: {gpu.memoryUsed} MB / {gpu.memoryTotal} MB")
        print(f"   Temp       : {gpu.temperature} °C")
else:
    print("Install GPUtil via: pip install gputil")
Using device: cuda
Epoch 1/5 - Loss: 182.64 - Accuracy: 0.0995 - Time: 1.88s
Epoch 2/5 - Loss: 181.89 - Accuracy: 0.1013 - Time: 0.77s
Epoch 3/5 - Loss: 181.76 - Accuracy: 0.1089 - Time: 0.79s
Epoch 4/5 - Loss: 180.32 - Accuracy: 0.1433 - Time: 0.79s
Epoch 5/5 - Loss: 173.15 - Accuracy: 0.2093 - Time: 0.80s

⏱️ Total Training Time: 5.03s

📊 GPU Utilization Report for NVIDIA GeForce RTX 4090 Laptop GPU
   Load       : 32.0%
   Memory Used: 1208.0 MB / 16376.0 MB
   Temp       : 49.0 °C
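The script above prints per-epoch timings but not the total throughput promised earlier; a small follow-up cell can derive it from the variables still in scope (epochs, dataset, total_start, total_end):

# Throughput in images/second, reusing variables from the training cell above
images_seen = epochs * len(dataset)  # 5 epochs x 10,000 images
elapsed = total_end - total_start
print(f"Throughput: {images_seen / elapsed:,.0f} images/s")  # ≈ 9,900 images/s for the 5.03 s run above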
Let's push your GPU harder by training a larger, deeper model on a real dataset: ResNet18 on CIFAR-10.¶
This test will:
Load CIFAR-10 from torchvision.datasets (50,000 training images, auto-downloaded).
Use ResNet18 (a popular, deeper CNN).
Train for 20 epochs to put sustained load on the GPU.
Show timing, GPU usage, and performance metrics.
🧠 What You’ll Get¶
Total training time per epoch (heavier load).
Accuracy improvement over time.
GPU memory and utilization stats.
Significant stress on your GPU (compared to synthetic tests).
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import time
try:
    import GPUtil  # Optional, for GPU stats
except ImportError:
    GPUtil = None
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🖥️ Using device: {device}")
# ✅ Load CIFAR-10 dataset (auto-downloads)
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))  # single value broadcasts across the 3 RGB channels
])
train_set = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
train_loader = torch.utils.data.DataLoader(train_set, batch_size=128, shuffle=True, num_workers=2)
# ✅ Build ResNet18 (randomly initialized) and adapt it to 10 classes
resnet18 = torchvision.models.resnet18(weights=None) # or use weights="DEFAULT" for pretrained
resnet18.fc = nn.Linear(resnet18.fc.in_features, 10)
model = resnet18.to(device)
# ✅ Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# ✅ Train for multiple epochs
epochs = 20
total_start = time.time()

for epoch in range(epochs):
    start = time.time()
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    end = time.time()
    accuracy = 100 * correct / total
    print(f"Epoch [{epoch + 1}/{epochs}] - Loss: {running_loss:.2f} - Accuracy: {accuracy:.2f}% - Time: {end - start:.2f}s")

total_end = time.time()
print(f"\n⏱️ Total Training Time: {total_end - total_start:.2f}s")
# ✅ GPU stats
if GPUtil is not None:
    for gpu in GPUtil.getGPUs():
        print(f"\n📊 GPU Stats for {gpu.name}")
        print(f"   Load       : {gpu.load * 100:.1f}%")
        print(f"   Memory Used: {gpu.memoryUsed} MB / {gpu.memoryTotal} MB")
        print(f"   Temp       : {gpu.temperature} °C")
else:
    print("Install GPUtil for GPU stats: pip install gputil")
🖥️ Using device: cuda
Files already downloaded and verified
Epoch [1/20] - Loss: 533.86 - Accuracy: 50.78% - Time: 36.46s
Epoch [2/20] - Loss: 376.90 - Accuracy: 65.90% - Time: 35.29s
Epoch [3/20] - Loss: 305.69 - Accuracy: 72.57% - Time: 36.22s
Epoch [4/20] - Loss: 257.21 - Accuracy: 76.83% - Time: 34.88s
Epoch [5/20] - Loss: 216.65 - Accuracy: 80.31% - Time: 36.07s
Epoch [6/20] - Loss: 179.43 - Accuracy: 83.93% - Time: 35.99s
Epoch [7/20] - Loss: 146.47 - Accuracy: 86.85% - Time: 36.28s
Epoch [8/20] - Loss: 120.00 - Accuracy: 89.07% - Time: 36.39s
Epoch [9/20] - Loss: 96.59 - Accuracy: 91.29% - Time: 36.93s
Epoch [10/20] - Loss: 79.86 - Accuracy: 92.80% - Time: 37.09s
Epoch [11/20] - Loss: 63.81 - Accuracy: 94.24% - Time: 35.98s
Epoch [12/20] - Loss: 54.07 - Accuracy: 95.24% - Time: 35.58s
Epoch [13/20] - Loss: 45.52 - Accuracy: 95.99% - Time: 38.45s
Epoch [14/20] - Loss: 43.47 - Accuracy: 96.04% - Time: 37.86s
Epoch [15/20] - Loss: 40.34 - Accuracy: 96.41% - Time: 35.80s
Epoch [16/20] - Loss: 34.26 - Accuracy: 96.90% - Time: 37.96s
Epoch [17/20] - Loss: 32.94 - Accuracy: 97.06% - Time: 35.31s
Epoch [18/20] - Loss: 32.16 - Accuracy: 97.11% - Time: 36.67s
Epoch [19/20] - Loss: 24.70 - Accuracy: 97.82% - Time: 35.57s
Epoch [20/20] - Loss: 27.82 - Accuracy: 97.56% - Time: 35.39s

⏱️ Total Training Time: 726.16s

📊 GPU Stats for NVIDIA GeForce RTX 4090 Laptop GPU
   Load       : 37.0%
   Memory Used: 1593.0 MB / 16376.0 MB
   Temp       : 51.0 °C
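One caveat on the report above: GPUtil queries nvidia-smi at call time, after training has already finished, so the load and memory figures reflect that moment rather than the peak of the run. For peak memory, PyTorch's own counters are more informative; a short sketch using the standard torch.cuda API:

# PyTorch tracks peak allocations per device since the last reset
print(f"Peak memory allocated: {torch.cuda.max_memory_allocated() / 1024**2:.0f} MB")
print(f"Peak memory reserved : {torch.cuda.max_memory_reserved() / 1024**2:.0f} MB")

torch.cuda.reset_peak_memory_stats()  # call before a run to measure just that run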