-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkeep_gpu_busy.py
70 lines (57 loc) · 2.35 KB
/
keep_gpu_busy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import subprocess
import torch
import time
def get_gpu_utilization():
    """
    Get the current per-GPU utilization by shelling out to ``nvidia-smi``.

    Returns:
        list[int]: One utilization percentage (0-100) per GPU, in the order
        nvidia-smi reports them. Falls back to all zeros (one entry per
        device torch can see) if the query fails for any reason, so callers
        can always index the result safely.
    """
    try:
        result = subprocess.check_output(['nvidia-smi', '--query-gpu=utilization.gpu', '--format=csv,noheader,nounits'])
        utilization = result.decode('utf-8').strip().split('\n')
        utilization = [int(x) for x in utilization]
        return utilization
    except (subprocess.CalledProcessError, FileNotFoundError, ValueError) as e:
        # CalledProcessError: nvidia-smi ran but exited non-zero.
        # FileNotFoundError: nvidia-smi binary is not installed / not on PATH.
        # ValueError: a field was not a plain integer (e.g. "[N/A]").
        # In every case degrade gracefully to "idle" readings instead of crashing.
        print("Error querying GPU utilization:", e)
        return [0] * torch.cuda.device_count()
def gpu_stress_test(size, iterations, interval, gpu_id):
    """
    Drive one GPU with a compute-heavy matmul workload.

    Args:
        size (int): Side length of the square matrices multiplied.
        iterations (int): Number of outer stress rounds to run.
        interval (float): Seconds to sleep between rounds.
        gpu_id (int): Index of the CUDA device to load.
    """
    # Route all subsequent CUDA work to the requested device.
    torch.cuda.set_device(gpu_id)
    device = f'cuda:{gpu_id}'
    for _ in range(iterations):
        lhs = torch.randn(size, size, device=device)
        rhs = torch.randn(size, size, device=device)
        product = lhs @ rhs  # Matrix multiplication
        # Chain extra matmuls plus trig ops to keep the device saturated.
        for _ in range(10):
            product = product @ rhs
            product = product.sin()  # trigonometric operation for added complexity
            product = product.cos()
        time.sleep(interval)
def keep_gpu_busy(interval=1, threshold=10):
    """
    Poll GPU utilization forever and run dummy work on any idle device.

    Keeps every GPU's utilization above ``threshold`` by launching a stress
    workload whenever a device's reported usage drops below it.

    Args:
        interval (int): Seconds to wait between polling rounds.
        threshold (int): Utilization percentage below which a GPU is
            considered idle and gets stressed.

    Raises:
        RuntimeError: If no CUDA-capable device is available.
    """
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is not available. Please ensure you have a GPU with CUDA support.")
    num_gpus = torch.cuda.device_count()
    while True:
        usage = get_gpu_utilization()
        print(f"Current GPU utilization: {usage}%")
        # Stress every device whose reported utilization fell under the bar.
        for gpu in range(num_gpus):
            if usage[gpu] < threshold:
                gpu_stress_test(3600, 30, 0.05, gpu)
        # Pause before polling utilization again.
        time.sleep(interval)
if __name__ == "__main__":
    # Defaults: poll every second, keep each GPU above 10% utilization.
    keep_gpu_busy(1, 10)