from olympus.utils.stat import StatStream
from collections import OrderedDict
import time
# Names of the metrics sampled by AmdGpuMonitor, in sampling order.
# AmdGpuMonitor.run walks this list on every sweep and stores the i-th
# metric in AmdGpuMonitor.streams[i], so the position of each entry is
# also the index of its stream.
# 'memory.used' comes first so that the derived memory metrics computed
# later in the same sweep use the value it just sampled.
metrics = [
'memory.used',
'memory.free',
'memory.total',
'temperature.gpu',
'utilization.gpu',
'utilization.memory',
]
class AmdGpuMonitor:
    """Sample AMD GPU statistics from sysfs / KFD into one StatStream per metric.

    The ROCm smi utility is too basic for this use case, so the values are
    read directly from the sysfs / KFD files instead.

    Known limitation: the KFD node number is not guaranteed to stay the same
    across reboots, and the paths below are hard-coded to KFD node 1 and
    hwmon0 (written for a single-GPU machine).  ``device_id`` is stored but
    is not used to select those paths.
    """

    def __init__(self, loop_interval, device_id):
        """Set up the streams, the sysfs paths and the metric dispatch table.

        :param loop_interval: delay between two sampling sweeps, in
            milliseconds (``run`` divides it by 1000 before sleeping)
        :param device_id: identifier of the GPU; stored for reference only,
            the file paths are currently hard-coded
        :raises OSError: if the KFD memory properties file cannot be read
        """
        # One stream per entry of the module-level ``metrics`` list, same order.
        self.streams = [StatStream(drop_first_obs=2) for _ in metrics]
        self.n = len(metrics)
        self.running = True
        self.sleep_time = loop_interval
        self.device_id = device_id

        # this requires sudo
        self.powerprefix = '/sys/kernel/debug/dri/'
        self.drmprefix = '/sys/class/drm'
        self.hwmonprefix = '/sys/class/hwmon'
        self.kfdprefix = '/sys/devices/virtual/kfd'

        self.file_names = {
            'memory': self.kfdprefix + '/kfd/topology/nodes/1/mem_banks/0/used_memory',
            'memory_property': self.kfdprefix + '/kfd/topology/nodes/1/mem_banks/0/properties',
            'gpu_usage': self.hwmonprefix + '/hwmon0/device/gpu_busy_percent',
            'temperature': self.hwmonprefix + '/hwmon0/temp1_input',
        }

        # opened file cache: logical name -> open file object (see read_value)
        self.files = {}

        self.total_memory = self.parse_memory_props()
        # Last value sampled by parse_memory_used; None until the first sample.
        self.used_memory = None

        # Metric name -> zero-argument callable returning the current value.
        self.metrics = OrderedDict({
            # 'name': self.parse_name,
            'temperature.gpu': self.parse_temperature,
            'utilization.gpu': self.parse_gpu_usage,
            'utilization.memory': self.parse_memory_usage,
            'memory.total': self.parse_memory_total,
            'memory.free': self.parse_memory_free,
            'memory.used': self.parse_memory_used
        })

    def read_props(self, file):
        """Parse a ``key value`` per-line properties file into a dict of strings.

        Blank lines are skipped and surrounding whitespace (including the
        trailing newline) is stripped from both keys and values.

        :param file: path of the properties file
        :return: dict mapping each key to its value, both as ``str``
        """
        props = {}
        with open(file, 'r') as f:
            for line in f:
                # partition never raises, unlike unpacking split(' ') which
                # fails on blank lines or runs of several spaces.
                key, _, val = line.strip().partition(' ')
                if not key:
                    continue
                props[key] = val.strip()
        return props

    def parse_memory_props(self):
        """Return the ``size_in_bytes`` property of the memory bank divided by 1024 * 1024."""
        props = self.read_props(self.file_names['memory_property'])
        return int(props['size_in_bytes']) / (1024 * 1024)

    def read_value(self, file_name):
        """Return the first line of the file registered under ``file_name``.

        The file is opened on first use and the handle is kept in
        ``self.files``; later calls rewind and re-read the same handle
        instead of opening a new descriptor for every sample.

        :param file_name: key of ``self.file_names``
        :return: the first line of the file, trailing newline included
        """
        handle = self.files.get(file_name)
        if handle is None or handle.closed:
            handle = open(self.file_names[file_name], 'r')
            # Store the handle so it is reused and can be closed later.
            self.files[file_name] = handle

        handle.seek(0)
        return handle.readline()

    def parse_temperature(self):
        """Return the content of the hwmon ``temp1_input`` file as an int.

        NOTE(review): the value is returned unscaled; the kernel hwmon
        documentation describes ``temp*_input`` as millidegrees Celsius --
        confirm whether consumers expect degrees.
        """
        return int(self.read_value('temperature'))

    def parse_gpu_usage(self):
        """Return the content of the ``gpu_busy_percent`` file as a float."""
        return float(self.read_value('gpu_usage'))

    def parse_memory_used(self):
        """Sample used memory, cache it in ``self.used_memory`` and return it.

        The raw value is divided by 1024 * 1024, the same scaling that is
        applied to the total memory.
        """
        self.used_memory = int(self.read_value('memory')) / (1024 * 1024)
        return self.used_memory

    def _last_used_memory(self):
        """Return the cached used-memory sample, taking one if none exists yet."""
        if self.used_memory is None:
            return self.parse_memory_used()
        return self.used_memory

    def parse_memory_usage(self):
        """Return used memory as a percentage of total memory.

        Uses the value cached by the last ``parse_memory_used`` call and
        only reads the file if no sample has been taken yet.
        """
        return self._last_used_memory() * 100.0 / self.total_memory

    def parse_memory_total(self):
        """Return the total memory computed once at construction time."""
        return self.total_memory

    def parse_memory_free(self):
        """Return total memory minus the last used-memory sample.

        Uses the value cached by the last ``parse_memory_used`` call and
        only reads the file if no sample has been taken yet.
        """
        return self.total_memory - self._last_used_memory()

    def parse_name(self):
        """Return the device name; not implemented, always an empty string."""
        return ''

    def run(self):
        """Sample every metric in a loop until ``stop`` is called.

        Each sweep evaluates the metrics in the order of the module-level
        ``metrics`` list, pushes each value into the stream at the same
        index, then sleeps for ``sleep_time`` milliseconds.  Cached file
        handles are closed when the loop exits, normally or through an
        exception.
        """
        try:
            while self.running:
                for stream, metric in zip(self.streams, metrics):
                    stream.update(self.metrics[metric]())

                time.sleep(self.sleep_time / 1000)
        finally:
            self.close()

    def close(self):
        """Close every cached file handle and empty the cache.

        Safe to call more than once; ``read_value`` reopens files on demand.
        """
        handles = list(self.files.values())
        self.files.clear()
        for handle in handles:
            handle.close()

    def stop(self):
        """Ask ``run`` to exit after the sweep currently in progress."""
        self.running = False