Source code for olympus.utils.gpu.nvidia

import subprocess
import torch.cuda

from olympus.utils.stat import StatStream


# Executable that is spawned to sample GPU statistics.
nvidia_smi = 'nvidia-smi'

# Fields requested from nvidia-smi, in output-column order.
# `index` has to stay first: NvGpuMonitor.parse reads the first column as the
# GPU index and maps the remaining columns onto `metrics[1:]`.
metrics = [
    'index',
    'temperature.gpu',
    'utilization.gpu',
    'utilization.memory',
    'memory.total',
    'memory.free',
    'memory.used',
]

# Command-line flag listing every requested field, comma separated.
query = '--query-gpu={}'.format(','.join(metrics))


class NvGpuMonitor:
    """Collect GPU statistics by parsing the CSV output of ``nvidia-smi``.

    ``run`` spawns ``nvidia-smi`` in loop mode and feeds every output line to
    ``parse``; ``stop`` ends the loop.  Values are accumulated three ways:

    * ``overall`` -- one accumulator per metric, fed by every GPU;
    * ``streams`` -- one accumulator per metric and per GPU;
    * ``ts`` -- the raw float samples, keyed by ``'<metric><gpu index>'``.

    Parameters
    ----------
    loop_interval:
        Forwarded to ``nvidia-smi`` as ``--loop-ms``.
    device_id:
        Forwarded to ``nvidia-smi`` as ``--id``.
    """

    def __init__(self, loop_interval, device_id):
        self.options = [
            '--format=csv',
            '--loop-ms=' + str(loop_interval),
            '--id=' + str(device_id),
        ]
        # Number of CSV columns expected on every line.
        self.n = len(metrics)
        self.process = None
        self.running = True
        self.n_gpu = torch.cuda.device_count()

        # Handler used for each column, looked up by metric name.
        self.dispatcher = {
            'name': self.process_ignore,
            'temperature.gpu': self.process_value,
            'utilization.gpu': self.process_percentage,
            'utilization.memory': self.process_percentage,
            'memory.total': self.process_memory,
            'memory.free': self.process_memory,
            'memory.used': self.process_memory
        }

        # All GPUs
        self.overall = {
            k: StatStream(drop_first_obs=2) for k in metrics[1:]
        }

        # Per GPU.  Each slot needs its own accumulator: `[obj] * n` would
        # repeat a single object and every GPU would share the same stats.
        self.streams = {
            k: [StatStream(drop_first_obs=2) for _ in range(self.n_gpu)]
            for k in metrics[1:]
        }

        # Raw samples, one list per (metric, GPU) pair.
        self.ts = {
            f'{k}{i}': []
            for k in metrics[1:]
            for i in range(self.n_gpu)
        }

    def to_json(self, overall=True, extended=False):
        """Return the all-GPU statistics as a dictionary keyed by metric name.

        Parameters
        ----------
        overall:
            Unused; kept so existing callers keep working.
        extended:
            When true each value is the accumulator's ``to_json()`` result,
            otherwise it is the accumulator's ``avg`` attribute.
        """
        if extended:
            return {k: item.to_json() for k, item in self.overall.items()}

        return {k: item.avg for k, item in self.overall.items()}

    def metrics(self):
        """Return the per-GPU accumulators (``{metric: [one per GPU]}``)."""
        return self.streams

    def run(self):
        """Spawn ``nvidia-smi`` and parse its output until stopped.

        The first output line is skipped (it is never passed to ``parse``).
        The loop ends when ``stop`` clears ``self.running`` or when the
        process closes its output.  Failures are reported on stdout instead
        of being raised, so a missing ``nvidia-smi`` does not take the caller
        down.
        """
        try:
            with subprocess.Popen([nvidia_smi, query] + self.options,
                                  stdout=subprocess.PIPE, bufsize=1) as proc:
                self.process = proc
                count = 0

                while self.running:
                    line = proc.stdout.readline()

                    # An empty read means EOF (blank lines still carry their
                    # newline).  Without this check the loop would spin
                    # forever once nvidia-smi has exited.
                    if not line:
                        break

                    if count > 0:
                        self.parse(line.decode('UTF-8').strip())

                    count += 1

                proc.kill()
        except Exception as err:
            # Best effort: monitoring is optional, but say why it stopped.
            print('GPU monitoring stopped: {}'.format(err))

    def parse(self, line):
        """Parse one CSV line and dispatch every field to its handler.

        The first field is read as the GPU index; the remaining fields are
        matched positionally against ``metrics[1:]``.  Empty lines are
        ignored and lines with an unexpected number of fields are reported
        and skipped.
        """
        if line == '':
            return

        elems = line.split(',')

        if len(elems) != self.n:
            print('Error line mismatch {} != {} with \n -> `{}`'.format(len(elems), self.n, line))
            return

        gpu_index = int(elems[0])
        gpu_data = elems[1:]

        for index, value in enumerate(gpu_data):
            metric_name = metrics[index + 1]
            self.dispatcher[metric_name](metric_name, gpu_index, value)

    def _process_with_unit(self, metric_name, gpu_index, value, example):
        """Record a ``'<number> <unit>'`` field, reporting malformed input."""
        try:
            # Exactly two space-separated tokens are expected; the unit is
            # discarded.  A separate local keeps the raw field available for
            # the error message below.
            number, _ = value.strip().split(' ')
            self.process_value(metric_name, gpu_index, number)
        except Exception as e:
            print('Expected value format: `{}` got `{}`'.format(example, value))
            print(e)

    def process_percentage(self, metric_name, gpu_index, value):
        """Record a percentage field such as ``'66 %'``."""
        self._process_with_unit(metric_name, gpu_index, value, '66 %')

    def process_value(self, metric_name, gpu_index, value):
        """Record a plain numeric field in the per-GPU, overall and raw stores."""
        sample = float(value)
        self.streams[metric_name][gpu_index] += sample
        self.overall[metric_name] += sample
        self.ts[f'{metric_name}{gpu_index}'].append(sample)

    def process_ignore(self, metric_name, gpu_index, value):
        """Discard a field that is not tracked."""
        pass

    def process_memory(self, metric_name, gpu_index, value):
        """Record a memory field such as ``'66 Mib'``."""
        self._process_with_unit(metric_name, gpu_index, value, '66 Mib')

    def stop(self):
        """Ask ``run`` to finish and terminate ``nvidia-smi`` if it is running."""
        # Always clear the flag so that a stop requested before the process
        # was spawned is not lost.
        self.running = False

        if self.process is not None:
            self.process.terminate()