Is the MIG service free?
Opened this issue · 1 comments
qingfenghcy commented
thanks
MehdiTantaoui-99 commented
I am using the following script to check whether each MIG device is free (not in use by any process) or not:
def get_nvidia_smi_xml():
    """Run ``nvidia-smi -q -x`` and return its XML report as text.

    Returns:
        The captured standard output of the command, decoded as UTF-8.

    Raises:
        FileNotFoundError: If the ``nvidia-smi`` executable cannot be found.
    """
    # NOTE: the exit status is intentionally not checked here; whatever was
    # written to stdout is handed back to the caller unchanged.
    result = subprocess.run(['nvidia-smi', '-q', '-x'], stdout=subprocess.PIPE)
    return result.stdout.decode('utf-8')
def _mib_to_int(text):
    """Convert a memory string such as ``'6016 MiB'`` to an integer MiB count."""
    return int(text.strip().replace(' MiB', ''))


def _instance_id(elem):
    """Return the integer ID stored in *elem*, or None if absent or non-numeric."""
    if elem is None or elem.text is None:
        return None
    try:
        return int(elem.text)
    except ValueError:
        # Non-numeric placeholder such as "N/A".
        return None


def parse_nvidia_smi_xml(xml_output):
    """Parse an ``nvidia-smi -q -x`` report into a list of per-GPU dictionaries.

    Args:
        xml_output: The XML document as a string.

    Returns:
        A list with one dict per ``<gpu>`` element, with the keys ``id``,
        ``name``, ``total_memory`` (MiB), ``supports_mig`` (True when the
        current MIG mode is ``'Enabled'``) and ``mig_devices``. Each entry of
        ``mig_devices`` is a dict with ``index``, ``gpu_instance_id``,
        ``compute_instance_id``, ``is_in_use``, ``memory`` (MiB, 0 if not
        reported), ``sm_count`` (0 if not reported), ``name`` and, when a
        process is running on the device, ``process`` (``pid``,
        ``process_name``, ``used_memory`` in MiB).

    Raises:
        xml.etree.ElementTree.ParseError: If *xml_output* is not valid XML.
    """
    root = ET.fromstring(xml_output)
    gpus = []

    for gpu in root.findall('gpu'):
        # A missing <mig_mode> element is treated the same as MIG being
        # disabled instead of raising AttributeError.
        mig_mode = gpu.findtext('mig_mode/current_mig', default='').strip()
        gpu_info = {
            'id': int(gpu.find('minor_number').text),
            'name': gpu.find('product_name').text.strip(),
            'total_memory': _mib_to_int(gpu.find('fb_memory_usage/total').text),
            'supports_mig': mig_mode == 'Enabled',
            'mig_devices': [],
        }

        # Parse MIG devices
        for mig_device in gpu.findall('mig_devices/mig_device'):
            mig_info = {
                'index': int(mig_device.find('index').text),
                'gpu_instance_id': int(mig_device.find('gpu_instance_id').text),
                'compute_instance_id': int(mig_device.find('compute_instance_id').text),
                'is_in_use': False,  # Flipped below if a process maps to it
            }

            # Memory size of the MIG device (0 when not reported)
            memory_total_elem = mig_device.find('fb_memory_usage/total')
            if memory_total_elem is not None:
                mig_info['memory'] = _mib_to_int(memory_total_elem.text)
            else:
                mig_info['memory'] = 0

            # Multiprocessor count (0 when not reported)
            sm_count_elem = mig_device.find(
                'device_attributes/shared/multiprocessor_count')
            if sm_count_elem is not None:
                mig_info['sm_count'] = int(sm_count_elem.text.strip())
            else:
                mig_info['sm_count'] = 0

            # infer_profile_name is expected to be defined elsewhere; it is
            # given the GPU product name, the memory in MiB and the SM count.
            mig_info['name'] = infer_profile_name(
                gpu_info['name'], mig_info['memory'], mig_info['sm_count'])
            gpu_info['mig_devices'].append(mig_info)

        # Parse processes and map them to MIG devices
        for proc in gpu.findall('processes/process_info'):
            gpu_instance_id = _instance_id(proc.find('gpu_instance_id'))
            compute_instance_id = _instance_id(proc.find('compute_instance_id'))
            if gpu_instance_id is None or compute_instance_id is None:
                # The process is not tied to a MIG instance (the IDs are
                # reported as "N/A"), so it cannot mark any MIG device busy.
                continue

            pid = int(proc.find('pid').text)
            process_name = proc.find('process_name').text
            used_memory = _mib_to_int(proc.find('used_memory').text)

            # Find the corresponding MIG device. If several processes share
            # one device, the last one listed is the one recorded.
            for mig_device in gpu_info['mig_devices']:
                if (mig_device['gpu_instance_id'] == gpu_instance_id and
                        mig_device['compute_instance_id'] == compute_instance_id):
                    mig_device['is_in_use'] = True
                    mig_device['process'] = {
                        'pid': pid,
                        'process_name': process_name,
                        'used_memory': used_memory,
                    }

        gpus.append(gpu_info)

    return gpus
# Run nvidia-smi, parse its XML report into per-GPU dictionaries, and print
# the parsed structure so each MIG device's 'is_in_use' flag can be inspected.
xml_output = get_nvidia_smi_xml()
gpus = parse_nvidia_smi_xml(xml_output)
print(f"Parsed GPUs: {gpus}")
Output sample:
Parsed GPUs: [{'id': 0, 'name': 'NVIDIA A30', 'total_memory': 24576, 'supports_mig': True, 'mig_devices': [{'index': 0, 'gpu_instance_id': 3, 'compute_instance_id': 0, 'is_in_use': True, 'memory': 6016, 'sm_count': 14, 'name': '1g.6gb', 'process': {'pid': 1295432, 'process_name': 'tritonserver', 'used_memory': 214}}]}]