NVIDIA/mig-parted

Is the MIG service free?

Opened this issue · 1 comment

Thanks.

I am using the following to check whether a MIG device is free or in use:

import subprocess
import xml.etree.ElementTree as ET


def get_nvidia_smi_xml():
    result = subprocess.run(['nvidia-smi', '-q', '-x'], stdout=subprocess.PIPE)
    xml_output = result.stdout.decode('utf-8')
    return xml_output

def parse_nvidia_smi_xml(xml_output):
    root = ET.fromstring(xml_output)
    gpus = []
    for gpu in root.findall('gpu'):
        gpu_info = {}
        gpu_info['id'] = int(gpu.find('minor_number').text)
        gpu_info['name'] = gpu.find('product_name').text.strip()
        total_memory_str = gpu.find('fb_memory_usage/total').text.strip()
        gpu_info['total_memory'] = int(total_memory_str.replace(' MiB', ''))
        # current_mig reports whether MIG mode is currently enabled; on GPUs
        # without MIG support the value is typically 'N/A'
        mig_mode_elem = gpu.find('mig_mode/current_mig')
        mig_mode = mig_mode_elem.text.strip() if mig_mode_elem is not None else 'N/A'
        gpu_info['supports_mig'] = (mig_mode == 'Enabled')
        gpu_info['mig_devices'] = []

        # Parse MIG devices
        for mig_device in gpu.findall('mig_devices/mig_device'):
            mig_info = {}
            mig_info['index'] = int(mig_device.find('index').text)
            mig_info['gpu_instance_id'] = int(mig_device.find('gpu_instance_id').text)
            mig_info['compute_instance_id'] = int(mig_device.find('compute_instance_id').text)
            mig_info['is_in_use'] = False  # Initialize as not in use

            # Extract memory size of MIG device
            memory_total_elem = mig_device.find('fb_memory_usage/total')
            if memory_total_elem is not None:
                memory_total_str = memory_total_elem.text.strip()
                mig_info['memory'] = int(memory_total_str.replace(' MiB', ''))
            else:
                mig_info['memory'] = 0

            # Extract multiprocessor count
            sm_count_elem = mig_device.find('device_attributes/shared/multiprocessor_count')
            if sm_count_elem is not None:
                mig_info['sm_count'] = int(sm_count_elem.text.strip())
            else:
                mig_info['sm_count'] = 0

            # Infer profile name based on memory and SM count
            mig_info['name'] = infer_profile_name(gpu_info['name'], mig_info['memory'], mig_info['sm_count'])

            gpu_info['mig_devices'].append(mig_info)

        # Parse processes and map them to MIG devices
        for proc in gpu.findall('processes/process_info'):
            gi_elem = proc.find('gpu_instance_id')
            ci_elem = proc.find('compute_instance_id')
            # Skip entries without numeric GPU/compute instance IDs
            # (e.g. processes reported when MIG mode is not enabled)
            if gi_elem is None or ci_elem is None or not gi_elem.text.strip().isdigit():
                continue
            gpu_instance_id = int(gi_elem.text)
            compute_instance_id = int(ci_elem.text)
            pid = int(proc.find('pid').text)
            process_name = proc.find('process_name').text
            used_memory_str = proc.find('used_memory').text.strip()
            used_memory = int(used_memory_str.replace(' MiB', ''))

            # Find the corresponding MIG device; if several processes run on
            # the same MIG device, only the last one is kept in 'process'
            for mig_device in gpu_info['mig_devices']:
                if (mig_device['gpu_instance_id'] == gpu_instance_id and
                    mig_device['compute_instance_id'] == compute_instance_id):
                    mig_device['is_in_use'] = True
                    mig_device['process'] = {
                        'pid': pid,
                        'process_name': process_name,
                        'used_memory': used_memory
                    }
                    break

        gpus.append(gpu_info)
    return gpus


xml_output = get_nvidia_smi_xml()
gpus = parse_nvidia_smi_xml(xml_output)
print(f"Parsed GPUs: {gpus}")

Output sample:

Parsed GPUs: [{'id': 0, 'name': 'NVIDIA A30', 'total_memory': 24576, 'supports_mig': True, 'mig_devices': [{'index': 0, 'gpu_instance_id': 3, 'compute_instance_id': 0, 'is_in_use': True, 'memory': 6016, 'sm_count': 14, 'name': '1g.6gb', 'process': {'pid': 1295432, 'process_name': 'tritonserver', 'used_memory': 214}}]}]
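
Note that the snippet relies on an infer_profile_name helper that isn't shown above. A minimal sketch of what it might look like, assuming the profile string (e.g. '1g.6gb') can be rebuilt from the slice's SM count and frame-buffer size; the per-GPU SM-per-slice values are assumptions for illustration, not something nvidia-smi reports directly:

def infer_profile_name(gpu_name, memory_mib, sm_count):
    # Hypothetical helper (not part of the snippet above): build a MIG
    # profile string such as '1g.6gb' from SM count and memory size.
    # The SM-per-slice values below are assumptions for illustration.
    sm_per_slice = {
        'NVIDIA A30': 14,             # assumed: 4 compute slices of 14 SMs each
        'NVIDIA A100-SXM4-40GB': 14,  # assumed: 7 compute slices of 14 SMs each
    }
    slice_sms = sm_per_slice.get(gpu_name, sm_count) or 1
    g_slices = max(1, round(sm_count / slice_sms))
    gb = max(1, round(memory_mib / 1024))
    return f"{g_slices}g.{gb}gb"

With the parsed structure, checking whether a MIG device is free is then just a matter of looking at its is_in_use flag:

for gpu in gpus:
    for mig in gpu['mig_devices']:
        state = 'in use' if mig['is_in_use'] else 'free'
        print(f"GPU {gpu['id']} MIG {mig['index']} ({mig['name']}): {state}")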