Model architecture is modified when I use BitsAndBytesConfig with default params
yunhao-tech opened this issue · 0 comments
yunhao-tech commented
System Info
Ubuntu 20.04
CUDA 12.2.2
Python=3.11.9
transformers=4.44.2
bitsandbytes=0.43.3
GPU: A800
Reproduction
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig


def format_size(size):
    K, M, B = 1e3, 1e6, 1e9
    if size == 0:
        return '0'
    elif size < M:
        return f"{size / K:.1f}K"
    elif size < B:
        return f"{size / M:.1f}M"
    else:
        return f"{size / B:.1f}B"

def get_pytorch_model_info(model: torch.nn.Module) -> tuple[dict, list]:
    params_list = []
    total_params = 0
    total_params_non_trainable = 0
    modules = dict(model.named_modules())
    for name, param in model.named_parameters():
        # Resolve the module that owns this parameter (the dotted prefix
        # before ".weight"/".bias"), so layer_class reflects the actual
        # layer type rather than the top-level container.
        layer_name = name.rsplit('.', 1)[0]
        layer_class = modules[layer_name].__class__.__name__
        params_count = param.numel()
        trainable = param.requires_grad
        params_list.append({
            'tensor': name,
            'layer_class': layer_class,
            'shape': str(list(param.size())),
            'precision': str(param.dtype).split('.')[-1],
            'params_count': str(params_count),
            'trainable': str(trainable),
        })
        total_params += params_count
        if not trainable:
            total_params_non_trainable += params_count
    total_params_trainable = total_params - total_params_non_trainable
    total_params_info = {
        'total_params': format_size(total_params),
        'total_params_trainable': format_size(total_params_trainable),
        'total_params_non_trainable': format_size(total_params_non_trainable)
    }
    return total_params_info, params_list

if __name__ == "__main__":
    base_model = "My fine tuned starcoder2 model path."
    quantization_config = BitsAndBytesConfig()
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        quantization_config=quantization_config,
        torch_dtype=torch.float16,  # torch.bfloat16
        device_map="auto",
        trust_remote_code=True
    )
    model.config.pad_token_id = tokenizer.pad_token_id
    model.eval()
    total_params_info, params_list = get_pytorch_model_info(model)
    print(total_params_info)
    for ele in params_list:
        print(ele)
Expected behavior
When I load the model without BitsAndBytes, using the following code, the architecture and parameter count are reported as expected:
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)
However, when I use BitsAndBytesConfig with default parameters, the model architecture is modified and the reported number of parameters is halved.
I expect that passing no parameters (i.e. the default configuration) should behave the same as the normal case: no quantization and no modification of the model architecture.
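For reference, here is a minimal sketch of how the loaded model could be inspected, run in the same session as the reproduction script above; the is_quantized attribute and the Linear8bitLt / Linear4bit class names are assumptions about the transformers/bitsandbytes internals at these versions:

# Hedged check: recent transformers versions reportedly set `is_quantized`
# on models loaded through a quantizer; getattr guards against its absence.
print("is_quantized:", getattr(model, "is_quantized", False))

# If bitsandbytes replaced the Linear layers, the class name changes to
# Linear8bitLt (8-bit) or Linear4bit (4-bit) and the stored weight dtype
# becomes int8/uint8 instead of float16. A 4-bit weight packs two values
# per byte, which by itself would halve the numel() counted above.
first_linear = next(m for m in model.modules() if isinstance(m, torch.nn.Linear))
print(type(first_linear).__name__, first_linear.weight.dtype, tuple(first_linear.weight.shape))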