microsoft/BitBLAS

"nan" in the outputs

Closed this issue · 4 comments

Hi @LeiWang1999, thanks for the excellent project! I am replacing the `nn.Linear` layers of a pretrained model with `bitblas.Linear`; this is the code:

```python
class bitlinear(bitblas.Linear):
    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = False,
        A_dtype: str = "float16",
        W_dtype: str = "int2",
        accum_dtype: str = "float16",
        out_dtype: str = "float16",
        group_size: int = -1,
        with_scaling: bool = False,
        with_zeros: bool = False,
        zeros_mode: str = None,
        opt_M: list = [1, 16, 32, 64, 128, 256, 512],
        fast_decoding: bool = True,
        alpha: torch.float16 = 1.
    ):
        super().__init__(
            in_features=in_features,
            out_features=out_features,
            bias=bias,
            A_dtype=A_dtype,
            W_dtype=W_dtype,
            accum_dtype=accum_dtype,
            out_dtype=out_dtype,
            group_size=group_size,
            with_scaling=with_scaling,
            with_zeros=with_zeros,
            zeros_mode=zeros_mode,
            opt_M=opt_M,
            fast_decoding=fast_decoding,
        )
        self.alpha = alpha

    def forward(self, A: torch.Tensor, out: torch.Tensor = None) -> torch.Tensor:
        out = super().forward(A, out)
        out *= self.alpha
        if self.bias is not None:
            out += self.bias.view(1, -1).expand_as(out)
        return out


def Ternarize(W):
    # Quantize W to {-1, 0, 1} with scale m = 2 * mean(|W|); the scale is
    # re-applied as `alpha` in bitlinear.forward.
    with torch.no_grad():
        m = W.abs().mean()
        m *= 2
        W = torch.clamp(torch.round(W / m), min=-1, max=1)
    return W, m


def convert_to_bitlinear(layer):
    w, a = Ternarize(layer.weight.data)
    bitlayer = bitlinear(
        in_features=layer.in_features,
        out_features=layer.out_features,
        bias=False,
        A_dtype="float16",      # activation A dtype
        W_dtype="int2",         # weight W dtype
        accum_dtype="float32",  # accumulation dtype
        out_dtype="float16",    # output dtype
        # configs for weight-only quantization
        group_size=-1,          # setting for grouped quantization
        with_scaling=False,     # setting for scaling factor
        with_zeros=False,       # setting for zeros
        zeros_mode=None,        # setting for how to calculate zeros
        # Target optimization var for dynamic symbolic.
        # For detailed information please check out docs/PythonAPI.md.
        # By default, the optimization var is [1, 16, 32, 64, 128, 256, 512].
        opt_M=[1, 16, 32, 64, 128, 256, 512],
        fast_decoding=True,
        alpha=a.to(torch.float16)
    )
    bitlayer.load_and_transform_weight(w.to(torch.int8))
    if layer.bias is not None:
        bitlayer.bias = layer.bias.data.to(torch.float16)
    return bitlayer


def replace_modules(model):
    for name, module in model.named_children():
        if isinstance(module, nn.Linear):
            new_linear = convert_to_bitlinear(module)
            setattr(model, name, new_linear)
        elif len(list(module.children())) > 0:
            replace_modules(module)
```

I replaced all the `nn.Linear` layers of a pretrained BERT model with `bitlinear`, and found that the output is all "nan":

tensor([[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan]])

I also compared the output of a single converted layer against the original linear layer, and the results are acceptable:

```python
class bitlinear(bitblas.Linear):
    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = False,
        A_dtype: str = "float16",
        W_dtype: str = "int2",
        accum_dtype: str = "float16",
        out_dtype: str = "float16",
        group_size: int = -1,
        with_scaling: bool = False,
        with_zeros: bool = False,
        zeros_mode: str = None,
        opt_M: list = [1, 16, 32, 64, 128, 256, 512],
        fast_decoding: bool = True,
        alpha: torch.float16 = 1.
    ):
        super().__init__(
            in_features=in_features,
            out_features=out_features,
            bias=bias,
            A_dtype=A_dtype,
            W_dtype=W_dtype,
            accum_dtype=accum_dtype,
            out_dtype=out_dtype,
            group_size=group_size,
            with_scaling=with_scaling,
            with_zeros=with_zeros,
            zeros_mode=zeros_mode,
            opt_M=opt_M,
            fast_decoding=fast_decoding,
        )
        self.alpha = alpha

    def forward(self, A: torch.Tensor, output: torch.Tensor = None) -> torch.Tensor:
        output = super().forward(A, output)
        output *= self.alpha
        if self.bias is not None:
            output += self.bias.view(1, -1).expand_as(output)
        return output.to(torch.float32)


def Ternarize(W):
    with torch.no_grad():
        m = W.abs().mean()
        m *= 2
        W = torch.clamp(torch.round(W / m), min=-1, max=1)
    return W, m


def convert_to_bitlinear(layer):
    w, a = Ternarize(layer.weight.data)
    bitlayer = bitlinear(
        in_features=layer.in_features,
        out_features=layer.out_features,
        bias=False,
        A_dtype="float16",      # activation A dtype
        W_dtype="int2",         # weight W dtype
        accum_dtype="float32",  # accumulation dtype
        out_dtype="float16",    # output dtype
        # configs for weight-only quantization
        group_size=-1,          # setting for grouped quantization
        with_scaling=False,     # setting for scaling factor
        with_zeros=False,       # setting for zeros
        zeros_mode=None,        # setting for how to calculate zeros
        # Target optimization var for dynamic symbolic.
        # For detailed information please check out docs/PythonAPI.md.
        # By default, the optimization var is [1, 16, 32, 64, 128, 256, 512].
        opt_M=[1, 16, 32, 64, 128, 256, 512],
        fast_decoding=True,
        alpha=a.to(torch.float16)
    )
    bitlayer.load_and_transform_weight(w.to(torch.int8))
    bitlayer.bias = layer.bias.data.to(torch.float16)
    return bitlayer


x = torch.randn((2, 3, 1024), dtype=torch.float16).cuda()
weight_tensor = torch.randn((1024, 1024), dtype=torch.float16).cuda()
bias_tensor = torch.randn((1, 1024), dtype=torch.float16).cuda()

layer = nn.Linear(1024, 1024, bias=True).cuda()
layer.weight.data = weight_tensor
layer.bias.data = bias_tensor

BitLinear = convert_to_bitlinear(layer)
layer.weight.data, a = Ternarize(weight_tensor)
layer.weight.data *= a

print(BitLinear(x))
print(layer(x))
```

The output is:

```
tensor([[[ 55.1875, -49.0312,  14.8984,  ...,  34.9062,  -9.4375, -37.0312],
         [-25.8438, -10.3672,  18.4688,  ...,  37.2812,  25.7500,  12.7656],
         [-84.5000, -26.8594,  13.6406,  ...,  -8.1875, -51.6562,  28.4375]],

        [[  0.0889, -16.1250,  38.6562,  ..., -35.3750,  28.1250,   9.7969],
         [ 32.2812,  -3.9141,  30.7344,  ..., -60.0000,  28.0156, -65.3750],
         [-56.0312,  30.4844, -33.2812,  ..., -34.2812,  19.5312,  55.3750]]],
       device='cuda:0')

tensor([[[ 55.1562, -49.0312,  14.9062,  ...,  34.9062,  -9.4297, -37.0312],
         [-25.8594, -10.3672,  18.4688,  ...,  37.2812,  25.7500,  12.7734],
         [-84.4375, -26.8750,  13.6484,  ...,  -8.1875, -51.6250,  28.4531]],

        [[  0.0889, -16.1250,  38.6562,  ..., -35.3750,  28.1250,   9.8047],
         [ 32.2812,  -3.9043,  30.7188,  ..., -59.9688,  28.0000, -65.3750],
         [-56.0312,  30.4688, -33.3125,  ..., -34.2812,  19.5156,  55.4062]]],
       device='cuda:0', dtype=torch.float16, grad_fn=<ViewBackward0>)
```

Could you tell me why the output becomes "nan" when I replace all the linear layers? When I debugged with breakpoints, I found that the number of "nan" values kept growing through the forward pass, until the final output was entirely "nan".
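One way to narrow this down is to hook every submodule and report which one first emits NaNs. The sketch below is not from the original report: it assumes a standard PyTorch module, the helper name is made up, and it only uses `torch.isnan` and `register_forward_hook`; the first line it prints points at the earliest offender in execution order.

```python
import torch

def report_nan_modules(model):
    # Hypothetical debugging helper: attach a forward hook to every submodule
    # and print the name of any module whose output contains NaNs.
    handles = []

    def make_hook(name):
        def hook(module, inputs, output):
            if isinstance(output, torch.Tensor) and torch.isnan(output).any():
                print(f"NaN in output of {name} ({module.__class__.__name__})")
        return hook

    for name, module in model.named_modules():
        handles.append(module.register_forward_hook(make_hook(name)))
    return handles  # call handle.remove() on each handle when finished
```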

Hi @dataparameters, thanks for your interest. It's difficult for us to reproduce the issue on our end; could you please provide a more complete, well-formed example of the code?

Yes, here is the whole code:

```python
import bitblas
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel

bitblas.set_log_level("Debug")

torch.manual_seed(0)


class bitlinear(bitblas.Linear):
    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = False,
        A_dtype: str = "float16",
        W_dtype: str = "int2",
        accum_dtype: str = "float16",
        out_dtype: str = "float16",
        group_size: int = -1,
        with_scaling: bool = False,
        with_zeros: bool = False,
        zeros_mode: str = None,
        opt_M: list = [1, 16, 32, 64, 128, 256, 512],
        fast_decoding: bool = True,
        alpha: torch.float16 = 1.
    ):
        super().__init__(
            in_features=in_features,
            out_features=out_features,
            bias=bias,
            A_dtype=A_dtype,
            W_dtype=W_dtype,
            accum_dtype=accum_dtype,
            out_dtype=out_dtype,
            group_size=group_size,
            with_scaling=with_scaling,
            with_zeros=with_zeros,
            zeros_mode=zeros_mode,
            opt_M=opt_M,
            fast_decoding=fast_decoding,
        )
        self.alpha = alpha

    def forward(self, A: torch.Tensor, out: torch.Tensor = None) -> torch.Tensor:
        out = super().forward(A, out)
        out *= self.alpha
        if self.bias is not None:
            out += self.bias.view(1, -1).expand_as(out)
        return out


def Ternarize(W):
    with torch.no_grad():
        m = W.abs().mean()
        m *= 2
        W = torch.clamp(torch.round(W / m), min=-1, max=1)
    return W, m


def convert_to_bitlinear(layer):
    w, a = Ternarize(layer.weight.data)
    bitlayer = bitlinear(
        in_features=layer.in_features,
        out_features=layer.out_features,
        bias=False,
        A_dtype="float16",      # activation A dtype
        W_dtype="int2",         # weight W dtype
        accum_dtype="float32",  # accumulation dtype
        out_dtype="float16",    # output dtype
        # configs for weight-only quantization
        group_size=-1,          # setting for grouped quantization
        with_scaling=False,     # setting for scaling factor
        with_zeros=False,       # setting for zeros
        zeros_mode=None,        # setting for how to calculate zeros
        # Target optimization var for dynamic symbolic.
        # For detailed information please check out docs/PythonAPI.md.
        # By default, the optimization var is [1, 16, 32, 64, 128, 256, 512].
        opt_M=[1, 16, 32, 64, 128, 256, 512],
        fast_decoding=True,
        alpha=a.to(torch.float16)
    )
    bitlayer.load_and_transform_weight(w.to(torch.int8))
    if layer.bias is not None:
        bitlayer.bias = layer.bias.data.to(torch.float16)
    return bitlayer


def replace_modules(model):
    for name, module in model.named_children():
        if 'query' in name or 'key' in name or 'value' in name:
            continue
        if isinstance(module, nn.Linear):
            new_linear = convert_to_bitlinear(module)
            setattr(model, name, new_linear)
        elif len(list(module.children())) > 0:
            replace_modules(module)


model_name = "xiaobu-embedding-v2"

sentences = ["样例数据-1", "样例数据-2"]

tokenizer = AutoTokenizer.from_pretrained(model_name)
model1 = AutoModel.from_pretrained(model_name)

replace_modules(model1)

print(model1)
model1.eval()

encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

with torch.no_grad():
    model_output1 = model1(**encoded_input)
    sentence_embeddings1 = model_output1[0][:, 0]

sentence_embeddings1 = torch.nn.functional.normalize(sentence_embeddings1, p=2, dim=1)
print("Sentence embeddings:", sentence_embeddings1)
```

The result is:

Sentence embeddings: tensor([[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]])

Hi @dataparameters, I cannot access the model you referenced (xiaobu-embedding-v2), but I think you should move both the model and the input tensors to CUDA instead of leaving them on the CPU.
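For instance, a minimal adjustment along those lines, written as a sketch against the script above (the `.cuda()` calls on the model and on each tokenized tensor are standard PyTorch/transformers usage, not BitBLAS-specific):

```python
# Move the converted model and the tokenized inputs onto the GPU before inference.
model1 = model1.cuda().eval()

encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
encoded_input = {k: v.cuda() for k, v in encoded_input.items()}

with torch.no_grad():
    model_output1 = model1(**encoded_input)
    sentence_embeddings1 = model_output1[0][:, 0]
```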

Sorry for my oversight... It works, thank you so much. I will close the issue.