"nan" in the outputs
Closed this issue · 4 comments
Hi @LeiWang1999, thanks for the excellent project! I am replacing the linear layers of pretrained models with bitblas.Linear; this is the code:
class bitlinear(bitblas.Linear):
    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = False,
        A_dtype: str = "float16",
        W_dtype: str = "int2",
        accum_dtype: str = "float16",
        out_dtype: str = "float16",
        group_size: int = -1,
        with_scaling: bool = False,
        with_zeros: bool = False,
        zeros_mode: str = None,
        opt_M: list = [1, 16, 32, 64, 128, 256, 512],
        fast_decoding: bool = True,
        alpha: torch.float16 = 1.
    ):
        super().__init__(
            in_features=in_features,
            out_features=out_features,
            bias=bias,
            A_dtype=A_dtype,
            W_dtype=W_dtype,
            accum_dtype=accum_dtype,
            out_dtype=out_dtype,
            group_size=group_size,
            with_scaling=with_scaling,
            with_zeros=with_zeros,
            zeros_mode=zeros_mode,
            opt_M=opt_M,
            fast_decoding=fast_decoding,
        )
        self.alpha = alpha

    def forward(self, A: torch.Tensor, out: torch.Tensor = None) -> torch.Tensor:
        out = super().forward(A, out)
        out *= self.alpha
        if self.bias is not None:
            out += self.bias.view(1, -1).expand_as(out)
        return out

def Ternarize(W):
    with torch.no_grad():
        m = W.abs().mean()
        m *= 2
        W = torch.clamp(torch.round(W / m), min=-1, max=1)
    return W, m

def convert_to_bitlinear(layer):
    w, a = Ternarize(layer.weight.data)
    bitlayer = bitlinear(
        in_features=layer.in_features,
        out_features=layer.out_features,
        bias=False,
        A_dtype="float16",  # activation A dtype
        W_dtype="int2",  # weight W dtype
        accum_dtype="float32",  # accumulation dtype
        out_dtype="float16",  # output dtype
        # configs for weight-only quantization
        group_size=-1,  # setting for grouped quantization
        with_scaling=False,  # setting for scaling factor
        with_zeros=False,  # setting for zeros
        zeros_mode=None,  # setting for how to calculate zeros
        # Target optimization var for dynamic symbolic.
        # For detailed information please check out docs/PythonAPI.md
        # By default, the optimization var is [1, 16, 32, 64, 128, 256, 512]
        opt_M=[1, 16, 32, 64, 128, 256, 512],
        fast_decoding=True,
        alpha=a.to(torch.float16)
    )
    bitlayer.load_and_transform_weight(w.to(torch.int8))
    if layer.bias is not None:
        bitlayer.bias = layer.bias.data.to(torch.float16)
    return bitlayer

def replace_modules(model):
    for name, module in model.named_children():
        if isinstance(module, nn.Linear):
            new_linear = convert_to_bitlinear(module)
            setattr(model, name, new_linear)
        elif len(list(module.children())) > 0:
            replace_modules(module)
I replaced all the nn.Linear layers of a pretrained BERT model with bitlinear and found that the output is "nan":
tensor([[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan]])
I also compared the output of a single linear layer against the original, and the results are acceptable:
class bitlinear(bitblas.Linear):
    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = False,
        A_dtype: str = "float16",
        W_dtype: str = "int2",
        accum_dtype: str = "float16",
        out_dtype: str = "float16",
        group_size: int = -1,
        with_scaling: bool = False,
        with_zeros: bool = False,
        zeros_mode: str = None,
        opt_M: list = [1, 16, 32, 64, 128, 256, 512],
        fast_decoding: bool = True,
        alpha: torch.float16 = 1.
    ):
        super().__init__(
            in_features=in_features,
            out_features=out_features,
            bias=bias,
            A_dtype=A_dtype,
            W_dtype=W_dtype,
            accum_dtype=accum_dtype,
            out_dtype=out_dtype,
            group_size=group_size,
            with_scaling=with_scaling,
            with_zeros=with_zeros,
            zeros_mode=zeros_mode,
            opt_M=opt_M,
            fast_decoding=fast_decoding,
        )
        self.alpha = alpha

    def forward(self, A: torch.Tensor, output: torch.Tensor = None) -> torch.Tensor:
        output = super().forward(A, output)
        output *= self.alpha
        if self.bias is not None:
            output += self.bias.view(1, -1).expand_as(output)
        return output.to(torch.float32)

def Ternarize(W):
    with torch.no_grad():
        m = W.abs().mean()
        m *= 2
        W = torch.clamp(torch.round(W / m), min=-1, max=1)
    return W, m

def convert_to_bitlinear(layer):
    w, a = Ternarize(layer.weight.data)
    bitlayer = bitlinear(
        in_features=layer.in_features,
        out_features=layer.out_features,
        bias=False,
        A_dtype="float16",  # activation A dtype
        W_dtype="int2",  # weight W dtype
        accum_dtype="float32",  # accumulation dtype
        out_dtype="float16",  # output dtype
        # configs for weight-only quantization
        group_size=-1,  # setting for grouped quantization
        with_scaling=False,  # setting for scaling factor
        with_zeros=False,  # setting for zeros
        zeros_mode=None,  # setting for how to calculate zeros
        # Target optimization var for dynamic symbolic.
        # For detailed information please check out docs/PythonAPI.md
        # By default, the optimization var is [1, 16, 32, 64, 128, 256, 512]
        opt_M=[1, 16, 32, 64, 128, 256, 512],
        fast_decoding=True,
        alpha=a.to(torch.float16)
    )
    bitlayer.load_and_transform_weight(w.to(torch.int8))
    bitlayer.bias = layer.bias.data.to(torch.float16)
    return bitlayer
x = torch.randn((2,3, 1024), dtype=torch.float16).cuda()
weight_tensor = torch.randn((1024, 1024), dtype=torch.float16).cuda()
bias_tensor = torch.randn((1, 1024), dtype=torch.float16).cuda()
layer = nn.Linear(1024, 1024, bias=True).cuda()
layer.weight.data = weight_tensor
layer.bias.data = bias_tensor
BitLinear = convert_to_bitlinear(layer)
layer.weight.data, a = Ternarize(weight_tensor)
layer.weight.data *= a
print(BitLinear(x))
print(layer(x))
The output is:
tensor([[[ 55.1875, -49.0312, 14.8984, ..., 34.9062, -9.4375, -37.0312],
[-25.8438, -10.3672, 18.4688, ..., 37.2812, 25.7500, 12.7656],
[-84.5000, -26.8594, 13.6406, ..., -8.1875, -51.6562, 28.4375]],
[[ 0.0889, -16.1250, 38.6562, ..., -35.3750, 28.1250, 9.7969],
[ 32.2812, -3.9141, 30.7344, ..., -60.0000, 28.0156, -65.3750],
[-56.0312, 30.4844, -33.2812, ..., -34.2812, 19.5312, 55.3750]]],
device='cuda:0')
tensor([[[ 55.1562, -49.0312, 14.9062, ..., 34.9062, -9.4297, -37.0312],
[-25.8594, -10.3672, 18.4688, ..., 37.2812, 25.7500, 12.7734],
[-84.4375, -26.8750, 13.6484, ..., -8.1875, -51.6250, 28.4531]],
[[ 0.0889, -16.1250, 38.6562, ..., -35.3750, 28.1250, 9.8047],
[ 32.2812, -3.9043, 30.7188, ..., -59.9688, 28.0000, -65.3750],
[-56.0312, 30.4688, -33.3125, ..., -34.2812, 19.5156, 55.4062]]],
device='cuda:0', dtype=torch.float16, grad_fn=<ViewBackward0>)
Could you tell me why the output is "nan" when I replace all the linear layers? When I debugged with breakpoints, I found that the number of "nan" values kept growing as the forward pass progressed, until the entire final output was "nan".
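For reference, NaN growth like this can also be traced with standard PyTorch forward hooks instead of breakpoints; a minimal sketch (illustrative only, not part of the model code above):

# Sketch only: count NaNs in each module's output so the first layer
# that produces them can be identified during the forward pass.
def add_nan_hooks(model):
    def hook(module, inputs, output):
        if isinstance(output, torch.Tensor) and torch.isnan(output).any():
            n = torch.isnan(output).sum().item()
            print(f"{n} NaNs after {module.__class__.__name__}")
    for module in model.modules():
        module.register_forward_hook(hook)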
Hi @dataparameters, thanks for your attention. It's difficult for us to reproduce the issue on our end; could you please provide a more complete and well-formed example of the code?
Yes, here is the whole code:
import bitblas
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
bitblas.set_log_level("Debug")
torch.manual_seed(0)
class bitlinear(bitblas.Linear):
    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = False,
        A_dtype: str = "float16",
        W_dtype: str = "int2",
        accum_dtype: str = "float16",
        out_dtype: str = "float16",
        group_size: int = -1,
        with_scaling: bool = False,
        with_zeros: bool = False,
        zeros_mode: str = None,
        opt_M: list = [1, 16, 32, 64, 128, 256, 512],
        fast_decoding: bool = True,
        alpha: torch.float16 = 1.
    ):
        super().__init__(
            in_features=in_features,
            out_features=out_features,
            bias=bias,
            A_dtype=A_dtype,
            W_dtype=W_dtype,
            accum_dtype=accum_dtype,
            out_dtype=out_dtype,
            group_size=group_size,
            with_scaling=with_scaling,
            with_zeros=with_zeros,
            zeros_mode=zeros_mode,
            opt_M=opt_M,
            fast_decoding=fast_decoding,
        )
        self.alpha = alpha

    def forward(self, A: torch.Tensor, out: torch.Tensor = None) -> torch.Tensor:
        out = super().forward(A, out)
        out *= self.alpha
        if self.bias is not None:
            out += self.bias.view(1, -1).expand_as(out)
        return out

def Ternarize(W):
    with torch.no_grad():
        m = W.abs().mean()
        m *= 2
        W = torch.clamp(torch.round(W / m), min=-1, max=1)
    return W, m

def convert_to_bitlinear(layer):
    w, a = Ternarize(layer.weight.data)
    bitlayer = bitlinear(
        in_features=layer.in_features,
        out_features=layer.out_features,
        bias=False,
        A_dtype="float16",  # activation A dtype
        W_dtype="int2",  # weight W dtype
        accum_dtype="float32",  # accumulation dtype
        out_dtype="float16",  # output dtype
        # configs for weight-only quantization
        group_size=-1,  # setting for grouped quantization
        with_scaling=False,  # setting for scaling factor
        with_zeros=False,  # setting for zeros
        zeros_mode=None,  # setting for how to calculate zeros
        # Target optimization var for dynamic symbolic.
        # For detailed information please check out docs/PythonAPI.md
        # By default, the optimization var is [1, 16, 32, 64, 128, 256, 512]
        opt_M=[1, 16, 32, 64, 128, 256, 512],
        fast_decoding=True,
        alpha=a.to(torch.float16)
    )
    bitlayer.load_and_transform_weight(w.to(torch.int8))
    if layer.bias is not None:
        bitlayer.bias = layer.bias.data.to(torch.float16)
    return bitlayer

def replace_modules(model):
    for name, module in model.named_children():
        if 'query' in name or 'key' in name or 'value' in name:
            continue
        if isinstance(module, nn.Linear):
            new_linear = convert_to_bitlinear(module)
            setattr(model, name, new_linear)
        elif len(list(module.children())) > 0:
            replace_modules(module)
model_name = "xiaobu-embedding-v2"
sentences = ["样例数据-1", "样例数据-2"]
tokenizer = AutoTokenizer.from_pretrained(model_name)
model1 = AutoModel.from_pretrained(model_name)
replace_modules(model1)
print(model1)
model1.eval()
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
    model_output1 = model1(**encoded_input)
sentence_embeddings1 = model_output1[0][:, 0]
sentence_embeddings1 = torch.nn.functional.normalize(sentence_embeddings1, p=2, dim=1)
print("Sentence embeddings:", sentence_embeddings1)
The result is:
Sentence embeddings: tensor([[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]])
Hi @dataparameters, I cannot access the model you provided (xiaobu-embedding-v2), but I think you should move both the model and the input tensors to CUDA instead of keeping them on the CPU.
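For illustration, a minimal sketch of that change against the repro script above (variable names come from that script; whether the model also needs to be cast to float16 to match A_dtype is an assumption, so it is left as a comment):

device = torch.device("cuda")
model1 = model1.to(device)      # move the converted model to the GPU
# model1 = model1.half()        # possibly also needed so activations match A_dtype="float16"
model1.eval()

encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
encoded_input = {k: v.to(device) for k, v in encoded_input.items()}  # move input tensors to the GPU

with torch.no_grad():
    model_output1 = model1(**encoded_input)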
Sorry for my oversight... It works, thank you so much. I will close the issue.