Compressed models that call torch.is_floating_point() during inference fail tracing with a runtime error.
alexsu52 commented
Example model: "hf-internal-testing/tiny-random-bert"

from optimum.intel import OVModelForSequenceClassification, OVQuantizer

transformers_model = OVModelForSequenceClassification.auto_model_class.from_pretrained("hf-internal-testing/tiny-random-bert")
quantizer = OVQuantizer.from_pretrained(transformers_model, task=OVModelForSequenceClassification.export_feature)
quantizer.quantize(save_directory="tmp", weights_only=True)
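The failing step itself is easy to confirm in isolation: torch.finfo() accepts only floating-point dtypes, so the mask construction in get_extended_attention_mask() (see the trace below) cannot work once self.dtype resolves to torch.uint8. A quick check, plain PyTorch only:

import torch

torch.finfo(torch.float32).min  # works: smallest representable float32 value
torch.iinfo(torch.uint8).min    # the integer counterpart is torch.iinfo
torch.finfo(torch.uint8)        # raises TypeError: torch.finfo() requires a floating point input type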
Trace log:
../../../../venv/nncf_torch/lib/python3.8/site-packages/nncf/quantization/algorithms/weight_compression/algorithm.py:270: in apply
transformed_model = self.do_compression(model, graph, nodes_to_compress, activations)
../../../../venv/nncf_torch/lib/python3.8/site-packages/nncf/quantization/algorithms/weight_compression/algorithm.py:316: in do_compression
transformed_model = self._backend_entity.transform_model(
../../../../venv/nncf_torch/lib/python3.8/site-packages/nncf/quantization/algorithms/weight_compression/torch_backend.py:238: in transform_model
transformed_model = PTModelTransformer(model).transform(transformation_layout)
../../../../venv/nncf_torch/lib/python3.8/site-packages/nncf/torch/model_transformer.py:76: in transform
model.nncf.rebuild_graph()
../../../../venv/nncf_torch/lib/python3.8/site-packages/nncf/torch/nncf_network.py:510: in rebuild_graph
compressed_traced_graph = builder.build_dynamic_graph(
../../../../venv/nncf_torch/lib/python3.8/site-packages/nncf/torch/graph/graph_builder.py:50: in build_dynamic_graph
return tracer.trace_graph(model, context_to_use, as_eval, trace_parameters)
../../../../venv/nncf_torch/lib/python3.8/site-packages/nncf/torch/dynamic_graph/graph_tracer.py:56: in trace_graph
self.custom_forward_fn(model)
../../../../venv/nncf_torch/lib/python3.8/site-packages/nncf/torch/dynamic_graph/graph_tracer.py:100: in default_dummy_forward_fn
retval = model(*args, **kwargs)
../../../../venv/nncf_torch/lib/python3.8/site-packages/nncf/torch/nncf_network.py:973: in __call__
return ORIGINAL_CALL(self, *args, **kwargs)
../../../../venv/nncf_torch/lib/python3.8/site-packages/torch/nn/modules/module.py:1518: in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
../../../../venv/nncf_torch/lib/python3.8/site-packages/torch/nn/modules/module.py:1527: in _call_impl
return forward_call(*args, **kwargs)
../../../../venv/nncf_torch/lib/python3.8/site-packages/nncf/torch/nncf_network.py:1005: in forward
retval = wrap_module_call(self.nncf._original_unbound_forward)(self, *args, **kwargs)
../../../../venv/nncf_torch/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py:157: in wrapped
retval = module_call(self, *args, **kwargs)
../../../../venv/nncf_torch/lib/python3.8/site-packages/transformers/models/bert/modeling_bert.py:1564: in forward
outputs = self.bert(
../../../../venv/nncf_torch/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py:157: in wrapped
retval = module_call(self, *args, **kwargs)
../../../../venv/nncf_torch/lib/python3.8/site-packages/torch/nn/modules/module.py:1518: in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
../../../../venv/nncf_torch/lib/python3.8/site-packages/torch/nn/modules/module.py:1527: in _call_impl
return forward_call(*args, **kwargs)
../../../../venv/nncf_torch/lib/python3.8/site-packages/transformers/models/bert/modeling_bert.py:986: in forward
extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = BertModel(
(embeddings): BertEmbeddings(
(word_embeddings): Embedding(1124, 32, padding_idx=0)
(position_emb...
(pooler): BertPooler(
(dense): Linear(in_features=32, out_features=32, bias=True)
(activation): Tanh()
)
)
attention_mask = tensor([[1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0],
[0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0]])
input_shape = torch.Size([2, 16]), device = None, dtype = torch.uint8
def get_extended_attention_mask(
self, attention_mask: Tensor, input_shape: Tuple[int], device: torch.device = None, dtype: torch.float = None
) -> Tensor:
"""
Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
Arguments:
attention_mask (`torch.Tensor`):
Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
input_shape (`Tuple[int]`):
The shape of the input to the model.
Returns:
`torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
"""
if dtype is None:
dtype = self.dtype
if not (attention_mask.dim() == 2 and self.config.is_decoder):
# show warning only if it won't be shown in `create_extended_attention_mask_for_decoder`
if device is not None:
warnings.warn(
"The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning
)
# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
# ourselves in which case we just need to make it broadcastable to all heads.
if attention_mask.dim() == 3:
extended_attention_mask = attention_mask[:, None, :, :]
elif attention_mask.dim() == 2:
# Provided a padding mask of dimensions [batch_size, seq_length]
# - if the model is a decoder, apply a causal mask in addition to the padding mask
# - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
if self.config.is_decoder:
extended_attention_mask = ModuleUtilsMixin.create_extended_attention_mask_for_decoder(
input_shape, attention_mask, device
)
else:
extended_attention_mask = attention_mask[:, None, None, :]
else:
raise ValueError(
f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})"
)
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and the dtype's smallest value for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
extended_attention_mask = extended_attention_mask.to(dtype=dtype) # fp16 compatibility
> extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(dtype).min
E TypeError: torch.finfo() requires a floating point input type. Use torch.iinfo to handle 'torch.uint8'
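For context on where the torch.uint8 comes from: as far as I can tell, transformers resolves self.dtype via ModuleUtilsMixin.dtype, which walks the module's parameters and returns the dtype of the first parameter reported as floating point (via is_floating_point()), falling back to the last dtype seen otherwise. Below is a simplified paraphrase of that logic (not the exact transformers code) together with a toy uint8-weight module, as a sketch of how a weight-compressed model whose is_floating_point() check no longer returns True can end up feeding an integer dtype into torch.finfo():

import torch
import torch.nn as nn

def resolve_module_dtype(module: nn.Module) -> torch.dtype:
    # Simplified paraphrase of transformers' get_parameter_dtype():
    # return the dtype of the first floating-point parameter,
    # otherwise fall back to the dtype of the last parameter seen.
    last_dtype = None
    for param in module.parameters():
        last_dtype = param.dtype
        if torch.is_floating_point(param):
            return param.dtype
    return last_dtype

class CompressedLinear(nn.Module):
    # Toy stand-in for a layer whose weight was compressed to uint8.
    def __init__(self):
        super().__init__()
        self.weight = nn.Parameter(torch.zeros(4, 4, dtype=torch.uint8), requires_grad=False)

module = CompressedLinear()
dtype = resolve_module_dtype(module)  # torch.uint8 -- no floating-point parameter found
torch.finfo(dtype)                    # TypeError, same failure as in the trace above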