openvinotoolkit/nncf

Compressed models that call torch.is_floating_point() during inference fail with a runtime error when traced.


Example model: "hf-internal-testing/tiny-random-bert"

from optimum.intel import OVModelForSequenceClassification, OVQuantizer

transformers_model = OVModelForSequenceClassification.auto_model_class.from_pretrained("hf-internal-testing/tiny-random-bert")
quantizer = OVQuantizer.from_pretrained(transformers_model, task=OVModelForSequenceClassification.export_feature)
quantizer.quantize(save_directory="tmp", weights_only=True)
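
For reference, the pattern that breaks tracing can be reduced to a tiny standalone module. This is a hedged sketch, not code from the report: once weight compression replaces the parameters with integer (e.g. uint8) tensors, torch.is_floating_point() starts returning False, self.dtype resolves to an integer dtype, and torch.finfo() then raises.

import torch
from torch import nn

class MaskedLinear(nn.Module):
    """Minimal module whose forward depends on the dtype of its own parameters."""

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(4, 4)

    @property
    def dtype(self) -> torch.dtype:
        # Roughly mirrors how transformers resolves self.dtype: the dtype of the
        # first floating-point parameter, falling back to the last dtype seen.
        last = None
        for param in self.parameters():
            last = param.dtype
            if torch.is_floating_point(param):
                return param.dtype
        return last

    def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
        # With compressed (uint8) weights self.dtype is no longer floating point,
        # so torch.finfo(self.dtype) raises the TypeError shown in the trace below.
        mask = (1.0 - mask.to(dtype=self.dtype)) * torch.finfo(self.dtype).min
        return self.linear(x) + mask

Tracing this module with floating-point parameters works; once the weights are swapped for uint8 tensors the same forward fails at torch.finfo, which is exactly what happens inside get_extended_attention_mask in the trace below.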

Trace log:

../../../../venv/nncf_torch/lib/python3.8/site-packages/nncf/quantization/algorithms/weight_compression/algorithm.py:270: in apply
    transformed_model = self.do_compression(model, graph, nodes_to_compress, activations)
../../../../venv/nncf_torch/lib/python3.8/site-packages/nncf/quantization/algorithms/weight_compression/algorithm.py:316: in do_compression
    transformed_model = self._backend_entity.transform_model(
../../../../venv/nncf_torch/lib/python3.8/site-packages/nncf/quantization/algorithms/weight_compression/torch_backend.py:238: in transform_model
    transformed_model = PTModelTransformer(model).transform(transformation_layout)
../../../../venv/nncf_torch/lib/python3.8/site-packages/nncf/torch/model_transformer.py:76: in transform
    model.nncf.rebuild_graph()
../../../../venv/nncf_torch/lib/python3.8/site-packages/nncf/torch/nncf_network.py:510: in rebuild_graph
    compressed_traced_graph = builder.build_dynamic_graph(
../../../../venv/nncf_torch/lib/python3.8/site-packages/nncf/torch/graph/graph_builder.py:50: in build_dynamic_graph
    return tracer.trace_graph(model, context_to_use, as_eval, trace_parameters)
../../../../venv/nncf_torch/lib/python3.8/site-packages/nncf/torch/dynamic_graph/graph_tracer.py:56: in trace_graph
    self.custom_forward_fn(model)
../../../../venv/nncf_torch/lib/python3.8/site-packages/nncf/torch/dynamic_graph/graph_tracer.py:100: in default_dummy_forward_fn
    retval = model(*args, **kwargs)
../../../../venv/nncf_torch/lib/python3.8/site-packages/nncf/torch/nncf_network.py:973: in __call__
    return ORIGINAL_CALL(self, *args, **kwargs)
../../../../venv/nncf_torch/lib/python3.8/site-packages/torch/nn/modules/module.py:1518: in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
../../../../venv/nncf_torch/lib/python3.8/site-packages/torch/nn/modules/module.py:1527: in _call_impl
    return forward_call(*args, **kwargs)
../../../../venv/nncf_torch/lib/python3.8/site-packages/nncf/torch/nncf_network.py:1005: in forward
    retval = wrap_module_call(self.nncf._original_unbound_forward)(self, *args, **kwargs)
../../../../venv/nncf_torch/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py:157: in wrapped
    retval = module_call(self, *args, **kwargs)
../../../../venv/nncf_torch/lib/python3.8/site-packages/transformers/models/bert/modeling_bert.py:1564: in forward
    outputs = self.bert(
../../../../venv/nncf_torch/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py:157: in wrapped
    retval = module_call(self, *args, **kwargs)
../../../../venv/nncf_torch/lib/python3.8/site-packages/torch/nn/modules/module.py:1518: in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
../../../../venv/nncf_torch/lib/python3.8/site-packages/torch/nn/modules/module.py:1527: in _call_impl
    return forward_call(*args, **kwargs)
../../../../venv/nncf_torch/lib/python3.8/site-packages/transformers/models/bert/modeling_bert.py:986: in forward
    extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(1124, 32, padding_idx=0)
    (position_emb...
  (pooler): BertPooler(
    (dense): Linear(in_features=32, out_features=32, bias=True)
    (activation): Tanh()
  )
)
attention_mask = tensor([[1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0],
        [0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0]])
input_shape = torch.Size([2, 16]), device = None, dtype = torch.uint8

    def get_extended_attention_mask(
        self, attention_mask: Tensor, input_shape: Tuple[int], device: torch.device = None, dtype: torch.float = None
    ) -> Tensor:
        """
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
    
        Arguments:
            attention_mask (`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            input_shape (`Tuple[int]`):
                The shape of the input to the model.
    
        Returns:
            `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
        """
        if dtype is None:
            dtype = self.dtype
    
        if not (attention_mask.dim() == 2 and self.config.is_decoder):
            # show warning only if it won't be shown in `create_extended_attention_mask_for_decoder`
            if device is not None:
                warnings.warn(
                    "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning
                )
        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        if attention_mask.dim() == 3:
            extended_attention_mask = attention_mask[:, None, :, :]
        elif attention_mask.dim() == 2:
            # Provided a padding mask of dimensions [batch_size, seq_length]
            # - if the model is a decoder, apply a causal mask in addition to the padding mask
            # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
            if self.config.is_decoder:
                extended_attention_mask = ModuleUtilsMixin.create_extended_attention_mask_for_decoder(
                    input_shape, attention_mask, device
                )
            else:
                extended_attention_mask = attention_mask[:, None, None, :]
        else:
            raise ValueError(
                f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})"
            )
    
        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and the dtype's smallest value for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        extended_attention_mask = extended_attention_mask.to(dtype=dtype)  # fp16 compatibility
>       extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(dtype).min
E       TypeError: torch.finfo() requires a floating point input type. Use torch.iinfo to handle 'torch.uint8'
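
The last frame reduces to torch.finfo rejecting integer dtypes: after weight compression the model's parameters are uint8, so self.dtype resolves to torch.uint8 (see the locals above) and the mask computation raises. The failure can be reproduced in isolation (dtype taken from the locals shown above):

import torch

# self.dtype resolves to torch.uint8 once the weights are compressed (see locals above)
torch.finfo(torch.uint8)
# TypeError: torch.finfo() requires a floating point input type. Use torch.iinfo to handle 'torch.uint8'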