hidet-org/hidet

[Bug] Outputs of torch.to abnormally mismatch on GPU when adding extra output var

Azyka opened this issue · 1 comment

Describe the bug
When an extra torch.to node is added as an output of this model:

class Model0(torch.nn.Module):
    def forward(self, *args):
        to = args[0].to(dtype = torch.float32)
        return (to)

New model with the extra output:

class Model1(torch.nn.Module):
    def forward(self, *args):
        to = args[0].to(dtype = torch.float32)
        to_1 = args[0].to(dtype = torch.float32)
        return (to, to_1)

The output of torch.to is expected to be the same for the same input. However, the outputs of the two models mismatch. The mismatch is observed only on CUDA.
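
A condensed version of the full repro below, as a sketch only (it assumes hidet is installed and registered as a torch.compile backend and that a CUDA device is available):

import torch

class Model1(torch.nn.Module):
    def forward(self, x):
        return x.to(dtype=torch.float32), x.to(dtype=torch.float32)

x = torch.rand(41, dtype=torch.float16, device='cuda')
out0, out1 = torch.compile(Model1(), fullgraph=True, backend='hidet')(x)
# Both outputs should equal the eager float32 cast; on the affected setup
# one of them comes back as all zeros.
torch.testing.assert_close(out0, x.to(dtype=torch.float32))
torch.testing.assert_close(out1, x.to(dtype=torch.float32))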

To Reproduce
Repro script:

import numpy as np
from numpy import testing
import torch

DEVICE='cuda'

class Model0(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, *args):
        to = args[0].to(dtype = torch.float32)
        return (to)

model_0 = Model0()
output_names_0 = ['v0_0']

class Model1(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, *args):
        to = args[0].to(dtype = torch.float32)
        to_1 = args[0].to(dtype = torch.float32)
        return (to, to_1)

model_1 = Model1()
output_names_1 = ['v5_0', 'v0_0']

data = np.random.rand(41).astype(np.float16)
input_data_0 = [data]

optmodel_0 = torch.compile(model_0, fullgraph=True, backend='hidet', mode=None)
model_out_0 = optmodel_0(*[torch.from_numpy(v).to(DEVICE) for v in input_data_0])
model_out_0 = [v.to(DEVICE).detach() for v in model_out_0] if isinstance(model_out_0, tuple) else [model_out_0.to(DEVICE).detach()]
model_out_0 = [v.cpu().resolve_conj().numpy() if v.is_conj() else v.cpu().numpy() for v in model_out_0]
output_0 = dict(zip(output_names_0, model_out_0))

input_data_1 = [data]

optmodel_1 = torch.compile(model_1, fullgraph=True, backend='hidet', mode=None)
model_out_1 = optmodel_1(*[torch.from_numpy(v).to(DEVICE) for v in input_data_1])
model_out_1 = [v.to(DEVICE).detach() for v in model_out_1] if isinstance(model_out_1, tuple) else [model_out_1.to(DEVICE).detach()]
model_out_1 = [v.cpu().resolve_conj().numpy() if v.is_conj() else v.cpu().numpy() for v in model_out_1]
output_1 = dict(zip(output_names_1, model_out_1))
output_name_dict = {'v0_0': 'v0_0'}

print('=========================')
try:
    for tensor_name_0, tensor_name_1 in output_name_dict.items():
        testing.assert_allclose(output_0[tensor_name_0], output_1[tensor_name_1], rtol=1, err_msg=f'at {tensor_name_0}, {tensor_name_1}')
    print("hidet does not trigger assertion")
except AssertionError as e:
    print("hidet triggers assertion")
    print(e)
print('=========================')

model_out_0 = model_0(*[torch.from_numpy(v).to(DEVICE) for v in input_data_0])
model_out_0 = [v.to(DEVICE).detach() for v in model_out_0] if isinstance(model_out_0, tuple) else [model_out_0.to(DEVICE).detach()]
model_out_0 = [v.cpu().resolve_conj().numpy() if v.is_conj() else v.cpu().numpy() for v in model_out_0]
output_0 = dict(zip(output_names_0, model_out_0))

model_out_1 = model_1(*[torch.from_numpy(v).to(DEVICE) for v in input_data_1])
model_out_1 = [v.to(DEVICE).detach() for v in model_out_1] if isinstance(model_out_1, tuple) else [model_out_1.to(DEVICE).detach()]
model_out_1 = [v.cpu().resolve_conj().numpy() if v.is_conj() else v.cpu().numpy() for v in model_out_1]
output_1 = dict(zip(output_names_1, model_out_1))

print('=========================')
try:
    for tensor_name_0, tensor_name_1 in output_name_dict.items():
        testing.assert_allclose(output_0[tensor_name_0], output_1[tensor_name_1], rtol=1, err_msg=f'at {tensor_name_0}, {tensor_name_1}')
    print("torch_eager does not trigger assertion")
except AssertionError as e:
    print("torch_eager triggers assertion")
    print(e)
print('=========================')

Output:

=========================
hidet triggers assertion

Not equal to tolerance rtol=1, atol=0
at v0_0, v0_0
Mismatched elements: 41 / 41 (100%)
Max absolute difference: 0.98828125
Max relative difference: inf
 x: array([3.936768e-02, 6.499023e-01, 1.479492e-01, 3.615723e-01,
       7.329102e-01, 4.431152e-01, 4.995117e-01, 9.067383e-01,
       9.796143e-02, 7.086182e-02, 6.757812e-01, 2.117920e-01,...
 y: array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0.], dtype=float32)
=========================
=========================
torch_eager does not trigger assertion
=========================

Expected behavior
The output of torch.to should be identical for the same input, regardless of how many outputs the model returns.
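
As a minimal illustration of that expectation (a sketch, assuming the hidet backend and a CUDA device are available), the two casts should also agree with each other, since they apply the same op to the same input:

import torch

def f(x):
    return x.to(dtype=torch.float32), x.to(dtype=torch.float32)

x = torch.rand(41, dtype=torch.float16, device='cuda')
a, b = torch.compile(f, fullgraph=True, backend='hidet')(x)
assert torch.equal(a, b), "duplicate torch.to outputs should be identical"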

Environment

  • OS: Ubuntu 22.04.3 LTS (x86_64)
  • GPU: RTX 1660
  • NVIDIA GPU Driver: 525.147.05
  • Hidet Version: 0.3.0
  • PyTorch Version: 2.1.0+cu118

Fixed in #384. Thanks for your efforts on it! @Aalanli and @yaoyaoding