Test failure in CUDA 11

Question

Test failure in CUDA 11

minghanz opened this issue 3 years ago · 2 comments

Describe the problem
2 out of 4 tests failed with differences exceeding the margin of error.

Error message

============================ test session starts =============================
platform linux -- Python 3.7.10, pytest-6.2.5, py-1.10.0, pluggy-1.0.0
rootdir: /home/minghanz/repos/torch-batch-svd
plugins: anyio-3.3.0
collected 4 items                                                            

test.py F..F                                                           [100%]

================================== FAILURES ==================================
_________________________________ test_float _________________________________

    def test_float():
        torch.manual_seed(0)
        a = torch.randn(N, H, W).cuda()
        b = a.clone()
        a.requires_grad = True
        b.requires_grad = True
    
        U, S, V = svd(a)
        loss = U.sum() + S.sum() + V.sum()
        loss.backward()
    
        u, s, v = torch.svd(b[0], some=True, compute_uv=True)
        loss0 = u.sum() + s.sum() + v.sum()
        loss0.backward()
    
        # eigenvectors are only precise up to sign
        testing.assert_allclose(U[0].abs(), u.abs())
        testing.assert_allclose(S[0].abs(), s.abs())
        testing.assert_allclose(V[0].abs(), v.abs())
>       testing.assert_allclose(a, U @ torch.diag_embed(S) @ V.transpose(-2, -1))

test.py:28: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

actual = tensor([[[-1.1258, -1.1524, -0.2506],
         [-0.4339,  0.8487,  0.6920],
         [-0.3160, -2.1152,  0.3223],
    ...77],
         [ 1.4223,  0.2985,  0.0924],
         [-1.0208,  0.3279,  0.0111]]], device='cuda:0', requires_grad=True)
expected = tensor([[[-1.1258, -1.1528, -0.2509],
         [-0.4338,  0.8488,  0.6922],
         [-0.3161, -2.1155,  0.3225],
    ....4220,  0.2986,  0.0924],
         [-1.0208,  0.3279,  0.0110]]], device='cuda:0',
       grad_fn=<UnsafeViewBackward>)
rtol = 0.0001, atol = 1e-05, equal_nan = True
msg = 'With rtol=0.0001 and atol=1e-05, found 1976 element(s) (out of 2700) whose difference(s) exceeded the margin of error...difference was 0.0013537406921386719 (-3.153724431991577 vs. -3.1523706912994385), which occurred at index (17, 6, 2).'

    def assert_allclose(actual, expected, rtol=None, atol=None, equal_nan=True, msg='') -> None:
        if not isinstance(actual, torch.Tensor):
            actual = torch.tensor(actual)
        if not isinstance(expected, torch.Tensor):
            expected = torch.tensor(expected, dtype=actual.dtype)
        if expected.shape != actual.shape:
            raise AssertionError("expected tensor shape {0} doesn't match with actual tensor "
                                 "shape {1}!".format(expected.shape, actual.shape))
        if rtol is None or atol is None:
            if rtol is not None or atol is not None:
                raise ValueError("rtol and atol must both be specified or both be unspecified")
            rtol, atol = _get_default_tolerance(actual, expected)
    
        result, debug_msg = _compare_tensors_internal(actual, expected,
                                                      rtol=rtol, atol=atol,
                                                      equal_nan=equal_nan)
    
        if result:
            return
    
        if msg is None or msg == '':
            msg = debug_msg
    
>       raise AssertionError(msg)
E       AssertionError: With rtol=0.0001 and atol=1e-05, found 1976 element(s) (out of 2700) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.0013537406921386719 (-3.153724431991577 vs. -3.1523706912994385), which occurred at index (17, 6, 2).

../../../anaconda3/envs/pytorch3d/lib/python3.7/site-packages/torch/testing/_core.py:270: AssertionError
_____________________________ test_multiple_gpus _____________________________

    def test_multiple_gpus():
        num_gpus = torch.cuda.device_count()
    
        for gpu_idx in range(num_gpus):
            device = torch.device('cuda:{}'.format(gpu_idx))
    
            torch.manual_seed(0)
            a = torch.randn(N, H, W).to(device)
            b = a.clone()
            a.requires_grad = True
            b.requires_grad = True
    
            U, S, V = svd(a)
            loss = U.sum() + S.sum() + V.sum()
            loss.backward()
    
            u, s, v = torch.svd(b[0], some=True, compute_uv=True)
            loss0 = u.sum() + s.sum() + v.sum()
            loss0.backward()
    
            # eigenvectors are only precise up to sign
            testing.assert_allclose(U[0].abs(), u.abs())
            testing.assert_allclose(S[0].abs(), s.abs())
            testing.assert_allclose(V[0].abs(), v.abs())
            testing.assert_allclose(a,
>                                   U @ torch.diag_embed(S) @ V.transpose(-2, -1))

test.py:104: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

actual = tensor([[[-1.1258, -1.1524, -0.2506],
         [-0.4339,  0.8487,  0.6920],
         [-0.3160, -2.1152,  0.3223],
    ...77],
         [ 1.4223,  0.2985,  0.0924],
         [-1.0208,  0.3279,  0.0111]]], device='cuda:0', requires_grad=True)
expected = tensor([[[-1.1258, -1.1528, -0.2509],
         [-0.4338,  0.8488,  0.6922],
         [-0.3161, -2.1155,  0.3225],
    ....4220,  0.2986,  0.0924],
         [-1.0208,  0.3279,  0.0110]]], device='cuda:0',
       grad_fn=<UnsafeViewBackward>)
rtol = 0.0001, atol = 1e-05, equal_nan = True
msg = 'With rtol=0.0001 and atol=1e-05, found 1976 element(s) (out of 2700) whose difference(s) exceeded the margin of error...difference was 0.0013537406921386719 (-3.153724431991577 vs. -3.1523706912994385), which occurred at index (17, 6, 2).'

    def assert_allclose(actual, expected, rtol=None, atol=None, equal_nan=True, msg='') -> None:
        if not isinstance(actual, torch.Tensor):
            actual = torch.tensor(actual)
        if not isinstance(expected, torch.Tensor):
            expected = torch.tensor(expected, dtype=actual.dtype)
        if expected.shape != actual.shape:
            raise AssertionError("expected tensor shape {0} doesn't match with actual tensor "
                                 "shape {1}!".format(expected.shape, actual.shape))
        if rtol is None or atol is None:
            if rtol is not None or atol is not None:
                raise ValueError("rtol and atol must both be specified or both be unspecified")
            rtol, atol = _get_default_tolerance(actual, expected)
    
        result, debug_msg = _compare_tensors_internal(actual, expected,
                                                      rtol=rtol, atol=atol,
                                                      equal_nan=equal_nan)
    
        if result:
            return
    
        if msg is None or msg == '':
            msg = debug_msg
    
>       raise AssertionError(msg)
E       AssertionError: With rtol=0.0001 and atol=1e-05, found 1976 element(s) (out of 2700) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.0013537406921386719 (-3.153724431991577 vs. -3.1523706912994385), which occurred at index (17, 6, 2).

../../../anaconda3/envs/pytorch3d/lib/python3.7/site-packages/torch/testing/_core.py:270: AssertionError
========================== short test summary info ===========================
FAILED test.py::test_float - AssertionError: With rtol=0.0001 and atol=1e-0...
FAILED test.py::test_multiple_gpus - AssertionError: With rtol=0.0001 and a...
======================== 2 failed, 2 passed in 2.45s =========================

Environments

OS: Ubuntu 20.04.3 LTS
CUDA version: 11.1.74 (reported by conda list cudatoolkit)
PyTorch version: 1.9.0
Python version: 3.7.10

How to Reproduce
Just run python -m pytest test.py

Answer 1 · 2021-10-25T01:12:50.000Z

Thanks for flagging. I think it is due to numerical error.

Answer 2 · 2021-10-25T01:33:23.000Z

However, I recently found that PyTorch float32 matrix multiplication on CUDA 11 gives slightly different results depending on whether the operation runs on the CPU or on the GPU. The CPU result is consistent with the NumPy multiplication result, so I suspect that the problem is on the PyTorch side. I am just providing some information from my side; I'd appreciate it if you could confirm what the actual problem is.