Test failure in CUDA 11
minghanz opened this issue · 2 comments
minghanz commented
Describe the problem
2 out of 4 tests failed with differences exceeding the margin of error.
Error message
============================ test session starts =============================
platform linux -- Python 3.7.10, pytest-6.2.5, py-1.10.0, pluggy-1.0.0
rootdir: /home/minghanz/repos/torch-batch-svd
plugins: anyio-3.3.0
collected 4 items
test.py F..F [100%]
================================== FAILURES ==================================
_________________________________ test_float _________________________________
def test_float():
torch.manual_seed(0)
a = torch.randn(N, H, W).cuda()
b = a.clone()
a.requires_grad = True
b.requires_grad = True
U, S, V = svd(a)
loss = U.sum() + S.sum() + V.sum()
loss.backward()
u, s, v = torch.svd(b[0], some=True, compute_uv=True)
loss0 = u.sum() + s.sum() + v.sum()
loss0.backward()
# eigenvectors are only precise up to sign
testing.assert_allclose(U[0].abs(), u.abs())
testing.assert_allclose(S[0].abs(), s.abs())
testing.assert_allclose(V[0].abs(), v.abs())
> testing.assert_allclose(a, U @ torch.diag_embed(S) @ V.transpose(-2, -1))
test.py:28:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
actual = tensor([[[-1.1258, -1.1524, -0.2506],
[-0.4339, 0.8487, 0.6920],
[-0.3160, -2.1152, 0.3223],
...77],
[ 1.4223, 0.2985, 0.0924],
[-1.0208, 0.3279, 0.0111]]], device='cuda:0', requires_grad=True)
expected = tensor([[[-1.1258, -1.1528, -0.2509],
[-0.4338, 0.8488, 0.6922],
[-0.3161, -2.1155, 0.3225],
....4220, 0.2986, 0.0924],
[-1.0208, 0.3279, 0.0110]]], device='cuda:0',
grad_fn=<UnsafeViewBackward>)
rtol = 0.0001, atol = 1e-05, equal_nan = True
msg = 'With rtol=0.0001 and atol=1e-05, found 1976 element(s) (out of 2700) whose difference(s) exceeded the margin of error...difference was 0.0013537406921386719 (-3.153724431991577 vs. -3.1523706912994385), which occurred at index (17, 6, 2).'
def assert_allclose(actual, expected, rtol=None, atol=None, equal_nan=True, msg='') -> None:
if not isinstance(actual, torch.Tensor):
actual = torch.tensor(actual)
if not isinstance(expected, torch.Tensor):
expected = torch.tensor(expected, dtype=actual.dtype)
if expected.shape != actual.shape:
raise AssertionError("expected tensor shape {0} doesn't match with actual tensor "
"shape {1}!".format(expected.shape, actual.shape))
if rtol is None or atol is None:
if rtol is not None or atol is not None:
raise ValueError("rtol and atol must both be specified or both be unspecified")
rtol, atol = _get_default_tolerance(actual, expected)
result, debug_msg = _compare_tensors_internal(actual, expected,
rtol=rtol, atol=atol,
equal_nan=equal_nan)
if result:
return
if msg is None or msg == '':
msg = debug_msg
> raise AssertionError(msg)
E AssertionError: With rtol=0.0001 and atol=1e-05, found 1976 element(s) (out of 2700) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.0013537406921386719 (-3.153724431991577 vs. -3.1523706912994385), which occurred at index (17, 6, 2).
../../../anaconda3/envs/pytorch3d/lib/python3.7/site-packages/torch/testing/_core.py:270: AssertionError
_____________________________ test_multiple_gpus _____________________________
def test_multiple_gpus():
num_gpus = torch.cuda.device_count()
for gpu_idx in range(num_gpus):
device = torch.device('cuda:{}'.format(gpu_idx))
torch.manual_seed(0)
a = torch.randn(N, H, W).to(device)
b = a.clone()
a.requires_grad = True
b.requires_grad = True
U, S, V = svd(a)
loss = U.sum() + S.sum() + V.sum()
loss.backward()
u, s, v = torch.svd(b[0], some=True, compute_uv=True)
loss0 = u.sum() + s.sum() + v.sum()
loss0.backward()
# eigenvectors are only precise up to sign
testing.assert_allclose(U[0].abs(), u.abs())
testing.assert_allclose(S[0].abs(), s.abs())
testing.assert_allclose(V[0].abs(), v.abs())
testing.assert_allclose(a,
> U @ torch.diag_embed(S) @ V.transpose(-2, -1))
test.py:104:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
actual = tensor([[[-1.1258, -1.1524, -0.2506],
[-0.4339, 0.8487, 0.6920],
[-0.3160, -2.1152, 0.3223],
...77],
[ 1.4223, 0.2985, 0.0924],
[-1.0208, 0.3279, 0.0111]]], device='cuda:0', requires_grad=True)
expected = tensor([[[-1.1258, -1.1528, -0.2509],
[-0.4338, 0.8488, 0.6922],
[-0.3161, -2.1155, 0.3225],
....4220, 0.2986, 0.0924],
[-1.0208, 0.3279, 0.0110]]], device='cuda:0',
grad_fn=<UnsafeViewBackward>)
rtol = 0.0001, atol = 1e-05, equal_nan = True
msg = 'With rtol=0.0001 and atol=1e-05, found 1976 element(s) (out of 2700) whose difference(s) exceeded the margin of error...difference was 0.0013537406921386719 (-3.153724431991577 vs. -3.1523706912994385), which occurred at index (17, 6, 2).'
def assert_allclose(actual, expected, rtol=None, atol=None, equal_nan=True, msg='') -> None:
if not isinstance(actual, torch.Tensor):
actual = torch.tensor(actual)
if not isinstance(expected, torch.Tensor):
expected = torch.tensor(expected, dtype=actual.dtype)
if expected.shape != actual.shape:
raise AssertionError("expected tensor shape {0} doesn't match with actual tensor "
"shape {1}!".format(expected.shape, actual.shape))
if rtol is None or atol is None:
if rtol is not None or atol is not None:
raise ValueError("rtol and atol must both be specified or both be unspecified")
rtol, atol = _get_default_tolerance(actual, expected)
result, debug_msg = _compare_tensors_internal(actual, expected,
rtol=rtol, atol=atol,
equal_nan=equal_nan)
if result:
return
if msg is None or msg == '':
msg = debug_msg
> raise AssertionError(msg)
E AssertionError: With rtol=0.0001 and atol=1e-05, found 1976 element(s) (out of 2700) whose difference(s) exceeded the margin of error (including 0 nan comparisons). The greatest difference was 0.0013537406921386719 (-3.153724431991577 vs. -3.1523706912994385), which occurred at index (17, 6, 2).
../../../anaconda3/envs/pytorch3d/lib/python3.7/site-packages/torch/testing/_core.py:270: AssertionError
========================== short test summary info ===========================
FAILED test.py::test_float - AssertionError: With rtol=0.0001 and atol=1e-0...
FAILED test.py::test_multiple_gpus - AssertionError: With rtol=0.0001 and a...
======================== 2 failed, 2 passed in 2.45s =========================
Environments
- OS: ubuntu 20.04.3 LTS
- CUDA version: 11.1.74 (reported by `conda list cudatoolkit`)
- PyTorch version: 1.9.0
- Python version: 3.7.10
How to Reproduce
Just run `python -m pytest test.py`.
KinglittleQ commented
Thanks for flagging. I think it is due to numerical error.
minghanz commented
However, I recently found that PyTorch float32 matrix multiplication on CUDA 11 gives slightly different results on the GPU than on the CPU. The CPU result is consistent with the NumPy multiplication result, so I suspect that the problem is on the PyTorch side. I am just sharing what I have observed on my side; I'd appreciate it if you could confirm what the actual problem is.