PyG CI Failures

Question

PyG CI Failures

puririshi98 opened this issue 2 years ago · 4 comments

🐛 Describe the bug

The latest pytorch_geometric with the latest pyg_lib gets a segfault when running the CI in an environment where CUDA and pyg_lib are both available (so RGCNConv uses the CPU pyg_lib backend, which seems to be broken).

/workspace/pytorch_geometric# pytest --cov --cov-report=xml
======================================================= FAILURES =======================================================
_________________________________________ test_to_hetero_and_rgcn_equal_output _________________________________________

    def test_to_hetero_and_rgcn_equal_output():
        torch.manual_seed(1234)

        # Run `RGCN`:
        x = torch.randn(10, 16)  # 6 paper nodes, 4 author nodes
        adj = (torch.rand(10, 10) > 0.5)
        adj[6:, 6:] = False
        edge_index = adj.nonzero(as_tuple=False).t().contiguous()
        row, col = edge_index

        # # 0 = paper<->paper, 1 = paper->author, 2 = author->paper
        edge_type = torch.full((edge_index.size(1), ), -1, dtype=torch.long)
        edge_type[(row < 6) & (col < 6)] = 0
        edge_type[(row < 6) & (col >= 6)] = 1
        edge_type[(row >= 6) & (col < 6)] = 2
        assert edge_type.min() == 0

        conv = RGCNConv(16, 32, num_relations=3)
        out1 = conv(x, edge_index, edge_type)

        # Run `to_hetero`:
        x_dict = {
            'paper': x[:6],
            'author': x[6:],
        }
        edge_index_dict = {
            ('paper', '_', 'paper'):
            edge_index[:, edge_type == 0],
            ('paper', '_', 'author'):
            edge_index[:, edge_type == 1] - torch.tensor([[0], [6]]),
            ('author', '_', 'paper'):
            edge_index[:, edge_type == 2] - torch.tensor([[6], [0]]),
        }

        node_types, edge_types = list(x_dict.keys()), list(edge_index_dict.keys())

        adj_t_dict = {
            key: SparseTensor.from_edge_index(edge_index).t()
            for key, edge_index in edge_index_dict.items()
        }

        model = to_hetero(RGCN(16, 32), (node_types, edge_types))

        # Set model weights:
        for i, edge_type in enumerate(edge_types):
            weight = model.conv['__'.join(edge_type)].lin.weight
            weight.data = conv.weight[i].data.t()
        for i, node_type in enumerate(node_types):
            model.lin[node_type].weight.data = conv.root.data.t()
            model.lin[node_type].bias.data = conv.bias.data

        out2 = model(x_dict, edge_index_dict)
        out2 = torch.cat([out2['paper'], out2['author']], dim=0)
>       assert torch.allclose(out1, out2, atol=1e-6)
E       assert False
E        +  where False = <built-in method allclose of type object at 0x7f45065379a0>(tensor([[-0.4873, -0.0432, -0.0032, -0.7257,  0.4246, -0.5593,  0.2248,  0.0758,\n         -0.3861, -0.4978,  0.6873,  ...5599,\n          0.2734,  0.8062,  0.3442,  1.8065, -1.0037, -0.5862,  0.3200, -1.7757]],\n       grad_fn=<AddBackward0>), tensor([[-0.4568,  0.1837,  0.0086, -0.9106,  0.4647, -1.2277,  1.0352, -0.0584,\n         -0.6260, -0.6004,  1.9123,  ...5599,\n          0.2734,  0.8062,  0.3442,  1.8065, -1.0037, -0.5862,  0.3200, -1.7757]],\n       grad_fn=<CatBackward0>), atol=1e-06)
E        +    where <built-in method allclose of type object at 0x7f45065379a0> = torch.allclose

test/nn/test_to_hetero_transformer.py:390: AssertionError
____________________________________________ test_rgcn_conv_equality[conf0] ____________________________________________

conf = (None, None)

    @pytest.mark.parametrize('conf', confs)
    def test_rgcn_conv_equality(conf):
        num_bases, num_blocks = conf

        x1 = torch.randn(4, 4)
        edge_index = torch.tensor([[0, 1, 1, 2, 2, 3], [0, 0, 1, 0, 1, 1]])
        edge_type = torch.tensor([0, 1, 1, 0, 0, 1])

        edge_index = torch.tensor([
            [0, 1, 1, 2, 2, 3, 0, 1, 1, 2, 2, 3],
            [0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1],
        ])
        edge_type = torch.tensor([0, 1, 1, 0, 0, 1, 2, 3, 3, 2, 2, 3])

        torch.manual_seed(12345)
        conv1 = RGCNConv(4, 32, 4, num_bases, num_blocks)

        torch.manual_seed(12345)
        conv2 = FastRGCNConv(4, 32, 4, num_bases, num_blocks)

        out1 = conv1(x1, edge_index, edge_type)
        out2 = conv2(x1, edge_index, edge_type)
>       assert torch.allclose(out1, out2, atol=1e-6)
E       assert False
E        +  where False = <built-in method allclose of type object at 0x7f45065379a0>(tensor([[ 0.1372, -0.0832,  0.7295,  0.1608,  0.6120, -0.7268, -0.1438,  0.8274,\n          0.0621, -0.2741, -0.0108,  ...0206,\n          0.4476, -0.5212, -0.1653,  0.6561,  0.5276,  0.7402,  0.4341, -0.1560]],\n       grad_fn=<AddBackward0>), tensor([[ 3.2177e-01,  2.9880e-01,  7.2184e-01,  3.9274e-01,  1.0374e+00,\n         -7.6988e-01, -4.9286e-01,  1.2883e+...7e-01, -1.6527e-01,  6.5613e-01,  5.2758e-01,  7.4016e-01,\n          4.3408e-01, -1.5605e-01]], grad_fn=<AddBackward0>), atol=1e-06)
E        +    where <built-in method allclose of type object at 0x7f45065379a0> = torch.allclose

test/nn/conv/test_rgcn_conv.py:36: AssertionError
____________________________________________ test_rgcn_conv[RGCNConv-conf0] ____________________________________________

cls = <class 'torch_geometric.nn.conv.rgcn_conv.RGCNConv'>, conf = (None, None)

    @pytest.mark.parametrize('cls,conf', product(classes, confs))
    def test_rgcn_conv(cls, conf):
        num_bases, num_blocks = conf

        x1 = torch.randn(4, 4)
        x2 = torch.randn(2, 16)
        idx1 = torch.arange(4)
        idx2 = torch.arange(2)
        edge_index = torch.tensor([[0, 1, 1, 2, 2, 3], [0, 0, 1, 0, 1, 1]])
        edge_type = torch.tensor([0, 1, 1, 0, 0, 1])
        row, col = edge_index
        adj = SparseTensor(row=row, col=col, value=edge_type, sparse_sizes=(4, 4))

        conv = cls(4, 32, 2, num_bases, num_blocks)
        assert conv.__repr__() == f'{cls.__name__}(4, 32, num_relations=2)'
        out1 = conv(x1, edge_index, edge_type)
        assert out1.size() == (4, 32)
>       assert conv(x1, adj.t()).tolist() == out1.tolist()
E       AssertionError: assert [[0.094849109...5622253, ...]] == [[-0.37521255...5622253, ...]]
E         At index 0 diff: [0.0948491096496582, -0.0819135308265686, 1.3105871677398682, 0.40821221470832825, 0.9653813242912292, -0.9601163268089294, 0.8856582641601562, 0.0513533353805542, -0.2117443084716797, -1.0377789735794067, -0.5760844945907593, -1.245593786239624, 0.12341594696044922, 0.41952651739120483, -0.541046142578125, -0.8374639749526978, 0.4925721287727356, -0.4119006097316742, -0.25780433416366577, 0.9387804269790649, 1.1101524829864502, -0.3150181472301483, 0.8848719000816345, 0.20270687341690063, -0.051341116428375244, -0.3655675947666168, 0.4692113399505615, ...
E
E         ...Full output truncated (2 lines hidden), use '-vv' to show

test/nn/conv/test_rgcn_conv.py:61: AssertionError
__________________________________________________ test_mask_feature ___________________________________________________

    def test_mask_feature():
        x = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]],
                         dtype=torch.float)

        out = mask_feature(x, training=False)
        assert out[0].tolist() == x.tolist()
        assert torch.all(out[1])

        torch.manual_seed(4)
        out = mask_feature(x)
        assert out[0].tolist() == [[1.0, 2.0, 0.0, 0.0], [5.0, 6.0, 0.0, 0.0],
                                   [9.0, 10.0, 0.0, 0.0]]
        assert out[1].tolist() == [[True, True, False, False]]

        torch.manual_seed(5)
        out = mask_feature(x, mode='row')
        assert out[0].tolist() == [[1.0, 2.0, 3.0, 4.0], [0.0, 0.0, 0.0, 0.0],
                                   [9.0, 10.0, 11.0, 12.0]]
        assert out[1].tolist() == [[True], [False], [True]]

        torch.manual_seed(7)
        out = mask_feature(x, mode='all')
>       assert out[0].tolist() == [[1.0, 0.0, 0.0, 4.0], [5.0, 6.0, 7.0, 0.0],
                                   [9.0, 10.0, 0.0, 12.0]]
E       assert [[1.0, 2.0, 0..., 11.0, 12.0]] == [[1.0, 0.0, 0...0, 0.0, 12.0]]
E         At index 0 diff: [1.0, 2.0, 0.0, 0.0] != [1.0, 0.0, 0.0, 4.0]
E         Use -v to get the full diff
FAILED test/nn/test_to_hetero_transformer.py::test_to_hetero_and_rgcn_equal_output - assert False
FAILED test/nn/conv/test_rgcn_conv.py::test_rgcn_conv_equality[conf0] - assert False
FAILED test/nn/conv/test_rgcn_conv.py::test_rgcn_conv[RGCNConv-conf0] - AssertionError: assert [[0.094849109...562225...
FAILED test/utils/test_augmentation.py::test_mask_feature - assert [[1.0, 2.0, 0..., 11.0, 12.0]] == [[1.0, 0.0, 0......

Environment

latest pyg and pyg-lib main branch

Answer 1 · 2022-10-05T01:06:41.000Z

test_ops passes for me, and the C++ tests work as well. Can you try to install latest nightly and run again?

Answer 2 · 2022-10-05T09:27:28.000Z

Hi @puririshi98, I didn't observe any issues with the pyg-lib tests on my machine. I can confirm, though, that with pyg-lib enabled the tests from test_rgcn_conv.py are failing; in my environment there is no segmentation fault, however — the failure happens at the torch.allclose verification. I checked the calculation made in the message() method, and the segment_matmul output appears to be correct. Does the RGCN test work on GPU? (I cannot confirm; GEMM initialization always fails on my machine's GPU — GTX 1060 + cu116.)

Update:
In test/nn/conv/test_rgcn_conv.py::test_rgcn_conv_equality[conf0], ptr happens to split the inputs into equal chunks of memory, so it's a perfect case for torch.bmm. I ran an experiment with torch.bmm, and it produces exactly the same results as segment_matmul from pyg-lib, so I'm guessing that the issue lies in the RGCN code (the path where we use segment_matmul).

Answer 3 · 2022-10-05T14:49:33.000Z

Thanks for confirming @dszwicht :) PyG tests are indeed failing for RGCN - I will take a look over the weekend.

Answer 4 · 2022-10-05T16:19:27.000Z

Yeah, I'm not sure what happened; I wiped my environment and reinstalled everything, and it was fine. Sorry for the confusion.