PyG CI Failures
puririshi98 opened this issue · 4 comments
🐛 Describe the bug
The latest pytorch_geometric with the latest pyg_lib gets a segfault when running the CI in an environment where CUDA and pyg_lib are available (therefore RGCNConv uses the CPU pyg_lib backend, which seems to be broken).
/workspace/pytorch_geometric# pytest --cov --cov-report=xml
======================================================= FAILURES =======================================================
_________________________________________ test_to_hetero_and_rgcn_equal_output _________________________________________
def test_to_hetero_and_rgcn_equal_output():
torch.manual_seed(1234)
# Run `RGCN`:
x = torch.randn(10, 16) # 6 paper nodes, 4 author nodes
adj = (torch.rand(10, 10) > 0.5)
adj[6:, 6:] = False
edge_index = adj.nonzero(as_tuple=False).t().contiguous()
row, col = edge_index
# # 0 = paper<->paper, 1 = paper->author, 2 = author->paper
edge_type = torch.full((edge_index.size(1), ), -1, dtype=torch.long)
edge_type[(row < 6) & (col < 6)] = 0
edge_type[(row < 6) & (col >= 6)] = 1
edge_type[(row >= 6) & (col < 6)] = 2
assert edge_type.min() == 0
conv = RGCNConv(16, 32, num_relations=3)
out1 = conv(x, edge_index, edge_type)
# Run `to_hetero`:
x_dict = {
'paper': x[:6],
'author': x[6:],
}
edge_index_dict = {
('paper', '_', 'paper'):
edge_index[:, edge_type == 0],
('paper', '_', 'author'):
edge_index[:, edge_type == 1] - torch.tensor([[0], [6]]),
('author', '_', 'paper'):
edge_index[:, edge_type == 2] - torch.tensor([[6], [0]]),
}
node_types, edge_types = list(x_dict.keys()), list(edge_index_dict.keys())
adj_t_dict = {
key: SparseTensor.from_edge_index(edge_index).t()
for key, edge_index in edge_index_dict.items()
}
model = to_hetero(RGCN(16, 32), (node_types, edge_types))
# Set model weights:
for i, edge_type in enumerate(edge_types):
weight = model.conv['__'.join(edge_type)].lin.weight
weight.data = conv.weight[i].data.t()
for i, node_type in enumerate(node_types):
model.lin[node_type].weight.data = conv.root.data.t()
model.lin[node_type].bias.data = conv.bias.data
out2 = model(x_dict, edge_index_dict)
out2 = torch.cat([out2['paper'], out2['author']], dim=0)
> assert torch.allclose(out1, out2, atol=1e-6)
E assert False
E + where False = <built-in method allclose of type object at 0x7f45065379a0>(tensor([[-0.4873, -0.0432, -0.0032, -0.7257, 0.4246, -0.5593, 0.2248, 0.0758,\n -0.3861, -0.4978, 0.6873, ...5599,\n 0.2734, 0.8062, 0.3442, 1.8065, -1.0037, -0.5862, 0.3200, -1.7757]],\n grad_fn=<AddBackward0>), tensor([[-0.4568, 0.1837, 0.0086, -0.9106, 0.4647, -1.2277, 1.0352, -0.0584,\n -0.6260, -0.6004, 1.9123, ...5599,\n 0.2734, 0.8062, 0.3442, 1.8065, -1.0037, -0.5862, 0.3200, -1.7757]],\n grad_fn=<CatBackward0>), atol=1e-06)
E + where <built-in method allclose of type object at 0x7f45065379a0> = torch.allclose
test/nn/test_to_hetero_transformer.py:390: AssertionError
____________________________________________ test_rgcn_conv_equality[conf0] ____________________________________________
conf = (None, None)
@pytest.mark.parametrize('conf', confs)
def test_rgcn_conv_equality(conf):
num_bases, num_blocks = conf
x1 = torch.randn(4, 4)
edge_index = torch.tensor([[0, 1, 1, 2, 2, 3], [0, 0, 1, 0, 1, 1]])
edge_type = torch.tensor([0, 1, 1, 0, 0, 1])
edge_index = torch.tensor([
[0, 1, 1, 2, 2, 3, 0, 1, 1, 2, 2, 3],
[0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1],
])
edge_type = torch.tensor([0, 1, 1, 0, 0, 1, 2, 3, 3, 2, 2, 3])
torch.manual_seed(12345)
conv1 = RGCNConv(4, 32, 4, num_bases, num_blocks)
torch.manual_seed(12345)
conv2 = FastRGCNConv(4, 32, 4, num_bases, num_blocks)
out1 = conv1(x1, edge_index, edge_type)
out2 = conv2(x1, edge_index, edge_type)
> assert torch.allclose(out1, out2, atol=1e-6)
E assert False
E + where False = <built-in method allclose of type object at 0x7f45065379a0>(tensor([[ 0.1372, -0.0832, 0.7295, 0.1608, 0.6120, -0.7268, -0.1438, 0.8274,\n 0.0621, -0.2741, -0.0108, ...0206,\n 0.4476, -0.5212, -0.1653, 0.6561, 0.5276, 0.7402, 0.4341, -0.1560]],\n grad_fn=<AddBackward0>), tensor([[ 3.2177e-01, 2.9880e-01, 7.2184e-01, 3.9274e-01, 1.0374e+00,\n -7.6988e-01, -4.9286e-01, 1.2883e+...7e-01, -1.6527e-01, 6.5613e-01, 5.2758e-01, 7.4016e-01,\n 4.3408e-01, -1.5605e-01]], grad_fn=<AddBackward0>), atol=1e-06)
E + where <built-in method allclose of type object at 0x7f45065379a0> = torch.allclose
test/nn/conv/test_rgcn_conv.py:36: AssertionError
____________________________________________ test_rgcn_conv[RGCNConv-conf0] ____________________________________________
cls = <class 'torch_geometric.nn.conv.rgcn_conv.RGCNConv'>, conf = (None, None)
@pytest.mark.parametrize('cls,conf', product(classes, confs))
def test_rgcn_conv(cls, conf):
num_bases, num_blocks = conf
x1 = torch.randn(4, 4)
x2 = torch.randn(2, 16)
idx1 = torch.arange(4)
idx2 = torch.arange(2)
edge_index = torch.tensor([[0, 1, 1, 2, 2, 3], [0, 0, 1, 0, 1, 1]])
edge_type = torch.tensor([0, 1, 1, 0, 0, 1])
row, col = edge_index
adj = SparseTensor(row=row, col=col, value=edge_type, sparse_sizes=(4, 4))
conv = cls(4, 32, 2, num_bases, num_blocks)
assert conv.__repr__() == f'{cls.__name__}(4, 32, num_relations=2)'
out1 = conv(x1, edge_index, edge_type)
assert out1.size() == (4, 32)
> assert conv(x1, adj.t()).tolist() == out1.tolist()
E AssertionError: assert [[0.094849109...5622253, ...]] == [[-0.37521255...5622253, ...]]
E At index 0 diff: [0.0948491096496582, -0.0819135308265686, 1.3105871677398682, 0.40821221470832825, 0.9653813242912292, -0.9601163268089294, 0.8856582641601562, 0.0513533353805542, -0.2117443084716797, -1.0377789735794067, -0.5760844945907593, -1.245593786239624, 0.12341594696044922, 0.41952651739120483, -0.541046142578125, -0.8374639749526978, 0.4925721287727356, -0.4119006097316742, -0.25780433416366577, 0.9387804269790649, 1.1101524829864502, -0.3150181472301483, 0.8848719000816345, 0.20270687341690063, -0.051341116428375244, -0.3655675947666168, 0.4692113399505615, ...
E
E ...Full output truncated (2 lines hidden), use '-vv' to show
test/nn/conv/test_rgcn_conv.py:61: AssertionError
__________________________________________________ test_mask_feature ___________________________________________________
def test_mask_feature():
x = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]],
dtype=torch.float)
out = mask_feature(x, training=False)
assert out[0].tolist() == x.tolist()
assert torch.all(out[1])
torch.manual_seed(4)
out = mask_feature(x)
assert out[0].tolist() == [[1.0, 2.0, 0.0, 0.0], [5.0, 6.0, 0.0, 0.0],
[9.0, 10.0, 0.0, 0.0]]
assert out[1].tolist() == [[True, True, False, False]]
torch.manual_seed(5)
out = mask_feature(x, mode='row')
assert out[0].tolist() == [[1.0, 2.0, 3.0, 4.0], [0.0, 0.0, 0.0, 0.0],
[9.0, 10.0, 11.0, 12.0]]
assert out[1].tolist() == [[True], [False], [True]]
torch.manual_seed(7)
out = mask_feature(x, mode='all')
> assert out[0].tolist() == [[1.0, 0.0, 0.0, 4.0], [5.0, 6.0, 7.0, 0.0],
[9.0, 10.0, 0.0, 12.0]]
E assert [[1.0, 2.0, 0..., 11.0, 12.0]] == [[1.0, 0.0, 0...0, 0.0, 12.0]]
E At index 0 diff: [1.0, 2.0, 0.0, 0.0] != [1.0, 0.0, 0.0, 4.0]
E Use -v to get the full diff
FAILED test/nn/test_to_hetero_transformer.py::test_to_hetero_and_rgcn_equal_output - assert False
FAILED test/nn/conv/test_rgcn_conv.py::test_rgcn_conv_equality[conf0] - assert False
FAILED test/nn/conv/test_rgcn_conv.py::test_rgcn_conv[RGCNConv-conf0] - AssertionError: assert [[0.094849109...562225...
FAILED test/utils/test_augmentation.py::test_mask_feature - assert [[1.0, 2.0, 0..., 11.0, 12.0]] == [[1.0, 0.0, 0......
Environment
latest pyg and pyg-lib main branch
`test_ops` passes for me, and the C++ tests work as well. Can you try installing the latest nightly and running again?
Hi @puririshi98, I didn't observe any issues with the pyg-lib tests on my machine. I can confirm, though, that with pyg-lib enabled the tests from test_rgcn_conv.py are failing; in my environment there is no segmentation fault, however: the failure happens on the `torch.allclose` check. I checked the calculation made in the `message()` method, and the `segment_matmul` output appears to be correct. Does the RGCN test work on GPU? (I cannot confirm; GEMM initialization always fails on my machine's GPU: GTX 1060 + cu116.)
Update:
In `test/nn/conv/test_rgcn_conv.py::test_rgcn_conv_equality[conf0]`, it happens that `ptr` splits `inputs` into equal chunks of memory, so it's a perfect case for `torch.bmm`. I ran an experiment, and it produces exactly the same results as `segment_matmul` from pyg-lib, so I'm guessing that the issue lies in the RGCN code (the path where we use `segment_matmul`).
Thanks for confirming @dszwicht :) PyG tests are indeed failing for RGCN - I will take a look over the weekend.
Yeah, I'm not sure what happened. I wiped my environment and reinstalled everything, and it was fine. Sorry for the confusion.