DART: Only basic types allowed in this operation (BYTE given)
bertwesarg opened this issue · 2 comments
bertwesarg commented
Error with 512 units:
[ 7 ERROR ] [ 1115435975.583 ] dart_globmem.c :451 !!! DART: dart_team_memalloc_aligned ! Only basic types allowed in this operation (BYTE given)
[ 7 ERROR ] [ 24829 ] AllocationPolicy.h :178 | GlobalAllocationPolicy.do_global_allocate(nlocal)| cannot allocate global memory segment 0
terminate called after throwing an instance of 'std::bad_alloc'
what(): std::bad_alloc
[taurusi6415:24829] *** Process received signal ***
[taurusi6415:24829] Signal: Aborted (6)
[taurusi6415:24829] Signal code: (-6)
[taurusi6415:24829] [ 0] /usr/lib64/libpthread.so.0(+0xf5e0)[0x2b0dba09f5e0]
[taurusi6415:24829] [ 1] /usr/lib64/libc.so.6(gsignal+0x37)[0x2b0dba7961f7]
[taurusi6415:24829] [ 2] /usr/lib64/libc.so.6(abort+0x148)[0x2b0dba7978e8]
[taurusi6415:24829] [ 3] /sw/installed/GCCcore/8.2.0/lib64/libstdc++.so.6(+0x9f8f3)[0x2b0dba34b8f3]
[taurusi6415:24829] [ 4] /sw/installed/GCCcore/8.2.0/lib64/libstdc++.so.6(+0xa5a76)[0x2b0dba351a76]
[taurusi6415:24829] [ 5] /sw/installed/GCCcore/8.2.0/lib64/libstdc++.so.6(+0xa5ab1)[0x2b0dba351ab1]
[taurusi6415:24829] [ 6] /sw/installed/GCCcore/8.2.0/lib64/libstdc++.so.6(+0xa5ce4)[0x2b0dba351ce4]
[taurusi6415:24829] [ 7] /home/h8/wesarg/SPEC/620.dashmg/_build/dashMg-5-5-5-512-16-12279022/../test-allreduce[0x40ce6e]
[taurusi6415:24829] [ 8] /home/h8/wesarg/SPEC/620.dashmg/_build/dashMg-5-5-5-512-16-12279022/../test-allreduce[0x40db71]
[taurusi6415:24829] [ 9] /home/h8/wesarg/SPEC/620.dashmg/_build/dashMg-5-5-5-512-16-12279022/../test-allreduce[0x407539]
[taurusi6415:24829] [10] /usr/lib64/libc.so.6(__libc_start_main+0xf5)[0x2b0dba782c05]
[taurusi6415:24829] [11] /home/h8/wesarg/SPEC/620.dashmg/_build/dashMg-5-5-5-512-16-12279022/../test-allreduce[0x407b37]
[taurusi6415:24829] *** End of error message ***
srun: error: taurusi6415: task 7: Aborted
Always rank 7; not hardware-related (tried two different sets of nodes). Works with 256 units.
Reproducer:
#include <iostream>
#include <libdash.h>
int
main(int ac, char *av[])
{
using TeamSpecT = dash::TeamSpec<3>;
using MatrixT = dash::NArray<double, 3>;
using PatternT = typename MatrixT::pattern_type;
using SizeSpecT = dash::SizeSpec<3>;
using DistSpecT = dash::DistributionSpec<3>;
dash::init(&ac, &av);
auto size_spec = SizeSpecT(64, 64, 64);
auto dist_spec = DistSpecT(dash::BLOCKED, dash::BLOCKED, dash::BLOCKED);
std::vector<MatrixT*> levels;
auto* team_current = &dash::Team::All();
TeamSpecT team_all_spec(team_current->size(), 1, 1);
team_all_spec.balance_extents();
levels.push_back(new MatrixT(size_spec, dist_spec, *team_current, team_all_spec));
levels.push_back(new MatrixT(size_spec, dist_spec, *team_current, team_all_spec));
levels.push_back(new MatrixT(size_spec, dist_spec, *team_current, team_all_spec));
levels.push_back(new MatrixT(size_spec, dist_spec, *team_current, team_all_spec));
levels.push_back(new MatrixT(size_spec, dist_spec, *team_current, team_all_spec));
levels.push_back(new MatrixT(size_spec, dist_spec, *team_current, team_all_spec));
levels.push_back(new MatrixT(size_spec, dist_spec, *team_current, team_all_spec));
levels.push_back(new MatrixT(size_spec, dist_spec, *team_current, team_all_spec));
levels.push_back(new MatrixT(size_spec, dist_spec, *team_current, team_all_spec));
levels.push_back(new MatrixT(size_spec, dist_spec, *team_current, team_all_spec));
levels.push_back(new MatrixT(size_spec, dist_spec, *team_current, team_all_spec));
levels.push_back(new MatrixT(size_spec, dist_spec, *team_current, team_all_spec));
size_t counter = team_all_spec.extent(0);
while (counter > 1) {
if (0 == dash::myid())
std::cout << counter << std::endl;
team_current = &team_current->split(8);
TeamSpecT team_current_spec(team_current->size(), 1, 1);
team_current_spec.balance_extents();
counter = team_current_spec.extent(0);
if (!team_current->is_null() && 0 == team_current->position()) {
levels.push_back(new MatrixT(size_spec, dist_spec, *team_current, team_current_spec));
levels.push_back(new MatrixT(size_spec, dist_spec, *team_current, team_current_spec));
levels.push_back(new MatrixT(size_spec, dist_spec, *team_current, team_current_spec));
levels.push_back(new MatrixT(size_spec, dist_spec, *team_current, team_current_spec));
levels.push_back(new MatrixT(size_spec, dist_spec, *team_current, team_current_spec));
levels.push_back(new MatrixT(size_spec, dist_spec, *team_current, team_current_spec));
levels.push_back(new MatrixT(size_spec, dist_spec, *team_current, team_current_spec));
levels.push_back(new MatrixT(size_spec, dist_spec, *team_current, team_current_spec));
levels.push_back(new MatrixT(size_spec, dist_spec, *team_current, team_current_spec));
levels.push_back(new MatrixT(size_spec, dist_spec, *team_current, team_current_spec));
levels.push_back(new MatrixT(size_spec, dist_spec, *team_current, team_current_spec));
levels.push_back(new MatrixT(size_spec, dist_spec, *team_current, team_current_spec));
} else {
break;
}
}
dash::Array<double> centralized(dash::size(), dash::BLOCKCYCLIC(dash::size()), dash::Team::All());
dash::finalize();
return 0;
}
bertwesarg commented
For 2048 ranks, 32 are failing (units 160-191).
For 4096 ranks, 64 are failing (units 320-383).
Looks like a pattern :(
devreal commented
I'm looking into it right now, waiting for nodes on the system :)