mpi4py issue with large k-eigenvalue simulations in Lassen
northroj opened this issue · 4 comments
On Lassen, MCDC breaks after the first eigenvalue cycle if the number of histories per cycle is larger than 1e5 and numba is enabled.
# k k (avg)
==== ======= ===================
1 1.43481
Traceback (most recent call last):
File "/usr/WS1/northroj/SMR/mcdc/c5g7td/inffuel/input.py", line 78, in <module>
Traceback (most recent call last):
File "/usr/WS1/northroj/SMR/mcdc/c5g7td/inffuel/input.py", line 78, in <module>
Traceback (most recent call last):
File "/usr/WS1/northroj/SMR/mcdc/c5g7td/inffuel/input.py", line 78, in <module>
Traceback (most recent call last):
File "/usr/WS1/northroj/SMR/mcdc/c5g7td/inffuel/input.py", line 78, in <module>
Traceback (most recent call last):
File "/usr/WS1/northroj/SMR/mcdc/c5g7td/inffuel/input.py", line 78, in <module>
Traceback (most recent call last):
File "/usr/WS1/northroj/SMR/mcdc/c5g7td/inffuel/input.py", line 78, in <module>
Traceback (most recent call last):
File "/usr/WS1/northroj/SMR/mcdc/c5g7td/inffuel/input.py", line 78, in <module>
Traceback (most recent call last):
File "/usr/WS1/northroj/SMR/mcdc/c5g7td/inffuel/input.py", line 78, in <module>
Traceback (most recent call last):
File "/usr/WS1/northroj/SMR/mcdc/c5g7td/inffuel/input.py", line 78, in <module>
Traceback (most recent call last):
File "/usr/WS1/northroj/SMR/mcdc/c5g7td/inffuel/input.py", line 78, in <module>
Traceback (most recent call last):
File "/usr/WS1/northroj/SMR/mcdc/c5g7td/inffuel/input.py", line 78, in <module>
mcdc.run()
File "/usr/WS1/northroj/miniconda3/MCDC/mcdc/main.py", line 41, in run
mcdc.run()
File "/usr/WS1/northroj/miniconda3/MCDC/mcdc/main.py", line 41, in run
mcdc.run()
File "/usr/WS1/northroj/miniconda3/MCDC/mcdc/main.py", line 41, in run
mcdc.run()
File "/usr/WS1/northroj/miniconda3/MCDC/mcdc/main.py", line 41, in run
mcdc.run()
File "/usr/WS1/northroj/miniconda3/MCDC/mcdc/main.py", line 41, in run
mcdc.run()
File "/usr/WS1/northroj/miniconda3/MCDC/mcdc/main.py", line 41, in run
mcdc.run()
File "/usr/WS1/northroj/miniconda3/MCDC/mcdc/main.py", line 41, in run
mcdc.run()
File "/usr/WS1/northroj/miniconda3/MCDC/mcdc/main.py", line 41, in run
mcdc.run()
File "/usr/WS1/northroj/miniconda3/MCDC/mcdc/main.py", line 41, in run
mcdc.run()
File "/usr/WS1/northroj/miniconda3/MCDC/mcdc/main.py", line 41, in run
mcdc.run()
File "/usr/WS1/northroj/miniconda3/MCDC/mcdc/main.py", line 41, in run
loop_main(mcdc)
File "mpi4py/MPI/Comm.pyx", line 1438, in mpi4py.MPI.Comm.recv
loop_main(mcdc)
File "mpi4py/MPI/Comm.pyx", line 1438, in mpi4py.MPI.Comm.recv
loop_main(mcdc)
File "mpi4py/MPI/Comm.pyx", line 1438, in mpi4py.MPI.Comm.recv
loop_main(mcdc)
File "mpi4py/MPI/Comm.pyx", line 1438, in mpi4py.MPI.Comm.recv
loop_main(mcdc)
File "mpi4py/MPI/Comm.pyx", line 1438, in mpi4py.MPI.Comm.recv
loop_main(mcdc)
File "mpi4py/MPI/Comm.pyx", line 1438, in mpi4py.MPI.Comm.recv
loop_main(mcdc)
File "mpi4py/MPI/Comm.pyx", line 1438, in mpi4py.MPI.Comm.recv
loop_main(mcdc)
File "mpi4py/MPI/Comm.pyx", line 1438, in mpi4py.MPI.Comm.recv
loop_main(mcdc)
File "mpi4py/MPI/Comm.pyx", line 1438, in mpi4py.MPI.Comm.recv
loop_main(mcdc)
File "mpi4py/MPI/Comm.pyx", line 1438, in mpi4py.MPI.Comm.recv
loop_main(mcdc)
File "mpi4py/MPI/Comm.pyx", line 1438, in mpi4py.MPI.Comm.recv
File "mpi4py/MPI/msgpickle.pxi", line 341, in mpi4py.MPI.PyMPI_recv
File "mpi4py/MPI/msgpickle.pxi", line 341, in mpi4py.MPI.PyMPI_recv
File "mpi4py/MPI/msgpickle.pxi", line 341, in mpi4py.MPI.PyMPI_recv
File "mpi4py/MPI/msgpickle.pxi", line 341, in mpi4py.MPI.PyMPI_recv
File "mpi4py/MPI/msgpickle.pxi", line 341, in mpi4py.MPI.PyMPI_recv
File "mpi4py/MPI/msgpickle.pxi", line 341, in mpi4py.MPI.PyMPI_recv
File "mpi4py/MPI/msgpickle.pxi", line 341, in mpi4py.MPI.PyMPI_recv
File "mpi4py/MPI/msgpickle.pxi", line 341, in mpi4py.MPI.PyMPI_recv
File "mpi4py/MPI/msgpickle.pxi", line 341, in mpi4py.MPI.PyMPI_recv
File "mpi4py/MPI/msgpickle.pxi", line 341, in mpi4py.MPI.PyMPI_recv
File "mpi4py/MPI/msgpickle.pxi", line 341, in mpi4py.MPI.PyMPI_recv
File "mpi4py/MPI/msgpickle.pxi", line 306, in mpi4py.MPI.PyMPI_recv_match
File "mpi4py/MPI/msgpickle.pxi", line 306, in mpi4py.MPI.PyMPI_recv_match
File "mpi4py/MPI/msgpickle.pxi", line 306, in mpi4py.MPI.PyMPI_recv_match
File "mpi4py/MPI/msgpickle.pxi", line 306, in mpi4py.MPI.PyMPI_recv_match
File "mpi4py/MPI/msgpickle.pxi", line 152, in mpi4py.MPI.pickle_load
File "mpi4py/MPI/msgpickle.pxi", line 306, in mpi4py.MPI.PyMPI_recv_match
File "mpi4py/MPI/msgpickle.pxi", line 152, in mpi4py.MPI.pickle_load
File "mpi4py/MPI/msgpickle.pxi", line 306, in mpi4py.MPI.PyMPI_recv_match
File "mpi4py/MPI/msgpickle.pxi", line 152, in mpi4py.MPI.pickle_load
File "mpi4py/MPI/msgpickle.pxi", line 141, in mpi4py.MPI.cloads
_pickle.UnpicklingError: invalid load key, '_'.
File "mpi4py/MPI/msgpickle.pxi", line 306, in mpi4py.MPI.PyMPI_recv_match
File "mpi4py/MPI/msgpickle.pxi", line 152, in mpi4py.MPI.pickle_load
File "mpi4py/MPI/msgpickle.pxi", line 141, in mpi4py.MPI.cloads
_pickle.UnpicklingError: invalid load key, '\xba'.
File "mpi4py/MPI/msgpickle.pxi", line 306, in mpi4py.MPI.PyMPI_recv_match
File "mpi4py/MPI/msgpickle.pxi", line 152, in mpi4py.MPI.pickle_load
File "mpi4py/MPI/msgpickle.pxi", line 141, in mpi4py.MPI.cloads
_pickle.UnpicklingError: invalid load key, '\x00'.
File "mpi4py/MPI/msgpickle.pxi", line 306, in mpi4py.MPI.PyMPI_recv_match
File "mpi4py/MPI/msgpickle.pxi", line 152, in mpi4py.MPI.pickle_load
File "mpi4py/MPI/msgpickle.pxi", line 141, in mpi4py.MPI.cloads
_pickle.UnpicklingError: invalid load key, '\x00'.
File "mpi4py/MPI/msgpickle.pxi", line 152, in mpi4py.MPI.pickle_load
File "mpi4py/MPI/msgpickle.pxi", line 141, in mpi4py.MPI.cloads
OverflowError: BINBYTES exceeds system's maximum size of 9223372036854775807 bytes
File "mpi4py/MPI/msgpickle.pxi", line 306, in mpi4py.MPI.PyMPI_recv_match
File "mpi4py/MPI/msgpickle.pxi", line 152, in mpi4py.MPI.pickle_load
File "mpi4py/MPI/msgpickle.pxi", line 141, in mpi4py.MPI.cloads
_pickle.UnpicklingError: invalid load key, '\x0f'.
File "mpi4py/MPI/msgpickle.pxi", line 306, in mpi4py.MPI.PyMPI_recv_match
File "mpi4py/MPI/msgpickle.pxi", line 152, in mpi4py.MPI.pickle_load
File "mpi4py/MPI/msgpickle.pxi", line 141, in mpi4py.MPI.cloads
File "mpi4py/MPI/msgpickle.pxi", line 152, in mpi4py.MPI.pickle_load
File "mpi4py/MPI/msgpickle.pxi", line 141, in mpi4py.MPI.cloads
_pickle.UnpicklingError: invalid load key, '\xfe'.
File "mpi4py/MPI/msgpickle.pxi", line 152, in mpi4py.MPI.pickle_load
File "mpi4py/MPI/msgpickle.pxi", line 141, in mpi4py.MPI.cloads
_pickle.UnpicklingError: invalid load key, '\x00'.
_pickle.UnpicklingError: invalid load key, '\x00'.
File "mpi4py/MPI/msgpickle.pxi", line 141, in mpi4py.MPI.cloads
_pickle.UnpicklingError: unexpected MARK found
File "mpi4py/MPI/msgpickle.pxi", line 141, in mpi4py.MPI.cloads
_pickle.UnpicklingError: invalid load key, '\x9f'.
@spasmann: was this like any of the issues you have seen?
I've also encountered this issue, typically when running on the ND cluster. But I just ran the Takeda-1 problem with N=2e5 with 2 MPI processes and seem to get the same issue, although the output is somewhat different. This could be a problem with my installation of MPI. I will try again on the ND cluster when they are back online.
# k k (avg)
==== ======= ===================
1 0.41197================] 100%
Traceback (most recent call last):
File "C:\Users\Sam\Documents\Github\MCDC\examples\eigenvalue\takeda_1\input.py", line 192, in <module>
Traceback (most recent call last):
File "C:\Users\Sam\Documents\Github\MCDC\examples\eigenvalue\takeda_1\input.py", line 192, in <module>
mcdc.run()
File "c:\users\sam\documents\github\mcdc\mcdc\main.py", line 45, in run
mcdc.run()
File "c:\users\sam\documents\github\mcdc\mcdc\main.py", line 45, in run
loop_main(mcdc)
loop_main(mcdc)
File "mpi4py/MPI/Comm.pyx", line 1839, in mpi4py.MPI.Intracomm.Exscan
File "mpi4py/MPI/Comm.pyx", line 1839, in mpi4py.MPI.Intracomm.Exscan
File "mpi4py/MPI/msgbuffer.pxi", line 874, in mpi4py.MPI._p_msg_cco.for_exscan
File "mpi4py/MPI/msgbuffer.pxi", line 874, in mpi4py.MPI._p_msg_cco.for_exscan
ValueError: mismatch in send and receive MPI datatypes
ValueError: mismatch in send and receive MPI datatypes
I think these issues happen due to the use of the lowercase, instead of the uppercase, mpi4py functions — such as `send` instead of `Send`. The easier-to-use lowercase versions may be less portable compared to the uppercase ones, which are closer to the actual MPI functions.
Tested the other mpi modules on Lassen. spectrum-mpi/2019.06.24 didn't run at all, and spectrum-mpi/2020.08.19 and spectrum-mpi/test-rolling-release got similar errors to the rolling release.