ccpem/mrcfile

test_data_is_not_read_if_dimensions_are_too_huge fails on i586

pgajdos opened this issue · 7 comments

Hi,

I get following failure:

[   21s] =================================== FAILURES ===================================
[   21s] ________ MrcMemmapTest.test_data_is_not_read_if_dimensions_are_too_huge ________
[   21s] 
[   21s] self = <tests.test_mrcmemmap.MrcMemmapTest testMethod=test_data_is_not_read_if_dimensions_are_too_huge>
[   21s] 
[   21s]     def test_data_is_not_read_if_dimensions_are_too_huge(self):
[   21s]         # Prepare x, y and z counts to try to trigger an out-of-memory error
[   21s]         # The individual values need to fit in int32 values to be stored in the header
[   21s]         # and their product must be as large as possible while still less than
[   21s]         # sys.maxsize (if larger, it triggers an index overflow error instead)
[   21s]         max_i4 = np.iinfo('i4').max
[   21s]         max_arr = sys.maxsize
[   21s]         nx = max_i4
[   21s]         ny = min(max_i4, max_arr // nx)
[   21s]         nz = min(max_i4, max_arr // (nx * ny))
[   21s]         # Check that an allocation of this size really does cause a memory error
[   21s]         with self.assertRaises(MemoryError):
[   21s]             _ = bytearray(nx * ny * nz)
[   21s]     
[   21s]         # Now put these values into a file
[   21s]         with self.newmrc(self.temp_mrc_name, mode='w+') as mrc:
[   21s]             mrc.set_data(np.arange(24, dtype=np.int8).reshape(2, 3, 4))
[   21s]             mrc.header.mx = mrc.header.nx = nx
[   21s]             mrc.header.my = mrc.header.ny = ny
[   21s]             mrc.header.mz = mrc.header.nz = nz
[   21s]     
[   21s]         # And now check that if we open the file, we avoid the problem and don't try
[   21s]         # to allocate enough memory to cause an out-of-memory error
[   21s]         with warnings.catch_warnings(record=True) as w:
[   21s]             warnings.simplefilter("always")
[   21s] >           with self.newmrc(self.temp_mrc_name, permissive=True) as mrc:
[   21s] 
[   21s] tests/test_mrcfile.py:352: 
[   21s] _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
[   21s] mrcfile/mrcfile.py:115: in __init__
[   21s]     self._read(header_only)
[   21s] mrcfile/mrcfile.py:131: in _read
[   21s]     super(MrcFile, self)._read(header_only)
[   21s] mrcfile/mrcinterpreter.py:176: in _read
[   21s]     self._read_data()
[   21s] mrcfile/mrcmemmap.py:112: in _read_data
[   21s]     self._open_memmap(dtype, shape)
[   21s] mrcfile/mrcmemmap.py:121: in _open_memmap
[   21s]     self._data = np.memmap(self._iostream,
[   21s] _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
[   21s] 
[   21s] subtype = <class 'numpy.memmap'>
[   21s] filename = <_io.BufferedReader name='/tmp/tmpbfu8zgrg/test_mrcfile.mrc'>
[   21s] dtype = dtype('int8'), mode = 'r', offset = 1024, shape = (1, 1, 2147483647)
[   21s] order = 'C'
[   21s] 
[   21s]     def __new__(subtype, filename, dtype=uint8, mode='r+', offset=0,
[   21s]                 shape=None, order='C'):
[   21s]         # Import here to minimize 'import numpy' overhead
[   21s]         import mmap
[   21s]         import os.path
[   21s]         try:
[   21s]             mode = mode_equivalents[mode]
[   21s]         except KeyError as e:
[   21s]             if mode not in valid_filemodes:
[   21s]                 raise ValueError(
[   21s]                     "mode must be one of {!r} (got {!r})"
[   21s]                     .format(valid_filemodes + list(mode_equivalents.keys()), mode)
[   21s]                 ) from None
[   21s]     
[   21s]         if mode == 'w+' and shape is None:
[   21s]             raise ValueError("shape must be given")
[   21s]     
[   21s]         if hasattr(filename, 'read'):
[   21s]             f_ctx = nullcontext(filename)
[   21s]         else:
[   21s]             f_ctx = open(os_fspath(filename), ('r' if mode == 'c' else mode)+'b')
[   21s]     
[   21s]         with f_ctx as fid:
[   21s]             fid.seek(0, 2)
[   21s]             flen = fid.tell()
[   21s]             descr = dtypedescr(dtype)
[   21s]             _dbytes = descr.itemsize
[   21s]     
[   21s]             if shape is None:
[   21s]                 bytes = flen - offset
[   21s]                 if bytes % _dbytes:
[   21s]                     raise ValueError("Size of available data is not a "
[   21s]                             "multiple of the data-type size.")
[   21s]                 size = bytes // _dbytes
[   21s]                 shape = (size,)
[   21s]             else:
[   21s]                 if not isinstance(shape, tuple):
[   21s]                     shape = (shape,)
[   21s]                 size = np.intp(1)  # avoid default choice of np.int_, which might overflow
[   21s]                 for k in shape:
[   21s]                     size *= k
[   21s]     
[   21s]             bytes = int(offset + size*_dbytes)
[   21s]     
[   21s]             if mode in ('w+', 'r+') and flen < bytes:
[   21s]                 fid.seek(bytes - 1, 0)
[   21s]                 fid.write(b'\0')
[   21s]                 fid.flush()
[   21s]     
[   21s]             if mode == 'c':
[   21s]                 acc = mmap.ACCESS_COPY
[   21s]             elif mode == 'r':
[   21s]                 acc = mmap.ACCESS_READ
[   21s]             else:
[   21s]                 acc = mmap.ACCESS_WRITE
[   21s]     
[   21s]             start = offset - offset % mmap.ALLOCATIONGRANULARITY
[   21s]             bytes -= start
[   21s]             array_offset = offset - start
[   21s] >           mm = mmap.mmap(fid.fileno(), bytes, access=acc, offset=start)
[   21s] E           OverflowError: memory mapped length must be positive
[   21s] 
[   21s] /usr/lib/python3.11/site-packages/numpy/core/memmap.py:267: OverflowError
[   21s] =============================== warnings summary ===============================

It would be very welcome if you would find time to look at it or would give me any pointers how to debug.

Cheers

Tested with 3.11, 3.10 and 3.9.
No other test does fail on that architecture.

Thanks for the report. I will try to look at it at some point, but for now I think you can safely ignore this test failure. This particular test is likely to be sensitive to hardware details and if it's failing it's very likely to be a problem with the test itself rather than the main mrcfile code.

I'm getting the same on Fedora rawhide (with both python 3.11 and the upcoming 3.12). It's likely a 32-bit related issue as the test is failing on ARM 32-bit and x86 32-bit arches. I'll skip the test as suggested for now.

I haven't been able to reproduce this error, but a change has recently been committed (see #61) which looks like it might be related. Could you try running this again and see if the latest version of mrcfile fixes the test that was failing?

Unfortunately, this test is still failing with 1.5.2, which contains this commit:

+ python3 -m unittest tests
/builddir/build/BUILD/python-mrcfile-1.5.2-build/mrcfile-1.5.2/tests/test_load_functions.py:108: SyntaxWarning: invalid escape sequence '\.'
  with self.assertRaisesRegex(ValueError, "call 'mrcfile\.new\(\)'"):
.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................F.........................................................................................................................................................................................................
======================================================================
FAIL: test_data_is_not_read_if_dimensions_are_too_huge (tests.test_mrcmemmap.MrcMemmapTest.test_data_is_not_read_if_dimensions_are_too_huge)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/builddir/build/BUILD/python-mrcfile-1.5.2-build/mrcfile-1.5.2/tests/test_mrcfile.py", line 358, in test_data_is_not_read_if_dimensions_are_too_huge
    assert len(w) == 1
           ^^^^^^^^^^^
AssertionError

----------------------------------------------------------------------
Ran 811 tests in 17.638s

FAILED (failures=1)

However, the following patch makes the test pass on x86 32-bit:

--- mrcmemmap.py.orig	2024-07-18 16:55:06.000000000 +0200
+++ mrcmemmap.py	2024-07-20 08:22:31.747617907 +0200
@@ -114,7 +114,7 @@
     def _open_memmap(self, dtype, shape):
         """Open a new memmap array pointing at the file's data block."""
         acc_mode = 'r' if self._read_only else 'r+'
-        header_nbytes = int(self.header.nbytes + self.header.nsymbt)
+        header_nbytes = np.int64(self.header.nbytes + self.header.nsymbt)
         
         self._iostream.flush()
         try:

Would you like me to submit a PR?

Thanks for the update. I've decided to leave this as a Python int because that's what Numpy documents that the type should be. np.int64 does indeed work, but it feels like it's taking advantage of an internal implementation detail in Numpy so probably better avoided unless it's really necessary.

The test that was failing doesn't really make sense on a 32-bit system anyway. I've made a few changes to improve the handling of large files in general, and avoid problems from int32 limits, and that particular test (and a few new ones) are now simply skipped on 32-bit systems.

If you come across any other test failures or problems using mrcfile on a 32-bit system, do please open a new issue.