DerivativesDataSink not preventing gzip mtime from being set, leading to non-deterministic file checksums

As reported by @glatard on Mattermost, NIfTI images (sub-032633/ses-001/anat/sub-032633_ses-001_run-1_space-MNI152NLin2009cAsym_desc-preproc_T1w.nii.gz) are being produced with identical uncompressed contents but different compressed checksums. This is almost certainly the mtime being set because NIfTI images are being written by nibabel in this section:

niworkflows/niworkflows/interfaces/bids.py

Lines 579 to 644 in 11ac7a1

    
           self._results["compression"].append(_copy_any(orig_file, str(out_file))) 
        
           is_nifti = out_file.name.endswith( 
        
               (".nii", ".nii.gz") 
        
           ) and not out_file.name.endswith((".dtseries.nii", ".dtseries.nii.gz")) 
        
           data_dtype = self.inputs.data_dtype or DEFAULT_DTYPES[self.inputs.suffix] 
        
           if is_nifti and any((self.inputs.check_hdr, data_dtype)): 
        
               # Do not use mmap; if we need to access the data at all, it will be to 
        
               # rewrite, risking a BusError 
        
               nii = nb.load(out_file, mmap=False) 
        
               if self.inputs.check_hdr: 
        
                   hdr = nii.header 
        
                   curr_units = tuple( 
        
                       [None if u == "unknown" else u for u in hdr.get_xyzt_units()] 
        
                   ) 
        
                   curr_codes = (int(hdr["qform_code"]), int(hdr["sform_code"])) 
        
                   # Default to mm, use sec if data type is bold 
        
                   units = ( 
        
                       curr_units[0] or "mm", 
        
                       "sec" if out_entities["suffix"] == "bold" else None, 
        
                   ) 
        
                   xcodes = (1, 1)  # Derivative in its original scanner space 
        
                   if self.inputs.space: 
        
                       xcodes = ( 
        
                           (4, 4) if self.inputs.space in STANDARD_SPACES else (2, 2) 
        
                       ) 
        
                   if curr_codes != xcodes or curr_units != units: 
        
                       self._results["fixed_hdr"][i] = True 
        
                       hdr.set_qform(nii.affine, xcodes[0]) 
        
                       hdr.set_sform(nii.affine, xcodes[1]) 
        
                       hdr.set_xyzt_units(*units) 
        
                       # Rewrite file with new header 
        
                       overwrite_header(nii, out_file) 
        
               if data_dtype == "source":  # match source dtype 
        
                   try: 
        
                       data_dtype = nb.load(self.inputs.source_file).get_data_dtype() 
        
                   except Exception: 
        
                       LOGGER.warning( 
        
                           f"Could not get data type of file {self.inputs.source_file}" 
        
                       ) 
        
                       data_dtype = None 
        
               if data_dtype: 
        
                   if self.inputs.check_hdr: 
        
                       # load updated NIfTI 
        
                       nii = nb.load(out_file, mmap=False) 
        
                   data_dtype = np.dtype(data_dtype) 
        
                   orig_dtype = nii.get_data_dtype() 
        
                   if orig_dtype != data_dtype: 
        
                       LOGGER.warning( 
        
                           f"Changing {out_file} dtype from {orig_dtype} to {data_dtype}" 
        
                       ) 
        
                       # coerce dataobj to new data dtype 
        
                       if np.issubdtype(data_dtype, np.integer): 
        
                           new_data = np.rint(nii.dataobj).astype(data_dtype) 
        
                       else: 
        
                           new_data = np.asanyarray(nii.dataobj, dtype=data_dtype) 
        
                       # and set header to match 
        
                       nii.set_data_dtype(data_dtype) 
        
                       nii = nii.__class__(new_data, nii.affine, nii.header) 
        
                       nii.to_filename(out_file)

I would suggest we move to the following approach:

img = None
if <detect need to modify header>:
    img = <create img>

if img is not None:
    <write img deterministically>
else:
    <copy file with gzip cleaning>

This is an LTS-affecting bug, so any fix should target niworkflows 1.3.4.

This solution nipy/nibabel#1023 (comment) could actually make the non-deterministic file checksum a non-issue.
lmk @effigies

Yes, if you want to use tail -c 8 as a check, that seems fine to me. I still think that a deterministic checksum is a good idea.

	self._results["compression"].append(_copy_any(orig_file, str(out_file)))

	is_nifti = out_file.name.endswith(
	(".nii", ".nii.gz")
	) and not out_file.name.endswith((".dtseries.nii", ".dtseries.nii.gz"))
	data_dtype = self.inputs.data_dtype or DEFAULT_DTYPES[self.inputs.suffix]
	if is_nifti and any((self.inputs.check_hdr, data_dtype)):
	# Do not use mmap; if we need to access the data at all, it will be to
	# rewrite, risking a BusError
	nii = nb.load(out_file, mmap=False)

	if self.inputs.check_hdr:
	hdr = nii.header
	curr_units = tuple(
	[None if u == "unknown" else u for u in hdr.get_xyzt_units()]
	)
	curr_codes = (int(hdr["qform_code"]), int(hdr["sform_code"]))

	# Default to mm, use sec if data type is bold
	units = (
	curr_units[0] or "mm",
	"sec" if out_entities["suffix"] == "bold" else None,
	)
	xcodes = (1, 1) # Derivative in its original scanner space
	if self.inputs.space:
	xcodes = (
	(4, 4) if self.inputs.space in STANDARD_SPACES else (2, 2)
	)

	if curr_codes != xcodes or curr_units != units:
	self._results["fixed_hdr"][i] = True
	hdr.set_qform(nii.affine, xcodes[0])
	hdr.set_sform(nii.affine, xcodes[1])
	hdr.set_xyzt_units(*units)

	# Rewrite file with new header
	overwrite_header(nii, out_file)

	if data_dtype == "source": # match source dtype
	try:
	data_dtype = nb.load(self.inputs.source_file).get_data_dtype()
	except Exception:
	LOGGER.warning(
	f"Could not get data type of file {self.inputs.source_file}"
	)
	data_dtype = None

	if data_dtype:
	if self.inputs.check_hdr:
	# load updated NIfTI
	nii = nb.load(out_file, mmap=False)
	data_dtype = np.dtype(data_dtype)
	orig_dtype = nii.get_data_dtype()
	if orig_dtype != data_dtype:
	LOGGER.warning(
	f"Changing {out_file} dtype from {orig_dtype} to {data_dtype}"
	)
	# coerce dataobj to new data dtype
	if np.issubdtype(data_dtype, np.integer):
	new_data = np.rint(nii.dataobj).astype(data_dtype)
	else:
	new_data = np.asanyarray(nii.dataobj, dtype=data_dtype)
	# and set header to match
	nii.set_data_dtype(data_dtype)
	nii = nii.__class__(new_data, nii.affine, nii.header)
	nii.to_filename(out_file)