Skip to content

[Bug]: compound Dtypes do not support custom chunking and compression with Zarr. #1296

@pauladkisson

Description

@pauladkisson

What happened?

I was working on catalystneuro/neuroconv#1003 and ran into yet another issue with chunking and compression on compound data types (e.g., PlaneSegmentation.pixel_mask).

This time I noticed that when writing such data types with the Zarr backend, any custom data I/O options were discarded and replaced with the Zarr defaults (Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)). See the minimal example below:

Steps to Reproduce

import os
import shutil
from pathlib import Path

import numcodecs
from hdmf_zarr import ZarrDataIO
from hdmf_zarr.nwb import NWBZarrIO
from pynwb.ophys import PlaneSegmentation
from pynwb.testing.mock.file import mock_NWBFile
from pynwb.testing.mock.ophys import mock_ImagingPlane

def main():
    """Reproduce the hdmf-zarr compound dtype compressor issue.

    Builds a mock NWB file containing a ``PlaneSegmentation`` whose
    ``pixel_mask`` column is wrapped in a ``ZarrDataIO`` requesting
    ``GZip(level=1)``, writes it with ``NWBZarrIO``, reads it back, and
    asserts that the compressor found on the stored data equals the one
    that was requested.

    Raises:
        AssertionError: If the compressor read back from the file differs
            from ``numcodecs.GZip(level=1)``.
    """
    output_path = Path("test_compound_dtype_bug.nwb.zarr")

    # Start from a clean slate: remove any leftover output from a prior run.
    if output_path.exists():
        if not output_path.is_dir():
            os.remove(output_path)
        else:
            shutil.rmtree(output_path)

    # Minimal in-memory NWB file with one imaging plane.
    nwbfile = mock_NWBFile()
    plane = mock_ImagingPlane(nwbfile=nwbfile)

    segmentation = PlaneSegmentation(
        description="Test plane segmentation for compound dtype bug demonstration",
        imaging_plane=plane,
        name="TestPlaneSegmentation",
    )

    # Compound-dtype pixel mask: every ROI contributes the same three
    # (x, x, 1.0) entries, and the index stores the cumulative end offset
    # of each ROI within the flattened mask list.
    roi_count = 10
    pixels_per_roi = 3
    pixel_mask = [
        (x, x, 1.0)
        for _ in range(roi_count)
        for x in range(pixels_per_roi)
    ]
    pixel_mask_index = [pixels_per_roi * (k + 1) for k in range(roi_count)]

    # Request a non-default compressor for the compound-dtype column.
    data = ZarrDataIO(data=pixel_mask, compressor=numcodecs.GZip(level=1))

    # One table row per ROI, then attach the indexed pixel_mask column.
    for _ in pixel_mask_index:
        segmentation.add_row()
    segmentation.add_column(
        name="pixel_mask",
        description="Pixel mask for each ROI",
        data=data,
        index=pixel_mask_index,
    )

    # Register the table under an "ophys" processing module.
    nwbfile.create_processing_module("ophys", "ophys processing module")
    nwbfile.processing["ophys"].add(segmentation)

    store_path = str(output_path)
    with NWBZarrIO(store_path, mode="w") as io:
        io.write(nwbfile)

    expected_compressor = numcodecs.GZip(level=1)

    # Read the file back and inspect the compressor on the stored column.
    with NWBZarrIO(store_path, mode="r") as io:
        read_nwbfile = io.read()
        stored_table = read_nwbfile.processing["ophys"]["TestPlaneSegmentation"]
        # Observed value: Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
        actual_compressor = stored_table.pixel_mask.data.compressor

    assert actual_compressor == expected_compressor, "Compressor does not match expected GZip level 1"

# Run the reproduction only when this file is executed directly as a script.
if __name__ == "__main__":
    main()

Traceback

python minimal_compound_dtype_bug_example.py
/Users/pauladkisson/Documents/CatalystNeuro/Neuroconv/neuroconv/minimal_compound_dtype_bug_example.py:52: UserWarning: Column 'pixel_mask' is predefined in PlaneSegmentation with index=True which does not match the entered index argument. The predefined index spec will be ignored. Please ensure the new column complies with the spec. This will raise an error in a future version of HDMF.
  plane_segmentation.add_column(name="pixel_mask", description="Pixel mask for each ROI", data=data, index=pixel_mask_index)
Traceback (most recent call last):
  File "/Users/pauladkisson/Documents/CatalystNeuro/Neuroconv/neuroconv/minimal_compound_dtype_bug_example.py", line 72, in <module>
    main()
  File "/Users/pauladkisson/Documents/CatalystNeuro/Neuroconv/neuroconv/minimal_compound_dtype_bug_example.py", line 69, in main
    assert actual_compressor == expected_compressor, "Compressor does not match expected GZip level 1"
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: Compressor does not match expected GZip level 1

Operating System

macOS

Python Version

3.12

Package Versions

No response

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions