How many times to call the filter in writing a single chunk


#1

Hi HDF5 Expert,
I am trying to use the ZSTD filter plugin on a 30000-by-6912 dataset, as shown below, to write the data. It works! Something weird, though: I found that the filter is called twice even though I write the whole chunk (data) with a single H5Dwrite. Can anyone help explain why? Also, once I add a few attributes to the dataset “A”, it seems to be called more than two times.

Thanks for your help.
Bin

==========
GROUP "/" {
DATASET "A" {
DATATYPE H5T_STD_I16LE
DATASPACE SIMPLE { ( 30000, 6912 ) / ( 30000, 6912 ) }
STORAGE_LAYOUT {
CHUNKED ( 30000, 6912 )
SIZE 326796913 (1.269:1 COMPRESSION)
}
FILTERS {
USER_DEFINED_FILTER {
FILTER_ID 32015
COMMENT Zstandard compression: http://www.zstd.net
PARAMS { 11 }
}
}

=======
I used https://github.com/aparamon/HDF5Plugin-Zstandard
I just added a printf right after the ZSTD_compress call. It is printed twice, so I think the filter is called twice.
Both calls have the same origSize, but the first one compresses really well.

aggression = 11, origSize =414720000, compSize = 12678
aggression = 11, origSize =414720000, compSize = 326796913

=====
my code looks like the below:

/*
 * HDF5 filter callback for Zstandard compression.
 *
 * flags      - filter flags; H5Z_FLAG_REVERSE selects decompression (read
 *              path), otherwise the chunk is compressed (write path).
 * cd_nelmts  - number of entries in cd_values.
 * cd_values  - client data; cd_values[0], when present, is the compression
 *              level, otherwise ZSTD_CLEVEL_DEFAULT is used.
 * nbytes     - number of valid input bytes in *buf.
 * buf_size   - in/out: allocated size of *buf; updated to the size of the
 *              replacement buffer on success.
 * buf        - in/out: chunk buffer; on success the input buffer is freed
 *              and replaced by a newly allocated output buffer.
 *
 * Returns the number of valid bytes in *buf on success, or 0 on failure
 * (HDF5 treats a zero return from a filter callback as an error).
 *
 * NOTE(review): the buffers are exchanged with malloc()/free(), as in the
 * upstream HDF5Plugin-Zstandard sources; confirm this matches the allocator
 * used by the HDF5 build the plugin is loaded into.
 */
DLL_EXPORT size_t zstd_filter(unsigned int flags, size_t cd_nelmts,
const unsigned int cd_values[], size_t nbytes,
size_t *buf_size, void **buf)
{
    void *inbuf = *buf;        /* Input buffer handed over by the library */
    void *outbuf = NULL;       /* Newly allocated output buffer */
    size_t origSize = nbytes;  /* Number of valid bytes in the input buffer */
    size_t outSize;            /* Number of valid bytes in the output buffer */

    if (flags & H5Z_FLAG_REVERSE)
    {
        /* Read path: the frame header records the decompressed size. */
        unsigned long long contentSize = ZSTD_getFrameContentSize(inbuf, origSize);
        if (contentSize == ZSTD_CONTENTSIZE_ERROR ||
            contentSize == ZSTD_CONTENTSIZE_UNKNOWN ||
            (unsigned long long)(size_t)contentSize != contentSize)
            return 0;

        outSize = (size_t)contentSize;
        /* malloc(0) may return NULL; always request at least one byte. */
        outbuf = malloc(outSize > 0 ? outSize : 1);
        if (outbuf == NULL)
            return 0;

        outSize = ZSTD_decompress(outbuf, outSize, inbuf, origSize);
    }
    else
    {
        /* Write path: compress the chunk at the requested level. */
        int aggression;
        if (cd_nelmts > 0)
            aggression = (int)cd_values[0];
        else
            aggression = ZSTD_CLEVEL_DEFAULT;

        size_t compSize = ZSTD_compressBound(origSize);
        outbuf = malloc(compSize > 0 ? compSize : 1);
        if (outbuf == NULL)
            return 0;

        compSize = ZSTD_compress(outbuf, compSize, inbuf, origSize, aggression);
        printf(" aggression = %d, origSize =%zu, compSize = %zu \n", aggression, origSize, compSize);
        outSize = compSize;
    }

    /* ZSTD reports failures through the returned size value. */
    if (ZSTD_isError(outSize))
    {
        free(outbuf);
        return 0;
    }

    /* Hand the new buffer to the library and release the old one. */
    free(*buf);
    *buf = outbuf;
    *buf_size = outSize;

    return outSize;
}


#2

I think unless you create the dataset with H5Pset_fill_time(plist, H5D_FILL_TIME_NEVER), it’ll write fill values every time a chunk is allocated and might call your filter as a side effect? Just guessing…

G.


#3

Thanks @gheber for the suggestion.

After trying H5Pset_fill_time(plist, H5D_FILL_TIME_NEVER), the plugin still gets called twice.


#4

Not sure what you are doing, but here’s a little helper for diagnosing.

The filter does nothing (identity).

You can play with the (commented) combinations for allocation- and fill-time. For example:

  1. When using defaults, Compressing... will be printed twice, because chunk allocation is late (on write) and we are writing two chunks.
  2. If you use early allocation, Compressing... will be printed three times: the chunks are allocated with fill values at dataset creation, but the library needs only one blank reference chunk, which is compressed once and written twice; the two data chunks are then compressed when they are written.
#include "hdf5.h"

#include <stdio.h>
#include <stdlib.h>

/*
 * Identity (pass-through) HDF5 filter used only to trace filter invocations.
 *
 * It prints "Decompressing..." when H5Z_FLAG_REVERSE is set in flags (read
 * path) and "Compressing..." otherwise (write path). The chunk buffer *buf
 * and its allocated size *buf_size are left untouched.
 *
 * Returns nbytes, i.e. the number of valid bytes in the buffer is unchanged.
 */
size_t filter(unsigned int flags, size_t cd_nelmts,
              const unsigned int cd_values[], size_t nbytes, size_t *buf_size,
              void **buf) {
  /* The identity filter needs neither client data nor the buffer itself. */
  (void)cd_nelmts;
  (void)cd_values;
  (void)buf_size;
  (void)buf;

  if (flags & H5Z_FLAG_REVERSE) {
    // read data, e.g., decompress data
    // ...
    printf("Decompressing...\n");
  } else {
    // write data, e.g., compress data
    // ...
    printf("Compressing...\n");
  }

  return nbytes;
}

/*
 * Diagnostic driver for counting filter invocations.
 *
 * Registers the identity filter under ID 256, creates "filter.h5" with a
 * one-dimensional, extendible dataset "dset" of 2048 32-bit integers stored
 * in chunks of 1024 elements (so two chunks), and writes the whole dataset
 * with a single H5Dwrite call. The commented-out property-list calls can be
 * enabled to experiment with allocation time and fill time.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if any checked HDF5 call fails.
 * Resources are released in reverse order of acquisition via the fail_*
 * labels, each of which is named after the step that failed.
 */
int main(void)
{
  int retval = EXIT_SUCCESS;
  hid_t file, dspace, dcpl, dset;
  /* Zero-initialised so H5Dwrite never reads indeterminate values. */
  int data[2048] = {0};

  H5Z_class_t cls;
  cls.version = H5Z_CLASS_T_VERS;
  cls.id = 256;
  cls.encoder_present = 1;
  cls.decoder_present = 1;
  cls.name = "Identity filter";
  cls.can_apply = NULL;
  cls.set_local = NULL;
  cls.filter = &filter;

  // register the filter
  if (H5Zregister(&cls) < 0) {
    retval = EXIT_FAILURE;
    goto fail_register;
  }

  if ((file = H5Fcreate("filter.h5", H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT)) ==
      H5I_INVALID_HID) {
    retval = EXIT_FAILURE;
    goto fail_file;
  }

  // 2048 elements now, unlimited maximum extent
  if ((dspace = H5Screate_simple(1, (hsize_t[]){2048},
                                 (hsize_t[]){H5S_UNLIMITED})) ==
      H5I_INVALID_HID) {
    retval = EXIT_FAILURE;
    goto fail_dspace;
  }

  if ((dcpl = H5Pcreate(H5P_DATASET_CREATE)) == H5I_INVALID_HID) {
    retval = EXIT_FAILURE;
    goto fail_dcpl;
  }

  // dcpl already exists here, so a failure must still close it (fail_dset)
  if (H5Pset_filter(dcpl, cls.id, 0|H5Z_FLAG_MANDATORY, 0, NULL) < 0 ||
      //H5Pset_alloc_time(dcpl, H5D_ALLOC_TIME_EARLY) < 0 ||
      //H5Pset_fill_time(dcpl, H5D_FILL_TIME_NEVER) < 0 ||
      H5Pset_chunk(dcpl, 1, (hsize_t[]) {1024}) < 0) {
    retval = EXIT_FAILURE;
    goto fail_dset;
  }

  if ((dset = H5Dcreate(file, "dset", H5T_STD_I32LE, dspace, H5P_DEFAULT, dcpl,
                        H5P_DEFAULT)) == H5I_INVALID_HID) {
    retval = EXIT_FAILURE;
    goto fail_dset;
  }

  // one write covering both chunks
  if (H5Dwrite(dset, H5T_NATIVE_INT, H5S_ALL, H5S_ALL, H5P_DEFAULT, data) < 0) {
    retval = EXIT_FAILURE;
    goto fail_write;
  }

fail_write:
  H5Dclose(dset);

fail_dset:
  H5Pclose(dcpl);

fail_dcpl:
  H5Sclose(dspace);

fail_dspace:
  H5Fclose(file);

fail_file:
  // unregister the filter
  if (H5Zunregister(cls.id) < 0) {
    retval = EXIT_FAILURE;
  }

fail_register:
  return retval;
}

G.


#5

Thanks @gheber

I got a chance to talk with Jordan Henderson (thanks!). He pointed out that this is caused by HDF5's parallel I/O library, which enforces the fill. After switching to non-parallel I/O (i.e., opening/writing the file sequentially), it is fine now.

Bin