Extendable Datasets

Hi all! I’m having trouble extending a 1D dataset using the C++ API. I have some code based on the demo here, but it is producing unexpected results. Here is my code:

#include <H5Cpp.h>
#include <hdf5.h>

using namespace H5;    
int main(int argc, char* argv[]) {
  const int DIM0 = 4;
  const int EDIM0 = 6;   /* NOTE(review): here EDIM0 is the INCREMENT; in the original demo it is the TOTAL extended size */
  const int CHUNK0 = 1;
  const std::string FILE = "/tmp/test.h5";
  const std::string DATASET = "DS1";

  hid_t file, space, dset, dcpl; /* Handles */
  herr_t status;
  hsize_t dims[1] = {DIM0};
  hsize_t extdims[1] = {EDIM0 + DIM0};
  hsize_t maxdims[1];
  hsize_t chunk[1] = {CHUNK0};
  hsize_t start[1];
  hsize_t count[1];
  int wdata[DIM0];
  int wdata2[EDIM0];     /* NOTE(review): only 6 elements — too small for the H5S_ALL memory-space write below */

  /*
   * Fill Dataset
   */
  for (int i = 0; i < DIM0; i++)
    wdata[i] = i + 1;

  file = H5Fcreate(FILE.c_str(), H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);

  maxdims[0] = H5S_UNLIMITED;
  space = H5Screate_simple(1, dims, maxdims);
  dcpl = H5Pcreate(H5P_DATASET_CREATE);
  status = H5Pset_chunk(dcpl, 1, chunk);   /* chunking is required for an extendable dataset */

  dset = H5Dcreate(file, DATASET.c_str(), H5T_STD_I32LE, space, H5P_DEFAULT, dcpl, H5P_DEFAULT);

  status = H5Dwrite(dset, H5T_NATIVE_INT, H5S_ALL, H5S_ALL, H5P_DEFAULT, &wdata[0]);

  /*
   * Extend the dataset.
   */
  status = H5Dset_extent(dset, extdims);
  /* NOTE(review): the old `space` handle is overwritten here without an
   * H5Sclose(space) first — a dataspace handle leak (see reply below, item 3).
   * `dcpl` is likewise never closed. */
  space = H5Dget_space(dset);
  for (int i = 0; i < EDIM0; i++)
    wdata2[i] = 100 - (i + 1);

  status = H5Sselect_all(space);

  /*
   * Subtract a hyperslab reflecting the original dimensions from the
   * selection.  The selection now contains only the newly extended
   * portions of the dataset.
   */
  /* NOTE(review): H5S_SELECT_SET *replaces* the all-selection rather than
   * subtracting from it (the demo uses H5S_SELECT_NOTB).  The file selection
   * becomes elements 4..9; and because the memory space below is H5S_ALL, the
   * memory buffer is assumed to have the file dataspace's full 10-element
   * extent with the same selection — so wdata2[4..9] is read, running past
   * the end of the 6-element array.  That matches the observed garbage. */
  start[0] = DIM0;
  count[0] = EDIM0;
  status = H5Sselect_hyperslab(space, H5S_SELECT_SET, start, NULL, count, NULL);

  status = H5Dwrite(dset, H5T_NATIVE_INT, H5S_ALL, space, H5P_DEFAULT, &wdata2[0]);

  status = H5Dclose(dset);
  status = H5Sclose(space);
  status = H5Fclose(file);
}

Now what I’m expecting is a dataset called “DS1” with the following data:
[1,2,3,4,99,98,97,96,95,94]
where the [1,2,3,4] is the ‘original’ data and the [99,98,97,96,95,94] is the “extension”. But what I’m getting is:
[1,2,3,4,95,94,79149008,0,4,0].
It’s as if when I extend the dataset, it starts in position 0, but only starts writing once it gets to the empty part of the array and then continues to write garbage memory from beyond the array bounds until it gets to the end of the dataset extension. I’m pretty sure I’m making a mistake with H5Sselect_hyperslab, but I’m not sure what.

Thanks!

Probably you want to roll your own, which is a good thing: it engages the mind; but in case you are looking for a performant solution:

#include <iostream>
#include <vector>
#include "struct.h"
#include <h5cpp/core>
	// generated file must be sandwiched between core and io 
	// to satisfy template dependencies in <h5cpp/io>  
	#include "generated.h"
#include <h5cpp/io>


int main(){
	h5::fd_t fd = h5::create("test.h5", H5F_ACC_TRUNC);
	{ // this is to create the dataset
		h5::ds_t ds = h5::create<sn::record_t>(fd, "/path/dataset", h5::max_dims{H5S_UNLIMITED} );
		// vector of strings as attribute:
		ds["attribute"] = {"first","second","...","last"};
		
		h5::pt_t pt = ds; // convert to packet table, you could go straight from vector as well
		for(int i=0; i<3; i++)
			h5::append(pt,
			// this is your pod struct 
			sn::record_t{1.0 * i, 2.0 *i ,{1,2,3,4,5},{11,12,13,14,15}});
	}

	{ // read entire dataset back
		h5::ds_t ds = h5::open(fd, "/path/dataset");

		std::vector<std::string> attribute = h5::aread<
			std::vector<std::string>>(ds, "attribute");
		std::cout << attribute <<std::endl;
		// dump data
		for( auto rec: h5::read<std::vector<sn::record_t>>(ds, "/path/dataset")) // this is your HPC loop
			std::cerr << rec.A <<" ";
		std::cerr << std::endl;
	}
}

Similar questions have been answered before: stream of compound datatype, another (earlier) …, stream of integrals with clang 11, efficient CSV to HDF5, packet table with long double

#ifndef  H5TEST_STRUCT_01 
#define  H5TEST_STRUCT_01

namespace sn {
	// Plain-old-data record stored as one HDF5 COMPOUND element.
	// Field names, order, and types must stay in sync with the generated
	// register_struct<sn::record_t>() shim, which uses HOFFSET on each member.
	struct record_t {     // POD struct with nested namespace
		double A;
		double B;
		double array_00[5];   // mapped to a 1-D HDF5 ARRAY type of 5 doubles
		double array_01[5];   // mapped to a 1-D HDF5 ARRAY type of 5 doubles
	};
}
#endif

and the generated H5CPP shim:

#ifndef H5CPP_GUARD_NKohX
#define H5CPP_GUARD_NKohX

namespace h5{
    //template specialization of sn::record_t to create HDF5 COMPOUND type
    template<> hid_t inline register_struct<sn::record_t>(){
        // Fixed-size double[5] members become HDF5 ARRAY datatypes.
        hsize_t at_00_[] ={5};            hid_t at_00 = H5Tarray_create(H5T_NATIVE_DOUBLE,1,at_00_);
        hsize_t at_01_[] ={5};            hid_t at_01 = H5Tarray_create(H5T_NATIVE_DOUBLE,1,at_01_);

        // Compound layout mirrors sn::record_t exactly: one insert per member,
        // with HOFFSET keeping the byte offsets in sync with the struct.
        hid_t ct_00 = H5Tcreate(H5T_COMPOUND, sizeof (sn::record_t));
        H5Tinsert(ct_00, "A",	HOFFSET(sn::record_t,A),H5T_NATIVE_DOUBLE);
        H5Tinsert(ct_00, "B",	HOFFSET(sn::record_t,B),H5T_NATIVE_DOUBLE);
        H5Tinsert(ct_00, "array_00",	HOFFSET(sn::record_t,array_00),at_00);
        H5Tinsert(ct_00, "array_01",	HOFFSET(sn::record_t,array_01),at_01);

        //closing all hid_t allocations to prevent resource leakage
        H5Tclose(at_00); H5Tclose(at_01); 

        //if not used with h5cpp framework, but as a standalone code generator then
        //the returned 'hid_t ct_00' must be closed: H5Tclose(ct_00);
        return ct_00;
    };
}
H5CPP_REGISTER_STRUCT(sn::record_t);

#endif

best wishes: steven@h5cpp

Hi @timo.stoffregen,

In case you are not bound to requirements and are open to experiment another paradigm, you may want to try out HDFql, a declarative programming language, which considerably alleviates users from HDF5 low-level details. Your use-case could be solved as follows in C++ using HDFql:

// HDFql-based equivalent: each HDFql::execute() call runs one declarative
// operation against the HDF5 file, hiding the dataspace/hyperslab plumbing.
// declare variables
int wdata[4];
int wdata2[6];
int i;

// create an HDF5 file named '/tmp/test.h5' and use (i.e. open) it
HDFql::execute("CREATE AND USE FILE /tmp/test.h5");

// create an extendible dataset named 'DS1' as a one dimensional array of type integer (with an initial size equal to 4)
HDFql::execute("CREATE DATASET DS1 AS INT(4 TO UNLIMITED)");

// populate variable 'wdata'
for(i = 0; i < 4; i++)
{
    wdata[i] = i + 1;
}

// register variable 'wdata' for subsequent use (by HDFql)
// (transient registration — referenced as memory number 0 below)
HDFql::variableTransientRegister(&wdata);

// insert (i.e. write) values stored in 'wdata' into the first 4 positions of dataset 'DS1'
HDFql::execute("INSERT INTO DS1 VALUES FROM MEMORY 0");

// alter (i.e. change) dimension of dataset 'DS1' to +6 (i.e. size equal to 10)
HDFql::execute("ALTER DIMENSION DS1 TO +6");

// populate variable 'wdata2'
for(i = 0; i < 6; i++)
{
    wdata2[i] = 100 - (i + 1);
}

// register variable 'wdata2' for subsequent use (by HDFql)
HDFql::variableTransientRegister(&wdata2);

// insert (i.e. write) values stored in 'wdata2' into the 6 subsequent positions of dataset 'DS1' using a hyperslab
// (4::: selects from offset 4 onward)
HDFql::execute("INSERT INTO DS1(4:::) VALUES FROM MEMORY 0");

Hope it helps!

There are several issues with your version:

  1. Compared to the example, you changed the semantics of EDIM0, and that’s where the trouble essentially comes from.
  2. Your wdata2 array is too small to use the masking trick.
  3. Before calling space = H5Dget_space(dset); you should H5Sclose(space);, which otherwise is a handle leak.
  4. In H5Sselect_hyperslab, it must be H5S_SELECT_NOTB as in the original example.
#include "hdf5.h"

/* Create a chunked, extendable 1-D dataset, write the initial values, extend
 * it, and write the appended values using the selection-subtraction trick
 * from the original demo (H5S_SELECT_NOTB).  Produces DS1 =
 * [1,2,3,4,99,98,97,96,95,94] in /tmp/test.h5. */
int main(int argc, char* argv[]) {
  const int DIM0 = 4;    /* initial number of elements */
  const int EDIM0 = 6;   /* number of elements appended by the extension */
  const int CHUNK0 = 1;  /* chunking is required for an extendable dataset */
  const char FILE[] = "/tmp/test.h5";
  const char DATASET[] = "DS1";

  hid_t file, space, dset, dcpl; /* Handles */
  herr_t status;
  hsize_t dims[1] = {DIM0};            /* initial extent */
  hsize_t extdims[1] = {EDIM0 + DIM0}; /* extent after H5Dset_extent */
  hsize_t maxdims[1];
  hsize_t chunk[1] = {CHUNK0};
  hsize_t start[1];
  hsize_t count[1];
  int wdata[DIM0];
  int wdata2[EDIM0 + DIM0]; /* full-size buffer: the H5S_ALL memory space
                             * below spans the whole extended extent */

  /*
   * Fill Dataset
   */
  for (int i = 0; i < DIM0; i++)
    wdata[i] = i + 1;

  file = H5Fcreate(FILE, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);

  maxdims[0] = H5S_UNLIMITED; /* unlimited first dimension so the dataset can grow */
  space = H5Screate_simple(1, dims, maxdims);
  dcpl = H5Pcreate(H5P_DATASET_CREATE);
  status = H5Pset_chunk(dcpl, 1, chunk);

  dset = H5Dcreate(file, DATASET, H5T_STD_I32LE, space, H5P_DEFAULT, dcpl, H5P_DEFAULT);

  /* The dataspace and the creation property list are no longer needed once
   * the dataset exists; close both to avoid handle leaks. */
  H5Sclose(space);
  H5Pclose(dcpl);

  status = H5Dwrite(dset, H5T_NATIVE_INT, H5S_ALL, H5S_ALL, H5P_DEFAULT, &wdata[0]);

  /*
   * Extend the dataset.
   */
  status = H5Dset_extent(dset, extdims);
  space = H5Dget_space(dset); /* re-acquire: the extent has changed */
  for (int i = 0; i < EDIM0; i++)
    wdata2[i + DIM0] = 100 - (i + 1); /* fill only the newly added tail */

  status = H5Sselect_all(space);

  /*
   * Subtract a hyperslab reflecting the original dimensions from the
   * selection.  The selection now contains only the newly extended
   * portions of the dataset.
   */
  start[0] = 0;
  count[0] = DIM0;
  status = H5Sselect_hyperslab(space, H5S_SELECT_NOTB, start, NULL, count, NULL);

  /* With mem space H5S_ALL, the memory buffer is assumed to match the file
   * dataspace's full extent and selection — hence the full-size wdata2. */
  status = H5Dwrite(dset, H5T_NATIVE_INT, H5S_ALL, space, H5P_DEFAULT, &wdata2[0]);

  status = H5Dclose(dset);
  status = H5Sclose(space);
  status = H5Fclose(file);
}

OK?
G.

For completeness: Your example does NOT use any C++ HDF5 API, so the include and using are superfluous.

Thanks to @steven.varga and @contact for their genuine C++ submissions.

G.

2 Likes

If you want wdata2 just to be the size of the increment, this might work for you:

#include <hdf5.h>

/* Variant where wdata2 is only increment-sized: an explicit memory dataspace
 * (mspace) of EDIM0 elements is paired with a hyperslab selection on the
 * file dataspace (fspace) covering the newly extended region. */
int main(int argc, char* argv[]) {
  const int DIM0 = 4;    /* initial number of elements */
  const int EDIM0 = 6;   /* number of elements appended by the extension */
  const int CHUNK0 = 1;  /* chunking is required for an extendable dataset */
  const char FILE[] = "/tmp/test.h5";
  const char DATASET[] = "DS1";

  hid_t file, fspace, mspace, dset, dcpl; /* Handles */
  herr_t status;
  hsize_t dims[1] = {DIM0};
  hsize_t extdims[1] = {EDIM0 + DIM0};
  hsize_t maxdims[1];
  hsize_t chunk[1] = {CHUNK0};
  hsize_t mdims[1] = {EDIM0}; /* memory dataspace extent; a named array keeps
                               * this valid C++ too (compound literals are C99) */
  hsize_t start[1];
  hsize_t count[1];
  int wdata[DIM0];
  int wdata2[EDIM0]; /* increment-sized, thanks to the explicit memory space */

  /*
   * Fill Dataset
   */
  for (int i = 0; i < DIM0; i++)
    wdata[i] = i + 1;

  file = H5Fcreate(FILE, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);

  maxdims[0] = H5S_UNLIMITED;
  fspace = H5Screate_simple(1, dims, maxdims);
  dcpl = H5Pcreate(H5P_DATASET_CREATE);
  status = H5Pset_chunk(dcpl, 1, chunk);

  dset = H5Dcreate(file, DATASET, H5T_STD_I32LE, fspace, H5P_DEFAULT, dcpl, H5P_DEFAULT);

  /* Close handles that are no longer needed to avoid leaks. */
  H5Sclose(fspace);
  H5Pclose(dcpl);

  status = H5Dwrite(dset, H5T_NATIVE_INT, H5S_ALL, H5S_ALL, H5P_DEFAULT, wdata);

  /*
   * Extend the dataset.
   */
  status = H5Dset_extent(dset, extdims);
  fspace = H5Dget_space(dset); /* re-acquire: the extent has changed */
  for (int i = 0; i < EDIM0; i++)
    wdata2[i] = 100 - (i + 1);

  /* File selection: the EDIM0 newly added elements, starting at DIM0. */
  start[0] = DIM0;
  count[0] = EDIM0;
  status = H5Sselect_hyperslab(fspace, H5S_SELECT_SET, start, NULL, count, NULL);

  /* Memory selection: all EDIM0 elements of wdata2. */
  mspace = H5Screate_simple(1, mdims, NULL);
  H5Sselect_all(mspace);

  status = H5Dwrite(dset, H5T_NATIVE_INT, mspace, fspace, H5P_DEFAULT, wdata2);

  status = H5Sclose(mspace);
  status = H5Dclose(dset);
  status = H5Sclose(fspace);
  status = H5Fclose(file);
}

Notice that H5S_ALL will not work for the second H5Dwrite.

G.

1 Like

Hi! I have used hdfql in the past, however I can’t have this additional dependency in this situation. It is a helpful library though!

1 Like

This solves my issue. Thank you very much for taking the time to answer my question so thoroughly. Why does H5S_ALL not work? I can verify that it doesn’t, but there are no clues in the documentation as to why.