Beginner's question on unlimited datasets

I am struggling to create an unlimited one-dimensional dataset that I write in chunks.

The following Minimal Working Example works correctly for the initial chunk write, but fails for subsequent chunks with the following error.

HDF5-DIAG: Error detected in HDF5 (1.10.7) thread 1:
  #000: ../../../src/H5Dio.c line 319 in H5Dwrite(): could not get a validated dataspace from mem_space_id
major: Invalid arguments to routine
minor: Bad value
  #001: ../../../src/H5S.c line 257 in H5S_get_validated_dataspace(): selection + offset not within extent
major: Dataspace
minor: Out of range
terminate called after throwing an instance of 'H5::DataSetIException'
[1]    426631 IOT instruction (core dumped)  

I am sure whatever I am doing wrong is simple, but I can't for the life of me see it; any help / code corrections are gratefully received.

The code below compiles correctly on Ubuntu Linux with the Ubuntu-bundled HDF5 development libraries.

#include <iostream>
#include <string>
#include <vector>
#include "H5Cpp.h"

using namespace H5;
const H5std_string FILE_NAME("custom_data.h5");

// Define a struct to represent a data record
struct DataRecord {
    int id;
    double latitude;
    double longitude;
    double confidence;
    std::string csquare;
};

int main() {
    // Create a new HDF5 file
    std::string dataset_name = "my_dataset";
    hsize_t record_count = 100;

    H5File file(FILE_NAME, H5F_ACC_TRUNC);
    std::cout << " created hdf5" << std::endl;

    hsize_t chunk_dims[] = {record_count};

    hsize_t dims[] = {0};
    hsize_t max_dims[] = {H5S_UNLIMITED};
    DataSpace dataspace(1, dims, max_dims);

    std::cout << " created dataspace" << std::endl;

    

    std::cout << " created data" << std::endl;
    CompType datatype(sizeof(DataRecord));
    datatype.insertMember("id", HOFFSET(DataRecord, id), PredType::NATIVE_INT);
    datatype.insertMember("latitude", HOFFSET(DataRecord, latitude), PredType::NATIVE_DOUBLE);
    datatype.insertMember("longitude", HOFFSET(DataRecord, longitude), PredType::NATIVE_DOUBLE);
    datatype.insertMember("confidence", HOFFSET(DataRecord, confidence), PredType::NATIVE_DOUBLE);
    datatype.insertMember("csquare", HOFFSET(DataRecord, csquare), StrType(PredType::C_S1, H5T_VARIABLE));
    std::cout << " created datatype" << std::endl;
    
    DSetCreatPropList create_params;
    hsize_t chunk_size[] = {100};
    create_params.setChunk(1,chunk_size);

    // Create the dataset
    DataSet dataset = file.createDataSet(dataset_name, datatype, dataspace, create_params);

    std::cout << " created dataset" << std::endl;
    // Stream the records to the dataset
    
    int num_records {0};
    hsize_t current_size[] = {0};
    hsize_t offset[] = {0};
    int chunk_num = 0;
    std::vector<DataRecord> data(record_count);
    for (int i = 0; i < 1000; i++) {
        // Generate some fake data
        data[i%100].id = i;
        data[i%100].latitude = i +((double) 1/i);
        data[i%100].longitude = i + ((double) 1/i);
        data[i%100].confidence = 1/((double) i);
        data[i%100].csquare = "0123456789012345678";
 
        // Add the record to the buffer


        // Write the buffer to the dataset if it is full
        if (num_records == record_count) {

            current_size[0] = (chunk_num+1) * record_count; // make sure we allways have lots of space
            dataset.extend(current_size);
            std::cout << " extended ok "<<std::endl;
            // Select the portion of the file dataspace and memory dataspace
            // that correspond to the data buffer
            offset[0] = chunk_num * record_count;
            DataSpace memspace(1, &record_count);
            std::cout << " created memspace"<<std::endl;
            memspace.selectHyperslab(H5S_SELECT_SET, &record_count, offset);
            std::cout << "selected Hyperslab"<<std::endl;
            DataSpace filespace = dataset.getSpace();
            std::cout << "created filespace"<<std::endl;
            filespace.selectHyperslab(H5S_SELECT_SET, &record_count, offset);
            std::cout << "selected Hyperslab"<<std::endl;
            // Write the data buffer to the dataset
            dataset.write(data.data(), datatype, memspace, filespace);
            chunk_num++;
            num_records = 0;
            std::cout << "written "<<i<<" onto chunk "<< chunk_num<< std::endl;

            // lets wipe the vector
            data.clear();
            data.resize(record_count);
            
      } else {
        num_records++;
      }
    }

    dataset.close();
    file.close();
    
    return 0;
}

Hi @joh3,

Not sure how the library you use works but, in case you are not bound to requirements, you may want to check HDFql as it greatly simplifies how HDF5 is handled. Looking at the posted code, your use-case could be solved as follows in C++ with HDFql:

// include HDFql C++ header file
#include "HDFql.hpp"

// C and C++ standard library headers used by this example
#include <cstddef>   // offsetof
#include <cstring>   // strncpy
#include <sstream>   // std::stringstream
#include <vector>    // std::vector

// define structure
// define structure
//
// csquare must hold the 19-character code PLUS the terminating NUL that
// strcpy() appends, i.e. 20 bytes. The original 19-byte array was
// overflowed by one byte on every copy of "0123456789012345678".
struct DataRecord
{
    int id;
    double latitude;
    double longitude;
    double confidence;
    char csquare[20];
};

int main(int argc, char *argv[])
{
    // Buffer of 100 records, flushed into the dataset every 100 iterations.
    std::vector<DataRecord> data(100);
    std::stringstream script;
    int i;

    // create HDF5 file 'custom_data.h5' and use (i.e. open) it
    HDFql::execute("CREATE AND USE FILE custom_data.h5");

    // Build the statement that creates an extendible dataset 'my_dataset' of
    // unlimited size; offsetof() keeps the HDF5 member offsets in sync with
    // the struct layout.
    script << "CREATE DATASET my_dataset AS COMPOUND(id AS INT OFFSET " << offsetof(struct DataRecord, id);
    script << ", latitude AS DOUBLE OFFSET " << offsetof(struct DataRecord, latitude);
    script << ", longitude AS DOUBLE OFFSET " << offsetof(struct DataRecord, longitude);
    script << ", confidence AS DOUBLE OFFSET " << offsetof(struct DataRecord, confidence);
    // The original line had a stray ';' before '<< ")(UNLIMITED)"', which does
    // not compile — the trailing text belongs to the same streamed statement.
    script << ", csquare AS CHAR(19) OFFSET " << offsetof(struct DataRecord, csquare) << ")(UNLIMITED)";

    // execute script
    HDFql::execute(script);

    // register variable 'data' for subsequent usage (by HDFql)
    HDFql::variableRegister(data);

    // populate dataset 'my_dataset' with dummy data
    for(i = 0; i < 1000; i++)
    {
        // generate dummy data (NOTE: 1/0 yields +inf for i == 0)
        data[i % 100].id = i;
        data[i % 100].latitude = i + ((double) 1 / i);
        data[i % 100].longitude = i + ((double) 1 / i);
        data[i % 100].confidence = 1 / ((double) i);
        // Bounded copy: strncpy limited by the field size cannot overflow the
        // buffer, unlike the original strcpy of a 19-char literal (19 chars
        // + NUL = 20 bytes).
        strncpy(data[i % 100].csquare, "0123456789012345678", sizeof(data[i % 100].csquare));

        if ((i + 1) % 100 == 0)
        {
            // alter (i.e. extend) dataset 'my_dataset' by 100 additional positions (or rows)
            HDFql::execute("ALTER DIMENSION my_dataset TO +100");

            // write dummy data into dataset 'my_dataset' using an hyperslab
            HDFql::execute("INSERT INTO my_dataset(-100:::) VALUES FROM MEMORY 0");
        }
    }

    // unregister variable 'data' as it is no longer needed/used
    HDFql::variableUnregister(data);

    // close HDF5 file
    HDFql::execute("CLOSE FILE");

    return 0;
}

Hope it helps!

2 Likes

Many thanks, this looks Awesome, i’ll check it out in depth
Joe

1 Like