Error reading variable length UTF8 string using H5LT C++ API

I am unable to read the following string using the H5LTread_dataset_string API.
The code below works fine for fixed-length strings; variable length seems to be the issue.
Is this expected behavior? Any suggestions on how to read this using C/C++?

HDF5 "test.h5" {
DATASET "/name" {
   DATATYPE H5T_STRING {
      STRSIZE H5T_VARIABLE;
      STRPAD H5T_STR_NULLTERM;
      CSET H5T_CSET_UTF8;
      CTYPE H5T_C_S1;
   }
   DATASPACE SCALAR
   DATA {
   (0): "0.1.0"
   }
}
}

Python3 code for generating h5 file

import h5py

# Create test.h5 containing a one-element dataset "name" whose element type is
# a variable-length Python str, and store the version string in it.
# The context manager closes the file even if dataset creation fails.
with h5py.File("test.h5", "w") as f:
    dt = h5py.special_dtype(vlen=str)
    dset = f.create_dataset("name", (1,), dtype=dt)
    dset[...] = "0.1.0"

This is my C code for reading h5 file:
#include “hdf5.h”
#include “hdf5_hl.h”
int main( void ) {
hid_t file_id;
char version[1024];
file_id = H5Fopen(“test.h5”,H5F_ACC_RDONLY,H5P_DEFAULT);
herr_t status = H5LTread_dataset_string(file_id, “/name”, version);
printf(“version: %s\n”, version);
H5Fclose (file_id);
return 0;
}

Output:
version: ???

Hello!
This is a known bug with H5LTread_dataset_string. (For your reference, the issue is HDFFV-10310, though our bug database is not open to the public.)

On this page, the very last example shows how to create a variable length string with the HDF5 library:
Examples by API

I attached the example to this message:
h5ex_t_vlstring.c (3.7 KB)

(It creates an array of strings, though)

For more information on fixed length and variable length strings, see Chapter 6, section 6.1 in the User’s Guide:

HDF5 User's Guide

-Barbara

1 Like

Thanks for confirming that this is a known bug. This narrows down my problem a lot.

h5ex_t_vlstring.c is one of the things I’ve tried before posting this message. This program writes and reads the following h5 file:
HDF5 "h5ex_t_vlstring.h5" {
DATASET "DS1" {
   DATATYPE H5T_STRING {
      STRSIZE H5T_VARIABLE;
      STRPAD H5T_STR_SPACEPAD;
      CSET H5T_CSET_ASCII;
      CTYPE H5T_C_S1;
   }
   DATASPACE SIMPLE { ( 4 ) / ( 4 ) }
   DATA {
   (0): "Parting", "is such", "sweet", "sorrow."
   }
}
}

The difference seems to be the STRPAD property: H5T_STR_SPACEPAD vs H5T_STR_NULLTERM.

I’m working with a facility that writes out HDF5 files containing variable-length strings with H5T_STR_NULLTERM padding (the default behavior of h5py, I believe).

Is there a C++ API for reading a variable-length string with H5T_STR_NULLTERM padding?

You don’t have to do anything special to read strings with different string pads.

The h5ex_t_vlstring.c example specifically created a Fortran string. See line 43:
filetype = H5Tcopy (H5T_FORTRAN_S1);

If you change that to a C string, the string will be null terminated.
filetype = H5Tcopy (H5T_C_S1);

See:
h5strnullt.c (3.7 KB)

Also see the H5Tset_strpad API (C) or H5::StrType::setStrpad (C++).

1 Like

Thanks Barbara. I’m including code below for reading 3 different types of strings using C++ API for completeness.

Here’s my python code for generating 3 test hdf5 files

  1. variable length UTF8 string
  2. variable length ASCII string
  3. fixed length ASCII string:

import h5py
import numpy as np

# 1. utf8.h5: one-element dataset "name" of variable-length str.
with h5py.File("utf8.h5", "w") as f:
    dt = h5py.special_dtype(vlen=str)  # unicode
    dset = f.create_dataset("name", (1,), dtype=dt)
    dset[...] = "0.1.0"

# 2. ascii.h5: one-element dataset "name" of variable-length bytes.
with h5py.File("ascii.h5", "w") as f:
    dt = h5py.special_dtype(vlen=bytes)
    dset = f.create_dataset("name", (1,), dtype=dt)
    dset[...] = "0.1.0"

# 3. ascii_fix.h5: one-element dataset "name" of fixed-length 5-byte strings.
with h5py.File("ascii_fix.h5", "w") as f:
    dset = f.create_dataset("name", (1,), dtype="S5")
    # np.bytes_ is the same type that np.string_ used to alias; the
    # np.string_ spelling was removed in NumPy 2.0.
    dset[0] = np.bytes_("0.1.0")

Output hdf5 files look like this:

HDF5 "utf8.h5" {
DATASET "name" {
   DATATYPE H5T_STRING {
      STRSIZE H5T_VARIABLE;
      STRPAD H5T_STR_NULLTERM;
      CSET H5T_CSET_UTF8;
      CTYPE H5T_C_S1;
   }
   DATASPACE SIMPLE { ( 1 ) / ( 1 ) }
   DATA {
   (0): "0.1.0"
   }
}
}

HDF5 "ascii.h5" {
DATASET "name" {
   DATATYPE H5T_STRING {
      STRSIZE H5T_VARIABLE;
      STRPAD H5T_STR_NULLTERM;
      CSET H5T_CSET_ASCII;
      CTYPE H5T_C_S1;
   }
   DATASPACE SIMPLE { ( 1 ) / ( 1 ) }
   DATA {
   (0): "0.1.0"
   }
}
}

HDF5 "ascii_fix.h5" {
DATASET "name" {
   DATATYPE H5T_STRING {
      STRSIZE 5;
      STRPAD H5T_STR_NULLPAD;
      CSET H5T_CSET_ASCII;
      CTYPE H5T_C_S1;
   }
   DATASPACE SIMPLE { ( 1 ) / ( 1 ) }
   DATA {
   (0): "0.1.0"
   }
}
}

Here’s the C++ code for reading these files:

#include “hdf5.h”
int main( void )
{
hid_t file_id;
file_id = H5Fopen(“utf8.h5”,H5F_ACC_RDONLY,H5P_DEFAULT);
hid_t dset = H5Dopen(file_id, “name”, H5P_DEFAULT);
hid_t filetype = H5Dget_type(dset);
hid_t space = H5Dget_space(dset);
hsize_t dims[1] = {1};
int ndims = H5Sget_simple_extent_dims(space, dims, NULL);
char rdata = (char) malloc(dims[0]sizeof(char));
hid_t memtype = H5Tcopy(H5T_C_S1);
herr_t status = H5Tset_size(memtype, H5T_VARIABLE);
status = H5Tset_cset(memtype, H5T_CSET_UTF8); // Specify UTF8 here
status = H5Dread(dset, memtype, H5S_ALL, H5S_ALL, H5P_DEFAULT, rdata);
printf(“version: %s\n”, rdata[0]);
H5Fclose (file_id);
return 0;
}

#include “hdf5.h”
int main( void )
{
// Open PAL HDF5 file and read in header information
hid_t file_id;
file_id = H5Fopen(“ascii.h5”,H5F_ACC_RDONLY,H5P_DEFAULT);
if(file_id < 0){
exit(1);
}
hid_t dset = H5Dopen(file_id, “name”, H5P_DEFAULT);
hid_t filetype = H5Dget_type(dset);
hid_t space = H5Dget_space(dset);
hsize_t dims[1] = {1};
int ndims = H5Sget_simple_extent_dims(space, dims, NULL);
char rdata = (char) malloc(dims[0]sizeof(char));
hid_t memtype = H5Tcopy(H5T_C_S1);
herr_t status = H5Tset_size(memtype, H5T_VARIABLE);
status = H5Dread(dset, memtype, H5S_ALL, H5S_ALL, H5P_DEFAULT, rdata);
printf(“version: %s\n”, rdata[0]);
H5Fclose (file_id);
return 0;
}

#include “hdf5.h”
#include “hdf5_hl.h”
int main( void )
{
hid_t file_id;
char version[1024];
file_id = H5Fopen(“ascii_fix.h5”,H5F_ACC_RDONLY,H5P_DEFAULT);
herr_t status = H5LTread_dataset_string(file_id, “/name”, version);
printf(“version: %s\n”, version);
H5Fclose (file_id);
return 0;
}