Problems writing data sets in parallel: slow or crashing

Hi,

I am attempting to write HDF5 files in parallel but I am having problems. Depending on the compiler and HDF5 version I get different results. In some cases the data is written out extremely slowly; in others an initial set of dumps is quick but the code hangs when attempting to write files at a later time. And in still other cases the code runs fine but the data that has been written is corrupted, i.e. some hyperslabs contain incomplete values or values from the wrong process.

I am not sure if I am doing something wrong or if it is a problem with the configuration of the software or the hardware. I am not the admin on the machine that I am trying to run the code on, so I can't comment too much on the details of the configuration, but I did try compiling my own version of the HDF5 libraries without seeing any improvement.

Below I have pasted the Fortran code of my routine for writing an array in parallel. A lot of it was copied from tutorials I found online, but I think I understand most of it. However, I am not sure about the specific settings that I am passing to MPI or the values of sieve_buf_size etc. Could these be the cause of the problem?

Any help would be appreciated, including any leads on where to start looking for the problem.

Thanks

Holger

   subroutine write_field(name, field)
     character(LEN=*), intent(in) :: name
     double precision, dimension(x_lo_g:x_hi_g, y_lo_g:y_hi_g, z_lo_g:z_hi_g), intent(in) :: field

     ! [xyz]_lo_g and [xyz]_hi_g are the index bounds of the local grid, including ghost cells
     ! [xyz]_lo and [xyz]_hi are the index bounds of the inner domain of the local grid, excluding ghost cells
     ! N[xyz] is the size of the global domain, excluding ghost cells

     integer(HID_T) :: plist_id
     integer(HID_T) :: dxpl_id
     integer(HID_T) :: file_id
     integer(HID_T) :: space_id
     integer(HID_T) :: dset_id
     integer(HID_T) :: file_dataspace
     integer(HID_T) :: mem_dataspace

     integer :: mpi_info

     integer :: hdferr, mpierr

     integer(HSIZE_T) :: locdims(3)
     integer(HSIZE_T) :: locdims_g(3)
     integer(HSIZE_T) :: dims(3)
     integer(HSIZE_T) :: start(3)
     integer(HSIZE_T) :: gstart(3)

     integer(SIZE_T), parameter :: sieve_buf_size = 262144
     integer(HSIZE_T), parameter :: align_threshold = 524288
     integer(HSIZE_T), parameter :: alignment = 262144

     ! call H5Pset_fapl_mpiposix_f(plist_id, MPI_COMM_WORLD, .false., hdferr);

     ! setup file access template
     call H5Pcreate_f(H5P_FILE_ACCESS_F, plist_id, hdferr)

     call H5Pset_sieve_buf_size_f(plist_id, sieve_buf_size, hdferr)
     call H5Pset_alignment_f(plist_id, align_threshold, alignment, hdferr)

     call MPI_Info_create(mpi_info, mpierr)

     call MPI_Info_set(mpi_info, "access_style", "write_once", mpierr)
     call MPI_Info_set(mpi_info, "collective_buffering", "true", mpierr)
     call MPI_Info_set(mpi_info, "cb_block_size", "1048576", mpierr)
     call MPI_Info_set(mpi_info, "cb_buffer_size", "4194304", mpierr)

     ! set Parallel access with communicator
     call H5Pset_fapl_mpio_f(plist_id, MPI_COMM_WORLD, mpi_info, hdferr);

     ! H5Pset_fapl_mpiposix(plist_id, mpiComm, 0);

     ! open the file collectively
     call H5Fcreate_f(name, H5F_ACC_TRUNC_F, file_id, hdferr, H5P_DEFAULT_F, plist_id);
     ! Release file-access template
     call H5Pclose_f(plist_id, hdferr);
     call H5Pcreate_f(H5P_DATASET_XFER_F, dxpl_id, hdferr)
     call H5Pset_dxpl_mpio_f(dxpl_id, H5FD_MPIO_COLLECTIVE_F, hdferr)

     locdims(1) = x_hi-x_lo+1
     locdims(2) = y_hi-y_lo+1
     locdims(3) = z_hi-z_lo+1

     locdims_g(1) = x_hi_g-x_lo_g+1
     locdims_g(2) = y_hi_g-y_lo_g+1
     locdims_g(3) = z_hi_g-z_lo_g+1

     ! The start is the inner region of the domain, excluding the ghost cells
     start(1) = x_lo
     start(2) = y_lo
     start(3) = z_lo

     gstart = ghost

     ! ************************************************
     ! Beginning of parallel HDF output

     dims(1) = Nx
     dims(2) = Ny
     dims(3) = Nz

     ! create a simple dataspace with global dimensions
     call H5Screate_simple_f(3, dims, space_id, hdferr)

     ! create a dataset linked to the file and associate it with the dataspace
     call H5Dcreate_f(file_id, "data", H5T_NATIVE_DOUBLE, space_id, dset_id, &
       hdferr)

     ! makes a copy of the dataspace contained in the file
     ! we need a copy because we are going to modify it by selecting a hyperslab
     call H5Dget_space_f(dset_id, file_dataspace, hdferr)

     ! select a hyperslab in the file dataspace. This is the region in the file
     ! that we will be writing into
     call H5Sselect_hyperslab_f(file_dataspace, H5S_SELECT_SET_F, &
                              start, locdims, hdferr)

     ! create a memory dataspace. This dataspace corresponds to the extent of
     ! the local array
     call H5Screate_simple_f(3, locdims_g, mem_dataspace, hdferr)

     ! select a hyperslab in the memory dataspace. This is the region
     ! excluding the ghost cells
     call H5Sselect_hyperslab_f(mem_dataspace, H5S_SELECT_SET_F, &
                              gstart, locdims, hdferr)

     ! hid_t mem_dataspace = H5Dget_space(dataset);

     ! write data (independently or collectively, depending on dxpl_id)
     call H5Dwrite_f(dset_id, &
                     H5T_NATIVE_DOUBLE, &
                     field, &
                     locdims, &
                     hdferr, &
                     mem_dataspace, &
                     file_dataspace, &
                     dxpl_id)

     ! release dataspace ID
     call H5Sclose_f(mem_dataspace, hdferr)
     call H5Sclose_f(file_dataspace, hdferr)

     call H5Dclose_f(dset_id, hdferr)
     call H5Fclose_f(file_id, hdferr)
   end subroutine write_field

I have inlined some comments on your code. The crashes and incorrect data are of course the first worry; then we can figure out where the performance concerns are.

What platform are you on? What bandwidth do you expect to see? (either because your administrator has told you, or because you ran a benchmark like IOR to measure)

Are you familiar with the Darshan tool? It's a way to record MPI-IO and POSIX I/O operations, but not by tracing every operation. Instead it collects aggregate statistics. So for example if your application is slow, one reason might be "a billion 4 byte writes". Darshan would capture that, and then we would know either we need to apply some kind of optimization, or that a requested optimization was not used for some reason.

   subroutine write_field(name, field)
     character(LEN=*), intent(in) :: name
     double precision, dimension(x_lo_g:x_hi_g, y_lo_g:y_hi_g, z_lo_g:z_hi_g), intent(in) :: field

     ! [xyz]_lo_g and [xyz]_hi_g are the index bounds of the local grid, including ghost cells
     ! [xyz]_lo and [xyz]_hi are the index bounds of the inner domain of the local grid, excluding ghost cells
     ! N[xyz] is the size of the global domain, excluding ghost cells

     integer(HID_T) :: plist_id
     integer(HID_T) :: dxpl_id
     integer(HID_T) :: file_id
     integer(HID_T) :: space_id
     integer(HID_T) :: dset_id
     integer(HID_T) :: file_dataspace
     integer(HID_T) :: mem_dataspace

     integer :: mpi_info

     integer :: hdferr, mpierr

     integer(HSIZE_T) :: locdims(3)
     integer(HSIZE_T) :: locdims_g(3)
     integer(HSIZE_T) :: dims(3)
     integer(HSIZE_T) :: start(3)
     integer(HSIZE_T) :: gstart(3)

     integer(SIZE_T), parameter :: sieve_buf_size = 262144
     integer(HSIZE_T), parameter :: align_threshold = 524288
     integer(HSIZE_T), parameter :: alignment = 262144

     ! call H5Pset_fapl_mpiposix_f(plist_id, MPI_COMM_WORLD, .false., hdferr);

     ! setup file access template
     call H5Pcreate_f(H5P_FILE_ACCESS_F, plist_id, hdferr)

     call H5Pset_sieve_buf_size_f(plist_id, sieve_buf_size, hdferr)
     call H5Pset_alignment_f(plist_id, align_threshold, alignment, hdferr)

These are often helpful, but I would not jump in with them right away; get the crashes and correctness fixed first.
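
To make that concrete, here is a minimal sketch of a file-access setup with none of that tuning, just the MPI-IO driver and the MPI-IO defaults. It assumes the same plist_id / hdferr variables and the usual "use hdf5" / "use mpi" statements from your routine:

     ! Minimal sketch (not a drop-in fix): enable the MPI-IO driver with no
     ! extra tuning. MPI_INFO_NULL means "take the MPI-IO defaults"; the sieve
     ! buffer and alignment settings can be layered back on later, once the
     ! crashes and corruption are sorted out.
     call H5Pcreate_f(H5P_FILE_ACCESS_F, plist_id, hdferr)
     call H5Pset_fapl_mpio_f(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL, hdferr)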

     call MPI_Info_create(mpi_info, mpierr)

     call MPI_Info_set(mpi_info, "access_style", "write_once", mpierr)
     call MPI_Info_set(mpi_info, "collective_buffering", "true", mpierr)
     call MPI_Info_set(mpi_info, "cb_block_size", "1048576", mpierr)
     call MPI_Info_set(mpi_info, "cb_buffer_size", "4194304", mpierr)

Likewise, the default MPI-IO hints are probably going to be OK, and you would override them once tuning and profiling have suggested otherwise. For example, your cb_buffer_size is really tiny on modern systems.
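
If profiling later does point at collective buffering, the hints can be enlarged. The sketch below only illustrates the mechanism; the sizes in it are placeholders I made up, not recommendations (good values depend on the file system, so ask your admins or benchmark):

     ! Hypothetical tuning sketch: larger collective-buffering hints.
     ! 16 MiB buffer / 4 MiB block are placeholder values for illustration only.
     call MPI_Info_create(mpi_info, mpierr)
     call MPI_Info_set(mpi_info, "cb_buffer_size", "16777216", mpierr)
     call MPI_Info_set(mpi_info, "cb_block_size", "4194304", mpierr)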

     ! set Parallel access with communicator
     call H5Pset_fapl_mpio_f(plist_id, MPI_COMM_WORLD, mpi_info, hdferr);

     ! H5Pset_fapl_mpiposix(plist_id, mpiComm, 0);

     ! open the file collectively
     call H5Fcreate_f(name, H5F_ACC_TRUNC_F, file_id, hdferr, H5P_DEFAULT_F, plist_id);
     ! Release file-access template
     call H5Pclose_f(plist_id, hdferr);
     call H5Pcreate_f(H5P_DATASET_XFER_F, dxpl_id, hdferr)
     call H5Pset_dxpl_mpio_f(dxpl_id, H5FD_MPIO_COLLECTIVE_F, hdferr)

Good, you've opened the file collectively and set up the subsequent transfers to also use MPI-IO collectives.
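
One thing worth double-checking, given the hangs you describe: with a collective transfer property list, every rank in the communicator has to make the H5Dwrite_f call, even a rank that has nothing to write. A rough sketch of how an "empty" participant might look, reusing your variable names:

     ! Sketch: a rank with no data still takes part in the collective write.
     ! It selects nothing in both dataspaces and calls H5Dwrite_f anyway;
     ! if some ranks skip the call, the others can hang in the collective.
     call H5Sselect_none_f(file_dataspace, hdferr)
     call H5Sselect_none_f(mem_dataspace, hdferr)
     call H5Dwrite_f(dset_id, H5T_NATIVE_DOUBLE, field, locdims, hdferr, &
                     mem_dataspace, file_dataspace, dxpl_id)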

     locdims(1) = x_hi-x_lo+1
     locdims(2) = y_hi-y_lo+1
     locdims(3) = z_hi-z_lo+1

     locdims_g(1) = x_hi_g-x_lo_g+1
     locdims_g(2) = y_hi_g-y_lo_g+1
     locdims_g(3) = z_hi_g-z_lo_g+1

How big are these arrays, typically?

     ! The start is the inner region of the domain, excluding the ghost cells
     start(1) = x_lo
     start(2) = y_lo
     start(3) = z_lo

     gstart = ghost

     ! ************************************************
     ! Beginning of parallel HDF output

     dims(1) = Nx
     dims(2) = Ny
     dims(3) = Nz

     ! create a simple dataspace with global dimensions
     call H5Screate_simple_f(3, dims, space_id, hdferr)

     ! create a dataset linked to the file and associate it with the dataspace
     call H5Dcreate_f(file_id, "data", H5T_NATIVE_DOUBLE, space_id, dset_id, &
       hdferr)

     ! makes a copy of the dataspace contained in the file
     ! we need a copy because we are going to modify it by selecting a hyperslab
     call H5Dget_space_f(dset_id, file_dataspace, hdferr)

     ! select a hyperslab in the file dataspace. This is the region in the file
     ! that we will be writing into
     call H5Sselect_hyperslab_f(file_dataspace, H5S_SELECT_SET_F, &
                              start, locdims, hdferr)

     ! create a memory dataspace. This dataspace corresponds to the extent of
     ! the local array
     call H5Screate_simple_f(3, locdims_g, mem_dataspace, hdferr)

     ! select a hyperslab in the memory dataspace. This is the region
     ! excluding the ghost cells
     call H5Sselect_hyperslab_f(mem_dataspace, H5S_SELECT_SET_F, &
                              gstart, locdims, hdferr)

     ! hid_t mem_dataspace = H5Dget_space(dataset);

     ! write data (independently or collectively, depending on dxpl_id)
     call H5Dwrite_f(dset_id, &
                     H5T_NATIVE_DOUBLE, &
                     field, &
                     locdims, &
                     hdferr, &
                     mem_dataspace, &
                     file_dataspace, &
                     dxpl_id)

     ! release dataspace ID
     call H5Sclose_f(mem_dataspace, hdferr)
     call H5Sclose_f(file_dataspace, hdferr)

     call H5Dclose_f(dset_id, hdferr)
     call H5Fclose_f(file_id, hdferr)
   end subroutine write_field

This all looks pretty standard.
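
One more diagnostic that might help, if your HDF5 build is new enough to provide h5pget_mpio_actual_io_mode_f (it showed up around 1.8.10; I am quoting the routine and constant names from memory, so double-check them against your build): after the write you can ask the transfer property list what kind of I/O was actually performed, which tells you whether the collective request silently fell back to independent I/O. A rough sketch, reusing the dxpl_id from your routine:

     ! Sketch: query the actual I/O mode after H5Dwrite_f. Declare
     ! actual_io_mode with the other locals; H5D_MPIO_NO_COLLECTIVE_F means
     ! the library did independent I/O despite the collective request.
     integer :: actual_io_mode
     call H5Pget_mpio_actual_io_mode_f(dxpl_id, actual_io_mode, hdferr)
     if (actual_io_mode == H5D_MPIO_NO_COLLECTIVE_F) then
       print *, "collective I/O was not used for this write"
     end if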

==rob

--
Rob Latham
Mathematics and Computer Science Division
Argonne National Lab, IL USA