Parallel writing compressed data stuck

inchinp · August 31, 2022, 6:59pm

Good day,

I am trying to write a compressed data in parallel, each core to a separate dataset. I tested it on 360 cores and it works fine. Now I move to higher number of cores and it just stucks during writing.

I think it is some problem with parallel writing simultaneously to the file, but I can’t realize where the problem is, having in mind that at comparatively small amount of cores it works fine. I use HDF5 1.12 or 1.10.

Thank you for any advice,

!     =====================================================
      subroutine out2(meqn,mbc,mx,my,&
     &     xlower,ylower,dx,dy,q,t,iframe)
!     =====================================================

     use mpi
     use hdf5
     implicit double precision (a-h,o-z)

     !------------------ HDF variables ------------------!
     integer(hid_t) :: plist_id      ! property list identifier
     integer(hid_t) :: dcpl          ! property list identifier
     integer(hid_t) :: file_id       ! file identifier
     integer(hid_t) :: dataset_id    ! dataset identifier
     integer(hid_t) :: dataspace_id  ! dataspace identifier
     double precision, dimension(3) :: attr_datacur
     INTEGER(HSIZE_T), DIMENSION(1) :: adims = (/3/) ! Attribute dimension
     INTEGER(HSIZE_T), DIMENSION(1) :: data_dims
     INTEGER(HID_T) :: attr_id ! Attribute identifier
     INTEGER(HID_T) :: aspace_id ! Attribute dataspace identifier
     INTEGER(HID_T) :: atype_id ! Attribute dataspace identifier
     INTEGER :: arank = 1 ! Attribute rank
     integer(hid_t) :: filespace, memspace, memd
     !---------------------------------------------------!

     !------------------ Filter variables ---------------!
     double precision, allocatable :: data(:,:,:)           ! data
     integer(hsize_t), dimension(3) :: cdims = (/1,1,1/) ! chunks data dimensions
     !INTEGER :: szip_options_mask
     !INTEGER :: szip_pixels_per_block
     !---------------------------------------------------!

      parameter   (nDim = 2)

     !------------------ miscellaneous ------------------!
     character(len=3) :: c                                     ! dataset name for specific rank
     character(len=10) :: dataset_name
     integer :: rank = 3                                       ! data rank. q is 4D
     character(mpi_max_processor_name) hostname
     dimension q(meqn, 1-mbc:mx+mbc, 1-mbc:my+mbc), mtotal(nDim)
     integer(hsize_t), dimension(3) :: dimsf ! data dataset dimensions
     integer :: i,j,k,l,m,info,idd
     character*20 fname
     common /mpicomm/ mpi_comm_2d, lx, ly, mtotal, mstart
     common /mpi_proc_info/ np, id
     !---------------------------------------------------!

     ! initialize HDF5 fortran interface
    call h5open_f(ierr)
    ngrids_out = 1

    ! define size of q for every core (m,i,j)
    dimsf(1) = meqn
    dimsf(2) = mx
    dimsf(3) = my

    allocate (data(dimsf(1),dimsf(2),dimsf(3)))

     info = mpi_info_null

     fname = 'fort.q' &
     & // char(ichar('0') + mod(iframe/1000,10)) &
     & // char(ichar('0') + mod(iframe/100,10)) &
     & // char(ichar('0') + mod(iframe/10,10)) &
     & // char(ichar('0') + mod(iframe,10)) &
     & // '.h5'

      ! Check data for very small values and
      do j=1,my
      do i=1,mx
      do m=1,meqn
      data(m,i,j) = q(m,i,j)
      end do
      end do
      end do

    ! have id 0 creates hdf5 data layout and write all attributes
    if (id == 0) then

     ! create datatype for the attribute
	call h5tcopy_f(h5t_native_double,atype_id,ierr)

    cdims(1) = dimsf(1)
    cdims(2) = dimsf(2)
    cdims(3) = dimsf(3)

    	 ! create scalar dataspace for the attribute
    	 call h5screate_simple_f(arank,adims,aspace_id, ierr)

         ! create the hdf5 file
         call h5fcreate_f(fname, h5f_acc_trunc_f, file_id, ierr)

         ! create the dataspace for the dataset
         call h5screate_simple_f(rank, dimsf, dataspace_id, ierr)

         ! create properties variable for the data
         call h5pcreate_f(h5p_dataset_create_f, dcpl, ierr)

         ! attribute the chunk size
         call h5pset_chunk_f(dcpl, 3, cdims, ierr)

         ! attribute the compression type (GZIP compression)
         call h5pset_deflate_f(dcpl, 6, ierr)

         ! attribute time of allocation of space for data in datasets
         call h5pset_alloc_time_f(dcpl, h5d_alloc_time_early_f, ierr)

! create name for every dataset
   do i=1,np

         write(c,"(i0)") i
         dataset_name = "Pid" // trim(c)

         ! create dataset for this processor (based on id)
         call h5dcreate_f(file_id, dataset_name, h5t_native_double, &
                             dataspace_id, dataset_id, ierr, dcpl_id=dcpl)


	     if (i == 1) then
         ! Attributes list is created only for MASTER

	    attr_datacur(1) = ngrids_out
        attr_datacur(2) = nDim
        attr_datacur(3) = t

         call h5acreate_f(dataset_id,"Parameters",atype_id,aspace_id,attr_id, ierr)

         data_dims(1) = 25
         call h5awrite_f(attr_id, atype_id, attr_datacur, data_dims, ierr)

         ! close attribute
         call h5aclose_f(attr_id, ierr)

         ! close access to the dataspace for attribute
         call h5sclose_f(aspace_id, ierr)

         call h5tclose_f(atype_id, ierr)

         end if

         ! close dataset
         call h5dclose_f(dataset_id, ierr)



   enddo

         ! close the dataspace
         call h5sclose_f(dataspace_id, ierr)

         ! close the properties variable
         call h5pclose_f(dcpl, ierr)

         ! close the file
         call h5fclose_f(file_id, ierr)
   end if

      ! mpi barrier to make sure everything is synched
      call mpi_barrier(mpi_comm_2d, ierr)

      ! Now every processor is writing its own data to its dataset

      ! setup file access property variable with parallel i/o access
     call h5pcreate_f(h5p_file_access_f, plist_id, ierr)
     call h5pset_fapl_mpio_f(plist_id, mpi_comm_2d, info, ierr)

     ! open hdf5 file for current time
     call h5fopen_f(fname, h5f_acc_rdwr_f, file_id, ierr, plist_id)

     ! close the property list
     call h5pclose_f(plist_id, ierr)

     ! create properties variable
     call h5pcreate_f(h5p_dataset_xfer_f, plist_id, ierr)
     ! set collective mpio model
     call h5pset_dxpl_mpio_f(plist_id, h5fd_mpio_collective_f, ierr)

     ! Parallel compression requires collective writing
     do i=1,np

     write(c,"(i0)") i
     dataset_name = "Pid" // trim(c)

     ! open dataset (each processor opens its own dataset)
     call h5dopen_f(file_id, dataset_name, dataset_id, ierr)
	 call h5dget_space_f(dataset_id,filespace,ierr)

     if (id /= i-1) then
     call h5sselect_none_f(filespace, ierr)
     end if

     ! write data to dataset
     call h5dwrite_f(dataset_id, h5t_native_double, data, &
     & dimsf, ierr, file_space_id = filespace, xfer_prp = plist_id)

     call h5dclose_f(dataset_id,ierr)

     enddo

     call h5sclose_f(filespace, ierr)

     call h5pclose_f(plist_id, ierr)
     call h5fclose_f(file_id, ierr)

	 deallocate(data)

     ! close fortran interface
     call h5close_f(ierr)

     return
     end

brtnfld · September 2, 2022, 5:42pm

A couple of things:

You don’t need to copy atype_id. Just use h5t_native_double.
Scaling the number of datasets with the number of ranks is usually not the most fantastic I/O pattern for a parallel filesystem. It will significantly increase the metadata in HDF5 as you increase the number of ranks (especially since you are using chunked datasets). Is there a reason not to use a single dataset and use hyperslab selections?
The issue might be early allocation. Can you remove that? When that is removed, you should use H5Pset_fill_time with H5D_FILL_TIME_NEVER. In HDF5, in sequential mode, HDF5 allocates chunks incrementally, i.e., when data is written to a chunk for the first time. The chunk is also initialized with the default or user-provided fill value. In the parallel case (with the MPI IO VFD), chunks are always allocated when the dataset is created (not incrementally). So given that the dataset was created sequentially, it will not get allocated until the dataset is opened. It will then allocate the dataset, write the fill values, and compress the dataset when it is opened. This will cause a terrible slowdown. It is even worse in this case since the scale of the number of datasets increases with the number of ranks.
Since you are using parallel compression, you will want to use the latest versions in 1.10,1.12, or 1.13 since parallel compression is continuously improving. Which exact versions did you try?
It seems you can move to creating/writing the attribute when you create the datasets on one rank—no reason to have all the ranks involved in that.