@jhenderson I’ve applied and tested your patch against HDF5 1.10.5, and while it no longer crashes, it now hangs at the same stage. What seems to be happening is that the ranks have gotten out of sync: some hit an MPI section, and others didn’t. I’ve taken stack traces of each process and put them below; apologies for the verbose dump!
Updated with debugging output
Rank 0 (rank 1 is very similar, with just a deeper set of msort calls, probably dependent on when I sample):
#0 0x00007f42cd583b60 in memcpy@GLIBC_2.2.5 () from /cvmfs/soft.computecanada.ca/nix/store/63pk88rnmkzjblpxydvrmskkc8ci7cx6-glibc-2.24/lib/libc.so.6
#1 0x00007f42cd534ce2 in msort_with_tmp.part.0 () from /cvmfs/soft.computecanada.ca/nix/store/63pk88rnmkzjblpxydvrmskkc8ci7cx6-glibc-2.24/lib/libc.so.6
#2 0x00007f42cd534cf8 in msort_with_tmp.part.0 () from /cvmfs/soft.computecanada.ca/nix/store/63pk88rnmkzjblpxydvrmskkc8ci7cx6-glibc-2.24/lib/libc.so.6
#3 0x00007f42cd534ce2 in msort_with_tmp.part.0 () from /cvmfs/soft.computecanada.ca/nix/store/63pk88rnmkzjblpxydvrmskkc8ci7cx6-glibc-2.24/lib/libc.so.6
#4 0x00007f42cd53524f in qsort_r () from /cvmfs/soft.computecanada.ca/nix/store/63pk88rnmkzjblpxydvrmskkc8ci7cx6-glibc-2.24/lib/libc.so.6
#5 0x00000000004df4f0 in H5D__chunk_collective_fill (dset=0x1d66dc0, chunk_info=0x7ffe63a73530, chunk_size=531, fill_buf=0x1eb8ac0) at H5Dchunk.c:4683
#6 0x00000000004de2f6 in H5D__chunk_allocate (io_info=0x7ffe63a73b80, full_overwrite=false, old_dim=0x7ffe63a739d0) at H5Dchunk.c:4402
#7 0x000000000050fdaf in H5D__init_storage (io_info=0x7ffe63a73b80, full_overwrite=false, old_dim=0x7ffe63a739d0) at H5Dint.c:2421
#8 0x000000000050f7ea in H5D__alloc_storage (io_info=0x7ffe63a73b80, time_alloc=H5D_ALLOC_CREATE, full_overwrite=false, old_dim=0x0) at H5Dint.c:2334
#9 0x000000000051cb6b in H5D__layout_oh_create (file=0x1d534b0, oh=0x1db43f0, dset=0x1d66dc0, dapl_id=720575940379279367) at H5Dlayout.c:507
#10 0x000000000050938b in H5D__update_oh_info (file=0x1d534b0, dset=0x1d66dc0, dapl_id=720575940379279367) at H5Dint.c:976
#11 0x000000000050aa0d in H5D__create (file=0x1d534b0, type_id=216172782113783850, space=0x1d57bf0, dcpl_id=720575940379279377, dapl_id=720575940379279367) at H5Dint.c:1277
#12 0x000000000093059d in H5O__dset_create (f=0x1d534b0, _crt_info=0x7ffe63a748d0, obj_loc=0x7ffe63a73f80) at H5Doh.c:299
#13 0x00000000006b6519 in H5O_obj_create (f=0x1d534b0, obj_type=H5O_TYPE_DATASET, crt_info=0x7ffe63a748d0, obj_loc=0x7ffe63a73f80) at H5Oint.c:2452
#14 0x000000000065e722 in H5L__link_cb (grp_loc=0x7ffe63a74640, name=0x7ffe63a741a0 "dset2", lnk=0x0, obj_loc=0x0, _udata=0x7ffe63a747c0, own_loc=0x7ffe63a745ac) at H5L.c:1603
#15 0x00000000005f6a2e in H5G__traverse_real (_loc=0x7ffe63a74980, name=0x97c9ee "dset2", target=0, op=0x65e406 <H5L__link_cb>, op_data=0x7ffe63a747c0) at H5Gtraverse.c:626
#16 0x00000000005f78a1 in H5G_traverse (loc=0x7ffe63a74980, name=0x97c9ee "dset2", target=0, op=0x65e406 <H5L__link_cb>, op_data=0x7ffe63a747c0) at H5Gtraverse.c:850
#17 0x000000000065f30d in H5L__create_real (link_loc=0x7ffe63a74980, link_name=0x97c9ee "dset2", obj_path=0x0, obj_file=0x0, lnk=0x7ffe63a74850, ocrt_info=0x7ffe63a748f0, lcpl_id=720575940379279374) at H5L.c:1797
#18 0x000000000065e3a8 in H5L_link_object (new_loc=0x7ffe63a74980, new_name=0x97c9ee "dset2", ocrt_info=0x7ffe63a748f0, lcpl_id=720575940379279374) at H5L.c:1556
#19 0x0000000000505f0d in H5D__create_named (loc=0x7ffe63a74980, name=0x97c9ee "dset2", type_id=216172782113783850, space=0x1d57bf0, lcpl_id=720575940379279374, dcpl_id=720575940379279377, dapl_id=720575940379279367) at H5Dint.c:328
#20 0x00000000004bf947 in H5Dcreate2 (loc_id=72057594037927936, name=0x97c9ee "dset2", type_id=216172782113783850, space_id=288230376151711746, lcpl_id=720575940379279374, dcpl_id=720575940379279377, dapl_id=720575940379279367) at H5D.c:144
#21 0x0000000000404003 in main (argc=1, argv=0x7ffe63a74b98) at test_ph5.c:81
Rank 2 (again, rank 3 is similar):
#0 0x00007f4db850bb46 in psm2_mq_ipeek2 () from /cvmfs/soft.computecanada.ca/nix/var/nix/profiles/16.09/lib/libpsm2.so.2
#1 0x00007f4db8757409 in ompi_mtl_psm2_progress () from /cvmfs/soft.computecanada.ca/easybuild/software/2017/avx2/Compiler/gcc8/openmpi/4.0.1/lib/openmpi/mca_mtl_psm2.so
#2 0x00007f4dbd0a4e0b in opal_progress () from /cvmfs/soft.computecanada.ca/easybuild/software/2017/avx2/Compiler/gcc8/openmpi/4.0.1/lib/libopen-pal.so.40
#3 0x00007f4dbdc46435 in ompi_request_default_wait () from /cvmfs/soft.computecanada.ca/easybuild/software/2017/avx2/Compiler/gcc8/openmpi/4.0.1/lib/libmpi.so.40
#4 0x00007f4dbdca6303 in ompi_coll_base_sendrecv_actual () from /cvmfs/soft.computecanada.ca/easybuild/software/2017/avx2/Compiler/gcc8/openmpi/4.0.1/lib/libmpi.so.40
#5 0x00007f4dbdca6739 in ompi_coll_base_allreduce_intra_recursivedoubling () from /cvmfs/soft.computecanada.ca/easybuild/software/2017/avx2/Compiler/gcc8/openmpi/4.0.1/lib/libmpi.so.40
#6 0x00007f4dbdc5a5b8 in PMPI_Allreduce () from /cvmfs/soft.computecanada.ca/easybuild/software/2017/avx2/Compiler/gcc8/openmpi/4.0.1/lib/libmpi.so.40
#7 0x00007f4dbdd2aafc in mca_io_romio_dist_MPI_File_set_view () from /cvmfs/soft.computecanada.ca/easybuild/software/2017/avx2/Compiler/gcc8/openmpi/4.0.1/lib/libmpi.so.40
#8 0x00007f4dbdcfa9ab in mca_io_romio321_file_set_view () from /cvmfs/soft.computecanada.ca/easybuild/software/2017/avx2/Compiler/gcc8/openmpi/4.0.1/lib/libmpi.so.40
#9 0x00007f4dbdc6ad68 in PMPI_File_set_view () from /cvmfs/soft.computecanada.ca/easybuild/software/2017/avx2/Compiler/gcc8/openmpi/4.0.1/lib/libmpi.so.40
#10 0x000000000090aa05 in H5FD_mpio_write (_file=0x2aee1d0, type=H5FD_MEM_DRAW, dxpl_id=720575940379279368, addr=0, size=1, buf=0x2c52b20) at H5FDmpio.c:1806
#11 0x00000000005b6669 in H5FD_write (file=0x2aee1d0, type=H5FD_MEM_DRAW, addr=0, size=1, buf=0x2c52b20) at H5FDint.c:257
#12 0x0000000000933e3b in H5F__accum_write (f=0x2aee360, map_type=H5FD_MEM_DRAW, addr=0, size=1, buf=0x2c52b20) at H5Faccum.c:825
#13 0x000000000073df2c in H5PB_write (f=0x2aee360, type=H5FD_MEM_DRAW, addr=0, size=1, buf=0x2c52b20) at H5PB.c:1027
#14 0x00000000005885f4 in H5F_block_write (f=0x2aee360, type=H5FD_MEM_DRAW, addr=0, size=1, buf=0x2c52b20) at H5Fio.c:164
#15 0x00000000004dfa65 in H5D__chunk_collective_fill (dset=0x2b012e0, chunk_info=0x7ffc343d85a0, chunk_size=531, fill_buf=0x2c52b20) at H5Dchunk.c:4731
#16 0x00000000004de2f6 in H5D__chunk_allocate (io_info=0x7ffc343d8bf0, full_overwrite=false, old_dim=0x7ffc343d8a40) at H5Dchunk.c:4402
#17 0x000000000050fdaf in H5D__init_storage (io_info=0x7ffc343d8bf0, full_overwrite=false, old_dim=0x7ffc343d8a40) at H5Dint.c:2421
#18 0x000000000050f7ea in H5D__alloc_storage (io_info=0x7ffc343d8bf0, time_alloc=H5D_ALLOC_CREATE, full_overwrite=false, old_dim=0x0) at H5Dint.c:2334
#19 0x000000000051cb6b in H5D__layout_oh_create (file=0x2aee360, oh=0x2b4dac0, dset=0x2b012e0, dapl_id=720575940379279367) at H5Dlayout.c:507
#20 0x000000000050938b in H5D__update_oh_info (file=0x2aee360, dset=0x2b012e0, dapl_id=720575940379279367) at H5Dint.c:976
#21 0x000000000050aa0d in H5D__create (file=0x2aee360, type_id=216172782113783850, space=0x2af2790, dcpl_id=720575940379279377, dapl_id=720575940379279367) at H5Dint.c:1277
#22 0x000000000093059d in H5O__dset_create (f=0x2aee360, _crt_info=0x7ffc343d9940, obj_loc=0x7ffc343d8ff0) at H5Doh.c:299
#23 0x00000000006b6519 in H5O_obj_create (f=0x2aee360, obj_type=H5O_TYPE_DATASET, crt_info=0x7ffc343d9940, obj_loc=0x7ffc343d8ff0) at H5Oint.c:2452
#24 0x000000000065e722 in H5L__link_cb (grp_loc=0x7ffc343d96b0, name=0x7ffc343d9210 "dset2", lnk=0x0, obj_loc=0x0, _udata=0x7ffc343d9830, own_loc=0x7ffc343d961c) at H5L.c:1603
#25 0x00000000005f6a2e in H5G__traverse_real (_loc=0x7ffc343d99f0, name=0x97c9ee "dset2", target=0, op=0x65e406 <H5L__link_cb>, op_data=0x7ffc343d9830) at H5Gtraverse.c:626
#26 0x00000000005f78a1 in H5G_traverse (loc=0x7ffc343d99f0, name=0x97c9ee "dset2", target=0, op=0x65e406 <H5L__link_cb>, op_data=0x7ffc343d9830) at H5Gtraverse.c:850
#27 0x000000000065f30d in H5L__create_real (link_loc=0x7ffc343d99f0, link_name=0x97c9ee "dset2", obj_path=0x0, obj_file=0x0, lnk=0x7ffc343d98c0, ocrt_info=0x7ffc343d9960, lcpl_id=720575940379279374) at H5L.c:1797
#28 0x000000000065e3a8 in H5L_link_object (new_loc=0x7ffc343d99f0, new_name=0x97c9ee "dset2", ocrt_info=0x7ffc343d9960, lcpl_id=720575940379279374) at H5L.c:1556
#29 0x0000000000505f0d in H5D__create_named (loc=0x7ffc343d99f0, name=0x97c9ee "dset2", type_id=216172782113783850, space=0x2af2790, lcpl_id=720575940379279374, dcpl_id=720575940379279377, dapl_id=720575940379279367) at H5Dint.c:328
#30 0x00000000004bf947 in H5Dcreate2 (loc_id=72057594037927936, name=0x97c9ee "dset2", type_id=216172782113783850, space_id=288230376151711746, lcpl_id=720575940379279374, dcpl_id=720575940379279377, dapl_id=720575940379279367) at H5D.c:144
#31 0x0000000000404003 in main (argc=1, argv=0x7ffc343d9c08) at test_ph5.c:81
Hope that’s of some use. Let me know if there’s anything more I can help test out.