Hi everyone,
I’m encountering an issue when using h5py to read a batch of HDF files. Occasionally, the program hangs when trying to access a specific dataset within a file. Strangely, if I read the same dataset from the same file using a different script or environment, the read operation works just fine.
Here is the code I’m using to check the files:
import h5py
import tqdm
import os
import torch
import numpy as np
import functools
import signal
import glob
import traceback
def timeout(sec):
    """
    Timeout decorator.

    :param sec: raise TimeoutError if the wrapped function runs longer than this many seconds
    """
    def decorator(func):
        @functools.wraps(func)
        def wrapped_func(*args, **kwargs):
            def _handle_timeout(signum, frame):
                err_msg = f'Function {func.__name__} timed out after {sec} seconds'
                raise TimeoutError(err_msg)
            # Note: SIGALRM-based timeouts only work in the main thread on Unix.
            signal.signal(signal.SIGALRM, _handle_timeout)
            signal.alarm(sec)
            try:
                result = func(*args, **kwargs)
            finally:
                signal.alarm(0)
            return result
        return wrapped_func
    return decorator


@timeout(180)
def read_h5file(fn):
    # Read every dataset in full; the values are discarded, this only checks readability.
    with h5py.File(fn, 'r') as h5f:
        data = torch.from_numpy(h5f['data'][:].astype(np.float32))
        target = torch.from_numpy(h5f['target'][:].astype(np.float32))
        lats = torch.from_numpy(h5f['latitude'][:].astype(np.float32))
        lons = torch.from_numpy(h5f['longitude'][:].astype(np.float32))
    return


def check_h5fiels(fn_list):
    for fn in tqdm.tqdm(fn_list):
        try:
            read_h5file(fn)
        except TimeoutError:
            print("Time out reading ", fn)
            traceback.print_exc()
        except Exception as e:
            print(f"Got exception {e} reading {fn}")


if __name__ == "__main__":
    dataset_path = "/mnt/md2/FY3E-HIRAS-DATA"
    fn_list = glob.glob(os.path.join(dataset_path, "**", "*HDF"))
    check_h5fiels(fn_list)
I’ve set a timeout of 180 seconds, which is usually more than enough for normal reads. However, I still get output like this:
Time out reading /mnt/md2/FY3E-HIRAS-DATA/data/20230325_1309.HDF
Traceback (most recent call last):
  File "/home/ices/lay/lightning-Swin/check_hdf_files.py", line 48, in check_h5fiels
    read_h5file(fn)
  File "/home/ices/lay/lightning-Swin/check_hdf_files.py", line 27, in wrapped_func
    result = func(*args, **kwargs)
  File "/home/ices/lay/lightning-Swin/check_hdf_files.py", line 38, in read_h5file
    data = torch.from_numpy(h5f['data'][:].astype(np.float32))
  File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper
  File "h5py/_objects.pyx", line 55, in h5py._objects.with_phil.wrapper
  File "/home/ices/miniconda3/envs/lightning/lib/python3.10/site-packages/h5py/_hl/dataset.py", line 758, in __getitem__
    return self._fast_reader.read(args)
  File "/home/ices/lay/lightning-Swin/check_hdf_files.py", line 22, in _handle_timeout
    raise TimeoutError(err_msg)
TimeoutError: Function read_h5file timed out after 180 seconds
3%|█▊ | 664/19850 [1:03:32<156:30:42, 29.37s/it]
As shown above, the program hangs when calling h5f['data'][:]. I'm not sure what's causing the hang or how to debug this behavior.
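For what it's worth, reading the same dataset from one of the files that timed out completes normally when I run it on its own in a separate script/environment. A minimal sketch of that standalone check (the path is the one from the timeout message above):

import h5py
import numpy as np

fn = "/mnt/md2/FY3E-HIRAS-DATA/data/20230325_1309.HDF"
with h5py.File(fn, 'r') as h5f:
    data = h5f['data'][:].astype(np.float32)
    # Completes normally here, even though the same read hangs in the batch loop.
    print(data.shape, data.dtype)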
Does anyone have suggestions on how to further investigate or resolve this issue?
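One idea I'm considering for the next run is to have faulthandler print every thread's stack while a read is stalled, so I can at least see which call it is stuck in. A rough sketch (it reuses read_h5file() from the script above; the 60-second stall threshold is arbitrary):

import sys
import faulthandler

def read_with_stack_dump(fn, stall_sec=60):
    # Ask faulthandler's watchdog to dump all thread stacks to stderr if the
    # read has not finished after stall_sec seconds, without exiting the process.
    faulthandler.dump_traceback_later(stall_sec, repeat=True, file=sys.stderr, exit=False)
    try:
        read_h5file(fn)
    finally:
        faulthandler.cancel_dump_traceback_later()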
Below is my environment:
Ubuntu 22.04.5 LTS
Python 3.10 (miniconda)
h5py 3.11.0
Thanks in advance!