Hello,
I’m using h5py v3.5.0 to read data from multiple .h5 files full of images (stored with gzip compression).
When I read them through a PyTorch dataset with multiple DataLoader workers, memory usage keeps climbing until the page file is full. Running the same code with num_workers set to 0 uses only about 2-3 GB, which is the expected amount. Below is my dataset code, with some values replaced by "dummy" since the data is private. I suspect the issue comes from the multiprocessing in combination with h5py, but maybe I’m doing something wrong in my code.
import h5py
import torch
from torch.utils.data import Dataset


class Dataset(Dataset):
    def __init__(self, x, y):
        self.database_name = "dummy"
        self.all_layers = x
        self.output = y
        super().__init__()
        self.index_dict = self.__register_indexes()

    def __register_indexes(self):
        """
        Returns a dictionary that maps a flat sample index
        to the image's location in the hdf5 file.
        """
        index_dict = {}
        counter = 0
        for index in range(len(self.all_layers)):
            # 'dummy.h5' stands in for the real per-file path
            h5_file = h5py.File('dummy.h5', 'r')
            for i in range(len(h5_file[self.database_name])):
                index_dict[counter] = {
                    'file': int(self.all_layers[index]), 'index': i, 'output': self.output
                }
                counter += 1
            h5_file.close()
        return index_dict

    def __len__(self):
        # total number of samples across all files
        return len(self.index_dict)

    def __getitem__(self, index):
        item_info = self.index_dict[index]
        file = item_info['file']
        group_index = item_info['index']
        output = item_info['output']
        # open the file per call so each worker gets its own handle
        with h5py.File('dummy.h5', 'r') as h5_file:
            return (
                torch.from_numpy(
                    h5_file[self.database_name][group_index].astype('uint8')
                ),
                torch.tensor(output, dtype=torch.long),
            )
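For reference, the dataset is consumed roughly like this; the batch size, worker count, and the x/y arguments here are just placeholders, not my real values:

from torch.utils.data import DataLoader

# Placeholder inputs; the real x (list of file ids) and y (label) are private.
x = [0, 1, 2]
y = 0

dataset = Dataset(x, y)

# With num_workers=0 memory stays around 2-3 GB; with workers > 0 it keeps
# growing until the page file is full.
loader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=4)

for images, labels in loader:
    pass  # training step omitted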