I was able to upload my data to the server using the proposed approach.
Now I’m looking for a solution to the problem I described in another topic.
I thought that the HSDS approach could improve performance, but with the resources provided by Kita Lab (4 nodes) it took 250 seconds to read 500 arbitrary rows. (On my personal PC I read 5000 rows in 90 seconds, as posted in the question; this is probably limited by my HDD.) The goal, however, is to do that in about 1 second, if possible.
Does HSDS spread the data across multiple nodes, so that I can expect reading selected data to be faster than the speed limit of a single HDD?
Is there a preferred way to solve my task?
For now, I get an exception when trying to read every 5000th row from a dataset of 25 million rows:
arr=dset[0:25000000:5000, :]
---------------------------------------------------------------------------
timeout Traceback (most recent call last)
/opt/conda/lib/python3.9/site-packages/urllib3/response.py in _error_catcher(self)
437 try:
--> 438 yield
439
/opt/conda/lib/python3.9/site-packages/urllib3/response.py in read(self, amt, decode_content, cache_content)
518 cache_content = False
--> 519 data = self._fp.read(amt) if not fp_closed else b""
520 if (
/opt/conda/lib/python3.9/http/client.py in read(self, amt)
457 b = bytearray(amt)
--> 458 n = self.readinto(b)
459 return memoryview(b)[:n].tobytes()
/opt/conda/lib/python3.9/http/client.py in readinto(self, b)
501 # (for example, reading in 1k chunks)
--> 502 n = self.fp.readinto(b)
503 if not n and b:
/opt/conda/lib/python3.9/socket.py in readinto(self, b)
703 try:
--> 704 return self._sock.recv_into(b)
705 except timeout:
timeout: timed out
During handling of the above exception, another exception occurred:
ReadTimeoutError Traceback (most recent call last)
/opt/conda/lib/python3.9/site-packages/requests/models.py in generate()
752 try:
--> 753 for chunk in self.raw.stream(chunk_size, decode_content=True):
754 yield chunk
/opt/conda/lib/python3.9/site-packages/urllib3/response.py in stream(self, amt, decode_content)
575 while not is_fp_closed(self._fp):
--> 576 data = self.read(amt=amt, decode_content=decode_content)
577
/opt/conda/lib/python3.9/site-packages/urllib3/response.py in read(self, amt, decode_content, cache_content)
540 # Content-Length are caught.
--> 541 raise IncompleteRead(self._fp_bytes_read, self.length_remaining)
542
/opt/conda/lib/python3.9/contextlib.py in __exit__(self, type, value, traceback)
134 try:
--> 135 self.gen.throw(type, value, traceback)
136 except StopIteration as exc:
/opt/conda/lib/python3.9/site-packages/urllib3/response.py in _error_catcher(self)
442 # there is yet no clean way to get at it from this context.
--> 443 raise ReadTimeoutError(self._pool, None, "Read timed out.")
444
ReadTimeoutError: HTTPConnectionPool(host='hsds.hdflab.svc.cluster.local', port=80): Read timed out.
During handling of the above exception, another exception occurred:
ConnectionError Traceback (most recent call last)
/opt/conda/lib/python3.9/site-packages/h5pyd/_hl/dataset.py in __getitem__(self, args, new_dtype)
1178 try:
-> 1179 rsp = self.GET(req, params=params, format="binary")
1180 except IOError as ioe:
/opt/conda/lib/python3.9/site-packages/h5pyd/_hl/base.py in GET(self, req, params, use_cache, format)
972 downloaded_bytes = 0
--> 973 for http_chunk in rsp.iter_content(chunk_size=HTTP_CHUNK_SIZE):
974 if http_chunk: # filter out keep alive chunks
/opt/conda/lib/python3.9/site-packages/requests/models.py in generate()
759 except ReadTimeoutError as e:
--> 760 raise ConnectionError(e)
761 else:
ConnectionError: HTTPConnectionPool(host='hsds.hdflab.svc.cluster.local', port=80): Read timed out.
During handling of the above exception, another exception occurred:
OSError Traceback (most recent call last)
<ipython-input-12-ad1fd6235f39> in <module>
1 # start = time.time()
----> 2 arr=dset[0:25000000:5000, :]
3 # end = time.time()
4 # print(f'elapsed time: {end - start}')
5 print(f"array shape: {arr.shape}")
/opt/conda/lib/python3.9/site-packages/h5pyd/_hl/dataset.py in __getitem__(self, args, new_dtype)
1186 break
1187 else:
-> 1188 raise IOError(f"Error retrieving data: {ioe.errno}")
1189 if type(rsp) in (bytes, bytearray):
1190 # got binary response
OSError: Error retrieving data: None