This is my code for loading these files (for the compressed file you need to change the file name accordingly):

- compressed h5 (link to my personal dropbox)
- uncompressed h5 (link to my personal dropbox)
import h5py

# load the respective cached dictionary for the definition of "kmeansDic" throughout this project
# (parent_folder_path is defined elsewhere in my project and points to the folder containing the cache files)
silhouette = True  # True should be selected for complete data. If False is selected, silhouette is just assigned 0 and omitted, saving a lot of computation time.
sample_size = 300000  # sample size passed to silhouette_score(), which is computationally very expensive
k_lowest = 2
k_highest = 3
data_processing = 1
scale_all = True
absolute_values = False
kmeansDic = h5py.File(parent_folder_path + f"kmeansDic({k_lowest},{k_highest})_{sample_size}-samplesize_{silhouette!s}-silhouette_{data_processing}-data-processing_{scale_all!s}-scale-all_{absolute_values!s}-absolute-values.h5", "r")
Note that opening the file alone doesn't take much time, but when my other functions access the data through it, they take 16 seconds with the uncompressed file and over 140 seconds with the compressed file, compared to 8 seconds when the dictionary is kept directly in working memory.
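To illustrate the kind of difference I mean, here is a minimal, self-contained timing sketch (not my project code; the file name, dataset path, and sizes are made up for illustration) comparing repeated reads through an open h5py.File against lookups in a plain in-memory dict:

import time
import numpy as np
import h5py

# create a small example file so the sketch is self-contained
with h5py.File("timing_demo.h5", "w") as f:
    f.create_dataset("2/cluster_numbers", data=np.random.randint(0, 2, size=1_000_000))

h5 = h5py.File("timing_demo.h5", "r")

start = time.time()
for _ in range(100):
    _ = h5["2/cluster_numbers"][:]          # each access reads from disk
print("reads via h5py.File:", round(time.time() - start, 2), "seconds")

# load the dataset once into an ordinary nested dict
in_memory = {"2": {"cluster_numbers": h5["2/cluster_numbers"][:]}}

start = time.time()
for _ in range(100):
    _ = in_memory["2"]["cluster_numbers"]   # plain dict lookup, no disk I/O
print("reads via in-memory dict:", round(time.time() - start, 2), "seconds")

h5.close()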
If it is relevant, this is my code for saving the h5 files:
# Create a nested dictionary for all cluster arrays, sum of squared distances, silhouette and calinski_harabasz scores
import time
from datetime import datetime
from silx.io.dictdump import dicttoh5

start = time.time()
kmeansDic = {}
silhouette_setting = True  # True should be selected for complete data. If False is selected, silhouette is just assigned 0 and omitted, saving a lot of computation time.
sample_size = 300000  # sample size passed to silhouette_score(), which is computationally very expensive
k_lowest = 2
k_highest = 3
data_processing = 1
scale_all = True
absolute_values = False
compression = True
for k in range(k_lowest, k_highest + 1):
    start_k = time.time()
    # kmeans_clustering() is my own function (not shown here)
    cluster_numbers, ssqd, silhouette, calinski_harabasz = kmeans_clustering(
        data_processing=data_processing, scale_all=scale_all, absolute_values=absolute_values,
        k=k, sample_size=sample_size, silhouette=silhouette_setting)
    kmeansDic[str(k)] = {"cluster_numbers": cluster_numbers, "ssqd": ssqd,
                         "silhouette": silhouette, "calinski_harabasz": calinski_harabasz}
    end_k = time.time()
    print("Finished entry in nested dictionary for k = " + str(k) + ". That took " + str(round(end_k - start_k, 2)) + " seconds.")
end = time.time()
print("Calculating all cluster arrays took", round(end - start, 2), "seconds.")
### SAVE CACHE ###
# with these settings (copied from http://www.silx.org/doc/silx/0.2.0/modules/io/dictdump.html), the file size is much smaller, but loading takes much longer, so better not use it!
if compression:
    create_ds_args = {'compression': "gzip",
                      'shuffle': True,
                      'fletcher32': True}
else:
    create_ds_args = None
saveh5 = input("Do you want to save the dictionary as a h5 file that other functions will refer to? (y/n)\n")
if saveh5.lower() == "y":
    if not compression:
        fname = f"kmeansDic({k_lowest},{k_highest})_{sample_size}-samplesize_{silhouette_setting!s}-silhouette_{data_processing}-data-processing_{scale_all!s}-scale-all_{absolute_values!s}-absolute-values.h5"
    else:
        fname = f"kmeansDic({k_lowest},{k_highest})_{sample_size}-samplesize_{silhouette_setting!s}-silhouette_{data_processing}-data-processing_{scale_all!s}-scale-all_{absolute_values!s}-absolute-values_COMPRESSED.h5"
    dicttoh5(kmeansDic, "Cache/" + fname, create_dataset_args=create_ds_args)
    print("The nested dictionary was saved as \"" + fname + "\" in the Cache folder.")
elif saveh5.lower() == "n":
    now = datetime.now()
    dt_string = now.strftime("%d-%m-%Y_%H_%M_%S")
    fname = "kmeansDic_no-save_" + dt_string + ".h5"
    dicttoh5(kmeansDic, "Cache/" + fname)
    print("The nested dictionary was saved as \"" + fname + "\" in the Cache folder.")
I cannot provide details about the original data, since it is confidential, and the code for the other function I referred to is quite long and I suspect would not help much.
(Off-topic note: It is really tricky to format code here.
Sorry, I hope this is legible. Maybe pastebin would be better?)