from __future__ import division
from __future__ import print_function

import sys

import numpy
import tables

DEFAULT_NODE_NAME = "defaultNode"


def init_h5_file(toDiskName, groupName=DEFAULT_NODE_NAME, groupDescription=DEFAULT_NODE_NAME):
    """
    toDiskName: the name of the file on disk
    """
    h5file = tables.openFile(toDiskName, mode="w", title="Dataset")
    h5file.createGroup(h5file.root, groupName, groupDescription)
    return h5file


class InfoToInitArrayOnH5File(object):
    def __init__(self, name, shape, atomicType):
        """
        name: the name of this matrix
        shape: tuple indicating the shape of the matrix (similar to numpy shapes)
        atomicType: one of the pytables atomic types,
            e.g. tables.Float32Atom() or tables.StringAtom(itemsize=length)
        """
        self.name = name
        self.shape = shape
        self.atomicType = atomicType


def writeToDisk(theH5Column, whatToWrite, batch_size=5000):
    """Write whatToWrite to disk in batches of batch_size."""
    data_size = len(whatToWrite)
    last = int(data_size / float(batch_size)) * batch_size
    for i in range(0, data_size, batch_size):
        stop = (i + data_size % batch_size if i >= last else i + batch_size)
        theH5Column.append(whatToWrite[i:stop])
        theH5Column.flush()


def getH5column(h5file, columnName, nodeName=DEFAULT_NODE_NAME):
    node = h5file.getNode('/', nodeName)
    return getattr(node, columnName)


def initColumnsOnH5File(h5file, infoToInitArraysOnH5File, expectedRows,
                        nodeName=DEFAULT_NODE_NAME, complib='blosc', complevel=5):
    """
    h5file: filehandle to the h5file, initialised with init_h5_file
    infoToInitArraysOnH5File: list of instances of InfoToInitArrayOnH5File
    expectedRows: this code is set up to work with EArrays, which can be extended after
        creation (presumably, if your data is too big to fit in memory, you are going to
        have to use EArrays to write it in pieces). expectedRows is the estimated number
        of rows of the final array; it is used by the compression algorithm and can have
        a significant impact on performance.
    nodeName: the name of the node being written to.
    complib: the docs seem to recommend blosc for compression.
    complevel: compression level; not really sure how much of a difference this number makes.
    """
    gcolumns = h5file.getNode(h5file.root, nodeName)
    filters = tables.Filters(complib=complib, complevel=complevel)
    for infoToInitArrayOnH5File in infoToInitArraysOnH5File:
        finalShape = [0]  # in an EArray, the extendable dimension is set to have length 0
        finalShape.extend(infoToInitArrayOnH5File.shape)
        h5file.createEArray(gcolumns, infoToInitArrayOnH5File.name,
                            atom=infoToInitArrayOnH5File.atomicType,
                            shape=finalShape,
                            title=infoToInitArrayOnH5File.name,  # title is just a human-readable label for the node
                            filters=filters,
                            expectedrows=expectedRows)
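
# Illustrative sketch only (not part of the original pipeline): one way the writer
# utilities above might be combined to build a .pytables file. The output path
# "example.pytables", the 4x20x20x20 grid shape, and the random data below are
# assumptions chosen for demonstration.
def _example_write_dataset():
    h5file = init_h5_file("example.pytables")  # assumed output path
    columns = [
        InfoToInitArrayOnH5File("data", [4, 20, 20, 20], tables.Float32Atom()),
        InfoToInitArrayOnH5File("label", [], tables.Float32Atom()),
    ]
    initColumnsOnH5File(h5file, columns, expectedRows=1000)
    # Append a small batch of (random) examples and matching labels.
    X = numpy.random.rand(10, 4, 20, 20, 20).astype("float32")
    y = numpy.zeros(10, dtype="float32")
    writeToDisk(getH5column(h5file, "data"), X)
    writeToDisk(getH5column(h5file, "label"), y)
    h5file.close()
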
labelName = "label"; labelShape = []; all_Xtr=[] all_ytr=[] all_train_sizes=[] train_mean=numpy.zeros((4,20,20,20)) total_train_size=0 for part in range (0,6): filename_train = "../data/ATOM_CHANNEL_dataset/train_data_"+str(part+1)+".pytables"; h5file_train = tables.openFile(filename_train, mode="r") dataColumn_train = getH5column(h5file_train, dataName); labelColumn_train = getH5column(h5file_train, labelName); Xtr=dataColumn_train[:] ytr=labelColumn_train[:] total_train_size+=Xtr.shape[0] train_mean += numpy.mean(Xtr, axis=0) all_train_sizes.append(Xtr.shape[0]) all_Xtr.append(Xtr) all_ytr.append(ytr) mean = train_mean/6 norm_Xtr = [] for Xtr in all_Xtr: Xtr -= mean norm_Xtr.append(Xtr) # Due to memorry consideration and training speed, we only used 1/6 test data to get a sense of the general test error. # We test the full test dataset separately after the training is completed. for part in range (0,1): filename_test = "../data/ATOM_CHANNEL_dataset/test_data_"+str(part+1)+".pytables"; h5file_test = tables.openFile(filename_test, mode="r") dataColumn_test = getH5column(h5file_test, dataName); labelColumn_test = getH5column(h5file_test, labelName); Xt=dataColumn_test[:] yt=labelColumn_test[:] Xt -= mean if part == 0: norm_Xt = Xt all_yt = yt else: norm_Xt = numpy.concatenate((norm_Xt,Xt), axis=0) all_yt = numpy.concatenate((all_yt,yt), axis=0) # Same considerations as the above for the test dataset, more val data can be used to tune the hyper-parameters if desired for part in range (0,1): filename_val = "../data/ATOM_CHANNEL_dataset/val_data_"+str(part+1)+".pytables"; h5file_val = tables.openFile(filename_val, mode="r") dataColumn_val = getH5column(h5file_val, dataName); labelColumn_val = getH5column(h5file_val, labelName); Xv=dataColumn_val[:] yv=labelColumn_val[:] Xv -= mean if part == 0: norm_Xv = Xv all_yv = yv else: norm_Xv = numpy.concatenate((norm_Xv,Xv), axis=0) all_yv = numpy.concatenate((all_yv,yv), axis=0) all_examples=[norm_Xtr,norm_Xt,norm_Xv] all_labels=[all_ytr,all_yt,all_yv] return [all_examples, all_labels, all_train_sizes, norm_Xt.shape[0], norm_Xv.shape[0]] def load_FEATURE(): ID = 'FEATURE_SCOP_T4_train_beta' for part in range(0,5): data_part = numpy.load("../data/FEATURE_dataset/FEATURE_train_X_"+str(part)+".dat") labels_part = numpy.load("../data/FEATURE_dataset/FEATURE_train_y_"+str(part)+".dat") if part ==0: data = data_part labels = labels_part else: data = numpy.concatenate((data,data_part),axis=0) labels = numpy.concatenate((labels,labels_part),axis=0) data_mean = numpy.mean(data,axis=0) Xv = numpy.load("../data/FEATURE_dataset/FEATURE_val_X.dat") yv = numpy.load("../data/FEATURE_dataset/FEATURE_val_y.dat") Xt = numpy.load("../data/FEATURE_dataset/FEATURE_test_X.dat") yt = numpy.load("../data/FEATURE_dataset/FEATURE_test_y.dat") Xv -= data_mean Xt -= data_mean data -= data_mean Xtr=data ytr=labels return [Xtr, ytr, Xt, yt, Xv, yv]