#!/usr/bin/env python
from __future__ import division
from __future__ import print_function

import os
import sys

import numpy
import tables  # PyTables >= 3 (snake_case API: open_file, create_earray, ...)

DEFAULT_NODE_NAME = "defaultNode"


def init_h5_file(toDiskName, groupName=DEFAULT_NODE_NAME,
                 groupDescription=DEFAULT_NODE_NAME):
    """
    toDiskName: the name of the file on disk
    """
    h5file = tables.open_file(toDiskName, mode="w", title="Dataset")
    h5file.create_group(h5file.root, groupName, groupDescription)
    return h5file


class InfoToInitArrayOnH5File(object):
    def __init__(self, name, shape, atomicType):
        """
        name: the name of this matrix
        shape: tuple indicating the shape of the matrix (similar to numpy
            shapes), excluding the extendable first dimension
        atomicType: one of the pytables atomic types, e.g.
            tables.Float32Atom() or tables.StringAtom(itemsize=length)
        """
        self.name = name
        self.shape = shape
        self.atomicType = atomicType


def writeToDisk(theH5Column, whatToWrite, batch_size=5000):
    """Write to disk in batches of batch_size rows."""
    data_size = len(whatToWrite)
    for i in range(0, data_size, batch_size):
        stop = min(i + batch_size, data_size)
        theH5Column.append(whatToWrite[i:stop])
    theH5Column._v_file.flush()  # flush via the column's own file handle


def getH5column(h5file, columnName, nodeName=DEFAULT_NODE_NAME):
    node = h5file.get_node('/', nodeName)
    return getattr(node, columnName)


def initColumnsOnH5File(h5file, infoToInitArraysOnH5File, expectedRows,
                        nodeName=DEFAULT_NODE_NAME, complib='blosc',
                        complevel=5):
    """
    h5file: handle to the h5 file, initialised with init_h5_file
    infoToInitArraysOnH5File: list of InfoToInitArrayOnH5File instances
    expectedRows: this code is set up to work with EArrays, which can be
        extended after creation (if your data is too big to fit in memory,
        you have to use EArrays to write it in pieces). expectedRows is the
        estimated number of rows in the final array; PyTables uses it to
        pick the chunk size, which can have a significant impact on
        performance.
    nodeName: the name of the node being written to.
    complib: compression library; the PyTables docs recommend blosc for speed.
    complevel: compression level (0-9); higher trades CPU time for file size.
    """
    gcolumns = h5file.get_node(h5file.root, nodeName)
    filters = tables.Filters(complib=complib, complevel=complevel)
    for infoToInitArrayOnH5File in infoToInitArraysOnH5File:
        finalShape = [0]  # in an EArray, the extendable dimension has length 0
        finalShape.extend(infoToInitArrayOnH5File.shape)
        h5file.create_earray(gcolumns, infoToInitArrayOnH5File.name,
                             atom=infoToInitArrayOnH5File.atomicType,
                             shape=finalShape,
                             # title is just a human-readable description
                             title=infoToInitArrayOnH5File.name,
                             filters=filters, expectedrows=expectedRows)
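
# Usage sketch for the helpers above (illustrative only: "demo.pytables",
# the "vecs" column, and its shape are made-up names for this example):
#
#   h5 = init_h5_file("demo.pytables")
#   info = InfoToInitArrayOnH5File("vecs", [3], tables.Float32Atom())
#   initColumnsOnH5File(h5, [info], expectedRows=10000)
#   col = getH5column(h5, "vecs")
#   writeToDisk(col, numpy.zeros((100, 3), dtype=numpy.float32))
#   print(col[:5])  # EArrays support numpy-style slicing from disk
#   h5.close()
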
def performScikitFit(predictors, outcomes):
    import sklearn.linear_model
    # LinearRegression takes no data in its constructor; data goes to fit()
    model = sklearn.linear_model.LinearRegression()
    model.fit(predictors, outcomes)
    # predict() expects a 2D array of samples (here: one two-feature sample)
    print(model.predict([[2.0, 2.0]]))


if __name__ == "__main__":
    train_or_test = sys.argv[1]
    pos_or_neg = sys.argv[2]
    input_dir = '../site_atp_numpy/' + train_or_test
    ID = train_or_test + '_' + pos_or_neg

    # count the .dat files belonging to this train/test + pos/neg split
    files = [os.path.join(input_dir, f) for f in os.listdir(input_dir)
             if os.path.isfile(os.path.join(input_dir, f))]
    files = [t for t in files if ID in t]
    files = [t for t in files if '.dat' in t]
    total_num = len(files)
    print(total_num)

    filename_train = "../pytables_atp/" + ID + ".pytables"
    h5file = init_h5_file(filename_train)

    # initialise the columns going on the file
    dataName = "data"
    dataShape = [4, 20, 20, 20]  # dimensions other than the extendable dim
    labelName = "label"
    labelShape = []  # scalar label per row
    dataInfo = InfoToInitArrayOnH5File(dataName, dataShape, tables.Float32Atom())
    labelInfo = InfoToInitArrayOnH5File(labelName, labelShape, tables.Float32Atom())
    num_of_dat = min(1000, total_num)
    numSamples = num_of_dat * 1000  # each .dat file holds up to 1000 samples
    initColumnsOnH5File(h5file, [dataInfo, labelInfo], numSamples)

    dataColumn = getH5column(h5file, dataName)
    labelColumn = getH5column(h5file, labelName)
    for dat_num in range(0, num_of_dat):
        print(dat_num)
        X = numpy.load(input_dir + '/' + ID + '_' + str(dat_num) + '.dat')
        writeToDisk(dataColumn, X)
        # write one label per row actually loaded (the last file may hold
        # fewer than 1000 samples): 0.0 for negatives, 1.0 for positives
        if pos_or_neg == 'neg':
            y = numpy.zeros((X.shape[0],), dtype=numpy.float32)
        elif pos_or_neg == 'pos':
            y = numpy.ones((X.shape[0],), dtype=numpy.float32)
        writeToDisk(labelColumn, y)

    h5file.close()
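
# Reading the result back (a minimal sketch; the path follows the naming
# convention above, e.g. argv "train pos" -> train_pos.pytables):
#
#   h5 = tables.open_file("../pytables_atp/train_pos.pytables", mode="r")
#   data = getH5column(h5, "data")     # EArray of shape (N, 4, 20, 20, 20)
#   labels = getH5column(h5, "label")  # EArray of shape (N,)
#   X_batch = data[0:32]  # only the requested slice is read into memory
#   h5.close()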