#!/usr/bin/env python
from __future__ import division
from __future__ import print_function

import os
import sys

import numpy
import tables  # PyTables >= 3 (snake_case API: open_file, create_earray, ...)

DEFAULT_NODE_NAME = "defaultNode"


def init_h5_file(toDiskName, groupName=DEFAULT_NODE_NAME,
                 groupDescription=DEFAULT_NODE_NAME):
    """
    toDiskName: the name of the file on disk
    """
    h5file = tables.open_file(toDiskName, mode="w", title="Dataset")
    h5file.create_group(h5file.root, groupName, groupDescription)
    return h5file


class InfoToInitArrayOnH5File(object):
    def __init__(self, name, shape, atomicType):
        """
        name: the name of this matrix
        shape: tuple indicating the shape of the matrix (similar to numpy
            shapes), excluding the extendable first dimension
        atomicType: one of the pytables atomic types, e.g.
            tables.Float32Atom() or tables.StringAtom(itemsize=length)
        """
        self.name = name
        self.shape = shape
        self.atomicType = atomicType


def writeToDisk(theH5Column, whatToWrite, batch_size=5000):
    """Write to disk in batches of batch_size rows."""
    data_size = len(whatToWrite)
    for i in range(0, data_size, batch_size):
        stop = min(i + batch_size, data_size)
        theH5Column.append(whatToWrite[i:stop])
    theH5Column._v_file.flush()  # flush via the column's own file handle


def getH5column(h5file, columnName, nodeName=DEFAULT_NODE_NAME):
    node = h5file.get_node('/', nodeName)
    return getattr(node, columnName)


def initColumnsOnH5File(h5file, infoToInitArraysOnH5File, expectedRows,
                        nodeName=DEFAULT_NODE_NAME, complib='blosc',
                        complevel=5):
    """
    h5file: handle to the h5 file, initialised with init_h5_file
    infoToInitArraysOnH5File: list of InfoToInitArrayOnH5File instances
    expectedRows: this code is set up to work with EArrays, which can be
        extended after creation (if your data is too big to fit in memory,
        you have to use EArrays to write it in pieces). expectedRows is the
        estimated number of rows in the final array; PyTables uses it to
        pick the chunk size, which can have a significant impact on
        performance.
    nodeName: the name of the node being written to.
    complib: compression library; the PyTables docs recommend blosc for speed.
    complevel: compression level (0-9); higher trades CPU time for file size.
    """
    gcolumns = h5file.get_node(h5file.root, nodeName)
    filters = tables.Filters(complib=complib, complevel=complevel)
    for infoToInitArrayOnH5File in infoToInitArraysOnH5File:
        finalShape = [0]  # in an EArray, the extendable dimension has length 0
        finalShape.extend(infoToInitArrayOnH5File.shape)
        h5file.create_earray(gcolumns, infoToInitArrayOnH5File.name,
                             atom=infoToInitArrayOnH5File.atomicType,
                             shape=finalShape,
                             # title is just a human-readable description
                             title=infoToInitArrayOnH5File.name,
                             filters=filters, expectedrows=expectedRows)
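
# Usage sketch for the helpers above (illustrative only: "demo.pytables",
# the "vecs" column, and its shape are made-up names for this example):
#
#   h5 = init_h5_file("demo.pytables")
#   info = InfoToInitArrayOnH5File("vecs", [3], tables.Float32Atom())
#   initColumnsOnH5File(h5, [info], expectedRows=10000)
#   col = getH5column(h5, "vecs")
#   writeToDisk(col, numpy.zeros((100, 3), dtype=numpy.float32))
#   print(col[:5])  # EArrays support numpy-style slicing from disk
#   h5.close()
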
def performScikitFit(predictors, outcomes):
    import sklearn.linear_model
    # LinearRegression takes no data in its constructor; data goes to fit()
    model = sklearn.linear_model.LinearRegression()
    model.fit(predictors, outcomes)
    # predict() expects a 2D array of samples (here: one two-feature sample)
    print(model.predict([[2.0, 2.0]]))


if __name__ == "__main__":
    train_or_test = sys.argv[1]
    pos_or_neg = sys.argv[2]
    input_dir = '../site_atp_numpy/' + train_or_test
    ID = train_or_test + '_' + pos_or_neg

    # count the .dat files belonging to this train/test + pos/neg split
    files = [os.path.join(input_dir, f) for f in os.listdir(input_dir)
             if os.path.isfile(os.path.join(input_dir, f))]
    files = [t for t in files if ID in t]
    files = [t for t in files if '.dat' in t]
    total_num = len(files)
    print(total_num)

    filename_train = "../pytables_atp/" + ID + ".pytables"
    h5file = init_h5_file(filename_train)

    # initialise the columns going on the file
    dataName = "data"
    dataShape = [4, 20, 20, 20]  # dimensions other than the extendable dim
    labelName = "label"
    labelShape = []  # scalar label per row
    dataInfo = InfoToInitArrayOnH5File(dataName, dataShape, tables.Float32Atom())
    labelInfo = InfoToInitArrayOnH5File(labelName, labelShape, tables.Float32Atom())
    num_of_dat = min(1000, total_num)
    numSamples = num_of_dat * 1000  # each .dat file holds up to 1000 samples
    initColumnsOnH5File(h5file, [dataInfo, labelInfo], numSamples)

    dataColumn = getH5column(h5file, dataName)
    labelColumn = getH5column(h5file, labelName)
    for dat_num in range(0, num_of_dat):
        print(dat_num)
        X = numpy.load(input_dir + '/' + ID + '_' + str(dat_num) + '.dat')
        writeToDisk(dataColumn, X)
        # write one label per row actually loaded (the last file may hold
        # fewer than 1000 samples): 0.0 for negatives, 1.0 for positives
        if pos_or_neg == 'neg':
            y = numpy.zeros((X.shape[0],), dtype=numpy.float32)
        elif pos_or_neg == 'pos':
            y = numpy.ones((X.shape[0],), dtype=numpy.float32)
        writeToDisk(labelColumn, y)

    h5file.close()
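
# Reading the result back (a minimal sketch; the path follows the naming
# convention above, e.g. argv "train pos" -> train_pos.pytables):
#
#   h5 = tables.open_file("../pytables_atp/train_pos.pytables", mode="r")
#   data = getH5column(h5, "data")     # EArray of shape (N, 4, 20, 20, 20)
#   labels = getH5column(h5, "label")  # EArray of shape (N,)
#   X_batch = data[0:32]  # only the requested slice is read into memory
#   h5.close()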