import os
import sys
import time

import numpy
import numpy as np
import theano
import theano.tensor as T
from theano import shared
from theano.tensor.shared_randomstreams import RandomStreams
from theano.tensor.nnet.nnet import sigmoid
from theano.misc.pkl_utils import dump
from collections import OrderedDict

from process_poc_pretrain import *
from data_utils import *
from graph_cnn_layers import *

max_poc_degrees = 20
max_nodes_in_poc = 50


def shared_dataset(env_features, env_neighbors, env_degrees, mask, borrow=True):
    """Load the pocket arrays into Theano shared variables."""
    shared_env_features = theano.shared(numpy.asarray(env_features, dtype=theano.config.floatX), borrow=borrow)
    shared_env_neighbors = theano.shared(numpy.asarray(env_neighbors, dtype=theano.config.floatX), borrow=borrow)
    shared_env_degrees = theano.shared(numpy.asarray(env_degrees, dtype=theano.config.floatX), borrow=borrow)
    shared_mask = theano.shared(numpy.asarray(mask, dtype=theano.config.floatX), borrow=borrow)
    return shared_env_features, shared_env_neighbors, shared_env_degrees, shared_mask


def dump_weights_pickle(num_layers, Weights, Bias, file_name):
    """Flatten the per-layer weights and biases into one tuple and pickle it."""
    all_var_tuple = ()
    for i in range(num_layers):
        [W_self, W_degrees] = Weights[i]
        [b_prime, b_prime_self, b_layer] = Bias[i]
        all_var_tuple = all_var_tuple + (W_self,)
        for j in range(0, max_poc_degrees):
            all_var_tuple = all_var_tuple + (W_degrees[j],)
        all_var_tuple = all_var_tuple + (b_prime, b_prime_self, b_layer)
    with open(file_name, 'wb') as f:
        dump(all_var_tuple, f)


class sparse_dA(object):
    """Denoising autoencoder whose encoder is a graph convolution layer (Graph_Conv)."""

    def __init__(
        self,
        numpy_rng,
        corruption_level,
        theano_rng=None,
        n_visible=470,
        n_hidden=500,
        W_out=None,
        b_out=None,
        b_layer=None,
        W_self=None,
        W_degrees=None,
        num_in_features=None,
        num_input=None,
        env_features=None,
        env_neighbors=None,
        env_degrees=None,
        mask=None,
        layer_fp=None,
        fp_length=512,
        b_prime=None,
        b_prime_self=None
    ):
        rng = numpy_rng
        self.n_visible = n_visible
        self.n_hidden = n_hidden

        if layer_fp is None:
            self.layer_fp = theano.shared(numpy.zeros((num_input, fp_length), dtype=theano.config.floatX), borrow=True)
        else:
            self.layer_fp = layer_fp

        self.num_in_features = num_in_features
        self.env_features = env_features
        self.mask = mask
        self.env_degrees = env_degrees

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        # decoder biases for the neighbor and self reconstructions
        if not b_prime:
            b_prime = theano.shared(
                value=numpy.zeros(
                    n_visible,
                    dtype=theano.config.floatX
                ),
                borrow=True
            )
        self.b_prime = b_prime

        if not b_prime_self:
            b_prime_self = theano.shared(
                value=numpy.zeros(
                    n_visible,
                    dtype=theano.config.floatX
                ),
                borrow=True
            )
        self.b_prime_self = b_prime_self

        self.theano_rng = theano_rng

        # corrupt the input: each feature entry is kept with probability
        # (1 - corruption_level) and zeroed otherwise (denoising noise)
        corrupt_env_features = self.theano_rng.binomial(size=self.env_features.shape, n=1,
                                                        p=1 - corruption_level,
                                                        dtype=theano.config.floatX) * self.env_features

        # encoder: one graph convolution layer over the corrupted input
        self.Graph_CNN = Graph_Conv(
            rng=numpy_rng,
            W_out=None,
            b_out=None,
            b_layer=None,
            W_self=None,
            W_degrees=None,
            num_in_features=self.num_in_features,
            num_hidden_features=self.n_hidden,
            num_input=num_input,
            env_features=corrupt_env_features,
            env_neighbors=env_neighbors,
            env_degrees=env_degrees,
            mask=mask,
            layer_fp=self.layer_fp,
            fp_length=fp_length,
        )

        # tied weights, therefore W_prime is W transpose
        self.W_primes = []
        for degree in range(max_poc_degrees):
            self.W_primes.append(self.Graph_CNN.W_degrees[degree].T)
        self.W_prime_self = self.Graph_CNN.W_self.T

        self.layer_conv = self.Graph_CNN.layer_conv
        self.params = [self.b_prime, self.b_prime_self]
        self.params.extend(self.Graph_CNN.params)

    def get_cost_updates(self, learning_rate):
        """Compute the reconstruction cost and the parameter updates
        for one training step of the dA."""
        x_n = self.Graph_CNN.summed_neighbors
        x_s = self.Graph_CNN.env_features_reshaped
        y = self.Graph_CNN.layer_conv

        # decode through the tied weights, one slice per node degree
        for d in range(max_poc_degrees):
            if d == 0:
                activations = T.dot(y, self.W_primes[d]).dimshuffle(0, 1, 'x')
            else:
                tmp = T.dot(y, self.W_primes[d]).dimshuffle(0, 1, 'x')
                activations = T.concatenate([activations, tmp], axis=2)

        # select each node's reconstruction according to its degree
        env_degrees_newaxis = self.env_degrees.dimshuffle(0, 'x', 1)
        recon_n_activations = T.sum(activations * env_degrees_newaxis, axis=2)
        recon_s_act = T.dot(y, self.W_prime_self)

        z_n = relu(recon_n_activations + self.b_prime.dimshuffle('x', 0))
        z_s = relu(recon_s_act + self.b_prime_self.dimshuffle('x', 0))

        # squared reconstruction error of the neighbor-summed and self features,
        # masked so that padding nodes do not contribute
        L = T.sum((x_n - z_n) ** 2 + (x_s - z_s) ** 2, axis=1)
        L = L * self.mask
        cost = T.mean(L)  # + betta * penalty
        gparams = T.grad(cost, self.params)

        # generate the list of updates
        updates = [
            (param, param - learning_rate * gparam)
            for param, gparam in zip(self.params, gparams)
        ]

        return (cost, updates)


class SdA(object):
    """Stack of sparse_dA layers; each layer encodes the previous layer's output."""

    def __init__(
        self,
        numpy_rng,
        theano_rng=None,
        n_ins=480,
        hidden_layers_sizes=[200, 100],
        corruption_levels=[0.05, 0.05],
        W_out=None,
        b_out=None,
        b_layer=None,
        W_self=None,
        W_degrees=None,
        num_in_features=None,
        num_input=5,
        env_features=None,
        env_neighbors=None,
        env_degrees=None,
        mask=None,
        layer_fp=None,
        fp_length=512,
    ):
        self.Graph_layers = []
        self.dA_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        # allocate symbolic variables for the data
        self.y = T.ivector('y')  # the labels are presented as a 1D vector of [int] labels

        for i in range(self.n_layers):
            # the first layer reads the raw pocket features; deeper layers read
            # the convolved output of the previous dA layer
            if i == 0:
                input_size = n_ins
                layer_input = env_features
            else:
                input_size = hidden_layers_sizes[i - 1]
                layer_input = self.dA_layers[-1].layer_conv

            dA_layer = sparse_dA(numpy_rng=numpy_rng,
                                 theano_rng=theano_rng,
                                 corruption_level=corruption_levels[i],
                                 n_visible=input_size,
                                 n_hidden=hidden_layers_sizes[i],
                                 W_out=W_out,
                                 b_out=b_out,
                                 b_layer=b_layer,
                                 W_self=W_self,
                                 W_degrees=W_degrees,
                                 num_in_features=input_size,
                                 num_input=num_input,
                                 env_features=layer_input,
                                 env_neighbors=env_neighbors,
                                 env_degrees=env_degrees,
                                 mask=mask,
                                 layer_fp=layer_fp,
                                 fp_length=512,
                                 b_prime=None,
                                 b_prime_self=None)
            self.dA_layers.append(dA_layer)
            self.params.extend(dA_layer.params)


def train_poc_autoencoder_I(learning_rate=0.001, pretraining_epochs=500, batch_size=5, reg=5e-9):
    [W_poc_out_0, W_poc_self_0, W_poc_out_1,
     b_poc_out_0, b_poc_layer_0, b_poc_out_1,
     W_poc_degrees] = [None, None, None, None, None, None, None]
    Weights = [W_poc_out_0, W_poc_self_0, W_poc_out_1, W_poc_degrees]
    Bias = [b_poc_out_0, b_poc_layer_0, b_poc_out_1]

    all_poc, max_ff, min_ff, mean_ff = get_all_pocs()
    all_poc_drug, max_ff_drug, min_ff_drug, mean_ff_drug = get_all_pocs_drug()
    n_train_batches = int(len(all_poc) / batch_size)  # batch_size should be the number of pairs of (pocket, mol)

    numpy_rng = numpy.random.RandomState(89677)
    print('... building the model')
    batchsize = batch_size

    # symbolic inputs for one minibatch of pockets
    env_features = T.matrix('env_features', dtype=theano.config.floatX)
    env_neighbors = T.matrix('env_neighbors', dtype=theano.config.floatX)
    env_mask = T.fvector('env_mask')
    env_degrees = T.matrix('env_degrees', dtype=theano.config.floatX)
    index = T.lscalar('index')  # index to a [mini]batch
    lr = T.scalar(name='lr')

    print('... pre-training the model')
    # early-stopping parameters
    patience = 100000  # look at this many examples regardless
    patience_increase = 2.  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is considered significant
    best_test_score = numpy.inf
    test_score = 0.
    start_time = time.clock()

    done_looping = False
    epoch = 0

    ##############################################
    ################## SDA #######################
    ##############################################
    sda = SdA(
        numpy_rng=numpy_rng,
        n_ins=480,
        hidden_layers_sizes=[200, 100],
        corruption_levels=[0.05, 0.05],
        W_out=None,
        b_out=None,
        b_layer=None,
        W_self=None,
        W_degrees=None,
        num_in_features=None,
        num_input=batch_size,
        env_features=env_features,
        env_neighbors=env_neighbors,
        env_degrees=env_degrees,
        mask=env_mask,
        layer_fp=None,
        fp_length=512,
    )

    # greedy layer-wise pre-training: each dA layer is trained in turn
    for i in range(sda.n_layers):
        training_dA = sda.dA_layers[i]
        cost, updates = training_dA.get_cost_updates(learning_rate)
        pretrain_fn = theano.function([env_features, env_neighbors, env_degrees, env_mask],
                                      cost, updates=updates)

        for epoch in range(pretraining_epochs):
            # go through the training set
            c = []
            for batch_index in range(n_train_batches):
                env_features_train, env_neighbors_train, env_degrees_train, env_mask_train = \
                    get_pocket_attributes(all_poc[batch_index * batch_size:(batch_index + 1) * batch_size],
                                          max_ff, min_ff, mean_ff)
                train_EF, train_EN, train_ED, train_env_mask = shared_dataset(
                    env_features_train, env_neighbors_train, env_degrees_train, env_mask_train)
                minibatch_cost = pretrain_fn(train_EF.eval(), train_EN.eval(),
                                             train_ED.eval(), train_env_mask.eval())
                c.append(minibatch_cost)

            mean_cost = numpy.mean(c)
            print('Pre-training layer %i, epoch %d, cost ' % (i, epoch))
            print(mean_cost)
            list_file = open('../progress_poc_pretrain_SDA_tied_relu_scPDB.txt', 'a')
            list_file.write('layer %i, epoch %d, cost ' % (i, epoch))
            list_file.write(str(mean_cost) + '\n')
            list_file.close()

    #### dump weights ######
    num_layers = sda.n_layers
    Weights = []
    Bias = []
    for k in range(num_layers):
        b_prime = sda.dA_layers[k].b_prime
        b_prime_self = sda.dA_layers[k].b_prime_self
        W_self = sda.dA_layers[k].Graph_CNN.W_self
        b_layer = sda.dA_layers[k].Graph_CNN.b_layer
        W_degrees = sda.dA_layers[k].Graph_CNN.W_degrees
        Weights.append([W_self, W_degrees])
        Bias.append([b_prime, b_prime_self, b_layer])
    dump_weights_pickle(num_layers, Weights, Bias,
                        file_name='../weights/weights_graph_autoencoder_I.zip')


if __name__ == '__main__':
    train_poc_autoencoder_I()
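

# --- Illustrative sketch only (not part of the original script). It shows one way the
# flat tuple written by dump_weights_pickle() above could be read back with
# theano.misc.pkl_utils.load. The function name, the default file path, and the
# unpacking order are assumptions that simply mirror the layout dump_weights_pickle()
# uses: per layer, W_self, then max_poc_degrees degree weights, then the three biases.
def load_weights_pickle(num_layers, file_name='../weights/weights_graph_autoencoder_I.zip'):
    from theano.misc.pkl_utils import load

    with open(file_name, 'rb') as f:
        all_vars = load(f)

    Weights, Bias = [], []
    idx = 0
    for _ in range(num_layers):
        W_self = all_vars[idx]
        idx += 1
        W_degrees = list(all_vars[idx:idx + max_poc_degrees])
        idx += max_poc_degrees
        b_prime, b_prime_self, b_layer = all_vars[idx:idx + 3]
        idx += 3
        Weights.append([W_self, W_degrees])
        Bias.append([b_prime, b_prime_self, b_layer])
    return Weights, Bias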