import os
import sys
import time

import numpy
import numpy as np
import theano
import theano.tensor as T
from theano import shared
from theano.tensor.shared_randomstreams import RandomStreams
from theano.tensor.nnet.nnet import sigmoid
from theano.misc.pkl_utils import dump
from collections import OrderedDict

from process_poc_pretrain import *
from data_utils import *
from graph_cnn_layers import *

max_poc_degrees = 20
max_nodes_in_poc = 50


def shared_dataset(env_features, env_neighbors, env_degrees, mask, borrow=True):
    """Load the pocket arrays into Theano shared variables."""
    shared_env_features = theano.shared(numpy.asarray(env_features, dtype=theano.config.floatX), borrow=borrow)
    shared_env_neighbors = theano.shared(numpy.asarray(env_neighbors, dtype=theano.config.floatX), borrow=borrow)
    shared_env_degrees = theano.shared(numpy.asarray(env_degrees, dtype=theano.config.floatX), borrow=borrow)
    shared_mask = theano.shared(numpy.asarray(mask, dtype=theano.config.floatX), borrow=borrow)
    return shared_env_features, shared_env_neighbors, shared_env_degrees, shared_mask


def dump_weights_pickle(num_layers, Weights, Bias, file_name):
    """Flatten the per-layer weights and biases into one tuple and pickle it."""
    all_var_tuple = ()
    for i in range(num_layers):
        [W_self, W_degrees] = Weights[i]
        [b_prime, b_prime_self, b_layer] = Bias[i]
        all_var_tuple = all_var_tuple + (W_self,)
        for j in range(0, max_poc_degrees):
            all_var_tuple = all_var_tuple + (W_degrees[j],)
        all_var_tuple = all_var_tuple + (b_prime, b_prime_self, b_layer)
    with open(file_name, 'wb') as f:
        dump(all_var_tuple, f)


class sparse_dA(object):
    """Denoising autoencoder whose encoder is a graph convolution layer (Graph_Conv)."""

    def __init__(
        self,
        numpy_rng,
        corruption_level,
        theano_rng=None,
        n_visible=470,
        n_hidden=500,
        W_out=None,
        b_out=None,
        b_layer=None,
        W_self=None,
        W_degrees=None,
        num_in_features=None,
        num_input=None,
        env_features=None,
        env_neighbors=None,
        env_degrees=None,
        mask=None,
        layer_fp=None,
        fp_length=512,
        b_prime=None,
        b_prime_self=None
    ):
        rng = numpy_rng
        self.n_visible = n_visible
        self.n_hidden = n_hidden

        if layer_fp is None:
            self.layer_fp = theano.shared(numpy.zeros((num_input, fp_length), dtype=theano.config.floatX), borrow=True)
        else:
            self.layer_fp = layer_fp

        self.num_in_features = num_in_features
        self.env_features = env_features
        self.mask = mask
        self.env_degrees = env_degrees

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        # decoder biases for the neighbor and self reconstructions
        if not b_prime:
            b_prime = theano.shared(
                value=numpy.zeros(
                    n_visible,
                    dtype=theano.config.floatX
                ),
                borrow=True
            )
        self.b_prime = b_prime

        if not b_prime_self:
            b_prime_self = theano.shared(
                value=numpy.zeros(
                    n_visible,
                    dtype=theano.config.floatX
                ),
                borrow=True
            )
        self.b_prime_self = b_prime_self

        self.theano_rng = theano_rng

        # corrupt the input: each feature entry is kept with probability
        # (1 - corruption_level) and zeroed otherwise (denoising noise)
        corrupt_env_features = self.theano_rng.binomial(size=self.env_features.shape, n=1,
                                                        p=1 - corruption_level,
                                                        dtype=theano.config.floatX) * self.env_features

        # encoder: one graph convolution layer over the corrupted input
        self.Graph_CNN = Graph_Conv(
            rng=numpy_rng,
            W_out=None,
            b_out=None,
            b_layer=None,
            W_self=None,
            W_degrees=None,
            num_in_features=self.num_in_features,
            num_hidden_features=self.n_hidden,
            num_input=num_input,
            env_features=corrupt_env_features,
            env_neighbors=env_neighbors,
            env_degrees=env_degrees,
            mask=mask,
            layer_fp=self.layer_fp,
            fp_length=fp_length,
        )

        # tied weights, therefore W_prime is W transpose
        self.W_primes = []
        for degree in range(max_poc_degrees):
            self.W_primes.append(self.Graph_CNN.W_degrees[degree].T)
        self.W_prime_self = self.Graph_CNN.W_self.T

        self.layer_conv = self.Graph_CNN.layer_conv
        self.params = [self.b_prime, self.b_prime_self]
        self.params.extend(self.Graph_CNN.params)

    def get_cost_updates(self, learning_rate):
        """Compute the reconstruction cost and the parameter updates
        for one training step of the dA."""
        x_n = self.Graph_CNN.summed_neighbors
        x_s = self.Graph_CNN.env_features_reshaped
        y = self.Graph_CNN.layer_conv

        # decode through the tied weights, one slice per node degree
        for d in range(max_poc_degrees):
            if d == 0:
                activations = T.dot(y, self.W_primes[d]).dimshuffle(0, 1, 'x')
            else:
                tmp = T.dot(y, self.W_primes[d]).dimshuffle(0, 1, 'x')
                activations = T.concatenate([activations, tmp], axis=2)

        # select each node's reconstruction according to its degree
        env_degrees_newaxis = self.env_degrees.dimshuffle(0, 'x', 1)
        recon_n_activations = T.sum(activations * env_degrees_newaxis, axis=2)
        recon_s_act = T.dot(y, self.W_prime_self)

        z_n = relu(recon_n_activations + self.b_prime.dimshuffle('x', 0))
        z_s = relu(recon_s_act + self.b_prime_self.dimshuffle('x', 0))

        # squared reconstruction error of the neighbor-summed and self features,
        # masked so that padding nodes do not contribute
        L = T.sum((x_n - z_n) ** 2 + (x_s - z_s) ** 2, axis=1)
        L = L * self.mask
        cost = T.mean(L)  # + betta * penalty
        gparams = T.grad(cost, self.params)

        # generate the list of updates
        updates = [
            (param, param - learning_rate * gparam)
            for param, gparam in zip(self.params, gparams)
        ]

        return (cost, updates)


class SdA(object):
    """Stack of sparse_dA layers; each layer encodes the previous layer's output."""

    def __init__(
        self,
        numpy_rng,
        theano_rng=None,
        n_ins=480,
        hidden_layers_sizes=[200, 100],
        corruption_levels=[0.05, 0.05],
        W_out=None,
        b_out=None,
        b_layer=None,
        W_self=None,
        W_degrees=None,
        num_in_features=None,
        num_input=5,
        env_features=None,
        env_neighbors=None,
        env_degrees=None,
        mask=None,
        layer_fp=None,
        fp_length=512,
    ):
        self.Graph_layers = []
        self.dA_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        # allocate symbolic variables for the data
        self.y = T.ivector('y')  # the labels are presented as a 1D vector of [int] labels

        for i in range(self.n_layers):
            # the first layer reads the raw pocket features; deeper layers read
            # the convolved output of the previous dA layer
            if i == 0:
                input_size = n_ins
                layer_input = env_features
            else:
                input_size = hidden_layers_sizes[i - 1]
                layer_input = self.dA_layers[-1].layer_conv

            dA_layer = sparse_dA(numpy_rng=numpy_rng,
                                 theano_rng=theano_rng,
                                 corruption_level=corruption_levels[i],
                                 n_visible=input_size,
                                 n_hidden=hidden_layers_sizes[i],
                                 W_out=W_out,
                                 b_out=b_out,
                                 b_layer=b_layer,
                                 W_self=W_self,
                                 W_degrees=W_degrees,
                                 num_in_features=input_size,
                                 num_input=num_input,
                                 env_features=layer_input,
                                 env_neighbors=env_neighbors,
                                 env_degrees=env_degrees,
                                 mask=mask,
                                 layer_fp=layer_fp,
                                 fp_length=512,
                                 b_prime=None,
                                 b_prime_self=None)
            self.dA_layers.append(dA_layer)
            self.params.extend(dA_layer.params)


def train_poc_autoencoder_I(learning_rate=0.001, pretraining_epochs=500, batch_size=5, reg=5e-9):
    [W_poc_out_0, W_poc_self_0, W_poc_out_1,
     b_poc_out_0, b_poc_layer_0, b_poc_out_1,
     W_poc_degrees] = [None, None, None, None, None, None, None]
    Weights = [W_poc_out_0, W_poc_self_0, W_poc_out_1, W_poc_degrees]
    Bias = [b_poc_out_0, b_poc_layer_0, b_poc_out_1]

    all_poc, max_ff, min_ff, mean_ff = get_all_pocs()
    all_poc_drug, max_ff_drug, min_ff_drug, mean_ff_drug = get_all_pocs_drug()
    n_train_batches = int(len(all_poc) / batch_size)  # batch_size should be the number of pairs of (pocket, mol)

    numpy_rng = numpy.random.RandomState(89677)
    print('... building the model')
    batchsize = batch_size

    # symbolic inputs for one minibatch of pockets
    env_features = T.matrix('env_features', dtype=theano.config.floatX)
    env_neighbors = T.matrix('env_neighbors', dtype=theano.config.floatX)
    env_mask = T.fvector('env_mask')
    env_degrees = T.matrix('env_degrees', dtype=theano.config.floatX)
    index = T.lscalar('index')  # index to a [mini]batch
    lr = T.scalar(name='lr')

    print('... pre-training the model')
    # early-stopping parameters
    patience = 100000  # look at this many examples regardless
    patience_increase = 2.  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is considered significant
    best_test_score = numpy.inf
    test_score = 0.
    start_time = time.clock()

    done_looping = False
    epoch = 0

    ##############################################
    ################## SDA #######################
    ##############################################
    sda = SdA(
        numpy_rng=numpy_rng,
        n_ins=480,
        hidden_layers_sizes=[200, 100],
        corruption_levels=[0.05, 0.05],
        W_out=None,
        b_out=None,
        b_layer=None,
        W_self=None,
        W_degrees=None,
        num_in_features=None,
        num_input=batch_size,
        env_features=env_features,
        env_neighbors=env_neighbors,
        env_degrees=env_degrees,
        mask=env_mask,
        layer_fp=None,
        fp_length=512,
    )

    # greedy layer-wise pre-training: each dA layer is trained in turn
    for i in range(sda.n_layers):
        training_dA = sda.dA_layers[i]
        cost, updates = training_dA.get_cost_updates(learning_rate)
        pretrain_fn = theano.function([env_features, env_neighbors, env_degrees, env_mask],
                                      cost, updates=updates)

        for epoch in range(pretraining_epochs):
            # go through the training set
            c = []
            for batch_index in range(n_train_batches):
                env_features_train, env_neighbors_train, env_degrees_train, env_mask_train = \
                    get_pocket_attributes(all_poc[batch_index * batch_size:(batch_index + 1) * batch_size],
                                          max_ff, min_ff, mean_ff)
                train_EF, train_EN, train_ED, train_env_mask = shared_dataset(
                    env_features_train, env_neighbors_train, env_degrees_train, env_mask_train)
                minibatch_cost = pretrain_fn(train_EF.eval(), train_EN.eval(),
                                             train_ED.eval(), train_env_mask.eval())
                c.append(minibatch_cost)

            mean_cost = numpy.mean(c)
            print('Pre-training layer %i, epoch %d, cost ' % (i, epoch))
            print(mean_cost)
            list_file = open('../progress_poc_pretrain_SDA_tied_relu_scPDB.txt', 'a')
            list_file.write('layer %i, epoch %d, cost ' % (i, epoch))
            list_file.write(str(mean_cost) + '\n')
            list_file.close()

    #### dump weights ######
    num_layers = sda.n_layers
    Weights = []
    Bias = []
    for k in range(num_layers):
        b_prime = sda.dA_layers[k].b_prime
        b_prime_self = sda.dA_layers[k].b_prime_self
        W_self = sda.dA_layers[k].Graph_CNN.W_self
        b_layer = sda.dA_layers[k].Graph_CNN.b_layer
        W_degrees = sda.dA_layers[k].Graph_CNN.W_degrees
        Weights.append([W_self, W_degrees])
        Bias.append([b_prime, b_prime_self, b_layer])
    dump_weights_pickle(num_layers, Weights, Bias,
                        file_name='../weights/weights_graph_autoencoder_I.zip')


if __name__ == '__main__':
    train_poc_autoencoder_I()
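

# --- Illustrative sketch only (not part of the original script). It shows one way the
# flat tuple written by dump_weights_pickle() above could be read back with
# theano.misc.pkl_utils.load. The function name, the default file path, and the
# unpacking order are assumptions that simply mirror the layout dump_weights_pickle()
# uses: per layer, W_self, then max_poc_degrees degree weights, then the three biases.
def load_weights_pickle(num_layers, file_name='../weights/weights_graph_autoencoder_I.zip'):
    from theano.misc.pkl_utils import load

    with open(file_name, 'rb') as f:
        all_vars = load(f)

    Weights, Bias = [], []
    idx = 0
    for _ in range(num_layers):
        W_self = all_vars[idx]
        idx += 1
        W_degrees = list(all_vars[idx:idx + max_poc_degrees])
        idx += max_poc_degrees
        b_prime, b_prime_self, b_layer = all_vars[idx:idx + 3]
        idx += 3
        Weights.append([W_self, W_degrees])
        Bias.append([b_prime, b_prime_self, b_layer])
    return Weights, Bias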