"""Graph convolutional networks over protein pockets and small molecules:
training on DUD-E folds, ROC evaluation on MUV, and screening of ligands
against a user-supplied pocket structure."""

import os
import sys
import math

import numpy
import numpy as np
import theano
import theano.tensor as T
from theano import shared
from theano.tensor.shared_randomstreams import RandomStreams
from theano.misc.pkl_utils import dump
from scipy.io import matlab
from collections import OrderedDict
from sklearn import metrics
from sklearn.metrics import roc_auc_score

from io_utils_DUDE import *
from io_utils_MUV import *
from mol_graph import *
from process_poc_pretrain import *
from data_utils import *
from graph_cnn_layers import *

max_poc_degrees = 20
max_nodes_in_poc = 50
max_mol_degrees = 6
max_nodes_in_mol = 60


def numpy_floatX(data):
    return numpy.asarray(data, dtype=theano.config.floatX)


class Graph_Conv_mol(object):

    def __init__(self, rng, b_layer, W_self, W_degrees, num_atom_features,
                 num_bond_features, num_hidden_features, num_input,
                 mol_atom_features, mol_bond_features, mol_atom_neighbors,
                 mol_bond_neighbors, mol_degrees, mask,
                 max_nodes_in_mol=20, max_mol_degrees=6):
        """Sets up functions to compute convnets over all molecules in a minibatch together."""
        self.num_hidden_features = num_hidden_features
        self.num_atom_features = num_atom_features
        self.num_bond_features = num_bond_features
        self.mol_atom_features = mol_atom_features
        self.mol_bond_features = mol_bond_features
        self.mol_atom_neighbors = mol_atom_neighbors
        self.mol_bond_neighbors = mol_bond_neighbors

        if b_layer is None:
            bl_values = numpy.zeros((num_hidden_features,), dtype=theano.config.floatX)
            self.b_layer = theano.shared(value=bl_values, borrow=True)
        else:
            self.b_layer = b_layer

        # Glorot-style bound, reused for the self weights and the per-degree weights.
        W_self_bound = numpy.sqrt(6. / (num_atom_features + num_hidden_features))
        if W_self is None:
            self.W_self = theano.shared(
                numpy.asarray(rng.uniform(low=-W_self_bound, high=W_self_bound,
                                          size=(num_atom_features, num_hidden_features)),
                              dtype=theano.config.floatX),
                borrow=True)
        else:
            self.W_self = W_self

        # One weight matrix per node degree, applied to the concatenated
        # (atom, bond) neighbour summaries.
        self.W_degrees = []
        if W_degrees is None:
            for degree in range(max_mol_degrees):
                W_d = theano.shared(
                    numpy.asarray(rng.uniform(low=-W_self_bound, high=W_self_bound,
                                              size=(num_atom_features + num_bond_features,
                                                    num_hidden_features)),
                                  dtype=theano.config.floatX),
                    borrow=True)
                self.W_degrees.append(W_d)
        else:
            for degree in range(max_mol_degrees):
                self.W_degrees.append(W_degrees[degree])

        self.self_activations = T.dot(mol_atom_features, self.W_self)

        # Mean of each atom's neighbouring atom features and bond features.
        atom_deg = T.sum(mol_atom_neighbors, axis=1).reshape((num_input * max_nodes_in_mol, 1))
        bond_deg = T.sum(mol_bond_neighbors, axis=1).reshape((num_input * max_nodes_in_mol, 1))
        summed_atom_neighbors = T.dot(mol_atom_neighbors, mol_atom_features) / (atom_deg + 1e-8)
        summed_bond_neighbors = T.dot(mol_bond_neighbors, mol_bond_features) / (bond_deg + 1e-8)
        summed_neighbors = T.concatenate([summed_atom_neighbors, summed_bond_neighbors], axis=1)

        for d in range(max_mol_degrees):
            if d == 0:
                activations = T.dot(summed_neighbors, self.W_degrees[d]).dimshuffle(0, 1, 'x')
            else:
                tmp = T.dot(summed_neighbors, self.W_degrees[d]).dimshuffle(0, 1, 'x')
                activations = T.concatenate([activations, tmp], axis=2)

        # Gate the per-degree activations by mol_degrees and sum over the degree axis.
        mol_degrees_newaxis = mol_degrees.dimshuffle(0, 'x', 1)
        neighbour_activations = T.sum(activations * mol_degrees_newaxis, axis=2)

        total_activations = (self.self_activations + neighbour_activations
                             + self.b_layer.dimshuffle('x', 0))
        self.layer_conv = relu(total_activations)

class MOL_FP_Output(object):

    def __init__(self, rng, W_out, b_out, num_input, num_atom_features,
                 mol_atom_features, mask, fp_length=216, max_nodes_in_mol=20):
        if W_out is None:
            Wo_bound = numpy.sqrt(6. / (num_atom_features + fp_length))
            self.W_out = theano.shared(
                numpy.asarray(rng.uniform(low=-Wo_bound, high=Wo_bound,
                                          size=(num_atom_features, fp_length)),
                              dtype=theano.config.floatX),
                borrow=True)
        else:
            self.W_out = W_out
        if b_out is None:
            bo_values = numpy.zeros((fp_length,), dtype=theano.config.floatX)
            self.b_out = theano.shared(value=bo_values, borrow=True)
        else:
            self.b_out = b_out

        mol_features_reshaped = T.reshape(
            mol_atom_features,
            (int(num_input * max_nodes_in_mol), int(num_atom_features)), ndim=2)
        mol_outputs = T.nnet.softmax(
            self.b_out.dimshuffle('x', 0) + T.dot(mol_features_reshaped, self.W_out))
        # Per-atom softmax contributions are pooled into one fingerprint per molecule.
        self.layer_fp = sum_and_stack_atoms(mol_outputs, mask, num_input,
                                            max_nodes_in_mol, fp_length)


class POC_FP_Output(object):

    def __init__(self, rng, W_out, b_out, num_input, num_in_features,
                 env_features, mask, fp_length=512, max_nodes_in_poc=50):
        if W_out is None:
            Wo_bound = numpy.sqrt(6. / (num_in_features + fp_length))
            self.W_out = theano.shared(
                numpy.asarray(rng.uniform(low=-Wo_bound, high=Wo_bound,
                                          size=(num_in_features, fp_length)),
                              dtype=theano.config.floatX),
                borrow=True)
        else:
            self.W_out = W_out
        if b_out is None:
            bo_values = numpy.zeros((fp_length,), dtype=theano.config.floatX)
            self.b_out = theano.shared(value=bo_values, borrow=True)
        else:
            self.b_out = b_out

        self.env_features_reshaped = T.reshape(
            env_features,
            (int(num_input * max_nodes_in_poc), int(num_in_features)), ndim=2)
        self.env_outputs = T.nnet.softmax(
            self.b_out.dimshuffle('x', 0) + T.dot(self.env_features_reshaped, self.W_out))
        self.layer_fp = sum_and_stack(self.env_outputs, mask,
                                      num_input * max_nodes_in_poc, fp_length,
                                      max_nodes_in_poc)


class Graph_Conv(object):

    def __init__(self, rng, b_layer, W_self, W_degrees, num_in_features,
                 num_hidden_features, num_input, env_features, env_neighbors,
                 env_degrees, mask, max_nodes_in_poc=50, max_poc_degrees=20):
        """Sets up functions to compute convnets over all graphs (pockets, or
        molecules after the first layer) in a minibatch together."""
        self.num_hidden_features = num_hidden_features
        self.num_in_features = num_in_features
        self.env_features = env_features

        if b_layer is None:
            bl_values = numpy.zeros((num_hidden_features,), dtype=theano.config.floatX)
            self.b_layer = theano.shared(value=bl_values, borrow=True)
        else:
            self.b_layer = b_layer

        # Glorot-style bound, reused for the self weights and the per-degree weights.
        W_self_bound = numpy.sqrt(6. / (num_in_features + num_hidden_features))
        if W_self is None:
            self.W_self = theano.shared(
                numpy.asarray(rng.uniform(low=-W_self_bound, high=W_self_bound,
                                          size=(num_in_features, num_hidden_features)),
                              dtype=theano.config.floatX),
                borrow=True)
        else:
            self.W_self = W_self

        self.W_degrees = []
        if W_degrees is None:
            for degree in range(max_poc_degrees):
                W_d = theano.shared(
                    numpy.asarray(rng.uniform(low=-W_self_bound, high=W_self_bound,
                                              size=(num_in_features, num_hidden_features)),
                                  dtype=theano.config.floatX),
                    borrow=True)
                self.W_degrees.append(W_d)
        else:
            for degree in range(max_poc_degrees):
                self.W_degrees.append(W_degrees[degree])

        self.self_activations = T.dot(env_features, self.W_self)

        degree_idx_order = []
        deg = T.sum(env_neighbors, axis=1).reshape((num_input * max_nodes_in_poc, 1))
        self.summed_neighbors = T.dot(env_neighbors, env_features) / (deg + 1e-8)

        for d in range(max_poc_degrees):
            if d == 0:
                activations = T.dot(self.summed_neighbors, self.W_degrees[d]).dimshuffle(0, 1, 'x')
            else:
                tmp = T.dot(self.summed_neighbors, self.W_degrees[d]).dimshuffle(0, 1, 'x')
                activations = T.concatenate([activations, tmp], axis=2)

        # Gate the per-degree activations by env_degrees and sum over the degree axis.
        env_degrees_newaxis = env_degrees.dimshuffle(0, 'x', 1)
        neighbour_activations = T.sum(activations * env_degrees_newaxis, axis=2)

        total_activations = (self.self_activations + neighbour_activations
                             + self.b_layer.dimshuffle('x', 0))
        self.layer_conv = relu(total_activations)
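
# Illustrative sketch (not called anywhere in this file): a plain-NumPy version
# of the degree-gated neighbour aggregation performed by Graph_Conv and
# Graph_Conv_mol above, assuming the degree matrix is a one-hot encoding of
# each node's degree. The function and argument names here are hypothetical.
def _reference_degree_gated_aggregation(summed_neighbors, degree_one_hot, W_degrees):
    """summed_neighbors: (n_nodes, n_in); degree_one_hot: (n_nodes, max_degree);
    W_degrees: list of max_degree arrays, each of shape (n_in, n_hidden)."""
    # One activation map per candidate degree: (n_nodes, n_hidden, max_degree).
    per_degree = numpy.stack([summed_neighbors.dot(W_d) for W_d in W_degrees], axis=2)
    # Each node keeps the slice computed with the weight matrix matching its own
    # degree, mirroring T.sum(activations * degrees.dimshuffle(0, 'x', 1), axis=2).
    return numpy.sum(per_degree * degree_one_hot[:, None, :], axis=2)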

def _dropout_from_layer(rng, layer, p):
    srng = theano.tensor.shared_randomstreams.RandomStreams(rng.randint(999999))
    mask = srng.binomial(n=1, p=1 - p, size=layer.shape)
    output = layer * T.cast(mask, theano.config.floatX)
    return output


class DropoutHiddenLayer(HiddenLayer):

    def __init__(self, rng, input, n_in, n_out, activation, dropout_rate,
                 use_bias=True, W=None, b=None):
        super(DropoutHiddenLayer, self).__init__(
            rng=rng, input=input, n_in=n_in, n_out=n_out, W=W, b=b,
            activation=activation)
        self.output = _dropout_from_layer(rng, self.output, p=dropout_rate)


class DropoutGraphConvo(Graph_Conv):

    def __init__(self, rng, b_layer, W_self, W_degrees, num_in_features,
                 num_hidden_features, num_input, env_features, env_neighbors,
                 env_degrees, mask, dropout_rate=0.3, max_nodes_in_poc=50,
                 max_poc_degrees=20):
        super(DropoutGraphConvo, self).__init__(
            rng=rng, W_self=W_self, b_layer=b_layer, W_degrees=W_degrees,
            num_in_features=num_in_features, num_hidden_features=num_hidden_features,
            num_input=num_input, env_features=env_features,
            env_neighbors=env_neighbors, env_degrees=env_degrees, mask=mask,
            max_nodes_in_poc=max_nodes_in_poc, max_poc_degrees=max_poc_degrees)
        self.layer_conv = _dropout_from_layer(rng, self.layer_conv, p=dropout_rate)


class DropoutGraphConvMol(Graph_Conv_mol):

    def __init__(self, rng, W_self, b_layer, W_degrees, num_atom_features,
                 num_bond_features, num_hidden_features, num_input,
                 mol_atom_features, mol_bond_features, mol_atom_neighbors,
                 mol_bond_neighbors, mol_degrees, mask, dropout_rate,
                 max_nodes_in_mol=80, max_mol_degrees=6):
        super(DropoutGraphConvMol, self).__init__(
            rng=rng, W_self=W_self, b_layer=b_layer, W_degrees=W_degrees,
            num_atom_features=num_atom_features, num_bond_features=num_bond_features,
            num_hidden_features=num_hidden_features, num_input=num_input,
            mol_atom_features=mol_atom_features, mol_bond_features=mol_bond_features,
            mol_atom_neighbors=mol_atom_neighbors, mol_bond_neighbors=mol_bond_neighbors,
            mol_degrees=mol_degrees, mask=mask,
            max_nodes_in_mol=max_nodes_in_mol, max_mol_degrees=max_mol_degrees)
        self.layer_conv = _dropout_from_layer(rng, self.layer_conv, p=dropout_rate)

class DropoutPocOutput(POC_FP_Output):

    def __init__(self, rng, W_out, b_out, num_input, num_in_features,
                 env_features, mask, dropout_rate=0.3, fp_length=512,
                 max_nodes_in_poc=50):
        super(DropoutPocOutput, self).__init__(
            rng=rng, W_out=W_out, b_out=b_out, num_input=num_input,
            num_in_features=num_in_features, env_features=env_features,
            mask=mask, fp_length=fp_length, max_nodes_in_poc=max_nodes_in_poc)
        self.layer_fp = _dropout_from_layer(rng, self.layer_fp, p=dropout_rate)


class DropoutMolOutput(MOL_FP_Output):

    def __init__(self, rng, W_out, b_out, num_input, num_atom_features,
                 mol_atom_features, mask, dropout_rate=0.3, fp_length=216,
                 max_nodes_in_mol=20):
        super(DropoutMolOutput, self).__init__(
            rng=rng, W_out=W_out, b_out=b_out, num_input=num_input,
            num_atom_features=num_atom_features, mol_atom_features=mol_atom_features,
            mask=mask, fp_length=fp_length, max_nodes_in_mol=max_nodes_in_mol)
        self.layer_fp = _dropout_from_layer(rng, self.layer_fp, p=dropout_rate)


class Conv3D_Graph_Conv(object):

    def __init__(self, numpy_rng, Weights, Bias, env_features, env_neighbors,
                 env_mask, env_degrees, mol_atom_features, mol_bond_features,
                 mol_atom_neighbors, mol_bond_neighbors, mol_mask, mol_degrees,
                 labels, num_input, num_poc_in_features=480,
                 num_mol_atom_features=62, num_mol_bond_features=6,
                 poc_fp_length=512, mol_fp_length=216, dropout_rate=0.3,
                 theano_rng=None):
        # For each layer, a Dropout* instance is built for training and a
        # weight-tied copy with its parameters scaled by (1 - dropout_rate)
        # is built for inference.
        self.tparams = OrderedDict()
        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
        if labels is None:
            self.y = T.ivector('y')
        else:
            self.y = labels

        [W_poc_self_1, W_poc_self_2, W_poc_out, W_poc_degrees_1, W_poc_degrees_2,
         W_mol_self_1, W_mol_self_2, W_mol_out, W_mol_degrees_1, W_mol_degrees_2,
         W_hid, W_log] = Weights
        [b_poc_layer_1, b_poc_layer_2, b_poc_out, b_mol_layer_1, b_mol_layer_2,
         b_mol_out, b_hid, b_log] = Bias

        ######################### Poc Graph #################################
        self.Drop_Graph_poc_layer_1 = DropoutGraphConvo(
            rng=numpy_rng, b_layer=b_poc_layer_1, W_self=W_poc_self_1,
            W_degrees=W_poc_degrees_1, num_in_features=num_poc_in_features,
            num_hidden_features=200, num_input=num_input,
            env_features=env_features, env_neighbors=env_neighbors,
            env_degrees=env_degrees, mask=env_mask,
            max_nodes_in_poc=max_nodes_in_poc, max_poc_degrees=max_poc_degrees,
            dropout_rate=dropout_rate)
        corrected_poc_degree_1 = []
        for d in range(0, max_poc_degrees):
            corrected_poc_degree_1.append(
                self.Drop_Graph_poc_layer_1.W_degrees[d] * (1 - dropout_rate))
        self.Graph_poc_layer_1 = Graph_Conv(
            rng=numpy_rng,
            b_layer=self.Drop_Graph_poc_layer_1.b_layer * (1 - dropout_rate),
            W_self=self.Drop_Graph_poc_layer_1.W_self * (1 - dropout_rate),
            W_degrees=corrected_poc_degree_1,
            num_in_features=num_poc_in_features, num_hidden_features=200,
            num_input=num_input, env_features=env_features,
            env_neighbors=env_neighbors, env_degrees=env_degrees, mask=env_mask,
            max_nodes_in_poc=max_nodes_in_poc, max_poc_degrees=max_poc_degrees)

        #########################
        self.Drop_Graph_poc_layer_2 = DropoutGraphConvo(
            rng=numpy_rng, b_layer=b_poc_layer_2, W_self=W_poc_self_2,
            W_degrees=W_poc_degrees_2, num_in_features=200,
            num_hidden_features=100, num_input=num_input,
            env_features=self.Drop_Graph_poc_layer_1.layer_conv,
            env_neighbors=env_neighbors, env_degrees=env_degrees, mask=env_mask,
            max_nodes_in_poc=max_nodes_in_poc, max_poc_degrees=max_poc_degrees,
            dropout_rate=dropout_rate)
        # Only the second pocket layer is registered in tparams (layer 1 stays fixed).
        self.tparams['W_poc_self_2'] = self.Drop_Graph_poc_layer_2.W_self
        self.tparams['b_poc_layer_2'] = self.Drop_Graph_poc_layer_2.b_layer
        for d in range(0, max_poc_degrees):
            self.tparams['W_poc_l_2_d_' + str(d)] = self.Drop_Graph_poc_layer_2.W_degrees[d]
        corrected_poc_degree_2 = []
        for d in range(0, max_poc_degrees):
            corrected_poc_degree_2.append(
                self.Drop_Graph_poc_layer_2.W_degrees[d] * (1 - dropout_rate))
        self.Graph_poc_layer_2 = Graph_Conv(
            rng=numpy_rng,
            b_layer=self.Drop_Graph_poc_layer_2.b_layer * (1 - dropout_rate),
            W_self=self.Drop_Graph_poc_layer_2.W_self * (1 - dropout_rate),
            W_degrees=corrected_poc_degree_2,
            num_in_features=200, num_hidden_features=100, num_input=num_input,
            env_features=self.Graph_poc_layer_1.layer_conv,
            env_neighbors=env_neighbors, env_degrees=env_degrees, mask=env_mask,
            max_nodes_in_poc=max_nodes_in_poc, max_poc_degrees=max_poc_degrees)

        #########################
        self.Drop_POC_FP_layer = DropoutPocOutput(
            rng=numpy_rng, W_out=W_poc_out, b_out=b_poc_out,
            num_in_features=100, num_input=num_input,
            env_features=self.Drop_Graph_poc_layer_2.layer_conv, mask=env_mask,
            fp_length=poc_fp_length, max_nodes_in_poc=max_nodes_in_poc,
            dropout_rate=dropout_rate)
        self.tparams['W_poc_out'] = self.Drop_POC_FP_layer.W_out
        self.tparams['b_poc_out'] = self.Drop_POC_FP_layer.b_out
        self.POC_FP_layer = POC_FP_Output(
            rng=numpy_rng,
            W_out=self.Drop_POC_FP_layer.W_out * (1 - dropout_rate),
            b_out=self.Drop_POC_FP_layer.b_out * (1 - dropout_rate),
            num_in_features=100, num_input=num_input,
            env_features=self.Graph_poc_layer_2.layer_conv, mask=env_mask,
            fp_length=poc_fp_length, max_nodes_in_poc=max_nodes_in_poc)

        ######################### Mol Graph #################################
        self.Drop_Graph_mol_layer_1 = DropoutGraphConvMol(
            rng=numpy_rng, b_layer=b_mol_layer_1, W_self=W_mol_self_1,
            W_degrees=W_mol_degrees_1, num_atom_features=num_mol_atom_features,
            num_bond_features=num_mol_bond_features, num_hidden_features=200,
            num_input=num_input, mol_atom_features=mol_atom_features,
            mol_bond_features=mol_bond_features,
            mol_atom_neighbors=mol_atom_neighbors,
            mol_bond_neighbors=mol_bond_neighbors, mol_degrees=mol_degrees,
            mask=mol_mask, max_nodes_in_mol=max_nodes_in_mol,
            max_mol_degrees=max_mol_degrees, dropout_rate=dropout_rate)
        corrected_mol_degree_1 = []
        for d in range(0, max_mol_degrees):
            corrected_mol_degree_1.append(
                self.Drop_Graph_mol_layer_1.W_degrees[d] * (1 - dropout_rate))
        self.Graph_mol_layer_1 = Graph_Conv_mol(
            rng=numpy_rng,
            b_layer=self.Drop_Graph_mol_layer_1.b_layer * (1 - dropout_rate),
            W_self=self.Drop_Graph_mol_layer_1.W_self * (1 - dropout_rate),
            W_degrees=corrected_mol_degree_1,
            num_atom_features=num_mol_atom_features,
            num_bond_features=num_mol_bond_features, num_hidden_features=200,
            num_input=num_input, mol_atom_features=mol_atom_features,
            mol_bond_features=mol_bond_features,
            mol_atom_neighbors=mol_atom_neighbors,
            mol_bond_neighbors=mol_bond_neighbors, mol_degrees=mol_degrees,
            mask=mol_mask, max_nodes_in_mol=max_nodes_in_mol,
            max_mol_degrees=max_mol_degrees)

        #########################
        # The second molecule layer reuses Graph_Conv: after the first layer only
        # atom-level hidden features remain, so the single-stream convolution applies.
        self.Drop_Graph_mol_layer_2 = DropoutGraphConvo(
            rng=numpy_rng, b_layer=b_mol_layer_2, W_self=W_mol_self_2,
            W_degrees=W_mol_degrees_2, num_in_features=200,
            num_hidden_features=100, num_input=num_input,
            env_features=self.Drop_Graph_mol_layer_1.layer_conv,
            env_neighbors=mol_atom_neighbors, env_degrees=mol_degrees,
            mask=mol_mask, max_nodes_in_poc=max_nodes_in_mol,
            max_poc_degrees=max_mol_degrees, dropout_rate=dropout_rate)
        self.tparams['W_mol_self_2'] = self.Drop_Graph_mol_layer_2.W_self
        self.tparams['b_mol_layer_2'] = self.Drop_Graph_mol_layer_2.b_layer
        for d in range(0, max_mol_degrees):
            self.tparams['W_mol_l_2_d_' + str(d)] = self.Drop_Graph_mol_layer_2.W_degrees[d]
        corrected_mol_degree_2 = []
        for d in range(0, max_mol_degrees):
            corrected_mol_degree_2.append(
                self.Drop_Graph_mol_layer_2.W_degrees[d] * (1 - dropout_rate))
        self.Graph_mol_layer_2 = Graph_Conv(
            rng=numpy_rng,
            b_layer=self.Drop_Graph_mol_layer_2.b_layer * (1 - dropout_rate),
            W_self=self.Drop_Graph_mol_layer_2.W_self * (1 - dropout_rate),
            W_degrees=corrected_mol_degree_2,
            num_in_features=200, num_hidden_features=100, num_input=num_input,
            env_features=self.Graph_mol_layer_1.layer_conv,
            env_neighbors=mol_atom_neighbors, env_degrees=mol_degrees,
            mask=mol_mask, max_nodes_in_poc=max_nodes_in_mol,
            max_poc_degrees=max_mol_degrees)

        #########################
        self.Drop_MOL_FP_layer = DropoutMolOutput(
            rng=numpy_rng, W_out=W_mol_out, b_out=b_mol_out,
            num_input=num_input, num_atom_features=100,
            mol_atom_features=self.Graph_mol_layer_2.layer_conv, mask=mol_mask,
            max_nodes_in_mol=max_nodes_in_mol, fp_length=mol_fp_length,
            dropout_rate=dropout_rate)
        self.tparams['W_mol_out'] = self.Drop_MOL_FP_layer.W_out
        self.tparams['b_mol_out'] = self.Drop_MOL_FP_layer.b_out
        self.MOL_FP_layer = MOL_FP_Output(
            rng=numpy_rng,
            W_out=self.Drop_MOL_FP_layer.W_out * (1 - dropout_rate),
            b_out=self.Drop_MOL_FP_layer.b_out * (1 - dropout_rate),
            num_input=num_input, num_atom_features=100,
            mol_atom_features=self.Graph_mol_layer_2.layer_conv, mask=mol_mask,
            max_nodes_in_mol=max_nodes_in_mol, fp_length=mol_fp_length)

        ##########################################################
        # Pocket and molecule fingerprints are concatenated and fed to a fully
        # connected layer followed by a two-class logistic regression.
        self.Drop_FC_layer = DropoutHiddenLayer(
            rng=numpy_rng,
            input=T.concatenate([self.Drop_POC_FP_layer.layer_fp,
                                 self.Drop_MOL_FP_layer.layer_fp], axis=1),
            n_in=poc_fp_length + mol_fp_length, n_out=100, activation=relu,
            W=W_hid, b=b_hid, dropout_rate=dropout_rate)
        self.tparams['W_hid'] = self.Drop_FC_layer.W
        self.tparams['b_hid'] = self.Drop_FC_layer.b
        self.FC_layer = HiddenLayer(
            rng=numpy_rng,
            input=T.concatenate([self.POC_FP_layer.layer_fp,
                                 self.MOL_FP_layer.layer_fp], axis=1),
            n_in=poc_fp_length + mol_fp_length, n_out=100, activation=relu,
            W=self.Drop_FC_layer.W * (1 - dropout_rate),
            b=self.Drop_FC_layer.b * (1 - dropout_rate))

        self.Drop_logLayer = LogisticRegression(
            input=self.Drop_FC_layer.output, n_in=100, n_out=2, W=W_log, b=b_log)
        self.tparams['W_log'] = self.Drop_logLayer.W
        self.tparams['b_log'] = self.Drop_logLayer.b
        self.logLayer = LogisticRegression(
            input=self.FC_layer.output, n_in=100, n_out=2,
            W=self.Drop_logLayer.W, b=self.Drop_logLayer.b)

        self.finetune_cost = self.Drop_logLayer.negative_log_likelihood(self.y)
        self.dropout_errors = self.Drop_logLayer.errors
        self.errors = self.logLayer.errors(self.y)
        self.y_pred = self.logLayer.y_pred
        self.y_prob = self.logLayer.p_y_given_x


def train_model(init, num_of_poc_from_pocFEA, decoy_ratio, num_drugFEATURE_sample,
                finetune_lr=0.003, training_epochs=1, batch_size=5, reg=5e-9):
    # First- and second-layer graph weights come from the pocket / molecule
    # pre-training utilities.
    [W_poc_self_1, W_poc_degrees_1, b_poc_prime_1, b_poc_prime_self_1,
     b_poc_layer_1, W_poc_out_1, b_poc_out_1, b_poc_prime_out_1,
     W_poc_self_2, W_poc_degrees_2, b_poc_prime_2, b_poc_prime_self_2,
     b_poc_layer_2, W_poc_out_2, b_poc_out_2, b_poc_prime_out_2] = load_poc_W()
    W_mol_layer_1, W_mol_layer_2 = load_mol_W()
    [W_mol_self_1, W_mol_degrees_1, b_mol_prime_1, b_mol_prime_self_1,
     b_mol_layer_1, W_mol_out_1, b_mol_out_1, b_mol_prime_out_1] = W_mol_layer_1
    [W_mol_self_2, W_mol_degrees_2, b_mol_prime_2, b_mol_prime_self_2,
     b_mol_layer_2, W_mol_out_2, b_mol_out_2, b_mol_prime_out_2] = W_mol_layer_2
    all_poc, max_ff, min_ff, mean_ff = get_all_pocs()

    # Output, hidden and logistic-regression weights are initialised randomly.
    W_poc_out = None
    b_poc_out = None
    W_mol_out = None
    b_mol_out = None
    W_hid = None
    W_log = None
    b_hid = None
    b_log = None
    Weights = [W_poc_self_1, W_poc_self_2, W_poc_out, W_poc_degrees_1,
               W_poc_degrees_2, W_mol_self_1, W_mol_self_2, W_mol_out,
               W_mol_degrees_1, W_mol_degrees_2, W_hid, W_log]
    Bias = [b_poc_layer_1, b_poc_layer_2, b_poc_out, b_mol_layer_1,
            b_mol_layer_2, b_mol_out, b_hid, b_log]

    # Pool the training pairs from all four DUD-E folds.
    all_train_pocs = []
    all_train_mols = []
    all_train_labels = []
    for fold in range(0, 4):
        traindata, testdata = load_DUDE_data_neg_assay_fold(fold, num_of_poc_from_pocFEA)
        # One pair of (pocket, mol), one label.
        train_pockets, train_mols, labels_train = traindata
        all_train_pocs.extend(train_pockets)
        all_train_mols.extend(train_mols)
        all_train_labels.extend(labels_train)
    labels_train = all_train_labels
    train_pockets = all_train_pocs
    train_mols = all_train_mols

    num_atom_features, num_bond_features = get_atom_bond_dim([train_mols[0]])
    n_train_batches = len(labels_train)
    # batch_size is the number of (pocket, mol) pairs per minibatch.
    n_train_batches = int(n_train_batches / batch_size)

    numpy_rng = numpy.random.RandomState(89677)
    print('... building the model')
    batchsize = batch_size

    env_features = T.matrix('env_features', dtype=theano.config.floatX)
    env_neighbors = T.matrix('env_neighbors', dtype=theano.config.floatX)
    env_mask = T.fvector('env_mask')
    env_degrees = T.matrix('env_degrees', dtype=theano.config.floatX)
    mol_atom_features = T.matrix('mol_atom_features', dtype=theano.config.floatX)
    mol_bond_features = T.matrix('mol_bond_features', dtype=theano.config.floatX)
    mol_atom_neighbors = T.matrix('mol_atom_neighbors', dtype=theano.config.floatX)
    mol_bond_neighbors = T.matrix('mol_bond_neighbors', dtype=theano.config.floatX)
    mol_mask = T.fvector('mol_mask')
    mol_degrees = T.matrix('mol_degrees', dtype=theano.config.floatX)
    labels = T.ivector('labels')
    # mask = T.matrix('mask', dtype=theano.config.floatX)

    s_cnn_da = Conv3D_Graph_Conv(
        numpy_rng=numpy_rng, Weights=Weights, Bias=Bias,
        env_features=env_features, env_neighbors=env_neighbors,
        env_mask=env_mask, env_degrees=env_degrees,
        mol_atom_features=mol_atom_features, mol_bond_features=mol_bond_features,
        mol_atom_neighbors=mol_atom_neighbors, mol_bond_neighbors=mol_bond_neighbors,
        mol_mask=mol_mask, mol_degrees=mol_degrees, labels=labels,
        num_input=batch_size, num_poc_in_features=480, poc_fp_length=512,
        num_mol_atom_features=62, num_mol_bond_features=6, mol_fp_length=216,
        dropout_rate=0.1)

    index = T.lscalar('index')  # index to a [mini]batch
    lr = T.scalar(name='lr')
    tparams = s_cnn_da.tparams
    params = []
    for p in tparams.keys():
        params.append(tparams[p])
    cost = s_cnn_da.finetune_cost
    grads = T.grad(cost, params)

    # RMSprop with momentum: keep running averages of the gradient and of its
    # square, then take a step scaled by the gradient's running standard deviation.
    tparams_keys = tparams.keys()
    zipped_grads = [theano.shared(tparams[k].get_value() * numpy_floatX(0.),
                                  name='%s_grad' % k) for k in tparams_keys]
    running_grads = [theano.shared(tparams[k].get_value() * numpy_floatX(0.),
                                   name='%s_rgrad' % k) for k in tparams_keys]
    running_grads2 = [theano.shared(tparams[k].get_value() * numpy_floatX(0.),
                                    name='%s_rgrad2' % k) for k in tparams_keys]
    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) for rg2, g in zip(running_grads2, grads)]

    train_fn = theano.function(
        [env_features, env_neighbors, env_degrees, env_mask,
         mol_atom_features, mol_bond_features, mol_atom_neighbors,
         mol_bond_neighbors, mol_degrees, mol_mask, labels],
        cost, updates=zgup + rgup + rg2up, name='rmsprop_f_grad_shared')

    updir = [theano.shared(tparams[k].get_value() * numpy_floatX(0.),
                           name='%s_updir' % k) for k in tparams_keys]
    updir_new = [(ud, 0.9 * ud - 1e-4 * zg / T.sqrt(rg2 - rg ** 2 + 1e-4))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads,
                                            running_grads, running_grads2)]
    param_up = [(p, p + udn[1]) for p, udn in zip(tparams.values(), updir_new)]
    # The step size is the fixed 1e-4 factor above; the lr input is accepted but
    # unused (hence on_unused_input='ignore').
    f_update = theano.function([lr], [], updates=updir_new + param_up,
                               on_unused_input='ignore', name='rmsprop_f_update')

    index = T.lscalar('index')  # index to a [mini]batch
    train_score_i = theano.function(
        [env_features, env_neighbors, env_degrees, env_mask,
         mol_atom_features, mol_bond_features, mol_atom_neighbors,
         mol_bond_neighbors, mol_degrees, mol_mask, labels],
        s_cnn_da.errors, name='train')
    test_score_i = theano.function(
        [env_features, env_neighbors, env_degrees, env_mask,
         mol_atom_features, mol_bond_features, mol_atom_neighbors,
         mol_bond_neighbors, mol_degrees, mol_mask, labels],
        s_cnn_da.errors, name='test')

    print('... fine-tuning the model')
    # early-stopping parameters
    patience = 100000              # look at this many examples regardless
    patience_increase = 2.         # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is considered significant
    best_test_score = numpy.inf
    test_score = 0.
    done_looping = False
    epoch = 0

    while (epoch < training_epochs) and (not done_looping):
        epoch = epoch + 1
        train_losses = []
        for minibatch_index in range(0, n_train_batches):
            env_features_train, env_neighbors_train, env_degrees_train, env_mask_train = \
                get_pocket_attributes(
                    train_pockets[minibatch_index * batch_size:(minibatch_index + 1) * batch_size],
                    max_ff, min_ff, mean_ff)
            (mol_atom_features_train, mol_bond_features_train, mol_atom_neighbors_train,
             mol_bond_neighbors_train, mol_degrees_train, mol_mask_train) = \
                get_mol_attributes(
                    train_mols[minibatch_index * batch_size:(minibatch_index + 1) * batch_size])
            y_train = labels_train[minibatch_index * batch_size:(minibatch_index + 1) * batch_size]
            train_EF, train_EN, train_ED, train_env_mask, train_labels = shared_dataset(
                env_features_train, env_neighbors_train, env_degrees_train,
                env_mask_train, y_train)
            train_MF_atom, train_MF_bond, train_MN_atom, train_MN_bond, train_MD, train_mol_mask = \
                shared_dataset_mol(mol_atom_features_train, mol_bond_features_train,
                                   mol_atom_neighbors_train, mol_bond_neighbors_train,
                                   mol_degrees_train, mol_mask_train)
            minibatch_avg_cost = train_fn(
                train_EF.eval(), train_EN.eval(), train_ED.eval(), train_env_mask.eval(),
                train_MF_atom.eval(), train_MF_bond.eval(), train_MN_atom.eval(),
                train_MN_bond.eval(), train_MD.eval(), train_mol_mask.eval(),
                train_labels.eval())
            # list_file.write('epoch: '+str(epoch)+',cost: '+str(minibatch_avg_cost)+'\n')
            train_err = train_score_i(
                train_EF.eval(), train_EN.eval(), train_ED.eval(), train_env_mask.eval(),
                train_MF_atom.eval(), train_MF_bond.eval(), train_MN_atom.eval(),
                train_MN_bond.eval(), train_MD.eval(), train_mol_mask.eval(),
                train_labels.eval())
            train_losses.append(train_err)
            f_update(finetune_lr)
            iter = (epoch - 1) * n_train_batches + minibatch_index

            # Periodically log progress and checkpoint the current weights.
            if minibatch_index % 10000 == 0 and minibatch_index != 0:
                train_losses_ = numpy.array(train_losses)
                this_train_loss = numpy.mean(train_losses_)
                list_file = open('../progress_DUDE_all_folds_neg_assay.txt', 'a')
                list_file.write('epoch %i, minibatch %i, n_train_batches %i, train error %f %%'
                                % (epoch, minibatch_index, n_train_batches,
                                   this_train_loss * 100.))
                list_file.write('\n')
                list_file.close()

                #### dump weights ######
                W_poc_out = s_cnn_da.Drop_POC_FP_layer.W_out
                b_poc_out = s_cnn_da.Drop_POC_FP_layer.b_out
                W_mol_out = s_cnn_da.Drop_MOL_FP_layer.W_out
                b_mol_out = s_cnn_da.Drop_MOL_FP_layer.b_out
                W_poc_self_1 = s_cnn_da.Drop_Graph_poc_layer_1.W_self
                b_poc_layer_1 = s_cnn_da.Drop_Graph_poc_layer_1.b_layer
                W_mol_self_1 = s_cnn_da.Drop_Graph_mol_layer_1.W_self
                b_mol_layer_1 = s_cnn_da.Drop_Graph_mol_layer_1.b_layer
                W_poc_self_2 = s_cnn_da.Drop_Graph_poc_layer_2.W_self
                b_poc_layer_2 = s_cnn_da.Drop_Graph_poc_layer_2.b_layer
                W_mol_self_2 = s_cnn_da.Drop_Graph_mol_layer_2.W_self
                b_mol_layer_2 = s_cnn_da.Drop_Graph_mol_layer_2.b_layer
                W_hid = s_cnn_da.Drop_FC_layer.W
                b_hid = s_cnn_da.Drop_FC_layer.b
                W_log = s_cnn_da.Drop_logLayer.W
                b_log = s_cnn_da.Drop_logLayer.b
                Weights = [W_poc_self_1, W_poc_self_2, W_poc_out,
                           W_mol_self_1, W_mol_self_2, W_mol_out, W_hid, W_log]
                Bias = [b_poc_layer_1, b_poc_layer_2, b_poc_out,
                        b_mol_layer_1, b_mol_layer_2, b_mol_out, b_hid, b_log]
                W_poc_degrees_1 = []
                for d in range(0, max_poc_degrees):
                    W_poc_degrees_1.append(s_cnn_da.Drop_Graph_poc_layer_1.W_degrees[d])
                W_poc_degrees_2 = []
                for d in range(0, max_poc_degrees):
                    W_poc_degrees_2.append(s_cnn_da.Drop_Graph_poc_layer_2.W_degrees[d])
                W_mol_degrees_1 = []
                for d in range(0, max_mol_degrees):
                    W_mol_degrees_1.append(s_cnn_da.Drop_Graph_mol_layer_1.W_degrees[d])
                W_mol_degrees_2 = []
                for d in range(0, max_mol_degrees):
                    W_mol_degrees_2.append(s_cnn_da.Drop_Graph_mol_layer_2.W_degrees[d])
                Weights.extend([W_poc_degrees_1, W_poc_degrees_2,
                                W_mol_degrees_1, W_mol_degrees_2])
                dump_weights_drop_pickle(
                    Weights, Bias,
                    file_name='../weights/weight_DUDE_all_folds_neg_assay.zip')

        # Per-epoch training error.
        train_losses = numpy.array(train_losses)
        this_train_loss = numpy.mean(train_losses)
        list_file = open('../progress_DUDE_all_folds_neg_assay.txt', 'a')
        list_file.write('epoch %i, train error %f %%' % (epoch, this_train_loss * 100.))
        list_file.write('\n')


def eval_ROC(pro_name, PDB_ID):
    y_true = numpy.load('../results/MUV/test_labels_' + pro_name + '_' + PDB_ID + '.dat')
    y_prob = numpy.load('../results/MUV/test_probs_' + pro_name + '_' + PDB_ID + '.dat')
    fpr, tpr, thresholds = metrics.roc_curve(y_true, y_prob)
    S_1 = metrics.auc(fpr, tpr)
    list_file = open('../results/ROC_MUV_results.txt', 'a')
    list_file.write("target:" + pro_name + ' ' + PDB_ID + '\n')
    list_file.write("AUC score:" + str(S_1) + '\n')


def eval_ROC_MUV(file_name, batch_size=5):
    all_poc, max_ff, min_ff, mean_ff = get_all_pocs()
    [W_poc_self_1, W_poc_self_2, W_poc_out, W_mol_self_1, W_mol_self_2, W_mol_out,
     W_hid, W_log, b_poc_layer_1, b_poc_layer_2, b_poc_out, b_mol_layer_1,
     b_mol_layer_2, b_mol_out, b_hid, b_log, W_poc_degrees_1, W_mol_degrees_1,
     W_poc_degrees_2, W_mol_degrees_2] = load_DUDE_all_folds_weights(file_name)
    Weights = [W_poc_self_1, W_poc_self_2, W_poc_out, W_poc_degrees_1,
               W_poc_degrees_2, W_mol_self_1, W_mol_self_2, W_mol_out,
               W_mol_degrees_1, W_mol_degrees_2, W_hid, W_log]
    Bias = [b_poc_layer_1, b_poc_layer_2, b_poc_out, b_mol_layer_1,
            b_mol_layer_2, b_mol_out, b_hid, b_log]
    all_targets = gene_target_smiles_MUV_ROC_dict()
    numpy_rng = numpy.random.RandomState(89677)

    for pro_name_ in all_targets.keys():
        pro_name = pro_name_[0]
        PDB_ID = pro_name_[1]
        test_pockets = []
        test_mols = []
        labels_test = []
        test_target_mol = all_targets[pro_name_]
        for entry in test_target_mol:
            pocket_name = entry[0]
            sm_name = entry[1]
            label_name = entry[2]
            p_name = entry[3]
            test_pockets.append(pocket_name)
            test_mols.append(sm_name)
            labels_test.append(label_name)
            if p_name != pro_name:
                print(p_name)
                print(pro_name)
                print("WRONG WRONG WRONG")
                continue
        labels_test = numpy.array(labels_test)

        num_atom_features, num_bond_features = get_atom_bond_dim([test_mols[0]])
        n_test_batches = len(labels_test)
        n_test_batches = int(n_test_batches / batch_size)
        batchsize = batch_size
        env_features = T.matrix('env_features', dtype=theano.config.floatX)
        env_neighbors = T.matrix('env_neighbors', dtype=theano.config.floatX)
        env_mask = T.fvector('env_mask')
        env_degrees = T.matrix('env_degrees', dtype=theano.config.floatX)
        mol_atom_features = T.matrix('mol_atom_features', dtype=theano.config.floatX)
        mol_bond_features = T.matrix('mol_bond_features', dtype=theano.config.floatX)
        mol_atom_neighbors = T.matrix('mol_atom_neighbors', dtype=theano.config.floatX)
        mol_bond_neighbors = T.matrix('mol_bond_neighbors', dtype=theano.config.floatX)
        mol_mask = T.fvector('mol_mask')
        mol_degrees = T.matrix('mol_degrees', dtype=theano.config.floatX)
        labels = T.ivector('labels')

        s_cnn_da = Conv3D_Graph_Conv(
            numpy_rng=numpy_rng, Weights=Weights, Bias=Bias,
            env_features=env_features, env_neighbors=env_neighbors,
            env_mask=env_mask, env_degrees=env_degrees,
            mol_atom_features=mol_atom_features, mol_bond_features=mol_bond_features,
            mol_atom_neighbors=mol_atom_neighbors, mol_bond_neighbors=mol_bond_neighbors,
            mol_mask=mol_mask, mol_degrees=mol_degrees, labels=labels,
            num_input=batch_size, num_poc_in_features=480, poc_fp_length=512,
            num_mol_atom_features=62, num_mol_bond_features=6, mol_fp_length=216,
            dropout_rate=0.1)
        test_score_i = theano.function(
            [env_features, env_neighbors, env_degrees, env_mask,
             mol_atom_features, mol_bond_features, mol_atom_neighbors,
             mol_bond_neighbors, mol_degrees, mol_mask, labels],
            [s_cnn_da.errors, s_cnn_da.y_prob], name='test')

        test_losses = []
        test_y_prob = []
        test_y_true = []
        for i in range(n_test_batches):
            env_features_test, env_neighbors_test, env_degrees_test, env_mask_test = \
                get_pocket_attributes(test_pockets[i * batch_size:(i + 1) * batch_size],
                                      max_ff, min_ff, mean_ff)
            (mol_atom_features_test, mol_bond_features_test, mol_atom_neighbors_test,
             mol_bond_neighbors_test, mol_degrees_test, mol_mask_test) = \
                get_mol_attributes(test_mols[i * batch_size:(i + 1) * batch_size])
            y_test = labels_test[i * batch_size:(i + 1) * batch_size]
            test_EF, test_EN, test_ED, test_env_mask, test_labels = shared_dataset(
                env_features_test, env_neighbors_test, env_degrees_test,
                env_mask_test, y_test)
            test_MF_atom, test_MF_bond, test_MN_atom, test_MN_bond, test_MD, test_mol_mask = \
                shared_dataset_mol(mol_atom_features_test, mol_bond_features_test,
                                   mol_atom_neighbors_test, mol_bond_neighbors_test,
                                   mol_degrees_test, mol_mask_test)
            [test_err, test_prob] = test_score_i(
                test_EF.eval(), test_EN.eval(), test_ED.eval(), test_env_mask.eval(),
                test_MF_atom.eval(), test_MF_bond.eval(), test_MN_atom.eval(),
                test_MN_bond.eval(), test_MD.eval(), test_mol_mask.eval(),
                test_labels.eval())
            test_losses.append(test_err)
            test_y_prob.extend(test_prob[:, 1])
            test_y_true.extend(y_test)

        test_score = numpy.mean(test_losses)
        test_y_prob = numpy.array(test_y_prob)
        test_y_true = numpy.array(test_y_true)
        test_y_true.dump('../results/MUV/test_labels_' + pro_name + '_' + PDB_ID + '.dat')
        test_y_prob.dump('../results/MUV/test_probs_' + pro_name + '_' + PDB_ID + '.dat')
        eval_ROC(pro_name, PDB_ID)


def shared_test_dataset(env_features, env_neighbors, env_degrees, mask, borrow=True):
    shared_env_features = theano.shared(
        numpy.asarray(env_features, dtype=theano.config.floatX), borrow=borrow)
    shared_env_neighbors = theano.shared(
        numpy.asarray(env_neighbors, dtype=theano.config.floatX), borrow=borrow)
    shared_env_degrees = theano.shared(
        numpy.asarray(env_degrees, dtype=theano.config.floatX), borrow=borrow)
    shared_mask = theano.shared(
        numpy.asarray(mask, dtype=theano.config.floatX), borrow=borrow)
    return shared_env_features, shared_env_neighbors, shared_env_degrees, shared_mask

def eval_user_structure(test_pocket, file_name, batch_size=1):
    all_poc, max_ff, min_ff, mean_ff = get_all_pocs()
    [W_poc_self_1, W_poc_self_2, W_poc_out, W_mol_self_1, W_mol_self_2, W_mol_out,
     W_hid, W_log, b_poc_layer_1, b_poc_layer_2, b_poc_out, b_mol_layer_1,
     b_mol_layer_2, b_mol_out, b_hid, b_log, W_poc_degrees_1, W_mol_degrees_1,
     W_poc_degrees_2, W_mol_degrees_2] = load_DUDE_all_folds_weights(file_name)
    Weights = [W_poc_self_1, W_poc_self_2, W_poc_out, W_poc_degrees_1,
               W_poc_degrees_2, W_mol_self_1, W_mol_self_2, W_mol_out,
               W_mol_degrees_1, W_mol_degrees_2, W_hid, W_log]
    Bias = [b_poc_layer_1, b_poc_layer_2, b_poc_out, b_mol_layer_1,
            b_mol_layer_2, b_mol_out, b_hid, b_log]
    numpy_rng = numpy.random.RandomState(89677)
    PDB_ID = test_pocket[0:4]

    test_mols, mol_tar_dict = test_ligand_screen()
    print("screening " + str(len(test_mols)) + " ligands ...")
    test_pockets = [test_pocket] * len(test_mols)
    num_atom_features, num_bond_features = get_atom_bond_dim([test_mols[0]])
    n_test_batches = len(test_mols)
    n_test_batches = int(n_test_batches / batch_size)
    batchsize = batch_size

    env_features = T.matrix('env_features', dtype=theano.config.floatX)
    env_neighbors = T.matrix('env_neighbors', dtype=theano.config.floatX)
    env_mask = T.fvector('env_mask')
    env_degrees = T.matrix('env_degrees', dtype=theano.config.floatX)
    mol_atom_features = T.matrix('mol_atom_features', dtype=theano.config.floatX)
    mol_bond_features = T.matrix('mol_bond_features', dtype=theano.config.floatX)
    mol_atom_neighbors = T.matrix('mol_atom_neighbors', dtype=theano.config.floatX)
    mol_bond_neighbors = T.matrix('mol_bond_neighbors', dtype=theano.config.floatX)
    mol_mask = T.fvector('mol_mask')
    mol_degrees = T.matrix('mol_degrees', dtype=theano.config.floatX)
    labels = T.ivector('labels')

    s_cnn_da = Conv3D_Graph_Conv(
        numpy_rng=numpy_rng, Weights=Weights, Bias=Bias,
        env_features=env_features, env_neighbors=env_neighbors,
        env_mask=env_mask, env_degrees=env_degrees,
        mol_atom_features=mol_atom_features, mol_bond_features=mol_bond_features,
        mol_atom_neighbors=mol_atom_neighbors, mol_bond_neighbors=mol_bond_neighbors,
        mol_mask=mol_mask, mol_degrees=mol_degrees, labels=labels,
        num_input=batch_size, num_poc_in_features=480, poc_fp_length=512,
        num_mol_atom_features=62, num_mol_bond_features=6, mol_fp_length=216,
        dropout_rate=0.1)
    test_score_i = theano.function(
        [env_features, env_neighbors, env_degrees, env_mask,
         mol_atom_features, mol_bond_features, mol_atom_neighbors,
         mol_bond_neighbors, mol_degrees, mol_mask],
        [s_cnn_da.y_prob], name='test')

    test_y_prob = []
    test_ligs = []
    for i in range(n_test_batches):
        env_features_test, env_neighbors_test, env_degrees_test, env_mask_test = \
            get_pocket_attributes(test_pockets[i * batch_size:(i + 1) * batch_size],
                                  max_ff, min_ff, mean_ff,
                                  input_dir='../data/User/ff/')
        (mol_atom_features_test, mol_bond_features_test, mol_atom_neighbors_test,
         mol_bond_neighbors_test, mol_degrees_test, mol_mask_test) = \
            get_mol_attributes(test_mols[i * batch_size:(i + 1) * batch_size])
        test_EF, test_EN, test_ED, test_env_mask = shared_test_dataset(
            env_features_test, env_neighbors_test, env_degrees_test, env_mask_test)
        test_MF_atom, test_MF_bond, test_MN_atom, test_MN_bond, test_MD, test_mol_mask = \
            shared_dataset_mol(mol_atom_features_test, mol_bond_features_test,
                               mol_atom_neighbors_test, mol_bond_neighbors_test,
                               mol_degrees_test, mol_mask_test)
        [test_prob] = test_score_i(
            test_EF.eval(), test_EN.eval(), test_ED.eval(), test_env_mask.eval(),
            test_MF_atom.eval(), test_MF_bond.eval(), test_MN_atom.eval(),
            test_MN_bond.eval(), test_MD.eval(), test_mol_mask.eval())
        test_y_prob.extend(test_prob[:, 1])
        test_ligs.extend(test_mols[i * batch_size:(i + 1) * batch_size])

    test_y_prob = numpy.array(test_y_prob)
    test_y_prob.dump('../results/User/test_probs_' + PDB_ID + '.dat')

    # Write out ligands whose predicted positive-class probability exceeds 0.9.
    result_file = open('../results/User/' + PDB_ID + '.out', 'w')
    high_score_idx = numpy.where(test_y_prob > 0.9)
    for idx in high_score_idx[0]:
        lig = test_ligs[idx]
        if lig in mol_tar_dict:
            result_file.write(str(test_y_prob[idx]) + '\t' + lig + '\t' + mol_tar_dict[lig] + '\n')
        else:
            result_file.write(str(test_y_prob[idx]) + '\t' + lig + '\t' + 'None' + '\n')


if __name__ == '__main__':
    mode = sys.argv[1]
    if mode == 'train':
        train_model(init=True, num_of_poc_from_pocFEA=5, decoy_ratio=0,
                    num_drugFEATURE_sample=0, finetune_lr=0.005, training_epochs=1)
    elif mode == 'ROC':
        eval_ROC_MUV(file_name='../weights/weight_DUDE_all_folds_neg_assay.zip')
    elif mode == 'user':
        test_pocket = sys.argv[2]
        eval_user_structure(test_pocket,
                            file_name='../weights/weight_DUDE_all_folds_neg_assay.zip')
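
# Example invocations (the script name below is hypothetical; run from a working
# directory laid out so that the relative ../data, ../weights and ../results
# paths used above resolve):
#   python graph_cnn_DTI.py train
#   python graph_cnn_DTI.py ROC
#   python graph_cnn_DTI.py user 1abcA_pocket
# In 'user' mode the second argument is the pocket identifier; its first four
# characters are taken as the PDB ID when naming the output files.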