# Author: Wen Torng and Russ B. Altman (2018)
# Framework for training 3DCNN functional site models

import os
import sys
import time
import random
import numpy
import tables
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams
from pprint import pprint
from theano.tensor.signal import downsample
from theano.tensor.nnet import conv
from theano.tensor.nnet import conv3d2d
from scipy.io import matlab
import re
import math
from theano import shared
from collections import OrderedDict
from layers import *
from theano.misc.pkl_utils import dump

sys.path.insert(0, '../data_process/')
from store_pytable import *


def dump_weights_pickle(Weights, Bias, file_name):
    """Serialize the four weight/bias pairs of the trained network with
    theano.misc.pkl_utils.dump (writes a zip archive)."""
    W0, W1, W2, W3 = Weights
    b0, b1, b2, b3 = Bias
    with open(file_name, 'wb') as f:
        dump((W0, W1, W2, W3, b0, b1, b2, b3), f)
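

# Illustrative counterpart to dump_weights_pickle (a sketch, not part of the
# original pipeline): it assumes theano.misc.pkl_utils.load, the documented
# counterpart of `dump`, is available in the installed Theano version.
def load_weights_pickle(file_name):
    from theano.misc.pkl_utils import load
    with open(file_name, 'rb') as f:
        # the tuple ordering mirrors dump_weights_pickle above
        W0, W1, W2, W3, b0, b1, b2, b3 = load(f)
    return [W0, W1, W2, W3], [b0, b1, b2, b3]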


def shared_dataset(data_x, data_y, borrow=True):
    """Wrap a minibatch in Theano shared variables; labels are cast to int32."""
    shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX), borrow=borrow)
    shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX), borrow=borrow)
    return shared_x, T.cast(shared_y, 'int32')


def numpy_floatX(data):
    return numpy.asarray(data, dtype=theano.config.floatX)


def load_Voxel_data(fold):
    """Load voxelized positive and negative examples for the current PROSITE
    site, hold out cross-validation fold `fold` as the test set, balance the
    two classes by replicating the smaller class against successive
    equally-sized slices of the larger one, split the remainder 19:1 into
    training and validation sets, and partition the training set into 20
    parts that the training loop iterates over.  Relies on the module-level
    globals site, target_RES, target_ATOM and total_fold set in __main__."""
    dataName = "data"
    labelName = "label"
    labelShape = []

    # positive examples
    pos_or_neg = 'pos'
    ID = site+'.'+target_RES+'.'+target_ATOM+'.'+pos_or_neg
    filename_train_pos = '../data/PROSITE_TP_TN/pytables_Voxel/'+"train_data_"+ID+".pytables"
    h5file_train = tables.openFile(filename_train_pos, mode="r")
    dataColumn_train = getH5column(h5file_train, dataName)
    labelColumn_train = getH5column(h5file_train, labelName)
    X_pos = dataColumn_train[:]
    y_pos = labelColumn_train[:]

    # negative examples
    pos_or_neg = 'neg'
    ID = site+'.'+target_RES+'.'+target_ATOM+'.'+pos_or_neg
    filename_train_neg = '../data/PROSITE_TP_TN/pytables_Voxel/'+"train_data_"+ID+".pytables"
    h5file_train = tables.openFile(filename_train_neg, mode="r")
    dataColumn_train = getH5column(h5file_train, dataName)
    labelColumn_train = getH5column(h5file_train, labelName)
    X_neg = dataColumn_train[:]
    y_neg = labelColumn_train[:]

    # hold out one cross-validation fold of each class as the test set
    num_of_pos_test = X_pos.shape[0]/total_fold
    num_of_neg_test = X_neg.shape[0]/total_fold
    mask_pos_test = range(num_of_pos_test*fold, num_of_pos_test*(fold+1))
    mask_neg_test = range(num_of_neg_test*fold, num_of_neg_test*(fold+1))

    Xt_pos = X_pos[mask_pos_test]
    yt_pos = y_pos[mask_pos_test]
    X_pos = numpy.delete(X_pos, mask_pos_test, 0)
    y_pos = numpy.delete(y_pos, mask_pos_test, 0)

    Xt_neg = X_neg[mask_neg_test]
    yt_neg = y_neg[mask_neg_test]
    X_neg = numpy.delete(X_neg, mask_neg_test, 0)
    y_neg = numpy.delete(y_neg, mask_neg_test, 0)

    Xt = numpy.concatenate((Xt_pos, Xt_neg), axis=0)
    yt = numpy.concatenate((yt_pos, yt_neg), axis=0)

    # balance the two classes: repeat the smaller class once for every
    # equally-sized slice of the larger class
    s = [X_pos.shape[0], X_neg.shape[0]]
    data_size = min(s)
    min_index = s.index(data_size)

    X_pos_all = []
    y_pos_all = []
    X_neg_all = []
    y_neg_all = []

    if (min_index == 0):
        ratio = float(s[1])/s[0]
        r = int(numpy.floor(ratio))
        for i in range(0, r):
            X_pos_all.append(X_pos)
            y_pos_all.append(y_pos)
            X_neg_all.append(X_neg[i*data_size:(i+1)*data_size])
            y_neg_all.append(y_neg[i*data_size:(i+1)*data_size])
    elif (min_index == 1):
        ratio = float(s[0])/s[1]
        r = int(numpy.floor(ratio))
        for i in range(0, r):
            X_pos_all.append(X_pos[i*data_size:(i+1)*data_size])
            y_pos_all.append(y_pos[i*data_size:(i+1)*data_size])
            X_neg_all.append(X_neg)
            y_neg_all.append(y_neg)

    equal_examples = []
    equal_labels = []
    for i in range(0, len(X_pos_all)):
        equal_examples.append(numpy.concatenate((X_pos_all[i], X_neg_all[i]), axis=0))
        equal_labels.append(numpy.concatenate((y_pos_all[i], y_neg_all[i]), axis=0))

    equal_examples = numpy.array(equal_examples)
    equal_labels = numpy.array(equal_labels)
    equal_examples = numpy.reshape(equal_examples,
                                   (equal_examples.shape[0]*equal_examples.shape[1], 4, 20, 20, 20))
    equal_labels = equal_labels.flatten()

    # random 19:1 split of the balanced data into training and validation sets
    num_of_train = int(19*float(equal_examples.shape[0])/20)
    num_of_val = int(1*float(equal_examples.shape[0])/20)

    mask_train = random.sample(xrange(equal_examples.shape[0]), num_of_train)
    X = equal_examples[mask_train]
    y = equal_labels[mask_train]
    equal_examples = numpy.delete(equal_examples, mask_train, 0)
    equal_labels = numpy.delete(equal_labels, mask_train, 0)

    mask_val = random.sample(xrange(equal_examples.shape[0]), num_of_val)
    Xv = equal_examples[mask_val]
    yv = equal_labels[mask_val]
    equal_examples = numpy.delete(equal_examples, mask_val, 0)
    equal_labels = numpy.delete(equal_labels, mask_val, 0)

    # partition the training set into 20 equally sized parts
    all_train_x = []
    all_train_y = []
    all_train_sizes = []
    train_size = X.shape[0]
    part = train_size/20
    for i in range(0, 20):
        mask = range(part*i, part*(i+1))
        Xtr = X[mask]
        ytr = y[mask]
        all_train_x.append(Xtr)
        all_train_y.append(ytr)
        all_train_sizes.append(Xtr.shape[0])

    all_examples = [all_train_x, Xt, Xv]
    all_labels = [all_train_y, yt, yv]

    return [all_examples, all_labels, 4, all_train_sizes, Xt.shape[0], Xv.shape[0]]
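

# Layout note: load_Voxel_data returns voxel grids shaped
# (N, 4, 20, 20, 20) = (examples, channels, x, y, z), whereas the conv3d2d-based
# layers below expect (batch, depth, channels, rows, columns).  The training
# loop therefore reorders axes with dimshuffle(0, 4, 1, 2, 3), turning a
# (5, 4, 20, 20, 20) minibatch into a (5, 20, 4, 20, 20) tensor.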


class CNN_3D(object):
    """Three 3D convolutional layers (the first two with pooling) followed by
    a logistic regression output layer."""

    def __init__(
        self,
        numpy_rng,
        signals_shape,
        filters_shape,
        input,
        theano_rng=None,
        filter_sizes=[100, 200, 400],
        n_outs=2,
        corruption_levels=[0.1, 0.1, 0.1],
    ):
        self.CNN_pool_layers = []
        self.tparams = OrderedDict()
        self.params = []
        self.n_layers = len(filter_sizes)
        assert self.n_layers > 0

        (batchsize, in_time, in_channels, in_height, in_width) = signals_shape
        (flt_channels, flt_time, in_channels, flt_height, flt_width) = filters_shape
        assert flt_channels == filter_sizes[0]

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        dtensor5 = T.TensorType('float32', (False,)*5)

        # conv + pool layers: each pooling step halves the three spatial dimensions
        for i in xrange(self.n_layers-1):
            if i == 0:
                image_shape = signals_shape
                filter_shape = filters_shape
            else:
                image_shape = (batchsize, int(in_time/math.pow(2, i)), filter_sizes[i - 1],
                               int(in_height/math.pow(2, i)), int(in_width/math.pow(2, i)))
                filter_shape = (filter_sizes[i], flt_time, filter_sizes[i - 1],
                                flt_height, flt_width)

            if i == 0:
                layer_input = input
            else:
                layer_input = self.CNN_pool_layers[-1].output

            print "layer "+str(i)
            print "signal shape:"
            print image_shape
            print "filter shape:"
            print filter_shape

            CNN_pool_layer = Pad_Conv_Pool(rng=numpy_rng,
                                           input=layer_input,
                                           filter_shape=filter_shape,
                                           image_shape=image_shape,
                                           )

            # add the layer to our list of layers
            self.CNN_pool_layers.append(CNN_pool_layer)
            self.params.extend(CNN_pool_layer.params)
            self.tparams['W_'+str(i)] = CNN_pool_layer.W
            self.tparams['b_'+str(i)] = CNN_pool_layer.b

            print "CNN layer W:"
            print CNN_pool_layer.W.shape.eval()
            print "CNN layer b:"
            print CNN_pool_layer.b.shape.eval()

        # final convolutional layer (no pooling)
        i = i+1
        image_shape = (batchsize, int(in_time/math.pow(2, i)), filter_sizes[i - 1],
                       int(in_height/math.pow(2, i)), int(in_width/math.pow(2, i)))
        filter_shape = (filter_sizes[i], flt_time, filter_sizes[i - 1], flt_height, flt_width)
        layer_input = self.CNN_pool_layers[-1].output

        print "signal shape:"
        print image_shape
        print "filter shape:"
        print filter_shape

        self.CNN_layer = Conv_3d_Layer(rng=numpy_rng,
                                       input=layer_input,
                                       filter_shape=filter_shape,
                                       image_shape=image_shape,
                                       )
        self.params.extend(self.CNN_layer.params)
        self.tparams['W_2'] = self.CNN_layer.W
        self.tparams['b_2'] = self.CNN_layer.b

        # logistic regression on the flattened feature maps; each spatial
        # dimension has been halved i times, hence the division by 2**i per axis
        self.logLayer = LogisticRegression(
            input=self.CNN_layer.output.flatten(2),
            n_in=int((filter_sizes[-1]*in_time*in_height*in_width) /
                     (math.pow(2, i)*math.pow(2, i)*math.pow(2, i))),
            n_out=n_outs,
            W=None,
            b=None,
        )
        self.params.extend(self.logLayer.params)
        self.tparams['W_log'] = self.logLayer.W
        self.tparams['b_log'] = self.logLayer.b

        self.L2_sqr = T.sum(self.CNN_pool_layers[0].W ** 2) + T.sum(self.CNN_pool_layers[1].W ** 2) + \
            T.sum(self.CNN_layer.W ** 2) + T.sum(self.logLayer.W ** 2)


def train_3DCNN(fold, finetune_lr=0.003, training_epochs=7, batch_size=5, reg=1e-7):

    print "fold "+str(fold)

    [all_examples, all_labels, in_channels, all_train_sizes, test_size, val_size] = load_Voxel_data(fold)
    Xtr = all_examples[0]
    Xt = all_examples[1]
    Xv = all_examples[2]
    ytr = all_labels[0]
    yt = all_labels[1]
    yv = all_labels[2]

    n_train_batches = [a/batch_size for a in all_train_sizes]
    n_valid_batches = val_size
    n_test_batches = test_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size

    numpy_rng = numpy.random.RandomState(89677)
    print '... building the model'

    flt_channels = 32
    flt_time = 3
    in_channels = 4
    flt_height = 3
    flt_width = 3

    batchsize = batch_size
    in_time = 20
    in_height = 20
    in_width = 20

    signals_shape = (batchsize, in_time, in_channels, in_height, in_width)
    filters_shape = (flt_channels, flt_time, in_channels, flt_height, flt_width)

    dtensor5 = T.TensorType('float32', (False,)*5)
    x = dtensor5('x')
    y = T.ivector('y')

    s_cnn_da = CNN_3D(
        input=x,
        numpy_rng=numpy_rng,
        signals_shape=signals_shape,
        filters_shape=filters_shape,
        filter_sizes=[32, 64, 128],
        n_outs=2,
        corruption_levels=[0., 0., 0.]
    )

    finetune_cost = s_cnn_da.logLayer.negative_log_likelihood(y)
    errors = s_cnn_da.logLayer.errors(y)

    # get the training, validation and testing function for the model
    print '... getting the finetuning functions'

    # compute number of minibatches for training, validation and testing
    index = T.lscalar('index')  # index to a [mini]batch
    lr = T.scalar(name='lr')
    tparams = s_cnn_da.tparams
    cost = finetune_cost + (reg * s_cnn_da.L2_sqr)

    grads = T.grad(cost, tparams.values())

    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_grads = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rgrad' % k)
                     for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) for rg2, g in zip(running_grads2, grads)]

    train_fn = theano.function([x, y], cost,
                               updates=zgup + rgup + rg2up,
                               name='rmsprop_f_grad_shared')

    updir = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_updir' % k)
             for k, p in tparams.iteritems()]
    updir_new = [(ud, 0.9 * ud - 1e-4 * zg / T.sqrt(rg2 - rg ** 2 + 1e-4))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads, running_grads2)]
    param_up = [(p, p + udn[1]) for p, udn in zip(tparams.values(), updir_new)]
    f_update = theano.function([lr], [], updates=updir_new + param_up,
                               on_unused_input='ignore',
                               name='rmsprop_f_update')
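
    # The two compiled functions above implement the RMSprop-style update used
    # here in two stages:
    #   train_fn  stores the current gradient g in zipped_grads and updates the
    #             running averages  E[g]  <- 0.95*E[g]  + 0.05*g
    #                               E[g2] <- 0.95*E[g2] + 0.05*g**2
    #   f_update  applies  updir <- 0.9*updir - 1e-4 * g / sqrt(E[g2] - E[g]**2 + 1e-4)
    #             followed by  p <- p + updir  for every parameter p.
    # The symbolic learning rate `lr` passed to f_update does not appear in the
    # update expressions (the step size 1e-4 is hard-coded), which is why the
    # function is compiled with on_unused_input='ignore'.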

    index = T.lscalar('index')  # index to a [mini]batch

    train_score_i = theano.function(
        [x, y],
        errors,
        name='train'
    )

    test_score_i = theano.function(
        [x, y],
        errors,
        name='test'
    )

    valid_score_i = theano.function(
        [x, y],
        errors,
        name='valid'
    )

    print '... finetuning the model'

    # early-stopping parameters
    patience = 100000  # look at this many examples regardless
    patience_increase = 2.  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches[0], patience / 2)
                                   # go through this many minibatches before
                                   # checking the network on the validation
                                   # set; in this case we check every epoch

    best_validation_loss = numpy.inf
    best_test_score = numpy.inf
    test_score = 0.
    start_time = time.clock()

    done_looping = False
    epoch = 0

    while (epoch < training_epochs) and (not done_looping):

        list_file = open('../progress/progress_3DCNN_'+site+'_'+str(fold)+'.txt', 'a')
        epoch = epoch + 1
        train_losses = []

        for part_index in xrange(20):
            for minibatch_index in xrange(n_train_batches[part_index]):

                X_train = Xtr[part_index][minibatch_index * batch_size: (minibatch_index + 1) * batch_size]
                y_train = ytr[part_index][minibatch_index * batch_size: (minibatch_index + 1) * batch_size]
                train_set_x, train_set_y = shared_dataset(X_train, y_train)
                # reorder (batch, channels, x, y, z) -> (batch, depth, channels, rows, columns)
                train_set_x = train_set_x.dimshuffle(0, 4, 1, 2, 3)

                minibatch_avg_cost = train_fn(train_set_x.eval(), train_set_y.eval())
                train_err = train_score_i(train_set_x.eval(), train_set_y.eval())
                train_losses.append(train_err)
                f_update(finetune_lr)

                iter = (epoch - 1) * n_train_batches[part_index] + minibatch_index

        train_losses = numpy.array(train_losses)
        this_train_loss = numpy.mean(train_losses)
        list_file.write('epoch %i, train error %f %%' % (epoch, this_train_loss * 100.))
        list_file.write('\n')

        validation_losses = []
        for i in xrange(n_valid_batches):
            valid_set_x, valid_set_y = shared_dataset(Xv[i*batch_size:(i+1)*batch_size],
                                                      yv[i*batch_size:(i+1)*batch_size])
            valid_set_x = valid_set_x.dimshuffle(0, 4, 1, 2, 3)
            validation_losses.append(valid_score_i(valid_set_x.eval(), valid_set_y.eval()))
        this_validation_loss = numpy.mean(validation_losses)

        print('epoch %i, validation error %f %%' % (epoch, this_validation_loss * 100.))
        list_file.write('epoch %i, validation error %f %%' % (epoch, this_validation_loss * 100.))
        list_file.write('\n')
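
        # Early-stopping bookkeeping below: the patience budget is extended
        # whenever the validation error improves on the previous best by the
        # improvement_threshold factor; since done_looping is never set to True,
        # the outer while loop is in practice bounded only by training_epochs.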
        # if we got the best validation score until now
        if this_validation_loss <= best_validation_loss:

            # improve patience if loss improvement is good enough
            if this_validation_loss < best_validation_loss * improvement_threshold:
                patience = max(patience, iter * patience_increase)

            # save best validation score and iteration number
            best_validation_loss = this_validation_loss
            best_iter = iter

            test_losses = []
            for i in xrange(n_test_batches):
                test_set_x, test_set_y = shared_dataset(Xt[i*batch_size:(i+1)*batch_size],
                                                        yt[i*batch_size:(i+1)*batch_size])
                test_set_x = test_set_x.dimshuffle(0, 4, 1, 2, 3)
                test_losses.append(test_score_i(test_set_x.eval(), test_set_y.eval()))
            test_score = numpy.mean(test_losses)

            print((' epoch %i, test error of best model %f %%') % (epoch, test_score * 100.))

            list_file = open('../progress/progress_3DCNN_'+site+'_'+str(fold)+'.txt', 'a')
            list_file.write((' epoch %i, test error of best model %f %%') % (epoch, test_score * 100.))
            list_file.write('\n')
            list_file.close()

            best_test_score = test_score * 100.

    final_weights = s_cnn_da.params

    W0 = s_cnn_da.CNN_pool_layers[0].W.get_value()
    b0 = s_cnn_da.CNN_pool_layers[0].b.get_value()
    W1 = s_cnn_da.CNN_pool_layers[1].W.get_value()
    b1 = s_cnn_da.CNN_pool_layers[1].b.get_value()
    W2 = s_cnn_da.CNN_layer.W.get_value()
    b2 = s_cnn_da.CNN_layer.b.get_value()
    W3 = s_cnn_da.logLayer.W.get_value()
    b3 = s_cnn_da.logLayer.b.get_value()

    Weights = [W0, W1, W2, W3]
    Bias = [b0, b1, b2, b3]

    dump_weights_pickle(Weights, Bias,
                        file_name='../results/weights/weight_3DCNN_'+result_weights_ID+'_'+str(fold)+'.zip')


if __name__ == '__main__':

    target_RES = sys.argv[1]
    target_ATOM = sys.argv[2]
    site = sys.argv[3]
    fold = int(sys.argv[4])

    lr = 0.003
    epoch = 10
    total_fold = 5
    result_weights_ID = target_RES+"_"+target_ATOM+"_"+site

    train_3DCNN(fold=fold, finetune_lr=lr, training_epochs=epoch)
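

# Example invocation (illustrative only: the script name, residue/atom names
# and PROSITE site identifier are placeholders and must match the pytables
# files produced by the data_process step):
#
#   python train_3DCNN.py <target_RES> <target_ATOM> <site> <fold>
#   e.g.  python train_3DCNN.py SER OG TRYPSIN_SER 0
#
# This trains one fold (0-4) of the 5-fold cross-validation for the given site,
# writes the learned weights to ../results/weights/ and appends a progress log
# under ../progress/.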