# Author: Wen Torng and Russ B. Altman (2018)
# Framework for training 1DCNN-FEATURE functional site models

import os
import sys
import time

import numpy

import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams
from pprint import pprint
from theano.tensor.signal import downsample
from theano.tensor.nnet import conv
from theano.tensor.nnet import conv3d2d
from scipy.io import matlab
import re
import math
from theano import shared
from collections import OrderedDict
from layers import *
from theano.misc.pkl_utils import dump
import random


def shared_dataset(data_x, data_y, borrow=True):
    # Load the data into Theano shared variables; labels are cast to int32.
    shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX), borrow=borrow)
    shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX), borrow=borrow)
    return shared_x, T.cast(shared_y, 'int32')


def numpy_floatX(data):
    return numpy.asarray(data, dtype=theano.config.floatX)


def load_FEATURE_data(fold):
    # Parse the positive and negative FEATURE vector files for the current site.
    pos_ff_file = open('../data/PROSITE_TP_TN/site_FF/' + site + '.' + target_RES + '.' + target_ATOM + '.' + 'pos' + '.ff')
    neg_ff_file = open('../data/PROSITE_TP_TN/site_FF/' + site + '.' + target_RES + '.' + target_ATOM + '.' + 'neg' + '.ff')

    pos_FF = []
    for line in pos_ff_file:
        if line[0:4] == 'Env_':
            ele = line.split()
            FV = numpy.zeros((480,))
            for i in range(1, 481):
                FV[i - 1] = ele[i]
            pos_FF.append(FV)

    neg_FF = []
    for line in neg_ff_file:
        if line[0:4] == 'Env_':
            ele = line.split()
            FV = numpy.zeros((480,))
            for i in range(1, 481):
                FV[i - 1] = ele[i]
            neg_FF.append(FV)

    X_pos = numpy.array(pos_FF)
    X_neg = numpy.array(neg_FF)
    y_pos = numpy.ones((X_pos.shape[0],))
    y_neg = numpy.zeros((X_neg.shape[0],))

    # Hold out one fold of positives and negatives as the test set.
    num_of_pos_test = X_pos.shape[0] / total_fold
    num_of_neg_test = X_neg.shape[0] / total_fold

    mask_pos_test = range(num_of_pos_test * fold, num_of_pos_test * (fold + 1))
    mask_neg_test = range(num_of_neg_test * fold, num_of_neg_test * (fold + 1))

    Xt_pos = X_pos[mask_pos_test]
    yt_pos = y_pos[mask_pos_test]
    X_pos = numpy.delete(X_pos, mask_pos_test, 0)
    y_pos = numpy.delete(y_pos, mask_pos_test, 0)

    Xt_neg = X_neg[mask_neg_test]
    yt_neg = y_neg[mask_neg_test]
    X_neg = numpy.delete(X_neg, mask_neg_test, 0)
    y_neg = numpy.delete(y_neg, mask_neg_test, 0)

    Xt = numpy.concatenate((Xt_pos, Xt_neg), axis=0)
    yt = numpy.concatenate((yt_pos, yt_neg), axis=0)

    # Balance the remaining examples by pairing the smaller class with
    # equally sized chunks of the larger class.
    s = [X_pos.shape[0], X_neg.shape[0]]
    data_size = min(s)
    min_index = s.index(data_size)

    print "pos:" + str(s[0])
    print "neg:" + str(s[1])

    X_pos_all = []
    y_pos_all = []
    X_neg_all = []
    y_neg_all = []

    if min_index == 0:
        ratio = float(s[1]) / s[0]
        r = int(numpy.floor(ratio))
        print "more neg, ratio=" + str(ratio)
        for i in range(0, r):
            X_pos_all.append(X_pos)
            y_pos_all.append(y_pos)
            X_neg_all.append(X_neg[i * data_size:(i + 1) * data_size])
            y_neg_all.append(y_neg[i * data_size:(i + 1) * data_size])
    elif min_index == 1:
        ratio = float(s[0]) / s[1]
        r = int(numpy.floor(ratio))
        print "more pos, ratio=" + str(ratio)
        for i in range(0, r):
            X_pos_all.append(X_pos[i * data_size:(i + 1) * data_size])
            y_pos_all.append(y_pos[i * data_size:(i + 1) * data_size])
            X_neg_all.append(X_neg)
            y_neg_all.append(y_neg)

    equal_examples = []
    equal_labels = []
    for i in range(0, len(X_pos_all)):
        equal_examples.append(numpy.concatenate((X_pos_all[i], X_neg_all[i]), axis=0))
        equal_labels.append(numpy.concatenate((y_pos_all[i], y_neg_all[i]), axis=0))

    equal_examples = numpy.array(equal_examples)
    equal_labels = numpy.array(equal_labels)
    print equal_examples.shape

    equal_examples = numpy.reshape(equal_examples, (equal_examples.shape[0] * equal_examples.shape[1], 480))
    equal_labels = equal_labels.flatten()

    num_of_train = int(19 * float(equal_examples.shape[0]) / 20)
    num_of_val = int(1 * float(equal_examples.shape[0]) / 20)
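
    # Split the class-balanced pool into training (19/20) and validation (1/20)
    # subsets by sampling row indices at random without replacement; any
    # remaining rows are discarded.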
    print num_of_train
    print num_of_val

    mask_train = random.sample(xrange(equal_examples.shape[0]), num_of_train)
    X = equal_examples[mask_train]
    y = equal_labels[mask_train]
    equal_examples = numpy.delete(equal_examples, mask_train, 0)
    equal_labels = numpy.delete(equal_labels, mask_train, 0)

    mask_val = random.sample(xrange(equal_examples.shape[0]), num_of_val)
    Xv = equal_examples[mask_val]
    yv = equal_labels[mask_val]
    equal_examples = numpy.delete(equal_examples, mask_val, 0)
    equal_labels = numpy.delete(equal_labels, mask_val, 0)

    all_train_x = []
    all_train_y = []
    all_train_sizes = []

    all_examples = [X, Xt, Xv]
    all_labels = [y, yt, yv]

    return [all_examples, all_labels, X.shape[0], Xt.shape[0], Xv.shape[0]]


class CNN_1D(object):

    def __init__(self, rng, input, dropout_rates, batch_size, use_bias=True):

        self.layers = []
        self.dropout_layers = []

        print '... building the model'

        batchsize = batch_size
        filter_size = 10
        num_chs = [32, 64, 128]

        # Input: one 1 x 480 FEATURE vector per example.
        image_shape = (batchsize, 1, 1, 480)
        filter_shape = (num_chs[0], 1, 1, filter_size)

        layer0_input = input.reshape(image_shape)

        next_dropout_input = layer0_input
        next_layer_input = layer0_input

        ##################################
        # First 1D convolutional layer: the dropout path is used for training;
        # the parallel non-dropout path shares the same weights scaled by (1 - p).
        Dropout_layer3 = DropoutCNNLayer(rng=rng,
                                         input=next_dropout_input,
                                         filter_shape=filter_shape,
                                         image_shape=image_shape,
                                         dropout_rate=dropout_rates[0])
        self.dropout_layers.append(Dropout_layer3)
        next_dropout_input = Dropout_layer3.output

        layer3 = Conv_1D(rng=rng,
                         input=next_layer_input,
                         filter_shape=filter_shape,
                         image_shape=image_shape,
                         W=Dropout_layer3.W * (1 - dropout_rates[0]),
                         b=Dropout_layer3.b * (1 - dropout_rates[0]))
        self.layers.append(layer3)
        next_layer_input = layer3.output

        # Second 1D convolutional layer.
        image_shape = (batchsize, num_chs[0], 1, (480 - filter_size) + 1)
        filter_shape = (num_chs[1], num_chs[0], 1, filter_size)

        Dropout_layer4 = DropoutCNNLayer(rng=rng,
                                         input=next_dropout_input,
                                         filter_shape=filter_shape,
                                         image_shape=image_shape,
                                         dropout_rate=dropout_rates[1])
        self.dropout_layers.append(Dropout_layer4)
        next_dropout_input = Dropout_layer4.output.flatten(2)

        layer4 = Conv_1D(rng=rng,
                         input=next_layer_input,
                         filter_shape=filter_shape,
                         image_shape=image_shape,
                         # scale the weight matrix W with (1-p)
                         W=Dropout_layer4.W * (1 - dropout_rates[1]),
                         b=Dropout_layer4.b * (1 - dropout_rates[1]))
        self.layers.append(layer4)
        next_layer_input = layer4.output.flatten(2)

        # Softmax output layer over the flattened feature maps.
        image_shape = (batchsize, num_chs[1], 1, (480 - 2 * filter_size) + 2)
        n_in = num_chs[1] * (482 - 2 * filter_size)

        Dropout_layer5 = LogisticRegression(input=next_dropout_input,
                                            n_in=n_in, n_out=2)
        self.dropout_layers.append(Dropout_layer5)

        layer5 = LogisticRegression(input=next_layer_input,
                                    W=Dropout_layer5.W,
                                    b=Dropout_layer5.b,
                                    n_in=n_in, n_out=2)
        self.layers.append(layer5)

        self.dropout_negative_log_likelihood = self.dropout_layers[-1].negative_log_likelihood
        self.dropout_errors = self.dropout_layers[-1].errors

        self.negative_log_likelihood = self.layers[-1].negative_log_likelihood
        self.errors = self.layers[-1].errors

        self.L2_sqr = T.sum(Dropout_layer3.W ** 2) + T.sum(Dropout_layer4.W ** 2) + T.sum(Dropout_layer5.W ** 2)

        self.params = [param for layer in self.dropout_layers for param in layer.params]


def train_CNN_1D(fold, learning_rate=0.002, n_epochs=10, batch_size=20, reg=5e-6,
                 dropout=True, dropout_rates=[0.3, 0.3]):

    rng = numpy.random.RandomState(23455)

    [all_examples, all_labels, train_size, test_size, val_size] = load_FEATURE_data(fold)

    Xtr = all_examples[0]
    Xt = all_examples[1]
    Xv = all_examples[2]
    ytr = all_labels[0]
    yt = all_labels[1]
    yv = all_labels[2]

    n_train_batches = train_size / batch_size
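
    # Batch counts use Python 2 integer division, so examples beyond the last
    # full minibatch in each split are simply not used.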
    n_valid_batches = val_size
    n_test_batches = test_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size

    # Add singleton channel/height axes so each example is a 1 x 1 x 480 "image".
    Xtr = Xtr[:, numpy.newaxis, numpy.newaxis, :]
    Xt = Xt[:, numpy.newaxis, numpy.newaxis, :]
    Xv = Xv[:, numpy.newaxis, numpy.newaxis, :]

    test_set_x, test_set_y = shared_dataset(Xt, yt)
    valid_set_x, valid_set_y = shared_dataset(Xv, yv)

    index = T.lscalar()  # index to a [mini]batch
    dtensor4 = T.TensorType('float32', (False,) * 4)
    x = dtensor4('x')
    y = T.ivector('y')

    classifier = CNN_1D(rng=rng, input=x, batch_size=batch_size, dropout_rates=dropout_rates)

    L2_sqr = classifier.L2_sqr
    cost = classifier.negative_log_likelihood(y)
    dropout_cost = classifier.dropout_negative_log_likelihood(y) + reg * L2_sqr

    print '... building the model'

    # Compile theano function for testing.
    test_model = theano.function(inputs=[index],
                                 outputs=classifier.errors(y),
                                 givens={
                                     x: test_set_x[index * batch_size:(index + 1) * batch_size],
                                     y: test_set_y[index * batch_size:(index + 1) * batch_size]})

    validate_model = theano.function(inputs=[index],
                                     outputs=classifier.errors(y),
                                     givens={
                                         x: valid_set_x[index * batch_size:(index + 1) * batch_size],
                                         y: valid_set_y[index * batch_size:(index + 1) * batch_size]})

    output = dropout_cost if dropout else cost

    grads = []
    for param in classifier.params:
        # Use the right cost function here to train with or without dropout.
        gparam = T.grad(output, param)
        grads.append(gparam)

    # Plain SGD updates.
    updates = []
    for param_i, grad_i in zip(classifier.params, grads):
        updates.append((param_i, param_i - learning_rate * grad_i))

    train_model = theano.function([x, y], cost, updates=updates)

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    # early-stopping parameters
    patience = 100000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
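
    # Training loop: each minibatch is copied into fresh shared variables and
    # fed to train_model; every validation_frequency iterations the model is
    # scored on the validation set, and whenever a new best validation error
    # is reached the test error is reported and the weights are dumped.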
    start_time = time.clock()

    cost_ij = 0
    epoch = 0
    done_looping = False
    iter = 0
    startc = time.clock()

    cost_history = []
    train_history = []
    valid_history = []

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):

            X_train = Xtr[minibatch_index * batch_size: (minibatch_index + 1) * batch_size]
            y_train = ytr[minibatch_index * batch_size: (minibatch_index + 1) * batch_size]

            print "X_train:"
            print X_train.shape
            print "y_train:"
            print y_train.shape

            train_set_x, train_set_y = shared_dataset(X_train, y_train)

            iter = (epoch - 1) * n_train_batches + minibatch_index

            cost_ij = train_model(train_set_x.eval(), train_set_y.eval())
            cost_history.append(cost_ij)

            if (iter + 1) % validation_frequency == 0:

                list_file = open('../progress/progress_FSCNN_FEATURE_CNN_1D_' + site + '_' + str(fold) + '.txt', 'a')

                validation_losses = [validate_model(i) for i in range(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                valid_history.append(100 * (1 - this_validation_loss))

                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))
                list_file.write('epoch %i, minibatch %i/%i, validation error %f %%' %
                                (epoch, minibatch_index + 1, n_train_batches,
                                 this_validation_loss * 100.))
                list_file.write('\n')

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if the loss improvement is good enough
                    if this_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # evaluate the new best model on the test set
                    test_losses = [test_model(i) for i in range(n_test_batches)]
                    test_score = numpy.mean(test_losses)

                    print(('    epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))
                    list_file.write(('    epoch %i, minibatch %i/%i, test error of '
                                     'best model %f %%') %
                                    (epoch, minibatch_index + 1, n_train_batches,
                                     test_score * 100.))
                    list_file.write('\n')

                    # checkpoint the weights of the current best model
                    dump_weights_pickle(classifier,
                                        '../results/weights/weight_FEATURE_1DCNN_' + site + '_' + str(fold) + '.zip')

                list_file.close()


def dump_weights_pickle(classifier, file_name):
    # Pickle the weights and biases of the three trained layers.
    W3 = classifier.params[0]
    W4 = classifier.params[2]
    W5 = classifier.params[4]

    b3 = classifier.params[1]
    b4 = classifier.params[3]
    b5 = classifier.params[5]

    with open(file_name, 'wb') as f:
        dump((W3, W4, W5, b3, b4, b5), f)


if __name__ == '__main__':

    target_RES = sys.argv[1]
    target_ATOM = sys.argv[2]
    site = sys.argv[3]
    fold = int(sys.argv[4])
    total_fold = 5
    result_weights_ID = target_RES + "_" + target_ATOM + "_" + site

    train_CNN_1D(fold=fold, learning_rate=0.002, n_epochs=20, batch_size=20, reg=5e-6)
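

# Example invocation (a sketch; the script name and the residue/atom/site
# values below are illustrative placeholders, not values from the original study):
#
#   python train_1DCNN_FEATURE.py ASP OD1 EF_HAND_1 0
#
# i.e. sys.argv supplies target_RES, target_ATOM, the PROSITE site name, and
# the held-out test fold (0-4 of the 5-fold split).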