import os
import sys
import time
import random
import re
import math
from collections import OrderedDict
from pprint import pprint

import numpy
import tables
from scipy.io import matlab

import theano
import theano.tensor as T
from theano import shared
from theano.tensor.shared_randomstreams import RandomStreams
from theano.tensor.signal import downsample
from theano.tensor.nnet import conv
from theano.tensor.nnet import conv3d2d
from theano.misc.pkl_utils import dump

from store_pytable_NOS import *
from layers import *


def dump_weights_pickle(Weights, Bias, file_name):
    """Serialize the weights and biases of all four layers with Theano's pickle-friendly dump."""
    W0, W1, W2, W3 = Weights[0], Weights[1], Weights[2], Weights[3]
    b0, b1, b2, b3 = Bias[0], Bias[1], Bias[2], Bias[3]
    with open(file_name, 'wb') as f:
        dump((W0, W1, W2, W3, b0, b1, b2, b3), f)


def shared_dataset(data_x, data_y, borrow=True):
    """Load a data/label pair into Theano shared variables (labels cast to int32)."""
    shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX), borrow=borrow)
    shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX), borrow=borrow)
    return shared_x, T.cast(shared_y, 'int32')


def numpy_floatX(data):
    return numpy.asarray(data, dtype=theano.config.floatX)


def load_FS_data(fold):
    """Load positive/negative voxel grids from PyTables, hold out 1/total_fold of each class as
    the test fold, balance the classes by tiling the minority class, and split the remainder
    into training (19/20, chunked into 20 parts) and validation (1/20) sets."""
    dataName = "data"
    labelName = "label"
    labelShape = []

    # positive examples
    pos_or_neg = 'pos'
    ID = target_RES + '.' + target_ATOM + '.' + pos_or_neg
    filename_train_pos = "../data/pytables/train_data_" + ID + ".pytables"
    h5file_train = tables.openFile(filename_train_pos, mode="r")
    dataColumn_train = getH5column(h5file_train, dataName)
    labelColumn_train = getH5column(h5file_train, labelName)
    X_pos = dataColumn_train[:]
    y_pos = labelColumn_train[:]
    print "X_pos.shape"
    print X_pos.shape

    # negative examples
    pos_or_neg = 'neg'
    ID = target_RES + '.' + target_ATOM + '.' + pos_or_neg
    filename_train_neg = "../data/pytables/train_data_" + ID + ".pytables"
    h5file_train = tables.openFile(filename_train_neg, mode="r")
    dataColumn_train = getH5column(h5file_train, dataName)
    labelColumn_train = getH5column(h5file_train, labelName)
    X_neg = dataColumn_train[:]
    y_neg = labelColumn_train[:]

    # hold out the fold-th slice of each class as the test set
    num_of_pos_test = X_pos.shape[0] / total_fold
    num_of_neg_test = X_neg.shape[0] / total_fold
    mask_pos_test = range(num_of_pos_test * fold, num_of_pos_test * (fold + 1))
    mask_neg_test = range(num_of_neg_test * fold, num_of_neg_test * (fold + 1))
    print mask_pos_test
    Xt_pos = X_pos[mask_pos_test]
    yt_pos = y_pos[mask_pos_test]
    X_pos = numpy.delete(X_pos, mask_pos_test, 0)
    y_pos = numpy.delete(y_pos, mask_pos_test, 0)
    Xt_neg = X_neg[mask_neg_test]
    yt_neg = y_neg[mask_neg_test]
    X_neg = numpy.delete(X_neg, mask_neg_test, 0)
    y_neg = numpy.delete(y_neg, mask_neg_test, 0)
    Xt = numpy.concatenate((Xt_pos, Xt_neg), axis=0)
    yt = numpy.concatenate((yt_pos, yt_neg), axis=0)

    # balance the classes: pair each slice of the larger class with a copy of the smaller one
    s = [X_pos.shape[0], X_neg.shape[0]]
    data_size = min(s)
    min_index = s.index(data_size)
    print "pos:" + str(s[0])
    print "neg:" + str(s[1])
    X_pos_all = []
    y_pos_all = []
    X_neg_all = []
    y_neg_all = []
    if min_index == 0:
        ratio = float(s[1]) / s[0]
        r = int(numpy.floor(ratio))
        print "more neg, ratio=" + str(ratio)
        for i in range(0, r):
            X_pos_all.append(X_pos)
            y_pos_all.append(y_pos)
            X_neg_all.append(X_neg[i * data_size:(i + 1) * data_size])
            y_neg_all.append(y_neg[i * data_size:(i + 1) * data_size])
    elif min_index == 1:
        ratio = float(s[0]) / s[1]
        r = int(numpy.floor(ratio))
        print "more pos, ratio=" + str(ratio)
        for i in range(0, r):
            X_pos_all.append(X_pos[i * data_size:(i + 1) * data_size])
            y_pos_all.append(y_pos[i * data_size:(i + 1) * data_size])
            X_neg_all.append(X_neg)
            y_neg_all.append(y_neg)
    equal_examples = []
    equal_labels = []
    for i in range(0, len(X_pos_all)):
        equal_examples.append(numpy.concatenate((X_pos_all[i], X_neg_all[i]), axis=0))
        equal_labels.append(numpy.concatenate((y_pos_all[i], y_neg_all[i]), axis=0))
    # stack the balanced chunks into one array of (N, channels, x, y, z) voxel grids
    equal_examples = numpy.array(equal_examples)
    equal_labels = numpy.array(equal_labels)
    equal_examples = numpy.reshape(equal_examples,
                                   (equal_examples.shape[0] * equal_examples.shape[1], 4, 20, 20, 20))
    equal_labels = equal_labels.flatten()

    # random 19:1 split of the remaining data into training and validation sets
    num_of_train = int(19 * float(equal_examples.shape[0]) / 20)
    num_of_val = int(1 * float(equal_examples.shape[0]) / 20)
    print num_of_train
    print num_of_val
    mask_train = random.sample(xrange(equal_examples.shape[0]), num_of_train)
    X = equal_examples[mask_train]
    y = equal_labels[mask_train]
    equal_examples = numpy.delete(equal_examples, mask_train, 0)
    equal_labels = numpy.delete(equal_labels, mask_train, 0)
    mask_val = random.sample(xrange(equal_examples.shape[0]), num_of_val)
    Xv = equal_examples[mask_val]
    yv = equal_labels[mask_val]
    equal_examples = numpy.delete(equal_examples, mask_val, 0)
    equal_labels = numpy.delete(equal_labels, mask_val, 0)

    # chunk the training set into 20 parts so it can be moved to the GPU piecewise
    all_train_x = []
    all_train_y = []
    all_train_sizes = []
    train_size = X.shape[0]
    part = train_size / 20
    for i in range(0, 20):
        mask = range(part * i, part * (i + 1))
        Xtr = X[mask]
        ytr = y[mask]
        all_train_x.append(Xtr)
        all_train_y.append(ytr)
        all_train_sizes.append(Xtr.shape[0])

    all_examples = [all_train_x, Xt, Xv]
    all_labels = [all_train_y, yt, yv]
    return [all_examples, all_labels, 4, all_train_sizes, Xt.shape[0], Xv.shape[0]]


class Conv_3d_Layer(object):
    """3D convolutional layer: pad the input by one voxel on each spatial side,
    convolve with conv3d2d, add a per-channel bias and apply a ReLU."""

    def __init__(self, rng, input, filter_shape, image_shape, W=None, b=None):
        # image_shape  = (batchsize, in_time, in_channels, in_height, in_width)
        # filter_shape = (flt_channels, flt_time, in_channels, flt_height, flt_width)
        (batchsize, in_time, in_channels, in_height, in_width) = image_shape
        pad_image_shape = (batchsize, in_time + 2, in_channels, in_height + 2, in_width + 2)
        self.input = input
        assert image_shape[2] == filter_shape[2]

        # initialize weights with Glorot-style uniform random values
        fan_in = numpy.prod(filter_shape[2:])
        fan_out = filter_shape[0] * numpy.prod(filter_shape[3:])
        W_bound = numpy.sqrt(6. / (fan_in + fan_out))
        if W is None:
            self.W = theano.shared(numpy.asarray(rng.uniform(low=-W_bound, high=W_bound, size=filter_shape),
                                                 dtype=theano.config.floatX), borrow=True)
        else:
            self.W = W
        if b is None:
            b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX)
            self.b = theano.shared(value=b_values, borrow=True)
        else:
            self.b = b

        pad_layer = padding_3D(input=input, data_shape=image_shape)
        pad_inp = pad_layer.output
        # print "pad_inp.shape.eval"
        # print pad_inp.shape.eval()
        conv_out5D = conv3d2d.conv3d(signals=pad_inp, filters=self.W,
                                     signals_shape=pad_image_shape, filters_shape=filter_shape)
        # the bias is broadcast over the channel axis (Theano left-pads the missing batch dimension)
        self.output = relu(conv_out5D + self.b.dimshuffle('x', 0, 'x', 'x'))
        self.params = [self.W, self.b]


class Pad_Conv_Pool(object):
    """Padded 3D convolution followed by 2x2x2 max pooling."""

    def __init__(self, rng, input, filter_shape, image_shape, W=None, b=None):
        assert image_shape[2] == filter_shape[2]
        fan_in = numpy.prod(filter_shape[2:])
        fan_out = filter_shape[0] * numpy.prod(filter_shape[3:])
        W_bound = numpy.sqrt(6. / (fan_in + fan_out))
        if W is None:
            self.W = theano.shared(numpy.asarray(rng.uniform(low=-W_bound, high=W_bound, size=filter_shape),
                                                 dtype=theano.config.floatX), borrow=True)
        else:
            self.W = W
        if b is None:
            b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX)
            self.b = theano.shared(value=b_values, borrow=True)
        else:
            self.b = b
        self.input = input

        pad_conv_layer = Conv_3d_Layer(rng=rng, input=input, filter_shape=filter_shape,
                                       image_shape=image_shape, W=self.W, b=self.b)
        # PoolLayer3D pools over the trailing axes, so swap channels and time before and after pooling
        pool_layer = PoolLayer3D(input=pad_conv_layer.output.dimshuffle(0, 2, 1, 3, 4), pool_shape=(2, 2, 2))
        self.output = pool_layer.output.dimshuffle(0, 2, 1, 3, 4)
        self.params = [self.W, self.b]


class fine_pool_S_CNN_dA(object):
    """Stack of two Pad_Conv_Pool layers, one Conv_3d_Layer and a LogisticRegression output layer."""

    def __init__(self, numpy_rng, signals_shape, filters_shape, input, theano_rng=None,
                 filter_sizes=[100, 200, 400], n_outs=2, corruption_levels=[0.1, 0.1, 0.1]):
        self.CNN_pool_layers = []
        self.tparams = OrderedDict()
        self.params = []
        self.n_layers = len(filter_sizes)
        assert self.n_layers > 0

        (batchsize, in_time, in_channels, in_height, in_width) = signals_shape
        (flt_channels, flt_time, in_channels, flt_height, flt_width) = filters_shape
        assert flt_channels == filter_sizes[0]
        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        # allocate symbolic variables for the data
        dtensor5 = T.TensorType('float32', (False,) * 5)

        # conv + pool layers: each halves the spatial dimensions
        for i in xrange(self.n_layers - 1):
            if i == 0:
                image_shape = signals_shape
                filter_shape = filters_shape
            else:
                image_shape = (batchsize, int(in_time / math.pow(2, i)), filter_sizes[i - 1],
                               int(in_height / math.pow(2, i)), int(in_width / math.pow(2, i)))
                filter_shape = (filter_sizes[i], flt_time, filter_sizes[i - 1], flt_height, flt_width)
            if i == 0:
                layer_input = input
            else:
                layer_input = self.CNN_pool_layers[-1].output
            print "layer " + str(i)
            print "signal shape:"
            print image_shape
            print "filter shape:"
            print filter_shape
            CNN_pool_layer = Pad_Conv_Pool(rng=numpy_rng, input=layer_input,
                                           filter_shape=filter_shape, image_shape=image_shape)
            # add the layer to our list of layers
            self.CNN_pool_layers.append(CNN_pool_layer)
            self.params.extend(CNN_pool_layer.params)
            self.tparams['W_' + str(i)] = CNN_pool_layer.W
            self.tparams['b_' + str(i)] = CNN_pool_layer.b
            print "CNN layer W:"
            print CNN_pool_layer.W.shape.eval()
            print "CNN layer b:"
            print CNN_pool_layer.b.shape.eval()

        # final convolutional layer (no pooling)
        i = i + 1
        image_shape = (batchsize, int(in_time / math.pow(2, i)), filter_sizes[i - 1],
                       int(in_height / math.pow(2, i)), int(in_width / math.pow(2, i)))
        filter_shape = (filter_sizes[i], flt_time, filter_sizes[i - 1], flt_height, flt_width)
        layer_input = self.CNN_pool_layers[-1].output
        print "signal shape:"
        print image_shape
        print "filter shape:"
        print filter_shape
        self.CNN_layer = Conv_3d_Layer(rng=numpy_rng, input=layer_input,
                                       filter_shape=filter_shape, image_shape=image_shape)
        self.params.extend(self.CNN_layer.params)
        self.tparams['W_2'] = self.CNN_layer.W
        self.tparams['b_2'] = self.CNN_layer.b

        # logistic regression on the flattened feature maps
        self.logLayer = LogisticRegression(
            input=self.CNN_layer.output.flatten(2),
            n_in=int((filter_sizes[-1] * in_time * in_height * in_width)
                     / (math.pow(2, i) * math.pow(2, i) * math.pow(2, i))),
            n_out=n_outs,
            W=None,
            b=None,
        )
        self.params.extend(self.logLayer.params)
        self.tparams['W_log'] = self.logLayer.W
        self.tparams['b_log'] = self.logLayer.b

        # L2 penalty over all weight matrices
        self.L2_sqr = (T.sum(self.CNN_pool_layers[0].W ** 2) + T.sum(self.CNN_pool_layers[1].W ** 2)
                       + T.sum(self.CNN_layer.W ** 2) + T.sum(self.logLayer.W ** 2))


def test_fine_S_CNN_dA(fold, finetune_lr=0.003, training_epochs=7, batch_size=5, reg=1e-7):
    """Build the 3D CNN, train it with RMSprop on the given cross-validation fold,
    and dump the learned weights after every epoch."""
    print "fold " + str(fold)

    [all_examples, all_labels, in_channels,
     all_train_sizes, test_size, val_size] = load_FS_data(fold)
    Xtr = all_examples[0]
    Xt = all_examples[1]
    Xv = all_examples[2]
    ytr = all_labels[0]
    yt = all_labels[1]
    yv = all_labels[2]
    n_train_batches = [a / batch_size for a in all_train_sizes]
    n_valid_batches = val_size
    n_test_batches = test_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size

    numpy_rng = numpy.random.RandomState(89677)
    print '... building the model'

    flt_channels = 32
    flt_time = 3
    in_channels = 4
    flt_height = 3
    flt_width = 3
    batchsize = batch_size
    in_time = 20
    in_height = 20
    in_width = 20
    signals_shape = (batchsize, in_time, in_channels, in_height, in_width)
    filters_shape = (flt_channels, flt_time, in_channels, flt_height, flt_width)

    # construct the 3D convolutional network
    dtensor5 = T.TensorType('float32', (False,) * 5)
    x = dtensor5('x')
    y = T.ivector('y')
    s_cnn_da = fine_pool_S_CNN_dA(input=x,
                                  numpy_rng=numpy_rng,
                                  signals_shape=signals_shape,
                                  filters_shape=filters_shape,
                                  filter_sizes=[32, 64, 128],
                                  n_outs=2,
                                  corruption_levels=[0., 0., 0.])

    finetune_cost = s_cnn_da.logLayer.negative_log_likelihood(y)
    errors = s_cnn_da.logLayer.errors(y)

    print '... getting the finetuning functions'
    index = T.lscalar('index')
    lr = T.scalar(name='lr')
    tparams = s_cnn_da.tparams
    cost = finetune_cost + (reg * s_cnn_da.L2_sqr)
    grads = T.grad(cost, tparams.values())

    # RMSprop: keep running averages of the gradients and the squared gradients
    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_grads = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rgrad' % k)
                     for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]
    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) for rg2, g in zip(running_grads2, grads)]
    train_fn = theano.function([x, y], cost, updates=zgup + rgup + rg2up,
                               name='rmsprop_f_grad_shared')

    updir = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_updir' % k)
             for k, p in tparams.iteritems()]
    updir_new = [(ud, 0.9 * ud - 1e-4 * zg / T.sqrt(rg2 - rg ** 2 + 1e-4))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads, running_grads2)]
    param_up = [(p, p + udn[1]) for p, udn in zip(tparams.values(), updir_new)]
    f_update = theano.function([lr], [], updates=updir_new + param_up,
                               on_unused_input='ignore', name='rmsprop_f_update')

    index = T.lscalar('index')  # index to a [mini]batch
    train_score_i = theano.function([x, y], errors, name='train')
    test_score_i = theano.function([x, y], errors, name='test')
    valid_score_i = theano.function([x, y], errors, name='valid')

    print '... finetuning the model'
    # early-stopping parameters
    patience = 100000  # look at this many examples regardless
    patience_increase = 2.  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is considered significant
    validation_frequency = min(n_train_batches[0], patience / 2)
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch

    best_validation_loss = numpy.inf
    best_test_score = numpy.inf
    test_score = 0.
    start_time = time.clock()
    done_looping = False
    epoch = 0

    while (epoch < training_epochs) and (not done_looping):
        list_file = open('../progress/progress_NOS_' + result_weights_ID + '_' + str(fold) + '.txt', 'a')
        epoch = epoch + 1

        # training: iterate over the 20 parts of the training set
        train_losses = []
        for part_index in xrange(20):
            for minibatch_index in xrange(n_train_batches[part_index]):
                X_train = Xtr[part_index][minibatch_index * batch_size: (minibatch_index + 1) * batch_size]
                y_train = ytr[part_index][minibatch_index * batch_size: (minibatch_index + 1) * batch_size]
                train_set_x, train_set_y = shared_dataset(X_train, y_train)
                # reorder to (batch, time, channels, height, width) as expected by the network
                train_set_x = train_set_x.dimshuffle(0, 4, 1, 2, 3)
                minibatch_avg_cost = train_fn(train_set_x.eval(), train_set_y.eval())
                train_err = train_score_i(train_set_x.eval(), train_set_y.eval())
                train_losses.append(train_err)
                f_update(finetune_lr)
                iter = (epoch - 1) * n_train_batches[part_index] + minibatch_index

        train_losses = numpy.array(train_losses)
        this_train_loss = numpy.mean(train_losses)
        list_file.write('epoch %i, train error %f %%' % (epoch, this_train_loss * 100.))
        list_file.write('\n')

        # validation
        validation_losses = []
        for i in xrange(n_valid_batches):
            valid_set_x, valid_set_y = shared_dataset(Xv[i * batch_size:(i + 1) * batch_size],
                                                      yv[i * batch_size:(i + 1) * batch_size])
            valid_set_x = valid_set_x.dimshuffle(0, 4, 1, 2, 3)
            validation_losses.append(valid_score_i(valid_set_x.eval(), valid_set_y.eval()))
        this_validation_loss = numpy.mean(validation_losses)
        print('epoch %i, validation error %f %%' % (epoch, this_validation_loss * 100.))
        list_file.write('epoch %i, validation error %f %%' % (epoch, this_validation_loss * 100.))
        list_file.write('\n')

        # test
        test_losses = []
        for i in xrange(n_test_batches):
            test_set_x, test_set_y = shared_dataset(Xt[i * batch_size:(i + 1) * batch_size],
                                                    yt[i * batch_size:(i + 1) * batch_size])
            test_set_x = test_set_x.dimshuffle(0, 4, 1, 2, 3)
            test_losses.append(test_score_i(test_set_x.eval(), test_set_y.eval()))
        test_score = numpy.mean(test_losses)
        print((' epoch %i, test error of best model %f %%') % (epoch, test_score * 100.))
        list_file.write((' epoch %i, test error of best model %f %%') % (epoch, test_score * 100.))
        list_file.write('\n')
        list_file.close()

        # checkpoint the current weights after every epoch
        W0 = s_cnn_da.CNN_pool_layers[0].W.get_value()
        b0 = s_cnn_da.CNN_pool_layers[0].b.get_value()
        W1 = s_cnn_da.CNN_pool_layers[1].W.get_value()
        b1 = s_cnn_da.CNN_pool_layers[1].b.get_value()
        W2 = s_cnn_da.CNN_layer.W.get_value()
        b2 = s_cnn_da.CNN_layer.b.get_value()
        W3 = s_cnn_da.logLayer.W.get_value()
        b3 = s_cnn_da.logLayer.b.get_value()
        Weights = [W0, W1, W2, W3]
        Bias = [b0, b1, b2, b3]
        dump_weights_pickle(Weights, Bias,
                            file_name='../results/weights/weight_' + result_weights_ID + '_' + str(fold) + '.zip')

        # if we got the best validation score until now
        if this_validation_loss <= best_validation_loss:
            # improve patience if loss improvement is good enough
            if this_validation_loss < best_validation_loss * improvement_threshold:
                patience = max(patience, iter * patience_increase)
            # save best validation score and iteration number
            best_validation_loss = this_validation_loss
            best_iter = iter
            best_test_score = test_score * 100.

    end_time = time.clock()
    print(('Optimization complete with best validation score of %f %%, '
           'on iteration %i, '
           'with test performance %f %%')
          % (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The training code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    # save the final weights
    final_weights = s_cnn_da.params
    W0 = s_cnn_da.CNN_pool_layers[0].W.get_value()
    b0 = s_cnn_da.CNN_pool_layers[0].b.get_value()
    W1 = s_cnn_da.CNN_pool_layers[1].W.get_value()
    b1 = s_cnn_da.CNN_pool_layers[1].b.get_value()
    W2 = s_cnn_da.CNN_layer.W.get_value()
    b2 = s_cnn_da.CNN_layer.b.get_value()
    W3 = s_cnn_da.logLayer.W.get_value()
    b3 = s_cnn_da.logLayer.b.get_value()
    Weights = [W0, W1, W2, W3]
    Bias = [b0, b1, b2, b3]
    dump_weights_pickle(Weights, Bias,
                        file_name='../results/weights/weight_' + result_weights_ID + '_' + str(fold) + '.zip')


if __name__ == '__main__':
    # usage (only three arguments are parsed; learning rate and epochs are set below):
    # THEANO_FLAGS=device=gpu,floatX=float32 python FSCNN_sulfur.py CYS SG 0
    target_RES = sys.argv[1]
    target_ATOM = sys.argv[2]
    begin = int(sys.argv[3])
    lr = 0.003
    epoch = 10
    total_fold = 5
    result_weights_ID = target_RES + "_" + target_ATOM
    test_scores = []
    for fold in range(begin, begin + 1):
        test_fine_S_CNN_dA(fold=fold, finetune_lr=lr, training_epochs=epoch)