# Author: Wen Torng and Russ B. Altman (2018)
# Framework for training FEATURE-SVM functional site models
"""Train and evaluate an RBF-kernel SVM on 480-dimensional FEATURE vectors.

The script is driven from the command line::

    python <script> TARGET_RES TARGET_ATOM SITE FOLD

The four arguments are stored in the module-level globals ``target_RES``,
``target_ATOM``, ``site`` and (via ``total_fold``) the number of folds; the
functions below read those globals, so they must be set before any of the
functions is called.
"""

import math
import os
import re
import sys
import time
from collections import OrderedDict

import numpy
import theano
import theano.tensor as T
from scipy.io import matlab

from layers import *  # noqa: F401,F403

# The names below were imported *after* the wildcard import in the original
# file; that order is kept so nothing exported by ``layers`` can shadow them.
import argparse
import random
from theano.misc.pkl_utils import dump

try:  # Python 2
    import cPickle as pickle
except ImportError:  # Python 3
    import pickle

# Number of values read from every ``Env_`` line of a .ff file.
N_FEATURES = 480

# Input / output locations (relative to the working directory).
SITE_FF_PREFIX = '../data/PROSITE_TP_TN/site_FF/'
RESULT_PREFIX = '../results/prob_score/FEATURE_SVM/'
WEIGHT_PREFIX = '../results/weights/FEATURE_SVM_'

# Candidate values of the SVM penalty parameter C, tried in this order.
C_GRID = (2e-5, 2e-4, 2e-3, 0.25, 0.5, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512)


def numpy_floatX(data):
    """Return *data* as a numpy array with dtype ``theano.config.floatX``."""
    return numpy.asarray(data, dtype=theano.config.floatX)


def _read_feature_vectors(path):
    """Parse a .ff file into an ``(n_rows, N_FEATURES)`` float array.

    Only lines starting with ``Env_`` are used.  Each such line is split on
    whitespace; token 0 is skipped and tokens 1..N_FEATURES are converted to
    floats.

    Raises:
        ValueError: if an ``Env_`` line has fewer than ``N_FEATURES + 1``
            tokens.
    """
    rows = []
    with open(path) as handle:
        for line_no, line in enumerate(handle, 1):
            if not line.startswith('Env_'):
                continue
            fields = line.split()
            if len(fields) <= N_FEATURES:
                raise ValueError(
                    '%s, line %d: expected at least %d fields, found %d'
                    % (path, line_no, N_FEATURES + 1, len(fields)))
            rows.append([float(tok) for tok in fields[1:N_FEATURES + 1]])
    # reshape keeps the array two-dimensional even when no row was read.
    return numpy.array(rows, dtype=float).reshape(-1, N_FEATURES)


def _split_class(features, labels, fold):
    """Split one class into train / validation / test subsets.

    The test subset is the ``fold``-th contiguous chunk of
    ``len(features) // total_fold`` rows.  Of the remaining rows, a random
    sample of ``int(19 * n / 20)`` rows becomes the training subset and
    whatever is left becomes the validation subset.

    Returns:
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``
    """
    # Floor division: the chunk size is used as an index.
    n_test = features.shape[0] // total_fold
    test_idx = numpy.arange(n_test * fold, n_test * (fold + 1))
    X_test = features[test_idx]
    y_test = labels[test_idx]
    features = numpy.delete(features, test_idx, 0)
    labels = numpy.delete(labels, test_idx, 0)

    n_train = int(19 * float(features.shape[0]) / 20)
    # Integer dtype is forced so that an empty sample is still a valid index.
    train_idx = numpy.asarray(
        random.sample(range(features.shape[0]), n_train), dtype=int)
    X_train = features[train_idx]
    y_train = labels[train_idx]
    X_val = numpy.delete(features, train_idx, 0)
    y_val = numpy.delete(labels, train_idx, 0)

    return X_train, y_train, X_val, y_val, X_test, y_test


def load_FEATURE_data(fold):
    """Load positive/negative FEATURE vectors and split them for one fold.

    Reads ``<site>.<target_RES>.<target_ATOM>.pos.ff`` and the matching
    ``.neg.ff`` file from ``SITE_FF_PREFIX``.  Positives are labelled 1 and
    negatives 0.  Each class is split independently (see ``_split_class``)
    and the per-class subsets are concatenated, positives first.  Only the
    training set is shuffled afterwards.

    Relies on the module globals ``site``, ``target_RES``, ``target_ATOM``
    and ``total_fold``.

    Args:
        fold: zero-based index of the chunk held out as the test set.

    Returns:
        ``[all_examples, all_labels, n_train, n_test, n_val]`` where
        ``all_examples`` is ``[X_train, X_test, X_val]`` and ``all_labels``
        is ``[y_train, y_test, y_val]``.
    """
    from sklearn.utils import shuffle

    prefix = SITE_FF_PREFIX + site + '.' + target_RES + '.' + target_ATOM + '.'
    X_pos = _read_feature_vectors(prefix + 'pos' + '.ff')
    X_neg = _read_feature_vectors(prefix + 'neg' + '.ff')
    y_pos = numpy.ones((X_pos.shape[0],))
    y_neg = numpy.zeros((X_neg.shape[0],))

    # Positives are split before negatives so the two random.sample calls
    # happen in the same order as in the original implementation.
    (X_tr_pos, y_tr_pos, Xv_pos, yv_pos,
     Xt_pos, yt_pos) = _split_class(X_pos, y_pos, fold)
    (X_tr_neg, y_tr_neg, Xv_neg, yv_neg,
     Xt_neg, yt_neg) = _split_class(X_neg, y_neg, fold)

    X = numpy.concatenate((X_tr_pos, X_tr_neg), axis=0)
    y = numpy.concatenate((y_tr_pos, y_tr_neg), axis=0)
    Xt = numpy.concatenate((Xt_pos, Xt_neg), axis=0)
    yt = numpy.concatenate((yt_pos, yt_neg), axis=0)
    Xv = numpy.concatenate((Xv_pos, Xv_neg), axis=0)
    yv = numpy.concatenate((yv_pos, yv_neg), axis=0)

    X, y = shuffle(X, y)

    all_examples = [X, Xt, Xv]
    all_labels = [y, yt, yv]
    return [all_examples, all_labels, X.shape[0], Xt.shape[0], Xv.shape[0]]


def _accuracy(predicted, expected):
    """Return the fraction of entries of *predicted* equal to *expected*."""
    return float(numpy.count_nonzero(predicted == expected)) / expected.shape[0]


def test_fine_S_CNN_dA(fold, learning_rate=0.002, n_epochs=10, batch_size=20, reg=5e-6, dropout=True, dropout_rates=[0.3,0.3]):
    """Grid-search C for an RBF SVM on one fold, then evaluate and save it.

    Steps:
      1. Load the fold with ``load_FEATURE_data`` and min-max scale every
         feature to [-1, 1] using statistics of the training set only.
      2. For each value in ``C_GRID`` fit an ``SVC`` and measure accuracy on
         the validation set; each result is appended to the ``.param`` file.
      3. Refit with the best C (first one wins on ties), predict the test
         set, and write predictions, class probabilities, true labels, the
         test accuracy and the pickled classifier to disk.

    Relies on the module globals ``site``, ``target_RES`` and ``target_ATOM``.

    Args:
        fold: zero-based index of the test fold.
        learning_rate, n_epochs, batch_size, reg, dropout, dropout_rates:
            not used by this function; kept so the signature stays unchanged
            for existing callers.  ``dropout_rates`` keeps its list default
            for the same reason and is never mutated here.

    Returns:
        None.  All results are written to files.
    """
    from sklearn import svm
    from sklearn.preprocessing import MinMaxScaler

    all_examples, all_labels, _, _, _ = load_FEATURE_data(fold)
    Xtr, Xt, Xv = all_examples
    ytr, yt, yv = all_labels

    scaling = MinMaxScaler(feature_range=(-1, 1)).fit(Xtr)
    Xtr = scaling.transform(Xtr)
    Xt = scaling.transform(Xt)
    Xv = scaling.transform(Xv)

    tag = target_RES + '_' + target_ATOM + '_' + site + '_fold_' + str(fold)
    param_path = RESULT_PREFIX + tag + '.param'

    kernel = 'rbf'
    best_c = None
    # Start below any reachable accuracy so the first candidate is always
    # accepted and best_c cannot remain None.
    best_acc = -1.0
    with open(param_path, 'a') as log:
        for c in C_GRID:
            # NOTE(review): probability=True is only needed for predict_proba
            # on the final model; dropping it here would likely speed up the
            # search -- confirm that predictions are unaffected before doing so.
            clf = svm.SVC(C=c, kernel=kernel, class_weight='balanced',
                          probability=True)
            clf.fit(Xtr, ytr)
            acc = _accuracy(clf.predict(Xv), yv)
            log.write('c:' + str(c) + ', acc:' + str(acc) + '\n')
            # Flush so progress is visible on disk while the search runs.
            log.flush()
            if acc > best_acc:
                best_c = c
                best_acc = acc

    clf = svm.SVC(C=best_c, kernel=kernel, class_weight='balanced',
                  probability=True)
    clf.fit(Xtr, ytr)
    pred_y = clf.predict(Xt)
    prob_y = clf.predict_proba(Xt)
    test_acc = _accuracy(pred_y, yt)

    pred_y.dump(RESULT_PREFIX + 'pred_y_' + tag + '.dat')
    prob_y.dump(RESULT_PREFIX + 'prob_y_' + tag + '.dat')
    yt.dump(RESULT_PREFIX + 'true_y_' + tag + '.dat')

    with open(param_path, 'a') as log:
        log.write('best c:' + str(best_c) + ', acc:' + str(test_acc) + '\n')

    with open(WEIGHT_PREFIX + tag + '.pkl', 'wb') as fid:
        pickle.dump(clf, fid)


if __name__ == '__main__':
    if len(sys.argv) < 5:
        sys.exit('usage: %s TARGET_RES TARGET_ATOM SITE FOLD' % sys.argv[0])

    target_RES = sys.argv[1]
    target_ATOM = sys.argv[2]
    site = sys.argv[3]
    fold = int(sys.argv[4])
    total_fold = 5

    if not 0 <= fold < total_fold:
        sys.exit('FOLD must be in the range 0..%d' % (total_fold - 1))

    result_weights_ID = target_RES+"_"+target_ATOM+"_"+site

    test_fine_S_CNN_dA(fold=fold)