# Author: Wen Torng and Russ B. Altman (2018)
# Framework for training Voxel-SVM functional site models
#
# Usage: <script> TARGET_RES TARGET_ATOM SITE FOLD
#
# NOTE: load_Voxel_data() and train_Voxel_SVM() read the module-level names
# `site`, `target_RES`, `target_ATOM` and `total_fold`, which are only assigned
# in the ``__main__`` section at the bottom of this file.

import os
import sys
import time
import random
import re
import math
from collections import OrderedDict

import numpy
from scipy.io import matlab

sys.path.insert(0, '../data_process/')
# `tables` and `getH5column` are not imported explicitly anywhere in this
# file; presumably they are provided by this star import -- TODO confirm.
from store_pytable import *

# Directory with the per-site positive / negative training PyTables files.
_PYTABLES_DIR = '../data/PROSITE_TP_TN/pytables_Voxel/'
# Directory receiving predictions, probabilities and true labels.
_SCORE_DIR = '../results/prob_score/Voxel_SVM/'


def _load_class_arrays(pos_or_neg):
    """Read the "data" and "label" columns of one class file into memory.

    pos_or_neg -- 'pos' or 'neg'; selects which training file is opened.

    Returns the tuple (X, y) obtained by slicing both columns with ``[:]``.
    """
    ID = site + '.' + target_RES + '.' + target_ATOM + '.' + pos_or_neg
    filename = _PYTABLES_DIR + "train_data_" + ID + ".pytables"
    h5file = tables.openFile(filename, mode="r")
    try:
        # The columns are sliced before the file is closed; this assumes
        # `[:]` yields an in-memory array (as PyTables nodes do) -- verify
        # against getH5column if it returns something lazier.
        X = getH5column(h5file, "data")[:]
        y = getH5column(h5file, "label")[:]
    finally:
        # The original never closed either file handle.
        h5file.close()
    return X, y


def _hold_out_fold(X, y, fold):
    """Split off the contiguous test block belonging to `fold`.

    The block has X.shape[0] // total_fold samples and starts at
    fold * block_size.

    Returns (X_test, y_test, X_rest, y_rest).
    """
    # `//` keeps the floor division that Python 2's `/` performed on ints.
    block_size = X.shape[0] // total_fold
    test_idx = numpy.arange(block_size * fold, block_size * (fold + 1))
    X_test = X[test_idx]
    y_test = y[test_idx]
    X_rest = numpy.delete(X, test_idx, 0)
    y_rest = numpy.delete(y, test_idx, 0)
    return X_test, y_test, X_rest, y_rest


def _split_train_val(X, y):
    """Randomly pick int(19/20 * n) samples for training; the rest validate.

    Returns (X_train, y_train, X_val, y_val). Training rows come out in the
    order produced by random.sample; validation rows keep their input order.
    """
    num_of_train = int(19 * float(X.shape[0]) / 20)
    train_idx = random.sample(range(X.shape[0]), num_of_train)
    X_train = X[train_idx]
    y_train = y[train_idx]
    X_val = numpy.delete(X, train_idx, 0)
    y_val = numpy.delete(y, train_idx, 0)
    return X_train, y_train, X_val, y_val


def load_Voxel_data(fold):
    """Load positive/negative examples and split them for one CV fold.

    fold -- index of the cross-validation fold whose block of samples is
            held out for testing (block size is n // total_fold per class).

    Returns the 6-element list
        [all_examples, all_labels, 4, all_train_sizes, n_test, n_val]
    where all_examples = [X_train, X_test, X_val], all_labels holds the
    matching label arrays, the constant 4 is unpacked by train_Voxel_SVM as
    `in_channels`, and all_train_sizes is always an empty list.
    """
    from sklearn.utils import shuffle

    X_pos, y_pos = _load_class_arrays('pos')
    X_neg, y_neg = _load_class_arrays('neg')

    # Test set: the fold-th contiguous block of each class.
    Xt_pos, yt_pos, X_pos, y_pos = _hold_out_fold(X_pos, y_pos, fold)
    Xt_neg, yt_neg, X_neg, y_neg = _hold_out_fold(X_neg, y_neg, fold)
    Xt = numpy.concatenate((Xt_pos, Xt_neg), axis=0)
    yt = numpy.concatenate((yt_pos, yt_neg), axis=0)

    # Train / validation split of what is left, positives first so that the
    # random number stream is consumed in the same order as before.
    X_tr_pos, y_tr_pos, Xv_pos, yv_pos = _split_train_val(X_pos, y_pos)
    X_tr_neg, y_tr_neg, Xv_neg, yv_neg = _split_train_val(X_neg, y_neg)

    X = numpy.concatenate((X_tr_pos, X_tr_neg), axis=0)
    y = numpy.concatenate((y_tr_pos, y_tr_neg), axis=0)
    Xv = numpy.concatenate((Xv_pos, Xv_neg), axis=0)
    yv = numpy.concatenate((yv_pos, yv_neg), axis=0)

    # Only the training set is shuffled; test and validation stay pos-then-neg.
    X, y = shuffle(X, y)

    all_train_sizes = []
    all_examples = [X, Xt, Xv]
    all_labels = [y, yt, yv]
    return [all_examples, all_labels, 4, all_train_sizes,
            Xt.shape[0], Xv.shape[0]]


def train_Voxel_SVM(fold):
    """Train an SVM on one fold and write predictions and the model to disk.

    fold -- cross-validation fold index, forwarded to load_Voxel_data.

    Writes under '../results/prob_score/Voxel_SVM/' the files pred_y_*,
    prob_y_* and true_y_* (via ndarray.dump) for the test set, and pickles
    the fitted classifier to '../results/weights/Voxel_SVM_*.pkl'.
    Returns None.
    """
    from sklearn.preprocessing import MinMaxScaler
    from sklearn import svm
    try:
        import cPickle as pickle  # Python 2
    except ImportError:
        import pickle

    # Single-argument parenthesised form prints the same text on Python 2.
    print("fold " + str(fold))

    [all_examples, all_labels, in_channels, all_train_sizes,
     test_size, val_size] = load_Voxel_data(fold)

    # Flatten every sample to one feature vector per row.
    Xtr = numpy.reshape(all_examples[0], (all_examples[0].shape[0], -1))
    Xt = numpy.reshape(all_examples[1], (all_examples[1].shape[0], -1))
    ytr = all_labels[0]
    yt = all_labels[1]
    # The validation split (all_examples[2] / all_labels[2]) is not consumed
    # here, so it is no longer reshaped and scaled for nothing.

    # The scaler is fitted on training data only and then applied to test.
    scaling = MinMaxScaler(feature_range=(-1, 1)).fit(Xtr)
    Xtr = scaling.transform(Xtr)
    Xt = scaling.transform(Xt)

    clf = svm.SVC(class_weight='balanced', probability=True)
    clf.fit(Xtr, ytr)

    pred_y = clf.predict(Xt)
    prob_y = clf.predict_proba(Xt)

    suffix = (target_RES + '_' + target_ATOM + '_' + site + '_'
              + str(fold) + '.dat')
    pred_y.dump(_SCORE_DIR + 'pred_y_' + suffix)
    prob_y.dump(_SCORE_DIR + 'prob_y_' + suffix)
    yt.dump(_SCORE_DIR + 'true_y_' + suffix)

    weights_path = ('../results/weights/Voxel_SVM_' + target_RES + '_'
                    + target_ATOM + '_' + site + '_' + str(fold) + '.pkl')
    with open(weights_path, 'wb') as fid:
        pickle.dump(clf, fid)


if __name__ == '__main__':
    if len(sys.argv) < 5:
        sys.exit("usage: " + sys.argv[0]
                 + " TARGET_RES TARGET_ATOM SITE FOLD")

    # These module-level names are read by the functions above.
    target_RES = sys.argv[1]
    target_ATOM = sys.argv[2]
    site = sys.argv[3]
    begin = int(sys.argv[4])
    total_fold = 5

    # Exactly one fold is trained per invocation.
    for fold in range(begin, begin + 1):
        train_Voxel_SVM(fold=fold)