"""Assemble class-balanced numpy datasets from per-residue box files.

For every residue label the same number of input files is sampled (the
count of the rarest label), the sampled files are stacked part by part,
and the stacked arrays are dumped as test, validation and training files.
"""
# from __future__ import division
# from __future__ import print_function
# import tables
import json
import os
import random
import sys

import numpy
import scipy.ndimage
from scipy import spatial

DEFAULT_NODE_NAME = "defaultNode"

# Integer class label -> three-letter residue name, and the inverse mapping.
label_res_dict = {
    0: 'HIS', 1: 'LYS', 2: 'ARG', 3: 'ASP', 4: 'GLU',
    5: 'SER', 6: 'THR', 7: 'ASN', 8: 'GLN', 9: 'ALA',
    10: 'VAL', 11: 'LEU', 12: 'ILE', 13: 'MET', 14: 'PHE',
    15: 'TYR', 16: 'TRP', 17: 'PRO', 18: 'GLY', 19: 'CYS',
}
resiName_to_label = {
    'ILE': 12, 'GLN': 8, 'GLY': 18, 'GLU': 4, 'CYS': 19,
    'HIS': 0, 'SER': 5, 'LYS': 1, 'PRO': 17, 'ASN': 7,
    'VAL': 10, 'THR': 6, 'ASP': 3, 'TRP': 16, 'PHE': 14,
    'ALA': 9, 'MET': 13, 'LEU': 11, 'ARG': 2, 'TYR': 15,
}

# Number of residue classes handled by this script (20).
NUM_LABELS = len(label_res_dict)


def _count_files_per_label(in_dir):
    """Count the regular files in *in_dir* per residue label.

    The residue name is the part of the file name before the first '_'.
    A file whose prefix is not a known residue name raises KeyError.
    """
    counts = {label: 0 for label in range(NUM_LABELS)}
    for entry in os.listdir(in_dir):
        if not os.path.isfile(os.path.join(in_dir, entry)):
            continue
        residue = entry.strip('\n').split('_')[0]
        counts[resiName_to_label[residue]] += 1
    return counts


def load_dict(dict_name, in_dir=None):
    """Return per-label file counts and a balanced random file selection.

    The counts are read from the JSON file ``../data/DICT/<dict_name>``
    (keys "0".."19").  If that file does not exist, the counts are obtained
    by scanning ``in_dir`` instead.

    Args:
        dict_name: File name of the JSON count dictionary.
        in_dir: Directory of raw per-residue files, used only when the JSON
            dictionary is missing.

    Returns:
        A tuple ``(res_count_dict, res_files_dict, min_ind, min_data)``:
        counts per label, a list of ``min_data`` distinct indices drawn from
        ``range(count)`` per label, the label with the smallest count, and
        that smallest count.

    Raises:
        ValueError: If the dictionary is missing and no ``in_dir`` is given.
    """
    dict_path = os.path.join('../data/DICT', dict_name)
    if os.path.isfile(dict_path):
        with open(dict_path) as f:
            tmp_dict = json.load(f)
        res_count_dict = {i: tmp_dict[str(i)] for i in range(NUM_LABELS)}
    else:
        print("dictionary not exist!")
        if in_dir is None:
            # Previously this branch hit an undefined name (NameError).
            raise ValueError(
                "in_dir is required when the count dictionary is missing")
        res_count_dict = _count_files_per_label(in_dir)

    print("res_count_dict content:")
    for key in res_count_dict:
        print(label_res_dict[key] + " " + str(res_count_dict[key]))

    min_ind = min(res_count_dict, key=res_count_dict.get)
    min_data = res_count_dict[min_ind]
    print(min_ind, min_data)

    # Draw the same number of file indices for every label so that all
    # classes end up equally represented.
    res_files_dict = {
        label: random.sample(range(res_count_dict[label]), min_data)
        for label in range(NUM_LABELS)
    }
    return res_count_dict, res_files_dict, min_ind, min_data


def _print_shapes(first_name, first, second_name, second):
    """Print two array names followed by their two shapes."""
    print(first_name + ".shape")
    print(second_name + ".shape")
    print(first.shape)
    print(second.shape)


def integrate_data(d_name, num_3d_pixel=20, num_of_channels=4, num_of_parts=6):
    """Stack balanced samples of one dataset and dump them as numpy files.

    Input files are read from ``../data/RAW_DATA/<d_name>/`` and output is
    written to ``../data/Sampled_Numpy/<d_name>/``.  For ``d_name == 'test'``
    each part is dumped as ``Xt_smooth_<part>.dat`` / ``yt_<part>.dat``.
    For any other name, 19/20 of each part is dumped as 19 training chunks
    and the remainder as the validation set.

    Args:
        d_name: Dataset name; 'train' uses 1000 labels per file, anything
            else 100.
        num_3d_pixel: Edge length used for the last three reshape axes.
        num_of_channels: Channel count used for the second reshape axis.
        num_of_parts: Number of parts the balanced selection is split into.
    """
    print(d_name)
    block_size = 1000 if d_name == 'train' else 100
    dict_name = d_name + '_20AA_boxes.json'
    in_dir = '../data/RAW_DATA/' + d_name + '/'
    out_dir = '../data/Sampled_Numpy/' + d_name + '/'
    res_count_dict, res_files_dict, min_ind, min_data = load_dict(
        dict_name, in_dir)

    # Explicit floor division: every part takes the same whole number of
    # files per label, which is what the reshape below relies on.
    files_per_part = min_data // num_of_parts
    unit_size = NUM_LABELS * files_per_part * block_size

    for part in range(num_of_parts):
        first = part * files_per_part
        last = (part + 1) * files_per_part
        equal_examples = []
        equal_labels = []
        for label in range(NUM_LABELS):
            for num in res_files_dict[label][first:last]:
                # Assumes inputs are named <RES>_<index>.dat with indices
                # 0..count-1 -- TODO confirm against the producer.
                # NOTE(review): if these files are pickled arrays, recent
                # numpy needs allow_pickle=True here; only enable that for
                # trusted data.
                X = numpy.load(
                    in_dir + label_res_dict[label] + "_" + str(num) + '.dat')
                equal_examples.append(X)
                equal_labels.append(label * numpy.ones((block_size, 1)))

        equal_examples = numpy.array(equal_examples)
        equal_labels = numpy.array(equal_labels)
        _print_shapes("equal_examples", equal_examples,
                      "equal_labels", equal_labels)

        # Flatten the (file, sample) axes into one sample axis.
        equal_examples = numpy.reshape(
            equal_examples,
            (unit_size, num_of_channels,
             num_3d_pixel, num_3d_pixel, num_3d_pixel))
        equal_labels = numpy.reshape(equal_labels, unit_size)
        _print_shapes("equal_examples", equal_examples,
                      "equal_labels", equal_labels)

        part_tag = str(part + 1)
        if d_name == 'test':
            Xt_smooth = equal_examples
            yt = equal_labels
            Xt_smooth.dump(out_dir + "Xt_smooth_" + part_tag + ".dat")
            yt.dump(out_dir + "yt_" + part_tag + ".dat")
            _print_shapes("Xt_smooth", Xt_smooth, "yt", yt)
            continue

        # Random 19/20 of the samples become training data; whatever is
        # left after removing them is the validation set.
        num_of_train = int(19 * float(unit_size) / 20)
        mask_train = random.sample(range(unit_size), num_of_train)
        X_smooth = equal_examples[mask_train]
        y = equal_labels[mask_train]
        Xv_smooth = numpy.delete(equal_examples, mask_train, 0)
        yv = numpy.delete(equal_labels, mask_train, 0)

        # Dumping validation dataset as numpy array
        Xv_smooth.dump(out_dir + "Xv_smooth_" + part_tag + ".dat")
        yv.dump(out_dir + "yv_" + part_tag + ".dat")
        _print_shapes("Xv_smooth", Xv_smooth, "yv", yv)

        # NOTE(review): this fixed path is rewritten for every part, so the
        # file ends up holding the mean of the last part only -- confirm
        # that this is intended.
        train_mean = numpy.mean(X_smooth, axis=0)
        train_mean.dump("../data/Sampled_Numpy/train/train_mean.dat")

        # Dumping training dataset as numpy array, in 19 equal chunks
        partition = num_of_train // 19
        for i in range(1, 20):
            X_tmp = X_smooth[partition * (i - 1):partition * i]
            y_tmp = y[partition * (i - 1):partition * i]
            _print_shapes("X_tmp", X_tmp, "y_tmp", y_tmp)
            X_tmp.dump(out_dir + "X_smooth" + str(i) + "_" + part_tag + ".dat")
            y_tmp.dump(out_dir + "y" + str(i) + "_" + part_tag + ".dat")


if __name__ == '__main__':
    num_of_channels = 4
    num_3d_pixel = 20
    # integrate training and validation data
    integrate_data('train', num_3d_pixel, num_of_channels, num_of_parts=6)
    # integrate test data
    integrate_data('test', num_3d_pixel, num_of_channels, num_of_parts=6)