import sys import numpy import scipy.ndimage from sets import Set import json import os from scipy import spatial import theano from Bio.PDB import PDBList num_of_channels=4 x_dim=20 y_dim=20 z_dim=20 num_3d_pixel=20 atom_density=0 bias=[-0.558,-0.73,1.226] resiName_to_label={'ILE': 12, 'GLN': 8, 'GLY': 18, 'GLU': 4, 'CYS': 19, 'HIS': 0, 'SER': 5, 'LYS': 1, 'PRO': 17, 'ASN': 7, 'VAL': 10, 'THR': 6, 'ASP': 3, 'TRP': 16, 'PHE': 14, 'ALA': 9, 'MET': 13, 'LEU': 11, 'ARG': 2, 'TYR': 15} label_atom_type_dict={ 18:Set(['N','CA','C','O']), 19:Set(['N','CA','C','O','CB','SG']), 2:Set(['N','CA','C','O','CB','CG','CD','NE','CZ','NH1','NH2']), 5:Set(['N','CA','C','O','CB','OG']), 6:Set(['N','CA','C','O','CB','OG1','CG2']), 1:Set(['N','CA','C','O','CB','CG','CD','CE','NZ']), 13:Set(['N','CA','C','O','CB','CG','SD','CE']), 9:Set(['N','CA','C','O','CB']), 11:Set(['N','CA','C','O','CB','CG','CD1','CD2']), 12:Set(['N','CA','C','O','CB','CG1','CG2','CD1']), 10:Set(['N','CA','C','O','CB','CG1','CG2']), 3:Set(['N','CA','C','O','CB','CG','OD1','OD2']), 4:Set(['N','CA','C','O','CB','CG','CD','OE1','OE2']), 0:Set(['N','CA','C','O','CB','CG','ND1','CD2','CE1','NE2']), 7:Set(['N','CA','C','O','CB','CG','OD1','ND2']), 17:Set(['N','CA','C','O','CB','CG','CD']), 8:Set(['N','CA','C','O','CB','CG','CD','OE1','NE2']), 14:Set(['N','CA','C','O','CB','CG','CD1','CD2','CE1','CE2','CZ']), 16:Set(['N','CA','C','O','CB','CG','CD1','CD2','NE1','CE2','CE3','CZ2','CZ3','CH2']), 15:Set(['N','CA','C','O','CB','CG','CD1','CD2','CE1','CE2','CZ','OH']), } class PDB_atom: def __init__(self,atom_type,res,chain_ID,x,y,z,index): self.atom = atom_type self.res = res self.chain_ID = chain_ID self.x = x self.y = y self.z = z self.index = index def __eq__(self, other): return self.__dict__ == other.__dict__ def find_actual_pos(my_kd_tree,cor,PDB_entries): [d,i] = my_kd_tree.query(cor,k=1) return PDB_entries[i] def get_position_dict(all_PDB_atoms): get_position={} for a in all_PDB_atoms: get_position[a.atom]=[a.x,a.y,a.z] return get_position def center_and_transform(get_position): reference = get_position["CA"] axis_x = numpy.array(get_position["N"]) - numpy.array(get_position["CA"]) pseudo_axis_y = numpy.array(get_position["C"]) - numpy.array(get_position["CA"]) axis_z = numpy.cross(axis_x , pseudo_axis_y) if "CB" in get_position.keys(): direction = numpy.array(get_position["CB"]) - numpy.array(get_position["CA"]) axis_z *= numpy.sign( direction.dot(axis_z) ) axis_y= numpy.cross(axis_z , axis_x) axis_x/=numpy.sqrt(sum(axis_x**2)) axis_y/=numpy.sqrt(sum(axis_y**2)) axis_z/=numpy.sqrt(sum(axis_z**2)) transform=numpy.array([axis_x, axis_y, axis_z], 'float16').T return [reference,transform] def PDB_is_in_list(PDB_entry,PDB_list): for element in PDB_list: if PDB_entry.x==element.x and PDB_entry.y==element.y and PDB_entry.z==element.z: #print "true" return True #print "false" return False def grab_PDB(entry_list): if_NMR=False ID_dict={} all_pos=[] all_lines=[] all_atom_type =[] PDB_entries = [] atom_index = 0 model_ID = 0 MODELS = [] for line1 in entry_list: line=line1.split() if model_ID>0: print "NMR!!!!!" if_NMR=True break if line[0]=="MODEL": print "model_ID:" +str(model_ID) if line[0]=="ATOM": atom=(line1[13:16].strip(' ')) res=(line1[17:20]) chain_ID=line1[21:26] chain=chain_ID[0] res_no=chain_ID[1:].strip(' ') res_no=int(res_no) #print res_no chain_ID=(chain,res_no) new_pos=[float(line1[30:37]),float(line1[38:45]),float(line1[46:53])] all_pos.append(new_pos) all_lines.append(line1) all_atom_type.append(atom[0]) if chain_ID not in ID_dict.keys(): l=[] a=PDB_atom(atom_type=atom,res=res,chain_ID=chain_ID,x=new_pos[0],y=new_pos[1],z=new_pos[2],index=atom_index) l.append(a) ID_dict[chain_ID]=l else: ID_dict[chain_ID].append(PDB_atom(atom,res,chain_ID,new_pos[0],new_pos[1],new_pos[2],index=atom_index)) PDB_entries.append(PDB_atom(atom,res,chain_ID,new_pos[0],new_pos[1],new_pos[2],index=atom_index)) atom_index+=1 if line[0]=="ENDMDL" and model_ID==0: MODELS.append([ID_dict,all_pos,all_lines, all_atom_type, PDB_entries]) model_ID+=1 ID_dict={} all_pos=[] all_lines=[] all_atom_type =[] PDB_entries = [] atom_index = 0 if model_ID == 0: MODELS.append([ID_dict,all_pos,all_lines, all_atom_type, PDB_entries]) return [MODELS,if_NMR] def parse_processed_list(name_list): exist_PDB = Set([]) for l in name_list: list_file= open(l) f = list(list_file) for line in f: PDB_ID=line.split()[-1] exist_PDB.add(PDB_ID) return exist_PDB if __name__ == '__main__': target_RES = sys.argv[1] target_ATOM = sys.argv[2] site_ID = sys.argv[3] pos_or_neg = 'fp' pixel_size = 1 std = 0.6 output_dir = '../data/PROSITE_FP/site_fp_Voxel_numpy' ptf_dir = '../data/PROSITE_FP/site_fp_Voxel_numpy' pdb_dir = '../data/PROSITE_PDB/FN_FP/'+site_ID+'_FP' ptf_file_name = site_ID+'.'+target_RES+'.'+target_ATOM+'.fp.ptf' mut_dict={'G':18,'C':19,'R':2,'S':5,'T':6,'K':1,'M':13,'A':9,'L':11,'I':12,'V':10,'D':3,'E':4,'H':0,'N':7,'P':17,'Q':8,'F':14,'W':16,'Y':15} # for each ff file # for each line # (1)grab the pdb # (2)take the ori x,y,z # (3)find c-beta # (4)cut box total_num_of_samples=0 box_size=x_dim samples=[] box_x_min=-box_size/2 box_x_max=+box_size/2 box_y_min=-box_size/2 box_y_max=+box_size/2 box_z_min=-box_size/2 box_z_max=+box_size/2 container =[] dat_num=0 stop=0 ptf_file=open(os.path.join(ptf_dir,ptf_file_name)) ptf_lines=list(ptf_file) num = 0 for line in ptf_lines: print line S=line.split() PDB_ID=S[0] x_= float(S[1]) y_= float(S[2]) z_= float(S[3]) pdb_file_name=pdb_dir+'/pdb'+PDB_ID.lower()+'.ent' pdb_file=open(pdb_file_name) l=list(pdb_file) [MODELS,if_NMR]=grab_PDB(l) [ID_dict,all_pos,all_lines,all_atom_type,PDB_entries]=MODELS[0] all_pos = numpy.array(all_pos) my_kd_tree = scipy.spatial.KDTree(all_pos) actual_atom=find_actual_pos(my_kd_tree, [x_,y_,z_], PDB_entries) chain_ID=actual_atom.chain_ID res_atoms=ID_dict[chain_ID] res=actual_atom.res if res in resiName_to_label.keys(): label = resiName_to_label[res] get_position=get_position_dict(res_atoms) if "CA" in Set(get_position.keys()) and "C" in Set(get_position.keys()) and "N" in Set(get_position.keys()): backbone=ID_dict[chain_ID][0:4] [reference,transform]=center_and_transform(get_position) transformed_pos = ((all_pos - reference).dot(transform))-bias x_index = numpy.intersect1d(numpy.where(transformed_pos[:,0]>box_x_min),numpy.where(transformed_pos[:,0]box_y_min),numpy.where(transformed_pos[:,1]box_z_min),numpy.where(transformed_pos[:,2](box_size**3)*atom_density: samplega=numpy.zeros((num_of_channels,x_dim/pixel_size,y_dim/pixel_size,z_dim/pixel_size)) for i in range (0,len(box_ori)): atoms = box_ori[i] x=new_pos_in_box[i][0] y=new_pos_in_box[i][1] z=new_pos_in_box[i][2] x_new=x-box_x_min y_new=y-box_y_min z_new=z-box_z_min bin_x=numpy.floor(x_new/pixel_size) bin_y=numpy.floor(y_new/pixel_size) bin_z=numpy.floor(z_new/pixel_size) if(bin_x==num_3d_pixel): bin_x=num_3d_pixel-1 if(bin_y==num_3d_pixel): bin_y=num_3d_pixel-1 if(bin_z==num_3d_pixel): bin_z=num_3d_pixel-1 if atoms.atom[0]=='O': samplega[0,bin_x,bin_y,bin_z] = samplega[0,bin_x,bin_y,bin_z] + 1 elif atoms.atom[0]=='C': samplega[1,bin_x,bin_y,bin_z] = samplega[1,bin_x,bin_y,bin_z] + 1 elif atoms.atom[0]=='N': samplega[2,bin_x,bin_y,bin_z] = samplega[2,bin_x,bin_y,bin_z] + 1 elif atoms.atom[0]=='S': samplega[3,bin_x,bin_y,bin_z] = samplega[3,bin_x,bin_y,bin_z] + 1 X_smooth=numpy.zeros(samplega.shape, dtype=theano.config.floatX) for j in range (0,num_of_channels): X_smooth[j,:,:,:]=scipy.ndimage.filters.gaussian_filter(samplega[j,:,:,:], sigma=std, order=0, output=None, mode='reflect', cval=0.0, truncate=4.0) container.append(X_smooth) #map_and_show(PDB_ID, num, box_ori, new_pos_in_box, X_smooth) num=num+1 if(len(container)==1000): sample_time_t = numpy.array(container) sample_time_t.dump(output_dir+'/'+site_ID+'.'+target_RES+'.'+target_ATOM+'.'+pos_or_neg+'_'+str(dat_num)+'.dat') container=[] dat_num+=1 sample_time_t = numpy.array(container) sample_time_t.dump(output_dir+'/'+site_ID+'.'+target_RES+'.'+target_ATOM+'.'+pos_or_neg+'_'+str(dat_num)+'.dat') ptf_file.close() print "done .."