"""Write backbone patch point files (``.ptf``) for the PDB files of one split.

Run as a script with a single argument naming the data split.  The
``__main__`` section at the bottom defines the module-level ``dat_dir`` and
``pdb_dir`` paths that the functions in this file read as globals.
"""
import json
import os
import sys

import numpy
import scipy.ndimage
from scipy import spatial

try:
    from sets import Set  # Python 2 only; kept so the name stays defined
except ImportError:  # the ``sets`` module no longer exists on Python 3
    Set = set

from atom_res_dict import *
from generate_backbone_box_20A import *


def find_all_key_res(x, y, z, PDB_entries, my_kd_tree):
    """Collect the residues that have an atom near the point (x, y, z).

    Queries ``my_kd_tree.query_ball_point`` with ``r=9`` and ``p=2.0`` and
    uses the returned indices to look up entries in ``PDB_entries``.

    Args:
        x, y, z: Coordinates of the query point.
        PDB_entries: Indexable collection of atom records exposing
            ``chain_ID`` and ``res`` attributes; it must be index-aligned
            with the points the tree was built from.
        my_kd_tree: Tree object providing ``query_ball_point``.

    Returns:
        A ``set`` of ``(chain_ID, res)`` tuples.  Iteration order is
        unspecified.
    """
    # NOTE(review): the radius is presumably in angstroms -- confirm against
    # the coordinate units produced by grab_PDB.
    neighbour_indices = my_kd_tree.query_ball_point([x, y, z], r=9, p=2.0)
    return set(
        (PDB_entries[index].chain_ID, PDB_entries[index].res)
        for index in neighbour_indices
    )


def check_valid(res, res_atoms):
    """Tell whether a residue is labelled and has the atoms a patch needs.

    Args:
        res: Residue name, looked up in ``res_label_dict``.
        res_atoms: Atom records of the residue, passed to
            ``get_position_dict``.

    Returns:
        ``False`` when ``res`` has no entry in ``res_label_dict``.  Otherwise
        ``True`` only if the "CA", "N" and "C" atoms are all present and
        either a "CB" atom is present or the residue's label is 18.
    """
    if res not in res_label_dict:
        return False

    label = res_label_dict[res]
    get_position = get_position_dict(res_atoms)
    # NOTE(review): label 18 is exempt from the CB requirement; presumably it
    # is the residue type without a side chain (glycine) -- confirm in
    # atom_res_dict.
    side_exist = label == 18 or "CB" in get_position
    backbone_valid = all(atom in get_position for atom in ("CA", "N", "C"))
    return side_exist and backbone_valid


def _format_patch_line(PDB_ID, position, res_key, res, label):
    """Build one tab-separated, newline-terminated row of a ``.ptf`` file."""
    fields = [
        PDB_ID,
        str(position[0]),
        str(position[1]),
        str(position[2]),
        '#',
        res_key[0],
        str(res_key[1]),
        res,
        str(label),
    ]
    return '\t'.join(fields) + '\n'


def extract_patch(PDB_ID, PDB_a, ID_dict, PDB_entries, my_kd_tree,
                  pts_file_num):
    """Write the patch file centred on the residue that owns ``PDB_a``.

    Nothing is written when the central residue fails ``check_valid``.
    Otherwise the file ``dat_dir + PDB_ID + '_' + str(pts_file_num) + '.ptf'``
    is created.  Its first row describes the central residue and the
    remaining rows describe every valid neighbouring residue returned by
    ``find_all_key_res`` whose "CA" position differs from the centre.

    Args:
        PDB_ID: Identifier written in the first column and used in the file
            name.
        PDB_a: Atom record exposing ``chain_ID`` and ``res``.
        ID_dict: Mapping from a residue key (``chain_ID``) to its atoms.
        PDB_entries: Atom records index-aligned with ``my_kd_tree``.
        my_kd_tree: Tree used for the neighbourhood query.
        pts_file_num: Number embedded in the output file name.
    """
    # NOTE(review): chain_ID is indexed as [0] and [1] below, so it appears
    # to be a (chain, residue number) pair rather than a bare chain letter.
    chain_ID = PDB_a.chain_ID
    res_atoms = ID_dict[chain_ID]
    res = PDB_a.res
    if not check_valid(res, res_atoms):
        return

    label = res_label_dict[res]
    patch_ctr = get_position_dict(res_atoms)["CA"]
    x, y, z = patch_ctr[0], patch_ctr[1], patch_ctr[2]

    pts_path = dat_dir + PDB_ID + '_' + str(pts_file_num) + '.ptf'
    # The context manager closes the file even if a lookup below raises.
    with open(pts_path, 'w') as pts_file:
        pts_file.write(
            _format_patch_line(PDB_ID, patch_ctr, chain_ID, res, label))

        neighbours = find_all_key_res(x, y, z, PDB_entries, my_kd_tree)
        for key_chainID, key_res in neighbours:
            key_atoms = ID_dict[key_chainID]
            if not check_valid(key_res, key_atoms):
                continue
            key_label = res_label_dict[key_res]
            res_ctr = get_position_dict(key_atoms)["CA"]
            # The central residue already occupies the first row.
            if res_ctr != patch_ctr:
                pts_file.write(_format_patch_line(
                    PDB_ID, res_ctr, key_chainID, key_res, key_label))


def generate_patches_per_PDB(PDB_f):
    """Generate the patch files for a single PDB file.

    Reads ``pdb_dir + '/' + PDB_f``, parses it with ``grab_PDB`` and, when
    the structure has at least one position, builds a KD-tree over the
    positions.  Each grid point from ``find_grid_points`` is mapped to an
    atom record with ``find_actual_pos`` and ``extract_patch`` is called once
    for every distinct ``chain_ID`` among those records.

    Args:
        PDB_f: File name inside ``pdb_dir``; its first four characters are
            used as the PDB identifier.
    """
    # Read everything up front so the handle is released before processing.
    with open(pdb_dir + '/' + PDB_f) as pdb_file:
        infile = list(pdb_file)

    PDB_ID = PDB_f[0:4]
    pts_file_num = 0
    PROTEIN = grab_PDB(infile)
    [ID_dict, all_pos, all_lines, all_atom_type, PDB_entries,
     all_x, all_y, all_z] = PROTEIN

    visited = set()
    if len(all_pos) > 0:
        my_kd_tree = scipy.spatial.KDTree(all_pos)
        pos = find_grid_points(all_x, all_y, all_z)
        actual_pos = [
            find_actual_pos(my_kd_tree, grid_point, PDB_entries)
            for grid_point in pos
        ]
        for PDB_a in actual_pos:
            if PDB_a.chain_ID in visited:
                continue
            extract_patch(PDB_ID, PDB_a, ID_dict, PDB_entries, my_kd_tree,
                          pts_file_num)
            # The counter advances even when extract_patch wrote nothing, so
            # file numbers may have gaps.
            pts_file_num = pts_file_num + 1
            visited.add(PDB_a.chain_ID)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit('usage: ' + sys.argv[0] + ' <train|test>')

    d_name = sys.argv[1]  # 'train' or 'test'
    # These two names are read as module globals by the functions above.
    dat_dir = '../data/Protein_backbone_patch/' + d_name + '/'
    pdb_dir = '../../3DCNN_data_backbone/data/PDB_family_' + d_name

    PDBs = [
        f for f in os.listdir(pdb_dir)
        if os.path.isfile(os.path.join(pdb_dir, f))
    ]
    if not os.path.exists(dat_dir):
        os.makedirs(dat_dir)

    for PDB_f in PDBs:
        generate_patches_per_PDB(PDB_f)