import os import csv import numpy as np import itertools as it import numpy import numpy.random from random import shuffle import sys import math import collections import scipy import scipy.spatial res_to_key_atom={ 'ILE': [["CB"]], 'GLN': [["OE1","CD","NE2"]], 'GLY': [["CA"]], 'GLU': [["OE1","CD","OE2"]], 'CYS': [["SG"]], 'HIS': [["NE2","ND1"]], 'SER': [["OG"]], 'LYS': [["NZ"]], 'PRO': [["N","CA","CB","CD","CG"]], 'ASN': [["OD1","CG","ND2"]], 'VAL': [["CB"]], 'THR': [["OG1"]], 'ASP': [["OD1","CG","OD2"]], 'TRP': [["CD2","CE2","CE3","CZ2","CZ3","CH2"],["NE1"]], 'PHE': [["CG","CD1","CD2","CE1","CE2","CZ"]], 'ALA': [["CB"]], 'MET': [["SD"]], 'LEU': [["CB"]], 'ARG': [["CZ"]], 'TYR': [["CG","CD1","CD2","CE1","CE2","CZ"], ["OH"]] } def read_csv_DUDE(filename, input_name, target_name): data = ([], []) with open(filename) as file: reader = csv.DictReader(file) for row in reader: data[0].append(row[input_name]) data[1].append(row[target_name]) return data class PDB_atom: def __init__(self,atom_type,res,chain_ID,x,y,z,index): self.atom = atom_type self.res = res self.chain_ID = chain_ID self.x = x self.y = y self.z = z self.index = index def __eq__(self, other): return self.__dict__ == other.__dict__ def find_actual_pos(my_kd_tree,cor): [d,i] = my_kd_tree.query(cor,k=1) return PDB_entries[i] def get_position_dict(all_PDB_atoms): get_position={} for a in all_PDB_atoms: get_position[a.atom]=[a.x,a.y,a.z] return get_position def grab_PDB(entry_list): ID_dict={} all_pos=[] all_lines=[] all_atom_type =[] PDB_entries = [] atom_index = 0 model_ID = 0 MODELS = [] for line in entry_list: ele=line.split() if model_ID>0: break if ele[0]=="ATOM": atom=(line[12:16].strip(' ')) res=(line[17:20]) chain_ID=line[21:26] chain=chain_ID[0] res_no=chain_ID[1:].strip(' ') res_no=int(res_no) chain_ID=(chain,res_no) new_pos=[float(line[30:37]),float(line[38:45]),float(line[46:53])] all_pos.append(new_pos) all_lines.append(line) all_atom_type.append(atom[0]) if chain_ID not in ID_dict.keys(): l=[] a=PDB_atom(atom_type=atom,res=res,chain_ID=chain_ID,x=new_pos[0],y=new_pos[1],z=new_pos[2],index=atom_index) l.append(a) ID_dict[chain_ID]=l else: ID_dict[chain_ID].append(PDB_atom(atom,res,chain_ID,new_pos[0],new_pos[1],new_pos[2],index=atom_index)) PDB_entries.append(PDB_atom(atom,res,chain_ID,new_pos[0],new_pos[1],new_pos[2],index=atom_index)) atom_index+=1 if ele[0]=="ENDMDL" and model_ID==0: model_ID+=1 return [ID_dict,all_pos,all_lines, all_atom_type, PDB_entries] def find_all_key_res(x,y,z,ID_dict,all_pos,PDB_entries,my_kd_tree): all_key_res = set([]) atom_index = my_kd_tree.query_ball_point([x,y,z], r=6, p=2.0) for i in range (0,len(atom_index)): a = PDB_entries[atom_index[i]] all_key_res.add((a.chain_ID,a.res)) return all_key_res def grab_ligand(entry_list): if_NMR=False ID_dict={} atom_index = 0 model_ID = 0 for line1 in entry_list: line=line1.split() if model_ID>0: if_NMR=True break if line1[0:6]=="HETATM" and line1[17:20]!="HOH": atom=(line1[13:16].strip(' ')) res=(line1[17:20]) chain_ID=line1[21:26] chain=chain_ID[0] res_no=chain_ID[1:].strip(' ') res_no=int(res_no) #print res_no chain_ID=(chain,res_no,res) new_pos=[float(line1[30:37]),float(line1[38:45]),float(line1[46:53])] if chain_ID not in ID_dict.keys(): l=[] a=PDB_atom(atom_type=atom,res=res,chain_ID=chain_ID,x=new_pos[0],y=new_pos[1],z=new_pos[2],index=atom_index) l.append(a) ID_dict[chain_ID]=l else: ID_dict[chain_ID].append(PDB_atom(atom,res,chain_ID,new_pos[0],new_pos[1],new_pos[2],index=atom_index)) atom_index+=1 if line[0]=="ENDMDL" and model_ID==0: model_ID+=1 return ID_dict def parse_crystal_lig(all_lines): start=0 end=0 atom=0 ctr=numpy.zeros((1,3)) for line in all_lines: # print "start:"+str(start) # print "end:"+str(end) if line.strip(' ').strip('\n')=='@ATOM': start=1 elif line.strip(' ').strip('\n')=='@BOND': end=1 else: if start>0 and end<1: # print "yaaaaaaa" ele=line.split() x=float(ele[2]) y=float(ele[3]) z=float(ele[4]) ctr=ctr+numpy.array([x,y,z]) atom=atom+1 ctr=ctr/atom return ctr def cut_ligand_all_atoms(PDB_ID,DUDE_ctr,build_ptf=False): result=[] pdb_file_name='../data/DUDE/PDB/'+PDB_ID.lower()+'.pdb' if os.path.isfile(pdb_file_name): pdb_file=open(pdb_file_name) l=list(pdb_file) lig_ID_dict=grab_ligand(l) PROTEIN_MODEL=grab_PDB(l) [pro_ID_dict,pro_all_pos,all_lines, all_atom_type, PDB_entries]=PROTEIN_MODEL pro_all_pos = numpy.array(pro_all_pos) my_kd_tree = scipy.spatial.KDTree(pro_all_pos) for ID in lig_ID_dict: lig = lig_ID_dict[ID] lig_name = ID[2] lig_chain = ID[0] lig_no = ID[1] lig_center = numpy.zeros((3,)) total_atom = 0 if len(lig)>1: FINAL_RES=set() for atoms in lig: x= atoms.x y= atoms.y z= atoms.z atype=atoms.atom if atype[0]!='H': cor = [atoms.x,atoms.y,atoms.z] total_atom +=1 lig_center=lig_center+cor all_key_res = find_all_key_res(x,y,z,pro_ID_dict,pro_all_pos,PDB_entries,my_kd_tree) for a in all_key_res: FINAL_RES.add(a) if total_atom>0: lig_center = numpy.array(lig_center) lig_center=lig_center/total_atom if dist(DUDE_ctr,lig_center)<2.5: result.append([lig_name,lig_chain,lig_no, lig_center]) return result def dist(a,b): d=numpy.sqrt(numpy.sum((a-b)**2)) return d