import math import numpy import os import sys import theano import theano.tensor as T import collections from sets import Set import scipy import gzip X = ['H','K','R','D','E','S','T','N','Q','A','V','L','I','M','F','Y','W','P','G','C'] abrev={'HIS':'H','LYS':'K','ARG':'R','ASP':'D','GLU':'E','SER':'S','THR':'T','ASN':'N','GLN':'Q','ALA':'A','VAL':'V','LEU':'L','ILE':'I','MET':'M','PHE':'F','TYR':'Y','TRP': 'W','PRO': 'P','GLY': 'G', 'CYS': 'C'} class PDB_atom: def __init__(self,atom_type,res,chain_ID,x,y,z,index,value): self.atom = atom_type self.res = res self.chain_ID = chain_ID self.x = x self.y = y self.z = z self.index = index self.value = value def __eq__(self, other): return self.__dict__ == other.__dict__ def get_position_dict(all_PDB_atoms): get_position={} for a in all_PDB_atoms: get_position[a.atom]=[a.x,a.y,a.z] return get_position def get_target_res(target_RES, target_ATOM, Query,chain_IDs, ID_dict): index = 0 start = 0 end = 0 pos = [] target_residue=abrev[target_RES] for i in range (0,len(Query)): if Query[i]==target_residue: TARGET_RES = Query[i] TARGET_CHAIN = chain_IDs[i] res_atoms = ID_dict[TARGET_CHAIN] get_position=get_position_dict(res_atoms) if target_ATOM in get_position.keys(): pos.append((TARGET_RES,TARGET_CHAIN,get_position[target_ATOM])) return [True,pos] def grab_PDB(entry_list): ID_dict={} all_pos=[] all_lines=[] all_atom_type =[] PDB_entries = [] atom_index = 0 model_ID = 0 MODELS = [] SEQ= [] chain_IDs=[] for line1 in entry_list: line=line1.split() if line[0]=="ATOM": atom=(line1[13:16].strip(' ')) res=(line1[17:20]) chain_ID=line1[21:26] chain=chain_ID[0] res_no=chain_ID[1:].strip(' ') res_no=int(res_no) chain_ID=(chain,res_no) new_pos=[float(line1[30:37]),float(line1[38:45]),float(line1[46:53])] all_pos.append(new_pos) all_lines.append(line1) all_atom_type.append(atom[0]) if chain_ID not in ID_dict.keys(): if res in abrev.keys(): SEQ.append(abrev[res]) chain_IDs.append(chain_ID) l=[] a=PDB_atom(atom_type=atom,res=res,chain_ID=chain_ID,x=new_pos[0],y=new_pos[1],z=new_pos[2],index=atom_index,value=1) l.append(a) ID_dict[chain_ID]=l else: ID_dict[chain_ID].append(PDB_atom(atom,res,chain_ID,new_pos[0],new_pos[1],new_pos[2],index=atom_index,value=1)) PDB_entries.append(PDB_atom(atom,res,chain_ID,new_pos[0],new_pos[1],new_pos[2],index=atom_index,value=1)) atom_index+=1 if line[0]=="ENDMDL": break MODEL=[ID_dict, SEQ, chain_IDs] return MODEL def extract_all_res_defined_AA_type(pdb_list, pdb_dir,target_RES, target_ATOM, site): print "Extracting all "+target_RES+" residues from PDBs in pdb_list" ptf_name = '../data/ptf/'+target_RES+'_'+target_ATOM+'_'+site+'.ptf' pdb_list = open(pdb_list) ptf_file = open(ptf_name,'w') for line in pdb_list: PDB_ID=line.split('.')[0] print PDB_ID pdb_file_name=pdb_dir+'/'+PDB_ID.lower()+'.pdb' if os.path.isfile(pdb_file_name): pdb_file=open(pdb_file_name) l=list(pdb_file) MODEL=grab_PDB(l) [ID_dict, SEQ, chain_IDs]=MODEL Query=SEQ [if_pattern,pos]=get_target_res(target_RES, target_ATOM, Query,chain_IDs,ID_dict) if if_pattern: for p_ in pos: TARGET_RES,TARGET_CHAIN,p=p_ x=p[0] y=p[1] z=p[2] #print PDB_ID+'\t'+str(x)+'\t'+str(y)+'\t'+str(z)+'\n' ptf_file.write(PDB_ID+'\t'+str(x)+'\t'+str(y)+'\t'+str(z)+'\t'+TARGET_RES+'\t'+TARGET_CHAIN[0]+'\t'+str(TARGET_CHAIN[1])+'\n') else: print "PDB not found!" print PDB_ID ptf_file.close() print "Writing extracting positions in "+ptf_name return ptf_name