# PDBs used to train the PROSITE TRYPSIN_HIS.5.HIS.NE2 model fold_train_pdb_dict_HIS = { 0:['1a0h', '1ab9', '1afq', '1aut', '1bb0', '1bma', '1bml', '1boq', '1c5m', '1ca0', '1ca8', '1cbw', '1cgi', '1cgj', '1chg', '1cho', '1d3d', '1d3p', '1d3q', '1d3t', '1d4p', '1ddj', '1dlk', '1eaw', '1eax', '1ela', '1elb', '1elc', '1eld', '1ele', '1elf', '1elg', '1esa', '1esb', '1ex3', '1ezq', '1ezx', '1f0r', '1f0s', '1fax', '1fjs', '1fon', '1fxy', '1g2l', '1g2m', '1gct', '1gg6', '1ggd', '1gha', '1ghb', '1gmc', '1gmd', '1gmh', '1h9h', '1hcg', '1hja', '1hv7', '1hyl', '1ioe', '1iqe', '1iqf', '1iqg', '1iqh', '1iqi', '1iqj', '1iqk', '1iql', '1iqm', '1iqn', '1jwt', '1kdq', '1kig', '1ksn', '1l0z', '1l1g', '1l4d', '1l4z', '1lka'], 1:['1lkb', '1lpg', '1lpk', '1lpz', '1lqd', '1mmj', '1mq5', '1mq6', '1mtn', '1n8o', '1nfu', '1nfw', '1nfx', '1nfy', '1nn6', '1nu9', '1o5e', '1o5f', '1okx', '1p0s', '1p57', '1p8v', '1ppz', '1pq5', '1pq7', '1pq8', '1pqa', '1qa0', '1qb1', '1qb6', '1qb9', '1qbn', '1qbo', '1qfk', '1qq4', '1qrw', '1qrx', '1qrz', '1riw', '1rjx', '1s0q', '1s0r', '1t4u', '1t4v', '1uo6', '1uvo', '1uvp', '1v3x', '1vgc', '1vr1', '1wu1', '1xka', '1xkb', '1xvm', '1xvo', '1yph', '1z6e', '2a7c', '2a7h', '2a7j', '2bmg', '2boh', '2bok', '2bq6', '2bq7', '2bqw', '2cga', '2cha', '2cji', '2cv3', '2d1j', '2d8w', '2de8', '2de9', '2ei6', '2ei7', '2ei8', '2f3c', '2f83'], 2:['2fo9', '2foa', '2fob', '2foc', '2fod', '2foe', '2fof', '2fog', '2foh', '2fzz', '2g00', '2g4t', '2g4u', '2g51', '2g52', '2g55', '2g5n', '2g5v', '2g8t', '2gch', '2gct', '2gd4', '2gmt', '2gv6', '2gv7', '2h9e', '2hlc', '2i6q', '2j2u', '2j34', '2j38', '2j4i', '2j94', '2j95', '2j9n', '2jet', '2jkh', '2nwn', '2o9q', '2oqu', '2p16', '2p3f', '2p3t', '2p3u', '2p8o', '2p93', '2p94', '2p95', '2phb', '2pks', '2pr3', '2psx', '2psy', '2puq', '2q1j', '2qn5', '2r2m', '2ra0', '2sfa', '2uuy', '2uwl', '2uwo', '2uwp', '2v0b', '2vgc', '2vh0', '2vh6', '2vvc', '2vvu', '2vvv', '2vwl', '2vwm', '2vwn', '2vwo', '2w26', '2w3i', '2w3k', '2win', '2wyg', '2wyj', '2xbv', '2xbw'], 3:['2xbx', '2xby', '2xc0', '2xc4', '2xc5', '2xw9', '2xwa', '2xwb', '2y5f', '2y5g', '2y5h', '2y7x', '2y7z', '2y80', '2y81', '2y82', '2za5', '2zeb', '2zec', '2zgc', '2zgh', '2zgj', '2zks', '3a7t', '3a7v', '3a7w', '3a7x', '3a7y', '3a7z', '3a80', '3a81', '3a82', '3a83', '3a84', '3a85', '3a86', '3a87', '3a88', '3a89', '3a8a', '3a8b', '3a8c', '3a8d', '3aas', '3aau', '3aav', '3ati', '3atk', '3atl', '3atm', '3bg4', '3bg8', '3bn9', '3bsq', '3c27', '3cen', '3cs7', '3da9', '3dfj', '3dfl', '3e3t', '3e8l', '3ela', '3ens', '3f6u', '3ffg', '3fzz', '3g01'], 4:['3gch', '3gct', '3gov', '3hpt', '3i77', '3i78', '3iit', '3iti', '3k65', '3k9x', '3kgp', '3khv', '3kl6', '3kqb', '3kqc', '3kqd', '3kqe', '3liw', '3m36', '3m37', '3m7q', '3m7t', '3m7u', '3ncl', '3nps', '3p6z', '3p70', '3p8f', '3p8g', '3po1', '3pro', '3q3k', '3qgj', '3rdz', '3rxa', '3rxb', '3rxc', '3rxd', '3rxe', '3rxf', '3rxg', '3rxh', '3rxi', '3rxj', '3rxk', '3rxl', '3rxm', '3rxo', '3rxp', '3rxq', '3rxr', '3rxs', '3rxt', '3rxu', '3rxv', '3s0n', '3sw2', '3tju', '3tjv', '3tk9', '3vgc', '4a6l', '4a7i', '4cha', '4gch', '4pro', '4vgc', '5cha', '5gch', '6cha', '6gch', '7gch', '8gch']} # PDBs used to train the PROSITE TRYPSIN_SER.6.SER.OG model fold_train_pdb_dict_SER = { 0:['1a0h', '1a5h', '1ab9', '1afq', '1bio', '1bma', '1boq', '1c4y', '1ca0', '1cbw', '1cgi', '1cgj', '1cho', '1dfp', '1dic', '1dle', '1dlk', '1dst', '1dsu', '1ejm', '1ela', '1elb', '1elc', '1eld', '1ele', '1elf', '1elg', '1esa', '1esb', '1ex3', '1fn8', '1fon', '1fxy', '1fy4', '1fy5', '1g2l', '1gct', '1gdn', '1gdq', '1gdu', '1gg6', '1ggd', '1gha', '1ghb', '1gl1', '1gmc', '1gmd', '1gmh', '1hcg', '1hfd'], 1:['1hja', '1hv7', '1jrs', '1jrt', '1jwt', '1k2i', '1kdq', '1l0z', '1l1g', '1lka', '1lkb', '1mmj', '1mtn', '1n8o', '1okx', '1op8', '1p2n', '1p2q', '1pq5', '1pq7', '1pq8', '1qa0', '1qb1', '1qb6', '1qb9', '1qbn', '1qbo', '1qq4', '1qrw', '1qrx', '1qrz', '1riw', '1s0q', '1s0r', '1t7c', '1t8l', '1t8m', '1t8n', '1t8o', '1try', '1uo6', '1utk', '1uvo', '1uvp', '1vgc', '1xvm'], 2:['1xvo', '1xx9', '1ybw', '1yc0', '1yph', '1zjk', '2a0q', '2a7c', '2a7h', '2a7j', '2cga', '2cha', '2cv3', '2de8', '2de9', '2fo9', '2foa', '2fob', '2foc', '2fod', '2foe', '2fof', '2fog', '2foh', '2g4t', '2g4u', '2g51', '2g52', '2g55', '2g5n', '2g5v', '2g8t', '2gch', '2gct', '2gmt', '2i6q', '2j9n', '2jet', '2o9q', '2oqu', '2p8o', '2pks', '2puq', '2qn5', '2sfa', '2uuy', '2v0b', '2vgc', '2vu8', '2win', '2xwa', '2y6t'], 3:['2za5', '2zch', '2zck', '2zcl', '2zeb', '2zec', '2zgc', '2zgh', '2zgj', '2zks', '3a7t', '3a7v', '3a7w', '3a7x', '3a7y', '3a7z', '3a80', '3a81', '3a82', '3a83', '3a84', '3a85', '3a86', '3a87', '3a88', '3a89', '3a8a', '3a8b', '3a8c', '3a8d', '3aas', '3aau', '3aav', '3ati', '3atk', '3atl', '3atm', '3bg4', '3bg8', '3bn9', '3bsq', '3bth', '3dfj', '3dfl', '3e3t', '3e8l', '3ela'], 4:['3gch', '3gct', '3gov', '3hs0', '3iti', '3k2u', '3k6y', '3m7q', '3m7t', '3m7u', '3nps', '3po1', '3pro', '3qgj', '3qum', '3rxa', '3rxb', '3rxc', '3rxd', '3rxe', '3rxf', '3rxg', '3rxh', '3rxi', '3rxj', '3rxk', '3rxl', '3rxm', '3rxo', '3rxp', '3rxq', '3rxr', '3rxs', '3rxt', '3rxu', '3rxv', '3s0n', '3s9a', '3s9b', '3s9c', '3sbk', '3tju', '3tjv', '3tk9', '3vgc', '4cha', '4gch', '4pro', '4vgc', '5cha', '5gch', '6cha', '6gch', '7gch', '8gch'], } def write_PDB_prob_all_folds(target_RES, target_ATOM, site_ID,pos_or_neg='pos'): result_weights_ID = target_RES+"_"+target_ATOM+"_"+site_ID total_fold = 5 ptf_order=open('../data/numpy/'+target_RES+'.'+target_ATOM+'.order'+'.ptf') dict_list = list(ptf_order) ID = site_ID+'.'+target_RES+'.'+target_ATOM files = [ os.path.join('../data/numpy/',f) for f in os.listdir('../data/numpy/') if os.path.isfile(os.path.join('../data/numpy/',f))] files = [t for t in files if ID in t] files = [t for t in files if '.pos' in t] files = [t for t in files if '.dat' in t] total_num = len(files) all_prob = [] for fold in range(5): for i in range(total_num): prob = numpy.load('../results/post_prob/'+target_RES+'.'+target_ATOM+'_prob'+'_fold_'+str(fold)+'_'+str(i)+'.dat') if i == 0: fold_prob = prob else: fold_prob = numpy.concatenate((fold_prob,prob),axis=0) all_prob.append(fold_prob) summary_file=open('../results/'+'PDB_prob_'+result_weights_ID+'_numpy.txt','w') pdb_set = set() for i in range(len(dict_list)): line = dict_list[i] chain_ID = line.strip('\n')[-6:] chain = chain_ID[0] res_no = chain_ID[1:] S=line.split() PDB_ID=S[0] # chain = S[-2] # res_no = S[-1] x_ = float(S[1]) y_ = float(S[2]) z_ = float(S[3]) if PDB_ID not in pdb_set: site_no = 0 pdb_set.add(PDB_ID) for fold in range(5): probs_y = all_prob[fold][i] summary_file.write(PDB_ID+'\t'+str(site_no)+'\t'+str(x_)+'\t'+str(y_)+'\t'+str(z_)+'\t'+target_RES+'\t'+chain+'\t'+str(res_no)+'\t'+str(fold)+'\t'+str(probs_y)+'\n') site_no+=1 def write_summary_pos(target_RES,target_ATOM,pos_or_neg='pos'): result_weights_ID = target_RES+"_"+target_ATOM+"_"+site if target_RES == 'SER': prob_file = open('../results/PDB_prob_SER_OG_TRYPSIN_SER.6_numpy.txt') out_file = open('../results/selected_fold_'+result_weights_ID+'_pred_prob.txt','w') elif target_RES == 'HIS': prob_file = open('../results/PDB_prob_HIS_NE2_TRYPSIN_HIS.5_numpy.txt') out_file = open('../results/selected_fold_'+result_weights_ID+'_pred_prob.txt','w') fold_train_pdb_dict = fold_train_pdb_dict_HIS if target_RES == 'HIS' else fold_train_pdb_dict_SER result_list = prob_file total_fold = 5 results_dict=collections.defaultdict(dict) for line in result_list: ele = line.split() pdb_id = ele[0] res = ele[-5] chain = ele[-4] res_no = ele[-3] fold = int(ele[-2]) prob = float(ele[-1]) if (chain,res,res_no) not in results_dict[pdb_id]: results_dict[pdb_id][(chain,res,res_no)]=[] results_dict[pdb_id][(chain,res,res_no)].append((prob,line)) for pdb_id in results_dict: fold_to_use = None if pdb_id in fold_train_pdb_dict[0]: fold_to_use = 0 elif pdb_id in fold_train_pdb_dict[1]: fold_to_use = 1 elif pdb_id in fold_train_pdb_dict[2]: fold_to_use = 2 elif pdb_id in fold_train_pdb_dict[3]: fold_to_use = 3 elif pdb_id in fold_train_pdb_dict[4]: fold_to_use = 4 if fold_to_use==None: for (chain,res,res_no) in results_dict[pdb_id]: entries = results_dict[pdb_id][(chain,res,res_no)] entries.sort(key=lambda entries: entries[0],reverse=True) max_ = entries[0] out_file.write(max_[1]) else: for (chain,res,res_no) in results_dict[pdb_id]: entries = results_dict[pdb_id][(chain,res,res_no)] out_file.write(entries[fold_to_use][1]) def get_PDB_to_fold_pos_prob(pos_or_neg = 'pos'): pos_PDB_list = open('../data/SCOP_PDB.txt') outfile=open('../results/'+'detect_prob_TRYPSIN_PDB_level.txt','w') inte_dict=collections.defaultdict(list) for target_RES, target_ATOM, site in [('SER','OG','TRYPSIN_SER.6'),('HIS','NE2','TRYPSIN_HIS.5')]: result_weights_ID = target_RES+"_"+target_ATOM+"_"+site integrate_file = open('../results/selected_fold_'+result_weights_ID+'_pred_prob.txt') for line in integrate_file: ele = line.split() pdb_id = ele[0] inte_dict[pdb_id].append(line) for line in pos_PDB_list: pdb_id = line.split()[0] if pdb_id in inte_dict: entries = inte_dict[pdb_id] else: entries = [pdb_id+'\t'+'None'+'\n'] for l in entries: outfile.write(l) def final_stats(thres=0.5): count_file = open('../data/csa_pdb_HIS_SER_annotation.txt') count_list = list(count_file) pos_file = open('../results/'+'detect_prob_TRYPSIN_PDB_level.txt') # all: 1447 (SCOP_fscnn_with_csa_pdb.txt) make sure they are all in the dataset? # HIS only: 92 # SER only: 4 # SER, HIS: 1347 # None: 4 # Ser_Text = 'Ser' # His_Text = 'His' Ser_Text = 'SER' His_Text = 'HIS' detected_dict = collections.defaultdict(dict) for line in pos_file: ele = line.split() pdb_id = ele[0] if ele[1]!='None': res = ele[-5].lower() chain = ele[-4] res_no = ele[-3] fold = int(ele[-2]) prob = float(ele[-1]) if prob>thres: if res not in detected_dict[pdb_id]: detected_dict[pdb_id][res]=[] detected_dict[pdb_id][res].append((chain,res_no)) csa_res_dict=collections.defaultdict(dict) report_dict=collections.defaultdict(list) HIS_dict=collections.defaultdict(list) SER_dict=collections.defaultdict(list) ALL_dict=collections.defaultdict(list) detect_csa_file = open('../results/detected_TRYPSIN_CSA_site_summary.txt','w') detect_non_file = open('../results/detected_TRYPSIN_nonCSA_site_summary.txt','w') csa_pdb=set() for line in count_list: ele = line.split() pdb_id = ele[0] csa_pdb.add(pdb_id) if ele[1]!='None': res = ele[1].lower() chain = ele[2] res_no = ele[3] if res not in csa_res_dict[pdb_id]: csa_res_dict[pdb_id][res]=[] csa_res_dict[pdb_id][res].append((chain,res_no)) if res in detected_dict[pdb_id]: if (chain,res_no) in detected_dict[pdb_id][res]: report_dict[pdb_id].append(line.strip('\n')+'\t'+'True'+'\n') if res == 'ser': SER_dict[pdb_id].append(line.strip('\n')+'\t'+'True'+'\n') elif res== 'his': HIS_dict[pdb_id].append(line.strip('\n')+'\t'+'True'+'\n') ALL_dict[pdb_id].append(line.strip('\n')+'\t'+'True'+'\n') else: report_dict[pdb_id].append(line.strip('\n')+'\t'+'False'+'\n') else: report_dict[pdb_id].append(line.strip('\n')+'\t'+'False'+'\n') print "HIS model detected " +str(len(HIS_dict.keys())) +" out of "+str(len(csa_pdb))+" PDBs" print "SER model detected " +str(len(SER_dict.keys())) +" out of "+str(len(csa_pdb))+" PDBs" print "Both model detected " +str(len(ALL_dict.keys())) +" out of "+str(len(csa_pdb))+" PDBs" print "writing summary into" print '../results/detected_TRYPSIN_CSA_site_summary.txt' print '../results/detected_TRYPSIN_nonCSA_site_summary.txt' #print len(report_dict.keys()) for pdb_id in report_dict: entries = report_dict[pdb_id] for e in entries: detect_csa_file.write(e) for pdb_id in detected_dict: for res in detected_dict[pdb_id]: pos_pred = detected_dict[pdb_id][res] if res in csa_res_dict[pdb_id]: csa_res = csa_res_dict[pdb_id][res] for p in pos_pred: if p not in csa_res: detect_non_file.write(pdb_id+'\t'+res+'\t'+p[0]+'\t'+p[1]+'\n') else: for p in pos_pred: detect_non_file.write(pdb_id+'\t'+res+'\t'+p[0]+'\t'+p[1]+'\n') if __name__ == '__main__': import sys import os import numpy import collections total_fold=5 for target_RES, target_ATOM, site in [('SER','OG','TRYPSIN_SER.6'),('HIS','NE2','TRYPSIN_HIS.5')]: write_PDB_prob_all_folds(target_RES, target_ATOM, site) write_summary_pos(target_RES,target_ATOM,site) get_PDB_to_fold_pos_prob() final_stats()