"""Post-process per-fold detection probabilities into per-PDB summary files.

The script runs four steps (see ``main``):

1. ``PDB_all_folds_NOS_predicted_prob`` joins the site list of a ``.ptf`` file
   with the probability arrays of every fold and writes one text row per
   (site, fold).
2. ``write_summary_pos`` / ``write_summary_neg`` reduce those rows to one
   "integrated" file per residue type.
3. ``get_PDB_to_fold_pos_prob`` / ``get_PDB_to_fold_neg_prob`` merge the four
   residue types into one file ordered by a PDB list.
4. ``final_stats_pos`` / ``final_stats_neg`` threshold the probabilities and
   write the detection summaries.

All paths are relative to ``data_dir`` / ``results_dir`` which default to the
locations the script has always used (``../data`` and ``../results``).
"""
import collections
import math
import os
import random
import re
import sys
import time
from collections import OrderedDict

import numpy

try:
    from sets import Set  # Python 2 only; the module no longer exists in Python 3
except ImportError:
    Set = set

DEFAULT_DATA_DIR = '../data'
DEFAULT_RESULTS_DIR = '../results'

# Number of cross-validation folds whose probability files are read.
NUM_FOLDS = 5

# (residue, atom) pairs processed by the pipeline, in processing order.
TARGET_SITES = (('CYS', 'SG'), ('ARG', 'CZ'), ('GLU', 'OE1'), ('TRP', 'NE1'))

# Text values of an "exists" column that are treated as "not present".
# NOTE(review): the format of pdb_residue_site_exist.txt is not visible from
# this file; confirm these spellings against the file's producer.
_FALSE_FLAGS = frozenset(['', '0', 'false', 'no', 'n', 'none'])

# (residue, atom, fold) -> PDB ids whose rows are taken from that fold by
# write_summary_pos.
FOLD_PDB = {
    ('CYS', 'SG', 0): [
        '3hsn', '3hso', '3hsp', '1q2o', '1tll', '3b3p', '2hx2', '2hx3',
        '1f20', '2hx4', '2g6k', '2g6j', '2g6i', '2g6h', '2g6o', '2g6n',
        '2g6m', '2g6l', '3e65', '3e7t', '3e68', '2ort', '1fop', '2orp',
        '3fc5', '1p6i', '1p6h'],
    ('ARG', 'CZ', 0): [
        '3hsn', '3hso', '3hsp', '1q2o', '3b3p', '2hx2', '2hx3', '2hx4',
        '2g6k', '2g6j', '2g6i', '2g6h', '2g6o', '2g6n', '2g6m', '2g6l',
        '3e65', '3e7t', '3e68', '2ort', '1fop', '2orp', '3fc5', '1p6i',
        '1p6h', '1p6k', '1p6j', '1p6m'],
    ('GLU', 'OE1', 0): [
        '3hsn', '3hso', '3hsp', '1q2o', '3b3p', '2hx2', '2hx3', '2hx4',
        '2g6k', '2g6j', '2g6i', '2g6h', '2g6o', '2g6n', '2g6m', '2g6l',
        '3e65', '3e7t', '3e68', '1fop', '2orp', '3fc5', '1p6i', '1p6h',
        '1p6k', '1p6j', '1p6m'],
    ('TRP', 'NE1', 0): [
        '3hsn', '3hso', '3hsp', '1q2o', '3b3p', '2hx2', '2hx3', '2hx4',
        '2g6k', '2g6j', '2g6i', '2g6h', '2g6o', '2g6n', '2g6m', '2g6l',
        '3e65', '3e7t', '3e68', '2ort', '1fop', '2orp', '3fc5', '1p6i',
        '1p6h', '1p6k', '1p6j', '1p6m'],
    ('CYS', 'SG', 1): [
        '1p6k', '1p6j', '1p6m', '1p6l', '1p6n', '1fol', '1foo', '2nsi',
        '1foi', '1foj', '1d1x', '1d1y', '1d1v', '1d1w', '1qwc', '4nos',
        '2nse', '2nos', '1lzx', '1lzz', '3b3n', '3b3o', '3b3m', '1qom',
        '2nod', '1qw6', '1qw5'],
    ('ARG', 'CZ', 1): [
        '1p6l', '1p6n', '1fol', '1foo', '2nsi', '1foi', '1foj', '1d1x',
        '1d1y', '1d1v', '1d1w', '1qwc', '4nos', '2nse', '2nos', '1lzx',
        '1lzz', '3b3n', '3b3o', '3b3m', '1qom', '2nod', '1qw6', '1qw5',
        '1qw4', '1zvi'],
    ('GLU', 'OE1', 1): [
        '1p6l', '1p6n', '1fol', '1foo', '2nsi', '1foi', '1foj', '1d1x',
        '1d1y', '1d1v', '1d1w', '1qwc', '4nos', '2nse', '2nos', '1lzx',
        '1lzz', '3b3n', '3b3o', '3b3m', '1qom', '2nod', '1qw6', '1qw5',
        '1qw4'],
    ('TRP', 'NE1', 1): [
        '1p6l', '1p6n', '1fol', '1foo', '2nsi', '1foi', '1foj', '1d1x',
        '1d1y', '1d1v', '1d1w', '1qwc', '4nos', '2nse', '2nos', '1lzx',
        '1lzz', '3b3n', '3b3o', '3b3m', '1qom', '2nod', '1qw6', '1qw5',
        '1qw4', '1zvi'],
    ('CYS', 'SG', 2): [
        '1qw4', '1zvi', '1zvl', '3nse', '1nsi', '3dqt', '3dqr', '3dqs',
        '1nse', '1om5', '1om4', '1r35', '3e7s', '1dm8', '1dm6', '1dm7',
        '1k2r', '1k2s', '1k2t', '1k2u', '1rs6', '1rs7', '1rs8', '1rs9',
        '3eai', '1d0o'],
    ('ARG', 'CZ', 2): [
        '1zvl', '3nse', '1nsi', '3dqt', '3dqr', '3dqs', '1nse', '1om5',
        '1om4', '1r35', '3e7s', '1dm8', '1dm6', '1dm7', '1k2r', '1k2s',
        '1k2t', '1k2u', '1rs6', '1rs7', '1rs8', '1rs9', '3eai', '1d0o',
        '1d0c', '1ed6'],
    ('GLU', 'OE1', 2): [
        '1zvi', '1zvl', '3nse', '1nsi', '3dqt', '3dqr', '3dqs', '1nse',
        '1om5', '1om4', '1r35', '3e7s', '1dm8', '1dm6', '1dm7', '1k2r',
        '1k2s', '1k2t', '1k2u', '1rs6', '1rs7', '1rs8', '1rs9', '3eai',
        '1d0o', '1d0c'],
    ('TRP', 'NE1', 2): [
        '1zvl', '3nse', '1nsi', '3dqt', '3dqr', '3dqs', '1nse', '1om5',
        '1om4', '1r35', '3e7s', '1dm8', '1dm6', '1dm7', '1k2r', '1k2s',
        '1k2t', '1k2u', '1rs6', '1rs7', '1rs8', '1rs9', '3eai', '1d0o',
        '1d0c', '1ed6'],
    ('CYS', 'SG', 3): [
        '1d0c', '1ed6', '1ed4', '1ed5', '3dwj', '1dd7', '1dmi', '1dmj',
        '1dmk', '3nod', '5nse', '1vaf', '1vag', '8nse', '3e6t', '2oro',
        '3e7i', '3ej8', '1nod', '1m00', '1i83', '2bhj', '1noc', '3e67',
        '1mmw', '1mmv', '1m9t', '1jwj', '1jwk'],
    ('ARG', 'CZ', 3): [
        '1ed4', '1ed5', '3dwj', '1dd7', '1dmi', '1dmj', '1dmk', '3nod',
        '5nse', '1vaf', '1vag', '8nse', '3e6t', '2oro', '3e7i', '3ej8',
        '1nod', '1m00', '1i83', '2bhj', '1noc', '3e67', '1mmw', '1mmv',
        '1m9t', '1jwj', '1jwk', '2ors'],
    ('GLU', 'OE1', 3): [
        '1ed6', '1ed4', '1ed5', '3dwj', '1dmi', '1dmj', '1dmk', '3nod',
        '5nse', '1vaf', '1vag', '8nse', '3e6t', '3e7i', '3ej8', '1nod',
        '1m00', '1i83', '2bhj', '1noc', '3e67', '1mmw', '1mmv', '1m9t',
        '1jwj', '1jwk'],
    ('TRP', 'NE1', 3): [
        '1ed4', '1ed5', '3dwj', '1dd7', '1dmi', '1dmj', '1dmk', '3nod',
        '5nse', '1vaf', '1vag', '8nse', '3e6t', '2oro', '3e7i', '3ej8',
        '1nod', '1m00', '1i83', '2bhj', '1noc', '3e67', '1mmw', '1mmv',
        '1m9t', '1jwj', '1jwk', '2ors'],
    ('CYS', 'SG', 4): [
        '2ors', '2orr', '2orq', '1nos', '3e7m', '1dwv', '1dww', '1dwx',
        '4nse', '9nse', '7nse', '3ebf', '3ebd', '6nse', '1n2n', '1m8d',
        '1m8e', '1m8h', '1m8i', '3e6n', '3e6o', '3e6l', '1zzu', '1zzt',
        '1zzs', '1zzr', '1zzq', '1df1', '3e7g'],
    ('ARG', 'CZ', 4): [
        '2orr', '2orq', '1nos', '3e7m', '1dwv', '1dww', '1dwx', '4nse',
        '9nse', '7nse', '3ebf', '3ebd', '6nse', '1n2n', '1m8d', '1m8e',
        '1m8h', '1m8i', '3e6n', '3e6o', '3e6l', '1zzu', '1zzt', '1zzs',
        '1zzr', '1zzq', '1df1', '3e7g'],
    ('GLU', 'OE1', 4): [
        '2orq', '1nos', '3e7m', '1dwv', '1dww', '1dwx', '4nse', '9nse',
        '7nse', '3ebf', '3ebd', '6nse', '1n2n', '1m8d', '1m8e', '1m8h',
        '1m8i', '3e6n', '3e6o', '3e6l', '1zzu', '1zzt', '1zzs', '1zzr',
        '1zzq', '1df1', '3e7g'],
    ('TRP', 'NE1', 4): [
        '2orr', '2orq', '1nos', '3e7m', '1dwv', '1dww', '1dwx', '4nse',
        '9nse', '7nse', '3ebf', '3ebd', '6nse', '1n2n', '1m8d', '1m8e',
        '1m8h', '1m8i', '3e6n', '3e6o', '3e6l', '1zzu', '1zzt', '1zzs',
        '1zzr', '1zzq', '1df1', '3e7g'],
}


def _detect_results_path(results_dir, filename):
    """Return the path of *filename* inside ``<results_dir>/detect_results``."""
    return os.path.join(results_dir, 'detect_results', filename)


def _parse_prob_row(line):
    """Split one probability row into its fields.

    A row has the layout written by ``PDB_all_folds_NOS_predicted_prob``:
    ``pdb site_no x y z res chain res_no fold prob``.  Only the first field
    and the last five are used.

    Returns ``(pdb_id, res, chain, res_no, fold, prob_text)`` or ``None`` for
    blank lines and for the ``None`` placeholder rows written in step 3.
    ``prob_text`` is left as text so callers decide whether to parse it.
    """
    ele = line.split()
    if len(ele) < 5 or ele[1] == 'None':
        return None
    return ele[0], ele[-5], ele[-4], ele[-3], int(ele[-2]), ele[-1]


def _flag_is_set(text):
    """Interpret one textual "exists" column as a boolean."""
    return text.strip().lower() not in _FALSE_FLAGS


def _read_integrated(pos_or_neg, results_dir):
    """Collect the rows of all four ``integrate_*`` files, grouped by PDB id.

    Rows keep the order in which they are read: residue types in
    ``TARGET_SITES`` order, then file order.
    """
    rows_by_pdb = collections.defaultdict(list)
    for target_RES, target_ATOM in TARGET_SITES:
        trial = target_RES + "_" + target_ATOM + "_" + pos_or_neg
        path = _detect_results_path(results_dir, 'integrate_' + trial + '.txt')
        with open(path) as integrate_file:
            for line in integrate_file:
                ele = line.split()
                if ele:  # ignore blank lines instead of failing on ele[0]
                    rows_by_pdb[ele[0]].append(line)
    return rows_by_pdb


def PDB_all_folds_NOS_predicted_prob(target_RES, target_ATOM, pos_or_neg,
                                     data_dir=DEFAULT_DATA_DIR,
                                     results_dir=DEFAULT_RESULTS_DIR):
    """Write one text row per (site, fold) with the predicted probability.

    Sites are read, in order, from
    ``<data_dir>/ptf/<RES>.<ATOM>.<pos_or_neg>_detect.ptf``.  For every fold
    the arrays ``<results_dir>/detect_prob/<RES>.<ATOM>.<pos_or_neg>_prob_
    fold_<fold>_<i>.dat`` (``i`` = 0 .. number of matching feature files in
    ``<data_dir>/numpy/detect`` minus one) are concatenated along axis 0; row
    ``k`` of the result is paired with the ``k``-th ptf line.

    Output: ``<results_dir>/detect_results/PDB_prob_<RES>_<ATOM>_<pos_or_neg>
    _numpy.txt`` with tab-separated columns
    ``pdb site_no x y z res chain res_no fold prob``.

    Args:
        target_RES: residue name, e.g. ``'CYS'``.
        target_ATOM: atom name, e.g. ``'SG'``.
        pos_or_neg: ``'pos'`` or ``'neg'``.
        data_dir: root of the input data tree.
        results_dir: root of the results tree.

    Raises:
        ValueError: if no feature file matches the requested site type (the
            original code failed later with an unbound-variable error).
        IOError / OSError: if an input file or directory is missing.
    """
    site_id = target_RES + '.' + target_ATOM + '.' + pos_or_neg
    result_weights_ID = target_RES + "_" + target_ATOM + "_" + pos_or_neg

    with open(os.path.join(data_dir, 'ptf', site_id + '_detect.ptf')) as ptf_order:
        site_lines = list(ptf_order)

    # The number of feature chunks tells how many probability chunks exist
    # per fold.
    feature_dir = os.path.join(data_dir, 'numpy', 'detect')
    total_num = sum(
        1 for name in os.listdir(feature_dir)
        if site_id in name and '.dat' in name
        and os.path.isfile(os.path.join(feature_dir, name)))
    if total_num == 0:
        raise ValueError(
            "no '.dat' feature files for " + site_id + " in " + feature_dir)

    prob_dir = os.path.join(results_dir, 'detect_prob')
    all_prob = []
    for fold in range(NUM_FOLDS):
        chunks = [
            numpy.load(os.path.join(
                prob_dir,
                site_id + '_prob' + '_fold_' + str(fold) + '_' + str(i) + '.dat'))
            for i in range(total_num)]
        # A single chunk is used as-is, exactly like the original code did.
        if total_num == 1:
            all_prob.append(chunks[0])
        else:
            all_prob.append(numpy.concatenate(chunks, axis=0))

    seen_pdbs = set()
    site_no = 0
    out_path = _detect_results_path(
        results_dir, 'PDB_prob_' + result_weights_ID + '_numpy.txt')
    with open(out_path, 'w') as summary_file:
        for i, line in enumerate(site_lines):
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines (e.g. a trailing newline)
            # The last six characters of a ptf line hold the chain letter
            # followed by the residue number.
            chain_ID = line.strip('\n')[-6:]
            chain = chain_ID[0]
            res_no = chain_ID[1:]
            PDB_ID = fields[0]
            x_ = float(fields[1])
            y_ = float(fields[2])
            z_ = float(fields[3])
            # The site counter restarts the first time a PDB id is seen.
            if PDB_ID not in seen_pdbs:
                site_no = 0
                seen_pdbs.add(PDB_ID)
            for fold in range(NUM_FOLDS):
                probs_y = all_prob[fold][i]
                summary_file.write('\t'.join([
                    PDB_ID, str(site_no), str(x_), str(y_), str(z_),
                    target_RES, chain, str(res_no), str(fold),
                    str(probs_y)]) + '\n')
            site_no += 1


def write_summary_pos(target_RES, target_ATOM, pos_or_neg='pos',
                      results_dir=DEFAULT_RESULTS_DIR):
    """Keep, for each PDB listed in ``FOLD_PDB``, only the rows of its fold.

    Reads ``PDB_prob_<RES>_<ATOM>_<pos_or_neg>_numpy.txt`` and writes
    ``integrate_<RES>_<ATOM>_<pos_or_neg>.txt`` (both in
    ``<results_dir>/detect_results``).  Rows are emitted fold by fold, in the
    PDB order of ``FOLD_PDB``.  A PDB that has no row for its assigned fold
    contributes nothing (the original code raised ``KeyError``).

    Raises:
        KeyError: if ``(target_RES, target_ATOM, fold)`` is not in ``FOLD_PDB``.
    """
    trial = target_RES + "_" + target_ATOM + "_" + pos_or_neg

    rows = {}  # pdb_id -> {(fold, res): [line, ...]}
    with open(_detect_results_path(
            results_dir, 'PDB_prob_' + trial + '_numpy.txt')) as result_list:
        for line in result_list:
            parsed = _parse_prob_row(line)
            if parsed is None:
                continue
            pdb_id, res, _chain, _res_no, fold, _prob = parsed
            rows.setdefault(pdb_id, {}).setdefault((fold, res), []).append(line)

    with open(_detect_results_path(
            results_dir, 'integrate_' + trial + '.txt'), 'w') as integrate_file:
        for fold in range(NUM_FOLDS):
            for pdb_id in FOLD_PDB[(target_RES, target_ATOM, fold)]:
                per_pdb = rows.get(pdb_id, {})
                integrate_file.writelines(per_pdb.get((fold, target_RES), []))


def write_summary_neg(target_RES, target_ATOM, pos_or_neg='neg',
                      results_dir=DEFAULT_RESULTS_DIR):
    """Keep, for each site, the row of the fold with the highest probability.

    Reads ``PDB_prob_<RES>_<ATOM>_<pos_or_neg>_numpy.txt`` and writes
    ``integrate_<RES>_<ATOM>_<pos_or_neg>.txt``.  A site is identified by
    ``(pdb, chain, res, res_no)``; on ties the first row read wins.
    """
    trial = target_RES + "_" + target_ATOM + "_" + pos_or_neg

    # Example row: 3e7g 21 -20.54 28.77 26.97 CYS C 217 2 9.70189e-09
    rows = collections.OrderedDict()  # pdb_id -> {site key: [(prob, line)]}
    with open(_detect_results_path(
            results_dir, 'PDB_prob_' + trial + '_numpy.txt')) as result_list:
        for line in result_list:
            parsed = _parse_prob_row(line)
            if parsed is None:
                continue
            pdb_id, res, chain, res_no, _fold, prob_text = parsed
            per_pdb = rows.setdefault(pdb_id, collections.OrderedDict())
            per_pdb.setdefault((chain, res, res_no), []).append(
                (float(prob_text), line))

    with open(_detect_results_path(
            results_dir, 'integrate_' + trial + '.txt'), 'w') as integrate_file:
        for per_pdb in rows.values():
            for entries in per_pdb.values():
                # max() returns the first maximal element, matching the
                # original stable descending sort followed by [0].
                best = max(entries, key=lambda entry: entry[0])
                integrate_file.write(best[1])


def get_PDB_to_fold_pos_prob(pos_or_neg='pos',
                             data_dir=DEFAULT_DATA_DIR,
                             results_dir=DEFAULT_RESULTS_DIR):
    """Merge the four positive ``integrate_*`` files in PDB-list order.

    For every id in ``<data_dir>/train_pos_PDBs.txt`` the matching rows are
    written to ``results_pos_PDB_all_residues_test_fold.txt``.  An id without
    rows yields a bare ``None`` line; this historical format is kept
    byte-for-byte and ``final_stats_pos`` skips such lines.
    """
    rows_by_pdb = _read_integrated(pos_or_neg, results_dir)
    out_path = _detect_results_path(
        results_dir, 'results_pos_PDB_all_residues_test_fold.txt')
    with open(os.path.join(data_dir, 'train_pos_PDBs.txt')) as pos_PDB_list, \
            open(out_path, 'w') as outfile:
        for line in pos_PDB_list:
            pdb_id = line.strip('\n')
            outfile.writelines(rows_by_pdb.get(pdb_id, ['None' + '\n']))


def get_PDB_to_fold_neg_prob(pos_or_neg='neg',
                             data_dir=DEFAULT_DATA_DIR,
                             results_dir=DEFAULT_RESULTS_DIR):
    """Merge the four negative ``integrate_*`` files in PDB-list order.

    For every id in ``<data_dir>/test_neg_PDBs.txt`` the matching rows are
    written to ``results_neg_PDB_all_residues_max_fold.txt``.  An id without
    rows yields ``<pdb_id>\\tNone``.
    """
    rows_by_pdb = _read_integrated(pos_or_neg, results_dir)
    out_path = _detect_results_path(
        results_dir, 'results_neg_PDB_all_residues_max_fold.txt')
    with open(os.path.join(data_dir, 'test_neg_PDBs.txt')) as neg_PDB_list, \
            open(out_path, 'w') as outfile:
        for line in neg_PDB_list:
            pdb_id = line.strip('\n').strip(' ')
            outfile.writelines(
                rows_by_pdb.get(pdb_id, [pdb_id + '\t' + 'None' + '\n']))


def final_stats_pos(thres=0.5,
                    data_dir=DEFAULT_DATA_DIR,
                    results_dir=DEFAULT_RESULTS_DIR):
    """Compare thresholded detections in positive structures with true sites.

    Inputs (``detect_results`` lives under *results_dir*, ``ptf`` under
    *data_dir*):

    * ``detect_results/pdb_residue_site_exist.txt`` - columns
      ``pdb CYS ARG TRP GLU``; used to pre-register residue types per PDB.
    * ``detect_results/results_pos_PDB_all_residues_test_fold.txt`` - rows
      from step 3; a site counts as detected when its probability is
      strictly greater than *thres*.
    * ``ptf/<RES>.<ATOM>.pos_train.ptf`` - true sites; the last three
      whitespace-separated fields are residue, chain and residue number.

    Outputs (in ``detect_results``):

    * ``detection_summary_true_sites.txt`` - every true-site ptf line with a
      trailing ``True``/``False`` column telling whether it was detected.
    * ``detection_summary_fp_sites_pos_pdb.txt`` - one line
      ``pdb<TAB>res<TAB>chain<TAB>res_no`` per detected site that is not a
      true site, or a single "No false positive ..." message.

    Fixes over the original: the false-positive line is built from the
    (chain, res_no) tuple's fields instead of concatenating the tuple itself
    (``TypeError``); ``None`` placeholder rows and PDBs/residues missing from
    any of the lookup tables no longer raise.
    """
    detected_dict = collections.OrderedDict()  # pdb -> {res: [(chain, res_no)]}

    with open(_detect_results_path(
            results_dir, 'pdb_residue_site_exist.txt')) as pdb_residue_site_file:
        for line in pdb_residue_site_file:
            ele = line.split()
            if not ele:
                continue
            per_pdb = detected_dict.setdefault(ele[0], collections.OrderedDict())
            # The original tested the raw column text, which is always truthy.
            for res, flag in zip(('cys', 'arg', 'trp', 'glu'), ele[1:5]):
                if _flag_is_set(flag):
                    per_pdb[res] = []

    with open(_detect_results_path(
            results_dir,
            'results_pos_PDB_all_residues_test_fold.txt')) as pos_file:
        for line in pos_file:
            parsed = _parse_prob_row(line)
            if parsed is None:
                continue  # 'None' placeholder written by step 3, or blank
            pdb_id, res, chain, res_no, _fold, prob_text = parsed
            if float(prob_text) > thres:
                per_pdb = detected_dict.setdefault(
                    pdb_id, collections.OrderedDict())
                per_pdb.setdefault(res.lower(), []).append((chain, res_no))

    true_site_dict = {}  # pdb -> {res: [(chain, res_no)]}
    report_dict = collections.OrderedDict()  # pdb -> [report line, ...]

    # "detection_summary_true_sites.txt"
    #   for every annotated true site: detected (True) or not (False)
    # "detection_summary_fp_sites_pos_pdb.txt"
    #   detected sites in positive structures that are not true sites
    true_site_path = _detect_results_path(
        results_dir, 'detection_summary_true_sites.txt')
    fp_site_path = _detect_results_path(
        results_dir, 'detection_summary_fp_sites_pos_pdb.txt')
    print("writing summary files into ")
    print(true_site_path)
    print(fp_site_path)

    for site_id in ('CYS.SG', 'ARG.CZ', 'TRP.NE1', 'GLU.OE1'):
        ptf_name = os.path.join(data_dir, 'ptf', site_id + '.pos_train.ptf')
        with open(ptf_name) as ptf_file:
            for line in ptf_file:
                ele = line.split()
                if not ele:
                    continue
                pdb_id = ele[0]
                res = ele[-3].lower()
                site = (ele[-2], ele[-1])
                true_site_dict.setdefault(pdb_id, {}).setdefault(
                    res, []).append(site)
                detected = site in detected_dict.get(pdb_id, {}).get(res, ())
                verdict = 'True' if detected else 'False'
                report_dict.setdefault(pdb_id, []).append(
                    line.strip('\n') + '\t' + verdict + '\n')

    with open(true_site_path, 'w') as detect_true_site:
        for entries in report_dict.values():
            detect_true_site.writelines(entries)

    fp_site_count = 0
    with open(fp_site_path, 'w') as detect_fp_site:
        for pdb_id, per_pdb in detected_dict.items():
            for res, pos_pred in per_pdb.items():
                known = true_site_dict.get(pdb_id, {}).get(res, ())
                for chain, res_no in pos_pred:
                    if (chain, res_no) not in known:
                        detect_fp_site.write(
                            '\t'.join([pdb_id, res, chain, res_no]) + '\n')
                        fp_site_count += 1
        if fp_site_count == 0:
            detect_fp_site.write(
                "No false positive site detected in positive NOS structures"
                + '\n')


def final_stats_neg(thres=0.5, results_dir=DEFAULT_RESULTS_DIR):
    """List detections (probability > *thres*) in negative structures.

    Reads ``detect_results/results_neg_PDB_all_residues_max_fold.txt`` and
    writes ``detect_results/detection_summary_fp_sites_neg_pdb.txt`` with one
    line ``pdb<TAB>res<TAB>chain<TAB>res_no<TAB>prob`` per detection, or a
    single "No false positive ..." message.

    The original concatenated a ``(prob, line)`` tuple to a string, which
    raised ``TypeError`` on the first detection; the fields are now written
    explicitly.  Blank lines and ``<pdb>\\tNone`` placeholders are skipped.
    """
    fp_site_path = _detect_results_path(
        results_dir, 'detection_summary_fp_sites_neg_pdb.txt')
    print("writing summary files into ")
    print(fp_site_path)

    detected_dict = collections.OrderedDict()  # pdb -> {res: [(chain, no, prob)]}
    with open(_detect_results_path(
            results_dir,
            'results_neg_PDB_all_residues_max_fold.txt')) as neg_file:
        for line in neg_file:
            parsed = _parse_prob_row(line)
            if parsed is None:
                continue
            pdb_id, res, chain, res_no, _fold, prob_text = parsed
            if float(prob_text) > thres:
                per_pdb = detected_dict.setdefault(
                    pdb_id, collections.OrderedDict())
                per_pdb.setdefault(res, []).append((chain, res_no, prob_text))

    fp_site_count = 0
    with open(fp_site_path, 'w') as detect_fp_site:
        for pdb_id, per_pdb in detected_dict.items():
            for res, pos_pred in per_pdb.items():
                for chain, res_no, prob_text in pos_pred:
                    detect_fp_site.write('\t'.join(
                        [pdb_id, res, chain, res_no, prob_text]) + '\n')
                    fp_site_count += 1
        if fp_site_count == 0:
            detect_fp_site.write(
                "No false positive site detected in negative structures" + '\n')


def main():
    """Run the four post-processing steps with the default directories."""
    # STEP 1: per-(site, fold) probability tables, positives first.
    for pos_or_neg in ('pos', 'neg'):
        for target_RES, target_ATOM in TARGET_SITES:
            PDB_all_folds_NOS_predicted_prob(target_RES, target_ATOM, pos_or_neg)

    # STEP 2: one integrated file per residue type.
    for target_RES, target_ATOM in TARGET_SITES:
        write_summary_pos(target_RES, target_ATOM)
        write_summary_neg(target_RES, target_ATOM)

    # STEP 3: merge residue types in PDB-list order.
    get_PDB_to_fold_pos_prob()
    get_PDB_to_fold_neg_prob()

    # STEP 4: thresholded detection summaries.
    final_stats_pos()
    final_stats_neg()


if __name__ == '__main__':
    main()