def site_AUC(): pos_file=open('../results/detect_results/results_pos_PDB_all_residues_test_fold.txt') neg_file=open('../results/detect_results/results_neg_PDB_all_residues_max_fold.txt') count_file = open('../results/detect_results/pdb_residue_site_exist.txt') results_dict = collections.defaultdict(dict) for line in count_file: ele = line.split() pdb_id = ele[0] CYS_exist = ele[1] ARG_exist = ele[2] TRP_exist = ele[3] GLU_exist = ele[4] if CYS_exist: results_dict[pdb_id]['cys']=[] if ARG_exist: results_dict[pdb_id]['arg']=[] if TRP_exist: results_dict[pdb_id]['trp']=[] if GLU_exist: results_dict[pdb_id]['glu']=[] true_dict=collections.defaultdict(dict) true_set = set() for ptf_name in ['../data/ptf/CYS.SG.pos_train.ptf','../data/ptf/ARG.CZ.pos_train.ptf','../data/ptf/TRP.NE1.pos_train.ptf','../data/ptf/GLU.OE1.pos_train.ptf']: ptf_file = open(ptf_name) for line in ptf_file: ele = line.split() pdb_id = ele[0] res = ele[-3].lower() chain = ele[-2] res_no = ele[-1] if res not in true_dict[pdb_id]: true_dict[pdb_id][res]=[] true_dict[pdb_id][res].append((chain,res_no)) true_set.add((pdb_id,res,chain,res_no)) y_pred = [] y_true = [] pos_set= set() for line in pos_file: ele = line.split() pdb_id = ele[0] res = ele[-5].lower() chain = ele[-4] res_no = ele[-3] fold = int(ele[-2]) prob = float(ele[-1]) if pdb_id in true_dict.keys(): if res in true_dict[pdb_id]: if (chain,res_no) in true_dict[pdb_id][res]: y_true.append(1) y_pred.append(prob) pos_set.add((pdb_id,res,chain,res_no)) else: y_true.append(0) y_pred.append(prob) else: y_true.append(0) y_pred.append(prob) for line in neg_file: ele = line.split() pdb_id = ele[0] res = ele[-5].lower() chain = ele[-4] res_no = ele[-3] fold = int(ele[-2]) prob = float(ele[-1]) y_true.append(0) y_pred.append(prob) y_true = numpy.array(y_true) y_pred = numpy.array(y_pred) from sklearn import metrics from sklearn.metrics import roc_auc_score fpr, tpr, thresholds = metrics.roc_curve(y_true, y_pred) S_1 = metrics.auc(fpr, tpr) print "residue site level summary:" print "number of total residue sites" print y_true.shape[0] print "number of positive sites:" print len(numpy.where(y_true)[0]) print "AUC" print S_1 def PDB_AUC(): pos_file=open('../results/detect_results/results_pos_PDB_all_residues_test_fold.txt') neg_file=open('../results/detect_results/results_neg_PDB_all_residues_max_fold.txt') count_file = open('../results/detect_results/pdb_residue_site_exist.txt') true_dict=collections.defaultdict(dict) true_set = set() for ptf_name in ['../data/ptf/CYS.SG.pos_train.ptf','../data/ptf/ARG.CZ.pos_train.ptf','../data/ptf/TRP.NE1.pos_train.ptf','../data/ptf/GLU.OE1.pos_train.ptf']: ptf_file = open(ptf_name) for line in ptf_file: ele = line.split() pdb_id = ele[0] res = ele[-3].lower() chain = ele[-2] res_no = ele[-1] if res not in true_dict[pdb_id]: true_dict[pdb_id][res]=[] true_dict[pdb_id][res].append((chain,res_no)) true_set.add((pdb_id,res,chain,res_no)) y_pred = [] y_true = [] pos_set= set() pos_dict={} neg_dict={} for line in pos_file: ele = line.split() pdb_id = ele[0] res = ele[-5].lower() chain = ele[-4] res_no = ele[-3] fold = int(ele[-2]) prob = float(ele[-1]) if pdb_id not in pos_dict: pos_dict[pdb_id]=[] pos_dict[pdb_id].append((res,chain,res_no,prob)) for line in neg_file: ele = line.split() pdb_id = ele[0] res = ele[-5].lower() chain = ele[-4] res_no = ele[-3] fold = int(ele[-2]) prob = float(ele[-1]) if pdb_id not in neg_dict: neg_dict[pdb_id]=[] neg_dict[pdb_id].append((res,chain,res_no,prob)) for pdb_id in pos_dict: entries = pos_dict[pdb_id] entries.sort(key=lambda x: x[3], reverse=True) max_prob = entries[0][3] res = entries[0][0] chain,res_no = entries[0][1],entries[0][2] if pdb_id in true_dict.keys(): if res in true_dict[pdb_id]: if (chain,res_no) in true_dict[pdb_id][res]: y_true.append(1) y_pred.append(max_prob) else: # pdb_id, chain,res_no is wrong y_true.append(1) y_pred.append(0.0) else: # pdb_id, chain,res_no is wrong y_true.append(1) y_pred.append(0.0) for pdb_id in neg_dict: entries = neg_dict[pdb_id] entries.sort(key=lambda x: x[3], reverse=True) max_prob = entries[0][3] res = entries[0][0] chain,res_no = entries[0][1],entries[0][2] y_true.append(0) y_pred.append(max_prob) y_true = numpy.array(y_true) y_pred = numpy.array(y_pred) from sklearn import metrics from sklearn.metrics import roc_auc_score fpr, tpr, thresholds = metrics.roc_curve(y_true, y_pred) S_1 = metrics.auc(fpr, tpr) print "PDB_level_summary:" print "total PDBs" print y_true.shape[0] print "number of positive PDBs" print len(numpy.where(y_true)[0]) print "AUC" print S_1 import sys import os import numpy total_fold=5 import collections site_AUC() PDB_AUC()