import os
import sys
import time
import numpy
import re
import math
import collections
from collections import OrderedDict
import random
from sets import Set

def PDB_all_folds_NOS_predicted_prob(target_RES,target_ATOM,pos_or_neg):
	"""Dump the per-fold predicted probability of every listed site to a text file.

	Inputs (all relative to the current working directory):

	* ``../data/ptf/<RES>.<ATOM>.<pos_or_neg>_detect.ptf`` -- one site per line.
	  The first four whitespace-separated fields are read as PDB id, x, y, z;
	  the last six characters of the line are read as chain (1 character)
	  followed by the residue number.
	* ``../results/detect_prob/<RES>.<ATOM>.<pos_or_neg>_prob_fold_<fold>_<i>.dat``
	  -- probability chunks loaded with ``numpy.load`` for folds 0..4.  The
	  number of chunks per fold is the number of files under
	  ``../data/numpy/detect`` whose path contains ``<RES>.<ATOM>.<pos_or_neg>``
	  and ``.dat``.

	Output: ``../results/detect_results/PDB_prob_<RES>_<ATOM>_<pos_or_neg>_numpy.txt``
	with one tab-separated row per (site, fold):
	PDB id, site number within the PDB, x, y, z, residue, chain, residue number,
	fold, probability.  Row ``i`` of the concatenated chunks of a fold is paired
	with line ``i`` of the ptf file.

	Args:
		target_RES: residue name used in the file names (e.g. 'CYS').
		target_ATOM: atom name used in the file names (e.g. 'SG').
		pos_or_neg: 'pos' or 'neg', used in the file names.

	Raises:
		ValueError: if no chunk file matches in ``../data/numpy/detect``
			(there would be nothing to concatenate).
	"""
	ID = target_RES+'.'+target_ATOM+'.'+pos_or_neg
	result_weights_ID = target_RES+"_"+target_ATOM+"_"+pos_or_neg
	total_fold = 5

	with open('../data/ptf/'+ID+'_detect.ptf') as ptf_order:
		dict_list = list(ptf_order)

	# The number of input chunks tells us how many probability chunks exist per fold.
	files = [os.path.join('../data/numpy/detect/',f) for f in os.listdir('../data/numpy/detect')]
	files = [t for t in files if os.path.isfile(t) and ID in t and '.dat' in t]
	total_num = len(files)
	if total_num == 0:
		raise ValueError('no .dat chunk for '+ID+' found in ../data/numpy/detect')

	# all_prob[fold] holds the probabilities of every site for that fold.
	all_prob = []
	for fold in range(total_fold):
		chunks = [
			numpy.load('../results/detect_prob/'+ID+'_prob'+'_fold_'+str(fold)+'_'+str(i)+'.dat')
			for i in range(total_num)
		]
		# Concatenate once per fold instead of growing the array chunk by chunk.
		all_prob.append(numpy.concatenate(chunks,axis=0))

	with open('../results/detect_results/'+'PDB_prob_'+result_weights_ID+'_numpy.txt','w') as summary_file:
		pdb_set = set()
		site_no = 0
		for i, line in enumerate(dict_list):
			# Chain and residue number are taken from the fixed-width tail of the line.
			chain_ID = line.strip('\n')[-6:]
			chain = chain_ID[0]
			res_no = chain_ID[1:]
			S = line.split()
			PDB_ID = S[0]
			x_ = float(S[1])
			y_ = float(S[2])
			z_ = float(S[3])

			# Site numbering restarts the first time a PDB id is seen.
			if PDB_ID not in pdb_set:
				site_no = 0
				pdb_set.add(PDB_ID)

			for fold in range(total_fold):
				probs_y = all_prob[fold][i]
				fields = [PDB_ID,str(site_no),str(x_),str(y_),str(z_),target_RES,chain,str(res_no),str(fold),str(probs_y)]
				summary_file.write('\t'.join(fields)+'\n')
			site_no += 1

def write_summary_pos(target_RES,target_ATOM,pos_or_neg='pos'):
	"""Keep, for every positive structure, only the rows of its assigned fold.

	Reads ``../results/detect_results/PDB_prob_<RES>_<ATOM>_<pos_or_neg>_numpy.txt``
	(one row per site and fold; residue, chain, residue number, fold and
	probability are the last five whitespace-separated fields) and writes
	``../results/detect_results/integrate_<RES>_<ATOM>_<pos_or_neg>.txt``.

	``fold_PDB`` maps (residue, atom, fold) to a list of PDB ids.  For each fold
	0..4, in the order of that list, the rows of the listed structure that carry
	the same fold number and residue are copied verbatim.  Structures absent
	from the input, or without rows for that fold, contribute nothing.

	Args:
		target_RES: residue name, first component of the ``fold_PDB`` key.
		target_ATOM: atom name, second component of the ``fold_PDB`` key.
		pos_or_neg: suffix used in the file names (default 'pos').

	Raises:
		KeyError: if (target_RES, target_ATOM) has no entry in ``fold_PDB``.
	"""

	fold_PDB={
	('CYS','SG',0):['3hsn', '3hso', '3hsp', '1q2o', '1tll', '3b3p', '2hx2', '2hx3', '1f20', '2hx4', '2g6k', '2g6j', '2g6i', '2g6h', '2g6o', '2g6n', '2g6m', '2g6l', '3e65', '3e7t', '3e68', '2ort', '1fop', '2orp', '3fc5', '1p6i', '1p6h'],
	('ARG','CZ',0):['3hsn', '3hso', '3hsp', '1q2o', '3b3p', '2hx2', '2hx3', '2hx4', '2g6k', '2g6j', '2g6i', '2g6h', '2g6o', '2g6n', '2g6m', '2g6l', '3e65', '3e7t', '3e68', '2ort', '1fop', '2orp', '3fc5', '1p6i', '1p6h', '1p6k', '1p6j', '1p6m'],
	('GLU','OE1',0):['3hsn', '3hso', '3hsp', '1q2o', '3b3p', '2hx2', '2hx3', '2hx4', '2g6k', '2g6j', '2g6i', '2g6h', '2g6o', '2g6n', '2g6m', '2g6l', '3e65', '3e7t', '3e68', '1fop', '2orp', '3fc5', '1p6i', '1p6h', '1p6k', '1p6j', '1p6m'],
	('TRP','NE1',0):['3hsn', '3hso', '3hsp', '1q2o', '3b3p', '2hx2', '2hx3', '2hx4', '2g6k', '2g6j', '2g6i', '2g6h', '2g6o', '2g6n', '2g6m', '2g6l', '3e65', '3e7t', '3e68', '2ort', '1fop', '2orp', '3fc5', '1p6i', '1p6h', '1p6k', '1p6j', '1p6m'],
	
	('CYS','SG',1):['1p6k', '1p6j', '1p6m', '1p6l', '1p6n', '1fol', '1foo', '2nsi', '1foi', '1foj', '1d1x', '1d1y', '1d1v', '1d1w', '1qwc', '4nos', '2nse', '2nos', '1lzx', '1lzz', '3b3n', '3b3o', '3b3m', '1qom', '2nod', '1qw6', '1qw5'],
	('ARG','CZ',1):['1p6l', '1p6n', '1fol', '1foo', '2nsi', '1foi', '1foj', '1d1x', '1d1y', '1d1v', '1d1w', '1qwc', '4nos', '2nse', '2nos', '1lzx', '1lzz', '3b3n', '3b3o', '3b3m', '1qom', '2nod', '1qw6', '1qw5', '1qw4', '1zvi'],
	('GLU','OE1',1):['1p6l', '1p6n', '1fol', '1foo', '2nsi', '1foi', '1foj', '1d1x', '1d1y', '1d1v', '1d1w', '1qwc', '4nos', '2nse', '2nos', '1lzx', '1lzz', '3b3n', '3b3o', '3b3m', '1qom', '2nod', '1qw6', '1qw5', '1qw4'],
	('TRP','NE1',1):['1p6l', '1p6n', '1fol', '1foo', '2nsi', '1foi', '1foj', '1d1x', '1d1y', '1d1v', '1d1w', '1qwc', '4nos', '2nse', '2nos', '1lzx', '1lzz', '3b3n', '3b3o', '3b3m', '1qom', '2nod', '1qw6', '1qw5', '1qw4', '1zvi'],
	
	('CYS','SG',2):['1qw4', '1zvi', '1zvl', '3nse', '1nsi', '3dqt', '3dqr', '3dqs', '1nse', '1om5', '1om4', '1r35', '3e7s', '1dm8', '1dm6', '1dm7', '1k2r', '1k2s', '1k2t', '1k2u', '1rs6', '1rs7', '1rs8', '1rs9', '3eai', '1d0o'],
	('ARG','CZ',2):['1zvl', '3nse', '1nsi', '3dqt', '3dqr', '3dqs', '1nse', '1om5', '1om4', '1r35', '3e7s', '1dm8', '1dm6', '1dm7', '1k2r', '1k2s', '1k2t', '1k2u', '1rs6', '1rs7', '1rs8', '1rs9', '3eai', '1d0o', '1d0c', '1ed6'],
	('GLU','OE1',2):['1zvi', '1zvl', '3nse', '1nsi', '3dqt', '3dqr', '3dqs', '1nse', '1om5', '1om4', '1r35', '3e7s', '1dm8', '1dm6', '1dm7', '1k2r', '1k2s', '1k2t', '1k2u', '1rs6', '1rs7', '1rs8', '1rs9', '3eai', '1d0o', '1d0c'],
	('TRP','NE1',2):['1zvl', '3nse', '1nsi', '3dqt', '3dqr', '3dqs', '1nse', '1om5', '1om4', '1r35', '3e7s', '1dm8', '1dm6', '1dm7', '1k2r', '1k2s', '1k2t', '1k2u', '1rs6', '1rs7', '1rs8', '1rs9', '3eai', '1d0o', '1d0c', '1ed6'],
	
	('CYS','SG',3):['1d0c', '1ed6', '1ed4', '1ed5', '3dwj', '1dd7', '1dmi', '1dmj', '1dmk', '3nod', '5nse', '1vaf', '1vag', '8nse', '3e6t', '2oro', '3e7i', '3ej8', '1nod', '1m00', '1i83', '2bhj', '1noc', '3e67', '1mmw', '1mmv', '1m9t', '1jwj', '1jwk'],
	('ARG','CZ',3):['1ed4', '1ed5', '3dwj', '1dd7', '1dmi', '1dmj', '1dmk', '3nod', '5nse', '1vaf', '1vag', '8nse', '3e6t', '2oro', '3e7i', '3ej8', '1nod', '1m00', '1i83', '2bhj', '1noc', '3e67', '1mmw', '1mmv', '1m9t', '1jwj', '1jwk', '2ors'],
	('GLU','OE1',3):['1ed6', '1ed4', '1ed5', '3dwj', '1dmi', '1dmj', '1dmk', '3nod', '5nse', '1vaf', '1vag', '8nse', '3e6t', '3e7i', '3ej8', '1nod', '1m00', '1i83', '2bhj', '1noc', '3e67', '1mmw', '1mmv', '1m9t', '1jwj', '1jwk'],
	('TRP','NE1',3):['1ed4', '1ed5', '3dwj', '1dd7', '1dmi', '1dmj', '1dmk', '3nod', '5nse', '1vaf', '1vag', '8nse', '3e6t', '2oro', '3e7i', '3ej8', '1nod', '1m00', '1i83', '2bhj', '1noc', '3e67', '1mmw', '1mmv', '1m9t', '1jwj', '1jwk', '2ors'],
	
	('CYS','SG',4):['2ors', '2orr', '2orq', '1nos', '3e7m', '1dwv', '1dww', '1dwx', '4nse', '9nse', '7nse', '3ebf', '3ebd', '6nse', '1n2n', '1m8d', '1m8e', '1m8h', '1m8i', '3e6n', '3e6o', '3e6l', '1zzu', '1zzt', '1zzs', '1zzr', '1zzq', '1df1', '3e7g'],
	('ARG','CZ',4):['2orr', '2orq', '1nos', '3e7m', '1dwv', '1dww', '1dwx', '4nse', '9nse', '7nse', '3ebf', '3ebd', '6nse', '1n2n', '1m8d', '1m8e', '1m8h', '1m8i', '3e6n', '3e6o', '3e6l', '1zzu', '1zzt', '1zzs', '1zzr', '1zzq', '1df1', '3e7g'],
	('GLU','OE1',4):['2orq', '1nos', '3e7m', '1dwv', '1dww', '1dwx', '4nse', '9nse', '7nse', '3ebf', '3ebd', '6nse', '1n2n', '1m8d', '1m8e', '1m8h', '1m8i', '3e6n', '3e6o', '3e6l', '1zzu', '1zzt', '1zzs', '1zzr', '1zzq', '1df1', '3e7g'],
	('TRP','NE1',4):['2orr', '2orq', '1nos', '3e7m', '1dwv', '1dww', '1dwx', '4nse', '9nse', '7nse', '3ebf', '3ebd', '6nse', '1n2n', '1m8d', '1m8e', '1m8h', '1m8i', '3e6n', '3e6o', '3e6l', '1zzu', '1zzt', '1zzs', '1zzr', '1zzq', '1df1', '3e7g']
	}


	trial = target_RES+"_"+target_ATOM+"_"+pos_or_neg
	results_dir = '../results/detect_results/'
	total_fold = 5

	# results_dict[pdb_id][(fold, residue)] -> raw rows, in input order
	results_dict = collections.defaultdict(dict)

	with open(results_dir+'PDB_prob_'+trial+'_numpy.txt') as result_list, \
			open(results_dir+'integrate_'+trial+'.txt','w') as integrate_file:
		for line in result_list:
			ele = line.split()
			if not ele:
				# tolerate blank lines
				continue
			pdb_id = ele[0]
			res = ele[-5]
			fold = int(ele[-2])
			results_dict[pdb_id].setdefault((fold,res),[]).append(line)

		for fold in range(total_fold):
			for pdb_id in fold_PDB[(target_RES,target_ATOM,fold)]:
				if pdb_id in results_dict:
					# A structure may have rows but none for its assigned fold;
					# .get avoids a KeyError in that case.
					integrate_file.writelines(results_dict[pdb_id].get((fold,target_RES),[]))

def write_summary_neg(target_RES,target_ATOM,pos_or_neg='neg'):
	"""Keep, for every site of every structure, the row with the highest probability.

	Reads ``../results/detect_results/PDB_prob_<RES>_<ATOM>_<pos_or_neg>_numpy.txt``
	and writes ``../results/detect_results/integrate_<RES>_<ATOM>_<pos_or_neg>.txt``.

	Example input row (residue, chain, residue number, fold and probability are
	the last five whitespace-separated fields):

	    3e7g    21  -20.54  28.77   26.97   CYS C   217 2   9.70189e-09

	A site is identified by (chain, residue, residue number) within a PDB id.
	Among the rows of one site (one per fold) the row with the largest
	probability is copied verbatim; on ties the first row read wins.  Output is
	grouped by PDB id, in order of first appearance in the input.

	Args:
		target_RES: residue name used in the file names.
		target_ATOM: atom name used in the file names.
		pos_or_neg: suffix used in the file names (default 'neg').
	"""
	trial = target_RES+"_"+target_ATOM+"_"+pos_or_neg
	results_dir = '../results/detect_results/'

	# best[pdb_id][(chain,res,res_no)] -> (prob, line) of the best row seen so far.
	# OrderedDict keeps the output order deterministic on every Python version.
	best = collections.OrderedDict()

	with open(results_dir+'PDB_prob_'+trial+'_numpy.txt') as result_list, \
			open(results_dir+'integrate_'+trial+'.txt','w') as integrate_file:
		for line in result_list:
			ele = line.split()
			if not ele:
				# tolerate blank lines
				continue
			pdb_id = ele[0]
			site = (ele[-4],ele[-5],ele[-3])
			prob = float(ele[-1])
			per_pdb = best.setdefault(pdb_id,collections.OrderedDict())
			# Strict '>' keeps the first row among equal probabilities.
			if site not in per_pdb or prob > per_pdb[site][0]:
				per_pdb[site] = (prob,line)

		for per_pdb in best.values():
			for _, best_line in per_pdb.values():
				integrate_file.write(best_line)


def get_PDB_to_fold_pos_prob(pos_or_neg = 'pos'):
	"""Merge the four per-residue integrate files into one file ordered by PDB list.

	Reads ``../results/detect_results/integrate_<RES>_<ATOM>_<pos_or_neg>.txt``
	for CYS/SG, ARG/CZ, GLU/OE1 and TRP/NE1 (in that order) and groups the rows
	by their first field (the PDB id).  Then, for every PDB id listed in
	``../data/train_pos_PDBs.txt`` (one id per line), the grouped rows are
	written verbatim to
	``../results/detect_results/results_pos_PDB_all_residues_test_fold.txt``.
	A listed id without any row produces the single placeholder line ``None``.

	Blank lines in any input are ignored, and surrounding whitespace of a listed
	id is stripped before the lookup.

	Args:
		pos_or_neg: suffix used in the integrate file names (default 'pos').
	"""
	results_dir = '../results/detect_results/'

	# inte_dict[pdb_id] -> rows of that structure across all residue types
	inte_dict = collections.defaultdict(list)
	for target_RES, target_ATOM in [('CYS','SG'),('ARG','CZ'),('GLU','OE1'),('TRP','NE1')]:
		trial = target_RES+"_"+target_ATOM+"_"+pos_or_neg
		with open(results_dir+'integrate_'+trial+'.txt') as integrate_file:
			for line in integrate_file:
				ele = line.split()
				if ele:
					inte_dict[ele[0]].append(line)

	with open('../data/train_pos_PDBs.txt') as pos_PDB_list, \
			open(results_dir+'results_pos_PDB_all_residues_test_fold.txt','w') as outfile:
		for line in pos_PDB_list:
			pdb_id = line.strip()
			if not pdb_id:
				continue
			# .get does not insert missing ids into the defaultdict.
			outfile.writelines(inte_dict.get(pdb_id,['None'+'\n']))


def get_PDB_to_fold_neg_prob(pos_or_neg = 'neg'):
	"""Merge the four per-residue integrate files into one file ordered by PDB list.

	Reads ``../results/detect_results/integrate_<RES>_<ATOM>_<pos_or_neg>.txt``
	for CYS/SG, ARG/CZ, GLU/OE1 and TRP/NE1 (in that order) and groups the rows
	by their first field (the PDB id).  Then, for every PDB id listed in
	``../data/test_neg_PDBs.txt`` (one id per line), the grouped rows are written
	verbatim to
	``../results/detect_results/results_neg_PDB_all_residues_max_fold.txt``.
	A listed id without any row produces the placeholder line ``<pdb_id>\\tNone``.

	Blank lines in any input are ignored; a blank entry in the PDB list would
	otherwise yield a placeholder row with an empty id.

	Args:
		pos_or_neg: suffix used in the integrate file names (default 'neg').
	"""
	results_dir = '../results/detect_results/'

	# inte_dict[pdb_id] -> rows of that structure across all residue types
	inte_dict = collections.defaultdict(list)
	for target_RES, target_ATOM in [('CYS','SG'),('ARG','CZ'),('GLU','OE1'),('TRP','NE1')]:
		trial = target_RES+"_"+target_ATOM+"_"+pos_or_neg
		with open(results_dir+'integrate_'+trial+'.txt') as integrate_file:
			for line in integrate_file:
				ele = line.split()
				if ele:
					inte_dict[ele[0]].append(line)

	with open('../data/test_neg_PDBs.txt') as neg_PDB_list, \
			open(results_dir+'results_neg_PDB_all_residues_max_fold.txt','w') as outfile:
		for line in neg_PDB_list:
			pdb_id = line.strip()
			if not pdb_id:
				continue
			# .get does not insert missing ids into the defaultdict.
			outfile.writelines(inte_dict.get(pdb_id,[pdb_id+'\t'+'None'+'\n']))

	
def final_stats_pos(thres=0.5):
	"""Summarise detections in the positive structures against the annotated sites.

	Inputs:

	* ``../results/detect_results/pdb_residue_site_exist.txt`` -- first field is a
	  PDB id.  Every listed id is registered with an empty detection list for
	  'cys', 'arg', 'trp' and 'glu'.
	* ``../results/detect_results/results_pos_PDB_all_residues_test_fold.txt`` --
	  prediction rows (residue, chain, residue number, fold, probability are the
	  last five fields).  A row counts as detected when probability > thres.
	  ``None`` placeholder rows and blank lines are skipped.
	* ``../data/ptf/{CYS.SG,ARG.CZ,TRP.NE1,GLU.OE1}.pos_train.ptf`` -- annotated
	  true sites (first field PDB id; last three fields residue, chain,
	  residue number).

	Outputs:

	* "detection_summary_true_sites.txt"
	  summarize detect (True) or not detected (False) of all the annotated CSA
	  true sites by our models: each ptf line with ``\\tTrue`` / ``\\tFalse``
	  appended.
	* "detection_summary_fp_sites_pos_pdb.txt"
	  summarize any detected false positive sites (if any) in positive NOS
	  structures, one ``pdb_id, residue, chain, residue number`` row per site,
	  or a single message line when there is none.

	Args:
		thres: probability threshold above which a site counts as detected.
	"""
	results_dir = '../results/detect_results/'

	# detected_dict[pdb_id][res] -> list of (chain, res_no) predicted above thres
	detected_dict = collections.OrderedDict()

	with open(results_dir+'pdb_residue_site_exist.txt') as pdb_residue_site_file:
		for line in pdb_residue_site_file:
			ele = line.split()
			if not ele:
				continue
			per_pdb = detected_dict.setdefault(ele[0],collections.OrderedDict())
			# NOTE(review): the per-residue existence flags in columns 1-4 are
			# non-empty tokens and therefore always truthy; their encoding is
			# not defined in this file, so they are not interpreted and every
			# residue type is registered.  The lists start empty, so this does
			# not influence the reports.
			for res in ('cys','arg','trp','glu'):
				per_pdb.setdefault(res,[])

	with open(results_dir+'results_pos_PDB_all_residues_test_fold.txt') as pos_file:
		for line in pos_file:
			ele = line.split()
			if not ele or 'None' in ele[:2]:
				# blank line or "no result" placeholder row
				continue
			pdb_id = ele[0]
			res = ele[-5].lower()
			chain = ele[-4]
			res_no = ele[-3]
			prob = float(ele[-1])
			if prob>thres:
				per_pdb = detected_dict.setdefault(pdb_id,collections.OrderedDict())
				per_pdb.setdefault(res,[]).append((chain,res_no))

	# true_site_dict[pdb_id][res] -> annotated (chain, res_no) sites
	true_site_dict = collections.defaultdict(dict)
	# report_dict[pdb_id] -> ptf lines with the detection verdict appended
	report_dict = collections.OrderedDict()

	for ptf_name in ['../data/ptf/CYS.SG.pos_train.ptf','../data/ptf/ARG.CZ.pos_train.ptf','../data/ptf/TRP.NE1.pos_train.ptf','../data/ptf/GLU.OE1.pos_train.ptf']:
		with open(ptf_name) as ptf_file:
			for line in ptf_file:
				ele = line.split()
				if not ele:
					continue
				pdb_id = ele[0]
				res = ele[-3].lower()
				site = (ele[-2],ele[-1])
				true_site_dict[pdb_id].setdefault(res,[]).append(site)

				# Structures or residue types without any detection are simply "False".
				found = site in detected_dict.get(pdb_id,{}).get(res,[])
				report_dict.setdefault(pdb_id,[]).append(line.strip('\n')+'\t'+str(found)+'\n')

	print ("writing summary files into ")
	print ('../results/detect_results/detection_summary_true_sites.txt')
	print ('../results/detect_results/detection_summary_fp_sites_pos_pdb.txt')

	with open('../results/detect_results/detection_summary_true_sites.txt','w') as detect_true_site:
		for entries in report_dict.values():
			detect_true_site.writelines(entries)

	fp_site_count = 0
	with open('../results/detect_results/detection_summary_fp_sites_pos_pdb.txt','w') as detect_fp_site:
		for pdb_id, per_pdb in detected_dict.items():
			for res, pos_pred in per_pdb.items():
				known = true_site_dict.get(pdb_id,{}).get(res,[])
				for chain, res_no in pos_pred:
					if (chain,res_no) not in known:
						detect_fp_site.write(pdb_id+'\t'+res+'\t'+chain+'\t'+res_no+'\n')
						fp_site_count+=1
		if fp_site_count==0:
			detect_fp_site.write("No false positive site detected in positive NOS structures"+'\n')
					
def final_stats_neg(thres=0.5):
	"""List every site predicted above the threshold in the negative structures.

	Reads ``../results/detect_results/results_neg_PDB_all_residues_max_fold.txt``
	(residue, chain, residue number, fold and probability are the last five
	whitespace-separated fields; rows whose second field is ``None`` are
	placeholders for structures without results) and writes
	``../results/detect_results/detection_summary_fp_sites_neg_pdb.txt``.

	Every row with probability > thres is reported as one tab-separated line
	``pdb_id, residue, chain, residue number, probability`` (probability as it
	appears in the input), grouped by PDB id and residue in order of first
	appearance.  If nothing exceeds the threshold a single message line is
	written instead.

	Args:
		thres: probability threshold above which a site counts as detected.
	"""
	# detected_dict[pdb_id][res] -> list of (chain, res_no, probability text)
	detected_dict = collections.OrderedDict()

	with open('../results/detect_results/'+'results_neg_PDB_all_residues_max_fold.txt') as neg_file:
		for line in neg_file:
			ele = line.split()
			if len(ele) < 2 or ele[1]=='None':
				# blank line or "no result" placeholder row
				continue
			pdb_id = ele[0]
			res = ele[-5]
			chain = ele[-4]
			res_no = ele[-3]
			if float(ele[-1])>thres:
				per_pdb = detected_dict.setdefault(pdb_id,collections.OrderedDict())
				per_pdb.setdefault(res,[]).append((chain,res_no,ele[-1]))

	print ("writing summary files into ")
	print ('../results/detect_results/detection_summary_fp_sites_neg_pdb.txt')

	fp_site_count = 0
	with open('../results/detect_results/detection_summary_fp_sites_neg_pdb.txt','w') as detect_fp_site:
		for pdb_id, per_pdb in detected_dict.items():
			for res, pos_pred in per_pdb.items():
				for chain, res_no, prob_text in pos_pred:
					detect_fp_site.write('\t'.join([pdb_id,res,chain,res_no,prob_text])+'\n')
					fp_site_count+=1

		if fp_site_count==0:
			detect_fp_site.write("No false positive site detected in negative structures"+'\n')
					

if __name__ == '__main__':

	# These modules are already imported at module level; importing them
	# again here is a no-op.
	import collections
	import os
	import sys

	total_fold = 5

	# (residue, atom) pairs processed by every step, in processing order.
	residue_atoms = [('CYS','SG'),('ARG','CZ'),('GLU','OE1'),('TRP','NE1')]

	# STEP 1: dump per-fold probabilities, all positive sets first, then all negative sets.
	for pos_or_neg in ('pos','neg'):
		for target_RES, target_ATOM in residue_atoms:
			PDB_all_folds_NOS_predicted_prob(target_RES,target_ATOM,pos_or_neg)

	# STEP 2: reduce the per-fold rows to one integrate file per residue and set.
	for target_RES, target_ATOM in residue_atoms:
		write_summary_pos(target_RES,target_ATOM)
		write_summary_neg(target_RES,target_ATOM)

	# STEP 3: merge the integrate files following the PDB lists.
	get_PDB_to_fold_pos_prob()
	get_PDB_to_fold_neg_prob()

	# STEP 4: write the final detection summaries.
	final_stats_pos()
	final_stats_neg()