# PDBs used to train the PROSITE TRYPSIN_HIS.5.HIS.NE2 model 
fold_train_pdb_dict_HIS = {
0:['1a0h', '1ab9', '1afq', '1aut', '1bb0', '1bma', '1bml', '1boq', '1c5m', '1ca0', '1ca8', '1cbw', '1cgi', '1cgj', '1chg', '1cho', '1d3d', '1d3p', '1d3q', '1d3t', '1d4p', '1ddj', '1dlk', '1eaw', '1eax', '1ela', '1elb', '1elc', '1eld', '1ele', '1elf', '1elg', '1esa', '1esb', '1ex3', '1ezq', '1ezx', '1f0r', '1f0s', '1fax', '1fjs', '1fon', '1fxy', '1g2l', '1g2m', '1gct', '1gg6', '1ggd', '1gha', '1ghb', '1gmc', '1gmd', '1gmh', '1h9h', '1hcg', '1hja', '1hv7', '1hyl', '1ioe', '1iqe', '1iqf', '1iqg', '1iqh', '1iqi', '1iqj', '1iqk', '1iql', '1iqm', '1iqn', '1jwt', '1kdq', '1kig', '1ksn', '1l0z', '1l1g', '1l4d', '1l4z', '1lka'],
1:['1lkb', '1lpg', '1lpk', '1lpz', '1lqd', '1mmj', '1mq5', '1mq6', '1mtn', '1n8o', '1nfu', '1nfw', '1nfx', '1nfy', '1nn6', '1nu9', '1o5e', '1o5f', '1okx', '1p0s', '1p57', '1p8v', '1ppz', '1pq5', '1pq7', '1pq8', '1pqa', '1qa0', '1qb1', '1qb6', '1qb9', '1qbn', '1qbo', '1qfk', '1qq4', '1qrw', '1qrx', '1qrz', '1riw', '1rjx', '1s0q', '1s0r', '1t4u', '1t4v', '1uo6', '1uvo', '1uvp', '1v3x', '1vgc', '1vr1', '1wu1', '1xka', '1xkb', '1xvm', '1xvo', '1yph', '1z6e', '2a7c', '2a7h', '2a7j', '2bmg', '2boh', '2bok', '2bq6', '2bq7', '2bqw', '2cga', '2cha', '2cji', '2cv3', '2d1j', '2d8w', '2de8', '2de9', '2ei6', '2ei7', '2ei8', '2f3c', '2f83'],
2:['2fo9', '2foa', '2fob', '2foc', '2fod', '2foe', '2fof', '2fog', '2foh', '2fzz', '2g00', '2g4t', '2g4u', '2g51', '2g52', '2g55', '2g5n', '2g5v', '2g8t', '2gch', '2gct', '2gd4', '2gmt', '2gv6', '2gv7', '2h9e', '2hlc', '2i6q', '2j2u', '2j34', '2j38', '2j4i', '2j94', '2j95', '2j9n', '2jet', '2jkh', '2nwn', '2o9q', '2oqu', '2p16', '2p3f', '2p3t', '2p3u', '2p8o', '2p93', '2p94', '2p95', '2phb', '2pks', '2pr3', '2psx', '2psy', '2puq', '2q1j', '2qn5', '2r2m', '2ra0', '2sfa', '2uuy', '2uwl', '2uwo', '2uwp', '2v0b', '2vgc', '2vh0', '2vh6', '2vvc', '2vvu', '2vvv', '2vwl', '2vwm', '2vwn', '2vwo', '2w26', '2w3i', '2w3k', '2win', '2wyg', '2wyj', '2xbv', '2xbw'],
3:['2xbx', '2xby', '2xc0', '2xc4', '2xc5', '2xw9', '2xwa', '2xwb', '2y5f', '2y5g', '2y5h', '2y7x', '2y7z', '2y80', '2y81', '2y82', '2za5', '2zeb', '2zec', '2zgc', '2zgh', '2zgj', '2zks', '3a7t', '3a7v', '3a7w', '3a7x', '3a7y', '3a7z', '3a80', '3a81', '3a82', '3a83', '3a84', '3a85', '3a86', '3a87', '3a88', '3a89', '3a8a', '3a8b', '3a8c', '3a8d', '3aas', '3aau', '3aav', '3ati', '3atk', '3atl', '3atm', '3bg4', '3bg8', '3bn9', '3bsq', '3c27', '3cen', '3cs7', '3da9', '3dfj', '3dfl', '3e3t', '3e8l', '3ela', '3ens', '3f6u', '3ffg', '3fzz', '3g01'],
4:['3gch', '3gct', '3gov', '3hpt', '3i77', '3i78', '3iit', '3iti', '3k65', '3k9x', '3kgp', '3khv', '3kl6', '3kqb', '3kqc', '3kqd', '3kqe', '3liw', '3m36', '3m37', '3m7q', '3m7t', '3m7u', '3ncl', '3nps', '3p6z', '3p70', '3p8f', '3p8g', '3po1', '3pro', '3q3k', '3qgj', '3rdz', '3rxa', '3rxb', '3rxc', '3rxd', '3rxe', '3rxf', '3rxg', '3rxh', '3rxi', '3rxj', '3rxk', '3rxl', '3rxm', '3rxo', '3rxp', '3rxq', '3rxr', '3rxs', '3rxt', '3rxu', '3rxv', '3s0n', '3sw2', '3tju', '3tjv', '3tk9', '3vgc', '4a6l', '4a7i', '4cha', '4gch', '4pro', '4vgc', '5cha', '5gch', '6cha', '6gch', '7gch', '8gch']}

# PDBs used to train the PROSITE TRYPSIN_SER.6.SER.OG model 
fold_train_pdb_dict_SER = {
0:['1a0h', '1a5h', '1ab9', '1afq', '1bio', '1bma', '1boq', '1c4y', '1ca0', '1cbw', '1cgi', '1cgj', '1cho', '1dfp', '1dic', '1dle', '1dlk', '1dst', '1dsu', '1ejm', '1ela', '1elb', '1elc', '1eld', '1ele', '1elf', '1elg', '1esa', '1esb', '1ex3', '1fn8', '1fon', '1fxy', '1fy4', '1fy5', '1g2l', '1gct', '1gdn', '1gdq', '1gdu', '1gg6', '1ggd', '1gha', '1ghb', '1gl1', '1gmc', '1gmd', '1gmh', '1hcg', '1hfd'],
1:['1hja', '1hv7', '1jrs', '1jrt', '1jwt', '1k2i', '1kdq', '1l0z', '1l1g', '1lka', '1lkb', '1mmj', '1mtn', '1n8o', '1okx', '1op8', '1p2n', '1p2q', '1pq5', '1pq7', '1pq8', '1qa0', '1qb1', '1qb6', '1qb9', '1qbn', '1qbo', '1qq4', '1qrw', '1qrx', '1qrz', '1riw', '1s0q', '1s0r', '1t7c', '1t8l', '1t8m', '1t8n', '1t8o', '1try', '1uo6', '1utk', '1uvo', '1uvp', '1vgc', '1xvm'],
2:['1xvo', '1xx9', '1ybw', '1yc0', '1yph', '1zjk', '2a0q', '2a7c', '2a7h', '2a7j', '2cga', '2cha', '2cv3', '2de8', '2de9', '2fo9', '2foa', '2fob', '2foc', '2fod', '2foe', '2fof', '2fog', '2foh', '2g4t', '2g4u', '2g51', '2g52', '2g55', '2g5n', '2g5v', '2g8t', '2gch', '2gct', '2gmt', '2i6q', '2j9n', '2jet', '2o9q', '2oqu', '2p8o', '2pks', '2puq', '2qn5', '2sfa', '2uuy', '2v0b', '2vgc', '2vu8', '2win', '2xwa', '2y6t'],
3:['2za5', '2zch', '2zck', '2zcl', '2zeb', '2zec', '2zgc', '2zgh', '2zgj', '2zks', '3a7t', '3a7v', '3a7w', '3a7x', '3a7y', '3a7z', '3a80', '3a81', '3a82', '3a83', '3a84', '3a85', '3a86', '3a87', '3a88', '3a89', '3a8a', '3a8b', '3a8c', '3a8d', '3aas', '3aau', '3aav', '3ati', '3atk', '3atl', '3atm', '3bg4', '3bg8', '3bn9', '3bsq', '3bth', '3dfj', '3dfl', '3e3t', '3e8l', '3ela'],
4:['3gch', '3gct', '3gov', '3hs0', '3iti', '3k2u', '3k6y', '3m7q', '3m7t', '3m7u', '3nps', '3po1', '3pro', '3qgj', '3qum', '3rxa', '3rxb', '3rxc', '3rxd', '3rxe', '3rxf', '3rxg', '3rxh', '3rxi', '3rxj', '3rxk', '3rxl', '3rxm', '3rxo', '3rxp', '3rxq', '3rxr', '3rxs', '3rxt', '3rxu', '3rxv', '3s0n', '3s9a', '3s9b', '3s9c', '3sbk', '3tju', '3tjv', '3tk9', '3vgc', '4cha', '4gch', '4pro', '4vgc', '5cha', '5gch', '6cha', '6gch', '7gch', '8gch'],
}

def write_PDB_prob_all_folds(target_RES, target_ATOM, site_ID,pos_or_neg='pos'):

	result_weights_ID = target_RES+"_"+target_ATOM+"_"+site_ID

	total_fold = 5
	ptf_order=open('../data/numpy/'+target_RES+'.'+target_ATOM+'.order'+'.ptf')
	dict_list = list(ptf_order)

	ID = site_ID+'.'+target_RES+'.'+target_ATOM
	files = [ os.path.join('../data/numpy/',f) for f in os.listdir('../data/numpy/') if os.path.isfile(os.path.join('../data/numpy/',f))]
	files = [t for t in files if ID in t]
	files = [t for t in files if '.pos' in t]
	files = [t for t in files if '.dat' in t]
	total_num = len(files)
	
	all_prob = []
	for fold in range(5):
		for i in range(total_num):
			prob = numpy.load('../results/post_prob/'+target_RES+'.'+target_ATOM+'_prob'+'_fold_'+str(fold)+'_'+str(i)+'.dat')
			if i == 0:
				fold_prob = prob
			else:
				fold_prob = numpy.concatenate((fold_prob,prob),axis=0)
		
		all_prob.append(fold_prob)

	summary_file=open('../results/'+'PDB_prob_'+result_weights_ID+'_numpy.txt','w')

	pdb_set = set()
	for i in range(len(dict_list)):
		line = dict_list[i]
		chain_ID = line.strip('\n')[-6:]
		chain = chain_ID[0]
		res_no = chain_ID[1:]

		S=line.split()
		PDB_ID=S[0]
		# chain = S[-2]
		# res_no = S[-1]
		x_ = float(S[1])
		y_ = float(S[2])
		z_ = float(S[3])
		if PDB_ID not in pdb_set:
			site_no = 0
			pdb_set.add(PDB_ID)
		
		for fold in range(5):
			probs_y = all_prob[fold][i]
			summary_file.write(PDB_ID+'\t'+str(site_no)+'\t'+str(x_)+'\t'+str(y_)+'\t'+str(z_)+'\t'+target_RES+'\t'+chain+'\t'+str(res_no)+'\t'+str(fold)+'\t'+str(probs_y)+'\n')
		site_no+=1
	
def write_summary_pos(target_RES,target_ATOM,pos_or_neg='pos'):

	result_weights_ID = target_RES+"_"+target_ATOM+"_"+site
	if target_RES == 'SER':
		prob_file = open('../results/PDB_prob_SER_OG_TRYPSIN_SER.6_numpy.txt')
		out_file = open('../results/selected_fold_'+result_weights_ID+'_pred_prob.txt','w')
	elif target_RES == 'HIS':
		prob_file = open('../results/PDB_prob_HIS_NE2_TRYPSIN_HIS.5_numpy.txt')
		out_file = open('../results/selected_fold_'+result_weights_ID+'_pred_prob.txt','w')

	fold_train_pdb_dict = fold_train_pdb_dict_HIS if target_RES == 'HIS' else fold_train_pdb_dict_SER
	
	result_list = prob_file
	
	total_fold = 5

	results_dict=collections.defaultdict(dict)

	for line in result_list:
		ele = line.split()
		pdb_id = ele[0]
		res = ele[-5]
		chain = ele[-4]
		res_no = ele[-3]
		fold = int(ele[-2])
		prob = float(ele[-1])
		if (chain,res,res_no) not in results_dict[pdb_id]:
			results_dict[pdb_id][(chain,res,res_no)]=[]
		results_dict[pdb_id][(chain,res,res_no)].append((prob,line))

	for pdb_id in results_dict:
		fold_to_use = None
		if pdb_id in fold_train_pdb_dict[0]:
			fold_to_use = 0
		elif pdb_id in fold_train_pdb_dict[1]:
			fold_to_use = 1
		elif pdb_id in fold_train_pdb_dict[2]:
			fold_to_use = 2
		elif pdb_id in fold_train_pdb_dict[3]:
			fold_to_use = 3
		elif pdb_id in fold_train_pdb_dict[4]:
			fold_to_use = 4

		if fold_to_use==None:
			for (chain,res,res_no) in results_dict[pdb_id]:
				entries = results_dict[pdb_id][(chain,res,res_no)]

				entries.sort(key=lambda entries: entries[0],reverse=True)
				max_ = entries[0]

				out_file.write(max_[1])
		else:
			for (chain,res,res_no) in results_dict[pdb_id]:
				entries = results_dict[pdb_id][(chain,res,res_no)]

				out_file.write(entries[fold_to_use][1])


def get_PDB_to_fold_pos_prob(pos_or_neg = 'pos'):
	pos_PDB_list = open('../data/SCOP_PDB.txt')
	outfile=open('../results/'+'detect_prob_TRYPSIN_PDB_level.txt','w')

	inte_dict=collections.defaultdict(list)
	for target_RES, target_ATOM, site in [('SER','OG','TRYPSIN_SER.6'),('HIS','NE2','TRYPSIN_HIS.5')]:
		result_weights_ID = target_RES+"_"+target_ATOM+"_"+site
		integrate_file = open('../results/selected_fold_'+result_weights_ID+'_pred_prob.txt')
		for line in integrate_file:
			ele = line.split()
			pdb_id = ele[0]
			inte_dict[pdb_id].append(line)

	for line in pos_PDB_list:
		pdb_id = line.split()[0]
		if pdb_id in inte_dict:
			entries = inte_dict[pdb_id]
		else:
			entries = [pdb_id+'\t'+'None'+'\n']
		for l in entries:
			outfile.write(l)

def final_stats(thres=0.5):
	count_file = open('../data/csa_pdb_HIS_SER_annotation.txt')
	count_list = list(count_file)
	pos_file = open('../results/'+'detect_prob_TRYPSIN_PDB_level.txt')
	
	# all: 1447 (SCOP_fscnn_with_csa_pdb.txt) make sure they are all in the dataset?
	# HIS only:  92
	# SER only:  4
	# SER, HIS: 1347
	# None: 4

	# Ser_Text = 'Ser'
	# His_Text = 'His'

	Ser_Text = 'SER'
	His_Text = 'HIS'

	detected_dict = collections.defaultdict(dict)
	for line in pos_file:
		ele = line.split()
		pdb_id = ele[0]	
		if ele[1]!='None':
			res = ele[-5].lower()
			chain = ele[-4]
			res_no = ele[-3]
			fold = int(ele[-2])
			prob = float(ele[-1])
			if prob>thres:
				if res not in detected_dict[pdb_id]:
					detected_dict[pdb_id][res]=[]
				detected_dict[pdb_id][res].append((chain,res_no))
				
	csa_res_dict=collections.defaultdict(dict)
	report_dict=collections.defaultdict(list)
	HIS_dict=collections.defaultdict(list)
	SER_dict=collections.defaultdict(list)
	ALL_dict=collections.defaultdict(list)

	detect_csa_file = open('../results/detected_TRYPSIN_CSA_site_summary.txt','w')
	detect_non_file = open('../results/detected_TRYPSIN_nonCSA_site_summary.txt','w')
	csa_pdb=set()
	for line in count_list:
		ele = line.split()
		pdb_id = ele[0]
		csa_pdb.add(pdb_id)
		if ele[1]!='None':
			res = ele[1].lower()
			chain = ele[2]
			res_no = ele[3]
			
			if res not in csa_res_dict[pdb_id]:
				csa_res_dict[pdb_id][res]=[]
			csa_res_dict[pdb_id][res].append((chain,res_no))

			if res in detected_dict[pdb_id]:
				if (chain,res_no) in detected_dict[pdb_id][res]:
					report_dict[pdb_id].append(line.strip('\n')+'\t'+'True'+'\n')
					if res == 'ser':
						SER_dict[pdb_id].append(line.strip('\n')+'\t'+'True'+'\n')
					elif res== 'his':
						HIS_dict[pdb_id].append(line.strip('\n')+'\t'+'True'+'\n')
					ALL_dict[pdb_id].append(line.strip('\n')+'\t'+'True'+'\n')
				else:
					report_dict[pdb_id].append(line.strip('\n')+'\t'+'False'+'\n')
			else:
				report_dict[pdb_id].append(line.strip('\n')+'\t'+'False'+'\n')
	
	print "HIS model detected " +str(len(HIS_dict.keys())) +" out of "+str(len(csa_pdb))+" PDBs"
	print "SER model detected " +str(len(SER_dict.keys())) +" out of "+str(len(csa_pdb))+" PDBs"
	print "Both model detected " +str(len(ALL_dict.keys())) +" out of "+str(len(csa_pdb))+" PDBs"
	print "writing summary into"
	print '../results/detected_TRYPSIN_CSA_site_summary.txt'
	print '../results/detected_TRYPSIN_nonCSA_site_summary.txt'

	#print len(report_dict.keys())

	for pdb_id in report_dict:
		entries = report_dict[pdb_id]
		for e in entries:
			detect_csa_file.write(e)


	for pdb_id in detected_dict:
		for res in detected_dict[pdb_id]:

			pos_pred = detected_dict[pdb_id][res]
			if res in csa_res_dict[pdb_id]:
				csa_res = csa_res_dict[pdb_id][res]
				for p in pos_pred:
					if p not in csa_res:
						detect_non_file.write(pdb_id+'\t'+res+'\t'+p[0]+'\t'+p[1]+'\n')
			else:
				for p in pos_pred:
					detect_non_file.write(pdb_id+'\t'+res+'\t'+p[0]+'\t'+p[1]+'\n')


if __name__ == '__main__':

	import sys
	import os
	import numpy
	import collections

	total_fold=5
	for target_RES, target_ATOM, site in [('SER','OG','TRYPSIN_SER.6'),('HIS','NE2','TRYPSIN_HIS.5')]:
		write_PDB_prob_all_folds(target_RES, target_ATOM, site)
		write_summary_pos(target_RES,target_ATOM,site)

	get_PDB_to_fold_pos_prob()
	final_stats()