import os
import csv
import numpy as np
import itertools as it
import numpy
import numpy.random
from random import shuffle
import sys
import collections
from rdkit.Chem import MolFromSmiles
import scipy
import json
from features import *
from extract_pts import *

# NOTE(review): this module was recovered from a whitespace-collapsed copy, so
# the nesting of some statements had to be inferred. Spots where more than one
# nesting was plausible carry their own NOTE(review) comment -- confirm them
# against a pristine copy of the file.

#### Load other negative molecules
max_poc_degrees = 20
max_nodes_in_poc = 50
max_mol_degrees = 6
max_nodes_in_mol = 60

_DUDE_TARGET_TABLE = "../data/DUDE/DUDE_PDBID.csv"


################
# private helpers
################
def _read_lines(path):
    """Return every line of the text file at *path* as a list and close the file."""
    with open(path) as handle:
        return list(handle)


def _read_first_tokens(path):
    """Return the first whitespace-separated token of each non-blank line of *path*."""
    with open(path) as handle:
        return [line.split()[0] for line in handle if line.strip()]


def _dude_target_file(target, name):
    """Return the path of file *name* inside the lower-cased DUDE directory of *target*."""
    return '../data/DUDE/' + target.lower() + '/' + name


def _read_dude_targets():
    """Return [(target name, PDB ID), ...] from the "Target Name"/"PDB" columns of the DUDE table."""
    data = read_csv_DUDE(_DUDE_TARGET_TABLE, "Target Name", "PDB")
    return [(data[0][i], data[1][i]) for i in range(0, len(data[0]))]


def _assay_negatives(PDB_ID, DUDE_Uniprot, Uniprot_neg_mols):
    """Return the assay-negative SMILES set recorded for *PDB_ID*'s Uniprot ID, or [] if none."""
    if PDB_ID in DUDE_Uniprot:
        return Uniprot_neg_mols.get(DUDE_Uniprot[PDB_ID], [])
    return []


def _add_to_target_set(table, target, smiles_list):
    """Add every SMILES to ``table[target]`` (a set); the key is only created when there is data."""
    for smiles in smiles_list:
        table.setdefault(target, set()).add(smiles)


def _labelled_entries(ff_name, pos_ligs, neg_ligs, pro_name):
    """Return [ff_name, smiles, label, pro_name] rows (label 1 then 0) for SMILES RDKit can parse."""
    entries = []
    for binding, ligs in ((1, pos_ligs), (0, neg_ligs)):
        for smiles in ligs:
            if MolFromSmiles(smiles):
                entries.append([ff_name, smiles, binding, pro_name])
    return entries


def _group_by_protein_and_label(entries):
    """Group entries by (entry[-1], entry[-2]), i.e. by (protein name, binding label)."""
    grouped = {}
    for entry in entries:
        grouped.setdefault((entry[-1], entry[-2]), []).append(entry)
    return grouped


def _load_target_tables(targets, DUDE_Uniprot, Uniprot_neg_mols, keep_decoys, verbose=False):
    """Read the per-target DUDE files for every (target, PDB ID) pair.

    Returns four dicts keyed by target name: actives (set of SMILES), decoys
    (list of SMILES, only filled when *keep_decoys* is true), pocket .ff file
    name (only for targets where get_FF found a ligand) and assay negatives.
    With *verbose* the progress lines of the test-set builder are printed.
    """
    target_actives_dict = {}
    target_decoys_dict = {}
    target_FF_dict = {}
    target_TN_dict = {}
    for target, PDB_ID in targets:
        if verbose:
            print ("test target")
            print (target)
        # load assay neg mols for the target, if available
        target_TN_dict[target] = _assay_negatives(PDB_ID, DUDE_Uniprot, Uniprot_neg_mols)
        actives = _read_first_tokens(_dude_target_file(target, 'actives_final.ism'))
        if keep_decoys:
            target_decoys_dict[target] = _read_first_tokens(
                _dude_target_file(target, 'decoys_final.ism'))
        lig_ctr = parse_crystal_lig(_read_lines(_dude_target_file(target, 'crystal_ligand.mol2')))
        if verbose:
            print ("len(target_actives_list)")
            print (len(actives))
        # DUDE target FF
        target_FF = get_FF(target, PDB_ID, lig_ctr)
        if target_FF is not None:
            target_FF_dict[target] = target_FF
        # NOTE(review): actives are recorded whether or not a pocket file was
        # found (nesting inferred) -- confirm.
        _add_to_target_set(target_actives_dict, target, actives)
    return target_actives_dict, target_decoys_dict, target_FF_dict, target_TN_dict


def _sample_negative_pocket_entries(actives, neg_pockets, num_pockets, target_actives_dict):
    """Pair each active with up to *num_pockets* randomly drawn other pockets as label-0 rows.

    *neg_pockets* is a list of (target, ff name). One numpy shuffle is drawn
    per active. A pocket is skipped when the molecule is also one of that
    pocket's own actives.
    """
    entries = []
    for mol in actives:
        arr = numpy.arange(len(neg_pockets))
        numpy.random.shuffle(arr)
        if len(neg_pockets) > num_pockets:
            idx = arr[0:num_pockets]
        else:
            idx = arr
        for d in idx:
            (com_target, com_target_FF) = neg_pockets[d]
            if mol not in target_actives_dict[com_target]:
                entries.append([com_target_FF, mol, 0, com_target])
    return entries


def _positive_entries(this_target_FF, this_tar_actives, this_target, repeats):
    """Return the label-1 rows of a target's actives, the whole set repeated *repeats* times."""
    return [[this_target_FF, mol, 1, this_target]
            for r in range(repeats) for mol in this_tar_actives]


def _select_decoys(all_decoys, num_actives, pos_mol_num):
    """Pick the DUDE decoys of one target so that they roughly match *pos_mol_num* positives.

    When positives outnumber decoys the full decoy list is repeated
    int(positives/decoys) times; otherwise int(50*positives/decoys) decoys are
    taken from each of *num_actives* consecutive strides of the decoy list.
    Raises ZeroDivisionError when the target has no decoys.
    """
    ra = int(len(all_decoys)/num_actives)
    final_act_to_dec_ra = float(pos_mol_num)/len(all_decoys)
    selected = []
    if final_act_to_dec_ra > 1:
        for u in range(0, int(final_act_to_dec_ra)):
            selected.extend(all_decoys)
    else:
        decoy_ratio = int(final_act_to_dec_ra*50)
        for u in range(0, num_actives):
            selected.extend(all_decoys[u*ra:u*ra+decoy_ratio])
    return selected


def _assay_entries(this_target, this_target_FF, this_tar_actives, this_tar_true_neg,
                   DUDE_neg_poc, neg_poc_ratio, zinc_pool):
    """Build the rows of one target for the assay/zinc data sets.

    Rows are, in order: the cross-pocket negatives, the actives repeated
    *neg_poc_ratio* times, 1000 randomly drawn molecules of *zinc_pool* as
    negatives, the assay negatives, and finally the actives repeated
    int(negatives/positives) more times to rebalance. Returns
    (all rows, the first batch of positive rows).
    """
    per_target = list(DUDE_neg_poc)
    positives = _positive_entries(this_target_FF, this_tar_actives, this_target, neg_poc_ratio)
    per_target.extend(positives)
    num_pos_ligs_for_this_tar = len(positives)
    num_neg_ligs_for_this_tar = 0
    # CHEMBL NEG DECOYs
    idx = np.arange(0, len(zinc_pool))
    np.random.shuffle(idx)
    idx = idx[:1000]
    # CHEMBL dataset NEG decoy mols
    for d in idx:
        per_target.append([this_target_FF, zinc_pool[d], 0, this_target])
        num_neg_ligs_for_this_tar = num_neg_ligs_for_this_tar + 1
    # CHEMBL ASSAY NEG mols
    for mol in this_tar_true_neg:
        per_target.append([this_target_FF, mol, 0, this_target])
        num_neg_ligs_for_this_tar = num_neg_ligs_for_this_tar + 1
    # upsample! balance again!!
    ra = int(num_neg_ligs_for_this_tar/num_pos_ligs_for_this_tar)
    per_target.extend(_positive_entries(this_target_FF, this_tar_actives, this_target, ra))
    return per_target, positives


def _print_assay_summary(pro_list, active_to_all_poc_data, all_targets, trace_entries=False):
    """Print per-target row counts of the assay data sets (diagnostics only).

    NOTE(review): both count lines are printed only when the target has
    label-0 rows (nesting inferred) -- confirm.
    """
    summary_dict = {}
    if trace_entries:
        print ("active_to_all_poc_data")
    for entry in active_to_all_poc_data:
        pro_name = entry[-1]
        bind = entry[-2]
        if trace_entries:
            print ("pro_name,bind")
            print (pro_name,bind)
        summary_dict.setdefault((pro_name, bind), []).append(entry)
    print ("This target actives, this target to other active and this target decoys: entries of summary_dict")
    for target in pro_list:
        print (target)
        if (target, 0) in summary_dict:
            print (len(summary_dict[(target,0)]))
            print (len(summary_dict[(target,1)]))
    print ("This target actives, other pocs to this active: entries of all_targets")
    per_tar_sum = {}
    for target in pro_list:
        if target not in all_targets:
            continue
        for entry in all_targets[target]:
            per_tar_sum.setdefault((target, entry[-2]), []).append(entry)
    for target in pro_list:
        print (target)
        if (target, 0) in per_tar_sum:
            print ("len(per_tar_sum[(target,0)])")
            print (len(per_tar_sum[(target,0)]))
            print ("len(per_tar_sum[(target,1)])")
            print (len(per_tar_sum[(target,1)]))


def load_DUDE_neg():
    """Load the CHEMBL assay negatives as {first column: set of third-column SMILES}."""
    Uniprot_neg_mols = {}
    with open('../data/DUDE_CHEMBL_target_mol_neg_0920.txt') as infile:
        for line in infile:
            ele = line.split()
            Uniprot_neg_mols.setdefault(ele[0], set()).add(ele[2])
    return Uniprot_neg_mols


def load_zinc_decoys():
    """Load the CHEMBL/ZINC decoy SMILES (third column, header line skipped).

    Returns (first 9000 decoys, remaining decoys).
    """
    decoy_list = _read_lines('../data/CHEMBL_decoys/cmp_list_ChEMBL_zinc_decoys.dat')
    decoy_mols = [line.split()[2] for line in decoy_list[1:]]
    train_decoys = decoy_mols[0:9000]
    test_decoys = decoy_mols[9000:]
    return train_decoys, test_decoys


#### Load pocket information and similarity scores
def get_DUDE_Uniprot_dict():
    """Return {PDB ID: Uniprot ID} read from the first two columns of PDB_Uniprot.txt."""
    DUDE_Uniprot = {}
    with open('../data/DUDE/PDB_Uniprot.txt') as infile:
        for line in infile:
            ele = line.split()
            DUDE_Uniprot[ele[0]] = ele[1]
    return DUDE_Uniprot


def get_FF(target,PDB_ID,DUDE_ctr):
    """Return '<PDB_ID>_<ligand id>.ff' for the last ligand cut_ligand_all_atoms reports.

    Returns None (after printing a notice) when no ligand is reported.
    """
    print (target, PDB_ID, DUDE_ctr)
    ff_name = None
    ligs = cut_ligand_all_atoms(PDB_ID, DUDE_ctr, False)
    if ligs == []:
        print('cant find ligs'+'\n')
    for lig in ligs:
        [lig_ID, lig_chain, lig_no, ctr] = lig
        ff_name = PDB_ID+'_'+str(lig_ID)+'.ff'
    return ff_name


def get_pocketFEATURE_dict():
    """Return {(column 0, column 1): float(last column)} from the tab-separated pocket score file."""
    pocketFEATURE_dict = {}
    for line in _read_lines('../data/DUDE_pocket_similarity_score.txt'):
        eles = line.split('\t')
        pocketFEATURE_dict[(eles[0], eles[1])] = float(eles[-1])
    return pocketFEATURE_dict


def get_drugFEATURE_dict():
    """Return {target pocket: [(drug pocket, score), ...]} for scores above -1.9.

    Rows whose last column is 'NA' are ignored.
    """
    drugFEATURE_dict = {}
    for line in _read_lines('../data/DUDE_drugFEATURE_pocket_similarity_score.txt'):
        eles = line.split('\t')
        if eles[-1].strip() == 'NA':
            continue
        FF_score = float(eles[-1])
        if FF_score > -1.9:
            drugFEATURE_dict.setdefault(eles[0], []).append((eles[1], FF_score))
    return drugFEATURE_dict


def test_ligand_screen():
    """Collect the screenable molecules of all DUDE targets.

    Returns (SMILES that parse, have at most max_nodes_in_mol atoms and pass
    test_valid_atom; {active SMILES: target name}).
    """
    targets = _read_dude_targets()
    DUDE_Uniprot = get_DUDE_Uniprot_dict()
    Uniprot_neg_mols = load_DUDE_neg()
    test_mols = []
    final_mols = []
    mol_tar_dict = {}
    for target, PDB_ID in targets:  # For each target
        # Actives for this target
        for smiles in _read_first_tokens(_dude_target_file(target, 'actives_final.ism')):
            test_mols.append(smiles)
            mol_tar_dict[smiles] = target
        # NOTE(review): assay negatives are added only when the Uniprot ID has
        # any (nesting inferred) -- confirm.
        test_mols.extend(_assay_negatives(PDB_ID, DUDE_Uniprot, Uniprot_neg_mols))
    for s in test_mols:
        mol = MolFromSmiles(s)
        if not mol:
            print ("Could not parse SMILES string:")
            print (s)
        elif len(mol.GetAtoms()) > max_nodes_in_mol:
            print ("SMILES string>"+str(max_nodes_in_mol)+"!!")
            print (s)
        elif test_valid_atom(mol.GetAtoms()):
            final_mols.append(s)
    return final_mols, mol_tar_dict


def get_all_mols():
    """Return ordered dicts {target: set of actives}, {target: set of decoys}, {target: .ff name}."""
    from collections import OrderedDict
    targets = _read_dude_targets()
    target_actives_dict = OrderedDict()
    target_decoys_dict = OrderedDict()
    target_FF_dict = OrderedDict()
    for target, PDB_ID in targets:  # For each target
        actives = _read_first_tokens(_dude_target_file(target, 'actives_final.ism'))
        decoys = _read_first_tokens(_dude_target_file(target, 'decoys_final.ism'))
        lig_ctr = parse_crystal_lig(_read_lines(_dude_target_file(target, 'crystal_ligand.mol2')))
        # DUDE target FF
        target_FF = get_FF(target, PDB_ID, lig_ctr)
        if target_FF is not None:
            target_FF_dict[target] = target_FF
        # NOTE(review): molecules are recorded whether or not a pocket file
        # was found (nesting inferred) -- confirm.
        # DUDE ACTIVES
        _add_to_target_set(target_actives_dict, target, actives)
        # DUDE DECOYS
        _add_to_target_set(target_decoys_dict, target, decoys)
    return target_actives_dict, target_decoys_dict, target_FF_dict


################
# ROC
################
def load_DUDE_dataset():
    """Return one [target, PDB ID, crystal-ligand centre, active SMILES, decoy SMILES] per target."""
    target_lig = []
    for target, PDB_ID in _read_dude_targets():
        tar_lig_pos = _read_first_tokens(_dude_target_file(target, 'actives_final.ism'))
        tar_lig_neg = _read_first_tokens(_dude_target_file(target, 'decoys_final.ism'))
        lig_ctr = parse_crystal_lig(_read_lines(_dude_target_file(target, 'crystal_ligand.mol2')))
        target_lig.append([target, PDB_ID, lig_ctr, tar_lig_pos, tar_lig_neg])
    return target_lig


def gene_DUDE_ROC_dict(build_ptf=False):
    """Return {(target, PDB ID): rows} of labelled [ff, smiles, label, target] rows.

    Targets for which cut_ligand_all_atoms reports no ligand are left out.
    NOTE(review): rows are generated once per reported ligand (nesting
    inferred) -- confirm.
    """
    all_targets = {}
    for pro_name, PDB_ID, lig_ctr, pos_ligs, neg_ligs in load_DUDE_dataset():
        ligs = cut_ligand_all_atoms(PDB_ID, lig_ctr[0], build_ptf)
        if ligs == []:
            continue
        per_targets = []
        for lig in ligs:
            [lig_ID, lig_chain, lig_no, ctr] = lig
            ff_name = PDB_ID+'_'+str(lig_ID)+'_'+str(lig_chain)+'_'+str(lig_no)+'.ff'
            per_targets.extend(_labelled_entries(ff_name, pos_ligs, neg_ligs, pro_name))
        all_targets[(pro_name, PDB_ID)] = per_targets
    return all_targets


def gene_target_smiles_DUDE_ROC_dict(build_ptf=False):
    """Return {target: rows} like gene_DUDE_ROC_dict and write row counts to a summary file.

    Unlike gene_DUDE_ROC_dict, targets without a ligand are kept with an empty
    row list. The summary file is closed on return.
    NOTE(review): rows are generated once per reported ligand (nesting
    inferred) -- confirm.
    """
    all_targets = {}
    with open('../data/process_DUDE_total_target_mols_ROC_new_bal.txt', 'w') as sum_file:
        for pro_name, PDB_ID, lig_ctr, pos_ligs, neg_ligs in load_DUDE_dataset():
            per_targets = []
            ligs = cut_ligand_all_atoms(PDB_ID, lig_ctr[0], build_ptf)
            if ligs == []:
                sum_file.write('cant find ligs'+'\n')
            for lig in ligs:
                [lig_ID, lig_chain, lig_no, ctr] = lig
                ff_name = PDB_ID+'_'+str(lig_ID)+'_'+str(lig_chain)+'_'+str(lig_no)+'.ff'
                per_targets.extend(_labelled_entries(ff_name, pos_ligs, neg_ligs, pro_name))
            sum_file.write("found num of mols:")
            sum_file.write(str(len(per_targets))+'\n')
            all_targets[pro_name] = per_targets
        sum_file.write("num of all targets:")
        sum_file.write(str(len(all_targets))+'\n')
    return all_targets


def gene_target_smiles_5_tar():
    """Return {target: labelled rows} for five fixed targets with hard-coded pocket files."""
    tar_ff_dict_5 = {'DRD3':'3pbl_ETQ.ff','KIT':'3g0e_B49.ff','INHA':'4trj_665.ff','FNTA':'3e37_ED5.ff','HIVINT':'3nf7_CIW.ff'}  #4trj_NAD.ff
    all_targets = {}
    for target, PDB_ID in _read_dude_targets():
        if target not in tar_ff_dict_5:
            continue
        tar_lig_pos = _read_first_tokens(_dude_target_file(target, 'actives_final.ism'))
        tar_lig_neg = _read_first_tokens(_dude_target_file(target, 'decoys_final.ism'))
        all_targets[target] = _labelled_entries(tar_ff_dict_5[target], tar_lig_pos,
                                                tar_lig_neg, target)
    return all_targets


################
# neg poc
################
def load_neg_poc_dataset_train(fold,num_of_poc_from_pocFEA,num_drugFEATURE_sample):
    """Build the train and validation rows of one fold with pocket-based negatives.

    Three randomly chosen proteins of the fold's training proteins become the
    validation set. Every row is [pocket .ff name, SMILES, label, protein].
    Negatives come from (a) actives paired with dissimilar DUDE pockets,
    (b) for training only, actives paired with drugFEATURE pockets, and
    (c) the target's own DUDE decoys.
    Returns ({train target: rows}, {validation target: rows}).
    """
    targets = _read_dude_targets()
    train_pro = list(load_train_pro(fold))
    num_of_val_pro = 3
    shuffle(train_pro)
    val_pro = train_pro[0:num_of_val_pro]
    train_pro = train_pro[num_of_val_pro:]
    # pocket similarity scores
    pocketFEATURE_dict = get_pocketFEATURE_dict()
    drugFEATURE_dict = get_drugFEATURE_dict()
    DUDE_Uniprot = get_DUDE_Uniprot_dict()
    # negative mols
    train_decoys, test_decoys = load_zinc_decoys()
    Uniprot_neg_mols = load_DUDE_neg()
    # LOAD DUDE dataset
    (target_actives_dict, target_decoys_dict,
     target_FF_dict, target_TN_dict) = _load_target_tables(
        targets, DUDE_Uniprot, Uniprot_neg_mols, keep_decoys=True)

    ###############################
    ############ TRAIN ############
    ###############################
    active_to_all_poc_data = []
    tar_neg_poc_dict = {}
    all_train_targets = {}
    for this_target in train_pro:
        tar_neg_poc_dict[this_target] = []
        if this_target not in target_FF_dict:
            continue
        this_target_FF = target_FF_dict[this_target]
        neg_poc_tar_FF = []
        for com_target in train_pro:
            if com_target not in target_FF_dict:
                continue
            com_target_FF = target_FF_dict[com_target]
            score = pocketFEATURE_dict[(this_target_FF[:-3], com_target_FF[:-3])]
            if score < -1.9:
                print (this_target_FF,com_target_FF)
                print ("too similar: "+str(score))
            else:
                neg_poc_tar_FF.append((com_target, com_target_FF))
        sampled = _sample_negative_pocket_entries(
            target_actives_dict[this_target], neg_poc_tar_FF,
            num_of_poc_from_pocFEA, target_actives_dict)
        active_to_all_poc_data.extend(sampled)
        tar_neg_poc_dict[this_target].extend(sampled)

    poc_summary_dict = _group_by_protein_and_label(active_to_all_poc_data)
    for this_target in train_pro:
        if this_target not in target_FF_dict:
            continue
        this_target_FF = target_FF_dict[this_target]
        this_tar_actives = target_actives_dict[this_target]
        this_tar_all_decoys = target_decoys_dict[this_target]
        per_target = list(tar_neg_poc_dict[this_target])
        # Repeat the actives as often as this pocket served as a negative for
        # other targets' actives (relative to its number of actives).
        num_of_neg_DUDE_poc = len(poc_summary_dict.get((this_target, 0), []))
        neg_poc_ratio = max(1, int(float(num_of_neg_DUDE_poc)/len(this_tar_actives)))
        print ("neg_poc_ratio:"+str(neg_poc_ratio))
        positives = _positive_entries(this_target_FF, this_tar_actives, this_target,
                                      neg_poc_ratio)
        per_target.extend(positives)
        pos_mol_num = len(positives)
        #### drugFEATURE ######
        if num_of_poc_from_pocFEA > 0:
            drugFEATURE_neg_poc_FF = [
                drug_FF+'.ff' for (drug_FF, score) in drugFEATURE_dict[this_target_FF[:-3]]]
            for mol in this_tar_actives:
                arr = numpy.arange(len(drugFEATURE_neg_poc_FF))
                numpy.random.shuffle(arr)
                for d in arr[0:num_drugFEATURE_sample]:
                    # each drugFEATURE negative is paired with one more positive
                    per_target.append([drugFEATURE_neg_poc_FF[d], mol, 0, 'drug_FEATURE'])
                    per_target.append([this_target_FF, mol, 1, this_target])
        # DUDE DECOYS
        this_tar_decoys = _select_decoys(this_tar_all_decoys, len(this_tar_actives),
                                         pos_mol_num)
        for mol in this_tar_decoys:
            per_target.append([this_target_FF, mol, 0, this_target])
        # CHEMBL NEG DECOYs: the shuffled order is not used here; the draw is
        # kept so the numpy random stream is consumed as before.
        idx = np.arange(0, len(train_decoys))
        np.random.shuffle(idx)
        all_train_targets[this_target] = per_target

    ###############################
    ############  VAL  ############
    ###############################
    active_to_all_poc_data = []
    tar_neg_poc_dict = {}
    all_val_targets = {}
    val_num_of_poc = num_of_poc_from_pocFEA
    if num_of_poc_from_pocFEA > 0:
        # scale the pocket count by the validation/train protein ratio
        val_num_of_poc = max(1, int(num_of_val_pro*float(num_of_poc_from_pocFEA) / len(train_pro)))
    for this_target in val_pro:
        tar_neg_poc_dict[this_target] = []
        if this_target not in target_FF_dict:
            continue
        this_target_FF = target_FF_dict[this_target]
        neg_poc_tar_FF = []
        for com_target in val_pro:
            if com_target not in target_FF_dict:
                continue
            com_target_FF = target_FF_dict[com_target]
            if pocketFEATURE_dict[(this_target_FF[:-3], com_target_FF[:-3])] > -1.9:
                neg_poc_tar_FF.append((com_target, com_target_FF))
        sampled = _sample_negative_pocket_entries(
            target_actives_dict[this_target], neg_poc_tar_FF,
            val_num_of_poc, target_actives_dict)
        active_to_all_poc_data.extend(sampled)
        tar_neg_poc_dict[this_target].extend(sampled)

    poc_summary_dict = _group_by_protein_and_label(active_to_all_poc_data)
    for this_target in val_pro:
        if this_target not in target_FF_dict:
            continue
        this_target_FF = target_FF_dict[this_target]
        this_tar_actives = target_actives_dict[this_target]
        this_tar_all_decoys = target_decoys_dict[this_target]
        per_target = list(tar_neg_poc_dict[this_target])
        num_of_neg_DUDE_poc = len(poc_summary_dict.get((this_target, 0), []))
        neg_poc_ratio = max(1, int(float(num_of_neg_DUDE_poc)/len(this_tar_actives)))
        positives = _positive_entries(this_target_FF, this_tar_actives, this_target,
                                      neg_poc_ratio)
        per_target.extend(positives)
        # DUDE DECOYS
        this_tar_decoys = _select_decoys(this_tar_all_decoys, len(this_tar_actives),
                                         len(positives))
        for mol in this_tar_decoys:
            per_target.append([this_target_FF, mol, 0, this_target])
        all_val_targets[this_target] = per_target
    return all_train_targets, all_val_targets


def gene_fold_entries_neg_poc(all_targets_train,all_targets_val,fold,num_of_poc_from_pocFEA,num_drugFEATURE_sample):
    """Flatten, shuffle and dump the train/val rows of one fold as JSON order lists."""
    all_train_entries = []
    all_val_entries = []
    for pro_name in all_targets_val.keys():
        all_val_entries.extend(all_targets_val[pro_name])
    for pro_name in all_targets_train.keys():
        all_train_entries.extend(all_targets_train[pro_name])
    shuffle(all_train_entries)
    shuffle(all_val_entries)
    with open('../data/fold_order_ES/train_fold_'+str(fold)+'_poc_'+str(num_of_poc_from_pocFEA)+'_drug_'+str(num_drugFEATURE_sample)+'_DUDE_order_list.txt','w') as myfile:
        json.dump(all_train_entries, myfile)
    with open('../data/fold_order_ES/val_fold_'+str(fold)+'_poc_'+str(num_of_poc_from_pocFEA)+'_drug_'+str(num_drugFEATURE_sample)+'_DUDE_order_list.txt','w') as myfile:
        json.dump(all_val_entries, myfile)


def load_DUDE_data_neg_poc_fold(fold,num_of_poc_from_pocFEA,num_drugFEATURE_sample):
    """Load the dumped train and val order lists of one fold.

    Returns [(pockets, smiles, numpy label array) for 'train', same for 'val'],
    each filtered through test_60.
    """
    final_data = []
    for which_dataset in ['train','val']:
        print ("dataset:" + which_dataset)
        print ("fold:" + str(fold))
        order_list_name = '../data/fold_order_ES/'+which_dataset+'_fold_'+str(fold)+'_poc_'+str(num_of_poc_from_pocFEA)+'_drug_'+str(num_drugFEATURE_sample)+'_DUDE_order_list.txt'
        with open(order_list_name,'r') as infile:
            all_entries = json.load(infile)
        pockets = [entry[0] for entry in all_entries]
        smiles = [entry[1] for entry in all_entries]
        binding = [entry[2] for entry in all_entries]
        smiles,pockets,labels = test_60(smiles,pockets,binding)
        in_slice_data = (pockets,smiles,numpy.array(labels))
        final_data.append(in_slice_data)
    return final_data


def init_fold_order_neg_poc(num_of_poc_from_pocFEA,num_drugFEATURE_sample):
    """Generate and dump the pocket-negative order lists of folds 0-3."""
    for fold in range (0,4):
        all_targets_train, all_targets_val = load_neg_poc_dataset_train(fold,num_of_poc_from_pocFEA,num_drugFEATURE_sample)
        gene_fold_entries_neg_poc(all_targets_train,all_targets_val,fold,num_of_poc_from_pocFEA,num_drugFEATURE_sample)


################
################
################
# assay zinc
################
def load_neg_assay_dataset_train(fold,num_of_poc_from_pocFEA,num_drugFEATURE_sample):
    """Build the training rows of one fold with assay and ZINC-decoy negatives.

    Every row is [pocket .ff name, SMILES, label, protein]. Pockets scoring
    below -1.2 against the target pocket are excluded from the cross-pocket
    negatives. *num_drugFEATURE_sample* is accepted but not used.
    Returns {target: rows} for the fold's training proteins that have a
    pocket file.
    """
    targets = _read_dude_targets()
    train_pro = load_train_pro(fold)
    # pocket similarity scores
    pocketFEATURE_dict = get_pocketFEATURE_dict()
    DUDE_Uniprot = get_DUDE_Uniprot_dict()
    # negative mols
    train_decoys, test_decoys = load_zinc_decoys()
    Uniprot_neg_mols = load_DUDE_neg()
    # LOAD DUDE dataset (the DUDE decoys are not used by this data set)
    (target_actives_dict, target_decoys_dict,
     target_FF_dict, target_TN_dict) = _load_target_tables(
        targets, DUDE_Uniprot, Uniprot_neg_mols, keep_decoys=False)

    ######################################
    ############ neg DUDE poc ############
    ######################################
    active_to_all_poc_data = []
    tar_neg_poc_dict = {}
    all_targets = {}
    for this_target in train_pro:
        tar_neg_poc_dict[this_target] = []
        if this_target not in target_FF_dict:
            continue
        this_target_FF = target_FF_dict[this_target]
        neg_poc_tar_FF = []
        for com_target in train_pro:
            if com_target not in target_FF_dict:
                continue
            com_target_FF = target_FF_dict[com_target]
            if pocketFEATURE_dict[(this_target_FF[:-3], com_target_FF[:-3])] < -1.2:
                continue  # too similar
            neg_poc_tar_FF.append((com_target, com_target_FF))
        print ("this pocket")
        print (this_target,this_target_FF)
        print ("len(neg_poc_tar_FF)")
        print (len(neg_poc_tar_FF))
        sampled = _sample_negative_pocket_entries(
            target_actives_dict[this_target], neg_poc_tar_FF,
            num_of_poc_from_pocFEA, target_actives_dict)
        active_to_all_poc_data.extend(sampled)
        tar_neg_poc_dict[this_target].extend(sampled)
        # NOTE(review): printed once per target (nesting inferred) -- confirm.
        print ("neg_poc_num")
        print (len(sampled))

    poc_summary_dict = _group_by_protein_and_label(active_to_all_poc_data)
    for this_target in train_pro:
        if this_target not in target_FF_dict:
            continue
        this_target_FF = target_FF_dict[this_target]
        this_tar_actives = target_actives_dict[this_target]
        DUDE_neg_poc = tar_neg_poc_dict[this_target]
        num_of_neg_DUDE_poc = len(poc_summary_dict.get((this_target, 0), []))
        print (this_target)
        print ("num_of this poc as others neg")
        print (num_of_neg_DUDE_poc)
        print ("num_of_neg_poc to this target active")
        print (len(DUDE_neg_poc))
        neg_poc_ratio = max(1, int(float(num_of_neg_DUDE_poc)/len(this_tar_actives)))
        print ("neg_poc_ratio:"+str(neg_poc_ratio))
        per_target, positives = _assay_entries(
            this_target, this_target_FF, this_tar_actives, target_TN_dict[this_target],
            DUDE_neg_poc, neg_poc_ratio, train_decoys)
        active_to_all_poc_data.extend(positives)
        all_targets[this_target] = per_target

    _print_assay_summary(train_pro, active_to_all_poc_data, all_targets)
    return all_targets


def load_neg_assay_dataset_test(fold,num_of_poc_from_pocFEA,num_drugFEATURE_sample):
    """Build the test rows of one fold with assay and ZINC-decoy negatives.

    Same row layout as load_neg_assay_dataset_train, but built for the fold's
    test proteins; cross-pocket negatives are drawn from the union of train
    and test proteins. *num_drugFEATURE_sample* is accepted but not used.
    Returns {target: rows} for the test proteins that have a pocket file.
    """
    targets = _read_dude_targets()
    train_pro = load_train_pro(fold)
    test_pro = load_test_pro(fold)
    all_pro = train_pro | test_pro
    # pocket similarity scores
    pocketFEATURE_dict = get_pocketFEATURE_dict()
    DUDE_Uniprot = get_DUDE_Uniprot_dict()
    # negative mols
    train_decoys, test_decoys = load_zinc_decoys()
    Uniprot_neg_mols = load_DUDE_neg()
    # LOAD DUDE dataset (the DUDE decoys are not used by this data set)
    (target_actives_dict, target_decoys_dict,
     target_FF_dict, target_TN_dict) = _load_target_tables(
        targets, DUDE_Uniprot, Uniprot_neg_mols, keep_decoys=False, verbose=True)

    ######################################
    ############ neg DUDE poc ############
    ######################################
    active_to_all_poc_data = []
    tar_neg_poc_dict = {}
    all_targets = {}
    for this_target in test_pro:
        tar_neg_poc_dict[this_target] = []
        print (this_target)
        if this_target not in target_FF_dict:
            continue
        this_target_FF = target_FF_dict[this_target]
        neg_poc_tar_FF = []
        for com_target in all_pro:
            if com_target not in target_FF_dict:
                continue
            com_target_FF = target_FF_dict[com_target]
            score = pocketFEATURE_dict[(this_target_FF[:-3], com_target_FF[:-3])]
            if score < -1.2:
                print (this_target_FF,com_target_FF)
                print ("too similar: "+str(score))
            else:
                neg_poc_tar_FF.append((com_target, com_target_FF))
        ##### Nothing happens here if num_of_poc_from_pocFEA == 0 #####
        sampled = _sample_negative_pocket_entries(
            target_actives_dict[this_target], neg_poc_tar_FF,
            num_of_poc_from_pocFEA, target_actives_dict)
        active_to_all_poc_data.extend(sampled)
        tar_neg_poc_dict[this_target].extend(sampled)
        ##########################################

    poc_summary_dict = _group_by_protein_and_label(active_to_all_poc_data)
    for this_target in test_pro:
        if this_target not in target_FF_dict:
            continue
        this_target_FF = target_FF_dict[this_target]
        this_tar_actives = target_actives_dict[this_target]
        num_of_neg_DUDE_poc = len(poc_summary_dict.get((this_target, 0), []))
        neg_poc_ratio = max(1, int(float(num_of_neg_DUDE_poc)/len(this_tar_actives)))
        print (this_target)
        print ("len(this_tar_actives)")
        print (len(this_tar_actives))
        # NOTE(review): the ZINC negatives of the test set are drawn from
        # train_decoys; test_decoys is loaded but never used -- confirm that
        # this is intended.
        per_target, positives = _assay_entries(
            this_target, this_target_FF, this_tar_actives, target_TN_dict[this_target],
            tar_neg_poc_dict[this_target], neg_poc_ratio, train_decoys)
        active_to_all_poc_data.extend(positives)
        all_targets[this_target] = per_target

    _print_assay_summary(test_pro, active_to_all_poc_data, all_targets, trace_entries=True)
    return all_targets


def gene_fold_entries_neg_assay(all_targets_train,all_targets_test,fold,num_of_poc_from_pocFEA):
    """Flatten, shuffle and dump the assay train/test rows of one fold as JSON order lists."""
    test_pro = load_test_pro(fold)
    train_pro = load_train_pro(fold)
    all_train_entries = []
    all_test_entries = []
    for pro_name in test_pro:
        if pro_name in all_targets_test:
            all_test_entries.extend(all_targets_test[pro_name])
        else:
            print (pro_name +"not in all_targets!")
    for pro_name in train_pro:
        if pro_name in all_targets_train:
            all_train_entries.extend(all_targets_train[pro_name])
        else:
            print (pro_name +"not in all_targets!")
    shuffle(all_train_entries)
    shuffle(all_test_entries)
    with open('../data/fold_order_assay/train_fold_'+str(fold)+'_poc_'+str(num_of_poc_from_pocFEA)+'_nodec_DUDE_order_list.txt','w') as myfile:
        json.dump(all_train_entries, myfile)
    with open('../data/fold_order_assay/test_fold_'+str(fold)+'_poc_'+str(num_of_poc_from_pocFEA)+'_nodec_DUDE_order_list.txt','w') as myfile:
        json.dump(all_test_entries, myfile)


# The definition below continues past the end of the visible source; its
# visible statements are reproduced unchanged.
def load_DUDE_data_neg_assay_fold(fold,num_of_poc_from_pocFEA):
    final_data = []
    for which_dataset in ['train','test']:
        print ("dataset:" + which_dataset)
        print ("fold:" + str(fold))
        order_list_name = '../data/fold_order_assay/'+which_dataset+'_fold_'+str(fold)+'_poc_'+str(num_of_poc_from_pocFEA)+'_nodec_DUDE_order_list.txt'
        pockets=[]
        smiles=[]
        binding=[]
        pos_no = 0
        neg_no = 0
        with open(order_list_name,'r') as infile:
            all_entries = json.load(infile)
        for entry in all_entries:
            pocket_name=entry[0]
            sm_name=entry[1]
label_name=entry[2] pro_name = entry[3] pockets.append(pocket_name) smiles.append(sm_name) binding.append(label_name) if int(label_name)==1: pos_no=pos_no+1 else: neg_no=neg_no+1 smiles,pockets,labels=test_60(smiles,pockets,binding) in_slice_data=(pockets,smiles,numpy.array(labels)) final_data.append(in_slice_data) return final_data def init_fold_order_neg_assay(num_of_poc_from_pocFEA,num_drugFEATURE_sample): for fold in range (0,4): all_targets_train = load_neg_assay_dataset_train(fold,num_of_poc_from_pocFEA,num_drugFEATURE_sample) all_targets_test = load_neg_assay_dataset_test(fold,num_of_poc_from_pocFEA,num_drugFEATURE_sample) gene_fold_entries_neg_assay(all_targets_train,all_targets_test,fold,num_of_poc_from_pocFEA) ############## ############## def test_valid_atom(atoms): for atom in atoms: features = atom_features(atom) if features[0]==False: return False return True def test_60(smiles,pockets,labels): valid_smiles=[] valid_pockets=[] valid_labels=[] for i in range(len(smiles)): s = smiles[i] mol = MolFromSmiles(s) if not mol: print ("Could not parse SMILES string:") print (s) else: num_of_atoms = len(mol.GetAtoms()) if num_of_atoms > max_nodes_in_mol: print ("SMILES string>"+str(max_nodes_in_mol)+"!!") print (s) else: if test_valid_atom(mol.GetAtoms()): valid_smiles.append(smiles[i]) valid_pockets.append(pockets[i]) valid_labels.append(labels[i]) return valid_smiles,valid_pockets,valid_labels def load_test_pro(fold): filename = "../data/DUDE/DUDE_PDBID.csv" data=read_csv_DUDE(filename,"Target Name","PDB") DUDE_PDB_pro_dict={} for i in range(0,len(data[0])): target = data[0][i] PDB_ID = data[1][i] DUDE_PDB_pro_dict[PDB_ID]=target test_fold_file = open('../data/DUDE_test_PDB_fold_'+str(fold)+'.txt') test_fold_list = list(test_fold_file) test_set = set() for line in test_fold_list: PDBs = line.strip('\n').split(',') for pdb in PDBs: if pdb != '': pro_name = DUDE_PDB_pro_dict[pdb] test_set.add(pro_name) print ("test set fold "+str(fold)+":") print (test_set) 
return test_set def load_train_pro(fold): filename = "../data/DUDE/DUDE_PDBID.csv" data=read_csv_DUDE(filename,"Target Name","PDB") DUDE_PDB_pro_dict={} for i in range(0,len(data[0])): target = data[0][i] PDB_ID = data[1][i] DUDE_PDB_pro_dict[PDB_ID]=target train_fold_file = open('../data/DUDE_train_PDB_fold_'+str(fold)+'.txt') train_fold_list = list(train_fold_file) train_set = set() for line in train_fold_list: PDBs = line.strip('\n').split(',') for pdb in PDBs: if pdb != '': pro_name = DUDE_PDB_pro_dict[pdb] train_set.add(pro_name) print ("train set fold "+str(fold)+":") print (train_set) return train_set