import os import sys import time import numpy import theano import theano.tensor as T import bisect from scipy.io import matlab import math import scipy.ndimage import matplotlib as mpl mpl.use('Agg') import matplotlib.pyplot as plt import random import argparse import numpy as np import os from extract_pts import * from collections import OrderedDict def read_csv_DUDE(filename, input_name, target_name): data = ([], []) with open(filename) as file: reader = csv.DictReader(file) for row in reader: data[0].append(row[input_name]) data[1].append(row[target_name]) return data def save(path, ext='png', close=True, verbose=True): # Extract the directory and filename from the given path directory = os.path.split(path)[0] filename = "%s.%s" % (os.path.split(path)[1], ext) if directory == '': directory = '.' # If the directory does not exist, create it if not os.path.exists(directory): os.makedirs(directory) # The final path to save to savepath = os.path.join(directory, filename) if verbose: print("Saving figure to '%s'..." % savepath), plt.savefig(savepath+'.eps', format='eps',dpi=900) # Close it if close: plt.close() if verbose: print("Done") def sum_sqroot(v): sum_square = 0 for i in range (0,len(v)): sum_square = sum_square+(v[i]*v[i]) return numpy.sqrt(sum_square) def get_FF(target,PDB_ID,DUDE_ctr): pro_name=target ff_name = None ligs=cut_ligand_all_atoms(PDB_ID,DUDE_ctr,False) if ligs == []: print('cant find ligs'+'\n') for l in ligs: [lig_ID,lig_chain,lig_no, ctr]=l ff_name=PDB_ID+'_'+str(lig_ID)+'_'+str(lig_chain)+'_'+str(lig_no)+'.ff' return ff_name def load_test_pro(fold): filename = "../data/DUDE/DUDE_PDBID.csv" data=read_csv_DUDE(filename,"Target Name","PDB") DUDE_PDB_pro_dict={} for i in range(0,len(data[0])): target = data[0][i] PDB_ID = data[1][i] DUDE_PDB_pro_dict[PDB_ID]=target test_fold_file = open('../data/DUDE_test_PDB_fold_'+str(fold)+'.txt') test_fold_list = list(test_fold_file) test_set = set() for line in test_fold_list: PDBs = line.strip('\n').split(',') for pdb in PDBs: if pdb != '': pro_name = DUDE_PDB_pro_dict[pdb] test_set.add(pro_name) print ("test set fold "+str(fold)+":") print (test_set) return test_set def load_train_pro(fold): filename = "../data/DUDE/DUDE_PDBID.csv" data=read_csv_DUDE(filename,"Target Name","PDB") DUDE_PDB_pro_dict={} for i in range(0,len(data[0])): target = data[0][i] PDB_ID = data[1][i] DUDE_PDB_pro_dict[PDB_ID]=target train_fold_file = open('../data/DUDE_train_PDB_fold_'+str(fold)+'.txt') train_fold_list = list(train_fold_file) train_set = set() for line in train_fold_list: PDBs = line.strip('\n').split(',') for pdb in PDBs: if pdb != '': pro_name = DUDE_PDB_pro_dict[pdb] train_set.add(pro_name) # pro_name = line.strip('\n') # train_set.add(pro_name) print ("train set fold "+str(fold)+":") print (train_set) return train_set from sklearn.metrics import confusion_matrix method = 'ward' fig = plt.figure() target_FF_dict=OrderedDict() tar_ff_dict_5 = {'DRD3':'3pbl_ETQ.ff','KIT':'3g0e_B49.ff','INHA':'4trj_665.ff','FNTA':'3e37_ED5.ff','HIVINT':'3nf7_CIW.ff'} #4trj_NAD.ff filename = "../data/DUDE/DUDE_PDBID.csv" data=read_csv_DUDE(filename,"Target Name","PDB") for i in range(0,len(data[0])): target = data[0][i] PDB_ID = data[1][i] target_actives_file=open('../data/DUDE/'+target.lower()+'/'+'actives_final.ism') target_decoys_file=open('../data/DUDE/'+target.lower()+'/'+'decoys_final.ism') crystal_lig = open('../data/DUDE/'+target.lower()+'/'+'crystal_ligand.mol2') receptor=open('../data/DUDE/'+target.lower()+'/'+'receptor.pdb') lig_ctr=parse_crystal_lig(list(crystal_lig)) target_actives_list=list(target_actives_file) target_decoys_list=list(target_decoys_file) if target in ['DRD3','KIT','INHA','FNTA','HIVINT']: target_FF=tar_ff_dict_5[target] else: target_FF = get_FF(target,PDB_ID,lig_ctr) target_FF_dict[target]=target_FF train_pro = load_train_pro(0) test_pro = load_test_pro(0) split_tar_order=[] for target in target_FF_dict: split_tar_order.append(target) num_tar= 102 all_prob = [] for target in split_tar_order: if os.path.isfile('../results/DUDE_profile/'+target+'_y_prob.dat'): prob = numpy.load('../results/DUDE_profile/'+target+'_y_prob.dat') else: prob = numpy.zeros((1,num_tar)) ave_prob = numpy.mean(prob,axis=0) all_prob.append(ave_prob) all_prob = numpy.array(all_prob) num_tar = all_prob.shape[0] from scipy.cluster.hierarchy import dendrogram, linkage #################################################### ############## Hierarchical Clustering ############# #################################################### import scipy import pylab import scipy.cluster.hierarchy as sch fig = pylab.figure(figsize=(16,16)) ax1 = fig.add_axes([0.055,0.1,0.2,0.6]) Y = sch.linkage(all_prob, method=method) Z1 = sch.dendrogram(Y, orientation='right') ax1.set_xticks([]) ax1.set_yticks([]) # Compute and plot second dendrogram. ax2 = fig.add_axes([0.3,0.745,0.6,0.2]) Y = sch.linkage(all_prob, method=method) Z2 = sch.dendrogram(Y) ax2.set_xticks([]) ax2.set_yticks([]) # Plot distance matrix. axmatrix = fig.add_axes([0.3,0.1,0.6,0.6]) idx1 = Z1['leaves'] idx2 = Z2['leaves'] n_row_ = all_prob[idx1,:] n_row_ = n_row_[:,idx2] im = axmatrix.matshow(n_row_, vmin = 0, vmax = 1, aspect='auto', origin='lower', cmap=plt.get_cmap('Spectral_r')) axmatrix.set_xticks([]) axmatrix.set_yticks([]) tar_idx1=[] for o in idx1: tar_idx1.append(split_tar_order[o]) tar_idx2=[] for o in idx2: tar_idx2.append(split_tar_order[o]) axcolor = fig.add_axes([0.91,0.1,0.02,0.6]) axmatrix.set_xticks(range(num_tar)) axmatrix.set_xticklabels(tar_idx1,rotation='vertical',fontsize = 10) axmatrix.xaxis.set_label_position('top') axmatrix.xaxis.tick_top() axmatrix.set_yticks(range(num_tar)) axmatrix.set_yticklabels(tar_idx2 ,fontsize = 10) axmatrix.yaxis.set_label_position('left') axmatrix.yaxis.tick_left() save("../results/profile/hierarchy", ext="png", close=True, verbose=True) ticks=numpy.arange(0,num_tar,1) fig = plt.figure() ax = fig.add_subplot(111) cax = ax.matshow(n_row_, vmin = 0, vmax = 1, cmap=plt.get_cmap('Spectral_r'),interpolation='nearest') fig.colorbar(cax) plt.xticks(ticks, tar_idx1,rotation='vertical',fontsize = 4) plt.yticks(ticks, tar_idx2,fontsize = 4) ax.tick_params(width=0.5) save("../results/profile/hier", ext="png", close=True, verbose=True) # #################################################### # ############## Hierarchical Clustering ############# # #################################################### import scipy import pylab import scipy.cluster.hierarchy as sch all_prob_trans = numpy.transpose(all_prob) fig = pylab.figure(figsize=(16,16)) ax1 = fig.add_axes([0.055,0.1,0.2,0.6]) Y = sch.linkage(all_prob_trans, method=method) Z1 = sch.dendrogram(Y, orientation='right') ax1.set_xticks([]) ax1.set_yticks([]) # Compute and plot second dendrogram. ax2 = fig.add_axes([0.3,0.745,0.6,0.2]) Y = sch.linkage(all_prob_trans, method=method) Z2 = sch.dendrogram(Y) ax2.set_xticks([]) ax2.set_yticks([]) # Plot distance matrix. axmatrix = fig.add_axes([0.3,0.1,0.6,0.6]) idx1 = Z1['leaves'] idx2 = Z2['leaves'] print (idx1) print (idx2) print (all_prob_trans.shape) n_row_ = all_prob_trans[idx1,:] n_row_ = n_row_[:,idx2] im = axmatrix.matshow(n_row_, vmin = 0, vmax = 1, aspect='auto', origin='lower', cmap=plt.get_cmap('Spectral_r')) axmatrix.set_xticks([]) axmatrix.set_yticks([]) tar_idx1=[] for o in idx1: tar_idx1.append(split_tar_order[o]) # print tar_idx1 tar_idx2=[] for o in idx2: tar_idx2.append(split_tar_order[o]) axcolor = fig.add_axes([0.91,0.1,0.02,0.6]) axmatrix.set_xticks(range(num_tar)) axmatrix.set_xticklabels(tar_idx1,rotation='vertical',fontsize = 10) axmatrix.xaxis.set_label_position('top') axmatrix.xaxis.tick_top() axmatrix.set_yticks(range(num_tar)) axmatrix.set_yticklabels(tar_idx2 ,fontsize = 10) axmatrix.yaxis.set_label_position('left') axmatrix.yaxis.tick_left() save("../results/profile/hierarchy_col", ext="png", close=True, verbose=True) ticks=numpy.arange(0,num_tar,1) fig = plt.figure() ax = fig.add_subplot(111) cax = ax.matshow(n_row_, vmin = 0, vmax = 1, cmap=plt.get_cmap('Spectral_r'),interpolation='nearest') fig.colorbar(cax) plt.xticks(ticks, tar_idx1,rotation='vertical',fontsize = 4) plt.yticks(ticks, tar_idx2,fontsize = 4) ax.tick_params(width=0.5) save("../results/profile/hier_col", ext="png", close=True, verbose=True) #### TEST BOTH #################################################### ############## Hierarchical Clustering ############# #################################################### import scipy import pylab import scipy.cluster.hierarchy as sch all_prob_trans = numpy.transpose(all_prob) fig = pylab.figure(figsize=(16,16)) ax2 = fig.add_axes([0.3,0.745,0.6,0.2]) Y = sch.linkage(all_prob_trans, method=method) Z2 = sch.dendrogram(Y) ax2.set_xticks([]) ax2.set_yticks([]) n_col_ = all_prob_trans[idx2,:] n_col_ = n_col_[:,idx2] n_col_trans = numpy.transpose(n_col_) ax1 = fig.add_axes([0.055,0.1,0.2,0.6]) Y = sch.linkage(n_col_trans, method=method) Z1 = sch.dendrogram(Y, orientation='right') ax1.set_xticks([]) ax1.set_yticks([]) # Plot distance matrix. axmatrix = fig.add_axes([0.3,0.1,0.6,0.6]) idx2 = Z2['leaves'] idx1 = Z1['leaves'] n_row_ = n_col_trans[idx1,:] # n_row_ = n_row_[:,idx2] im = axmatrix.matshow(n_row_, vmin = 0, vmax = 1, aspect='auto', origin='lower', cmap=plt.get_cmap('Spectral_r')) axmatrix.set_xticks([]) axmatrix.set_yticks([]) tar_idx2=[] for o in idx2: tar_idx2.append(split_tar_order[o]) tar_idx1=[] for o in idx1: tar_idx1.append(tar_idx2[o]) axcolor = fig.add_axes([0.91,0.1,0.02,0.6]) pylab.colorbar(im, cax=axcolor) axmatrix.set_xticks(range(num_tar)) axmatrix.set_xticklabels(tar_idx2,rotation='vertical',fontsize = 10) axmatrix.xaxis.set_label_position('top') axmatrix.xaxis.tick_top() axmatrix.set_yticks(range(num_tar)) axmatrix.set_yticklabels(tar_idx1 ,fontsize = 10) axmatrix.yaxis.set_label_position('left') axmatrix.yaxis.tick_left() save("../results/profile/hierarchy_both", ext="png", close=True, verbose=True) ticks=numpy.arange(0,num_tar,1) fig = plt.figure() ax = fig.add_subplot(111) cax = ax.matshow(n_row_, vmin = 0, vmax = 1, cmap=plt.get_cmap('Spectral_r'),interpolation='nearest') fig.colorbar(cax) plt.xticks(ticks, tar_idx2,rotation='vertical',fontsize = 4) plt.yticks(ticks, tar_idx1,fontsize = 4) ax.tick_params(width=0.5) save("../results/profile/hier_both", ext="png", close=True, verbose=True) # cluster on rows # Group0 = ['ROCK1','MP2K1','CDK2','AKT2','MK10','BRAF','FAK1','AKT1','FGFR1','MK01','EGFR','LCK','CSF1R','PLK1','ABL1','MAPK2','TGFR1','MK14','KIT','MET','JAK2','VGFR2','KPCB','WEE1','TGFR1','SRC'] # Group1 = ['PGH1','PGH2','PYRD','THB','FABP4','PPARA','PPARD','PPARG','MCR','ANDR','PRGR','RXRA','GCR','ESR1','ESR2','PA2GA','INHA'] # Group2 = ['LKHA4','ADA','DYR','ACES','DRD3','FA7','HDAC2','AOFB','DHI1','ITAL','KIF11','DPP4','CXCR4','NOS1','CASP3','XIAP','AMPC','DEF','CAH2','ADA17','MMP13','ADRB1','ADRB2','HMDH'] # Group3 = ['ACE','HIVPR','FKB1A','NRAM','THRB','TRY1','TRYB1','BACE1','RENI','FPPS'] # Group4 = ['PUR2','COMT','GRIK1','GLCM','HIVINT','HDAC8','ALDR','PTN1','PYGM','CP2C9','CP3A4','FNTA','HIVPR','HS90A','FA10','PARP1','UROK','AA2AR','PDE5A','KITH','SAHH','GRIA2','TYSY','HXK4','PNPH'] Group0 = ['ROCK1','MP2K1','CDK2','AKT2','MK10','BRAF','FAK1','AKT1','FGFR1','MK01','EGFR','LCK','CSF1R','PLK1','ABL1','MAPK2','TGFR1','MK14','KIT','MET','JAK2','VGFR2','KPCB','WEE1','TGFR1','SRC'] Group1 = ['PGH1','PGH2','PYRD','THB','PPARA','PPARD','PPARG','MCR','ANDR','PRGR','RXRA','GCR','ESR1','ESR2','INHA','FABP4','PA2GA'] Group2 = ['DYR','ACES','ADA','LKHA4','FA7','DPP4','CASP3','ADA17','MMP13','HIVPR','THRB','TRY1','TRYB1','BACE1','RENI','FA10','UROK','ACE','HDAC2','AOFB','DHI1','ITAL','KIF11'] Group3 = ['CXCR4','DRD3','ADRB1','ADRB2','AA2AR','NOS1','XIAP','AMPC','DEF','CAH2','HMDH','FKB1A','NRAM','FPPS'] Group4 = ['PUR2','COMT','GRIK1','GLCM','HIVINT','HDAC8','ALDR','PTN1','PYGM','CP2C9','CP3A4','FNTA','HIVRT','HS90A','PARP1','PDE5A','KITH','SAHH','GRIA2','TYSY','HXK4','PNPH'] # Group0 = ['ABL1','AKT1','AKT2','BRAF','CDK2','CSF1R','EGFR','FAK1','FGFR1','IGF1R','JAK2','KPCB','LCK','MAPK2','MET','MK01','MK10','MK14','MP2K1','PLK1','ROCK1','SRC','TGFR1','HXK4'] # Group1 = ['FPPS','GLCM','HDAC2','HDAC8','GRIK1','DEF','AMPC','NRAM','PUR2','TYSY','SAHH','HS90A','PNPH','COMT','NOS1','ADA'] # Group2 = ['ACE','UROK','LKHA4','ADA17','BACE1','CASP3','MMP13','THRB','TRYB1','FA10','DPP4','TRY1','FA7','HIVPR','RENI'] # Group3 = ['HMDH','AA2AR','CXCR4','ADRB1','ADRB2'] # Group4 = ['ANDR','ESR1','ESR2','GCR','MCR','PRGR','RXRA','THB', 'PPARA','PPARD','PPARG','PGH1','PGH2','PYRD','PDE5A','CP3A4','DHI1','ALDR','FKB1A','FABP4','PA2GA'] # Group5 = ['KITH','CP2C9','ITAL','PTN1','HIVINT','PYGM','KIF11','DYR','HIVRT','PARP1','GRIA2','AOFB','CAH2','ACES'] row_val_dict = {} for i in range(n_row_.shape[0]): row_val_dict[tar_idx1[i]]=n_row_[i] row_order=Group0+Group1+Group2+Group3+Group4 col_order=Group0+Group1+Group2+Group3+Group4 print (len(row_order)) print (len(col_order)) new_row=[] for target in row_order: new_row.append(row_val_dict[target]) new_row=numpy.array(new_row) col_val_dict = {} trans_new_row = numpy.transpose(new_row) for i in range(trans_new_row.shape[0]): col_val_dict[tar_idx2[i]]=trans_new_row[i] new_col=[] for target in col_order: new_col.append(col_val_dict[target]) new_col=numpy.array(new_col) final = numpy.transpose(new_col) ticks=numpy.arange(0,102,1) fig = plt.figure() ax = fig.add_subplot(111) cax = ax.matshow(final, vmin = 0, vmax = 1, cmap=plt.get_cmap('Spectral_r'),interpolation='nearest') fig.colorbar(cax) plt.xticks(ticks, col_order,rotation='vertical',fontsize = 4) plt.yticks(ticks, row_order,fontsize = 4) ax.tick_params(width=0.5) save("../results/profile/hier_both_by_knowledge", ext="png", close=True, verbose=True)