"""Render t-SNE views of pocket fingerprints coloured by SCOP family.

The script reads two fingerprint matrices (one per layer), keeps the
pockets whose PDB id belongs to a sufficiently populated SCOP family,
embeds them in three dimensions with t-SNE and writes three 3-D and three
2-D scatter plots per layer.
"""
import matplotlib as mpl
mpl.use('Agg')  # select the non-interactive backend before pyplot is imported

import csv
import inspect
import json
import os
import pickle  # noqa: F401  (kept from the original import list)
import sys  # noqa: F401  (kept from the original import list)
from collections import Counter, OrderedDict

import numpy
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401  (registers the '3d' projection on old matplotlib)
from sklearn.decomposition import PCA  # noqa: F401  (kept from the original import list)
from sklearn.manifold import TSNE

# Input locations, relative to the working directory the script is run from.
# To plot the fine-tuned fingerprints instead, point both layer entries at
# '../../results/User/finetune_poc_fp.dat' and read the pocket order from
# '../../results/User/finetune_poc_fp_order.txt'.
_SCOP_LABEL_FILE = '../../data/SCOP_pdb_family.txt'
_TARGET_NAME_FILE = '../../data/target_FF_dict.json'
_FINGERPRINT_FILES = (
    '../../results/User/auto_poc_fp_layer_0.dat',
    '../../results/User/auto_poc_fp_layer_1.dat',
)
_FINGERPRINT_DIM = 512

# PDB ids whose family is pinned instead of being chosen by majority vote.
_FAMILY_OVERRIDES = {
    '3kl6': 'b.47.1.2',
    '1w7x': 'b.47.1.2',
    '2w26': 'b.47.1.2',
}
# Family that is never plotted, whatever its size.
_EXCLUDED_FAMILY = 'l.1.1.1'
# Marker shapes are reused cyclically across families.
_MARKER_CYCLE = ('o', '^', 'p')

_AXES_3D = ((0, 1, 2), (1, 2, 0), (2, 0, 1))
_AXES_2D = ((0, 1), (1, 2), (2, 0))


def read_csv_DUDE(filename, input_name, target_name):
    """Read two columns of a CSV file that has a header row.

    Args:
        filename: path of the CSV file.
        input_name: header of the column returned first.
        target_name: header of the column returned second.

    Returns:
        A tuple ``(inputs, targets)`` of two lists holding the raw string
        values of the two columns, in row order.

    Raises:
        KeyError: if a row lacks one of the requested columns.
    """
    inputs, targets = [], []
    with open(filename) as handle:
        for row in csv.DictReader(handle):
            inputs.append(row[input_name])
            targets.append(row[target_name])
    return (inputs, targets)


def save(path, lgd, ext='png', close=True, verbose=True):
    """Write the current pyplot figure to disk as EPS.

    The output name is ``<path>.<ext>.eps``: ``ext`` only contributes to the
    file name, the data is always written in EPS format. That naming is kept
    so existing outputs stay where they were.

    Args:
        path: output path without extension; missing directories are created.
        lgd: legend artist, passed as an extra artist so the tight bounding
            box includes it.
        ext: name fragment inserted before the ``.eps`` suffix.
        close: close the current figure after saving.
        verbose: print progress messages.
    """
    directory, stem = os.path.split(path)
    if directory == '':
        directory = '.'
    os.makedirs(directory, exist_ok=True)

    savepath = os.path.join(directory, "%s.%s" % (stem, ext))
    written_path = savepath + '.eps'

    if verbose:
        # Report the file that is really written (the original message
        # omitted the '.eps' suffix).
        print("Saving figure to '%s'..." % written_path)

    plt.savefig(written_path, format='eps', dpi=900,
                bbox_extra_artists=(lgd,), bbox_inches='tight')

    if close:
        plt.close()
    if verbose:
        print("Done")


def load_scop_label():
    """Load the PDB/family table from the SCOP label file.

    Each non-blank line is split on tabs; the first field is used as the PDB
    id and the third as the family (the second field is ignored).

    Returns:
        ``(family_pdb, pdb_family)``: a dict mapping each family to the list
        of PDB ids listed for it, and a dict mapping each PDB id to the list
        of families listed for it. Repeated rows produce repeated entries.

    Raises:
        IndexError: if a non-blank line has fewer than three fields.
    """
    family_pdb = {}
    pdb_family = {}
    with open(_SCOP_LABEL_FILE) as scop_file:
        for line in scop_file:
            if not line.strip():
                continue  # tolerate blank lines instead of failing on them
            fields = line.strip('\n').split('\t')
            pdb = fields[0]
            family = fields[2]
            family_pdb.setdefault(family, []).append(pdb)
            pdb_family.setdefault(pdb, []).append(family)
    return family_pdb, pdb_family


def get_cmap(n, name='hsv'):
    """Return the colormap ``name`` resampled to ``n`` entries.

    Other names that have been used here: 'Dark2', 'RdBu', 'Blues'.
    """
    # plt.cm.get_cmap no longer exists in current matplotlib; pyplot.get_cmap
    # accepts the same (name, lut) arguments.
    return plt.get_cmap(name, n)


def _assign_single_family(pdb_family):
    """Map each PDB id to one family: a pinned override or its most common one."""
    single = {}
    for pdb, families in pdb_family.items():
        if pdb in _FAMILY_OVERRIDES:
            single[pdb] = _FAMILY_OVERRIDES[pdb]
        else:
            single[pdb] = Counter(families).most_common()[0][0]
    return single


def _run_tsne(vectors, perplexity):
    """Embed ``vectors`` in three dimensions with t-SNE and return the result."""
    # Newer scikit-learn names the iteration budget ``max_iter``; older
    # releases only know ``n_iter``. Use whichever this installation accepts.
    accepted = inspect.signature(TSNE.__init__).parameters
    iteration_kwarg = 'max_iter' if 'max_iter' in accepted else 'n_iter'
    model = TSNE(
        n_components=3,
        init="random",
        perplexity=perplexity,
        learning_rate=50.0,
        verbose=1,
        **{iteration_kwarg: 50000}
    )
    return model.fit_transform(vectors)


def _plot_projection(coords, axes, family_order, indices_by_family,
                     family_color, family_marker, labels, ncol, path):
    """Scatter one projection of ``coords`` per family and save the figure.

    ``axes`` lists the embedding columns to plot, in plotting order. Three
    columns give a 3-D plot; two give a 2-D plot in which every point is also
    annotated with its entry from ``labels``.
    """
    if len(axes) == 3:
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
    else:
        fig, ax = plt.subplots()

    for fam in family_order:
        indices = indices_by_family.get(fam)
        if not indices:
            continue  # no pocket of this family survived the selection
        columns = [coords[indices, axis] for axis in axes]
        # A single colour is passed as a one-row 2-D sequence: a bare RGBA
        # tuple is value-mapped when the family has exactly four points.
        ax.scatter(*columns, c=[family_color[fam]], marker=family_marker[fam],
                   label=fam, edgecolors='k')
        if len(axes) == 2:
            for index, px, py in zip(indices, columns[0], columns[1]):
                ax.annotate(labels[index], (px, py), fontsize=3)

    lgd = plt.legend(loc="upper left", bbox_to_anchor=(1, 1),
                     fontsize=5, ncol=ncol)
    save(path, lgd)


def plot_data(mode, mem_thres, poc_order_file):
    """Embed pocket fingerprints with t-SNE and plot them by SCOP family.

    For each of the two fingerprint layers the function selects the pockets
    whose PDB id (first four characters of the pocket name, lower-cased)
    belongs to a retained family, runs t-SNE, dumps the embedding to
    ``../../results/User/tsne_3d_layer_<layer>.dat`` and writes six figures
    under ``../../results/tsne/``.

    Args:
        mode: selection mode; only ``'SCOP'`` is implemented.
        mem_thres: a family is retained only when it has strictly more than
            this many PDB ids.
        poc_order_file: iterable of lines, one pocket name per line, in the
            same order as the rows of the fingerprint matrices.

    Raises:
        ValueError: if ``mode`` is not ``'SCOP'``.
    """
    if mode != 'SCOP':
        # Other modes never defined the plotting parameters and failed late
        # with a NameError; reject them before doing any work.
        raise ValueError("unsupported mode %r (only 'SCOP' is implemented)" % (mode,))
    perplexity = 5
    ncol = 6

    _, pdb_family = load_scop_label()
    pdb_single_fam = _assign_single_family(pdb_family)

    single_fam_pdb = OrderedDict()
    for pdb, fam in pdb_single_fam.items():
        single_fam_pdb.setdefault(fam, []).append(pdb)

    all_fams = sorted(single_fam_pdb, reverse=True)
    final_fams = [
        fam for fam in all_fams
        if len(single_fam_pdb[fam]) > mem_thres and fam != _EXCLUDED_FAMILY
    ]
    num_of_class = len(final_fams)
    print(final_fams)
    print("num_of_total class")
    # Count before filtering; this used to repeat the filtered count.
    print(len(all_fams))
    print("num_of_class")
    print(num_of_class)

    # One colour and one marker per retained family.
    colormap = plt.cm.gist_ncar  # alternatives tried: nipy_spectral, Set1, Paired
    family_color = {
        fam: colormap(value)
        for fam, value in zip(final_fams, numpy.linspace(0, 0.9, num_of_class))
    }
    family_marker = {
        fam: _MARKER_CYCLE[position % len(_MARKER_CYCLE)]
        for position, fam in enumerate(final_fams)
    }

    with open(_TARGET_NAME_FILE, 'r') as infile:
        target_FF_dict = json.load(infile)

    # NOTE(review): allow_pickle=True lets numpy.load read arrays written with
    # ndarray.dump (as this script does for its own output). Unpickling runs
    # arbitrary code, so only point these paths at files you produced yourself.
    layer_fps = [
        numpy.reshape(numpy.load(path, allow_pickle=True), (-1, _FINGERPRINT_DIM))
        for path in _FINGERPRINT_FILES
    ]

    all_poc_name = [line.strip('\n') for line in poc_order_file]
    # Names beyond the number of layer-1 rows have no fingerprint.
    all_poc_name = all_poc_name[:layer_fps[1].shape[0]]

    for layer, all_poc_fps in enumerate(layer_fps):
        vectors = []
        labels = []
        fams = []
        for row, poc_name in enumerate(all_poc_name):
            poc_vec = all_poc_fps[row]
            fam = pdb_single_fam.get(poc_name[0:4].lower())
            if fam not in family_color:
                continue  # unknown PDB id or family that was filtered out
            vectors.append(poc_vec)
            labels.append(target_FF_dict.get(poc_name, ''))
            fams.append(fam)

        print("NUMBER OF FINAL POCKETS!!!!")
        print(len(vectors))

        embedding = _run_tsne(numpy.array(vectors), perplexity)
        # The in-memory array is used directly below; the dump is kept only
        # as an output artefact.
        embedding.dump("../../results/User/tsne_3d_layer_" + str(layer) + ".dat")

        indices_by_family = {}
        for index, fam in enumerate(fams):
            indices_by_family.setdefault(fam, []).append(index)

        for axes in _AXES_3D:
            tag = ''.join(str(axis) for axis in axes)
            _plot_projection(
                embedding, axes, final_fams, indices_by_family,
                family_color, family_marker, labels, ncol,
                "../../results/tsne/tsne_3d_" + tag + "_layer_" + str(layer))

        for axes in _AXES_2D:
            tag = ''.join(str(axis) for axis in axes)
            _plot_projection(
                embedding, axes, final_fams, indices_by_family,
                family_color, family_marker, labels, ncol,
                "../../results/tsne/tsne_color_" + tag + "_" + str(layer))


with open('../../results/User/auto_poc_fp_order.txt') as poc_order_file:
    plot_data('SCOP', 5, poc_order_file)