import os import sys import time import numpy import theano import theano.tensor as T from theano.tensor.shared_randomstreams import RandomStreams from pprint import pprint from scipy.io import matlab import re import math from theano import shared from collections import OrderedDict from layers import * from theano.misc.pkl_utils import dump import numpy as np from io_utils_MUV import load_MUV_data_shuffle_fold, load_MUV_dataset_CV, test_60 from mol_graph import * from process_poc_pretrain import * max_poc_degrees = 20 max_nodes_in_poc = 50 max_mol_degrees = 6 max_nodes_in_mol = 60 input_dir = '../data/ALL_ff/' input_ext = '.ff' def pocket_ff_to_numpy(ff_list,max_ff, min_ff, mean_ff): print ("ff_list") all_pocket=[] dat_num=0 for fn in ff_list: FV=[] site_ID=fn.strip('.ff') ele = fn.split('_') PDB = fn[0:4] lig = ele[1].split('.')[0] correct_fn = PDB.lower()+'_'+lig+'.ff' f = open(os.path.join(input_dir,correct_fn)) infile=list(f) for line in infile: S=line.split() if S!=[]: if len(S[0])>3: if S[0][0:3]=="Env": # print line feature_vec=numpy.zeros((480,)) for i in range (0,480): #S[1]-S[480] if max_ff[i]-min_ff[i]!=0: feature_vec[i]=(float(S[i+1])-min_ff[i])/(max_ff[i]-min_ff[i]) else: feature_vec[i]=float(S[i+1]) x=float(S[482]) y=float(S[483]) z=float(S[484]) pos=[x,y,z] pos=numpy.array(pos) T=[feature_vec,pos] T=numpy.array(T) FV.append(T) f.close() vectors = numpy.array(FV) all_pocket.append([fn,vectors]) return all_pocket def shared_dataset_mol(mol_atom_features, mol_bond_features, mol_atom_neighbors, mol_bond_neighbors, mol_degrees, mol_mask, borrow=True): shared_mol_atom_features = theano.shared(numpy.asarray(mol_atom_features, dtype=theano.config.floatX),borrow=borrow) shared_mol_bond_features = theano.shared(numpy.asarray(mol_bond_features, dtype=theano.config.floatX),borrow=borrow) shared_mol_atom_neighbors = theano.shared(numpy.asarray(mol_atom_neighbors, dtype=theano.config.floatX),borrow=borrow) shared_mol_bond_neighbors = theano.shared(numpy.asarray(mol_bond_neighbors, dtype=theano.config.floatX),borrow=borrow) shared_mol_degrees = theano.shared(numpy.asarray(mol_degrees, dtype=theano.config.floatX),borrow=borrow) shared_mask = theano.shared(numpy.asarray(mol_mask, dtype=theano.config.floatX),borrow=borrow) return shared_mol_atom_features, shared_mol_bond_features, shared_mol_atom_neighbors, shared_mol_bond_neighbors, shared_mol_degrees, shared_mask def shared_dataset(env_features, env_neighbors, env_degrees, mask, labels, borrow=True): shared_env_features = theano.shared(numpy.asarray(env_features,dtype=theano.config.floatX),borrow=borrow) shared_env_neighbors = theano.shared(numpy.asarray(env_neighbors,dtype=theano.config.floatX),borrow=borrow) shared_env_degrees = theano.shared(numpy.asarray(env_degrees,dtype=theano.config.floatX),borrow=borrow) shared_mask = theano.shared(numpy.asarray(mask,dtype=theano.config.floatX),borrow=borrow) shared_labels = theano.shared(numpy.asarray(labels,dtype=theano.config.floatX),borrow=borrow) return shared_env_features, shared_env_neighbors, shared_env_degrees, shared_mask, T.cast(shared_labels, 'int32') def sum_and_stack(features, mask, num_envs, fp_length, max_nodes): # features: 250, 512 # mask: 250 features = features * mask.dimshuffle(0,'x') stacked_features = T.reshape(features, (int(num_envs/max_nodes), int(max_nodes), int(fp_length)), ndim=3) return T.sum(stacked_features,axis=1) def sum_and_stack_atoms(features, mask, num_input, max_nodes_in_mol, fp_length): # features: 250, 512 # mask: 250 features = features * mask.dimshuffle(0,'x') stacked_features = T.reshape(features, (int(num_input), int(max_nodes_in_mol), int(fp_length)), ndim=3) return T.sum(stacked_features,axis=1) def relu(X): """Rectified linear units (relu)""" return T.maximum(0,X) class PocketGraph(object): def __init__(self): self.nodes = {} # dict of lists of nodes, keyed by node type def new_node(self, ntype, pos, features=None, env_ix=None): new_node = Env(ntype, pos, features, env_ix) self.nodes.setdefault(ntype, []).append(new_node) return new_node def add_subgraph(self, subgraph): old_nodes = self.nodes new_nodes = subgraph.nodes for ntype in set(old_nodes.keys()) | set(new_nodes.keys()): old_nodes.setdefault(ntype, []).extend(new_nodes.get(ntype, [])) def get_degree(self, ntype): all_node_degree=[] for node in self.nodes[ntype]: all_node_degree.append(len(node.get_neighbors(ntype))) return numpy.array(all_node_degree) def feature_array(self, ntype): assert ntype in self.nodes return np.array([node.features for node in self.nodes[ntype]]) def pos_array(self, ntype): assert ntype in self.nodes return np.array([node.pos for node in self.nodes[ntype]]) def neighbor_list(self, self_ntype, neighbor_ntype): assert self_ntype in self.nodes and neighbor_ntype in self.nodes neighbor_idxs = {n : i for i, n in enumerate(self.nodes[neighbor_ntype])} for self_node in self.nodes[self_ntype]: # print "indi number of env:" neighbors=self_node.get_neighbors(neighbor_ntype) return [[neighbor_idxs[neighbor] for neighbor in self_node.get_neighbors(neighbor_ntype)] for self_node in self.nodes[self_ntype]] def env_ix_array(self): return np.array([node.env_ix for node in self.nodes['env']]) class Env(object): __slots__ = ['ntype', 'features', '_neighbors', 'pos', 'env_ix'] def __init__(self, ntype, pos, features, env_ix): self.ntype = ntype self.features = features self._neighbors = [] self.pos = pos self.env_ix = env_ix def add_neighbors(self, neighbor_list): for neighbor in neighbor_list: self._neighbors.append(neighbor) neighbor._neighbors.append(self) def get_neighbors(self, ntype): return [n for n in self._neighbors if n.ntype == ntype] def dist(env_1,env_2): return np.sqrt(np.sum((env_1.pos-env_2.pos)**2)) def pad_neighbors(neighbors): pad_neighbors = numpy.zeros((len(neighbors),len(neighbors))) for i in range (len(neighbors)): entry = neighbors[i] for e in range (len(entry)): pad_neighbors[i][entry[e]]=1 return pad_neighbors def pad_neighbors_bond(neighbors,num_atoms,num_bonds): pad_neighbors = numpy.zeros((num_atoms,num_bonds)) for i in range (len(neighbors)): entry = neighbors[i] for e in range (len(entry)): pad_neighbors[i][entry[e]]=1 return pad_neighbors def pad_degree(degrees,max_degrees): pad_degrees = numpy.zeros((len(degrees),max_degrees)) for i in range (len(degrees)): degree = degrees[i] pad_degrees[i][degree]=1 return pad_degrees def get_pocket_attributes(ff_list,max_ff, min_ff, mean_ff): pockets = pocket_ff_to_numpy(ff_list,max_ff, min_ff, mean_ff) big_graph, mask = graph_from_pocket_tuple(pockets) env_features = big_graph.feature_array('env') env_neighbors = big_graph.neighbor_list('env','env') env_neighbors = pad_neighbors(env_neighbors) env_degrees = big_graph.get_degree('env') env_degrees = pad_degree(env_degrees, max_poc_degrees) return env_features, env_neighbors, env_degrees, mask def get_atom_bond_dim(smiles_tuple): big_graph, mask = graph_from_smiles_tuple(smiles_tuple) mol_atom_features = big_graph.feature_array('atom') mol_bond_features = big_graph.feature_array('bond') num_atom_features = mol_atom_features.shape[1] num_bond_features = mol_bond_features.shape[1] return num_atom_features, num_bond_features def get_mol_attributes(smiles_tuple): big_graph, mask = graph_from_smiles_tuple(smiles_tuple) num_atoms = len(big_graph.nodes['atom']) num_bonds = len(big_graph.nodes['bond']) mol_atom_features = big_graph.feature_array('atom') mol_bond_features = big_graph.feature_array('bond') mol_atom_neighbors = big_graph.neighbor_list('atom','atom') mol_bond_neighbors = big_graph.neighbor_list('atom','bond') mol_atom_neighbors = pad_neighbors(mol_atom_neighbors) mol_bond_neighbors = pad_neighbors_bond(mol_bond_neighbors,num_atoms,num_bonds) mol_degrees = big_graph.get_degree('atom') mol_degrees = pad_degree(mol_degrees, max_mol_degrees) return mol_atom_features, mol_bond_features, mol_atom_neighbors, mol_bond_neighbors, mol_degrees, mask def graph_from_pocket_tuple(pocket_Env_list): graph_list = [graph_from_Env(s) for s in pocket_Env_list] big_graph = PocketGraph() for i in range (len(graph_list)): subgraph = graph_list[i] graph, mask = subgraph big_graph.add_subgraph(graph) if i ==0: big_graph_mask = mask else: big_graph_mask = numpy.concatenate((big_graph_mask, mask), axis=0) return big_graph, big_graph_mask def graph_from_smiles_tuple(smiles_tuple): graph_list = [graph_from_smiles(s) for s in smiles_tuple] big_graph = MolGraph() count=0 for i in range (len(graph_list)): subgraph = graph_list[i] graph, mask = subgraph if graph is None: print ("I found a bug!!!!") else: big_graph.add_subgraph(graph) if count ==0: big_graph_mask = mask else: big_graph_mask = numpy.concatenate((big_graph_mask, mask), axis=0) count=count+1 return big_graph, big_graph_mask def graph_from_smiles(smiles): graph = MolGraph() mol = MolFromSmiles(smiles) if not mol: print ("Could not parse SMILES string:") print (smiles) return None, None else: atoms_by_rd_idx = {} for atom in mol.GetAtoms(): features = atom_features(atom) if features[0]==False: return None, None new_atom_node = graph.new_node('atom', features=features[1], rdkit_ix=atom.GetIdx()) atoms_by_rd_idx[atom.GetIdx()] = new_atom_node dummpy_atom_shape=features[1].shape for bond in mol.GetBonds(): atom1_node = atoms_by_rd_idx[bond.GetBeginAtom().GetIdx()] atom2_node = atoms_by_rd_idx[bond.GetEndAtom().GetIdx()] new_bond_node = graph.new_node('bond', features=bond_features(bond)) new_bond_node.add_neighbors((atom1_node, atom2_node)) atom1_node.add_neighbors((atom2_node,)) num_of_atoms = len(mol.GetAtoms()) mask = numpy.zeros((max_nodes_in_mol,)) # print "num_of_atoms" # print num_of_atoms for i in range(num_of_atoms): mask[i]=1 if num_of_atoms