predict-distances.py

'''
File: Contains the code to predict distances
'''

import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
import os
import sys
import numpy as np
import datetime
import pickle
import getopt

from tensorflow.python.keras import layers
from tensorflow.python.keras.layers import Input, Convolution2D, Activation, add, Dropout, BatchNormalization
from tensorflow.python.keras.models import Model

epsilon = 1e-7

if sys.version_info < (3,0,0):
    print('Python 3 required!!!')
    sys.exit(1)

# Some GPUs don't allow memory growth by default (keep both options)
# Option 1
for gpu in tf.config.experimental.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)
# Option 2
#import keras.backend as K
#gpu_options = tf.GPUOptions(allow_growth=True)
#sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
#K.tensorflow_backend.set_session(sess)

def usage():
    print('Usage:')
    print(sys.argv[0] + ' <-w file_weights> <-p file_pkl> <-o outrr>')

try:
    opts, args = getopt.getopt(sys.argv[1:], "w:p:o:h")
except getopt.GetoptError as err:
    print(err)
    usage()
    sys.exit(2)

wts = ''
pkl = ''
rr  = ''
for o, a in opts:
    if o in ("-h", "--help"):
        usage()
        sys.exit()
    elif o in ("-w"):
        wts = os.path.abspath(a)
    elif o in ("-p"):
        pkl = os.path.abspath(a)
    elif o in ("-o"):
        rr = os.path.abspath(a)
    else:
        assert False, "Error!! unhandled option!!"

if len(wts) < 2:
    print('wts file undefined!')
    usage()
    sys.exit()
if len(pkl) < 2:
    print('in pkl undefined!')
    usage()
    sys.exit()
if len(rr) < 2:
    print('our rr undefined!')
    usage()
    sys.exit()

pad_size = 10
expected_n_channels = 57
OUTL = 1024


def save_dist_to_contacts_rr(seq, pred_matrix, file_rr):
    rr = open(file_rr, 'w')
    rr.write(seq + "\n")
    P = np.copy(pred_matrix)
    maxd = np.max(P)
    L = len(P[:])
    for j in range(0, L):
        for k in range(j, L):
            P[j, k] = (P[k, j, 0] + P[j, k, 0]) / 2.0
    PROB = 4.0 / P
    PROB[PROB > 1.0] = 1.0
    for j in range(0, L):
        for k in range(j, L):
            if abs(j - k) < 5:
                continue
            rr.write("%i %i 0 8 %.5f\n" %(j+1, k+1, PROB[j][k]))
    rr.close()
    print('Written RR ' + file_rr + ' !')


def get_feature(infile, expected_n_channels):
    features = pickle.load(open(infile, 'rb'))
    l = len(features['seq'])
    seq = features['seq']
    # Create X and Y placeholders
    X = np.full((l, l, expected_n_channels), 0.0)
    # Add secondary structure
    ss = features['ss']
    assert ss.shape == (3, l)
    fi = 0
    for j in range(3):
        a = np.repeat(ss[j].reshape(1, l), l, axis = 0)
        X[:, :, fi] = a
        fi += 1
        X[:, :, fi] = a.T
        fi += 1
    # Add PSSM
    pssm = features['pssm']
    assert pssm.shape == (l, 22)
    for j in range(22):
        a = np.repeat(pssm[:, j].reshape(1, l), l, axis = 0)
        X[:, :, fi] = a
        fi += 1
        X[:, :, fi] = a.T
        fi += 1
    # Add SA
    sa = features['sa']
    assert sa.shape == (l, )
    a = np.repeat(sa.reshape(1, l), l, axis = 0)
    X[:, :, fi] = a
    fi += 1
    X[:, :, fi] = a.T
    fi += 1
    # Add entrophy
    entropy = features['entropy']
    assert entropy.shape == (l, )
    a = np.repeat(entropy.reshape(1, l), l, axis = 0)
    X[:, :, fi] = a
    fi += 1
    X[:, :, fi] = a.T
    fi += 1
    # Add CCMpred
    ccmpred = features['ccmpred']
    assert ccmpred.shape == ((l, l))
    X[:, :, fi] = ccmpred
    fi += 1
    # Add  FreeContact
    freecon = features['freecon']
    assert freecon.shape == ((l, l))
    X[:, :, fi] = freecon
    fi += 1
    # Add potential
    potential = features['potential']
    assert potential.shape == ((l, l))
    X[:, :, fi] = potential
    fi += 1
    assert fi == expected_n_channels
    assert X.max() < 100.0
    assert X.min() > -100.0
    return X


# Architecture DEEPCON (distances)
def deepcon_rdd_distances(L, num_blocks, width, expected_n_channels):
    print('')
    print('Model params:')
    print('L', L)
    print('num_blocks', num_blocks)
    print('width', width)
    print('expected_n_channels', expected_n_channels)
    print('')
    dropout_value = 0.3
    my_input = Input(shape = (L, L, expected_n_channels))
    tower = BatchNormalization()(my_input)
    tower = Activation('relu')(tower)
    tower = Convolution2D(width, 1, padding = 'same')(tower)
    n_channels = width
    d_rate = 1
    for i in range(num_blocks):
        block = BatchNormalization()(tower)
        block = Activation('relu')(block)
        block = Convolution2D(n_channels, kernel_size = (3, 3), padding = 'same')(block)
        block = Dropout(dropout_value)(block)
        block = Activation('relu')(block)
        block = Convolution2D(n_channels, kernel_size = (3, 3), dilation_rate=(d_rate, d_rate), padding = 'same')(block)
        tower = add([block, tower])
        if d_rate == 1:
            d_rate = 2
        elif d_rate == 2:
            d_rate = 4
        else:
            d_rate = 1
    tower = BatchNormalization()(tower)
    tower = Activation('relu')(tower)
    tower = Convolution2D(1, 3, padding = 'same')(tower)
    tower = Activation('relu')(tower)
    model = Model(my_input, tower)
    return model

features = pickle.load(open(pkl, 'rb'))
l = len(features['seq'])
seq = features['seq']

OUTL = l + 20

X = get_feature(pkl, expected_n_channels)
assert len(X[0, 0, :]) == expected_n_channels
l = len(X[:, 0, 0])

XX = np.full((1, OUTL, OUTL, expected_n_channels), 0.0)
Xpadded = np.zeros((l + pad_size, l + pad_size, len(X[0, 0, :])))
Xpadded[int(pad_size/2) : l+int(pad_size/2), int(pad_size/2) : l+int(pad_size/2), :] = X
l = len(Xpadded[:, 0, 0])
XX[0, :l, :l, :] = Xpadded

print('')
print('Channel summaries:')
print(' Channel        Avg        Max        Sum')
for i in range(len(X[0, 0, :])):
    (m, s, a) = (X[:, :, i].flatten().max(), X[:, :, i].flatten().sum(), X[:, :, i].flatten().mean())
    print(' %7s %10.4f %10.4f %10.1f' % (i+1, a, m, s))

model = deepcon_rdd_distances(OUTL, 128, 64, expected_n_channels)

model.load_weights(wts)

P = model.predict(XX)
P[ P < 0.01 ] = 0.01

P = 100.0 / (P + epsilon)

# Remove padding, i.e. shift up and left by int(pad_size/2)
P[:, :OUTL-pad_size, :OUTL-pad_size, :] = P[:, int(pad_size/2) : OUTL-int(pad_size/2), int(pad_size/2) : OUTL-int(pad_size/2), :]

print('')
print('Save predictions..')
save_dist_to_contacts_rr(seq, P[0, :len(seq), :len(seq)], rr)

print('')
print('Save numpy files..')
for j in range(0, len(P[0]) ):
    for k in range(j, len(P[0]) ):
        P[0, j, k] = (P[0, k, j] + P[0, j, k]) / 2.0
np.save(rr + '.npy', P[0, :len(seq), :len(seq)])