Skip to content

Commit

Permalink
Upload mhc predictor
Browse files Browse the repository at this point in the history
  • Loading branch information
julianspaeth committed Dec 12, 2017
1 parent f5b5ee2 commit b2fd13a
Show file tree
Hide file tree
Showing 89 changed files with 11,445 additions and 0 deletions.
Binary file added docs/paper_spaeth_krumm_app.pdf
Binary file not shown.
Empty file added src/__init__.py
Empty file.
10,296 changes: 10,296 additions & 0 deletions src/data/aaindex1.txt

Large diffs are not rendered by default.

727 changes: 727 additions & 0 deletions src/data/project_training.txt

Large diffs are not rendered by default.

84 changes: 84 additions & 0 deletions src/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#!/usr/bin/python

from optparse import OptionParser, OptionGroup
import tools
from sklearn.externals import joblib


DEBUG = False

# print debug messages and errors
def debug(s):
    """Print ``s`` prefixed with ``DEBUG: `` when the module-level DEBUG flag is set.

    Does nothing while DEBUG is falsy.
    """
    if not DEBUG:
        return
    print("DEBUG: " + s)


def error(s):
    """Print ``s`` to standard output prefixed with ``ERROR: ``."""
    message = "ERROR: " + s
    print(message)


# read the input file and parse the lines
def parseAndReadInput(s):
    """Read peptide sequences from the text file at path ``s``.

    Spaces and tabs are removed from every line; only lines that are exactly
    nine characters long afterwards are kept.

    Returns the kept lines as a list in file order. If the file cannot be
    read, the problem is reported through error() and None is returned.
    """
    try:
        # 'with' guarantees the handle is closed, even if reading fails.
        with open(s, "r") as datei:
            lines = datei.read().splitlines()
    except Exception as e:
        error("Couldn't read input file: %s" % str(e))
        return None

    new_lines = []
    for line in lines:
        # remove whitespace and tab
        new_line = line.replace(' ', '').replace('\t', '')
        if len(new_line) == 9:
            new_lines.append(new_line)
    debug("Got input: %s" % new_lines)
    return new_lines


# main method with menu
if __name__ == '__main__':
    # A bare ``import tools`` does not load the package's submodules (the
    # package __init__ is empty), so tools.prep / tools.svm_methods would not
    # exist as attributes. Import them explicitly before they are used.
    import tools.prep
    import tools.svm_methods

    # build menu structure
    usage = "\nCopyright (C) 2016 App, Krumm, Spaeth - Use at your own risk!\n" + \
            "description: this is an SVM based MHC-I predictor \n\n" + \
            "Input file required."

    parser = OptionParser(usage)
    parser.add_option("-d", "--debug",
                      action="store_true", dest="debug", default=False)

    option_group = OptionGroup(parser, "Open input file")
    option_group.add_option("--input", dest="input",
                            help="path to the input file")
    option_group.add_option("--output", dest="output",
                            help="name of the generated output file")
    parser.add_option_group(option_group)

    (options, args) = parser.parse_args()

    DEBUG = options.debug

    # if input is given:
    # set input path, read and parse input, execute svm
    if options.input and options.output:
        debug("Reading input file: " + options.input)
        input_list = parseAndReadInput(options.input)
        if input_list is None:
            # parseAndReadInput already reported why the file was unreadable.
            raise SystemExit(1)
        if not input_list:
            error("No valid 9-mer peptides found in input file.")
            raise SystemExit(1)
        features = tools.prep.prepareData(input_list)
        try:
            debug("Loading existing svm...")
            svm = joblib.load('svm/svm.pkl')
        except Exception as e:
            error("Couldn't load svm file: %s \n"
                  "-> First execute tools/svm.py to generate a svm." % str(e))
            # Without a model there is nothing to predict with; stop here
            # instead of failing later on an undefined name.
            raise SystemExit(1)
        # save prediction in file
        predicted_labels = tools.svm_methods.svmPredict(svm, features)
        try:
            # 'with' closes (and flushes) the output file in every case.
            with open(options.output, "w+") as out_file:
                for peptide, label in zip(input_list, predicted_labels):
                    out_file.write("%s\t%s\n" % (peptide, label))
            debug("Generated output file %s" % options.output)
        except Exception as e:
            error("Couldn't create output file: %s" % str(e))
    else:
        error("No input and output given!")
Empty file added src/selector/__init__.py
Empty file.
Binary file added src/selector/selector.pkl
Binary file not shown.
Binary file added src/selector/selector.pkl_01.npy
Binary file not shown.
Binary file added src/selector/selector.pkl_02.npy
Binary file not shown.
Empty file added src/svm/__init__.py
Empty file.
Binary file added src/svm/svm.pkl
Binary file not shown.
Binary file added src/svm/svm.pkl_01.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_02.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_03.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_04.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_05.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_06.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_07.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_08.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_09.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_10.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_11.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_12.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_13.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_14.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_15.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_16.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_17.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_18.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_19.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_20.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_21.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_22.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_23.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_24.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_25.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_26.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_27.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_28.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_29.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_30.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_31.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_32.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_33.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_34.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_35.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_36.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_37.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_38.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_39.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_40.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_41.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_42.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_43.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_44.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_45.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_46.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_47.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_48.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_49.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_50.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_51.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_52.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_53.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_54.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_55.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_56.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_57.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_58.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_59.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_60.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_61.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_62.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_63.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_64.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_65.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_66.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_67.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_68.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_69.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_70.npy
Binary file not shown.
Binary file added src/svm/svm.pkl_71.npy
Binary file not shown.
Empty file added src/tools/__init__.py
Empty file.
97 changes: 97 additions & 0 deletions src/tools/prep.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

# Methods for preprocessing the training data and the input data
import tools.project_training_parser as project_training_parser
import tools.total_aaindex_parser as total_aaindex_parser
import numpy as np
from sklearn.externals import joblib
from sklearn import preprocessing
from sklearn.feature_selection import SelectPercentile, f_classif

# Load the training set once, at import time, via
# project_training_parser.parseProjectTraining() (which reads
# project_training.txt): the combined records, the ligands and the labels.
trainingData, ligands, labels = project_training_parser.parseProjectTraining()


# Accessor for the training records loaded when this module was imported.
# Out: training data
def getTrainingData():
    return trainingData


# Accessor for the ligands of the training data loaded at import time.
# Out: ligands of the training data set
def getLigands():
    return ligands


# Accessor for the labels of the training data loaded at import time.
# Out: labels of the training data set
def getLabels():
    return labels


# Collects, for one amino acid, the value at index x from every feature
# returned by total_aaindex_parser.getFeaturesforAAX().
# In: index x of the amino acid (its position in the list used by setAllFeatures)
# Out: list with one value per feature
def getFeaturesForAS(x):
    return [feature[x] for feature in total_aaindex_parser.getFeaturesforAAX()]


# Assigns every amino acid its features.
# Out: dictionary mapping each one-letter amino acid code to its feature list
def setAllFeatures():
    # The position of a letter in this string is the index handed to
    # getFeaturesForAS().
    amino_acids = "ARNDCQEGHILKMFPSTWYV"
    return {letter: getFeaturesForAS(index)
            for index, letter in enumerate(amino_acids)}


# Represents peptides as feature vectors.
# In: ligands (iterable of peptide strings) and features (dictionary mapping
#     each amino acid letter to its list of feature values)
# Out: one flat feature vector per ligand, ordered feature by feature: for each
#      feature index, the values of all residues of the peptide in sequence order
def featureLigands(ligands, features):
    # Take the feature count from the table that was passed in instead of
    # rebuilding the whole table with setAllFeatures() just to read a length.
    # All entries are assumed to have the same length (true for the table
    # produced by setAllFeatures()).
    number_of_features = len(next(iter(features.values()), ()))
    ligands_featured = []
    for ligand in ligands:
        ligands_featured.append(
            [features[char][y]
             for y in range(number_of_features)
             for char in ligand])
    return ligands_featured


# Prepares the training data for the SVM.
# Out: scaled and selected features, and the labels, of the training data
def prepareTrainingData():
    # Read the training data and cast it to numpy arrays
    y = np.array(getLabels())
    raw_features = np.array(featureLigands(getLigands(), setAllFeatures()))
    # Scale the training data with a MinMaxScaler
    scaled = preprocessing.MinMaxScaler().fit_transform(raw_features)
    # Feature selection, keeping the 10th percentile
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(scaled, y)
    # Persist the selector so the same selection can be applied to input data
    joblib.dump(selector, '../selector/selector.pkl')
    return selector.transform(scaled), y


# Prepares input data for the SVM.
# In: ligands from the input
# Out: scaled and selected feature vectors of the input
def prepareData(ligands):
    feature_table = setAllFeatures()
    # Fit the MinMaxScaler on the TRAINING features and only transform the
    # input with it. Fitting on the input itself would scale each batch by its
    # own min/max (a single peptide would collapse to all zeros), which does
    # not match the scaling used in prepareTrainingData().
    scaler = preprocessing.MinMaxScaler()
    scaler.fit(np.array(featureLigands(getLigands(), feature_table)))
    scaled_features = scaler.transform(np.array(featureLigands(ligands, feature_table)))
    # Feature selection with the selector stored by prepareTrainingData()
    selector = joblib.load('selector/selector.pkl')
    return selector.transform(scaled_features)
28 changes: 28 additions & 0 deletions src/tools/project_training_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import os


# Loads the training data set into lists.
# Out: training data ([ligand, label] pairs), ligands and labels
def parseProjectTraining():
    source = "data/project_training.txt"
    if not os.path.isfile(source):
        # Fall back to the path that is valid when run from a subdirectory.
        source = "../data/project_training.txt"
    ligands = []
    labels = []
    # 'with' closes the file even if a line cannot be parsed.
    with open(source, "r") as input_file:
        # Skip the header line. The builtin next() works on Python 2 and 3,
        # unlike the Python-2-only file.next() method.
        next(input_file, None)
        for line in input_file:
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            # The ligand is the first whitespace-separated token.
            ligands.append(stripped.split(None, 1)[0])
            # The label is the last character of the line content. Stripping
            # first makes this independent of the line terminator ("\n",
            # "\r\n", or none on the final line).
            labels.append(int(stripped[-1]))
    training_data = [[ligand, label] for ligand, label in zip(ligands, labels)]

    return training_data, ligands, labels
21 changes: 21 additions & 0 deletions src/tools/svm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

# Creates a new SVM and stores it in the svm directory
# The SVM is trained on the complete data set

import prep
import svm_methods
from sklearn.externals import joblib

# Prepare the data for the SVM
features, labels = prep.prepareTrainingData()
# Grid search on the complete training data set
svm = svm_methods.svmGridFit(features, labels)
# Predict the labels of the training data set
# predicted_labels = svm_methods.svmPredict(svm, features)
# Show the results of the prediction
# svm_methods.showResults(predicted_labels, labels)
# svm_methods.plotROC(predicted_labels, labels)
# Save the SVM
joblib.dump(svm, '../svm/svm.pkl')
30 changes: 30 additions & 0 deletions src/tools/svm_cv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

# Runs a stratified 10-fold cross-validation on the training data to assess the quality
import tools.svm_methods as svm_methods

meanAUC = 0
meanFPR = 0
meanTPR = 0
counter = 0
# Run the cross-validation (stratified 10-fold)
for ligands_train, ligands_test, labels_train, labels_test in svm_methods.skfCV():
    # Grid search on the training part of the current fold
    svm = svm_methods.svmGridFit(ligands_train, labels_train)
    # Predict the labels of the held-out part of the current fold
    predicted_labels = svm_methods.svmPredict(svm, ligands_test)
    # Compute the fold's AUC once; it is both shown and accumulated
    fold_auc = svm_methods.getAUC(predicted_labels, labels_test)
    print(fold_auc)
    meanAUC = meanAUC + fold_auc
    fpr, tpr = svm_methods.getFprTpr(predicted_labels, labels_test)
    # NOTE(review): the element-wise sums assume every fold's ROC arrays have
    # the same length - confirm this holds for all folds.
    meanFPR = meanFPR + fpr
    meanTPR = meanTPR + tpr
    counter = counter + 1
# Show the mean AUC of the folds and plot the averaged ROC curve
meanAUC = meanAUC / counter
meanFPR = meanFPR / counter
meanTPR = meanTPR / counter
print("MEAN AUC: %s" % (meanAUC))
svm_methods.plotMeanROC(meanFPR, meanTPR)
121 changes: 121 additions & 0 deletions src/tools/svm_methods.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import tools.prep as prep
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt


# Methods for creating and training an SVM

# Stratified 10-fold cross-validation of the training data.
# Out: generator yielding, per fold, the tuple
#      (features_train, features_test, labels_train, labels_test)
def skfCV():
    # Read the prepared training data
    features, labels = prep.prepareTrainingData()
    # sklearn.model_selection.StratifiedKFold is configured with n_splits and
    # produces the index pairs through split(X, y); the older
    # StratifiedKFold(labels, n_folds=...) call form belongs to the removed
    # sklearn.cross_validation module and raises a TypeError here.
    skf = StratifiedKFold(n_splits=10)
    # Split the data set into folds
    for train, test in skf.split(features, labels):
        yield features[train], features[test], labels[train], labels[test]


# SVM with grid search.
# In: features and labels
# Out: fitted GridSearchCV object wrapping an RBF-kernel SVC
def svmGridFit(features, labels):
    search_space = {
        'C': [1, 10, 20, 30, 40, 50, 100, 500, 750, 1000],
        'gamma': [0.1, 0.01, 0.001, 0.002, 0.003, 0.005],
    }
    estimator = SVC(kernel='rbf', cache_size=1000, class_weight='balanced')
    grid_search = GridSearchCV(estimator, param_grid=[search_space], cv=10,
                               scoring='roc_auc')
    grid_search.fit(features, labels)
    return grid_search


# Predicts the labels for the input.
# In: classifier exposing predict(), and the input features
# Out: predicted labels, exactly as returned by clf.predict()
def svmPredict(clf, input):
    return clf.predict(input)


# Counts binders and non-binders.
# In: labels
# Out: (number of labels equal to 1, number of all other labels)
def showBinder(labels):
    # Slot 0 counts binders (label == 1), slot 1 counts everything else.
    tally = [0, 0]
    for value in labels:
        tally[0 if value == 1 else 1] += 1
    return tally[0], tally[1]


# Returns the FPR and TPR of a prediction, as computed by roc_curve().
# In: predicted labels, test labels
# Out: FPR, TPR
def getFprTpr(pred_labels, test_labels):
    false_positive, true_positive, _thresholds = roc_curve(test_labels, pred_labels)
    return false_positive, true_positive


# Plots a ROC curve from FPR and TPR values and shows the figure.
# In: false positive rate, true positive rate
# Out: ROC plot (displayed with plt.show())
def plotMeanROC(fpr, tpr):
    area = auc(fpr, tpr)
    curve_label = 'AUC = %0.2f' % area
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label=curve_label)
    plt.legend(loc='lower right')
    # Dashed red diagonal from (0, 0) to (1, 1)
    plt.plot([0, 1], [0, 1], 'r--')
    for set_limits in (plt.xlim, plt.ylim):
        set_limits([-0.1, 1.2])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    print("ROC-Curve plotted")
    plt.show()


# Plots the ROC curve of a prediction.
# In: predicted labels, test labels
# Out: ROC plot (displayed by plotMeanROC)
def plotROC(pred_labels, test_labels):
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, pred_labels)
    # The plotting steps are the same as in plotMeanROC(), so delegate to it
    # instead of keeping a second copy of the drawing code.
    plotMeanROC(false_positive_rate, true_positive_rate)


# Prints the results of a prediction.
# In: predicted labels, test labels
# Out: nothing; prints the number of binders, non-binders and the AUC
def showResults(pred_labels, test_labels):
    counts = showBinder(pred_labels)
    fpr, tpr, _thresholds = roc_curve(test_labels, pred_labels)
    area = auc(fpr, tpr)
    print("SVM predicted %s binder and %s nonBinder. AUC = %s" % (counts[0], counts[1], area))


# Computes the AUC of a prediction from its ROC curve.
# In: predicted labels, test labels
# Out: AUC of the prediction
def getAUC(pred_labels, test_labels):
    fpr, tpr, _thresholds = roc_curve(test_labels, pred_labels)
    return auc(fpr, tpr)
Loading

0 comments on commit b2fd13a

Please sign in to comment.