Commit b2fd13a
1 parent f5b5ee2
Showing 89 changed files with 11,445 additions and 0 deletions.
@@ -0,0 +1,84 @@
#!/usr/bin/python

from optparse import OptionParser, OptionGroup
import sys
import tools.prep
import tools.svm_methods
from sklearn.externals import joblib


DEBUG = False


# print debug messages
def debug(s):
    if DEBUG:
        print("DEBUG: " + s)


# print error messages
def error(s):
    print("ERROR: " + s)


# read the input file and parse the lines
def parseAndReadInput(s):
    try:
        source = s
        datei = open(source, "r")
        lines = datei.read().splitlines()
        datei.close()
        new_lines = []
        for line in lines:
            # remove whitespace and tabs; keep only 9-mer peptides
            new_line = line.replace(' ', '')
            new_line = new_line.replace('\t', '')
            if len(new_line) == 9:
                new_lines.append(new_line)
        debug("Got input: %s" % new_lines)
        return new_lines
    except Exception as e:
        error("Couldn't read input file: %s" % str(e))
        return None


# main method with menu
if __name__ == '__main__':
    # build menu structure
    usage = "\nCopyright (C) 2016 App, Krumm, Spaeth - Use at your own risk!\n" + \
            "description: this is an SVM based MHC-I predictor\n\n" + \
            "Input file required."

    parser = OptionParser(usage)
    parser.add_option("-d", "--debug",
                      action="store_true", dest="debug", default=False)

    option_group = OptionGroup(parser, "Open input file")
    option_group.add_option("--input", dest="input",
                            help="path to the input file")
    option_group.add_option("--output", dest="output",
                            help="name of the generated output file")
    parser.add_option_group(option_group)

    (options, args) = parser.parse_args()

    DEBUG = options.debug

    # if input and output are given:
    # set the input path, read and parse the input, run the SVM
    if options.input and options.output:
        debug("Reading input file: " + options.input)
        input_list = parseAndReadInput(options.input)
        features = tools.prep.prepareData(input_list)
        try:
            debug("Loading existing svm...")
            svm = joblib.load('svm/svm.pkl')
        except Exception as e:
            error("Couldn't load svm file: %s\n"
                  "-> First execute tools/svm.py to generate an SVM." % str(e))
            sys.exit(1)
        # save the prediction to the output file
        predicted_labels = tools.svm_methods.svmPredict(svm, features)
        try:
            out_file = open("%s" % options.output, "w+")
            for x in range(0, len(input_list)):
                out_file.write("%s\t%s\n" % (input_list[x], predicted_labels[x]))
            out_file.close()
            debug("Generated output file %s" % options.output)
        except Exception as e:
            error("Couldn't create output file: %s" % str(e))
    else:
        error("No input and output given!")
@@ -0,0 +1,97 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

# Methods for preprocessing the training data and the input data
import tools.project_training_parser as project_training_parser
import tools.total_aaindex_parser as total_aaindex_parser
import numpy as np
from sklearn.externals import joblib
from sklearn import preprocessing
from sklearn.feature_selection import SelectPercentile, f_classif

# Load the training data (parsed from data/project_training.txt)
trainingData, ligands, labels = project_training_parser.parseProjectTraining()


# Returns the training data
# Out: training data
def getTrainingData():
    return trainingData


# Returns the ligands of the training data
# Out: ligands of the training set
def getLigands():
    return ligands


# Returns the labels of the training data
# Out: labels of the training set
def getLabels():
    return labels


# Returns the list of feature values for one amino acid
# Out: features for the amino acid at index x
def getFeaturesForAS(x):
    features_for_as = []
    for feature in total_aaindex_parser.getFeaturesforAAX():
        features_for_as.append(feature[x])
    return features_for_as


# Assigns its features to each amino acid
# Out: dictionary mapping each amino acid to its properties
def setAllFeatures():
    aas = ["A", "R", "N", "D", "C", "Q", "E", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"]
    feature_dic = {}
    for x in range(0, len(aas)):
        feature_dic.update({aas[x]: getFeaturesForAS(x)})
    return feature_dic


# Represents peptides as feature vectors
# In: ligands and features
# Out: ligands as feature vectors
def featureLigands(ligands, features):
    a = setAllFeatures()
    number_of_features = len(a["A"])
    ligands_featured = []
    for x in range(0, len(ligands)):
        peptide = []
        for y in range(0, number_of_features):
            for char in ligands[x]:
                peptide.append(features[char][y])
        ligands_featured.append(peptide)
    return ligands_featured


# Prepares the training data for the SVM
# Out: scaled and selected features and labels of the training data
def prepareTrainingData():
    # read the training data and cast to numpy arrays
    labels = np.array(getLabels())
    features = np.array(featureLigands(getLigands(), setAllFeatures()))
    # scale the training data with MinMaxScaler
    scaler = preprocessing.MinMaxScaler()
    scaled_features = scaler.fit_transform(features)
    # feature selection, keeping the top 10 percent
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(scaled_features, labels)
    # save the selector so it can be reused on the input later
    joblib.dump(selector, '../selector/selector.pkl')
    selected_features = selector.transform(scaled_features)
    return selected_features, labels


# Prepares the input data for the SVM
# In: ligands from the input
# Out: scaled and selected feature vectors of the input
def prepareData(ligands):
    # scaling with MinMaxScaler
    scaler = preprocessing.MinMaxScaler()
    scaled_features = scaler.fit_transform(np.array(featureLigands(ligands, setAllFeatures())))
    # feature selection with the selector saved during training
    selector = joblib.load('selector/selector.pkl')
    selected_features = selector.transform(scaled_features)
    return selected_features
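To make the vector layout produced by featureLigands concrete, the toy example below substitutes a two-feature dictionary for the real AAindex table (an assumption for illustration only). The encoding is feature-major: all residues' values for feature 0 come first, then all residues' values for feature 1, so a 9-mer encoded with n features yields a vector of length 9*n.

# Illustration only: a toy two-feature table standing in for the AAindex data
# that total_aaindex_parser provides in the real pipeline.
toy_features = {
    "A": [0.1, 1.0],
    "C": [0.2, 2.0],
    "D": [0.3, 3.0],
}

def feature_ligands(ligands, features):
    # same feature-major layout as featureLigands() above
    number_of_features = len(next(iter(features.values())))
    encoded = []
    for ligand in ligands:
        vector = []
        for y in range(number_of_features):
            for char in ligand:
                vector.append(features[char][y])
        encoded.append(vector)
    return encoded

print(feature_ligands(["ACD"], toy_features))
# [[0.1, 0.2, 0.3, 1.0, 2.0, 3.0]] -> feature 0 for A, C, D, then feature 1 for A, C, D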
@@ -0,0 +1,28 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import os


# Loads the training set into an array
# Out: training data, ligands and labels
def parseProjectTraining():
    training_data = []
    source = "data/project_training.txt"
    if not os.path.isfile(source):
        source = "../data/project_training.txt"
    input_file = open(source, "r")
    ligands = []
    labels_str = []
    # skip the header line
    next(input_file)
    for line in input_file:
        ligands.append(line.split(None, 1)[0])
        labels_str.append(line[-2])
    input_file.close()
    labels = []
    for i in range(len(labels_str)):
        labels.append(int(labels_str[i]))
    for x in range(0, len(ligands)):
        training_data.append([ligands[x], labels[x]])

    return training_data, ligands, labels
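The exact layout of data/project_training.txt is not part of this commit, so the sketch below is an assumption reconstructed purely from how parseProjectTraining() reads the file: a header line that is skipped, then one peptide per line whose first whitespace-delimited token is the ligand and whose second-to-last character (line[-2], the character before the newline) is the 0/1 label.

# Hypothetical training file, reconstructed from the parser's behaviour only;
# the real data/project_training.txt is not shown in the commit.
sample = (
    "peptide\tlabel\n"    # header line, skipped by next(input_file)
    "SIINFEKLV\t1\n"
    "AAAWYLWEV\t0\n"
)

with open("project_training_sample.txt", "w") as handle:
    handle.write(sample)

# for the first data line: line.split(None, 1)[0] -> "SIINFEKLV", line[-2] -> "1"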
@@ -0,0 +1,21 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

# Creates a new SVM and saves it to the SVM directory
# The SVM is trained on the complete data set

import prep
import svm_methods
from sklearn.externals import joblib

# prepare the data for the SVM
features, labels = prep.prepareTrainingData()
# grid search on the complete training set
svm = svm_methods.svmGridFit(features, labels)
# predict the labels for the training set
# predicted_labels = svm_methods.svmPredict(svm, features)
# show the results of the prediction
# svm_methods.showResults(predicted_labels, labels)
# svm_methods.plotROC(predicted_labels, labels)
# save the SVM
joblib.dump(svm, '../svm/svm.pkl')
@@ -0,0 +1,30 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

# Runs a stratified 10-fold cross-validation on the training data to assess its quality
import tools.svm_methods as svm_methods

meanAUC = 0
meanFPR = 0
meanTPR = 0
counter = 0
# run the cross-validation (stratified 10-fold)
for ligands_train, ligands_test, labels_train, labels_test in svm_methods.skfCV():
    # grid search on each of the 10 folds
    svm = svm_methods.svmGridFit(ligands_train, labels_train)
    # predict the labels for each of the 10 folds
    predicted_labels = svm_methods.svmPredict(svm, ligands_test)
    # show the AUC for each of the 10 folds
    print(svm_methods.getAUC(predicted_labels, labels_test))
    # accumulate the mean AUC
    meanAUC = meanAUC + svm_methods.getAUC(predicted_labels, labels_test)
    fpr, tpr = svm_methods.getFprTpr(predicted_labels, labels_test)
    meanFPR = meanFPR + fpr
    meanTPR = meanTPR + tpr
    counter = counter + 1
# show the mean AUC over the 10 runs
meanAUC = meanAUC / counter
meanFPR = meanFPR / counter
meanTPR = meanTPR / counter
print("MEAN AUC: %s" % (meanAUC))
svm_methods.plotMeanROC(meanFPR, meanTPR)
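A side note on the element-wise averaging of meanFPR and meanTPR above: it works here because roc_curve is applied to hard 0/1 predictions, which gives arrays of the same small length in every fold. If decision scores were used instead, each fold could return a different number of thresholds, and a common remedy is to interpolate every fold's curve onto a shared FPR grid before averaging. The sketch below shows that alternative; it is not part of the original code.

# Sketch of interpolation-based ROC averaging; an alternative to the
# element-wise mean above, not something this commit implements.
import numpy as np
from sklearn.metrics import roc_curve

def mean_roc(per_fold_scores, per_fold_labels, grid_size=101):
    mean_fpr = np.linspace(0.0, 1.0, grid_size)
    interpolated_tprs = []
    for scores, labels in zip(per_fold_scores, per_fold_labels):
        fpr, tpr, _ = roc_curve(labels, scores)
        # interpolate this fold's TPR onto the common FPR grid
        interpolated_tprs.append(np.interp(mean_fpr, fpr, tpr))
    mean_tpr = np.mean(interpolated_tprs, axis=0)
    mean_tpr[0] = 0.0  # force the averaged curve through the origin
    return mean_fpr, mean_tpr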
@@ -0,0 +1,121 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import tools.prep as prep
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt


# Methods for building and training an SVM

# Stratified 10-fold cross-validation of the data
# Out: yields one fold of the data set per iteration
def skfCV():
    # read the prepared training data
    features, labels = prep.prepareTrainingData()
    # initialise stratified k-fold with 10 folds
    skf = StratifiedKFold(n_splits=10)
    # split the data set into folds
    for train, test in skf.split(features, labels):
        features_train, features_test, labels_train, labels_test = features[train], features[test], labels[train], \
                                                                    labels[test]
        yield features_train, features_test, labels_train, labels_test


# SVM with grid search
# In: features and labels
# Out: SVM with the best parameters
def svmGridFit(features, labels):
    param_grid = [{'C': [1, 10, 20, 30, 40, 50, 100, 500, 750, 1000], 'gamma': [0.1, 0.01, 0.001, 0.002, 0.003, 0.005]}]
    svm = SVC(kernel='rbf', cache_size=1000, class_weight='balanced')
    clf = GridSearchCV(svm, param_grid=param_grid, cv=10, scoring='roc_auc')
    clf.fit(features, labels)
    # print("Best parameters: %s" % clf.best_params_)
    return clf


# Predicts the labels for the input
# In: input features
# Out: predicted labels
def svmPredict(clf, input):
    pred_labels = clf.predict(input)
    return pred_labels


# Counts the binders and non-binders
# In: labels
# Out: binders, non-binders
def showBinder(labels):
    binder = 0
    nonBinder = 0
    for p in labels:
        if p == 1:
            binder = binder + 1
        else:
            nonBinder = nonBinder + 1
    return binder, nonBinder


# Returns the FPR and TPR of a prediction
# In: predicted labels, test labels
# Out: FPR, TPR
def getFprTpr(pred_labels, test_labels):
    fpr, tpr, thresholds = roc_curve(test_labels, pred_labels)
    return fpr, tpr


# Plots a ROC curve from FPR and TPR
# In: false positive rate, true positive rate
# Out: ROC plot
def plotMeanROC(fpr, tpr):
    roc_auc = auc(fpr, tpr)
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([-0.1, 1.2])
    plt.ylim([-0.1, 1.2])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    print("ROC-Curve plotted")
    plt.show()


# Plots the ROC curve of a prediction
# In: predicted labels, test labels
# Out: ROC plot
def plotROC(pred_labels, test_labels):
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, pred_labels)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    plt.title('Receiver Operating Characteristic')
    plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([-0.1, 1.2])
    plt.ylim([-0.1, 1.2])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    print("ROC-Curve plotted")
    plt.show()


# Prints the results of a prediction
# In: predicted labels, test labels
# Out: shows the number of binders, non-binders and the AUC
def showResults(pred_labels, test_labels):
    binder, nonBinder = showBinder(pred_labels)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, pred_labels)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    print("SVM predicted %s binder and %s nonBinder. AUC = %s" % (binder, nonBinder, roc_auc))


# Computes the AUC of a prediction
# In: predicted labels, test labels
# Out: AUC of the prediction
def getAUC(pred_labels, test_labels):
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, pred_labels)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    return roc_auc
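As a quick smoke test of the API above, the sketch below trains and evaluates the grid-searched SVM on synthetic data. make_classification stands in for the real AAindex-derived peptide features and is an assumption, not part of the project pipeline; note that importing tools.svm_methods also imports tools.prep, which loads the training data at import time, so the repository's data files must be in place.

# Standalone smoke test with synthetic data; the real pipeline feeds in
# AAindex-derived peptide features instead of make_classification output.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import tools.svm_methods as svm_methods  # requires the repo's data files (see note above)

features, labels = make_classification(n_samples=200, n_features=20, random_state=0)
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.25, random_state=0)

clf = svm_methods.svmGridFit(features_train, labels_train)
predicted_labels = svm_methods.svmPredict(clf, features_test)
svm_methods.showResults(predicted_labels, labels_test)  # prints binders, non-binders and AUC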