-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcP_code1AII.py
97 lines (75 loc) · 3.2 KB
/
cP_code1AII.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
# Location of the expression table. It has a 'Probe_Set_ID' column that
# identifies each row, plus one column per sample.
DATA_PATH = './Dengue_Fever_Prognosis_Dataset.csv'

# Probe set whose expression level is used as the single SVM feature.
GENE_ID = "212185_x_at"

# Split settings, applied independently to each class so that both classes
# are represented in the training and the test set.
TEST_SIZE = 0.2
RANDOM_STATE = 42


def select_class_columns(data):
    """Return the sample columns of *data* for each class.

    A column belongs to the DHF class when its name contains "DHF" and to
    the DF class when its name contains "DF". Columns matching neither
    pattern (e.g. the ND samples) are ignored.

    Returns:
        A ``(dhf_columns, df_columns)`` tuple of lists of column names, each
        in the order the columns appear in *data*.
    """
    dhf_columns = [col for col in data.columns if "DHF" in col]
    df_columns = [col for col in data.columns if "DF" in col]
    return dhf_columns, df_columns


def make_labels(n_dhf, n_df):
    """Return class labels for *n_dhf* DHF samples followed by *n_df* DF samples."""
    return ['DHF'] * n_dhf + ['DF'] * n_df


def gene_feature_matrix(data, gene, sample_ids):
    """Return the expression of *gene* for *sample_ids* as an (n, 1) float array.

    Args:
        data: DataFrame indexed by probe set ID with one column per sample.
        gene: Probe set ID (row label) to extract.
        sample_ids: Column names to extract, in the desired row order.

    Raises:
        KeyError: If *gene* is not present in the index of *data*.
        ValueError: If *gene* matches more than one row.
    """
    if gene not in data.index:
        raise KeyError(f"Probe set {gene!r} not found in the dataset index")
    values = data.loc[gene, list(sample_ids)]
    if isinstance(values, pd.DataFrame):
        # A duplicated probe ID would yield several rows and silently
        # desynchronise the features from the labels.
        raise ValueError(f"Probe set {gene!r} matches more than one row")
    # scikit-learn expects a 2-D (n_samples, n_features) array.
    return values.to_numpy(dtype=float).reshape(-1, 1)


def plot_decision_function(svm, train_features, train_labels, gene):
    """Plot the training samples and the SVM decision function for *gene*.

    Training samples are drawn at y = 1 (DHF) or y = 0 (DF). The decision
    function is evaluated on 500 evenly spaced expression values spanning
    the training range widened by 1 on each side.
    """
    gene_train_values = np.asarray(train_features).ravel()
    train_labels_numeric = [1 if label == 'DHF' else 0 for label in train_labels]

    x_min, x_max = gene_train_values.min() - 1, gene_train_values.max() + 1
    x_range = np.linspace(x_min, x_max, 500).reshape(-1, 1)
    decision_values = svm.decision_function(x_range)

    plt.figure(figsize=(10, 6))
    plt.scatter(gene_train_values, train_labels_numeric, c=train_labels_numeric,
                cmap='coolwarm', label='Training Data')
    # The curve is the decision function itself; the class boundary is the
    # expression level at which it crosses the dashed zero line.
    plt.plot(x_range.ravel(), decision_values, color='black', linestyle='-',
             label='Decision Function')
    plt.axhline(0, color='gray', linestyle='--', label='Decision Function = 0')
    plt.xlabel('Gene Expression Levels')
    plt.ylabel('Class (0 = DF, 1 = DHF)')
    plt.title(f'SVM Decision Boundary for Gene {gene}')
    plt.legend()
    plt.show()


def main():
    """Train and evaluate a single-gene linear SVM separating DHF from DF."""
    data = pd.read_csv(DATA_PATH)
    dhf_columns, df_columns = select_class_columns(data)

    # Split the sample names of each class separately. The shuffle depends
    # only on the number of samples and the random state.
    dhf_train, dhf_test = train_test_split(
        dhf_columns, test_size=TEST_SIZE, random_state=RANDOM_STATE)
    df_train, df_test = train_test_split(
        df_columns, test_size=TEST_SIZE, random_state=RANDOM_STATE)

    train_labels = make_labels(len(dhf_train), len(df_train))
    test_labels = make_labels(len(dhf_test), len(df_test))

    # Index rows by probe set ID so the gene can be looked up by label.
    data = data.set_index('Probe_Set_ID')

    # Feature rows follow the same DHF-then-DF order as the labels.
    gene_train_data = gene_feature_matrix(
        data, GENE_ID, list(dhf_train) + list(df_train))
    gene_test_data = gene_feature_matrix(
        data, GENE_ID, list(dhf_test) + list(df_test))

    svm = SVC(kernel='linear', random_state=RANDOM_STATE)
    svm.fit(gene_train_data, train_labels)
    test_predictions = svm.predict(gene_test_data)

    test_results = pd.DataFrame({
        'Actual': test_labels,
        'Predicted': test_predictions,
    })
    print(f"Classification Results for Gene {GENE_ID}")
    print(test_results)
    print(f"Test accuracy: {accuracy_score(test_labels, test_predictions):.3f}")

    plot_decision_function(svm, gene_train_data, train_labels, GENE_ID)


if __name__ == "__main__":
    main()