-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathloadandpreprocessdata.py
62 lines (49 loc) · 2.34 KB
/
loadandpreprocessdata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import tensorflow as tf
from joblib import dump
# Location of the raw CSV dataset; update this path to point at your own copy.
csv_file_path = '/Users/apple/Library/Mobile Documents/com~apple~CloudDocs/cybersecurity_data_50_rows.csv'

# Read the whole file into a DataFrame.
data = pd.read_csv(filepath_or_buffer=csv_file_path)

# Print the leading rows (head() defaults to five) as a quick look at the
# column layout before any preprocessing happens.
print(data.head(n=5))
# Columns used as model inputs, and the column holding the target class.
features = [
    'Sensor_Data', 'Vehicle_Speed', 'Location', 'Sensor_Type', 'Sensor_Status',
    'Attack_Severity', 'Attack_Duration', 'Attack_Frequency', 'Vehicle_Model',
    'Firmware_Version', 'Network_Traffic', 'Error_Code', 'Geofencing_Status',
]
label = 'Adversarial_Attack'

# Only the columns that are actually consumed below need to be complete.
# Restricting the check and the drop to them avoids discarding otherwise
# usable rows because of a gap in some unrelated column of the CSV.
required_columns = features + [label]
if data[required_columns].isnull().values.any():
    print("Data contains missing values. Handling missing values...")
    data = data.dropna(subset=required_columns)  # Example handling, adjust as needed

# Separate features and label. The explicit copy makes X an independent
# frame, so later column edits cannot be mistaken for writes into `data`.
X = data[features].copy()
y = data[label]
# Feature columns that hold categories rather than measurements.
categorical_features = ['Location', 'Sensor_Type', 'Sensor_Status', 'Vehicle_Model', 'Firmware_Version', 'Geofencing_Status']

# Cast every categorical column to string in one call. astype() returns a new
# frame that is rebound to X, instead of assigning column-by-column into an
# object that may be a selection of `data` (which can raise
# SettingWithCopyWarning).
X = X.astype(dict.fromkeys(categorical_features, str))

# One-hot encode the categorical columns; the remaining columns pass through
# unchanged.
X = pd.get_dummies(X, columns=categorical_features)

# Map the label values to integer class ids. `label_encoder` is kept so the
# ids can be translated back with inverse_transform().
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
# Split data into training and testing sets BEFORE scaling. Fitting the scaler
# on the full dataset would let the test rows influence the mean/variance used
# to transform the training rows (data leakage) and would bake those
# statistics into the saved scaler as well.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the scaling parameters from the training rows only, then apply the
# same transformation to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Save the fitted scaler so the identical transformation can be reapplied later.
dump(scaler, 'scaler.joblib')
# Report the dimensions of each split, one per line, as a sanity check on the
# train/test partition.
print(
    f"X_train shape: {X_train.shape}",
    f"X_test shape: {X_test.shape}",
    f"y_train shape: {y_train.shape}",
    f"y_test shape: {y_test.shape}",
    sep="\n",
)

# Optional: uncomment to write the preprocessed splits to CSV files.
# pd.DataFrame(X_train).to_csv('X_train.csv', index=False)
# pd.DataFrame(X_test).to_csv('X_test.csv', index=False)
# pd.DataFrame(y_train).to_csv('y_train.csv', index=False)
# pd.DataFrame(y_test).to_csv('y_test.csv', index=False)