diff --git a/.ci/run_sample_configurations.bash b/.ci/run_sample_configurations.bash new file mode 100755 index 0000000..0a18724 --- /dev/null +++ b/.ci/run_sample_configurations.bash @@ -0,0 +1,41 @@ +#!/usr/bin/env bash +set -o errexit +set -o nounset +set -o pipefail + + +# Set path variables. +repo_root="$(realpath $(dirname $BASH_SOURCE[0])/..)" +conf_path="$repo_root/configurations/examples" + +# Move to repository root to make sure all subsequent commands +# run from the right directory. +cd "$repo_root" + +# Run all training examples sequentially. +counter=1 +for p in $(ls $conf_path/train); do + echo "-----------------------------------------------" + echo "$counter: running example configuration from $p" + echo "-----------------------------------------------" + ((++counter)) + + python3 code/IoT-AD.py -c "$conf_path/train/$p" || \ + { echo "Error while running $conf_path/train/$p"; exit 1; } +done + +# Run all test examples sequentially. +counter=1 +for p in $(ls $conf_path/test); do + echo "-----------------------------------------------" + echo "$counter: running example configuration from $p" + echo "-----------------------------------------------" + ((++counter)) + + python3 code/IoT-AD.py -c "$conf_path/test/$p" || \ + { echo "Error while running $conf_path/test/$p"; exit 1; } +done + +echo "---------" +echo "Finished!" +echo "---------" diff --git a/.ci/run_sample_configurations_multithreaded.bash b/.ci/run_sample_configurations_multithreaded.bash new file mode 100755 index 0000000..de7df61 --- /dev/null +++ b/.ci/run_sample_configurations_multithreaded.bash @@ -0,0 +1,43 @@ +#!/usr/bin/env bash +set -o errexit +set -o nounset +set -o pipefail + + +# Set path variables. +repo_root="$(realpath $(dirname $BASH_SOURCE[0])/..)" +conf_path="$repo_root/configurations/examples" + +# Move to repository root to make sure all subsequent commands +# run from the right directory. +cd "$repo_root" + +# Run all training examples sequentially. 
+counter=1 +for p in $(ls $conf_path/train); do + echo "-----------------------------------------------" + echo "$counter: running example configuration from $p" + echo "-----------------------------------------------" + ((++counter)) + + python3 code/IoT-AD.py -c "$conf_path/train/$p" & +done + +wait + +# Run all test examples in parallel. +counter=1 +for p in $(ls $conf_path/test); do + echo "-----------------------------------------------" + echo "$counter: running example configuration from $p" + echo "-----------------------------------------------" + ((++counter)) + + python3 code/IoT-AD.py -c "$conf_path/test/$p" & +done + +wait + +echo "---------" +echo "Finished!" +echo "---------" diff --git a/.github/workflows/run-samples.yaml b/.github/workflows/run-samples.yaml new file mode 100644 index 0000000..f5fd4b3 --- /dev/null +++ b/.github/workflows/run-samples.yaml @@ -0,0 +1,41 @@ +name: Set up SIURU and run sample configurations + +on: [push] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.8"] + + steps: + - uses: actions/checkout@v3 + - name: Initialize submodules + run: git submodule update --init --recursive + - name: Install Ubuntu dependencies + run: sudo apt install libpcap-dev + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r code/requirements.txt + - name: Build and install PcapPlusPlus + run: | + cd code/cpp-extract-features/PcapPlusPlus + cmake -S . -B cmake-build + cmake --build cmake-build --config Release + sudo cmake --install cmake-build + - name: Build feature extractor + run: | + cd code/cpp-extract-features + mkdir cmake-build && cd cmake-build + cmake ../.. + cmake --build . 
--config Release + sudo setcap cap_net_raw+ep $(pwd)/pcap-feature-extraction + - name: Build and test sample configurations + run: ./.ci/run_sample_configurations.bash diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f31bd90 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +/code/cpp-extract-features/cmake-build/ +/code/cpp-extract-features/PcapPlusPlus/cmake-build/ +/data/ +!/data/README.md +!/data/MQTTset-reduced/ +/influxdb/ +/logs/ +/models/ diff --git a/.gitmodules b/.gitmodules index 780889a..45eb4af 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "code/cpp-extract-features/PcapPlusPlus"] path = code/cpp-extract-features/PcapPlusPlus - url = git@github.com:seladb/PcapPlusPlus.git + url = https://github.com/seladb/PcapPlusPlus.git +[submodule "data/MQTTset-reduced"] + path = data/MQTTset-reduced + url = https://github.com/l-laura/MQTTset-reduced diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..002bc79 --- /dev/null +++ b/LICENSE @@ -0,0 +1,8 @@ +Copyright © 2023 Laura Lahesoo +Copyright © 2023 Fukuda-lab, National Institute of Informatics, Japan + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/README.md b/README.md index c00f295..0ef3d7d 100644 --- a/README.md +++ b/README.md @@ -1,31 +1,24 @@ # Scalable IoT Usage Research Utility (SIURU) -A framework for IoT anomaly detection, created at the National Institute of Informatics -during the research internship program. +A framework for IoT anomaly detection, created at the National Institute of Informatics during the research internship program. ## System architecture -Below is a diagram of the core components in the IoT-AD pipeline, with arrows marking -the flow of data between components: +Below is a diagram of the core components in the IoT-AD pipeline, with arrows marking the flow of data between components: ![System architecture diagram](graphics/iot-ad-system-overview.svg) -Elements in gray are not yet implemented. For details on the components, see the -_Repository Structure_ section below. +Elements in gray are not yet implemented. For details on the components, see the _Repository Structure_ section below. ## Reporting (optional) -These components are used in the demo configurations to store model predictions. -It is possible to run the demos without InfluxDB + Grafana reporting by removing -the ``InfluxDBReporter`` entry from the output section of configuration files. +These components are used in the demo configurations to store model predictions. It is possible to run the demos without InfluxDB + Grafana reporting by removing the ``InfluxDBReporter`` entry from the output section of configuration files. ### InfluxDB -Install [InfluxDB](https://docs.influxdata.com/influxdb/v2.6/install/) for example as -a Docker container and follow the setup guide to create an organization and bucket. 
+Install [InfluxDB](https://docs.influxdata.com/influxdb/v2.6/install/) for example as a Docker container and follow the setup guide to create an organization and bucket. -Also create a directory where InfluxDB should store the data. In the commands below, -it is referred to as `/influxdb`. +Also create a directory where InfluxDB should store the data. In the commands below, it is referred to as `/influxdb`. Start the image with: ``` @@ -34,12 +27,9 @@ docker run -p 8086:8086 \ influxdb:2.6.1 --reporting-disabled ``` -From the interface that starts under http://localhost:8086 by default, generate a -token with read-write permissions to use in Grafana and when running `IoT-AD.py` with -reporting enabled. +From the interface that starts under http://localhost:8086 by default, generate a token (Load Data > API Tokens) with read-write permissions to use in Grafana and when running `IoT-AD.py` with reporting enabled. -If you later wish to clear the sample data stored in InfluxDB, use the following recipe, -replacing the token placeholder with your generated one: +If you later wish to clear the sample data stored in InfluxDB, use the following recipe, replacing the token placeholder with your generated one: ```bash # Find the name of the container. @@ -56,35 +46,19 @@ exit ### Grafana -Install -[Grafana](https://grafana.com/docs/grafana/latest/setup-grafana/installation/debian/) -for example by following the guide for Ubuntu and Debian. Start the service: +Install [Grafana](https://grafana.com/docs/grafana/latest/setup-grafana/installation/debian/) for example by following the guide for Ubuntu and Debian. Start the service: ```bash sudo service grafana-server start ``` -and follow the guide to set up an -[InfluxDB data source](https://grafana.com/docs/grafana/latest/datasources/influxdb/). -If all goes well, Grafana should be able to connect to the InfluxDB instance you are -running. 
- -## Data preparation - -The examples below assumes that we have stored data for anomaly detection under -`/data/MQTTset/Data/PCAP/slowite.pcap`, -`/data/MQTTset/Data/PCAP/malariaDoS.pcap` and -`/data/MQTTset/Data/PCAP/capture_custom_1h.pcap`. - -The last one is a custom segment from the full MQTTSet normal traffic file, -ask me for a reference. - +and follow the guide to set up an [InfluxDB data source](https://grafana.com/docs/grafana/latest/datasources/influxdb/). If all goes well, Grafana should be able to connect to the InfluxDB instance you are running. ## Quickstart with Docker ### Setting up the repository -After cloning the repository, make sure to initialize the submodules (PcapPlusPlus): +After cloning the repository, make sure to initialize the submodules (PcapPlusPlus and MQTTset-reduced): ```bash git submodule update --init --recursive @@ -92,11 +66,9 @@ git submodule update --init --recursive ### Build the container -This automatic build is a quick alternative to manually setting up a Python environment -as described below. First, make sure Docker is installed and the daemon is running. +This automatic build is a quick alternative to manually setting up a Python environment as described below. First, make sure Docker is installed and the daemon is running. -Then you can build the container locally. -In this example, the container will be tagged as ``siuru:latest``: +Then you can build the container locally. In this example, the container will be tagged as ``siuru:latest``: ```bash cd code @@ -105,10 +77,7 @@ docker build . -t siuru:latest ### Start the container -Replace the project root with a path to your actual project. The command maps your -local configuration, data, and model paths into the container, allowing the trained -model to persist over multiple runs. The ``--network-host`` flag is needed to store -prediction reports in InfluxDB (enabled in the sample configuration files). 
+Replace the project root placeholders with the absolute path to your local SIURU repository. The command maps your local configuration, data, and model paths into the container, allowing the trained model to persist over multiple runs. The ``--network-host`` flag is needed to store prediction reports in InfluxDB (enabled in the sample configuration files). ```bash docker run -it \ @@ -125,35 +94,34 @@ siuru:latest \ In the interactive Docker session, run: ```bash -python3 code/IoT-AD.py -c /configurations/examples/flow-based-rf-train.json.jinja +python3 code/IoT-AD.py \ +-c /configurations/tutorial/\ +window-multi-rf-influxdb-train.json.jinja2 ``` -The model will be stored under ``/models/example-flow-based-rf``. +The model will be stored under ``/models/window-multi-rf-influxdb``. ### Test the model -In an interactive session, run the following command, replacing the placeholder -with a token generated in InfluxDB as described previously: +In an interactive session, run the following command, replacing the placeholder with a token generated in InfluxDB as described previously: ```bash python3 code/IoT-AD.py \ --c /configurations/examples/flow-based-rf-test.json.jinja \ +-c /configurations/tutorial/window-multi-rf-influxdb-test.json.jinja2 \ --influx-token ``` -You should see prediction data being stored in InfluxDB tagged as: -``_measurement="example-flow-based-rf"`` +You should see prediction data being stored in InfluxDB tagged as: ``_measurement="window-multi-rf"`` ## Setup without Docker -The pipeline is written in Python and C++, therefore Python requirements must be -installed and the C++ feature extractor component built before running the pipeline. +The pipeline is written in Python and C++, therefore Python requirements must be installed and the C++ feature extractor component built before running the pipeline. The commands below are meant to be run on Ubuntu 20.04. 
### Setting up the repository -After cloning the repository, make sure to initialize the submodules (PcapPlusPlus): +After cloning the repository, make sure to initialize the submodules (PcapPlusPlus and MQTTset-reduced): ```bash git submodule update --init --recursive @@ -161,18 +129,15 @@ git submodule update --init --recursive ### Python environment -I recommend to set up a Python virtual environment, e.g. pyenv. The Python -libraries needed by this project can then be installed by running from root: +It is recommended to set up a Python virtual environment, e.g. pyenv. The Python libraries needed by this project can then be installed by running from project root: ```bash -cd code -pip install -r requirements.txt +pip install -r code/requirements.txt ``` ### Feature extractor -Since it is in C++, the code needs to be compiled using CMake, which can be installed -via Snap or package manager. +Since it is in C++, the code needs to be compiled using CMake, which can be installed via Snap or package manager. In addition, the following dependencies are needed: ```bash @@ -183,9 +148,9 @@ Build and install PcapPlusPlus, then build the feature extractor as follows: ```bash cd code/cpp-extract-features/PcapPlusPlus -cmake -S . -B build -cmake --build build -sudo cmake --install build +cmake -S . -B cmake-build +cmake --build cmake-build --config Release +sudo cmake --install cmake-build cd .. mkdir cmake-build && cd cmake-build @@ -194,56 +159,54 @@ cmake --build . --config Release sudo setcap cap_net_raw+ep $(pwd)/pcap-feature-extraction ``` -The last command is needed to give the executable permissions to listen on the network -interfaces. The path to the C++ executable is a command line argument to the main -anomaly detection program ``IoT-AD.py``. Whenever the executable is recompiled, the +The last command is needed to give the executable permissions to listen on the network interfaces. 
The path to the C++ executable is a command line argument to the main anomaly detection program ``IoT-AD.py``. Whenever the executable is recompiled, the permissions must also be assigned again. ### Training a model -Refer to the command line hints of ``code/IoT-AD.py`` for information on the available -parameters, and the files under ``configurations/examples`` for sample configuration -files. +Refer to the command line hints of ``code/IoT-AD.py`` for information on the available parameters, and the files under `configurations/tutorial` for the sample configuration files this demo will be using. -The example below assumes that we have stored the following: -1. data for anomaly detection as described above, -2. built the C++ feature extractor using CMake under `/models/example-flow-based-rf/flow-based-rf-train.pickle`. +As a result of successful training, we will have a random forest classifier stored under `/models/window-multi-rf-influxdb/window-multi-rf-influxdb.pickle`. ```bash -cd code -python IoT-AD.py -c ../configurations/examples/flow-based-rf-train.json.jinja +python code/IoT-AD.py \ +-c configurations/tutorial/window-multi-rf-influxdb-train.json.jinja2 ``` ### Running anomaly detection -Refer to the commandline hint of ``code/IoT-AD.py`` for information on the available -parameters. +Refer to the commandline hint of ``code/IoT-AD.py`` for information on the available parameters. The sample command below assumes that we have the following: -1. data for anomaly detection as described above, -2. built the C++ feature extractor using CMake under `/models/example-flow-based-rf/flow-based-rf-train.pickle` (see previous section), -4. configured InfluxDB as seen below, including the generated token. +1. built the C++ feature extractor using CMake under `/models/window-multi-rf-influxdb/window-multi-rf-influxdb.pickle` (see previous section), +3. configured InfluxDB as seen below, including the generated token. 
```bash -cd code -python IoT-AD.py \ --c /configurations/examples/flow-based-rf-test.json.jinja \ +python code/IoT-AD.py \ +-c configurations/tutorial/window-multi-rf-influxdb-test.json.jinja2 \ --influx-token ``` +## Example pipelines + +Under `configurations/examples`, you can currently find 12 configurations demonstrating the possible combinations of components implemented in SIURU. + +The examples are automatically run as tests in the SIURU GitHub repository, but it is possible to run them locally after you have finished the setup (e.g. built the C++ feature extractor, installed Python dependencies): + +```bash +./.ci/run_sample_configurations.bash +``` + +Extending the test cases is easy: add your training and testing configurations into the same directories and they will be found by the CI script. + ## Working with data -Use the bash script under ``code/split_dataset.bash`` to split a PCAP file into training, -validation, and test sets. The script works based on flows, so packets from the same -connection end up in the same file after the split. +Use the bash script under ``code/split_dataset.bash`` to split a PCAP file into training, validation, and test sets. The script works based on flows, so packets from the same connection end up in the same file after the split. -The script makes use of ``PcapSplitter`` and ``mergecap``, which are both installed in -the Docker image mentioned above. If you run the Docker image with a mapping to the data -directory, the script should work out-of-the-box (replace with your -local path to the project): +The script makes use of ``PcapSplitter`` and ``mergecap``, which are both installed in the Docker image mentioned above. 
If you run the Docker image with a mapping to the data directory, the script should work out-of-the-box (replace with your local path to the project): ```bash docker run -it \ @@ -258,45 +221,47 @@ In the container, you can run: cd code ./split_dataset.bash --help ./split_dataset.bash head-tail /data/MQTTset/Data/PCAP/slowite.pcap /data/MQTTset/Data/PCAP/slowite-train-60-val-10-test-30 60 10 -./split_dataset.bash round-robin /data/MQTTset/Data/PCAP/bruteforce.pcapng /data/MQTTset/Data/PCAP/bruteforce-train-90-val-5-test-5 100 90 5 +./split_dataset.bash round-robin /data/MQTTset/Data/PCAP/bruteforce.pcapng /data/MQTTset/Data/PCAP/bruteforce-train-90-val-5-test-5 90 5 5 ``` ## Repository structure +### code/common + ### code/cpp-extract-features -Contains the C++ feature extractor, which is very quick compared to Scapy (refer to -``plots/benchmarking.png``). +Contains the feature extractor component written in C++, setup instructions above. ### code/dataloaders -Contains a generic data loader interface and some implementations for dataloaders from -common datasets (MQTTSet / kaiyodai-ship, Mawi). +Contains a generic data loader interface and some implementations, e.g. to load samples from a pcap file using `cpp-extract-features`. + + +### code/encoders + +Contains a generic feature encoder interface and implementations to encode a single or multiple samples using Numpy / xarray. ### code/Kafka -The Docker container that can listen on network interfaces and capture packet data. -As proof-of-concept, the following system was setup: +The Docker container that can listen on network interfaces and capture packet data. As proof-of-concept, the following system was setup: ![System architecture diagram](graphics/kafka-pcap-demo.svg) -In the future, the container should offer access to all the IoT-AD functionalities -from this project. +In the future, the container should offer access to all the IoT-AD functionalities from this project. 
### code/models -Some ML models used to test the anomaly detection pipeline. +Contains a generic anomaly detection model interface and some implementations used to test the anomaly detection pipeline. ### code/preprocessors -Packet preprocessor providing features for ML models. +Contains a generic feature preprocessor interface and several implementations, including a converter from packet- to flow-based samples. ### code/reporting Reporting module sends prediction data to a logging or visualization endpoint. -In the future, this component would interface with a network controller that takes -actions based on the anomaly detection output. +In the future, this component would interface with a network controller that takes actions based on the anomaly detection output. ### code/IoT-AD.py @@ -306,22 +271,14 @@ The entry point to the IoT anomaly detection pipeline. Configuration files, which are required input for the IoT anomaly detection program. -The files are in JSON format and must define three pipeline elements: data source(s), -ML model, and output. [Jinja](https://palletsprojects.com/p/jinja/) is used to -support template variables, which the main program will replace with computed values -during runtime evaluation. +The files are Jinja2 templates for JSON files and must define three pipeline elements: data source(s), ML model, and output. [Jinja](https://palletsprojects.com/p/jinja/) is used to support template variables, which the main program will replace with computed values during runtime evaluation. -As a reference for the pipeline used to train a ML model, a copy of the processed -configuration file is stored in the same directory as the model after training. +As a reference for the pipeline used to train a ML model, a copy of the processed configuration file is stored in the same directory as the model after training. 
-To distinguish models by their creation date, include the `{{ timestamp }}` -template variable in the "model_name" field of the configuration file. The model name -and directory will then include a timestamp from the beginning of program execution. +To distinguish models by their creation date, include the `{{ timestamp }}` template variable in the "model_name" field of the configuration file. The model name and directory will then include a timestamp from the beginning of program execution. ### data -See README.md for references to some available datasets. +See README.md for references to some available datasets. A small sample dataset (MQTTset-reduced) is included as a submodule. -Data is automatically moved here when you run the -`code/stop_kafka_pcap.bash` script. Pcap files stored with timestamps. -While timestamps in pcap file names are in UST, packets inside have JST timestamps (+9). +Data is automatically moved here when you run the `code/stop_kafka_pcap.bash` script. Pcap files are stored with timestamps. While timestamps in pcap filenames are in UST, packets inside store the capture timestamp in system time (without time zone information). diff --git a/code/Dockerfile b/code/Dockerfile index c45075e..0d7cfe3 100644 --- a/code/Dockerfile +++ b/code/Dockerfile @@ -3,8 +3,6 @@ FROM ubuntu:20.04 ENV TZ=Europe/Berlin ENV DEBIAN_FRONTEND=noninteractive -COPY ./ /code/ - RUN apt-get clean && \ apt-get update && \ apt-get -y upgrade && \ @@ -22,15 +20,26 @@ RUN apt-get clean && \ python3-pip \ bc +# Copy only the needed files for each build step to avoid triggering +# unnecessary layer rebuilds: https://stackoverflow.com/a/45929376 +COPY ./requirements.txt /code/requirements.txt RUN pip3 install -r code/requirements.txt -# Using "build-docker" to not collide with the "build" dir. +COPY ./cpp-extract-features /code/cpp-extract-features RUN cd /code/cpp-extract-features/PcapPlusPlus && \ cmake -S . 
-B build-docker && \ cmake --build build-docker && \ cmake --install build-docker -RUN mkdir cmake-build-debug && \ - cd cmake-build-debug && \ - cmake ../code && \ - cmake --build . +COPY ./CMakeLists.txt /code/CMakeLists.txt +RUN cd /code/cpp-extract-features && \ + mkdir cmake-build && \ + cd cmake-build && \ + cmake ../.. && \ + cmake --build . --config Release && \ + setcap cap_net_raw+ep $(pwd)/pcap-feature-extraction +# TODO test if the setcap command is needed for live traffic capture in Docker. + +# Copy the codebase (which is the most likely part to change) as late +# as possible to reduce the number of layers needing to be rebuilt. +COPY ./ /code/ diff --git a/code/IoT-AD.py b/code/IoT-AD.py index da00f61..b273c8f 100644 --- a/code/IoT-AD.py +++ b/code/IoT-AD.py @@ -7,7 +7,8 @@ from jinja2 import Template -from common.functions import time_now, project_root, git_tag +import common.global_variables as global_variables +from common.functions import report_performance, time_now, project_root, git_tag from dataloaders import * from models import * from preprocessors import * @@ -19,56 +20,68 @@ log = PipelineLogger.get_logger() -def main(): +def main(args_config_path, args_influx_token): """ Run the IoT anomaly detection pipeline based on a configuration file. """ - parser = argparse.ArgumentParser() + pipeline_execution_start = time.process_time_ns() + log_time_tag = time_now() - parser.add_argument("-c", "--config-path", type=str, required=True) - parser.add_argument("--influx-token", type=str, required=False, default="") - - log.debug("Parsing arguments.") - args = parser.parse_args() - - config_path = os.path.abspath(args.config_path) + # Load configuration file that specifies pipeline components. + config_path = os.path.abspath(args_config_path) + config_file_name = os.path.basename(config_path).split('.')[0] assert os.path.exists(config_path), "Config file not found!" 
- log.debug(f"Loading configuration from: {config_path}") with open(config_path) as config_file: - # New functions for templating can be registered here. - template = Template(config_file.read()) - template.globals["timestamp"] = time_now() - template.globals["project_root"] = project_root() - template.globals["git_tag"] = git_tag() - template.globals["influx_token"] = args.influx_token - configuration = json.loads(template.render()) + if ".jinja" in config_path: + # New functions for templating can be registered here. + template = Template(config_file.read()) + template.globals["timestamp"] = log_time_tag + template.globals["project_root"] = project_root() + template.globals["git_tag"] = git_tag() + template.globals["influx_token"] = args_influx_token + template.globals["config_file_name"] = config_file_name + configuration = json.loads(template.render()) + else: + configuration = json.load(config_file) if not configuration: log.error("Could not load configuration file!") exit(1) - log.debug("Configuration loaded!") + + class_initialization_start = time.process_time_ns() # Initialize file loggers. if "LOG" in configuration: for log_config in configuration["LOG"]: log_level = log_config.get("level", "DEBUG") # Default location is under the /logs directory in this repository. - log_path = log_config.get("path", os.path.join(project_root(), "logs")) + log_path = log_config.get( + "path", + os.path.join( + project_root(), "logs", "other", f"{log_time_tag}-log.txt" + ), + ) if not os.path.exists(os.path.dirname(log_path)): os.makedirs(os.path.dirname(log_path)) PipelineLogger.add_file_logger(log_level, log_path) - # Initialize elements of the feature generation/processing pipeline. - # TODO Move these steps to a separate config processor! + # Re-logging the path because file-based logger was not initialized before. 
+ log.debug(f"Running configuration: {config_path}") + + # Feature stream is a Python generator object: https://wiki.python.org/moin/Generators + # It allows to process the samples memory-efficiently, avoiding the need to store all data in memory at the same time. feature_stream = itertools.chain([]) + + # Initialize data loaders classes corresponding to each component under DATA_SOURCES in configuration. for data_source in configuration["DATA_SOURCES"]: loader_name = data_source["loader"]["class"] loader_class = globals()[loader_name] log.info(f"Adding {loader_class.__name__} to pipeline.") loader: IDataLoader = loader_class(**data_source["loader"]["kwargs"]) - new_feature_stream = loader.get_features() + new_feature_stream = loader.get_samples() + # Initialize preprocessors specific to the data sources. Allowing each data source to specify its own preprocessor means data from different storage formats and with different processing needs can be combined to train models or perform prediction. for preprocessor_specification in data_source["preprocessors"]: preprocessor_name = preprocessor_specification["class"] preprocessor_class = globals()[preprocessor_name] @@ -80,6 +93,8 @@ def main(): feature_stream = itertools.chain(feature_stream, new_feature_stream) + # If no model is specified, count the number of samples in the loaded data. + # Just a convenience function, might be removed later. if len(configuration["MODEL"]) == 0: log.info("No model specified - counting input data points:") count = 0 @@ -88,22 +103,26 @@ def main(): log.info(f"{count} elements.") exit(0) + # Initialize model class based on the component specification in the configuration. model_specification = configuration["MODEL"] - # Initialize model from class name. 
model_name = model_specification["class"] model_class = globals()[model_name] model_instance: IAnomalyDetectionModel = model_class( - full_config_json=json.dumps(configuration, indent=4), - **model_specification + full_config_json=json.dumps(configuration, indent=4), **model_specification ) + # Initialize encoder class for the model. Encoders are model-specific to allow running multiple models simultaneously in the future, where each may require their own encoder instance. encoder_name = model_specification["encoder"]["class"] encoder_class = globals()[encoder_name] encoder_instance: IDataEncoder = encoder_class( **model_specification["encoder"]["kwargs"] ) - log.info("Encoding features.") + + # This moment is important for performance measurement because encoding is the first step + # where features are actually processed. Until here, the generator data has not been consumed, so no data processing needed to take place). + encoding_start = time.process_time_ns() + encoded_feature_generator = encoder_instance.encode(feature_stream) # Sanity check - peek at the first sample, print its fields and encoded format. @@ -114,24 +133,17 @@ def main(): elif len(first_sample) == 2: # Assure sample matches the intended signature. log.debug("Features of the first sample:") first_sample_data, _ = first_sample - if isinstance(first_sample_data, list): - # This is a hacky way to support both lists of sample features - # (as encoded my MultiSampleEncoder) and a dict containing the - # features of a single sample. - # TODO make more elegant (or move the logging to the encoder)! + # Extract first sample from list as encoded by MultiSampleEncoder. Otherwise, the first_sample_data object is already a dict containing the features of a single sample. first_sample_data = first_sample_data[0] for k, v in first_sample_data.items(): log.debug(f" | {k}: {v}") if model_specification["train_new_model"]: # Train the model. 
- start = time.perf_counter() model_instance.train( encoded_feature_generator, path_to_store=model_instance.store_file ) - end = time.perf_counter() - log.info(f"Trained new model in {end - start} seconds.") else: # Prediction time! @@ -143,25 +155,67 @@ def main(): reporter_instance = reporter_class(**output["kwargs"]) reporter_instances.append(reporter_instance) - count = 0 - start = time.perf_counter() - - for sample, encoding in encoded_feature_generator: - for predicted_sample in model_instance.predict(sample, encoding): - for reporter_instance in reporter_instances: - reporter_instance.report(predicted_sample) - count += 1 - - end = time.perf_counter() - packets_per_second = count / (end - start) - log.info( - f"Predicted and reported {count} samples in {end - start} seconds" - f" ({packets_per_second} packets/s)." - ) + for predicted_sample in model_instance.predict(encoded_feature_generator): + for reporter_instance in reporter_instances: + reporter_instance.report(predicted_sample) + # Reporters may require special shutdown steps, for example disconnecting from + # remote database or printing summaries of the processing -- call the handle for + # each reporter. 
for reporter_instance in reporter_instances: reporter_instance.end_processing() + pipeline_stopping_time = time.process_time_ns() + full_pipeline_time = pipeline_stopping_time - pipeline_execution_start + time_from_initialization = pipeline_stopping_time - class_initialization_start + time_from_processing = pipeline_stopping_time - encoding_start + + report_performance( + "FullPipeline", + log, + global_variables.global_pipeline_packet_count, + full_pipeline_time, + ) + report_performance( + "FromInitializationStart", + log, + global_variables.global_pipeline_packet_count, + time_from_initialization, + ) + report_performance( + "FromProcessingStart", + log, + global_variables.global_pipeline_packet_count, + time_from_processing, + ) + + # See Table 1 at: + # https://sec.cloudapps.cisco.com/security/center/resources/network_performance_metrics.html + total_ethernet_bytes = global_variables.global_sum_ip_packet_sizes + global_variables.global_pipeline_packet_count * 38 + total_pipeline_bandwidth = (total_ethernet_bytes * 8 / 1000000) / (full_pipeline_time / 1000000000) + from_init_bandwidth = (total_ethernet_bytes * 8 / 1000000) / (time_from_initialization / 1000000000) + from_processing_bandwidth = (total_ethernet_bytes * 8 / 1000000) / (time_from_processing / 1000000000) + log.info("---\nData volume and bandwidth:\n" + f" {global_variables.global_pipeline_packet_count} IP packets\n" + f" {global_variables.global_sum_ip_packet_sizes} bytes IP traffic\n" + f" {total_ethernet_bytes} bytes Ethernet traffic\n" + f" {round(total_pipeline_bandwidth, 2)} megabits/second " + f"Ethernet traffic bandwidth for full pipeline\n" + f" {round(from_init_bandwidth, 2)} megabits/second " + f"Ethernet traffic bandwidth from initialization start\n" + f" {round(from_processing_bandwidth, 2)} megabits/second " + f"Ethernet traffic bandwidth from processing start\n" + ) + + if __name__ == "__main__": - main() + + # Argument parser initialization. 
+ parser = argparse.ArgumentParser() + parser.add_argument("-c", "--config-path", type=str, required=True) + parser.add_argument("--influx-token", type=str, required=False, default="") + log.debug("Parsing arguments.") + args = parser.parse_args() + + main(args.config_path, args.influx_token) diff --git a/code/Kafka/test-apps/LICENSE b/code/Kafka/LICENSE similarity index 100% rename from code/Kafka/test-apps/LICENSE rename to code/Kafka/LICENSE diff --git a/code/Kafka/test-apps/NOTICE b/code/Kafka/NOTICE similarity index 100% rename from code/Kafka/test-apps/NOTICE rename to code/Kafka/NOTICE diff --git a/code/common/features.py b/code/common/features.py index ce99b5e..8ffd1f8 100644 --- a/code/common/features.py +++ b/code/common/features.py @@ -1,4 +1,5 @@ import enum +import itertools from typing import NewType, Union, Dict, Any, Generator, Tuple @@ -13,7 +14,8 @@ class PacketFeature(str, enum.Enum): IP_SOURCE_PORT = "ip_src_port" IP_DESTINATION_PORT = "ip_dst_port" PROTOCOL = "proto" - IP_PACKET_SIZE = "ip_size" + IP_HEADER_SIZE = "ip_header_size" + IP_DATA_SIZE = "ip_data_size" TCP_CWR_FLAG = "tcp_cwr" TCP_ECE_FLAG = "tcp_ece" TCP_URG_FLAG = "tcp_urg" @@ -22,7 +24,8 @@ class PacketFeature(str, enum.Enum): TCP_RST_FLAG = "tcp_rst" TCP_SYN_FLAG = "tcp_syn" TCP_FIN_FLAG = "tcp_fin" - TCP_SEGMENT_SIZE = "tcp_size" + TCP_HEADER_SIZE = "tcp_header_size" + TCP_DATA_SIZE = "tcp_size" CPP_FEATURE_STRING = "cpp_feature_string" SOURCE_FILE_NAME = "source_file_name" @@ -86,16 +89,23 @@ class PredictionField(str, enum.Enum): "IFeature", Union[PacketFeature, HostFeature, FlowFeature, PredictionField] ) -# TODO can these be specified further? Otherwise, might just as well use 'Any'. 
-DataType = NewType("DataType", Any) -EncodedData = NewType("EncodedData", Any) -FeatureGenerator = NewType( - "FeatureGenerator", Generator[Dict[IFeature, DataType], None, None] +def resolve_feature(feature_tag: str) -> IFeature: + feature_enums = [PacketFeature, HostFeature, FlowFeature, PredictionField] + f: IFeature + for f in itertools.chain(*feature_enums): + if feature_tag == f.value: + return f + return None + + +SampleGenerator = NewType( + "SampleGenerator", Generator[Dict[IFeature, Any], None, None] ) -LabeledFeatureGenerator = NewType( - "LabeledFeatureGenerator", Generator[Tuple[Dict[IFeature, DataType], EncodedData], None, None] +EncodedSampleGenerator = NewType( + "EncodedSampleGenerator", + Generator[Tuple[Dict[IFeature, Any], Any], None, None], ) FlowIdentifier = NewType("FlowIdentifier", Tuple[str, str, int, int, str]) diff --git a/code/common/functions.py b/code/common/functions.py index 82c0046..6bdd8f0 100644 --- a/code/common/functions.py +++ b/code/common/functions.py @@ -25,11 +25,12 @@ def project_root(): return os.path.abspath(os.path.join(__file__, "..", "..", "..")) -def report_performance(tag, logger, packet_count, passed_time_ns): +def report_performance(tag, logger, sample_count, passed_time_ns): logger.info(f"[{ tag }] Completed processing:") - logger.info(f" > { packet_count } packets") - logger.info(f" > { passed_time_ns } ns") - if packet_count: - logger.info(f" > { passed_time_ns / packet_count } ns/packet") + if sample_count: + logger.info(f" > { sample_count } samples") if passed_time_ns: - logger.info(f" > { packet_count / (passed_time_ns / 1000000000) } packets/s") + logger.info(f" > { passed_time_ns } ns") + if sample_count and passed_time_ns: + logger.info(f" > { round(passed_time_ns / sample_count) } ns/sample") + logger.info(f" > { round(sample_count / (passed_time_ns / 1000000000), 2) } packets/s") diff --git a/code/common/global_variables.py b/code/common/global_variables.py new file mode 100644 index 0000000..b93a9c0 
--- /dev/null +++ b/code/common/global_variables.py @@ -0,0 +1,5 @@ +# This file defines variables that can be imported by other modules +# to share data across components, for example to measure performance. + +global_pipeline_packet_count = 0 +global_sum_ip_packet_sizes = 0 diff --git a/code/cpp-extract-features/main.cpp b/code/cpp-extract-features/main.cpp index b5db77b..a87c208 100644 --- a/code/cpp-extract-features/main.cpp +++ b/code/cpp-extract-features/main.cpp @@ -111,7 +111,7 @@ static void packet_to_features(pcpp::RawPacket* rawPacket, pcpp::PcapLiveDevice* auto ts_ns = rawPacket->getPacketTimeStamp().tv_sec*1000000000L + rawPacket->getPacketTimeStamp().tv_nsec; printf( - "%s,%s,%d,%d,%s,%ld,%zu,%hu,%hu,%hu,%hu,%hu,%hu,%hu,%hu,%zu\n", + "%s,%s,%d,%d,%s,%ld,%zu,%zu,%hu,%hu,%hu,%hu,%hu,%hu,%hu,%hu,%zu\n", src_ip.c_str(), dst_ip.c_str(), src_port, @@ -119,6 +119,7 @@ static void packet_to_features(pcpp::RawPacket* rawPacket, pcpp::PcapLiveDevice* TCP_PROTO, ts_ns / 1000, ip_layer->getHeaderLen(), + ip_layer->getDataLen(), flag_cwr, flag_ece, flag_urg, diff --git a/code/dataloaders/IDataLoader.py b/code/dataloaders/IDataLoader.py index 43c9867..e2198ad 100644 --- a/code/dataloaders/IDataLoader.py +++ b/code/dataloaders/IDataLoader.py @@ -1,13 +1,12 @@ -import os from abc import ABC, abstractmethod -from typing import List, Union, Generator, Dict, Any +from typing import List -from common.features import IFeature, FeatureGenerator +from common.features import IFeature, SampleGenerator class IDataLoader(ABC): """ - Generic interface for data loading modules to implement. + Generic interface for data loading classes to implement. """ def __init__(self, **kwargs): @@ -16,11 +15,15 @@ def __init__(self, **kwargs): @staticmethod @abstractmethod def feature_signature() -> List[IFeature]: - return [] + """ + Returns a list of features that the data loader promises + to deliver in each sample when the generator is called. 
+ """ + pass @abstractmethod - def get_features(self) -> FeatureGenerator: + def get_samples(self) -> SampleGenerator: """ Yields a dictionary of preprocessed features per sample. """ - yield {} + pass diff --git a/code/dataloaders/PacketSniffer.py b/code/dataloaders/PacketSniffer.py index 12b1bff..37a3f5e 100644 --- a/code/dataloaders/PacketSniffer.py +++ b/code/dataloaders/PacketSniffer.py @@ -9,7 +9,7 @@ class PacketSniffer(IDataLoader): def feature_signature() -> List[IFeature]: pass - def get_features( + def get_samples( self, ) -> Generator[Dict[IFeature, Any], None, None,]: pass diff --git a/code/dataloaders/PcapFileLoader.py b/code/dataloaders/PcapFileLoader.py index ef7871e..eb1a5ec 100644 --- a/code/dataloaders/PcapFileLoader.py +++ b/code/dataloaders/PcapFileLoader.py @@ -2,6 +2,7 @@ import time from typing import List, Generator, Dict, Any +import common.global_variables as global_variables from common.functions import report_performance from dataloaders.IDataLoader import IDataLoader from common.features import IFeature, PacketFeature @@ -12,13 +13,13 @@ class PcapFileLoader(IDataLoader): - def __init__(self, filepath: str, preprocessor_path: str, **kwargs): + def __init__(self, filepath: str, packet_processor_path: str, **kwargs): super().__init__(**kwargs) self.filepath = filepath - self.preprocessor_path = preprocessor_path + self.preprocessor_path = packet_processor_path log.info(f"[{ type(self).__name__ }] Reading from file: {self.filepath}") - def get_features( + def get_samples( self, ) -> Generator[Dict[IFeature, Any], None, None]: pcap_call = [self.preprocessor_path, "stream-file", self.filepath] @@ -27,11 +28,14 @@ def get_features( sum_processing_time = 0 packet_count = 0 process = subprocess.Popen( - pcap_call, stdout=subprocess.PIPE, universal_newlines=True + pcap_call, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True ) while True: start_time_ref = time.process_time_ns() + if process.poll() and process.returncode: 
+ log.error(process.stdout.readlines()) + raise RuntimeError(f"PCAP feature extractor exited with error code {process.returncode}!") packet_features = { PacketFeature.CPP_FEATURE_STRING: process.stdout.readline() } @@ -44,6 +48,11 @@ def get_features( report_performance(type(self).__name__, log, packet_count, sum_processing_time) + # Data loaders only exists once per data source, therefore they are + # suitable for tracking the overall number of packets processed. This + # value will be reported by the main pipeline in the end. + global_variables.global_pipeline_packet_count += packet_count + @staticmethod def feature_signature() -> List[IFeature]: return [PacketFeature.CPP_FEATURE_STRING] diff --git a/code/dataloaders/XarrayPcapFileLoader.py b/code/dataloaders/XarrayPcapFileLoader.py deleted file mode 100644 index b8fdf7a..0000000 --- a/code/dataloaders/XarrayPcapFileLoader.py +++ /dev/null @@ -1,51 +0,0 @@ -import subprocess -import time -import xarray -from typing import List, Generator, Dict, Any - -from common.functions import report_performance -from dataloaders.IDataLoader import IDataLoader -from common.features import IFeature, PacketFeature - -from common.pipeline_logger import PipelineLogger - -log = PipelineLogger.get_logger() - - -class XarrayPcapFileLoader(IDataLoader): - def __init__(self, filepath: str, preprocessor_path: str, **kwargs): - super().__init__(**kwargs) - self.filepath = filepath - self.preprocessor_path = preprocessor_path - log.info(f"[{ type(self).__name__ }] Reading from file: {self.filepath}") - - def get_features( - self, - ) -> Generator[Dict[IFeature, Any], None, None]: - pcap_call = [self.preprocessor_path, "stream-file", self.filepath] - - log.info(f"[PcapFileLoader] Processing file: {self.filepath}") - sum_processing_time = 0 - packet_count = 0 - process = subprocess.Popen( - pcap_call, stdout=subprocess.PIPE, universal_newlines=True - ) - - while True: - start_time_ref = time.process_time_ns() - line = 
process.stdout.readline() - if line: - packet_features = xarray.DataArray( - [line], dims=[PacketFeature.CPP_FEATURE_STRING] - ) - sum_processing_time += time.process_time_ns() - start_time_ref - yield packet_features - packet_count += 1 - else: - break - - report_performance(type(self).__name__, log, packet_count, sum_processing_time) - - @staticmethod - def feature_signature() -> List[IFeature]: - return [PacketFeature.CPP_FEATURE_STRING] diff --git a/code/dataloaders/__init__.py b/code/dataloaders/__init__.py index e5c1e25..35adad2 100644 --- a/code/dataloaders/__init__.py +++ b/code/dataloaders/__init__.py @@ -4,4 +4,3 @@ from .IDataLoader import IDataLoader from .PcapFileLoader import PcapFileLoader from .PacketSniffer import PacketSniffer -from .XarrayPcapFileLoader import XarrayPcapFileLoader diff --git a/code/encoders/DefaultEncoder.py b/code/encoders/DefaultEncoder.py index 893b943..9c7a0b9 100644 --- a/code/encoders/DefaultEncoder.py +++ b/code/encoders/DefaultEncoder.py @@ -6,7 +6,7 @@ from common.functions import report_performance from encoders.IDataEncoder import IDataEncoder -from common.features import IFeature, FeatureGenerator +from common.features import IFeature, SampleGenerator, resolve_feature from common.pipeline_logger import PipelineLogger @@ -14,29 +14,52 @@ class DefaultEncoder(IDataEncoder): + """ + :param feature_filter: Feature names to include in the order as the + features should appear in the DataArray. If empty, all input features + of the first sample will be included. + + Based on the feature_filter passed at initialization, the encoder creates a + (1, n)-dimensional Numpy arrays from input samples, with the features ordered + according to their order in the filter. + + The DefaultEncoder can be initialized without a feature filter. In this case, all + features of the first received sample are used in their order of occurrence as the + filter for all subsequent samples. 
+ """ def __init__(self, feature_filter: Optional[List[str]] = None, **kwargs): super().__init__(**kwargs) - self.feature_filter = feature_filter - log.info(f"Applied feature filter: {self.feature_filter}") + self.feature_filter = None + if feature_filter: + self.feature_filter = [resolve_feature(f) for f in feature_filter] + log.info(f"Applied feature filter: {[f.value for f in self.feature_filter]}") def encode( - self, features: FeatureGenerator, **kwargs + self, samples: SampleGenerator, **kwargs ) -> Generator[Tuple[Dict[IFeature, Any], np.ndarray], None, None]: + """ + :return: (1, n)-dimensional Numpy array, with n being the number of features in + the feature_filter (or in the first sample, if no filter was provided). + """ sum_processing_time = 0 packet_count = 0 - for sample in features: + for sample in samples: start_time_ref = time.process_time_ns() - if self.feature_filter: - encoding = np.fromiter( - [v for k, v in sample.items() if k.value in self.feature_filter], - dtype=np.float32, - ).reshape(1, -1) - else: - encoding = np.fromiter(sample.values(), dtype=np.float32).reshape(1, -1) + + if not self.feature_filter: + # All encoded samples will follow the first sample's feature scheme! 
+ self.feature_filter = list(sample.keys()) + log.info(f"Applied feature filter: {[f.value for f in self.feature_filter]}") + + encoding = np.fromiter( + [sample[f] for f in self.feature_filter], + dtype=np.float32, + ).reshape(1, -1) sum_processing_time += time.process_time_ns() - start_time_ref packet_count += 1 + yield sample, encoding report_performance(type(self).__name__, log, packet_count, sum_processing_time) diff --git a/code/encoders/IDataEncoder.py b/code/encoders/IDataEncoder.py index be5d479..56ed17d 100644 --- a/code/encoders/IDataEncoder.py +++ b/code/encoders/IDataEncoder.py @@ -1,20 +1,19 @@ from abc import ABC, abstractmethod -from typing import Generator, Dict, Any, Tuple -from common.features import IFeature, FeatureGenerator, LabeledFeatureGenerator +from common.features import SampleGenerator, EncodedSampleGenerator class IDataEncoder(ABC): """ - Generic interface for data encoder modules to implement. + Generic interface for data encoder classes to implement. """ def __init__(self, **kwargs): pass @abstractmethod - def encode(self, features: FeatureGenerator, **kwargs) -> LabeledFeatureGenerator: + def encode(self, samples: SampleGenerator, **kwargs) -> EncodedSampleGenerator: """ - For each feature, return both the original feature and its encoded version. + For each feature, yields both the original feature and its encoded version. 
""" - yield None + pass diff --git a/code/encoders/MultiSampleEncoder.py b/code/encoders/MultiSampleEncoder.py index ad82f90..1ce0a72 100644 --- a/code/encoders/MultiSampleEncoder.py +++ b/code/encoders/MultiSampleEncoder.py @@ -2,13 +2,12 @@ from typing import Any, Dict, Generator, Tuple, Optional, List -import numpy import numpy as np import xarray from common.functions import report_performance from encoders.IDataEncoder import IDataEncoder -from common.features import IFeature, FeatureGenerator +from common.features import IFeature, SampleGenerator, resolve_feature from common.pipeline_logger import PipelineLogger @@ -19,8 +18,8 @@ class MultiSampleEncoder(IDataEncoder): def __init__( self, feature_filter: Optional[List[str]] = None, - max_array_size: int = 1000, - max_time_window_ms: int = 1000, + max_array_size: int = 0, + max_time_window_ms: int = 0, **kwargs, ): """ @@ -30,73 +29,94 @@ def __init__( :param feature_filter: Feature names to include in the order as the features should appear in the DataArray. If empty, all input features - will be included. + of the first sample will be included. :param max_array_size: Maximal number of samples to include in each yielded array, if max_time_window_ms is not reached before. :param max_time_window_ms: Maximal time to wait before yielding an array, if the max_array_size is not reached before. 
""" super().__init__(**kwargs) - self.feature_filter = feature_filter - log.info(f"Applying feature filter: {self.feature_filter}") + self.feature_filter = None + if feature_filter: + self.feature_filter = [resolve_feature(f) for f in feature_filter] + log.info(f"Applied feature filter: {[f.value for f in self.feature_filter]}") self.max_array_size = max_array_size - self.max_time_window_ms = max_time_window_ms + self.max_time_window_ns = max_time_window_ms * 10**6 + self.created_array_count = 0 def encode( - self, features: FeatureGenerator, **kwargs + self, samples: SampleGenerator, **kwargs ) -> Generator[Tuple[Dict[IFeature, Any], np.ndarray], None, None]: """ Encode input features into xarray.DataArrays. Features in feature_filter become the first dimension and samples the second dimension of the DataArray. - :param features: Generator of feature dictionaries to be encoded. + :param samples: Generator of feature dictionaries to be encoded. :return: Yields tuples with: - (1) list of input feature dictionaries used to generate the encoding + (1) list of input feature dictionaries used to generate the encoding, (2) xarray.DataArray of the encoded samples. """ packet_count = 0 sum_processing_time = 0 - first = True feature_dicts = [] - multi_sample_encoding_array = None + array_to_encode = [] + last_published_time = time.process_time_ns() - for sample in features: - start_time_ref = time.process_time_ns() + for sample in samples: + start_time = time.process_time_ns() # Feature dictionaries will be stored in a single list element. feature_dicts.append(sample) - # xarray is built by appending each feature's encoding one by one. - if first: - first = False - if not self.feature_filter: - # All encoded features follow the first sample's scheme! 
- self.feature_filter = list(sample.keys()) - - multi_sample_encoding_array = xarray.DataArray( - [ - [sample[f] for f in self.feature_filter], - ], + if not self.feature_filter: + # All encoded samples will follow the first sample's feature scheme! + self.feature_filter = list(sample.keys()) + log.info(f"Applied feature filter: {[f.value for f in self.feature_filter]}") + + array_to_encode.append([sample[f] for f in self.feature_filter]) + packet_count += 1 + current_time = time.process_time_ns() + + if len(array_to_encode) > 0 and (( + self.max_time_window_ns != 0 + and current_time - last_published_time >= self.max_time_window_ns + ) or ( + self.max_array_size != 0 and len(feature_dicts) >= self.max_array_size + )): + encoding_array = xarray.DataArray( + array_to_encode, dims=["samples", "features"], coords={"features": self.feature_filter}, ) - # multi_sample_encoding_array.reshape((1, len(self.feature_filter))) - else: - multi_sample_encoding_array = numpy.concatenate( - ( - multi_sample_encoding_array, - [ - [sample[f] for f in self.feature_filter], - ], - ), - axis=0, - ) - sum_processing_time += time.process_time_ns() - start_time_ref - packet_count += 1 - # TODO publish array when max time window has elapsed. + self.created_array_count += 1 + last_published_time = current_time + array_to_encode = [] + + # Since yielding can pause further processing until next element is + # requested, add to current processing time before yielding. + sum_processing_time += time.process_time_ns() - start_time + yield feature_dicts, encoding_array - yield feature_dicts, multi_sample_encoding_array + # Feature dict can only be reset after yielding the previous one. + feature_dicts = [] + else: + # Still count the processing time, even if no yield happened. + sum_processing_time += time.process_time_ns() - start_time + + # When the samples run out, still publish the last array! 
+ if array_to_encode: + start_time = time.process_time_ns() + encoding_array = xarray.DataArray( + array_to_encode, + dims=["samples", "features"], + coords={"features": self.feature_filter}, + ) + self.created_array_count += 1 + sum_processing_time += time.process_time_ns() - start_time + yield feature_dicts, encoding_array + + log.info(f"Created {self.created_array_count} multi-encoded arrays.") report_performance(type(self).__name__, log, packet_count, sum_processing_time) diff --git a/code/models/IAnomalyDetectionModel.py b/code/models/IAnomalyDetectionModel.py index 23eccc5..b4960f7 100644 --- a/code/models/IAnomalyDetectionModel.py +++ b/code/models/IAnomalyDetectionModel.py @@ -1,12 +1,15 @@ import os from abc import ABC, abstractmethod -from typing import Optional, Any, Generator, Tuple, Dict, Union, List +from typing import Optional -from common.features import IFeature, LabeledFeatureGenerator -from dataloaders import IDataLoader +from common.features import EncodedSampleGenerator, SampleGenerator class IAnomalyDetectionModel(ABC): + """ + Generic interface for anomaly detection model classes to implement. + """ + def __init__( self, model_name: str, @@ -66,29 +69,32 @@ def __init__( def save_configuration(self, config: str): config_file_path = os.path.join(os.path.dirname(self.store_file), "config.json") - assert not os.path.exists(config_file_path),\ - f"Configuration path already exists: {config_file_path}" +\ - "\nForgot to remove build artifacts from past run?" + assert not os.path.exists(config_file_path), ( + f"Configuration path already exists: {config_file_path}" + + "\nForgot to remove build artifacts from past run?" + ) with open(config_file_path, "w") as f: f.write(config) @abstractmethod - def train(self, data: LabeledFeatureGenerator, **kwargs): + def train(self, data: EncodedSampleGenerator, **kwargs): + """ + Trains the anomaly detection model on provided data. + If skip_saving_model == false, the model will be stored after training. 
+ """ pass @abstractmethod def load(self, **kwargs): + """ + Load a previously stored model for prediction. + """ pass @abstractmethod - def predict( - self, - features: Union[Dict[IFeature, Any], List[Dict[IFeature, Any]]], - encoded_data: Any, - **kwargs, - ) -> Generator[Dict[IFeature, Any], None, None]: + def predict(self, data: EncodedSampleGenerator, **kwargs) -> SampleGenerator: """ - Add a prediction entry based on encoded_data directly into - the feature dictionary. + Adds a prediction entry based on encoded data directly into + the feature dictionary of the provided sample, then return the sample. """ pass diff --git a/code/models/MLPAutoEncoder.py b/code/models/MLPAutoEncoder.py index 68fcb0b..d372d69 100644 --- a/code/models/MLPAutoEncoder.py +++ b/code/models/MLPAutoEncoder.py @@ -1,3 +1,4 @@ +import time from typing import Any, Dict, Generator, Optional, List, Tuple, Union import numpy @@ -5,7 +6,8 @@ from sklearn.neural_network import MLPRegressor from joblib import dump, load -from common.features import IFeature, PredictionField +from common.features import EncodedSampleGenerator, IFeature, PredictionField, SampleGenerator +from common.functions import report_performance from models.IAnomalyDetectionModel import IAnomalyDetectionModel from common.pipeline_logger import PipelineLogger @@ -40,14 +42,19 @@ def train( **kwargs, ): log.info("Training an MLP autoencoder.") + data_prep_time = 0 + + single_array_processing = False concatenated_data_array = None + encoded_features = [] - for features, encoding in data: - if isinstance(features, list): + for samples, encoding in data: + start = time.process_time_ns() + if isinstance(samples, list): if self.filter_label: # TODO filter xarray by GROUND_TRUTH filter. 
pass - elif not concatenated_data_array: + elif concatenated_data_array is None: concatenated_data_array = encoding else: concatenated_data_array = numpy.concatenate( @@ -55,9 +62,11 @@ def train( axis=0, ) else: - # TODO handle individually passed samples. - pass + single_array_processing = True + encoded_features.append(encoding[0]) + data_prep_time += time.process_time_ns() - start + training_start = time.process_time_ns() # TODO make model parameters configurable. self.model_instance = MLPRegressor( alpha=1e-15, @@ -74,7 +83,19 @@ def train( max_iter=10000, ) - self.model_instance.fit(concatenated_data_array, concatenated_data_array) + if not single_array_processing: + self.model_instance.fit(concatenated_data_array, concatenated_data_array) + else: + self.model_instance.fit(encoded_features, encoded_features) + training_time = time.process_time_ns() - training_start + + sample_count = len(encoded_features) if encoded_features else len(concatenated_data_array) + + report_performance(type(self).__name__ + "-preparation", log, sample_count, + data_prep_time) + report_performance(type(self).__name__ + "-training", log, sample_count, + training_time) + if not self.skip_saving_model: dump(self.model_instance, self.store_file) @@ -83,23 +104,27 @@ def load(self): if not self.model_instance: log.error(f"Failed to load model from: {self.store_file}") - def predict( - self, - features: Union[Dict[IFeature, Any], List[Dict[IFeature, Any]]], - encoded_data: Any, - **kwargs, - ) -> Generator[Dict[IFeature, Any], None, None]: - - prediction = self.model_instance.predict(encoded_data) + def predict(self, data: EncodedSampleGenerator, **kwargs) -> SampleGenerator: + sum_processing_time = 0 + sum_samples = 0 + for sample, encoded_sample in data: + start_time_ref = time.process_time_ns() + prediction = self.model_instance.predict(encoded_sample) + + if isinstance(sample, list): + # Handle the prediction for multi-sample encoding. 
+ for i, sample in enumerate(sample): + sample[PredictionField.MODEL_NAME] = self.model_name + sample[PredictionField.OUTPUT_DISTANCE] = sum(abs(prediction[i])) + sum_processing_time += time.process_time_ns() - start_time_ref + sum_samples += 1 + yield sample - if isinstance(features, list): - # Handle the prediction for multi-sample encoding. - for i, sample in enumerate(features): + else: sample[PredictionField.MODEL_NAME] = self.model_name - sample[PredictionField.OUTPUT_DISTANCE] = sum(abs(prediction[i])) + sample[PredictionField.OUTPUT_DISTANCE] = sum(prediction[0]) + sum_processing_time += time.process_time_ns() - start_time_ref + sum_samples += 1 yield sample - else: - features[PredictionField.MODEL_NAME] = self.model_name - features[PredictionField.OUTPUT_BINARY] = prediction[0] - yield features + report_performance(type(self).__name__ + "-testing", log, sum_samples, sum_processing_time) diff --git a/code/models/RandomForest.py b/code/models/RandomForest.py index 898caab..81c3827 100644 --- a/code/models/RandomForest.py +++ b/code/models/RandomForest.py @@ -1,4 +1,5 @@ import logging +import time from typing import Generator, Any, Dict, Tuple import numpy @@ -7,7 +8,8 @@ from sklearn.ensemble import RandomForestClassifier -from common.features import IFeature, PredictionField +from common.features import EncodedSampleGenerator, IFeature, PredictionField, SampleGenerator +from common.functions import report_performance from models.IAnomalyDetectionModel import IAnomalyDetectionModel log = logging.getLogger() @@ -43,24 +45,34 @@ def train( labels = [] encoded_features = [] - for features, encoding in data: - if isinstance(features, list): - # Handle the list with multiple features used together with + data_prep_time = 0 + for samples, encoding in data: + start = time.process_time_ns() + if isinstance(samples, list): + # Handle the list with multiple samples used together with # xarray DataArray encodings. 
- for f in features: + for f in samples: labels.append(f[PredictionField.GROUND_TRUTH]) - if not encoded_features: + if len(encoded_features) == 0: encoded_features = encoding else: encoded_features = numpy.concatenate( (encoded_features, encoding), axis=0 ) else: - labels.append(features[PredictionField.GROUND_TRUTH]) - encoded_features.append(encoding) + labels.append(samples[PredictionField.GROUND_TRUTH]) + encoded_features.append(encoding[0]) + data_prep_time += time.process_time_ns() - start + training_start = time.process_time_ns() self.model_instance = RandomForestClassifier() self.model_instance.fit(encoded_features, labels) + training_time = time.process_time_ns() - training_start + + report_performance(type(self).__name__ + "-preparation", log, len(labels), + data_prep_time) + report_performance(type(self).__name__ + "-training", log, len(labels), + training_time) if not self.skip_saving_model: dump(self.model_instance, self.store_file) @@ -68,7 +80,7 @@ def train( def load(self): self.model_instance = load(self.store_file) - def predict(self, features, encoded_data, **kwargs): + def predict(self, data: EncodedSampleGenerator, **kwargs) ->SampleGenerator: # Requirements for encoded data: # # X : {array-like, sparse matrix} of shape (n_samples, n_features) @@ -77,14 +89,23 @@ def predict(self, features, encoded_data, **kwargs): # converted into a sparse ``csr_matrix``. 
# # Source: https://github.com/scikit-learn/scikit-learn/blob/72a604975102b2d93082385d7a5a7033886cc825/sklearn/ensemble/_forest.py - - prediction = self.model_instance.predict(encoded_data) - if isinstance(features, list): - for i, sample in enumerate(features): + sum_processing_time = 0 + sum_samples = 0 + for sample, encoded_sample in data: + start_time_ref = time.process_time_ns() + prediction = self.model_instance.predict(encoded_sample) + if isinstance(sample, list): + for i, sample in enumerate(sample): + sample[PredictionField.MODEL_NAME] = self.model_name + sample[PredictionField.OUTPUT_BINARY] = prediction[i] + sum_processing_time += time.process_time_ns() - start_time_ref + sum_samples += 1 + yield sample + else: sample[PredictionField.MODEL_NAME] = self.model_name - sample[PredictionField.OUTPUT_BINARY] = prediction[i] + sample[PredictionField.OUTPUT_BINARY] = prediction[0] + sum_processing_time += time.process_time_ns() - start_time_ref + sum_samples += 1 yield sample - else: - features[PredictionField.MODEL_NAME] = self.model_name - features[PredictionField.OUTPUT_BINARY] = prediction[0] - yield features + + report_performance(type(self).__name__ + "-testing", log, sum_samples, sum_processing_time) diff --git a/code/preprocessors/CppPacketProcessor.py b/code/preprocessors/CppPacketProcessor.py index c34b397..2a7dfa3 100644 --- a/code/preprocessors/CppPacketProcessor.py +++ b/code/preprocessors/CppPacketProcessor.py @@ -2,7 +2,8 @@ import time from typing import List -from common.features import IFeature, PacketFeature, FeatureGenerator +import common.global_variables as global_variables +from common.features import IFeature, PacketFeature, SampleGenerator from common.functions import report_performance from preprocessors.IPreprocessor import IPreprocessor @@ -36,7 +37,8 @@ def output_signature() -> List[IFeature]: PacketFeature.IP_DESTINATION_PORT, PacketFeature.PROTOCOL, PacketFeature.TIMESTAMP, - PacketFeature.IP_PACKET_SIZE, + 
PacketFeature.IP_HEADER_SIZE, + PacketFeature.IP_DATA_SIZE, PacketFeature.TCP_CWR_FLAG, PacketFeature.TCP_ECE_FLAG, PacketFeature.TCP_URG_FLAG, @@ -45,41 +47,46 @@ def output_signature() -> List[IFeature]: PacketFeature.TCP_RST_FLAG, PacketFeature.TCP_SYN_FLAG, PacketFeature.TCP_FIN_FLAG, - PacketFeature.TCP_SEGMENT_SIZE, + PacketFeature.TCP_HEADER_SIZE, + PacketFeature.TCP_DATA_SIZE, ] - def process(self, features: FeatureGenerator) -> FeatureGenerator: + def process(self, samples: SampleGenerator) -> SampleGenerator: sum_processing_time = 0 valid_packet_count = 0 invalid_packet_count = 0 - for f in features: + for s in samples: start_time_ref = time.process_time_ns() - parts = f[PacketFeature.CPP_FEATURE_STRING].rstrip().split(",") - if len(parts) != 16: + parts = s[PacketFeature.CPP_FEATURE_STRING].rstrip().split(",") + if len(parts) != len(self.output_signature()) - 1: # Deduct for TCP_DATA_SIZE. invalid_packet_count += 1 continue - f[PacketFeature.IP_SOURCE_ADDRESS] = parts[0] - f[PacketFeature.IP_DESTINATION_ADDRESS] = parts[1] - f[PacketFeature.IP_SOURCE_PORT] = parts[2] - f[PacketFeature.IP_DESTINATION_PORT] = parts[3] - f[PacketFeature.PROTOCOL] = parts[4] - f[PacketFeature.TIMESTAMP] = int(parts[5]) - f[PacketFeature.IP_PACKET_SIZE] = int(parts[6]) - f[PacketFeature.TCP_CWR_FLAG] = int(parts[7]) - f[PacketFeature.TCP_ECE_FLAG] = int(parts[8]) - f[PacketFeature.TCP_URG_FLAG] = int(parts[9]) - f[PacketFeature.TCP_ACK_FLAG] = int(parts[10]) - f[PacketFeature.TCP_PSH_FLAG] = int(parts[11]) - f[PacketFeature.TCP_RST_FLAG] = int(parts[12]) - f[PacketFeature.TCP_SYN_FLAG] = int(parts[13]) - f[PacketFeature.TCP_FIN_FLAG] = int(parts[14]) - f[PacketFeature.TCP_SEGMENT_SIZE] = int(parts[15]) + s[PacketFeature.IP_SOURCE_ADDRESS] = parts[0] + s[PacketFeature.IP_DESTINATION_ADDRESS] = parts[1] + s[PacketFeature.IP_SOURCE_PORT] = parts[2] + s[PacketFeature.IP_DESTINATION_PORT] = parts[3] + s[PacketFeature.PROTOCOL] = parts[4] + s[PacketFeature.TIMESTAMP] = 
int(parts[5]) + s[PacketFeature.IP_HEADER_SIZE] = int(parts[6]) + s[PacketFeature.IP_DATA_SIZE] = int(parts[7]) + s[PacketFeature.TCP_CWR_FLAG] = int(parts[8]) + s[PacketFeature.TCP_ECE_FLAG] = int(parts[9]) + s[PacketFeature.TCP_URG_FLAG] = int(parts[10]) + s[PacketFeature.TCP_ACK_FLAG] = int(parts[11]) + s[PacketFeature.TCP_PSH_FLAG] = int(parts[12]) + s[PacketFeature.TCP_RST_FLAG] = int(parts[13]) + s[PacketFeature.TCP_SYN_FLAG] = int(parts[14]) + s[PacketFeature.TCP_FIN_FLAG] = int(parts[15]) + s[PacketFeature.TCP_HEADER_SIZE] = int(parts[16]) + s[PacketFeature.TCP_DATA_SIZE] = s[PacketFeature.IP_DATA_SIZE] - s[PacketFeature.TCP_HEADER_SIZE] sum_processing_time += time.process_time_ns() - start_time_ref valid_packet_count += 1 - yield f + global_variables.global_sum_ip_packet_sizes += s[PacketFeature.IP_HEADER_SIZE] + global_variables.global_sum_ip_packet_sizes += s[PacketFeature.IP_DATA_SIZE] + yield s log = PipelineLogger.get_logger() log.info( diff --git a/code/preprocessors/FileLabelProcessor.py b/code/preprocessors/FileLabelProcessor.py index 72424b0..4237f18 100644 --- a/code/preprocessors/FileLabelProcessor.py +++ b/code/preprocessors/FileLabelProcessor.py @@ -1,7 +1,7 @@ import time from typing import List, Optional, Any -from common.features import IFeature, PredictionField, PacketFeature, FeatureGenerator +from common.features import IFeature, PredictionField, PacketFeature, SampleGenerator from common.functions import report_performance from common.pipeline_logger import PipelineLogger from preprocessors.IPreprocessor import IPreprocessor @@ -45,17 +45,17 @@ def __init__( self.value = label_value log.info(f"Label for data: {self.value}") - def process(self, features: FeatureGenerator) -> FeatureGenerator: + def process(self, samples: SampleGenerator) -> SampleGenerator: sum_processing_time = 0 packet_count = 0 - for f in features: + for s in samples: start_time_ref = time.process_time_ns() if self.value is not None: - f[PredictionField.GROUND_TRUTH] 
= self.value + s[PredictionField.GROUND_TRUTH] = self.value sum_processing_time += time.process_time_ns() - start_time_ref packet_count += 1 - yield f + yield s report_performance(type(self).__name__, log, packet_count, sum_processing_time) diff --git a/code/preprocessors/FlowFeatureProcessor.py b/code/preprocessors/FlowFeatureProcessor.py deleted file mode 100644 index 73d21d2..0000000 --- a/code/preprocessors/FlowFeatureProcessor.py +++ /dev/null @@ -1,115 +0,0 @@ -import time -from collections import defaultdict -from typing import Dict, Tuple - -from common.features import ( - flow_identifier, - PacketFeature as Packet, - FlowFeature as Flow, - FeatureGenerator, -) -from common.functions import report_performance -from common.pipeline_logger import PipelineLogger - -from preprocessors.IPreprocessor import IPreprocessor - - -class FlowFeatureProcessor(IPreprocessor): - def __init__(self): - self.overall_packet_counter = 0 - self.valid_packet_counter = 0 - - self.packet_count_by_flow: Dict[ - Tuple[str, str, int, int, str], int - ] = defaultdict(lambda: 0) - - self.packet_size_sum_by_flow: Dict[ - Tuple[str, str, int, int, str], int - ] = defaultdict(lambda: 0) - - self.first_timestamp_by_flow: Dict[Tuple[str, str, int, int, str], int] = {} - - self.last_timestamp_from_host: Dict[str, int] = defaultdict(lambda: 0) - self.last_timestamp_by_flow: Dict[ - Tuple[str, str, int, int, str], int - ] = defaultdict(lambda: 0) - - self.sum_inter_arrival_times_by_flow: Dict[ - Tuple[str, str, int, int, str], int - ] = defaultdict(lambda: 0) - - def process(self, features: FeatureGenerator) -> FeatureGenerator: - sum_processing_time = 0 - packet_count = 0 - - for f in features: - start_time_ref = time.process_time_ns() - - flow_id: Tuple[str, str, int, int, str] = flow_identifier(f) - - self.packet_count_by_flow[flow_id] += 1 - self.packet_size_sum_by_flow[flow_id] += f[Packet.IP_PACKET_SIZE] - - if flow_id not in self.first_timestamp_by_flow: - # TODO switch to NaN? 
Needs special handling in decision trees. - flow_last_inter_arrival_time = 0 - flow_avg_inter_arrival_time = 0 - flow_connection_duration = 0 - self.first_timestamp_by_flow[flow_id] = f[Packet.TIMESTAMP] - else: - flow_last_inter_arrival_time = ( - f[Packet.TIMESTAMP] - self.last_timestamp_by_flow[flow_id] - ) - self.sum_inter_arrival_times_by_flow[ - flow_id - ] += flow_last_inter_arrival_time - flow_avg_inter_arrival_time = self.sum_inter_arrival_times_by_flow[ - flow_id - ] / (self.packet_count_by_flow[flow_id] - 1) - self.last_timestamp_by_flow[flow_id] = f[Packet.TIMESTAMP] - flow_connection_duration = ( - self.last_timestamp_by_flow[flow_id] - - self.first_timestamp_by_flow[flow_id] - ) - - f[Flow.RECEIVED_PACKET_COUNT] = self.packet_count_by_flow[flow_id] - f[Flow.SUM_PACKET_SIZE] = self.packet_size_sum_by_flow[flow_id] - f[Flow.AVG_PACKET_SIZE] = ( - self.packet_size_sum_by_flow[flow_id] - / self.packet_count_by_flow[flow_id] - ) - f[Flow.LAST_INTER_ARRIVAL_TIME] = flow_last_inter_arrival_time - f[Flow.AVG_INTER_ARRIVAL_TIME] = flow_avg_inter_arrival_time - f[Flow.CONNECTION_DURATION] = flow_connection_duration - - sum_processing_time += time.process_time_ns() - start_time_ref - packet_count += 1 - yield f - - log = PipelineLogger.get_logger() - report_performance(type(self).__name__, log, packet_count, sum_processing_time) - - @staticmethod - def input_signature(): - return [ - Packet.IP_PACKET_SIZE, - Packet.TCP_CWR_FLAG, - Packet.TCP_ECE_FLAG, - Packet.TCP_URG_FLAG, - Packet.TCP_ACK_FLAG, - Packet.TCP_PSH_FLAG, - Packet.TCP_RST_FLAG, - Packet.TCP_SYN_FLAG, - Packet.TCP_FIN_FLAG, - ] - - @staticmethod - def output_signature(): - return [ - Flow.RECEIVED_PACKET_COUNT, - Flow.SUM_PACKET_SIZE, - Flow.AVG_PACKET_SIZE, - Flow.LAST_INTER_ARRIVAL_TIME, - Flow.AVG_INTER_ARRIVAL_TIME, - Flow.CONNECTION_DURATION, - ] diff --git a/code/preprocessors/HostFeatureProcessor.py b/code/preprocessors/HostFeatureProcessor.py index 88629c9..d4d3928 100644 --- 
a/code/preprocessors/HostFeatureProcessor.py +++ b/code/preprocessors/HostFeatureProcessor.py @@ -2,12 +2,10 @@ from collections import defaultdict from typing import Dict -from pandas import Timestamp - from common.features import ( PacketFeature as Packet, HostFeature as Host, - FeatureGenerator, + SampleGenerator, ) from common.functions import report_performance from common.pipeline_logger import PipelineLogger @@ -25,36 +23,36 @@ def __init__(self): self.packet_size_sum_from_host: Dict[str, int] = defaultdict(lambda: 0) self.packet_size_sum_to_host: Dict[str, int] = defaultdict(lambda: 0) - self.first_timestamp_from_host: Dict[str, Timestamp] = {} - self.last_timestamp_from_host: Dict[str, Timestamp] = {} + self.first_timestamp_from_host: Dict[str, int] = {} + self.last_timestamp_from_host: Dict[str, int] = {} self.sum_inter_arrival_times_from_host: Dict[str, int] = defaultdict(lambda: 0) - def process(self, features: FeatureGenerator) -> FeatureGenerator: + def process(self, samples: SampleGenerator) -> SampleGenerator: sum_processing_time = 0 packet_count = 0 - for f in features: + for s in samples: start_time_ref = time.process_time_ns() self.overall_packet_counter += 1 - src_ip = f[Packet.IP_SOURCE_ADDRESS] - dst_ip = f[Packet.IP_DESTINATION_ADDRESS] + src_ip = s[Packet.IP_SOURCE_ADDRESS] + dst_ip = s[Packet.IP_DESTINATION_ADDRESS] self.packet_count_from_host[src_ip] += 1 self.packet_count_to_host[dst_ip] += 1 - self.packet_size_sum_from_host[src_ip] += f[Packet.IP_PACKET_SIZE] - self.packet_size_sum_to_host[dst_ip] += f[Packet.IP_PACKET_SIZE] + self.packet_size_sum_from_host[src_ip] += s[Packet.IP_DATA_SIZE] + self.packet_size_sum_to_host[dst_ip] += s[Packet.IP_DATA_SIZE] if src_ip not in self.first_timestamp_from_host: # TODO switch to NaN? Needs special handling in decision trees. 
host_last_inter_arrival_time = 0 host_avg_inter_arrival_time = 0 - self.first_timestamp_from_host[src_ip] = f[Packet.TIMESTAMP] + self.first_timestamp_from_host[src_ip] = s[Packet.TIMESTAMP] else: host_last_inter_arrival_time = ( - f[Packet.TIMESTAMP] - self.last_timestamp_from_host[src_ip] + s[Packet.TIMESTAMP] - self.last_timestamp_from_host[src_ip] ) self.sum_inter_arrival_times_from_host[ @@ -65,36 +63,36 @@ def process(self, features: FeatureGenerator) -> FeatureGenerator: src_ip ] / (self.packet_count_from_host[src_ip] - 1) - self.last_timestamp_from_host[src_ip] = f[Packet.TIMESTAMP] + self.last_timestamp_from_host[src_ip] = s[Packet.TIMESTAMP] host_connection_duration = ( self.last_timestamp_from_host[src_ip] - self.first_timestamp_from_host[src_ip] ) - f[Host.RECEIVED_PACKET_COUNT] = self.packet_count_from_host[src_ip] - f[Host.SUM_RECEIVED_PACKET_SIZE] = self.packet_size_sum_from_host[src_ip] + s[Host.RECEIVED_PACKET_COUNT] = self.packet_count_from_host[src_ip] + s[Host.SUM_RECEIVED_PACKET_SIZE] = self.packet_size_sum_from_host[src_ip] - f[Host.AVG_RECEIVED_PACKET_SIZE] = ( + s[Host.AVG_RECEIVED_PACKET_SIZE] = ( self.packet_size_sum_from_host[src_ip] / self.packet_count_from_host[src_ip] ) - f[Host.SENT_PACKET_COUNT] = self.packet_count_to_host[dst_ip] - f[Host.SUM_SENT_PACKET_SIZE] = self.packet_size_sum_to_host[dst_ip] - f[Host.AVG_SENT_PACKET_SIZE] = ( + s[Host.SENT_PACKET_COUNT] = self.packet_count_to_host[dst_ip] + s[Host.SUM_SENT_PACKET_SIZE] = self.packet_size_sum_to_host[dst_ip] + s[Host.AVG_SENT_PACKET_SIZE] = ( self.packet_size_sum_from_host[src_ip] / self.packet_count_to_host[dst_ip] ) - f[Host.LAST_INTER_ARRIVAL_TIME] = host_last_inter_arrival_time - f[Host.AVG_INTER_ARRIVAL_TIME] = host_avg_inter_arrival_time - f[Host.CONNECTION_DURATION] = host_connection_duration + s[Host.LAST_INTER_ARRIVAL_TIME] = host_last_inter_arrival_time + s[Host.AVG_INTER_ARRIVAL_TIME] = host_avg_inter_arrival_time + s[Host.CONNECTION_DURATION] = 
host_connection_duration sum_processing_time += time.process_time_ns() - start_time_ref packet_count += 1 - yield f + yield s log = PipelineLogger.get_logger() report_performance(type(self).__name__, log, packet_count, sum_processing_time) @@ -102,7 +100,7 @@ def process(self, features: FeatureGenerator) -> FeatureGenerator: @staticmethod def input_signature(): return [ - Packet.IP_PACKET_SIZE, + Packet.IP_DATA_SIZE, Packet.TCP_CWR_FLAG, Packet.TCP_ECE_FLAG, Packet.TCP_URG_FLAG, diff --git a/code/preprocessors/IPreprocessor.py b/code/preprocessors/IPreprocessor.py index 9e7620b..8870294 100644 --- a/code/preprocessors/IPreprocessor.py +++ b/code/preprocessors/IPreprocessor.py @@ -1,20 +1,38 @@ from abc import ABC, abstractmethod from typing import List -from common.features import IFeature, FeatureGenerator +from common.features import IFeature, SampleGenerator class IPreprocessor(ABC): + """ + Generic interface for data preprocessor classes to implement. + """ + @staticmethod @abstractmethod def input_signature() -> List[IFeature]: + """ + Returns a list of features that the preprocessor requires in each input sample + for internal processing. + """ pass @staticmethod @abstractmethod def output_signature() -> List[IFeature]: + """ + Returns a list of features that the preprocessor promises to deliver + (in addition to the existing features) in each sample when the generator + is called. + """ pass @abstractmethod - def process(self, features: FeatureGenerator) -> FeatureGenerator: + def process(self, samples: SampleGenerator) -> SampleGenerator: + """ + Applies preprocessing steps to samples in the input generator, then yields the + modified samples. The number of yielded samples can be different from the input + sample count! 
+ """ pass diff --git a/code/preprocessors/WindowFlowFeatureProcessor.py b/code/preprocessors/WindowFlowFeatureProcessor.py index 9ce3b1b..19f55b4 100644 --- a/code/preprocessors/WindowFlowFeatureProcessor.py +++ b/code/preprocessors/WindowFlowFeatureProcessor.py @@ -2,14 +2,11 @@ from collections import defaultdict from typing import Dict -import pandas as pd -from pandas import Timestamp, Timedelta - from common.features import ( flow_identifier, PacketFeature as Packet, FlowFeature as Flow, - FeatureGenerator, + SampleGenerator, FlowIdentifier, ) from common.functions import report_performance @@ -23,67 +20,71 @@ class WindowFlowFeatureProcessor(IPreprocessor): Stores flow statistics during a time window every time a packet is processed. Yields if the last yield for the flow identifier was more than ago. After yield, the values for the flow identifier are reset. + + Note that the processor does not implement a sliding window: after a sample is + yielded, all statistics for the flow are reset. """ def __init__(self, window_size_ms: int = 1000, **kwargs): self.overall_packet_counter = 0 self.valid_packet_counter = 0 - self.window_size = Timedelta(window_size_ms, unit="milliseconds") + # Save window sizes in microseconds as these are the timestamps + # returned from C++ packet processor. 
+ self.window_size_micros = window_size_ms * 1000 self.window_packet_count: Dict[FlowIdentifier, int] = defaultdict(lambda: 0) self.window_packet_size_sum: Dict[FlowIdentifier, int] = defaultdict(lambda: 0) - self.first_timestamp_after_yield: Dict[FlowIdentifier, Timestamp] = {} - self.last_timestamp: Dict[FlowIdentifier, Timestamp] = defaultdict( - lambda: Timestamp(0) - ) + self.first_timestamp_after_yield: Dict[FlowIdentifier, int] = {} + self.last_timestamp: Dict[FlowIdentifier, int] = defaultdict(lambda: 0) self.window_sum_inter_arrival_times: Dict[FlowIdentifier, int] = defaultdict( lambda: 0 ) - def process(self, features: FeatureGenerator) -> FeatureGenerator: + def process(self, samples: SampleGenerator) -> SampleGenerator: sum_processing_time = 0 packet_count = 0 - for f in features: + for s in samples: start_time_ref = time.process_time_ns() - flow_id: FlowIdentifier = flow_identifier(f) - timestamp = pd.Timestamp(f[Packet.TIMESTAMP], unit="us") + flow_id: FlowIdentifier = flow_identifier(s) + timestamp = s[Packet.TIMESTAMP] if flow_id not in self.first_timestamp_after_yield: self.first_timestamp_after_yield[flow_id] = timestamp - if timestamp - self.first_timestamp_after_yield[flow_id] > self.window_size: - f[Flow.WINDOW_AVG_PACKET_SIZE] = ( + if timestamp - self.first_timestamp_after_yield[flow_id] > self.window_size_micros: + + # Set new features for this sample based on packets in the window + # so far, excluding the current received one. 
+ s[Flow.WINDOW_AVG_PACKET_SIZE] = ( self.window_packet_size_sum[flow_id] / self.window_packet_count[flow_id] ) - f[Flow.WINDOW_AVG_INTER_ARRIVAL_TIME] = ( + s[Flow.WINDOW_AVG_INTER_ARRIVAL_TIME] = ( self.window_sum_inter_arrival_times[flow_id] / self.window_packet_count[flow_id] ) - f[Flow.WINDOW_RECEIVED_PACKET_COUNT] = self.window_packet_count[flow_id] - f[Flow.WINDOW_SUM_PACKET_SIZE] = self.window_packet_size_sum[flow_id] + s[Flow.WINDOW_RECEIVED_PACKET_COUNT] = self.window_packet_count[flow_id] + s[Flow.WINDOW_SUM_PACKET_SIZE] = self.window_packet_size_sum[flow_id] + # Reset counters for this flow. self.window_packet_count[flow_id] = 1 - self.window_packet_size_sum[flow_id] = f[Packet.IP_PACKET_SIZE] - self.window_sum_inter_arrival_times[flow_id] = ( - timestamp - self.last_timestamp[flow_id] - ).value + self.window_packet_size_sum[flow_id] = s[Packet.IP_DATA_SIZE] + self.window_sum_inter_arrival_times[flow_id] = timestamp - self.last_timestamp[flow_id] self.last_timestamp[flow_id] = timestamp self.first_timestamp_after_yield[flow_id] = timestamp sum_processing_time += time.process_time_ns() - start_time_ref packet_count += 1 - yield f + yield s else: # Process the packet, but yield nothing. 
self.window_packet_count[flow_id] += 1 - self.window_packet_size_sum[flow_id] += f[Packet.IP_PACKET_SIZE] - self.window_sum_inter_arrival_times[flow_id] += ( - timestamp - self.last_timestamp[flow_id] - ).value + self.window_packet_size_sum[flow_id] += s[Packet.IP_DATA_SIZE] + if self.last_timestamp[flow_id] != 0: + self.window_sum_inter_arrival_times[flow_id] += timestamp - self.last_timestamp[flow_id] self.last_timestamp[flow_id] = timestamp sum_processing_time += time.process_time_ns() - start_time_ref @@ -95,7 +96,7 @@ def process(self, features: FeatureGenerator) -> FeatureGenerator: @staticmethod def input_signature(): return [ - Packet.IP_PACKET_SIZE, + Packet.IP_DATA_SIZE, Packet.IP_SOURCE_ADDRESS, Packet.IP_DESTINATION_ADDRESS, Packet.IP_SOURCE_PORT, diff --git a/code/preprocessors/__init__.py b/code/preprocessors/__init__.py index 807f573..281f6cd 100644 --- a/code/preprocessors/__init__.py +++ b/code/preprocessors/__init__.py @@ -1,6 +1,5 @@ from .CppPacketProcessor import CppPacketProcessor from .FileLabelProcessor import FileLabelProcessor -from .FlowFeatureProcessor import FlowFeatureProcessor from .HostFeatureProcessor import HostFeatureProcessor from .IPreprocessor import IPreprocessor from .WindowFlowFeatureProcessor import WindowFlowFeatureProcessor diff --git a/code/reporting/AccuracyReporter.py b/code/reporting/AccuracyReporter.py index 6298c9c..cc285f1 100644 --- a/code/reporting/AccuracyReporter.py +++ b/code/reporting/AccuracyReporter.py @@ -1,6 +1,8 @@ -from collections import defaultdict from typing import Dict, Any, List +from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, \ + recall_score + from common.features import IFeature, PredictionField from common.pipeline_logger import PipelineLogger from reporting.IReporter import IReporter @@ -9,49 +11,31 @@ class AccuracyReporter(IReporter): def __init__(self, **kwargs): super().__init__(**kwargs) - self.correct_classifications_per_model = 
defaultdict(lambda: 0) - self.false_classifications_per_model = defaultdict(lambda: 0) + self.ground_truths = [] + self.predicted_labels = [] def report(self, features: Dict[IFeature, Any]): - name = features[PredictionField.MODEL_NAME] - if ( - features[PredictionField.OUTPUT_BINARY] - == features[PredictionField.GROUND_TRUTH] - ): - self.correct_classifications_per_model[name] += 1 - else: - self.false_classifications_per_model[name] += 1 + self.ground_truths.append(features[PredictionField.GROUND_TRUTH]) + self.predicted_labels.append(features[PredictionField.OUTPUT_BINARY]) def end_processing(self): - # y_pred = self.model_instance.predict(X_test) - # - # cnf_matrix = confusion_matrix(y_test, y_pred) - # log.debug(f"\nConfusion matrix:\n\n{cnf_matrix}\n") - # - # log.info(f"Accuracy: {accuracy_score(y_test, y_pred)}") - # log.info(f"Precision: {precision_score(y_test, y_pred, average='macro')}") - # log.info(f"Recall: {recall_score(y_test, y_pred, average='macro')}") - # log.info(f"F1 score: {f1_score(y_test, y_pred, average='macro')}") - # - # if feature_names: - # log.debug("Feature importances:") - # for idx, name in enumerate(feature_names): - # log.debug( - # f"{name : <40} " - # f"{self.model_instance.feature_importances_[idx]:6.4f}" - # ) - log = PipelineLogger.get_logger() - for model in self.correct_classifications_per_model.keys(): - acc = self.correct_classifications_per_model[model] / ( - self.correct_classifications_per_model[model] - + self.false_classifications_per_model[model] - ) - log.info(f"\n---\nAccuracy report\n" - f"Model: {model}\n" - f"Correct: {self.correct_classifications_per_model[model]}\n" - f"False: {self.false_classifications_per_model[model]}\n" - f"Accuracy: {acc}\n---") + labels = sorted(set(self.ground_truths + self.predicted_labels)) + + cnf_matrix = confusion_matrix(self.ground_truths, self.predicted_labels, labels=labels) + log.info(f"\n---\nReport\n" + f"\nConfusion matrix:\n\n{cnf_matrix}\n\n" + f"Labels: {labels}\n" + 
f"(i-th row, j-th column: samples with true label i and predicted label j)\n\n" + f"Accuracy:" + f"{accuracy_score(self.ground_truths, self.predicted_labels)}\n" + f"Precision:" + f"{precision_score(self.ground_truths, self.predicted_labels, average='macro')}\n" + f"Recall:" + f"{recall_score(self.ground_truths, self.predicted_labels, average='macro')}\n" + f"F1 score: " + f"{f1_score(self.ground_truths, self.predicted_labels, average='macro')}\n---" + ) @staticmethod def input_signature() -> List[IFeature]: diff --git a/code/reporting/DistanceReporter.py b/code/reporting/DistanceReporter.py index 6f7ab11..fb2384a 100644 --- a/code/reporting/DistanceReporter.py +++ b/code/reporting/DistanceReporter.py @@ -7,15 +7,28 @@ class DistanceReporter(IReporter): + """ + Tracks the distance stored under PredictionField.OUTPUT_DISTANCE + by an unsupervised model such as an AutoEncoder. + + Can only be used when PredictionField.GROUND_TRUTH is known! + Distances are split into groups by the labels, allowing comparisons + of the average encoding deviation between different classes. 
+ """ + def __init__(self, **kwargs): super().__init__(**kwargs) self.distance_sum_per_model_and_label = defaultdict(lambda: 0) self.samples_per_model_and_label = defaultdict(lambda: 0) def report(self, features: Dict[IFeature, Any]): - key = (features[PredictionField.MODEL_NAME], features[PredictionField.GROUND_TRUTH]) - self.distance_sum_per_model_and_label[key] += \ - features[PredictionField.OUTPUT_DISTANCE] + key = ( + features[PredictionField.MODEL_NAME], + features[PredictionField.GROUND_TRUTH], + ) + self.distance_sum_per_model_and_label[key] += features[ + PredictionField.OUTPUT_DISTANCE + ] self.samples_per_model_and_label[key] += 1 def end_processing(self): @@ -23,12 +36,16 @@ def end_processing(self): report = "\n---\nDistance report\n" for key, distance_sum in self.distance_sum_per_model_and_label.items(): model, label = key - avg = self.distance_sum_per_model_and_label[key] / self.samples_per_model_and_label[key] - report += \ - f"Model: {model}\n" \ - f"Label: {label}\n" \ - f"Total samples: {self.samples_per_model_and_label[key]}\n" \ + avg = ( + self.distance_sum_per_model_and_label[key] + / self.samples_per_model_and_label[key] + ) + report += ( + f"Model: {model}\n" + f"Label: {label}\n" + f"Total samples: {self.samples_per_model_and_label[key]}\n" f"Average distance: {avg:.5E}\n\n" + ) log.info(report) @staticmethod diff --git a/code/reporting/IReporter.py b/code/reporting/IReporter.py index 14e447e..9a9a813 100644 --- a/code/reporting/IReporter.py +++ b/code/reporting/IReporter.py @@ -5,15 +5,32 @@ class IReporter(ABC): + """ + Generic interface for reporter classes to implement. + """ + @abstractmethod def report(self, features: Dict[IFeature, Any]): + """ + Performs the reporting task based on the sample's feature dictionary, + which should contain prediction information from an anomaly detection + component. 
+ """ pass @abstractmethod def end_processing(self): + """ + Callback triggered by the main pipeline at the end of + processing for eventual teardown tasks. + """ pass @staticmethod @abstractmethod def input_signature() -> List[IFeature]: + """ + Returns a list of features that the reporter requires + in each input sample for internal processing. + """ pass diff --git a/code/reporting/InfluxDBReporter.py b/code/reporting/InfluxDBReporter.py index e0f8f58..db7d932 100644 --- a/code/reporting/InfluxDBReporter.py +++ b/code/reporting/InfluxDBReporter.py @@ -1,3 +1,4 @@ +import time from typing import Dict, Any, Optional import influxdb_client @@ -7,6 +8,7 @@ from common.features import PacketFeature, IFeature, PredictionField from common import pipeline_logger +from common.functions import report_performance from reporting.IReporter import IReporter log = pipeline_logger.PipelineLogger.get_logger() @@ -46,6 +48,8 @@ def __init__( self.logger.info( f"Initialized InfluxDB writer to {self.url} " f"[{self.org}/{self.bucket}]." 
) + self.sum_processing_time = 0 + self.sample_count = 0 def success(self, conf: (str, str, str), data: str): pass @@ -59,14 +63,37 @@ def retry(self, conf: (str, str, str), data: str, exception: InfluxDBError): ) def report(self, features: Dict[IFeature, Any]): + start_time_ref = time.process_time_ns() + p = Point(self.measurement_name) p.tag(PredictionField.MODEL_NAME.value, features[PredictionField.MODEL_NAME]) - p.field( - PredictionField.OUTPUT_BINARY.value, features[PredictionField.OUTPUT_BINARY] - ) - p.field( - PredictionField.GROUND_TRUTH.value, features[PredictionField.GROUND_TRUTH] + + is_binary_classification = ( + PredictionField.OUTPUT_BINARY in features + and PredictionField.GROUND_TRUTH in features ) + is_autoencoder_distance = PredictionField.OUTPUT_DISTANCE in features + + if is_binary_classification: + p.field( + PredictionField.OUTPUT_BINARY.value, + features[PredictionField.OUTPUT_BINARY], + ) + p.field( + PredictionField.GROUND_TRUTH.value, + features[PredictionField.GROUND_TRUTH], + ) + elif is_autoencoder_distance: + p.field( + PredictionField.OUTPUT_DISTANCE.value, + features[PredictionField.OUTPUT_DISTANCE], + ) + else: + raise NotImplementedError( + "Either binary output and ground truth, or " + "output distance field must be set by the model!" + ) + # Save packet timestamp as a field -- InfluxDB timestamp will be creation time. 
p.field("packet_timestamp", features[PacketFeature.TIMESTAMP]) @@ -79,11 +106,17 @@ def report(self, features: Dict[IFeature, Any]): # p.tag(PacketFeature.PROTOCOL.value, features[PacketFeature.PROTOCOL]) self.write_api.write(bucket=self.bucket, org=self.org, record=p) + self.sum_processing_time += time.process_time_ns() - start_time_ref + self.sample_count += 1 def end_processing(self): self.write_api.flush() self.write_api.close() + report_performance( + type(self).__name__, log, self.sample_count, self.sum_processing_time + ) + @staticmethod def input_signature(): return [ diff --git a/code/split_dataset.bash b/code/split_dataset.bash index 7790487..b0ce79a 100755 --- a/code/split_dataset.bash +++ b/code/split_dataset.bash @@ -5,13 +5,17 @@ set -o pipefail valid_args=true -if [[ "$1" != "head-tail" && "$1" != "round-robin" ]]; then - valid_args=false -fi -if [[ "$1" == "head-tail" && $# != 5 ]]; then - valid_args=false -elif [[ "$1" == "round-robin" && $# != 6 ]]; then +if (( $# == 0 )); then valid_args=false +else + if [[ "$1" != "head-tail" && "$1" != "round-robin" ]]; then + valid_args=false + fi + if [[ "$1" == "head-tail" && $# != 5 ]]; then + valid_args=false + elif [[ "$1" == "round-robin" && $# != 6 ]]; then + valid_args=false + fi fi if [[ $valid_args != "true" ]]; then @@ -23,15 +27,15 @@ if [[ $valid_args != "true" ]]; then echo "" echo "Usage: ./split_dataset.bash " echo "Args head-tail: <%train> <%validation>" - echo "Args round-robin: <#split> <#train> <#validation>" + echo "Args round-robin: <#train> <#validation> <#test>" echo "" echo "head-tail: Writes the flows in input data to separate files (can be many!)," echo "then merges them into [train|validate|test].pcapng files in the output dir." echo "train.pcapng will contain the first <%train> percent of all created files," echo "validation the next <%validation> percent, remaining ones will be in test." 
echo "" - echo "round-robin: Writes the packets to <#split> files, then merges them into" - echo "[train|validate|test].pcapng files in the output directory." + echo "round-robin: Writes the packets to <#train + #validation + #test> files," + echo "then merges them into [train|validate|test].pcapng files in output directory." echo "train.pcapng will contain flows from the first <#train> files," echo "validation the next <#validation> files, remaining ones will be in test." echo "The file for each flow is assigned circularly, hence the name round-robin." @@ -43,6 +47,9 @@ mode=$1 pcap_path=$2 output_path=$3 +# Save output files with the same extension as input. +extension="${pcap_path##*.}" + mkdir $output_path tmp_dir=$output_path/pcapsplitter-tmp mkdir $tmp_dir @@ -52,10 +59,10 @@ mkdir $tmp_dir # written to one output file, separate from the other output files (usually file#0). if [[ "$mode" == "round-robin" ]]; then - split_count=$4 - train_set=$5 - validation_set=$6 - test_set=$((split_count - train_set - validation_set)) + train_set=$4 + validation_set=$5 + test_set=$6 + split_count=$((train_set + validation_set + test_set)) PcapSplitter -f $pcap_path -o $tmp_dir -m connection -p $split_count else @@ -101,8 +108,18 @@ else echo "Flows in test set: ${#test_files[@]}" >> "$readme_file" fi -mergecap -w $output_path/train.pcapng ${train_files[@]} -mergecap -w $output_path/validation.pcapng ${validation_files[@]} -mergecap -w $output_path/test.pcapng ${test_files[@]} +# Only call mergecap if there are files to be merged. Skip if e.g. +# no validation set was requested. 
+if (( ${#train_files[@]} )); then + mergecap -w $output_path/train.$extension ${train_files[@]} +fi + +if (( ${#validation_files[@]} )); then + mergecap -w $output_path/validation.$extension ${validation_files[@]} +fi + +if (( ${#test_files[@]} )); then + mergecap -w $output_path/test.$extension ${test_files[@]} +fi rm -r $tmp_dir diff --git a/configurations/examples/packet-sniffer-test.json.jinja b/configurations/drafts/packet-sniffer-test.json.jinja similarity index 96% rename from configurations/examples/packet-sniffer-test.json.jinja rename to configurations/drafts/packet-sniffer-test.json.jinja index 3ff4db6..408b1ad 100644 --- a/configurations/examples/packet-sniffer-test.json.jinja +++ b/configurations/drafts/packet-sniffer-test.json.jinja @@ -27,7 +27,6 @@ { "class": "MLPAutoEncoder", "train_new_model": false, - "skip_saving_model": true, "model_name": "packet-sniffer-based-rf", "model_storage_base_path": "{{ project_root }}/models", "encoder": @@ -35,7 +34,7 @@ "class": "DefaultEncoder", "kwargs": { "feature_filter": [ - "ip_size", + "ip_data_size", "tcp_cwr", "tcp_ece", "tcp_urg", diff --git a/configurations/examples/packet-sniffer-train.json.jinja b/configurations/drafts/packet-sniffer-train.json.jinja similarity index 97% rename from configurations/examples/packet-sniffer-train.json.jinja rename to configurations/drafts/packet-sniffer-train.json.jinja index 811e70a..9ecd367 100644 --- a/configurations/examples/packet-sniffer-train.json.jinja +++ b/configurations/drafts/packet-sniffer-train.json.jinja @@ -35,7 +35,7 @@ "class": "DefaultEncoder", "kwargs": { "feature_filter": [ - "ip_size", + "ip_data_size", "tcp_cwr", "tcp_ece", "tcp_urg", diff --git a/configurations/examples/flow-based-multi-ad-slowite-test.json.jinja b/configurations/examples/flow-based-multi-ad-slowite-test.json.jinja deleted file mode 100644 index 5290bb4..0000000 --- a/configurations/examples/flow-based-multi-ad-slowite-test.json.jinja +++ /dev/null @@ -1,125 +0,0 @@ -{ - 
"DESCRIPTION": [ - "Test an autoencoder using a segment from MQTTset's benign traffic. ", - "Uses window-based flow features at 10 ms windows." - ], - "DATA_SOURCES": [ - { - "type": "dataset", - "loader": { - "class": "PcapFileLoader", - "kwargs": { - "filepath": "{{ project_root }}/data/MQTTset/Data/PCAP/slowite.pcap", - "preprocessor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" - } - }, - "preprocessors": [ - { - "class": "CppPacketProcessor", - "kwargs": {} - }, - { - "class": "WindowFlowFeatureProcessor", - "kwargs": { - "window_size_ms": 10 - } - }, - { - "class": "FileLabelProcessor", - "kwargs": { - "label_value": "slowite" - } - } - ] - }, - { - "type": "dataset", - "loader": { - "class": "PcapFileLoader", - "kwargs": { - "filepath": "{{ project_root }}/data/MQTTset/Data/PCAP/capture_malariaDoS.pcap", - "preprocessor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" - } - }, - "preprocessors": [ - { - "class": "CppPacketProcessor", - "kwargs": {} - }, - { - "class": "WindowFlowFeatureProcessor", - "kwargs": { - "window_size_ms": 10 - } - }, - { - "class": "FileLabelProcessor", - "kwargs": { - "label_value": "capture_malariaDoS" - } - } - ] - }, - { - "type": "dataset", - "loader": { - "class": "PcapFileLoader", - "kwargs": { - "filepath": "{{ project_root }}/data/MQTTset/Data/PCAP/capture_custom_1h.pcap", - "preprocessor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" - } - }, - "preprocessors": [ - { - "class": "CppPacketProcessor", - "kwargs": {} - }, - { - "class": "WindowFlowFeatureProcessor", - "kwargs": { - "window_size_ms": 10 - } - }, - { - "class": "FileLabelProcessor", - "kwargs": { - "label_value": "capture_custom_1h" - } - } - ] - } - ], - "MODEL": - { - "class": "MLPAutoEncoderModel", - "train_new_model": false, - "skip_saving_model": false, - "model_name": "example-flow-based-multi-encoded-ae-benign", - 
"model_storage_base_path": "{{ project_root }}/models", - "encoder": - { - "class": "MultiSampleEncoder", - "kwargs": { - "feature_filter": [ - "window_flow_pkt_count", - "window_flow_sum_pkt_size", - "window_flow_avg_pkt_size", - "window_flow_inter_arrival_avg" - ] - } - } - }, - "OUTPUT": [ - { - "class": "DistanceReporter", - "kwargs": {} - } - ], - "LOG": [ - { - "level": "DEBUG", - "path": "{{ project_root }}/logs/example-flow-based-multi-encoded-ae-benign/{{ timestamp }}-log.txt" - } - ], - "VERSION": "{{ git_tag }}" -} \ No newline at end of file diff --git a/configurations/examples/flow-based-multi-rf-test.json.jinja b/configurations/examples/flow-based-multi-rf-test.json.jinja deleted file mode 100644 index 3dc56df..0000000 --- a/configurations/examples/flow-based-multi-rf-test.json.jinja +++ /dev/null @@ -1,126 +0,0 @@ -{ - "DESCRIPTION": [ - "Test a random forest classifier. slowite and malariaDoS attack data from ", - "MQTTset is labeled as 1 and a segment from MQTTset's benign traffic as 0. ", - "Uses window-based flow features at 10 ms windows." 
- ], - "DATA_SOURCES": [ - { - "type": "dataset", - "loader": { - "class": "PcapFileLoader", - "kwargs": { - "filepath": "{{ project_root }}/data/MQTTset/Data/PCAP/slowite.pcap", - "preprocessor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" - } - }, - "preprocessors": [ - { - "class": "CppPacketProcessor", - "kwargs": {} - }, - { - "class": "WindowFlowFeatureProcessor", - "kwargs": { - "window_size_ms": 10 - } - }, - { - "class": "FileLabelProcessor", - "kwargs": { - "label_value": 1 - } - } - ] - }, - { - "type": "dataset", - "loader": { - "class": "PcapFileLoader", - "kwargs": { - "filepath": "{{ project_root }}/data/MQTTset/Data/PCAP/capture_malariaDoS.pcap", - "preprocessor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" - } - }, - "preprocessors": [ - { - "class": "CppPacketProcessor", - "kwargs": {} - }, - { - "class": "WindowFlowFeatureProcessor", - "kwargs": { - "window_size_ms": 10 - } - }, - { - "class": "FileLabelProcessor", - "kwargs": { - "label_value": 1 - } - } - ] - }, - { - "type": "dataset", - "loader": { - "class": "PcapFileLoader", - "kwargs": { - "filepath": "{{ project_root }}/data/MQTTset/Data/PCAP/capture_custom_1h.pcap", - "preprocessor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" - } - }, - "preprocessors": [ - { - "class": "CppPacketProcessor", - "kwargs": {} - }, - { - "class": "WindowFlowFeatureProcessor", - "kwargs": { - "window_size_ms": 10 - } - }, - { - "class": "FileLabelProcessor", - "kwargs": { - "label_value": 0 - } - } - ] - } - ], - "MODEL": - { - "class": "RandomForestModel", - "train_new_model": false, - "skip_saving_model": true, - "model_name": "example-flow-based-multi-encoded-rf", - "model_storage_base_path": "{{ project_root }}/models", - "encoder": - { - "class": "MultiSampleEncoder", - "kwargs": { - "feature_filter": [ - "window_flow_pkt_count", - "window_flow_sum_pkt_size", - 
"window_flow_avg_pkt_size", - "window_flow_inter_arrival_avg" - ] - } - } - }, - "OUTPUT": [ - { - "class": "AccuracyReporter", - "kwargs": {} - } - ], - "LOG": [ - { - "level": "DEBUG", - "path": "{{ project_root }}/logs/example-flow-based-multi-encoded-rf/{{ timestamp }}-log.txt" - } - ], - "VERSION": "{{ git_tag }}" -} \ No newline at end of file diff --git a/configurations/examples/flow-based-multi-rf-train.json.jinja b/configurations/examples/flow-based-multi-rf-train.json.jinja deleted file mode 100644 index c11db1d..0000000 --- a/configurations/examples/flow-based-multi-rf-train.json.jinja +++ /dev/null @@ -1,126 +0,0 @@ -{ - "DESCRIPTION": [ - "Train a random forest classifier. slowite and malariaDoS attack data from ", - "MQTTset is labeled as 1 and a segment from MQTTset's benign traffic as 0. ", - "Uses window-based flow features at 10 ms windows." - ], - "DATA_SOURCES": [ - { - "type": "dataset", - "loader": { - "class": "PcapFileLoader", - "kwargs": { - "filepath": "{{ project_root }}/data/MQTTset/Data/PCAP/slowite.pcap", - "preprocessor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" - } - }, - "preprocessors": [ - { - "class": "CppPacketProcessor", - "kwargs": {} - }, - { - "class": "WindowFlowFeatureProcessor", - "kwargs": { - "window_size_ms": 10 - } - }, - { - "class": "FileLabelProcessor", - "kwargs": { - "label_value": 1 - } - } - ] - }, - { - "type": "dataset", - "loader": { - "class": "PcapFileLoader", - "kwargs": { - "filepath": "{{ project_root }}/data/MQTTset/Data/PCAP/capture_malariaDoS.pcap", - "preprocessor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" - } - }, - "preprocessors": [ - { - "class": "CppPacketProcessor", - "kwargs": {} - }, - { - "class": "WindowFlowFeatureProcessor", - "kwargs": { - "window_size_ms": 10 - } - }, - { - "class": "FileLabelProcessor", - "kwargs": { - "label_value": 1 - } - } - ] - }, - { - "type": "dataset", - "loader": { - 
"class": "PcapFileLoader", - "kwargs": { - "filepath": "{{ project_root }}/data/MQTTset/Data/PCAP/capture_custom_1h.pcap", - "preprocessor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" - } - }, - "preprocessors": [ - { - "class": "CppPacketProcessor", - "kwargs": {} - }, - { - "class": "WindowFlowFeatureProcessor", - "kwargs": { - "window_size_ms": 10 - } - }, - { - "class": "FileLabelProcessor", - "kwargs": { - "label_value": 0 - } - } - ] - } - ], - "MODEL": - { - "class": "RandomForestModel", - "train_new_model": true, - "skip_saving_model": false, - "model_name": "example-flow-based-multi-encoded-rf", - "model_storage_base_path": "{{ project_root }}/models", - "encoder": - { - "class": "MultiSampleEncoder", - "kwargs": { - "feature_filter": [ - "window_flow_pkt_count", - "window_flow_sum_pkt_size", - "window_flow_avg_pkt_size", - "window_flow_inter_arrival_avg" - ] - } - } - }, - "OUTPUT": [ - { - "class": "AccuracyReporter", - "kwargs": {} - } - ], - "LOG": [ - { - "level": "DEBUG", - "path": "{{ project_root }}/models/example-flow-based-multi-encoded-rf/{{ timestamp }}-training-log.txt" - } - ], - "VERSION": "{{ git_tag }}" -} \ No newline at end of file diff --git a/configurations/examples/flow-based-rf-test.json.jinja b/configurations/examples/flow-based-rf-test.json.jinja deleted file mode 100644 index 92c178a..0000000 --- a/configurations/examples/flow-based-rf-test.json.jinja +++ /dev/null @@ -1,120 +0,0 @@ -{ - "DESCRIPTION": [ - "Test a random forest classifier. slowite and malariaDoS attack data from ", - "MQTTset is labeled as 1 and a segment from MQTTset's benign traffic as 0. ", - "Uses window-based flow features at 10 ms windows." 
- ], - "DATA_SOURCES": [ - { - "type": "dataset", - "loader": { - "class": "PcapFileLoader", - "kwargs": { - "filepath": "{{ project_root }}/data/MQTTset/Data/PCAP/slowite.pcap", - "preprocessor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" - } - }, - "preprocessors": [ - { - "class": "CppPacketProcessor", - "kwargs": {} - }, - { - "class": "WindowFlowFeatureProcessor", - "kwargs": { - "window_size_ms": 10 - } - }, - { - "class": "FileLabelProcessor", - "kwargs": { - "label_value": 1 - } - } - ] - }, - { - "type": "dataset", - "loader": { - "class": "PcapFileLoader", - "kwargs": { - "filepath": "{{ project_root }}/data/MQTTset/Data/PCAP/capture_malariaDoS.pcap", - "preprocessor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" - } - }, - "preprocessors": [ - { - "class": "CppPacketProcessor", - "kwargs": {} - }, - { - "class": "WindowFlowFeatureProcessor", - "kwargs": { - "window_size_ms": 10 - } - }, - { - "class": "FileLabelProcessor", - "kwargs": { - "label_value": 1 - } - } - ] - }, - { - "type": "dataset", - "loader": { - "class": "PcapFileLoader", - "kwargs": { - "filepath": "{{ project_root }}/data/MQTTset/Data/PCAP/capture_custom_1h.pcap", - "preprocessor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" - } - }, - "preprocessors": [ - { - "class": "CppPacketProcessor", - "kwargs": {} - }, - { - "class": "WindowFlowFeatureProcessor", - "kwargs": { - "window_size_ms": 10 - } - }, - { - "class": "FileLabelProcessor", - "kwargs": { - "label_value": 0 - } - } - ] - } - ], - "MODEL": - { - "class": "RandomForestModel", - "train_new_model": false, - "skip_saving_model": true, - "model_name": "example-flow-based-rf", - "model_storage_base_path": "{{ project_root }}/models", - "encoder": - { - "class": "DefaultEncoder", - "kwargs": { - "feature_filter": [ - "window_flow_pkt_count", - "window_flow_sum_pkt_size", - "window_flow_avg_pkt_size", - 
"window_flow_inter_arrival_avg" - ] - } - } - }, - "OUTPUT": [ - { - "class": "AccuracyReporter", - "kwargs": {} - } - ], - "VERSION": "{{ git_tag }}" -} \ No newline at end of file diff --git a/configurations/examples/flow-based-rf-train.json.jinja b/configurations/examples/flow-based-rf-train.json.jinja deleted file mode 100644 index d2cfe7e..0000000 --- a/configurations/examples/flow-based-rf-train.json.jinja +++ /dev/null @@ -1,114 +0,0 @@ -{ - "DESCRIPTION": [ - "Train a random forest classifier. slowite and malariaDoS attack data from ", - "MQTTset is labeled as 1 and a segment from MQTTset's benign traffic as 0. ", - "Uses window-based flow features at 10 ms windows." - ], - "DATA_SOURCES": [ - { - "type": "dataset", - "loader": { - "class": "PcapFileLoader", - "kwargs": { - "filepath": "{{ project_root }}/data/MQTTset/Data/PCAP/slowite.pcap", - "preprocessor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" - } - }, - "preprocessors": [ - { - "class": "CppPacketProcessor", - "kwargs": {} - }, - { - "class": "WindowFlowFeatureProcessor", - "kwargs": { - "window_size_ms": 10 - } - }, - { - "class": "FileLabelProcessor", - "kwargs": { - "label_value": 1 - } - } - ] - }, - { - "type": "dataset", - "loader": { - "class": "PcapFileLoader", - "kwargs": { - "filepath": "{{ project_root }}/data/MQTTset/Data/PCAP/capture_malariaDoS.pcap", - "preprocessor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" - } - }, - "preprocessors": [ - { - "class": "CppPacketProcessor", - "kwargs": {} - }, - { - "class": "WindowFlowFeatureProcessor", - "kwargs": { - "window_size_ms": 10 - } - }, - { - "class": "FileLabelProcessor", - "kwargs": { - "label_value": 1 - } - } - ] - }, - { - "type": "dataset", - "loader": { - "class": "PcapFileLoader", - "kwargs": { - "filepath": "{{ project_root }}/data/MQTTset/Data/PCAP/capture_custom_1h.pcap", - "preprocessor_path": "{{ project_root 
}}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" - } - }, - "preprocessors": [ - { - "class": "CppPacketProcessor", - "kwargs": {} - }, - { - "class": "WindowFlowFeatureProcessor", - "kwargs": { - "window_size_ms": 10 - } - }, - { - "class": "FileLabelProcessor", - "kwargs": { - "label_value": 0 - } - } - ] - } - ], - "MODEL": - { - "class": "RandomForestModel", - "train_new_model": true, - "skip_saving_model": false, - "model_name": "example-flow-based-rf", - "model_storage_base_path": "{{ project_root }}/models", - "encoder": - { - "class": "DefaultEncoder", - "kwargs": { - "feature_filter": [ - "window_flow_pkt_count", - "window_flow_sum_pkt_size", - "window_flow_avg_pkt_size", - "window_flow_inter_arrival_avg" - ] - } - } - }, - "VERSION": "{{ git_tag }}" -} \ No newline at end of file diff --git a/configurations/examples/packet-based-rf-test.json.jinja b/configurations/examples/packet-based-rf-test.json.jinja deleted file mode 100644 index b7a3a5f..0000000 --- a/configurations/examples/packet-based-rf-test.json.jinja +++ /dev/null @@ -1,121 +0,0 @@ -{ - "DESCRIPTION": [ - "Test a packet-based random forest classifier. slowite and malariaDoS ", - "attack data from MQTTset is labeled as 1 and a segment from MQTTset's ", - "benign traffic as 0. Uses packet size and TCP flags as features." 
- ], - "DATA_SOURCES": [ - { - "type": "dataset", - "loader": { - "class": "PcapFileLoader", - "kwargs": { - "filepath": "{{ project_root }}/data/MQTTset/Data/PCAP/slowite.pcap", - "preprocessor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" - } - }, - "preprocessors": [ - { - "class": "CppPacketProcessor", - "kwargs": {} - }, - { - "class": "HostFeatureProcessor", - "kwargs": {} - }, - { - "class": "FileLabelProcessor", - "kwargs": { - "label_value": 1 - } - } - ] - }, - { - "type": "dataset", - "loader": { - "class": "PcapFileLoader", - "kwargs": { - "filepath": "{{ project_root }}/data/MQTTset/Data/PCAP/capture_malariaDoS.pcap", - "preprocessor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" - } - }, - "preprocessors": [ - { - "class": "CppPacketProcessor", - "kwargs": {} - }, - { - "class": "HostFeatureProcessor", - "kwargs": {} - }, - { - "class": "FileLabelProcessor", - "kwargs": { - "label_value": 1 - } - } - ] - }, - { - "type": "dataset", - "loader": { - "class": "PcapFileLoader", - "kwargs": { - "filepath": "{{ project_root }}/data/MQTTset/Data/PCAP/capture_custom_1h.pcap", - "preprocessor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" - } - }, - "preprocessors": [ - { - "class": "CppPacketProcessor", - "kwargs": {} - }, - { - "class": "HostFeatureProcessor", - "kwargs": {} - }, - { - "class": "FileLabelProcessor", - "kwargs": { - "label_value": 0 - } - } - ] - } - ], - "MODEL": - { - "class": "RandomForestModel", - "train_new_model": false, - "skip_saving_model": true, - "model_name": "example-packet-based-rf", - "model_storage_base_path": "{{ project_root }}/models", - "encoder": - { - "class": "DefaultEncoder", - "kwargs": { - "feature_filter": [ - "ground_truth", - "ip_size", - "tcp_cwr", - "tcp_ece", - "tcp_urg", - "tcp_ack", - "tcp_psh", - "tcp_rst", - "tcp_syn", - "tcp_fin" - ] - } - } - }, - "OUTPUT": [ - { - "class": 
"AccuracyReporter", - "kwargs": {} - } - ], - "CPP_FEATURE_EXTRACTOR": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction", - "VERSION": "{{ git_tag }}" -} \ No newline at end of file diff --git a/configurations/examples/packet-based-rf-train.json.jinja b/configurations/examples/packet-based-rf-train.json.jinja deleted file mode 100644 index e8e204a..0000000 --- a/configurations/examples/packet-based-rf-train.json.jinja +++ /dev/null @@ -1,115 +0,0 @@ -{ - "DESCRIPTION": [ - "Train a packet-based random forest classifier. slowite and malariaDoS ", - "attack data from MQTTset is labeled as 1 and a segment from MQTTset's ", - "benign traffic as 0. Uses packet size and TCP flags as features." - ], - "DATA_SOURCES": [ - { - "type": "dataset", - "loader": { - "class": "PcapFileLoader", - "kwargs": { - "filepath": "{{ project_root }}/data/MQTTset/Data/PCAP/slowite.pcap", - "preprocessor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" - } - }, - "preprocessors": [ - { - "class": "CppPacketProcessor", - "kwargs": {} - }, - { - "class": "HostFeatureProcessor", - "kwargs": {} - }, - { - "class": "FileLabelProcessor", - "kwargs": { - "label_value": 1 - } - } - ] - }, - { - "type": "dataset", - "loader": { - "class": "PcapFileLoader", - "kwargs": { - "filepath": "{{ project_root }}/data/MQTTset/Data/PCAP/capture_malariaDoS.pcap", - "preprocessor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" - } - }, - "preprocessors": [ - { - "class": "CppPacketProcessor", - "kwargs": {} - }, - { - "class": "HostFeatureProcessor", - "kwargs": {} - }, - { - "class": "FileLabelProcessor", - "kwargs": { - "label_value": 1 - } - } - ] - }, - { - "type": "dataset", - "loader": { - "class": "PcapFileLoader", - "kwargs": { - "filepath": "{{ project_root }}/data/MQTTset/Data/PCAP/capture_custom_1h.pcap", - "preprocessor_path": "{{ project_root 
}}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" - } - }, - "preprocessors": [ - { - "class": "CppPacketProcessor", - "kwargs": {} - }, - { - "class": "HostFeatureProcessor", - "kwargs": {} - }, - { - "class": "FileLabelProcessor", - "kwargs": { - "label_value": 0 - } - } - ] - } - ], - "MODEL": - { - "class": "RandomForestModel", - "train_new_model": true, - "skip_saving_model": false, - "model_name": "example-packet-based-rf", - "model_storage_base_path": "{{ project_root }}/models", - "encoder": - { - "class": "DefaultEncoder", - "kwargs": { - "feature_filter": [ - "ground_truth", - "ip_size", - "tcp_cwr", - "tcp_ece", - "tcp_urg", - "tcp_ack", - "tcp_psh", - "tcp_rst", - "tcp_syn", - "tcp_fin" - ] - } - } - }, - "CPP_FEATURE_EXTRACTOR": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction", - "VERSION": "{{ git_tag }}" -} \ No newline at end of file diff --git a/configurations/examples/test/host-multi-ae-test.json.jinja2 b/configurations/examples/test/host-multi-ae-test.json.jinja2 new file mode 100644 index 0000000..7b6b466 --- /dev/null +++ b/configurations/examples/test/host-multi-ae-test.json.jinja2 @@ -0,0 +1,196 @@ +{ + "DESCRIPTION": [ + "" + ], + "DATA_SOURCES": [ + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/benign/benign-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 0 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/bruteforce/bruteforce-test.pcapng", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + 
"preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/flood/flood-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 2 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malariados/malariados-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 3 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malformed/malformed-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 4 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/slowite/slowite-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": 
"CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 5 + } + } + ] + } + ], + "MODEL": + { + "class": "MLPAutoEncoderModel", + "train_new_model": false, + "model_name": "host-multi-ae", + "model_storage_base_path": "{{ project_root }}/models", + "encoder": + { + "class": "MultiSampleEncoder", + "kwargs": { + "feature_filter": [ + "host_rcv_pkt_count", + "host_sum_rcv_pkt_size", + "host_avg_rcv_pkt_size", + "host_sent_pkt_count", + "host_sum_sent_pkt_size", + "host_avg_sent_pkt_size", + "host_inter_arrival_last", + "host_inter_arrival_avg", + "host_conn_timedelta" + ], + "max_time_window_ms": 100, + "max_array_size": 1000 + } + } + }, + "OUTPUT": [ + { + "class": "DistanceReporter", + "kwargs": {} + } + ], + "LOG": [ + { + "level": "DEBUG", + "path": "{{ project_root }}/logs/{{ config_file_name }}/{{ timestamp }}-test-log.txt" + } + ], + "VERSION": "{{ git_tag }}" +} \ No newline at end of file diff --git a/configurations/examples/test/host-multi-rf-test.json.jinja2 b/configurations/examples/test/host-multi-rf-test.json.jinja2 new file mode 100644 index 0000000..d491760 --- /dev/null +++ b/configurations/examples/test/host-multi-rf-test.json.jinja2 @@ -0,0 +1,196 @@ +{ + "DESCRIPTION": [ + "" + ], + "DATA_SOURCES": [ + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/benign/benign-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 0 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/bruteforce/bruteforce-test.pcapng", + 
"packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/flood/flood-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malariados/malariados-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malformed/malformed-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/slowite/slowite-test.pcap", + "packet_processor_path": "{{ project_root 
}}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + } + ], + "MODEL": + { + "class": "RandomForestModel", + "train_new_model": false, + "model_name": "host-multi-rf", + "model_storage_base_path": "{{ project_root }}/models", + "encoder": + { + "class": "MultiSampleEncoder", + "kwargs": { + "feature_filter": [ + "host_rcv_pkt_count", + "host_sum_rcv_pkt_size", + "host_avg_rcv_pkt_size", + "host_sent_pkt_count", + "host_sum_sent_pkt_size", + "host_avg_sent_pkt_size", + "host_inter_arrival_last", + "host_inter_arrival_avg", + "host_conn_timedelta" + ], + "max_time_window_ms": 100, + "max_array_size": 1000 + } + } + }, + "OUTPUT": [ + { + "class": "AccuracyReporter", + "kwargs": {} + } + ], + "LOG": [ + { + "level": "DEBUG", + "path": "{{ project_root }}/logs/{{ config_file_name }}/{{ timestamp }}-test-log.txt" + } + ], + "VERSION": "{{ git_tag }}" +} \ No newline at end of file diff --git a/configurations/examples/test/host-single-ae-test.json.jinja2 b/configurations/examples/test/host-single-ae-test.json.jinja2 new file mode 100644 index 0000000..e5ac0ee --- /dev/null +++ b/configurations/examples/test/host-single-ae-test.json.jinja2 @@ -0,0 +1,194 @@ +{ + "DESCRIPTION": [ + "" + ], + "DATA_SOURCES": [ + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/benign/benign-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 0 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + 
"kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/bruteforce/bruteforce-test.pcapng", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/flood/flood-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 2 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malariados/malariados-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 3 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malformed/malformed-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 4 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ 
project_root }}/data/MQTTset-reduced/slowite/slowite-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 5 + } + } + ] + } + ], + "MODEL": + { + "class": "MLPAutoEncoderModel", + "train_new_model": false, + "model_name": "host-single-ae", + "model_storage_base_path": "{{ project_root }}/models", + "encoder": + { + "class": "DefaultEncoder", + "kwargs": { + "feature_filter": [ + "host_rcv_pkt_count", + "host_sum_rcv_pkt_size", + "host_avg_rcv_pkt_size", + "host_sent_pkt_count", + "host_sum_sent_pkt_size", + "host_avg_sent_pkt_size", + "host_inter_arrival_last", + "host_inter_arrival_avg", + "host_conn_timedelta" + ] + } + } + }, + "OUTPUT": [ + { + "class": "DistanceReporter", + "kwargs": {} + } + ], + "LOG": [ + { + "level": "DEBUG", + "path": "{{ project_root }}/logs/{{ config_file_name }}/{{ timestamp }}-test-log.txt" + } + ], + "VERSION": "{{ git_tag }}" +} \ No newline at end of file diff --git a/configurations/examples/test/host-single-rf-test.json.jinja2 b/configurations/examples/test/host-single-rf-test.json.jinja2 new file mode 100644 index 0000000..2aeeddd --- /dev/null +++ b/configurations/examples/test/host-single-rf-test.json.jinja2 @@ -0,0 +1,194 @@ +{ + "DESCRIPTION": [ + "" + ], + "DATA_SOURCES": [ + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/benign/benign-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 0 + } + } + ] + 
}, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/bruteforce/bruteforce-test.pcapng", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/flood/flood-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malariados/malariados-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malformed/malformed-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": 
"PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/slowite/slowite-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + } + ], + "MODEL": + { + "class": "RandomForestModel", + "train_new_model": false, + "model_name": "host-single-rf", + "model_storage_base_path": "{{ project_root }}/models", + "encoder": + { + "class": "DefaultEncoder", + "kwargs": { + "feature_filter": [ + "host_rcv_pkt_count", + "host_sum_rcv_pkt_size", + "host_avg_rcv_pkt_size", + "host_sent_pkt_count", + "host_sum_sent_pkt_size", + "host_avg_sent_pkt_size", + "host_inter_arrival_last", + "host_inter_arrival_avg", + "host_conn_timedelta" + ] + } + } + }, + "OUTPUT": [ + { + "class": "AccuracyReporter", + "kwargs": {} + } + ], + "LOG": [ + { + "level": "DEBUG", + "path": "{{ project_root }}/logs/{{ config_file_name }}/{{ timestamp }}-test-log.txt" + } + ], + "VERSION": "{{ git_tag }}" +} \ No newline at end of file diff --git a/configurations/examples/test/packet-multi-ae-test.json.jinja2 b/configurations/examples/test/packet-multi-ae-test.json.jinja2 new file mode 100644 index 0000000..71308e5 --- /dev/null +++ b/configurations/examples/test/packet-multi-ae-test.json.jinja2 @@ -0,0 +1,175 @@ +{ + "DESCRIPTION": [ + "" + ], + "DATA_SOURCES": [ + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/benign/benign-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 0 + } + } + ] + }, + { + 
"loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/bruteforce/bruteforce-test.pcapng", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/flood/flood-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 2 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malariados/malariados-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 3 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malformed/malformed-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 4 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/slowite/slowite-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + 
"preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 5 + } + } + ] + } + ], + "MODEL": + { + "class": "MLPAutoEncoderModel", + "train_new_model": false, + "model_name": "packet-multi-ae", + "model_storage_base_path": "{{ project_root }}/models", + "encoder": + { + "class": "MultiSampleEncoder", + "kwargs": { + "feature_filter": [ + "ip_header_size", + "ip_data_size", + "tcp_header_size", + "tcp_size", + "tcp_cwr", + "tcp_ece", + "tcp_urg", + "tcp_ack", + "tcp_psh", + "tcp_rst", + "tcp_syn", + "tcp_fin" + ], + "max_time_window_ms": 100, + "max_array_size": 1000 + } + } + }, + "OUTPUT": [ + { + "class": "DistanceReporter", + "kwargs": {} + } + ], + "LOG": [ + { + "level": "DEBUG", + "path": "{{ project_root }}/logs/{{ config_file_name }}/{{ timestamp }}-test-log.txt" + } + ], + "VERSION": "{{ git_tag }}" +} \ No newline at end of file diff --git a/configurations/examples/test/packet-multi-rf-test.json.jinja2 b/configurations/examples/test/packet-multi-rf-test.json.jinja2 new file mode 100644 index 0000000..9a00f26 --- /dev/null +++ b/configurations/examples/test/packet-multi-rf-test.json.jinja2 @@ -0,0 +1,175 @@ +{ + "DESCRIPTION": [ + "" + ], + "DATA_SOURCES": [ + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/benign/benign-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 0 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/bruteforce/bruteforce-test.pcapng", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + 
"class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/flood/flood-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malariados/malariados-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malformed/malformed-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/slowite/slowite-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + } + ], + "MODEL": + { + "class": "RandomForestModel", + "train_new_model": false, + "model_name": "packet-multi-rf", + "model_storage_base_path": 
"{{ project_root }}/models", + "encoder": + { + "class": "MultiSampleEncoder", + "kwargs": { + "feature_filter": [ + "ip_header_size", + "ip_data_size", + "tcp_header_size", + "tcp_size", + "tcp_cwr", + "tcp_ece", + "tcp_urg", + "tcp_ack", + "tcp_psh", + "tcp_rst", + "tcp_syn", + "tcp_fin" + ], + "max_time_window_ms": 100, + "max_array_size": 1000 + } + } + }, + "OUTPUT": [ + { + "class": "AccuracyReporter", + "kwargs": {} + } + ], + "LOG": [ + { + "level": "DEBUG", + "path": "{{ project_root }}/logs/{{ config_file_name }}/{{ timestamp }}-test-log.txt" + } + ], + "VERSION": "{{ git_tag }}" +} \ No newline at end of file diff --git a/configurations/examples/test/packet-single-ae-test.json.jinja2 b/configurations/examples/test/packet-single-ae-test.json.jinja2 new file mode 100644 index 0000000..8eeb9b0 --- /dev/null +++ b/configurations/examples/test/packet-single-ae-test.json.jinja2 @@ -0,0 +1,173 @@ +{ + "DESCRIPTION": [ + "" + ], + "DATA_SOURCES": [ + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/benign/benign-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 0 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/bruteforce/bruteforce-test.pcapng", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/flood/flood-test.pcap", + "packet_processor_path": "{{ 
project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 2 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malariados/malariados-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 3 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malformed/malformed-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 4 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/slowite/slowite-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 5 + } + } + ] + } + ], + "MODEL": + { + "class": "MLPAutoEncoderModel", + "train_new_model": false, + "model_name": "packet-single-ae", + "model_storage_base_path": "{{ project_root }}/models", + "encoder": + { + "class": "DefaultEncoder", + "kwargs": { + "feature_filter": [ + "ip_header_size", + "ip_data_size", + "tcp_header_size", + "tcp_size", + "tcp_cwr", + "tcp_ece", + "tcp_urg", + "tcp_ack", + "tcp_psh", + "tcp_rst", + "tcp_syn", + "tcp_fin" + ] + } + } + }, + 
"OUTPUT": [ + { + "class": "DistanceReporter", + "kwargs": {} + } + ], + "LOG": [ + { + "level": "DEBUG", + "path": "{{ project_root }}/logs/{{ config_file_name }}/{{ timestamp }}-test-log.txt" + } + ], + "VERSION": "{{ git_tag }}" +} \ No newline at end of file diff --git a/configurations/examples/test/packet-single-rf-test.json.jinja2 b/configurations/examples/test/packet-single-rf-test.json.jinja2 new file mode 100644 index 0000000..fdb8849 --- /dev/null +++ b/configurations/examples/test/packet-single-rf-test.json.jinja2 @@ -0,0 +1,173 @@ +{ + "DESCRIPTION": [ + "" + ], + "DATA_SOURCES": [ + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/benign/benign-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 0 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/bruteforce/bruteforce-test.pcapng", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/flood/flood-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root 
}}/data/MQTTset-reduced/malariados/malariados-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malformed/malformed-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/slowite/slowite-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + } + ], + "MODEL": + { + "class": "RandomForestModel", + "train_new_model": false, + "model_name": "packet-single-rf", + "model_storage_base_path": "{{ project_root }}/models", + "encoder": + { + "class": "DefaultEncoder", + "kwargs": { + "feature_filter": [ + "ip_header_size", + "ip_data_size", + "tcp_header_size", + "tcp_size", + "tcp_cwr", + "tcp_ece", + "tcp_urg", + "tcp_ack", + "tcp_psh", + "tcp_rst", + "tcp_syn", + "tcp_fin" + ] + } + } + }, + "OUTPUT": [ + { + "class": "AccuracyReporter", + "kwargs": {} + } + ], + "LOG": [ + { + "level": "DEBUG", + "path": "{{ project_root }}/logs/{{ config_file_name }}/{{ timestamp }}-test-log.txt" + } + ], + "VERSION": "{{ git_tag }}" +} \ No newline at end of file diff --git a/configurations/examples/test/window-multi-ae-test.json.jinja2 
b/configurations/examples/test/window-multi-ae-test.json.jinja2 new file mode 100644 index 0000000..960ceb0 --- /dev/null +++ b/configurations/examples/test/window-multi-ae-test.json.jinja2 @@ -0,0 +1,203 @@ +{ + "DESCRIPTION": [ + "" + ], + "DATA_SOURCES": [ + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/benign/benign-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 0 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/bruteforce/bruteforce-test.pcapng", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/flood/flood-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 2 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malariados/malariados-test.pcap", + "packet_processor_path": "{{ 
project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 3 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malformed/malformed-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 4 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/slowite/slowite-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 5 + } + } + ] + } + ], + "MODEL": + { + "class": "MLPAutoEncoderModel", + "train_new_model": false, + "model_name": "window-multi-ae", + "model_storage_base_path": "{{ project_root }}/models", + "encoder": + { + "class": "MultiSampleEncoder", + "kwargs": { + "feature_filter": [ + "window_flow_pkt_count", + "window_flow_sum_pkt_size", + "window_flow_avg_pkt_size", + "window_flow_inter_arrival_avg" + ], + "max_time_window_ms": 100, + "max_array_size": 1000 + } + } + }, + "OUTPUT": [ + { + "class": "DistanceReporter", + "kwargs": {} + } + ], + "LOG": [ + { + "level": "DEBUG", + "path": "{{ project_root }}/logs/{{ 
config_file_name }}/{{ timestamp }}-test-log.txt" + } + ], + "VERSION": "{{ git_tag }}" +} \ No newline at end of file diff --git a/configurations/examples/test/window-multi-rf-test.json.jinja2 b/configurations/examples/test/window-multi-rf-test.json.jinja2 new file mode 100644 index 0000000..a3b7a87 --- /dev/null +++ b/configurations/examples/test/window-multi-rf-test.json.jinja2 @@ -0,0 +1,203 @@ +{ + "DESCRIPTION": [ + "" + ], + "DATA_SOURCES": [ + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/benign/benign-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 0 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/bruteforce/bruteforce-test.pcapng", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/flood/flood-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } 
+ } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malariados/malariados-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malformed/malformed-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/slowite/slowite-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + } + ], + "MODEL": + { + "class": "RandomForestModel", + "train_new_model": false, + "model_name": "window-multi-rf", + "model_storage_base_path": "{{ project_root }}/models", + "encoder": + { + "class": "MultiSampleEncoder", + "kwargs": { + "feature_filter": [ + "window_flow_pkt_count", + "window_flow_sum_pkt_size", + "window_flow_avg_pkt_size", + "window_flow_inter_arrival_avg" + ], + "max_time_window_ms": 100, + 
"max_array_size": 1000 + } + } + }, + "OUTPUT": [ + { + "class": "AccuracyReporter", + "kwargs": {} + } + ], + "LOG": [ + { + "level": "DEBUG", + "path": "{{ project_root }}/logs/{{ config_file_name }}/{{ timestamp }}-test-log.txt" + } + ], + "VERSION": "{{ git_tag }}" +} \ No newline at end of file diff --git a/configurations/examples/test/window-single-ae-test.json.jinja2 b/configurations/examples/test/window-single-ae-test.json.jinja2 new file mode 100644 index 0000000..3835a4c --- /dev/null +++ b/configurations/examples/test/window-single-ae-test.json.jinja2 @@ -0,0 +1,203 @@ +{ + "DESCRIPTION": [ + "" + ], + "DATA_SOURCES": [ + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/benign/benign-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 0 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/bruteforce/bruteforce-test.pcapng", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/flood/flood-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", 
+ "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 2 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malariados/malariados-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 3 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malformed/malformed-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 4 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/slowite/slowite-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 5 + } + } + ] + } + ], + "MODEL": + { + "class": "MLPAutoEncoderModel", + "train_new_model": false, + "model_name": "window-single-ae", + "model_storage_base_path": "{{ project_root }}/models", + "encoder": + { + "class": "DefaultEncoder", + "kwargs": { 
+ "feature_filter": [ + "window_flow_pkt_count", + "window_flow_sum_pkt_size", + "window_flow_avg_pkt_size", + "window_flow_inter_arrival_avg" + ], + "max_time_window_ms": 100, + "max_array_size": 1000 + } + } + }, + "OUTPUT": [ + { + "class": "DistanceReporter", + "kwargs": {} + } + ], + "LOG": [ + { + "level": "DEBUG", + "path": "{{ project_root }}/logs/{{ config_file_name }}/{{ timestamp }}-test-log.txt" + } + ], + "VERSION": "{{ git_tag }}" +} \ No newline at end of file diff --git a/configurations/examples/test/window-single-rf-test.json.jinja2 b/configurations/examples/test/window-single-rf-test.json.jinja2 new file mode 100644 index 0000000..de1a6cc --- /dev/null +++ b/configurations/examples/test/window-single-rf-test.json.jinja2 @@ -0,0 +1,203 @@ +{ + "DESCRIPTION": [ + "" + ], + "DATA_SOURCES": [ + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/benign/benign-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 0 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/bruteforce/bruteforce-test.pcapng", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root 
}}/data/MQTTset-reduced/flood/flood-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malariados/malariados-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malformed/malformed-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/slowite/slowite-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + } + ], + 
"MODEL": + { + "class": "RandomForestModel", + "train_new_model": false, + "model_name": "window-single-rf", + "model_storage_base_path": "{{ project_root }}/models", + "encoder": + { + "class": "DefaultEncoder", + "kwargs": { + "feature_filter": [ + "window_flow_pkt_count", + "window_flow_sum_pkt_size", + "window_flow_avg_pkt_size", + "window_flow_inter_arrival_avg" + ], + "max_time_window_ms": 100, + "max_array_size": 1000 + } + } + }, + "OUTPUT": [ + { + "class": "AccuracyReporter", + "kwargs": {} + } + ], + "LOG": [ + { + "level": "DEBUG", + "path": "{{ project_root }}/logs/{{ config_file_name }}/{{ timestamp }}-test-log.txt" + } + ], + "VERSION": "{{ git_tag }}" +} \ No newline at end of file diff --git a/configurations/examples/train/host-multi-ae-train.json.jinja2 b/configurations/examples/train/host-multi-ae-train.json.jinja2 new file mode 100644 index 0000000..63448db --- /dev/null +++ b/configurations/examples/train/host-multi-ae-train.json.jinja2 @@ -0,0 +1,64 @@ +{ + "DESCRIPTION": [ + "" + ], + "DATA_SOURCES": [ + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/benign/benign-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + } + ] + } + ], + "MODEL": + { + "class": "MLPAutoEncoderModel", + "train_new_model": true, + "skip_saving_model": false, + "model_name": "host-multi-ae", + "model_storage_base_path": "{{ project_root }}/models", + "encoder": + { + "class": "MultiSampleEncoder", + "kwargs": { + "feature_filter": [ + "host_rcv_pkt_count", + "host_sum_rcv_pkt_size", + "host_avg_rcv_pkt_size", + "host_sent_pkt_count", + "host_sum_sent_pkt_size", + "host_avg_sent_pkt_size", + "host_inter_arrival_last", + "host_inter_arrival_avg", + "host_conn_timedelta" + ] + } + } + }, + 
"OUTPUT": [ + { + "class": "DistanceReporter", + "kwargs": {} + } + ], + "LOG": [ + { + "level": "DEBUG", + "path": "{{ project_root }}/logs/{{ config_file_name }}/{{ timestamp }}-train-log.txt" + } + ], + "VERSION": "{{ git_tag }}" +} \ No newline at end of file diff --git a/configurations/examples/train/host-multi-rf-train.json.jinja2 b/configurations/examples/train/host-multi-rf-train.json.jinja2 new file mode 100644 index 0000000..855cc79 --- /dev/null +++ b/configurations/examples/train/host-multi-rf-train.json.jinja2 @@ -0,0 +1,195 @@ +{ + "DESCRIPTION": [ + "" + ], + "DATA_SOURCES": [ + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/benign/benign-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 0 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/bruteforce/bruteforce-train.pcapng", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/flood/flood-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": 
"FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malariados/malariados-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malformed/malformed-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/slowite/slowite-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + } + ], + "MODEL": + { + "class": "RandomForestModel", + "train_new_model": true, + "skip_saving_model": false, + "model_name": "host-multi-rf", + "model_storage_base_path": "{{ project_root }}/models", + "encoder": + { + "class": "MultiSampleEncoder", + "kwargs": { + "feature_filter": [ + "host_rcv_pkt_count", + "host_sum_rcv_pkt_size", + "host_avg_rcv_pkt_size", + "host_sent_pkt_count", + "host_sum_sent_pkt_size", + "host_avg_sent_pkt_size", + 
"host_inter_arrival_last", + "host_inter_arrival_avg", + "host_conn_timedelta" + ] + } + } + }, + "OUTPUT": [ + { + "class": "AccuracyReporter", + "kwargs": {} + } + ], + "LOG": [ + { + "level": "DEBUG", + "path": "{{ project_root }}/logs/{{ config_file_name }}/{{ timestamp }}-train-log.txt" + } + ], + "VERSION": "{{ git_tag }}" +} \ No newline at end of file diff --git a/configurations/examples/train/host-single-ae-train.json.jinja2 b/configurations/examples/train/host-single-ae-train.json.jinja2 new file mode 100644 index 0000000..32072af --- /dev/null +++ b/configurations/examples/train/host-single-ae-train.json.jinja2 @@ -0,0 +1,64 @@ +{ + "DESCRIPTION": [ + "" + ], + "DATA_SOURCES": [ + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/benign/benign-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + } + ] + } + ], + "MODEL": + { + "class": "MLPAutoEncoderModel", + "train_new_model": true, + "skip_saving_model": false, + "model_name": "host-single-ae", + "model_storage_base_path": "{{ project_root }}/models", + "encoder": + { + "class": "DefaultEncoder", + "kwargs": { + "feature_filter": [ + "host_rcv_pkt_count", + "host_sum_rcv_pkt_size", + "host_avg_rcv_pkt_size", + "host_sent_pkt_count", + "host_sum_sent_pkt_size", + "host_avg_sent_pkt_size", + "host_inter_arrival_last", + "host_inter_arrival_avg", + "host_conn_timedelta" + ] + } + } + }, + "OUTPUT": [ + { + "class": "DistanceReporter", + "kwargs": {} + } + ], + "LOG": [ + { + "level": "DEBUG", + "path": "{{ project_root }}/logs/{{ config_file_name }}/{{ timestamp }}-train-log.txt" + } + ], + "VERSION": "{{ git_tag }}" +} \ No newline at end of file diff --git a/configurations/examples/train/host-single-rf-train.json.jinja2 
b/configurations/examples/train/host-single-rf-train.json.jinja2 new file mode 100644 index 0000000..1facc1a --- /dev/null +++ b/configurations/examples/train/host-single-rf-train.json.jinja2 @@ -0,0 +1,195 @@ +{ + "DESCRIPTION": [ + "" + ], + "DATA_SOURCES": [ + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/benign/benign-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 0 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/bruteforce/bruteforce-train.pcapng", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/flood/flood-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malariados/malariados-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + 
"preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malformed/malformed-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/slowite/slowite-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "HostFeatureProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + } + ], + "MODEL": + { + "class": "RandomForestModel", + "train_new_model": true, + "skip_saving_model": false, + "model_name": "host-single-rf", + "model_storage_base_path": "{{ project_root }}/models", + "encoder": + { + "class": "DefaultEncoder", + "kwargs": { + "feature_filter": [ + "host_rcv_pkt_count", + "host_sum_rcv_pkt_size", + "host_avg_rcv_pkt_size", + "host_sent_pkt_count", + "host_sum_sent_pkt_size", + "host_avg_sent_pkt_size", + "host_inter_arrival_last", + "host_inter_arrival_avg", + "host_conn_timedelta" + ] + } + } + }, + "OUTPUT": [ + { + "class": "AccuracyReporter", + "kwargs": {} + } + ], + "LOG": [ + { + "level": "DEBUG", + "path": "{{ project_root }}/logs/{{ config_file_name }}/{{ timestamp }}-train-log.txt" + } + ], + "VERSION": "{{ git_tag }}" +} \ No newline at end 
of file diff --git a/configurations/examples/train/packet-multi-ae-train.json.jinja2 b/configurations/examples/train/packet-multi-ae-train.json.jinja2 new file mode 100644 index 0000000..747037c --- /dev/null +++ b/configurations/examples/train/packet-multi-ae-train.json.jinja2 @@ -0,0 +1,63 @@ +{ + "DESCRIPTION": [ + "" + ], + "DATA_SOURCES": [ + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/benign/benign-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + } + ] + } + ], + "MODEL": + { + "class": "MLPAutoEncoderModel", + "train_new_model": true, + "skip_saving_model": false, + "model_name": "packet-multi-ae", + "model_storage_base_path": "{{ project_root }}/models", + "encoder": + { + "class": "MultiSampleEncoder", + "kwargs": { + "feature_filter": [ + "ip_header_size", + "ip_data_size", + "tcp_header_size", + "tcp_size", + "tcp_cwr", + "tcp_ece", + "tcp_urg", + "tcp_ack", + "tcp_psh", + "tcp_rst", + "tcp_syn", + "tcp_fin" + ] + } + } + }, + "OUTPUT": [ + { + "class": "DistanceReporter", + "kwargs": {} + } + ], + "LOG": [ + { + "level": "DEBUG", + "path": "{{ project_root }}/logs/{{ config_file_name }}/{{ timestamp }}-train-log.txt" + } + ], + "VERSION": "{{ git_tag }}" +} \ No newline at end of file diff --git a/configurations/examples/train/packet-multi-rf-train.json.jinja2 b/configurations/examples/train/packet-multi-rf-train.json.jinja2 new file mode 100644 index 0000000..1c7fc63 --- /dev/null +++ b/configurations/examples/train/packet-multi-rf-train.json.jinja2 @@ -0,0 +1,174 @@ +{ + "DESCRIPTION": [ + "" + ], + "DATA_SOURCES": [ + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/benign/benign-train.pcap", + "packet_processor_path": "{{ project_root 
}}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 0 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/bruteforce/bruteforce-train.pcapng", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/flood/flood-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malariados/malariados-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malformed/malformed-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": 
"PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/slowite/slowite-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + } + ], + "MODEL": + { + "class": "RandomForestModel", + "train_new_model": true, + "skip_saving_model": false, + "model_name": "packet-multi-rf", + "model_storage_base_path": "{{ project_root }}/models", + "encoder": + { + "class": "MultiSampleEncoder", + "kwargs": { + "feature_filter": [ + "ip_header_size", + "ip_data_size", + "tcp_header_size", + "tcp_size", + "tcp_cwr", + "tcp_ece", + "tcp_urg", + "tcp_ack", + "tcp_psh", + "tcp_rst", + "tcp_syn", + "tcp_fin" + ] + } + } + }, + "OUTPUT": [ + { + "class": "AccuracyReporter", + "kwargs": {} + } + ], + "LOG": [ + { + "level": "DEBUG", + "path": "{{ project_root }}/logs/{{ config_file_name }}/{{ timestamp }}-train-log.txt" + } + ], + "VERSION": "{{ git_tag }}" +} \ No newline at end of file diff --git a/configurations/examples/train/packet-single-ae-train.json.jinja2 b/configurations/examples/train/packet-single-ae-train.json.jinja2 new file mode 100644 index 0000000..93622d5 --- /dev/null +++ b/configurations/examples/train/packet-single-ae-train.json.jinja2 @@ -0,0 +1,63 @@ +{ + "DESCRIPTION": [ + "" + ], + "DATA_SOURCES": [ + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/benign/benign-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + } + ] + } + ], + "MODEL": + { + "class": "MLPAutoEncoderModel", + "train_new_model": true, + "skip_saving_model": false, + "model_name": "packet-single-ae", + 
"model_storage_base_path": "{{ project_root }}/models", + "encoder": + { + "class": "DefaultEncoder", + "kwargs": { + "feature_filter": [ + "ip_header_size", + "ip_data_size", + "tcp_header_size", + "tcp_size", + "tcp_cwr", + "tcp_ece", + "tcp_urg", + "tcp_ack", + "tcp_psh", + "tcp_rst", + "tcp_syn", + "tcp_fin" + ] + } + } + }, + "OUTPUT": [ + { + "class": "DistanceReporter", + "kwargs": {} + } + ], + "LOG": [ + { + "level": "DEBUG", + "path": "{{ project_root }}/logs/{{ config_file_name }}/{{ timestamp }}-train-log.txt" + } + ], + "VERSION": "{{ git_tag }}" +} \ No newline at end of file diff --git a/configurations/examples/train/packet-single-rf-train.json.jinja2 b/configurations/examples/train/packet-single-rf-train.json.jinja2 new file mode 100644 index 0000000..743930f --- /dev/null +++ b/configurations/examples/train/packet-single-rf-train.json.jinja2 @@ -0,0 +1,174 @@ +{ + "DESCRIPTION": [ + "" + ], + "DATA_SOURCES": [ + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/benign/benign-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 0 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/bruteforce/bruteforce-train.pcapng", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/flood/flood-train.pcap", + "packet_processor_path": "{{ project_root 
}}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malariados/malariados-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malformed/malformed-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/slowite/slowite-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + } + ], + "MODEL": + { + "class": "RandomForestModel", + "train_new_model": true, + "skip_saving_model": false, + "model_name": "packet-single-rf", + "model_storage_base_path": "{{ project_root }}/models", + "encoder": + { + "class": "DefaultEncoder", + "kwargs": { + "feature_filter": [ + "ip_header_size", + "ip_data_size", + "tcp_header_size", + "tcp_size", + "tcp_cwr", + "tcp_ece", + "tcp_urg", + "tcp_ack", + "tcp_psh", + "tcp_rst", + "tcp_syn", + "tcp_fin" + 
] + } + } + }, + "OUTPUT": [ + { + "class": "AccuracyReporter", + "kwargs": {} + } + ], + "LOG": [ + { + "level": "DEBUG", + "path": "{{ project_root }}/logs/{{ config_file_name }}/{{ timestamp }}-train-log.txt" + } + ], + "VERSION": "{{ git_tag }}" +} \ No newline at end of file diff --git a/configurations/examples/train/window-multi-ae-train.json.jinja2 b/configurations/examples/train/window-multi-ae-train.json.jinja2 new file mode 100644 index 0000000..6184036 --- /dev/null +++ b/configurations/examples/train/window-multi-ae-train.json.jinja2 @@ -0,0 +1,61 @@ +{ + "DESCRIPTION": [ + "" + ], + "DATA_SOURCES": [ + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/benign/benign-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + } + ] + } + ], + "MODEL": + { + "class": "MLPAutoEncoderModel", + "train_new_model": true, + "skip_saving_model": false, + "model_name": "window-multi-ae", + "model_storage_base_path": "{{ project_root }}/models", + "encoder": + { + "class": "MultiSampleEncoder", + "kwargs": { + "feature_filter": [ + "window_flow_pkt_count", + "window_flow_sum_pkt_size", + "window_flow_avg_pkt_size", + "window_flow_inter_arrival_avg" + ] + } + } + }, + "OUTPUT": [ + { + "class": "DistanceReporter", + "kwargs": {} + } + ], + "LOG": [ + { + "level": "DEBUG", + "path": "{{ project_root }}/logs/{{ config_file_name }}/{{ timestamp }}-train-log.txt" + } + ], + "VERSION": "{{ git_tag }}" +} \ No newline at end of file diff --git a/configurations/examples/train/window-multi-rf-train.json.jinja2 b/configurations/examples/train/window-multi-rf-train.json.jinja2 new file mode 100644 index 0000000..0ba1fde --- /dev/null +++ 
b/configurations/examples/train/window-multi-rf-train.json.jinja2 @@ -0,0 +1,202 @@ +{ + "DESCRIPTION": [ + "" + ], + "DATA_SOURCES": [ + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/benign/benign-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 0 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/bruteforce/bruteforce-train.pcapng", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/flood/flood-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malariados/malariados-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + 
"class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malformed/malformed-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/slowite/slowite-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + } + ], + "MODEL": + { + "class": "RandomForestModel", + "train_new_model": true, + "skip_saving_model": false, + "model_name": "window-multi-rf", + "model_storage_base_path": "{{ project_root }}/models", + "encoder": + { + "class": "MultiSampleEncoder", + "kwargs": { + "feature_filter": [ + "window_flow_pkt_count", + "window_flow_sum_pkt_size", + "window_flow_avg_pkt_size", + "window_flow_inter_arrival_avg" + ] + } + } + }, + "OUTPUT": [ + { + "class": "AccuracyReporter", + "kwargs": {} + } + ], + "LOG": [ + { + "level": "DEBUG", + "path": "{{ project_root }}/logs/{{ config_file_name }}/{{ timestamp }}-train-log.txt" + } + ], + "VERSION": "{{ git_tag }}" +} \ No newline at end of file diff --git 
a/configurations/examples/train/window-single-ae-train.json.jinja2 b/configurations/examples/train/window-single-ae-train.json.jinja2 new file mode 100644 index 0000000..f2e0534 --- /dev/null +++ b/configurations/examples/train/window-single-ae-train.json.jinja2 @@ -0,0 +1,61 @@ +{ + "DESCRIPTION": [ + "" + ], + "DATA_SOURCES": [ + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/benign/benign-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + } + ] + } + ], + "MODEL": + { + "class": "MLPAutoEncoderModel", + "train_new_model": true, + "skip_saving_model": false, + "model_name": "window-single-ae", + "model_storage_base_path": "{{ project_root }}/models", + "encoder": + { + "class": "DefaultEncoder", + "kwargs": { + "feature_filter": [ + "window_flow_pkt_count", + "window_flow_sum_pkt_size", + "window_flow_avg_pkt_size", + "window_flow_inter_arrival_avg" + ] + } + } + }, + "OUTPUT": [ + { + "class": "DistanceReporter", + "kwargs": {} + } + ], + "LOG": [ + { + "level": "DEBUG", + "path": "{{ project_root }}/logs/{{ config_file_name }}/{{ timestamp }}-train-log.txt" + } + ], + "VERSION": "{{ git_tag }}" +} \ No newline at end of file diff --git a/configurations/examples/train/window-single-rf-train.json.jinja2 b/configurations/examples/train/window-single-rf-train.json.jinja2 new file mode 100644 index 0000000..25226a0 --- /dev/null +++ b/configurations/examples/train/window-single-rf-train.json.jinja2 @@ -0,0 +1,202 @@ +{ + "DESCRIPTION": [ + "" + ], + "DATA_SOURCES": [ + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/benign/benign-train.pcap", + "packet_processor_path": "{{ project_root 
}}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 0 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/bruteforce/bruteforce-train.pcapng", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/flood/flood-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malariados/malariados-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root 
}}/data/MQTTset-reduced/malformed/malformed-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/slowite/slowite-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + } + ], + "MODEL": + { + "class": "RandomForestModel", + "train_new_model": true, + "skip_saving_model": false, + "model_name": "window-single-rf", + "model_storage_base_path": "{{ project_root }}/models", + "encoder": + { + "class": "DefaultEncoder", + "kwargs": { + "feature_filter": [ + "window_flow_pkt_count", + "window_flow_sum_pkt_size", + "window_flow_avg_pkt_size", + "window_flow_inter_arrival_avg" + ] + } + } + }, + "OUTPUT": [ + { + "class": "AccuracyReporter", + "kwargs": {} + } + ], + "LOG": [ + { + "level": "DEBUG", + "path": "{{ project_root }}/logs/{{ config_file_name }}/{{ timestamp }}-train-log.txt" + } + ], + "VERSION": "{{ git_tag }}" +} \ No newline at end of file diff --git a/configurations/sinetstream/kaiyodai-ship-multi-flow-based-ae-test.json.jinja b/configurations/sinetstream/kaiyodai-ship-multi-flow-based-ae-test.json.jinja new file mode 100644 index 0000000..45f3d26 --- /dev/null +++ b/configurations/sinetstream/kaiyodai-ship-multi-flow-based-ae-test.json.jinja @@ -0,0 +1,68 @@ +{ + "DESCRIPTION": [ + "Train 
an autoencoder using a segment from kaiyodai-ship traffic. ", + "Uses window-based flow features at 10 ms windows." + ], + "DATA_SOURCES": [ + { + "type": "dataset", + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/kaiyodai-ship-split/test.cap", + "preprocessor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + } + ] + } + ], + "MODEL": + { + "class": "MLPAutoEncoderModel", + "train_new_model": false, + "model_name": "kaiyodai-multi-encoded-ae", + "model_storage_base_path": "{{ project_root }}/models", + "encoder": + { + "class": "MultiSampleEncoder", + "kwargs": { + "feature_filter": [ + "window_flow_pkt_count", + "window_flow_sum_pkt_size", + "window_flow_avg_pkt_size", + "window_flow_inter_arrival_avg" + ] + } + } + }, + "OUTPUT": [ + { + "class": "InfluxDBReporter", + "kwargs": { + "measurement_name": "kaiyodai-ship", + "influx_url": "http://localhost:8086", + "influx_org": "default", + "influx_bucket": "default", + "influx_token": "{{ influx_token }}" + } + } + ], + "LOG": [ + { + "level": "DEBUG", + "path": "{{ project_root }}/models/kaiyodai-multi-encoded-ae/{{ timestamp }}-test-log.txt" + } + ], + "VERSION": "{{ git_tag }}" +} \ No newline at end of file diff --git a/configurations/examples/flow-based-multi-ad-slowite-train.json.jinja b/configurations/sinetstream/kaiyodai-ship-multi-flow-based-ae-train.json.jinja similarity index 80% rename from configurations/examples/flow-based-multi-ad-slowite-train.json.jinja rename to configurations/sinetstream/kaiyodai-ship-multi-flow-based-ae-train.json.jinja index 6dd4fd9..1e9efd3 100644 --- a/configurations/examples/flow-based-multi-ad-slowite-train.json.jinja +++ b/configurations/sinetstream/kaiyodai-ship-multi-flow-based-ae-train.json.jinja @@ -1,6 +1,6 @@ 
{ "DESCRIPTION": [ - "Train an autoencoder using a segment from MQTTset's benign traffic. ", + "Train an autoencoder using a segment from kaiyodai-ship traffic. ", "Uses window-based flow features at 10 ms windows." ], "DATA_SOURCES": [ @@ -9,7 +9,7 @@ "loader": { "class": "PcapFileLoader", "kwargs": { - "filepath": "{{ project_root }}/data/MQTTset/Data/PCAP/capture_custom_1h.pcap", + "filepath": "{{ project_root }}/data/kaiyodai-ship-split/train.cap", "preprocessor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" } }, @@ -32,7 +32,7 @@ "class": "MLPAutoEncoderModel", "train_new_model": true, "skip_saving_model": false, - "model_name": "example-flow-based-multi-encoded-ae-benign", + "model_name": "kaiyodai-multi-encoded-ae", "model_storage_base_path": "{{ project_root }}/models", "encoder": { @@ -56,7 +56,7 @@ "LOG": [ { "level": "DEBUG", - "path": "{{ project_root }}/models/example-flow-based-multi-encoded-ae-benign/{{ timestamp }}-training-log.txt" + "path": "{{ project_root }}/models/kaiyodai-multi-encoded-ae/{{ timestamp }}-training-log.txt" } ], "VERSION": "{{ git_tag }}" diff --git a/configurations/tutorial/window-multi-rf-influxdb-test.json.jinja2 b/configurations/tutorial/window-multi-rf-influxdb-test.json.jinja2 new file mode 100644 index 0000000..53beaf6 --- /dev/null +++ b/configurations/tutorial/window-multi-rf-influxdb-test.json.jinja2 @@ -0,0 +1,209 @@ +{ + "DESCRIPTION": [ + "" + ], + "DATA_SOURCES": [ + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/benign/benign-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 0 + } + } + ] + }, 
+ { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/bruteforce/bruteforce-test.pcapng", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/flood/flood-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 2 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malariados/malariados-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 3 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malformed/malformed-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + 
}, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 4 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/slowite/slowite-test.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 5 + } + } + ] + } + ], + "MODEL": + { + "class": "RandomForestModel", + "train_new_model": false, + "model_name": "window-multi-rf-influxdb", + "model_storage_base_path": "{{ project_root }}/models", + "encoder": + { + "class": "MultiSampleEncoder", + "kwargs": { + "feature_filter": [ + "window_flow_pkt_count", + "window_flow_sum_pkt_size", + "window_flow_avg_pkt_size", + "window_flow_inter_arrival_avg" + ], + "max_time_window_ms": 100, + "max_array_size": 1000 + } + } + }, + "OUTPUT": [ + { + "class": "InfluxDBReporter", + "kwargs": { + "measurement_name": "window-multi-rf", + "influx_url": "http://localhost:8086", + "influx_org": "default", + "influx_bucket": "default", + "influx_token": "{{ influx_token }}" + } + } + ], + "LOG": [ + { + "level": "DEBUG", + "path": "{{ project_root }}/logs/{{ config_file_name }}/{{ timestamp }}-test-log.txt" + } + ], + "VERSION": "{{ git_tag }}" +} \ No newline at end of file diff --git a/configurations/tutorial/window-multi-rf-influxdb-train.json.jinja2 b/configurations/tutorial/window-multi-rf-influxdb-train.json.jinja2 new file mode 100644 index 0000000..94a95e0 --- /dev/null +++ b/configurations/tutorial/window-multi-rf-influxdb-train.json.jinja2 @@ -0,0 +1,202 @@ +{ + "DESCRIPTION": [ + "" + ], + "DATA_SOURCES": [ + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root 
}}/data/MQTTset-reduced/benign/benign-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 0 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/bruteforce/bruteforce-train.pcapng", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/flood/flood-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malariados/malariados-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { 
+ "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/malformed/malformed-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + }, + { + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset-reduced/slowite/slowite-train.pcap", + "packet_processor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + } + ], + "MODEL": + { + "class": "RandomForestModel", + "train_new_model": true, + "skip_saving_model": false, + "model_name": "window-multi-rf-influxdb", + "model_storage_base_path": "{{ project_root }}/models", + "encoder": + { + "class": "MultiSampleEncoder", + "kwargs": { + "feature_filter": [ + "window_flow_pkt_count", + "window_flow_sum_pkt_size", + "window_flow_avg_pkt_size", + "window_flow_inter_arrival_avg" + ] + } + } + }, + "OUTPUT": [ + { + "class": "AccuracyReporter", + "kwargs": {} + } + ], + "LOG": [ + { + "level": "DEBUG", + "path": "{{ project_root }}/logs/{{ config_file_name }}/{{ timestamp }}-train-log.txt" + } + ], + "VERSION": "{{ git_tag }}" +} \ No newline at end of file diff --git a/data/MQTTset-reduced b/data/MQTTset-reduced new file mode 160000 index 0000000..70362f7 --- /dev/null +++ b/data/MQTTset-reduced @@ -0,0 +1 @@ +Subproject commit 70362f7c5f92bdbc9d461859886b75393e6c5139 diff --git a/data/README.md 
b/data/README.md index 279a34e..14803c4 100644 --- a/data/README.md +++ b/data/README.md @@ -12,9 +12,8 @@ To ease the handling of the dataset, we extracted 5% of the original dataset via # CICIDS2017 -Only has flow features, but a lot of them. - https://www.kaggle.com/datasets/cicdataset/cicids2017 +https://www.unb.ca/cic/datasets/ids-2017.html About: diff --git a/tutorials/configuration.md b/tutorials/configuration.md new file mode 100644 index 0000000..158decc --- /dev/null +++ b/tutorials/configuration.md @@ -0,0 +1,120 @@ +# Tutorial: Configuration files + +File-based configuration is convenient for two reasons: +1. source code does not need to be edited to implement a new pipeline, +2. the configuration file can be stored as pipeline documentation for generated AD models. + +The format of the configuration files is JINJA2 templates: while the structure follows the JSON format, custom placeholders can be added to the configurations that are evaluated during the runtime. The template processing supports the following fields: + +| Template | Description | +|--------------|-------------------------------------------------------------------------------------------------------------------------------| +| timestamp | Will be replaced with the current timestamp. Useful to individualize logfiles or tag models. | +| project_root | The path to the project repository, making the paths in configuration files system-independent. | +| git_tag | Short git commit tag of the repository, as a helper to mark the version of code that was used to train a model. | +| influx_token | The token is one of the few command-line arguments to the main program. It will be passed to the InfluxDB reporter as kwargs. | + +New template fields can be added by extending the configuration processing logic in `IoT-AD.py`. + +## Configuration elements + +Let's take a look at an example, `flow-based-rf-train.json.jinja`. 
+ +### Description + +```json +"DESCRIPTION": [ + "Train a random forest classifier. slowite and malariaDoS attack data from ", + "MQTTset is labeled as 1 and a segment from MQTTset's benign traffic as 0. ", + "Uses window-based flow features at 10 ms windows." +], +``` + +The description is a custom field to summarize the pipeline described in the configuration. + +### Data sources + +```json +"DATA_SOURCES": [ + { + "type": "dataset", + "loader": { + "class": "PcapFileLoader", + "kwargs": { + "filepath": "{{ project_root }}/data/MQTTset/Data/PCAP/slowite.pcap", + "preprocessor_path": "{{ project_root }}/code/cpp-extract-features/cmake-build/pcap-feature-extraction" + } + }, + "preprocessors": [ + { + "class": "CppPacketProcessor", + "kwargs": {} + }, + { + "class": "WindowFlowFeatureProcessor", + "kwargs": { + "window_size_ms": 10 + } + }, + { + "class": "FileLabelProcessor", + "kwargs": { + "label_value": 1 + } + } + ] + } +], +``` + +The data sources section defines a sequence of data sources to be used as input for the model. Currently, the dataset type is supported, but live traffic capture functionality is planned. The PcapFileLoader takes as custom keyword arguments paths to the dataset and to the pre-built C++ packet loader executable. The packet loader is an efficient solution to parse large PCAP files. + +In this example, the `CppPacketProcessor` reads the output from the C++ PCAP file parser and extracts a set of common features. The `WindowFlowFeatureProcessor` combines data from multiple packets into flow statistics to speed up subsequent processing and model training/prediction times. The `FileLabelProcessor` adds the ground truth label to the samples, which can be used for model performance evaluation and reporting. + +Since data loaders and preprocessors are defined individually, it is possible to combine various data formats, such as PCAP files and NetFlow dataset files. 
SIURU only requires that the desired model input features are available from all datasets after preprocessing is complete. + +### Model + +```json +"MODEL": +{ + "class": "RandomForestModel", + "train_new_model": true, + "skip_saving_model": false, + "model_name": "example-flow-based-rf", + "model_storage_base_path": "{{ project_root }}/models", + "encoder": + { + "class": "DefaultEncoder", + "kwargs": { + "feature_filter": [ + "window_flow_pkt_count", + "window_flow_sum_pkt_size", + "window_flow_avg_pkt_size", + "window_flow_inter_arrival_avg" + ] + } + } +}, +"VERSION": "{{ git_tag }}" +``` + +The model section defines the model to be trained or used for prediction, depending on whether `train_new_model` is set to true or false. To debug pipelines for model training, it can be helpful to set `skip_saving_model` to true, which prevents the creation of training files that need to be manually deleted before rerunning the pipeline. + +The model name and storage path are both used to determine the final path and name for the models. Here the `{{ project_root }}` template variable is used to avoid absolute paths. + +The `DefaultEncoder` class accepts a feature filter specification using string versions of the features defined in `code/common/features.py`. + +Finally, the git version template is used to mark the repository version used to train the model. + +### Log + +```json +"LOG": [ + { + "level": "INFO", + "path": "{{ project_root }}/models/example-flow-based-rf/{{ timestamp }}-training-log.txt" + } +] +``` + +If the "LOG" element is defined in the configuration file, SIURU will store the logging output of each run in a dedicated file. The default value for the log path is `logs/other/{{ timestamp }}-log.txt`, using the timestamp from the beginning of execution, and the default logging level is `DEBUG`. It is possible to customize the output file path (e.g. to sort it by model type) by adding the above section to the configuration. \ No newline at end of file