cdx1 mlx

Signed-off-by: Prabhu Subramanian <[email protected]>
CycloneDX · Feb 7, 2025 · 8112053 · 8112053
1 parent 0afb612
commit 8112053
Show file tree

Hide file tree

Showing 6 changed files with 135 additions and 4 deletions.
diff --git a/contrib/fine-tuning/.gitignore b/contrib/fine-tuning/.gitignore
@@ -0,0 +1,6 @@
+dataset/
+prabhuat/
+HuggingFaceTB/*
+adapters/
+out.txt
+unsloth/
diff --git a/contrib/fine-tuning/README.md b/contrib/fine-tuning/README.md
@@ -2,7 +2,7 @@
 
 This directory contains numerous knowledge files about CycloneDX and cdxgen in jsonlines chat format. The data is useful for training and fine-tuning (LoRA and QLoRA) LLM models.
 
-## Generation
+## Data Generation
 
 We used Google Gemini 2.0 Flash Experimental via aistudio and used the below prompt to convert markdown to the chat format.
 
@@ -12,7 +12,30 @@ you are an expert in converting markdown files to plain text jsonlines format ba
 
 The data was then validated and reviewed manually for accuracy.
 
-### Validating jsonlines syntax
+## Fine-tuning
+
+### mlx backend
+
+```shell
+bash fine-tune-mlx.sh
+```
+
+Testing with LM Studio.
+
+```shell
+cp -rf prabhuat ~/.lmstudio/models/
+lms ls
+lms server status
+lms load prabhuat/cdx1-mlx --exact --gpu max --identifier cdx1-test --context-length 8192
+```
+
+System prompt:
+
+```text
+You are cdxgen, an xBOM and CycloneDX expert.
+```
+
+### Validating jsonlines files
 
 ```shell
 node validator.js
@@ -30,6 +53,11 @@ node validator.js
 }
 ```
 
-## License
+
+## Datasets License
 
 CC-0
+
+## Models License
+
+Apache-2.0
diff --git a/contrib/fine-tuning/cdxgen-docs/readme.jsonl b/contrib/fine-tuning/cdxgen-docs/readme.jsonl
@@ -1,3 +1,9 @@
+{"messages": [{"role": "user", "content": "What is your name?"}, {"role": "assistant", "content": "Hi. I'm cdxgen, an xBOM and CycloneDX expert."}]}
+{"messages": [{"role": "user", "content": "Who are you?"}, {"role": "assistant", "content": "Hi. I'm cdxgen, an xBOM and CycloneDX expert."}]}
+{"messages": [{"role": "user", "content": "What can you do?"}, {"role": "assistant", "content": "I can answer questions about CycloneDX specifications and BOM formats, including SBOM, SaaSBOM, CBOM, and OBOM."}]}
+{"messages": [{"role": "user", "content": "Can you help me with my homework?"}, {"role": "assistant", "content": "Sorry, I can only help you with CycloneDX and xBOM related requests."}]}
+{"messages": [{"role": "user", "content": "Who owns cdxgen?"}, {"role": "assistant", "content": "cdxgen is an OWASP project."}]}
+{"messages": [{"role": "user", "content": "Can you tell jokes?"}, {"role": "assistant", "content": "Sorry, I can only help you with CycloneDX and xBOM related requests."}]}
 {"messages": [{"role": "user", "content": "How do I install cdxgen for generating SBOMs from git repositories?"}, {"role": "assistant", "content": "You can install cdxgen globally using npm with the command sudo npm install -g @cyclonedx/cdxgen. Alternatively, if you are a Homebrew user, you can install it via brew install cdxgen."}]}
 {"messages": [{"role": "user", "content": "What are the different ways to install cdxgen?"}, {"role": "assistant", "content": "cdxgen can be installed via npm, Homebrew (on macOS), or by downloading single application executables (SAE) for Windows and Linux from the GitHub Releases page. The SAE versions include both a full version (with optional binary plugins) and a slim version."}]}
 {"messages": [{"role": "user", "content": "How do I generate a basic SBOM for a project?"}, {"role": "assistant", "content": "Navigate to the root directory of your source code and run the command cdxgen -o bom.json. This will generate an SBOM named bom.json."}]}

diff --git a/contrib/fine-tuning/fine-tune-mlx.sh b/contrib/fine-tuning/fine-tune-mlx.sh
@@ -0,0 +1,51 @@
+#!/usr/bin/env bash
+set -e
+TUNING_TOOL=mlx
+BASE_MODEL=unsloth/phi-4
+BASE_MODEL_MLX=${BASE_MODEL}-${TUNING_TOOL}
+HF_ORG=prabhuat
+TOOL_BASE_MODEL=cdx1
+ITERATION=1000
+NUM_LAYERS=16
+
+FUSED_MODEL=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}
+QUANT_MODEL_8BIT=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}-8bit
+QUANT_MODEL_6BIT=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}-6bit
+QUANT_MODEL_4BIT=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}-4bit
+
+### mlx-lm needs train.jsonl and valid.jsonl
+rm -rf dataset prabhuat adapters
+mkdir -p dataset prabhuat
+
+node prepare.js dataset
+node validator.js dataset
+
+echo "Test base model"
+mlx_lm.generate --model ${BASE_MODEL} --prompt "Tell me about cdxgen" --temp 0.05
+
+# We first convert from HF to mlx
+rm -rf ${BASE_MODEL} ${BASE_MODEL_MLX}
+mlx_lm.convert --hf-path ${BASE_MODEL} --mlx-path ${BASE_MODEL_MLX}
+
+echo "LoRA fine-tuning ${BASE_MODEL_MLX} with cdx1 dataset"
+mlx_lm.lora --model ${BASE_MODEL_MLX} --train --data dataset --fine-tune-type dora --batch-size 1 --num-layers ${NUM_LAYERS} --iters ${ITERATION} --grad-checkpoint
+
+echo "Fuse model to ${FUSED_MODEL} using the cdx1 adapters"
+rm -rf ${FUSED_MODEL}
+mlx_lm.fuse --model ${BASE_MODEL_MLX} --adapter-path adapters --hf-path ${FUSED_MODEL} --save-path ${FUSED_MODEL}
+
+echo "Test fused model"
+mlx_lm.generate --model ${FUSED_MODEL} --prompt "Tell me about cdxgen" --temp 0.05
+
+echo "Create quantized models"
+rm -rf ${QUANT_MODEL_8BIT}
+mlx_lm.convert --hf-path ${FUSED_MODEL} --mlx-path ${QUANT_MODEL_8BIT} -q --q-bits 8 --dtype bfloat16
+echo "${QUANT_MODEL_8BIT} created successfully."
+rm -rf ${QUANT_MODEL_6BIT}
+mlx_lm.convert --hf-path ${FUSED_MODEL} --mlx-path ${QUANT_MODEL_6BIT} -q --q-bits 6 --dtype bfloat16
+echo "${QUANT_MODEL_6BIT} created successfully."
+rm -rf ${QUANT_MODEL_4BIT}
+mlx_lm.convert --hf-path ${FUSED_MODEL} --mlx-path ${QUANT_MODEL_4BIT} -q --q-bits 4 --dtype bfloat16
+echo "${QUANT_MODEL_4BIT} created successfully."
+
+rm -rf dataset adapters
diff --git a/contrib/fine-tuning/prepare.js b/contrib/fine-tuning/prepare.js
@@ -0,0 +1,34 @@
+import { mkdirSync, readFileSync, writeFileSync } from "node:fs";
+import { join } from "node:path";
+import process from "node:process";
+
+import { dirNameStr, getAllFiles } from "../../lib/helpers/utils.js";
+const jsonlFiles = getAllFiles(dirNameStr, "**/*.jsonl", {
+  exclude: ["**/{train, valid}.jsonl"],
+});
+let datasetDir = "dataset";
+const argv = process.argv.slice(2);
+if (argv.length > 1) {
+  datasetDir = argv[1];
+}
+
+const TRAIN_FILE = join(datasetDir, "train.jsonl");
+const VALID_FILE = join(datasetDir, "valid.jsonl");
+const trainData = [];
+const validData = [];
+
+for (const jf of jsonlFiles) {
+  const lines = readFileSync(jf, "utf-8");
+  trainData.push(lines);
+  if (jf.includes("readme") || jf.includes("cdxgen") || jf.includes("cli")) {
+    validData.push(lines);
+  }
+}
+
+mkdirSync(datasetDir, { recursive: true });
+if (trainData.length) {
+  writeFileSync(TRAIN_FILE, trainData.join("\n"));
+}
+if (validData.length) {
+  writeFileSync(VALID_FILE, validData.join("\n"));
+}
diff --git a/contrib/fine-tuning/validator.js b/contrib/fine-tuning/validator.js
@@ -1,6 +1,12 @@
 import { readFileSync } from "node:fs";
 import { dirNameStr, getAllFiles } from "../../lib/helpers/utils.js";
-const jsonlFiles = getAllFiles(dirNameStr, "**/*.jsonl");
+let datasetDir = dirNameStr;
+const argv = process.argv.slice(2);
+if (argv.length > 1) {
+  datasetDir = argv[1];
+}
+
+const jsonlFiles = getAllFiles(datasetDir, "**/*.jsonl");
 const failures = {};
 for (const jf of jsonlFiles) {
   const failedLines = [];