
Commit

cdx1 mlx
Signed-off-by: Prabhu Subramanian <[email protected]>
prabhu committed Feb 7, 2025
1 parent 0afb612 commit e840901
Showing 8 changed files with 228 additions and 4 deletions.
5 changes: 5 additions & 0 deletions contrib/fine-tuning/.gitignore
@@ -0,0 +1,5 @@
dataset/
adapters/
out.txt
unsloth/
CycloneDX/
12 changes: 12 additions & 0 deletions contrib/fine-tuning/Modelfile
@@ -0,0 +1,12 @@
FROM ./cdx1-bf16.gguf

PARAMETER num_ctx 16384
PARAMETER temperature 0.05
PARAMETER top_k 10
PARAMETER top_p 0.5

SYSTEM """You are cdxgen, a CycloneDX and an xBOM expert."""

LICENSE """
apache-2.0
"""
82 changes: 79 additions & 3 deletions contrib/fine-tuning/README.md
@@ -2,7 +2,7 @@

This directory contains knowledge files about CycloneDX and cdxgen in jsonlines chat format. The data is suitable for training and fine-tuning LLMs with techniques such as LoRA and QLoRA.
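
Each record is a single JSON object in the OpenAI-style chat `messages` layout, as in this sample from `cdxgen-docs/readme.jsonl`:

```jsonl
{"messages": [{"role": "user", "content": "What is your name?"}, {"role": "assistant", "content": "Hi. I'm cdxgen, an xBOM and CycloneDX expert."}]}
```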

## Generation
## Data Generation

We used Google Gemini 2.0 Flash Experimental via aistudio, with the prompt below, to convert the markdown documentation to the chat format.

@@ -12,14 +12,74 @@ you are an expert in converting markdown files to plain text jsonlines format ba

The data was then validated and reviewed manually for accuracy.

### Validating jsonlines syntax
## Fine-tuning

### mlx backend

```shell
bash fine-tune-mlx.sh
```
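
The script assumes `mlx_lm` and Node.js are available on the PATH. It prepares and validates the dataset, converts the base model (`unsloth/phi-4`) to mlx format, runs DoRA fine-tuning, fuses the adapters into `CycloneDX/cdx1-mlx`, exports a bf16 gguf file, and generates 8-bit, 6-bit, and 4-bit quantized variants.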

### Testing with LM Studio

```shell
cp -rf prabhuat ~/.lmstudio/models/
lms ls
lms server status
lms load CycloneDX/cdx1-mlx --exact --gpu max --identifier cdx1-test --context-length 8192
```

System prompt:

```text
You are cdxgen, an xBOM and CycloneDX expert.
```
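
With the model loaded and the server running, the model can be queried over LM Studio's OpenAI-compatible API. A minimal sketch, assuming the server listens on the default port 1234:

```shell
curl http://localhost:1234/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "cdx1-test",
    "messages": [
      {"role": "system", "content": "You are cdxgen, an xBOM and CycloneDX expert."},
      {"role": "user", "content": "Tell me about cdxgen"}
    ],
    "temperature": 0.05
  }'
```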

### gguf testing with ollama

Use the generated `Modelfile` inside `CycloneDX/cdx1-gguf` to test cdx1 with ollama.

```shell
cd CycloneDX/cdx1-gguf
ollama create cdx1-gguf -f ./Modelfile
```
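
The imported model's metadata can be verified with `ollama show`: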

```text
ollama show cdx1-gguf
  Model
    architecture        llama
    parameters          14.7B
    context length      16384
    embedding length    5120
    quantization        F16

  Parameters
    num_ctx        16384
    temperature    0.05
    top_k          10
    top_p          0.5

  System
    You are cdxgen, a CycloneDX and an xBOM expert.

  License
    apache-2.0
```

```shell
ollama run cdx1-gguf "Tell me about cdxgen"
```
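
The gguf model can also be queried through ollama's REST API. A sketch, assuming the default endpoint on port 11434:

```shell
curl http://localhost:11434/api/generate -d '{
  "model": "cdx1-gguf",
  "prompt": "Tell me about cdxgen",
  "stream": false
}'
```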

### Validating jsonlines files

```shell
node validator.js
```
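
The validator attempts to parse every line of each `.jsonl` file and reports any lines that fail. Pass a directory argument to restrict the check to the generated dataset, as `fine-tune-mlx.sh` does with `node validator.js dataset`.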

## Citation

### For datasets

```
@misc{cdx-docs-data,
author = {OWASP CycloneDX Generator Team},
@@ -30,6 +90,22 @@ node validator.js
}
```

## License
### For the models

```
@misc{cdx1,
author = {OWASP CycloneDX Generator Team},
month = feb,
title = {{CycloneDX and cdxgen}},
howpublished = {{https://huggingface.co/models/CycloneDX/cdx1}},
year = {2025}
}
```

## Datasets License

CC-0

## Models License

Apache-2.0
6 changes: 6 additions & 0 deletions contrib/fine-tuning/cdxgen-docs/readme.jsonl
@@ -1,3 +1,9 @@
{"messages": [{"role": "user", "content": "What is your name?"}, {"role": "assistant", "content": "Hi. I'm cdxgen, an xBOM and CycloneDX expert."}]}
{"messages": [{"role": "user", "content": "Who are you?"}, {"role": "assistant", "content": "Hi. I'm cdxgen, an xBOM and CycloneDX expert."}]}
{"messages": [{"role": "user", "content": "What can you do?"}, {"role": "assistant", "content": "I can answer questions about CycloneDX specifications and BOM formats, including SBOM, SaaSBOM, CBOM, and OBOM."}]}
{"messages": [{"role": "user", "content": "Can you help me with my homework?"}, {"role": "assistant", "content": "Sorry, I can only help you with CycloneDX and xBOM related requests."}]}
{"messages": [{"role": "user", "content": "What owns cdxgen?"}, {"role": "assistant", "content": "cdxgen is an OWASP project."}]}
{"messages": [{"role": "user", "content": "Can you tell jokes?"}, {"role": "assistant", "content": "Sorry, I can only help you with CycloneDX and xBOM related requests."}]}
{"messages": [{"role": "user", "content": "How do I install cdxgen for generating SBOMs from git repositories?"}, {"role": "assistant", "content": "You can install cdxgen globally using npm with the command sudo npm install -g @cyclonedx/cdxgen. Alternatively, if you are a Homebrew user, you can install it via brew install cdxgen."}]}
{"messages": [{"role": "user", "content": "What are the different ways to install cdxgen?"}, {"role": "assistant", "content": "cdxgen can be installed via npm, Homebrew (on macOS), or by downloading single application executables (SAE) for Windows and Linux from the GitHub Releases page. The SAE versions include both a full version (with optional binary plugins) and a slim version."}]}
{"messages": [{"role": "user", "content": "How do I generate a basic SBOM for a project?"}, {"role": "assistant", "content": "Navigate to the root directory of your source code and run the command cdxgen -o bom.json. This will generate an SBOM named bom.json."}]}
60 changes: 60 additions & 0 deletions contrib/fine-tuning/fine-tune-mlx.sh
@@ -0,0 +1,60 @@
#!/usr/bin/env bash
set -e
TUNING_TOOL=mlx
BASE_MODEL=unsloth/phi-4
BASE_MODEL_MLX=${BASE_MODEL}-${TUNING_TOOL}
HF_ORG=CycloneDX
TOOL_BASE_MODEL=cdx1
NUM_LAYERS=16

FUSED_MODEL=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}
FUSED_GGUF_MODEL=${HF_ORG}/${TOOL_BASE_MODEL}-gguf
QUANT_MODEL_8BIT=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}-8bit
QUANT_MODEL_6BIT=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}-6bit
QUANT_MODEL_4BIT=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}-4bit

### mlx-lm needs train.jsonl and valid.jsonl
rm -rf dataset ${HF_ORG} adapters ${BASE_MODEL}
mkdir -p dataset ${HF_ORG}

node prepare.js dataset
node validator.js dataset

echo "Test base model with the prompt 'Tell me about cdxgen'. Usually yields a low-quality response."
mlx_lm.generate --model ${BASE_MODEL} --prompt "Tell me about cdxgen" --temp 0.05

# We first convert from HF to mlx
rm -rf ${BASE_MODEL_MLX}
mlx_lm.convert --hf-path ${BASE_MODEL} --mlx-path ${BASE_MODEL_MLX}

echo "Weight-Decomposed Low-Rank Adaptation (DoRA) fine-tuning ${BASE_MODEL_MLX} with cdx1 dataset. This might take a while ..."
mlx_lm.lora --model ${BASE_MODEL_MLX} --train --data dataset --fine-tune-type dora --batch-size 1 --num-layers ${NUM_LAYERS} --iters 1000 --grad-checkpoint

echo "Fuse model to ${FUSED_MODEL} using the cdx1 adapters"
rm -rf ${FUSED_MODEL} ${FUSED_GGUF_MODEL}
mlx_lm.fuse --model ${BASE_MODEL_MLX} --adapter-path adapters --hf-path ${FUSED_MODEL} --save-path ${FUSED_MODEL} --export-gguf --gguf-path cdx1-bf16.gguf

echo "Test fused model with the prompt 'Tell me about cdxgen'. Must yield a better response."
mlx_lm.generate --model ${FUSED_MODEL} --prompt "Tell me about cdxgen" --temp 0.05

mkdir -p ${FUSED_GGUF_MODEL}
mv ${FUSED_MODEL}/cdx1-bf16.gguf ${FUSED_GGUF_MODEL}
cp Modelfile ${FUSED_GGUF_MODEL}/

echo "Create quantized models"
rm -rf ${QUANT_MODEL_8BIT}
mlx_lm.convert --hf-path ${FUSED_MODEL} --mlx-path ${QUANT_MODEL_8BIT} -q --q-bits 8 --dtype bfloat16
echo "Test ${QUANT_MODEL_8BIT} with the prompt 'Tell me about cdxgen'. Must yield a better response."
mlx_lm.generate --model ${QUANT_MODEL_8BIT} --prompt "Tell me about cdxgen" --temp 0.05

rm -rf ${QUANT_MODEL_6BIT}
mlx_lm.convert --hf-path ${FUSED_MODEL} --mlx-path ${QUANT_MODEL_6BIT} -q --q-bits 6 --dtype bfloat16
echo "Test ${QUANT_MODEL_6BIT} with the prompt 'Tell me about cdxgen'. Must yield a better response."
mlx_lm.generate --model ${QUANT_MODEL_6BIT} --prompt "Tell me about cdxgen" --temp 0.05

rm -rf ${QUANT_MODEL_4BIT}
mlx_lm.convert --hf-path ${FUSED_MODEL} --mlx-path ${QUANT_MODEL_4BIT} -q --q-bits 4 --dtype bfloat16
echo "Test ${QUANT_MODEL_4BIT} with the prompt 'Tell me about cdxgen'. Must yield a better response."
mlx_lm.generate --model ${QUANT_MODEL_4BIT} --prompt "Tell me about cdxgen" --temp 0.05

rm -rf dataset adapters ${BASE_MODEL}
34 changes: 34 additions & 0 deletions contrib/fine-tuning/prepare.js
@@ -0,0 +1,34 @@
import { mkdirSync, readFileSync, writeFileSync } from "node:fs";
import { join } from "node:path";
import process from "node:process";

import { dirNameStr, getAllFiles } from "../../lib/helpers/utils.js";

// Collect every .jsonl knowledge file, excluding previously generated splits.
const jsonlFiles = getAllFiles(dirNameStr, "**/*.jsonl", {
  exclude: ["**/{train,valid}.jsonl"],
});
let datasetDir = "dataset";
const argv = process.argv.slice(2);
if (argv.length) {
  datasetDir = argv[0];
}

const TRAIN_FILE = join(datasetDir, "train.jsonl");
const VALID_FILE = join(datasetDir, "valid.jsonl");
const trainData = [];
const validData = [];

for (const jf of jsonlFiles) {
  // Trim trailing newlines so the join below does not produce blank lines.
  const lines = readFileSync(jf, "utf-8").trim();
  trainData.push(lines);
  // The core documentation files double as the validation split.
  if (jf.includes("readme") || jf.includes("cdxgen") || jf.includes("cli")) {
    validData.push(lines);
  }
}

mkdirSync(datasetDir, { recursive: true });
if (trainData.length) {
  writeFileSync(TRAIN_FILE, trainData.join("\n"));
}
if (validData.length) {
  writeFileSync(VALID_FILE, validData.join("\n"));
}
25 changes: 25 additions & 0 deletions contrib/fine-tuning/upload-hf.sh
@@ -0,0 +1,25 @@
#!/usr/bin/env bash
set -e
export HF_HUB_ENABLE_HF_TRANSFER=0
HF_ORG=CycloneDX
TUNING_TOOL=mlx
TOOL_BASE_MODEL=cdx1
FUSED_MODEL=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}
FUSED_GGUF_MODEL=${HF_ORG}/${TOOL_BASE_MODEL}-gguf
QUANT_MODEL_8BIT=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}-8bit
QUANT_MODEL_6BIT=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}-6bit
QUANT_MODEL_4BIT=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}-4bit

huggingface-cli whoami

echo "Uploading datasets"
huggingface-cli upload --repo-type dataset CycloneDX/cdx-docs ./cdxgen-docs cdxgen-docs
huggingface-cli upload --repo-type dataset CycloneDX/cdx-docs ./guides guides

echo "Uploading models. Please wait ..."
huggingface-cli upload --repo-type model ${QUANT_MODEL_8BIT} ./${QUANT_MODEL_8BIT} .
huggingface-cli upload --repo-type model ${QUANT_MODEL_6BIT} ./${QUANT_MODEL_6BIT} .
huggingface-cli upload --repo-type model ${QUANT_MODEL_4BIT} ./${QUANT_MODEL_4BIT} .

huggingface-cli upload --repo-type model ${FUSED_MODEL} ./${FUSED_MODEL} .
huggingface-cli upload --repo-type model ${FUSED_GGUF_MODEL} ./${FUSED_GGUF_MODEL} .
8 changes: 7 additions & 1 deletion contrib/fine-tuning/validator.js
@@ -1,6 +1,12 @@
import { readFileSync } from "node:fs";
import { dirNameStr, getAllFiles } from "../../lib/helpers/utils.js";
const jsonlFiles = getAllFiles(dirNameStr, "**/*.jsonl");
let datasetDir = dirNameStr;
const argv = process.argv.slice(2);
if (argv.length) {
  datasetDir = argv[0];
}

const jsonlFiles = getAllFiles(datasetDir, "**/*.jsonl");
const failures = {};
for (const jf of jsonlFiles) {
  const failedLines = [];
