diff --git a/contrib/fine-tuning/.gitignore b/contrib/fine-tuning/.gitignore
new file mode 100644
index 000000000..0f86d212e
--- /dev/null
+++ b/contrib/fine-tuning/.gitignore
@@ -0,0 +1,6 @@
+dataset/
+prabhuat/
+HuggingFaceTB/*
+adapters/
+out.txt
+unsloth/
\ No newline at end of file
diff --git a/contrib/fine-tuning/README.md b/contrib/fine-tuning/README.md
index 8f4a42196..5302202a4 100644
--- a/contrib/fine-tuning/README.md
+++ b/contrib/fine-tuning/README.md
@@ -2,7 +2,7 @@
 
 This directory contains numerous knowledge files about CycloneDX and cdxgen in jsonlines chat format. The data is useful for training and fine-tuning (LoRA and QLoRA) LLM models.
 
-## Generation
+## Data Generation
 
-We used Google Gemini 2.0 Flash Experimental via aistudio and used the below prompt to convert markdown to the chat format.
+We used Google Gemini 2.0 Flash Experimental via AI Studio with the prompt below to convert markdown to the chat format.
 
@@ -12,7 +12,45 @@ you are an expert in converting markdown files to plain text jsonlines format ba
 
 The data was then validated and reviewed manually for accuracy.
 
-### Validating jsonlines syntax
+## Fine-tuning
+
+### mlx backend
+
+```shell
+bash fine-tune-mlx.sh
+```
+
+Testing with LM Studio:
+
+```shell
+cp -rf prabhuat ~/.lmstudio/models/
+lms ls
+lms server status
+lms load prabhuat/cdx1-mlx --exact --gpu max --identifier cdx1-test --context-length 8192
+```
+
+System prompt:
+
+```text
+You are cdxgen, an xBOM and CycloneDX expert.
+```
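+
+The loaded model can then be smoke-tested over LM Studio's OpenAI-compatible REST API. A minimal sketch, assuming the local server is running on its default port (1234) and the `cdx1-test` identifier from above:
+
+```shell
+curl http://localhost:1234/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "cdx1-test",
+    "messages": [
+      {"role": "system", "content": "You are cdxgen, an xBOM and CycloneDX expert."},
+      {"role": "user", "content": "Tell me about cdxgen"}
+    ],
+    "temperature": 0.05
+  }'
+```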
+
+### Validating jsonlines files
 
 ```shell
 node validator.js
@@ -30,6 +68,10 @@ node validator.js
 }
 ```
 
-## License
+## Datasets License
 
 CC-0
+
+## Models License
+
+Apache-2.0
diff --git a/contrib/fine-tuning/cdxgen-docs/readme.jsonl b/contrib/fine-tuning/cdxgen-docs/readme.jsonl
index 5ff144c92..945e5abc4 100644
--- a/contrib/fine-tuning/cdxgen-docs/readme.jsonl
+++ b/contrib/fine-tuning/cdxgen-docs/readme.jsonl
@@ -1,3 +1,9 @@
+{"messages": [{"role": "user", "content": "What is your name?"}, {"role": "assistant", "content": "Hi. I'm cdxgen, an xBOM and CycloneDX expert."}]}
+{"messages": [{"role": "user", "content": "Who are you?"}, {"role": "assistant", "content": "Hi. I'm cdxgen, an xBOM and CycloneDX expert."}]}
+{"messages": [{"role": "user", "content": "What can you do?"}, {"role": "assistant", "content": "I can answer questions about CycloneDX specifications and BOM formats, including SBOM, SaaSBOM, CBOM, and OBOM."}]}
+{"messages": [{"role": "user", "content": "Can you help me with my homework?"}, {"role": "assistant", "content": "Sorry, I can only help you with CycloneDX and xBOM related requests."}]}
+{"messages": [{"role": "user", "content": "Who owns cdxgen?"}, {"role": "assistant", "content": "cdxgen is an OWASP project."}]}
+{"messages": [{"role": "user", "content": "Can you tell jokes?"}, {"role": "assistant", "content": "Sorry, I can only help you with CycloneDX and xBOM related requests."}]}
 {"messages": [{"role": "user", "content": "How do I install cdxgen for generating SBOMs from git repositories?"}, {"role": "assistant", "content": "You can install cdxgen globally using npm with the command sudo npm install -g @cyclonedx/cdxgen. Alternatively, if you are a Homebrew user, you can install it via brew install cdxgen."}]}
 {"messages": [{"role": "user", "content": "What are the different ways to install cdxgen?"}, {"role": "assistant", "content": "cdxgen can be installed via npm, Homebrew (on macOS), or by downloading single application executables (SAE) for Windows and Linux from the GitHub Releases page. The SAE versions include both a full version (with optional binary plugins) and a slim version."}]}
 {"messages": [{"role": "user", "content": "How do I generate a basic SBOM for a project?"}, {"role": "assistant", "content": "Navigate to the root directory of your source code and run the command cdxgen -o bom.json. This will generate an SBOM named bom.json."}]}
diff --git a/contrib/fine-tuning/fine-tune-mlx.sh b/contrib/fine-tuning/fine-tune-mlx.sh
new file mode 100755
index 000000000..092187978
--- /dev/null
+++ b/contrib/fine-tuning/fine-tune-mlx.sh
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+set -e
+TUNING_TOOL=mlx
+BASE_MODEL=unsloth/phi-4
+BASE_MODEL_MLX=${BASE_MODEL}-${TUNING_TOOL}
+HF_ORG=prabhuat
+TOOL_BASE_MODEL=cdx1
+ITERATION=1000
+NUM_LAYERS=16
+
+FUSED_MODEL=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}
+QUANT_MODEL_8BIT=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}-8bit
+QUANT_MODEL_6BIT=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}-6bit
+QUANT_MODEL_4BIT=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}-4bit
+
+### mlx-lm needs train.jsonl and valid.jsonl
+rm -rf dataset prabhuat adapters
+mkdir -p dataset prabhuat
+
+node prepare.js dataset
+node validator.js dataset
+
+echo "Test base model"
+mlx_lm.generate --model ${BASE_MODEL} --prompt "Tell me about cdxgen" --temp 0.05
+
+# First convert the base model from HF to mlx format
+rm -rf ${BASE_MODEL} ${BASE_MODEL_MLX}
+mlx_lm.convert --hf-path ${BASE_MODEL} --mlx-path ${BASE_MODEL_MLX}
+
+echo "DoRA fine-tuning ${BASE_MODEL_MLX} with the cdx1 dataset"
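+# DoRA (weight-decomposed low-rank adaptation) is used as the fine-tune type below;
+# only the last ${NUM_LAYERS} layers are adapted, and --grad-checkpoint trades extra
+# compute for a lower peak memory footprint during training.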
+mlx_lm.lora --model ${BASE_MODEL_MLX} --train --data dataset --fine-tune-type dora --batch-size 1 --num-layers ${NUM_LAYERS} --iters ${ITERATION} --grad-checkpoint
+
+echo "Fuse model to ${FUSED_MODEL} using the cdx1 adapters"
+rm -rf ${FUSED_MODEL}
+mlx_lm.fuse --model ${BASE_MODEL_MLX} --adapter-path adapters --hf-path ${FUSED_MODEL} --save-path ${FUSED_MODEL}
+
+echo "Test fused model"
+mlx_lm.generate --model ${FUSED_MODEL} --prompt "Tell me about cdxgen" --temp 0.05
+
+echo "Create quantized models"
+rm -rf ${QUANT_MODEL_8BIT}
+mlx_lm.convert --hf-path ${FUSED_MODEL} --mlx-path ${QUANT_MODEL_8BIT} -q --q-bits 8 --dtype bfloat16
+echo "${QUANT_MODEL_8BIT} created successfully."
+rm -rf ${QUANT_MODEL_6BIT}
+mlx_lm.convert --hf-path ${FUSED_MODEL} --mlx-path ${QUANT_MODEL_6BIT} -q --q-bits 6 --dtype bfloat16
+echo "${QUANT_MODEL_6BIT} created successfully."
+rm -rf ${QUANT_MODEL_4BIT}
+mlx_lm.convert --hf-path ${FUSED_MODEL} --mlx-path ${QUANT_MODEL_4BIT} -q --q-bits 4 --dtype bfloat16
+echo "${QUANT_MODEL_4BIT} created successfully."
+
+rm -rf dataset adapters
diff --git a/contrib/fine-tuning/prepare.js b/contrib/fine-tuning/prepare.js
new file mode 100644
index 000000000..9fd184cae
--- /dev/null
+++ b/contrib/fine-tuning/prepare.js
@@ -0,0 +1,36 @@
+import { mkdirSync, readFileSync, writeFileSync } from "node:fs";
+import { join } from "node:path";
+import process from "node:process";
+
+import { dirNameStr, getAllFiles } from "../../lib/helpers/utils.js";
+// Collect every curated jsonl knowledge file, skipping previously generated splits.
+const jsonlFiles = getAllFiles(dirNameStr, "**/*.jsonl", {
+  exclude: ["**/{train,valid}.jsonl"],
+});
+let datasetDir = "dataset";
+const argv = process.argv.slice(2);
+if (argv.length > 0) {
+  datasetDir = argv[0];
+}
+
+const TRAIN_FILE = join(datasetDir, "train.jsonl");
+const VALID_FILE = join(datasetDir, "valid.jsonl");
+const trainData = [];
+const validData = [];
+
+for (const jf of jsonlFiles) {
+  const lines = readFileSync(jf, "utf-8").trim();
+  trainData.push(lines);
+  // The README, cdxgen docs, and CLI files double as the validation split.
+  if (jf.includes("readme") || jf.includes("cdxgen") || jf.includes("cli")) {
+    validData.push(lines);
+  }
+}
+
+mkdirSync(datasetDir, { recursive: true });
+if (trainData.length) {
+  writeFileSync(TRAIN_FILE, trainData.join("\n"));
+}
+if (validData.length) {
+  writeFileSync(VALID_FILE, validData.join("\n"));
+}
diff --git a/contrib/fine-tuning/validator.js b/contrib/fine-tuning/validator.js
index d9e1b09b4..031112666 100644
--- a/contrib/fine-tuning/validator.js
+++ b/contrib/fine-tuning/validator.js
@@ -1,6 +1,12 @@
 import { readFileSync } from "node:fs";
 import { dirNameStr, getAllFiles } from "../../lib/helpers/utils.js";
-const jsonlFiles = getAllFiles(dirNameStr, "**/*.jsonl");
+let datasetDir = dirNameStr;
+const argv = process.argv.slice(2);
+if (argv.length > 0) {
+  datasetDir = argv[0];
+}
+
+const jsonlFiles = getAllFiles(datasetDir, "**/*.jsonl");
 const failures = {};
 for (const jf of jsonlFiles) {
   const failedLines = [];