diff --git a/workflows/Cargo.toml b/workflows/Cargo.toml
index 27a5873..97d1409 100644
--- a/workflows/Cargo.toml
+++ b/workflows/Cargo.toml
@@ -30,8 +30,10 @@ rand.workspace = true
 log.workspace = true
 eyre.workspace = true
 
-# system info
-sysinfo = "0.32.0"
+sysinfo = { version = "0.32.0", optional = true }
+
+[features]
+profiling = ["sysinfo"]
 
 [dev-dependencies]
 # only used for tests
diff --git a/workflows/src/bin/tps.rs b/workflows/src/bin/tps.rs
index ad0cbf7..6782caf 100644
--- a/workflows/src/bin/tps.rs
+++ b/workflows/src/bin/tps.rs
@@ -1,69 +1,116 @@
+use std::vec;
+
 use dkn_workflows::{DriaWorkflowsConfig, OllamaConfig};
 use ollama_workflows::ollama_rs::{
     generation::{completion::request::GenerationRequest, options::GenerationOptions},
     Ollama,
 };
-use sysinfo::{CpuRefreshKind, RefreshKind, System};
+use ollama_workflows::Model;
+
+#[cfg(feature = "profiling")]
+use sysinfo::{CpuRefreshKind, RefreshKind, System, MINIMUM_CPU_UPDATE_INTERVAL};
 
 #[tokio::main]
 async fn main() {
-    // initialize logger
-    env_logger::init();
+    #[cfg(feature = "profiling")]
+    {
+        // initialize logger
+        env_logger::init();
 
-    let cfg = DriaWorkflowsConfig::new_from_csv("finalend/hermes-3-llama-3.1:8b-q8_0,phi3:14b-medium-4k-instruct-q4_1,phi3:14b-medium-128k-instruct-q4_1,phi3.5:3.8b,phi3.5:3.8b-mini-instruct-fp16,gemma2:9b-instruct-q8_0,gemma2:9b-instruct-fp16,llama3.1:latest,llama3.1:8b-instruct-q8_0,llama3.1:8b-instruct-fp16,llama3.1:70b-instruct-q4_0,llama3.1:70b-instruct-q8_0,llama3.2:1b,llama3.2:3b,qwen2.5:7b-instruct-q5_0,qwen2.5:7b-instruct-fp16,qwen2.5:32b-instruct-fp16,qwen2.5-coder:1.5b,qwen2.5-coder:7b-instruct,llama3.2:3b,qwen2.5-coder:7b-instruct-q8_0,qwen2.5-coder:7b-instruct-fp16,deepseek-coder:6.7b,mixtral:8x7b");
-    let config = OllamaConfig::default();
-    let ollama = Ollama::new(config.host, config.port);
+        let models = vec![
+            Model::NousTheta,
+            Model::Phi3Medium,
+            Model::Phi3Medium128k,
+            Model::Phi3_5Mini,
+            Model::Phi3_5MiniFp16,
+            Model::Gemma2_9B,
+            Model::Gemma2_9BFp16,
+            Model::Llama3_1_8B,
+            Model::Llama3_1_8Bq8,
+            Model::Llama3_1_8Bf16,
+            Model::Llama3_1_8BTextQ4KM,
+            Model::Llama3_1_8BTextQ8,
+            Model::Llama3_1_70B,
+            Model::Llama3_1_70Bq8,
+            Model::Llama3_1_70BTextQ4KM,
+            Model::Llama3_2_1B,
+            Model::Llama3_2_3B,
+            Model::Llama3_2_1BTextQ4KM,
+            Model::Qwen2_5_7B,
+            Model::Qwen2_5_7Bf16,
+            Model::Qwen2_5_32Bf16,
+            Model::Qwen2_5Coder1_5B,
+            Model::Qwen2_5coder7B,
+            Model::Qwen2_5oder7Bq8,
+            Model::Qwen2_5coder7Bf16,
+            Model::DeepSeekCoder6_7B,
+            Model::Mixtral8_7b,
+            Model::GPT4Turbo,
+            Model::GPT4o,
+            Model::GPT4oMini,
+            Model::O1Preview,
+            Model::O1Mini,
+            Model::Gemini15ProExp0827,
+            Model::Gemini15Pro,
+            Model::Gemini15Flash,
+            Model::Gemini10Pro,
+            Model::Gemma2_2bIt,
+            Model::Gemma2_27bIt,
+        ];
 
-    log::info!("Starting...");
-    // ensure that all lists of CPUs and processes are filled
-    let mut system = System::new_all();
-    // update all information of the system
-    system.refresh_all();
+        let cfg = DriaWorkflowsConfig::new(models);
+        let config = OllamaConfig::default();
+        let ollama = Ollama::new(config.host, config.port);
+        log::info!("Starting...");
+        // ensure that all lists of CPUs and processes are filled
+        let mut system = System::new_all();
+        // update all information of the system
+        system.refresh_all();
 
-    log::debug!("Getting system information...");
-    let brand = system.cpus()[0].brand().to_string();
-    let os_name = System::name().unwrap_or_else(|| "Unknown".to_string());
-    let os_version = System::long_os_version().unwrap_or_else(|| "Unknown".to_string());
-    let cpu_usage = system.global_cpu_usage();
-    let total_memory = system.total_memory();
-    let used_memory = system.used_memory();
+        log::debug!("Getting system information...");
+        let brand = system.cpus()[0].brand().to_string();
+        let os_name = System::name().unwrap_or_else(|| "Unknown".to_string());
+        let os_version = System::long_os_version().unwrap_or_else(|| "Unknown".to_string());
+        let cpu_usage = system.global_cpu_usage();
+        let total_memory = system.total_memory();
+        let used_memory = system.used_memory();
 
-    for (_, model) in cfg.models {
-        log::info!("Pulling model: {}", model);
+        for (_, model) in cfg.models {
+            log::info!("Pulling model: {}", model);
 
-        // pull model
-        match ollama.pull_model(model.to_string(), false).await {
-            Ok(status) => log::info!("Status: {}", status.message),
-            Err(err) => {
-                log::error!("Failed to pull model {}: {:?}", model, err);
+            // pull model
+            match ollama.pull_model(model.to_string(), false).await {
+                Ok(status) => log::info!("Status: {}", status.message),
+                Err(err) => {
+                    log::error!("Failed to pull model {}: {:?}", model, err);
+                }
             }
-        }
 
-        log::debug!("Creating request...");
-        // create dummy request
-        let mut generation_request =
-            GenerationRequest::new(model.to_string(), "compute 6780 * 1200".to_string());
+            log::debug!("Creating request...");
+            // create dummy request
+            let mut generation_request =
+                GenerationRequest::new(model.to_string(), "compute 6780 * 1200".to_string());
 
-        if let Ok(num_thread) = std::env::var("OLLAMA_NUM_THREAD") {
-            generation_request = generation_request.options(
-                GenerationOptions::default().num_thread(
-                    num_thread
-                        .parse()
-                        .expect("num threads should be a positive integer"),
-                ),
-            );
-        }
+            if let Ok(num_thread) = std::env::var("OLLAMA_NUM_THREAD") {
+                generation_request = generation_request.options(
+                    GenerationOptions::default().num_thread(
+                        num_thread
+                            .parse()
+                            .expect("num threads should be a positive integer"),
+                    ),
+                );
+            }
 
-        // generate response
-        match ollama.generate(generation_request).await {
-            Ok(response) => {
-                log::debug!("Got response for model {}", model);
-                // compute TPS
-                let tps = (response.eval_count.unwrap_or_default() as f64)
-                    / (response.eval_duration.unwrap_or(1) as f64)
-                    * 1_000_000_000f64;
-                // report machine info
-                log::info!(
+            // generate response
+            match ollama.generate(generation_request).await {
+                Ok(response) => {
+                    log::debug!("Got response for model {}", model);
+                    // compute TPS
+                    let tps = (response.eval_count.unwrap_or_default() as f64)
+                        / (response.eval_duration.unwrap_or(1) as f64)
+                        * 1_000_000_000f64;
+                    // report machine info
+                    log::info!(
 "\n Model: {} \n TPS: {} \n OS: {} {} \n Version: {} \n CPU Usage: % {} \n Total Memory: {} KB \n Used Memory: {} KB ",
                 model,
                 tps,
@@ -74,18 +121,20 @@ async fn main() {
                 total_memory,
                 used_memory,
             );
+                }
+                Err(e) => {
+                    log::warn!("Ignoring model {}: Workflow failed with error {}", model, e);
+                }
             }
-            Err(e) => {
-                log::warn!("Ignoring model {}: Workflow failed with error {}", model, e);
-            }
+            // refresh CPU usage (https://docs.rs/sysinfo/latest/sysinfo/struct.Cpu.html#method.cpu_usage)
+            system = System::new_with_specifics(
+                RefreshKind::new().with_cpu(CpuRefreshKind::everything()),
+            );
+            // wait a bit because CPU usage is based on diff
+            std::thread::sleep(MINIMUM_CPU_UPDATE_INTERVAL);
+            // refresh CPUs again to get actual value
+            system.refresh_cpu_usage();
         }
-        // refresh CPU usage (https://docs.rs/sysinfo/latest/sysinfo/struct.Cpu.html#method.cpu_usage)
-        system =
-            System::new_with_specifics(RefreshKind::new().with_cpu(CpuRefreshKind::everything()));
-        // wait a bit because CPU usage is based on diff
-        std::thread::sleep(sysinfo::MINIMUM_CPU_UPDATE_INTERVAL);
-        // refresh CPUs again to get actual value
-        system.refresh_cpu_usage();
     }
     log::info!("Finished");
 }
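
The TPS figure logged above reduces to a small piece of arithmetic: tokens generated divided by evaluation time, scaled from nanoseconds to seconds. A minimal sketch, assuming ollama-rs keeps reporting eval_count (token count) and eval_duration (nanoseconds) as Option values with the same unwrap defaults as the diff; tokens_per_second is a hypothetical helper for illustration, not part of the binary:

    // Hypothetical helper mirroring the TPS computation in tps.rs.
    fn tokens_per_second(eval_count: Option<u64>, eval_duration_ns: Option<u64>) -> f64 {
        // default to 0 tokens and 1 ns (avoiding division by zero), as the binary does
        (eval_count.unwrap_or_default() as f64) / (eval_duration_ns.unwrap_or(1) as f64)
            * 1_000_000_000f64
    }

    fn main() {
        // e.g. 500 tokens over 10 s (10_000_000_000 ns) comes out to 50 TPS
        assert_eq!(tokens_per_second(Some(500), Some(10_000_000_000)), 50.0);
    }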
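
Since sysinfo is now an optional dependency behind the new profiling feature and the whole body of main is gated on cfg(feature = "profiling"), a plain cargo build compiles the tps binary to a no-op and keeps sysinfo out of the dependency tree. Assuming no default features are introduced elsewhere in the manifest, a profiling run would be invoked as:

    cargo run --release --bin tps --features profiling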