diff --git a/Cargo.lock b/Cargo.lock index 134036f6..7d449f0d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2071,9 +2071,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.21" +version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" +checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" [[package]] name = "macro_rules_attribute" diff --git a/docs/openapi.json b/docs/openapi.json index 7368145e..d2f36301 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -10,7 +10,7 @@ "name": "Apache 2.0", "url": "https://www.apache.org/licenses/LICENSE-2.0" }, - "version": "1.2.3" + "version": "1.3.0" }, "paths": { "/decode": { @@ -19,7 +19,6 @@ "Text Embeddings Inference" ], "summary": "Decode input ids", - "description": "Decode input ids", "operationId": "decode", "requestBody": { "content": { @@ -65,7 +64,6 @@ "Text Embeddings Inference" ], "summary": "Get Embeddings. Returns a 424 status code if the model is not an embedding model.", - "description": "Get Embeddings. Returns a 424 status code if the model is not an embedding model.", "operationId": "embed", "requestBody": { "content": { @@ -153,7 +151,7 @@ "Text Embeddings Inference" ], "summary": "Get all Embeddings without Pooling.", - "description": "Get all Embeddings without Pooling.\nReturns a 424 status code if the model is not an embedding model.", + "description": "Returns a 424 status code if the model is not an embedding model.", "operationId": "embed_all", "requestBody": { "content": { @@ -241,7 +239,6 @@ "Text Embeddings Inference" ], "summary": "Get Sparse Embeddings. Returns a 424 status code if the model is not an embedding model with SPLADE pooling.", - "description": "Get Sparse Embeddings. Returns a 424 status code if the model is not an embedding model with SPLADE pooling.", "operationId": "embed_sparse", "requestBody": { "content": { @@ -323,101 +320,12 @@ } } }, - "/embeddings": { - "post": { - "tags": [ - "Text Embeddings Inference" - ], - "summary": "OpenAI compatible route. Returns a 424 status code if the model is not an embedding model.", - "description": "OpenAI compatible route. 
Returns a 424 status code if the model is not an embedding model.", - "operationId": "openai_embed", - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/OpenAICompatRequest" - } - } - }, - "required": true - }, - "responses": { - "200": { - "description": "Embeddings", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/OpenAICompatResponse" - } - } - } - }, - "413": { - "description": "Batch size error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/OpenAICompatErrorResponse" - }, - "example": { - "message": "Batch size error", - "type": "validation" - } - } - } - }, - "422": { - "description": "Tokenization error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/OpenAICompatErrorResponse" - }, - "example": { - "message": "Tokenization error", - "type": "tokenizer" - } - } - } - }, - "424": { - "description": "Embedding Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/OpenAICompatErrorResponse" - }, - "example": { - "message": "Inference failed", - "type": "backend" - } - } - } - }, - "429": { - "description": "Model is overloaded", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/OpenAICompatErrorResponse" - }, - "example": { - "message": "Model is overloaded", - "type": "overloaded" - } - } - } - } - } - } - }, "/health": { "get": { "tags": [ "Text Embeddings Inference" ], "summary": "Health check method", - "description": "Health check method", "operationId": "health", "responses": { "200": { @@ -446,7 +354,6 @@ "Text Embeddings Inference" ], "summary": "Text Embeddings Inference endpoint info", - "description": "Text Embeddings Inference endpoint info", "operationId": "get_model_info", "responses": { "200": { @@ -468,7 +375,6 @@ "Text Embeddings Inference" ], "summary": "Prometheus metrics scrape endpoint", - "description": "Prometheus metrics scrape endpoint", "operationId": "metrics", "responses": { "200": { @@ -490,7 +396,6 @@ "Text Embeddings Inference" ], "summary": "Get Predictions. Returns a 424 status code if the model is not a Sequence Classification model", - "description": "Get Predictions. Returns a 424 status code if the model is not a Sequence Classification model", "operationId": "predict", "requestBody": { "content": { @@ -578,7 +483,7 @@ "Text Embeddings Inference" ], "summary": "Get Ranks. Returns a 424 status code if the model is not a Sequence Classification model with", - "description": "Get Ranks. Returns a 424 status code if the model is not a Sequence Classification model with\na single class.", + "description": "a single class.", "operationId": "rerank", "requestBody": { "content": { @@ -666,7 +571,6 @@ "Text Embeddings Inference" ], "summary": "Tokenize inputs", - "description": "Tokenize inputs", "operationId": "tokenize", "requestBody": { "content": { @@ -706,19 +610,18 @@ } } }, - "/vertex": { + "/v1/embeddings": { "post": { "tags": [ "Text Embeddings Inference" ], - "summary": "Generate embeddings from a Vertex request", - "description": "Generate embeddings from a Vertex request", - "operationId": "vertex_compatibility", + "summary": "OpenAI compatible route. 
Returns a 424 status code if the model is not an embedding model.", + "operationId": "openai_embed", "requestBody": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/VertexRequest" + "$ref": "#/components/schemas/OpenAICompatRequest" } } }, @@ -726,18 +629,25 @@ }, "responses": { "200": { - "description": "Results" + "description": "Embeddings", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/OpenAICompatResponse" + } + } + } }, "413": { "description": "Batch size error", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ErrorResponse" + "$ref": "#/components/schemas/OpenAICompatErrorResponse" }, "example": { - "error": "Batch size error", - "error_type": "validation" + "message": "Batch size error", + "type": "validation" } } } @@ -747,25 +657,25 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ErrorResponse" + "$ref": "#/components/schemas/OpenAICompatErrorResponse" }, "example": { - "error": "Tokenization error", - "error_type": "tokenizer" + "message": "Tokenization error", + "type": "tokenizer" } } } }, "424": { - "description": "Error", + "description": "Embedding Error", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ErrorResponse" + "$ref": "#/components/schemas/OpenAICompatErrorResponse" }, "example": { - "error": "Inference failed", - "error_type": "backend" + "message": "Inference failed", + "type": "backend" } } } @@ -775,11 +685,11 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ErrorResponse" + "$ref": "#/components/schemas/OpenAICompatErrorResponse" }, "example": { - "error": "Model is overloaded", - "error_type": "overloaded" + "message": "Model is overloaded", + "type": "overloaded" } } } @@ -852,10 +762,26 @@ "inputs": { "$ref": "#/components/schemas/Input" }, + "prompt_name": { + "type": "string", + "description": "The name of the prompt that should be used by for encoding. If not set, no prompt\nwill be applied.\n\nMust be a key in the `Sentence Transformers` configuration `prompts` dictionary.\n\nFor example if ``prompt_name`` is \"query\" and the ``prompts`` is {\"query\": \"query: \", ...},\nthen the sentence \"What is the capital of France?\" will be encoded as\n\"query: What is the capital of France?\" because the prompt text will be prepended before\nany text to encode.", + "default": "null", + "example": "null", + "nullable": true + }, "truncate": { "type": "boolean", "default": "false", - "example": "false" + "example": "false", + "nullable": true + }, + "truncation_direction": { + "allOf": [ + { + "$ref": "#/components/schemas/TruncationDirection" + } + ], + "default": "right" } } }, @@ -895,10 +821,26 @@ "default": "true", "example": "true" }, + "prompt_name": { + "type": "string", + "description": "The name of the prompt that should be used by for encoding. 
If not set, no prompt\nwill be applied.\n\nMust be a key in the `Sentence Transformers` configuration `prompts` dictionary.\n\nFor example if ``prompt_name`` is \"query\" and the ``prompts`` is {\"query\": \"query: \", ...},\nthen the sentence \"What is the capital of France?\" will be encoded as\n\"query: What is the capital of France?\" because the prompt text will be prepended before\nany text to encode.", + "default": "null", + "example": "null", + "nullable": true + }, "truncate": { "type": "boolean", "default": "false", - "example": "false" + "example": "false", + "nullable": true + }, + "truncation_direction": { + "allOf": [ + { + "$ref": "#/components/schemas/TruncationDirection" + } + ], + "default": "right" } } }, @@ -928,10 +870,26 @@ "inputs": { "$ref": "#/components/schemas/Input" }, + "prompt_name": { + "type": "string", + "description": "The name of the prompt that should be used for encoding. If not set, no prompt\nwill be applied.\n\nMust be a key in the `Sentence Transformers` configuration `prompts` dictionary.\n\nFor example if ``prompt_name`` is \"query\" and the ``prompts`` is {\"query\": \"query: \", ...},\nthen the sentence \"What is the capital of France?\" will be encoded as\n\"query: What is the capital of France?\" because the prompt text will be prepended before\nany text to encode.", + "default": "null", + "example": "null", + "nullable": true + }, "truncate": { "type": "boolean", "default": "false", - "example": "false" + "example": "false", + "nullable": true + }, + "truncation_direction": { + "allOf": [ + { + "$ref": "#/components/schemas/TruncationDirection" + } + ], + "default": "right" } } }, @@ -944,6 +902,20 @@ } } }, + "Embedding": { + "oneOf": [ + { + "type": "array", + "items": { + "type": "number", + "format": "float" + } + }, + { + "type": "string" + } + ] + }, "EmbeddingModel": { "type": "object", "required": [ @@ -956,6 +928,13 @@ } } }, + "EncodingFormat": { + "type": "string", + "enum": [ + "float", + "base64" + ] + }, "ErrorResponse": { "type": "object", "required": [ @@ -991,10 +970,14 @@ "max_input_length", "max_batch_tokens", "max_client_batch_size", + "auto_truncate", "tokenization_workers", "version" ], "properties": { + "auto_truncate": { + "type": "boolean" + }, "docker_label": { "type": "string", "example": "null", @@ -1065,12 +1048,12 @@ "Input": { "oneOf": [ { - "type": "string" + "$ref": "#/components/schemas/InputType" }, { "type": "array", "items": { - "type": "string" + "$ref": "#/components/schemas/InputType" } } ] }, @@ -1098,6 +1081,21 @@ } ] }, + "InputType": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "integer", + "format": "int32", + "minimum": 0 + } + } + ] + }, "ModelType": { "oneOf": [ { @@ -1144,16 +1142,7 @@ ], "properties": { "embedding": { - "type": "array", - "items": { - "type": "number", - "format": "float" - }, - "example": [ 0.0, 1.0, 2.0 ] + "$ref": "#/components/schemas/Embedding" }, "index": { "type": "integer", @@ -1193,6 +1182,14 @@ "input" ], "properties": { + "encoding_format": { + "allOf": [ + { + "$ref": "#/components/schemas/EncodingFormat" + } + ], + "default": "float" + }, "input": { "$ref": "#/components/schemas/Input" }, @@ -1317,7 +1314,16 @@ "truncate": { "type": "boolean", "default": "false", - "example": "false" + "example": "false", + "nullable": true + }, + "truncation_direction": { + "allOf": [ + { + "$ref": "#/components/schemas/TruncationDirection" + } + ], + "default": "right" } } }, @@ -1416,7 +1422,16 @@ "truncate": { "type": 
"boolean", "default": "false", - "example": "false" + "example": "false", + "nullable": true + }, + "truncation_direction": { + "allOf": [ + { + "$ref": "#/components/schemas/TruncationDirection" + } + ], + "default": "right" } } }, @@ -1479,6 +1494,19 @@ } } }, + "TokenizeInput": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, "TokenizeRequest": { "type": "object", "required": [ @@ -1491,7 +1519,14 @@ "example": "true" }, "inputs": { - "$ref": "#/components/schemas/Input" + "$ref": "#/components/schemas/TokenizeInput" + }, + "prompt_name": { + "type": "string", + "description": "The name of the prompt that should be used by for encoding. If not set, no prompt\nwill be applied.\n\nMust be a key in the `Sentence Transformers` configuration `prompts` dictionary.\n\nFor example if ``prompt_name`` is \"query\" and the ``prompts`` is {\"query\": \"query: \", ...},\nthen the sentence \"What is the capital of France?\" will be encoded as\n\"query: What is the capital of France?\" because the prompt text will be prepended before\nany text to encode.", + "default": "null", + "example": "null", + "nullable": true } } }, @@ -1515,273 +1550,12 @@ ] ] }, - "VertexInstance": { - "oneOf": [ - { - "allOf": [ - { - "$ref": "#/components/schemas/EmbedRequest" - }, - { - "type": "object", - "required": [ - "type" - ], - "properties": { - "type": { - "type": "string", - "enum": [ - "embed" - ] - } - } - } - ] - }, - { - "allOf": [ - { - "$ref": "#/components/schemas/EmbedAllRequest" - }, - { - "type": "object", - "required": [ - "type" - ], - "properties": { - "type": { - "type": "string", - "enum": [ - "embed_all" - ] - } - } - } - ] - }, - { - "allOf": [ - { - "$ref": "#/components/schemas/EmbedSparseRequest" - }, - { - "type": "object", - "required": [ - "type" - ], - "properties": { - "type": { - "type": "string", - "enum": [ - "embed_sparse" - ] - } - } - } - ] - }, - { - "allOf": [ - { - "$ref": "#/components/schemas/PredictRequest" - }, - { - "type": "object", - "required": [ - "type" - ], - "properties": { - "type": { - "type": "string", - "enum": [ - "predict" - ] - } - } - } - ] - }, - { - "allOf": [ - { - "$ref": "#/components/schemas/RerankRequest" - }, - { - "type": "object", - "required": [ - "type" - ], - "properties": { - "type": { - "type": "string", - "enum": [ - "rerank" - ] - } - } - } - ] - }, - { - "allOf": [ - { - "$ref": "#/components/schemas/TokenizeRequest" - }, - { - "type": "object", - "required": [ - "type" - ], - "properties": { - "type": { - "type": "string", - "enum": [ - "tokenize" - ] - } - } - } - ] - } - ], - "discriminator": { - "propertyName": "type" - } - }, - "VertexRequest": { - "type": "object", - "required": [ - "instances" - ], - "properties": { - "instances": { - "type": "array", - "items": { - "$ref": "#/components/schemas/VertexInstance" - } - } - } - }, - "VertexResponse": { - "type": "array", - "items": { - "$ref": "#/components/schemas/VertexResponseInstance" - } - }, - "VertexResponseInstance": { - "oneOf": [ - { - "type": "object", - "required": [ - "type", - "result" - ], - "properties": { - "result": { - "$ref": "#/components/schemas/EmbedResponse" - }, - "type": { - "type": "string", - "enum": [ - "embed" - ] - } - } - }, - { - "type": "object", - "required": [ - "type", - "result" - ], - "properties": { - "result": { - "$ref": "#/components/schemas/EmbedAllResponse" - }, - "type": { - "type": "string", - "enum": [ - "embed_all" - ] - } - } - }, - { - "type": "object", - "required": [ - 
"type", - "result" - ], - "properties": { - "result": { - "$ref": "#/components/schemas/EmbedSparseResponse" - }, - "type": { - "type": "string", - "enum": [ - "embed_sparse" - ] - } - } - }, - { - "type": "object", - "required": [ - "type", - "result" - ], - "properties": { - "result": { - "$ref": "#/components/schemas/PredictResponse" - }, - "type": { - "type": "string", - "enum": [ - "predict" - ] - } - } - }, - { - "type": "object", - "required": [ - "type", - "result" - ], - "properties": { - "result": { - "$ref": "#/components/schemas/RerankResponse" - }, - "type": { - "type": "string", - "enum": [ - "rerank" - ] - } - } - }, - { - "type": "object", - "required": [ - "type", - "result" - ], - "properties": { - "result": { - "$ref": "#/components/schemas/TokenizeResponse" - }, - "type": { - "type": "string", - "enum": [ - "tokenize" - ] - } - } - } - ], - "discriminator": { - "propertyName": "type" - } + "TruncationDirection": { + "type": "string", + "enum": [ + "Left", + "Right" + ] } } }, diff --git a/router/src/http/server.rs b/router/src/http/server.rs index 49e6029a..17baada6 100644 --- a/router/src/http/server.rs +++ b/router/src/http/server.rs @@ -5,7 +5,8 @@ use crate::http::types::{ OpenAICompatEmbedding, OpenAICompatErrorResponse, OpenAICompatRequest, OpenAICompatResponse, OpenAICompatUsage, PredictInput, PredictRequest, PredictResponse, Prediction, Rank, RerankRequest, RerankResponse, Sequence, SimpleToken, SparseValue, TokenizeInput, - TokenizeRequest, TokenizeResponse, VertexPrediction, VertexRequest, VertexResponse, + TokenizeRequest, TokenizeResponse, TruncationDirection, VertexPrediction, VertexRequest, + VertexResponse, }; use crate::{ shutdown, ClassifierModel, EmbeddingModel, ErrorResponse, ErrorType, Info, ModelType, @@ -32,7 +33,6 @@ use text_embeddings_core::infer::{ AllEmbeddingsInferResponse, Infer, InferMetadata, PooledEmbeddingsInferResponse, }; use text_embeddings_core::TextEmbeddingsError; -use tokenizers::TruncationDirection; use tokio::sync::OwnedSemaphorePermit; use tower_http::cors::{AllowOrigin, CorsLayer}; use tracing::instrument; @@ -118,7 +118,7 @@ async fn predict( .predict( inputs, truncate, - req.truncation_direction, + req.truncation_direction.into(), req.raw_scores, permit, ) @@ -335,7 +335,7 @@ async fn rerank( .predict( (query, text), truncate, - req.truncation_direction, + req.truncation_direction.into(), req.raw_scores, permit, ) @@ -499,7 +499,7 @@ async fn embed( .embed_pooled( input, truncate, - req.truncation_direction, + req.truncation_direction.into(), req.prompt_name, req.normalize, permit, @@ -568,7 +568,7 @@ async fn embed( .embed_pooled( input, truncate, - req.truncation_direction, + req.truncation_direction.into(), prompt_name, req.normalize, permit, @@ -677,7 +677,7 @@ async fn embed_sparse( .embed_sparse( input, truncate, - req.truncation_direction, + req.truncation_direction.into(), req.prompt_name, permit, ) @@ -745,7 +745,7 @@ async fn embed_sparse( .embed_sparse( input, truncate, - req.truncation_direction, + req.truncation_direction.into(), prompt_name, permit, ) @@ -846,7 +846,7 @@ async fn embed_all( .embed_all( input, truncate, - req.truncation_direction, + req.truncation_direction.into(), req.prompt_name, permit, ) @@ -914,7 +914,7 @@ async fn embed_all( .embed_all( input, truncate, - req.truncation_direction, + req.truncation_direction.into(), prompt_name, permit, ) @@ -1029,7 +1029,7 @@ async fn openai_embed( .embed_pooled( input, truncate, - TruncationDirection::Right, + tokenizers::TruncationDirection::Right, None, 
true, permit, @@ -1102,7 +1102,7 @@ async fn openai_embed( .embed_pooled( input, truncate, - TruncationDirection::Right, + tokenizers::TruncationDirection::Right, None, true, permit, @@ -1483,6 +1483,8 @@ pub async fn run( Info, ModelType, ClassifierModel, + Embedding, + EncodingFormat, EmbeddingModel, PredictRequest, Prediction, @@ -1506,6 +1508,7 @@ pub async fn run( TokenizeInput, TokenizeRequest, TokenizeResponse, + TruncationDirection, SimpleToken, InputType, InputIds, diff --git a/router/src/http/types.rs b/router/src/http/types.rs index a47a995b..4414ecb4 100644 --- a/router/src/http/types.rs +++ b/router/src/http/types.rs @@ -4,7 +4,6 @@ use serde::{de, Deserialize, Deserializer, Serialize}; use serde_json::json; use std::fmt::Formatter; use text_embeddings_core::tokenization::EncodingInput; -use tokenizers::TruncationDirection; use utoipa::openapi::{RefOr, Schema}; use utoipa::ToSchema; @@ -194,6 +193,22 @@ impl<'__s> ToSchema<'__s> for PredictInput { } } +#[derive(Debug, Clone, Copy, PartialEq, Deserialize, ToSchema, Eq, Default)] +pub(crate) enum TruncationDirection { + Left, + #[default] + Right, +} + +impl From<TruncationDirection> for tokenizers::TruncationDirection { + fn from(value: TruncationDirection) -> Self { + match value { + TruncationDirection::Left => Self::Left, + TruncationDirection::Right => Self::Right, + } + } +} + #[derive(Deserialize, ToSchema)] pub(crate) struct PredictRequest { pub inputs: PredictInput, @@ -262,6 +277,7 @@ pub(crate) enum InputType { String(String), Ids(Vec<u32>), } + impl InputType { pub(crate) fn count_chars(&self) -> usize { match self { @@ -270,6 +286,7 @@ impl InputType { } } } + impl From<InputType> for EncodingInput { fn from(value: InputType) -> Self { match value { @@ -278,6 +295,7 @@ impl From<InputType> for EncodingInput { } } } + #[derive(Deserialize, ToSchema)] #[serde(untagged)] pub(crate) enum Input { @@ -351,6 +369,15 @@ pub(crate) struct EmbedRequest { #[serde(default)] #[schema(default = "right", example = "right")] pub truncation_direction: TruncationDirection, + /// The name of the prompt that should be used for encoding. If not set, no prompt + /// will be applied. + /// + /// Must be a key in the `Sentence Transformers` configuration `prompts` dictionary. + /// + /// For example if ``prompt_name`` is "query" and the ``prompts`` is {"query": "query: ", ...}, + /// then the sentence "What is the capital of France?" will be encoded as + /// "query: What is the capital of France?" because the prompt text will be prepended before + /// any text to encode. #[schema(default = "null", example = "null", nullable = true)] pub prompt_name: Option<String>, #[serde(default = "default_normalize")] @@ -375,6 +402,15 @@ pub(crate) struct EmbedSparseRequest { #[serde(default)] #[schema(default = "right", example = "right")] pub truncation_direction: TruncationDirection, + /// The name of the prompt that should be used for encoding. If not set, no prompt + /// will be applied. + /// + /// Must be a key in the `Sentence Transformers` configuration `prompts` dictionary. + /// + /// For example if ``prompt_name`` is "query" and the ``prompts`` is {"query": "query: ", ...}, + /// then the sentence "What is the capital of France?" will be encoded as + /// "query: What is the capital of France?" because the prompt text will be prepended before + /// any text to encode. 
#[schema(default = "null", example = "null", nullable = true)] pub prompt_name: Option, } @@ -397,6 +433,15 @@ pub(crate) struct EmbedAllRequest { #[serde(default)] #[schema(default = "right", example = "right")] pub truncation_direction: TruncationDirection, + /// The name of the prompt that should be used by for encoding. If not set, no prompt + /// will be applied. + /// + /// Must be a key in the `Sentence Transformers` configuration `prompts` dictionary. + /// + /// For example if ``prompt_name`` is "query" and the ``prompts`` is {"query": "query: ", ...}, + /// then the sentence "What is the capital of France?" will be encoded as + /// "query: What is the capital of France?" because the prompt text will be prepended before + /// any text to encode. #[schema(default = "null", example = "null", nullable = true)] pub prompt_name: Option, } @@ -426,6 +471,15 @@ pub(crate) struct TokenizeRequest { #[serde(default = "default_add_special_tokens")] #[schema(default = "true", example = "true")] pub add_special_tokens: bool, + /// The name of the prompt that should be used by for encoding. If not set, no prompt + /// will be applied. + /// + /// Must be a key in the `Sentence Transformers` configuration `prompts` dictionary. + /// + /// For example if ``prompt_name`` is "query" and the ``prompts`` is {"query": "query: ", ...}, + /// then the sentence "What is the capital of France?" will be encoded as + /// "query: What is the capital of France?" because the prompt text will be prepended before + /// any text to encode. #[schema(default = "null", example = "null", nullable = true)] pub prompt_name: Option, } diff --git a/router/src/lib.rs b/router/src/lib.rs index 3be03190..f5fd102c 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -238,11 +238,13 @@ pub async fn run( .await .context("Model backend is not healthy")?; - tracing::info!("Warming up model"); - backend - .warmup(max_input_length, max_batch_tokens, max_batch_requests) - .await - .context("Model backend is not healthy")?; + if !backend.padded_model { + tracing::info!("Warming up model"); + backend + .warmup(max_input_length, max_batch_tokens, max_batch_requests) + .await + .context("Model backend is not healthy")?; + } let max_batch_requests = backend .max_batch_size