From 0fc4c01a220bf7081d0115c76ea3dd9eeef4f2f5 Mon Sep 17 00:00:00 2001 From: Xinhe Wang Date: Sat, 6 Dec 2025 00:31:42 +0000 Subject: [PATCH 1/2] feat(embed_text): add `expected_output_dimension` parameter Separate API parameter (output_dimension) from validation dimension (expected_output_dimension) for clearer semantics and better flexibility. Maintains backward compatibility with fallback to output_dimension when expected_output_dimension is not specified. --- docs/docs/ops/functions.md | 7 ++++--- .../functions/_engine_builtin_specs.py | 1 + rust/cocoindex/src/ops/functions/embed_text.rs | 17 ++++++++--------- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/docs/docs/ops/functions.md b/docs/docs/ops/functions.md index d9314fa6..631b6e58 100644 --- a/docs/docs/ops/functions.md +++ b/docs/docs/ops/functions.md @@ -189,10 +189,11 @@ The spec takes the following fields: * `api_type` ([`cocoindex.LlmApiType`](/docs/ai/llm#llm-api-types)): The type of LLM API to use for embedding. * `model` (`str`): The name of the embedding model to use. * `address` (`str`, optional): The address of the LLM API. If not specified, uses the default address for the API type. -* `output_dimension` (`int`, optional): The expected dimension of the output embedding vector. If not specified, use the default dimension of the model. +* `output_dimension` (`int`, optional): The dimension to request from the embedding API. Some APIs support specifying the output dimension (e.g., OpenAI's models support dimension reduction). If not specified, the API will use its default dimension. +* `expected_output_dimension` (`int`, optional): The expected dimension of the output embedding vector for validation and type schema. If not specified, falls back to `output_dimension`, then to the default dimension of the model. - For most API types, the function internally keeps a registry for the default output dimension of known model. - You need to explicitly specify the `output_dimension` if you want to use a new model that is not in the registry yet. + For most API types, the function internally keeps a registry for the default output dimension of known models. + You need to explicitly specify `expected_output_dimension` (or `output_dimension`) if you want to use a new model that is not in the registry yet. * `task_type` (`str`, optional): The task type for embedding, used by some embedding models to optimize the embedding for specific use cases. diff --git a/python/cocoindex/functions/_engine_builtin_specs.py b/python/cocoindex/functions/_engine_builtin_specs.py index ee52c0d1..34673855 100644 --- a/python/cocoindex/functions/_engine_builtin_specs.py +++ b/python/cocoindex/functions/_engine_builtin_specs.py @@ -55,6 +55,7 @@ class EmbedText(op.FunctionSpec): model: str address: str | None = None output_dimension: int | None = None + expected_output_dimension: int | None = None task_type: str | None = None api_config: llm.VertexAiConfig | None = None api_key: TransientAuthEntryReference[str] | None = None diff --git a/rust/cocoindex/src/ops/functions/embed_text.rs b/rust/cocoindex/src/ops/functions/embed_text.rs index ac119ee4..23c48ac3 100644 --- a/rust/cocoindex/src/ops/functions/embed_text.rs +++ b/rust/cocoindex/src/ops/functions/embed_text.rs @@ -12,6 +12,7 @@ struct Spec { address: Option, api_config: Option, output_dimension: Option, + expected_output_dimension: Option, task_type: Option, api_key: Option>, } @@ -129,15 +130,12 @@ impl SimpleFunctionFactoryBase for Factory { spec.api_config.clone(), ) .await?; - let output_dimension = match spec.output_dimension { - Some(output_dimension) => output_dimension, - None => { - client.get_default_embedding_dimension(spec.model.as_str()) - .ok_or_else(|| api_error!("model \"{}\" is unknown for {:?}, needs to specify `output_dimension` explicitly", spec.model, spec.api_type))? - } - }; + let expected_output_dimension = spec.expected_output_dimension + .or(spec.output_dimension) + .or_else(|| client.get_default_embedding_dimension(spec.model.as_str())) + .ok_or_else(|| api_error!("model \"{}\" is unknown for {:?}, needs to specify `expected_output_dimension` (or `output_dimension`) explicitly", spec.model, spec.api_type))? as usize; let output_schema = make_output_type(BasicValueType::Vector(VectorTypeSchema { - dimension: Some(output_dimension as usize), + dimension: Some(expected_output_dimension), element_type: Box::new(BasicValueType::Float32), })); Ok(SimpleFunctionAnalysisOutput { @@ -145,7 +143,7 @@ impl SimpleFunctionFactoryBase for Factory { resolved_args: Args { client, text, - expected_output_dimension: output_dimension as usize, + expected_output_dimension, }, output_schema, }) @@ -179,6 +177,7 @@ mod tests { address: None, api_config: None, output_dimension: None, + expected_output_dimension: None, task_type: None, api_key: None, }; From 5ff9e54f0dbeafb06c1f72300508abba4e7a49bc Mon Sep 17 00:00:00 2001 From: Xinhe Wang Date: Sat, 6 Dec 2025 01:06:00 +0000 Subject: [PATCH 2/2] feat(embed_text): warn on dimension parameter mismatch Add warning when output_dimension and expected_output_dimension are both specified with different values, explaining their respective uses in schema definition vs API request. --- rust/cocoindex/src/ops/functions/embed_text.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/rust/cocoindex/src/ops/functions/embed_text.rs b/rust/cocoindex/src/ops/functions/embed_text.rs index 23c48ac3..2d83783a 100644 --- a/rust/cocoindex/src/ops/functions/embed_text.rs +++ b/rust/cocoindex/src/ops/functions/embed_text.rs @@ -130,6 +130,19 @@ impl SimpleFunctionFactoryBase for Factory { spec.api_config.clone(), ) .await?; + + // Warn if both parameters are specified but have different values + if let (Some(expected), Some(output)) = + (spec.expected_output_dimension, spec.output_dimension) + { + if expected != output { + warn!( + "Both `expected_output_dimension` ({expected}) and `output_dimension` ({output}) are specified but have different values. \ + `expected_output_dimension` will be used for output schema and validation, while `output_dimension` will be sent to the embedding API." + ); + } + } + let expected_output_dimension = spec.expected_output_dimension .or(spec.output_dimension) .or_else(|| client.get_default_embedding_dimension(spec.model.as_str()))