Skip to content

Commit 03df311

Browse files
committed
feat: Add Jina as a selectable embeddings provider with reranking support
This commit adds Jina AI as a new embeddings provider option with the following features:

- Added JinaEmbeddingProvider implementing the EmbeddingProvider trait
- Support for the Jina embeddings API (jina-embeddings-v4 model)
- Configurable task parameter (default: code.query for code embeddings)
- Late chunking support enabled by default for improved accuracy
- Integrated reranking functionality using the Jina reranker-v3 model
- Reranking is enabled by default when the Jina provider is selected
- Added Jina variant to the EmbeddingProvider enum
- Created JinaEmbeddingConfig struct with all necessary configuration options
- Added 'jina' feature flag to Cargo.toml
- Registered jina_provider module in lib.rs
- Updated example_embedding.toml with a Jina configuration example

The implementation follows the same pattern as the existing providers (OpenAI, Ollama), with retry logic, batch processing, and proper error handling.
1 parent 4e92c73 commit 03df311

File tree

5 files changed

+645
-0
lines changed

5 files changed

+645
-0
lines changed

config/example_embedding.toml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,26 @@ api_base = "https://api.openai.com/v1"
1414
max_retries = 3
1515
timeout_secs = 30
1616

17+
# Example Jina configuration (uncomment to use)
18+
# [embedding]
19+
# provider = "jina"
20+
# dimension = 1024
21+
# cache_enabled = true
22+
# cache_ttl_secs = 3600
23+
# normalize_embeddings = true
24+
#
25+
# [embedding.jina]
26+
# model = "jina-embeddings-v4"
27+
# api_key_env = "JINA_API_KEY"
28+
# api_base = "https://api.jina.ai/v1"
29+
# max_retries = 3
30+
# timeout_secs = 30
31+
# task = "code.query"
32+
# late_chunking = true
33+
# enable_reranking = true
34+
# reranking_model = "jina-reranker-v3"
35+
# reranking_top_n = 10
36+
1737
[performance]
1838
mode = "balanced"
1939

crates/codegraph-core/src/embedding_config.rs

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ pub enum EmbeddingProvider {
1010
Local,
1111
Cohere,
1212
HuggingFace,
13+
Jina,
1314
Custom(String),
1415
}
1516

@@ -95,6 +96,79 @@ impl Default for LocalEmbeddingConfig {
9596
}
9697
}
9798

99+
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
100+
pub struct JinaEmbeddingConfig {
101+
pub model: String,
102+
pub api_key_env: String,
103+
#[serde(default = "JinaEmbeddingConfig::default_api_base")]
104+
pub api_base: String,
105+
#[serde(default = "JinaEmbeddingConfig::default_max_retries")]
106+
pub max_retries: u32,
107+
#[serde(default = "JinaEmbeddingConfig::default_timeout_secs")]
108+
pub timeout_secs: u64,
109+
#[serde(default = "JinaEmbeddingConfig::default_task")]
110+
pub task: String,
111+
#[serde(default = "JinaEmbeddingConfig::default_late_chunking")]
112+
pub late_chunking: bool,
113+
#[serde(default = "JinaEmbeddingConfig::default_enable_reranking")]
114+
pub enable_reranking: bool,
115+
#[serde(default = "JinaEmbeddingConfig::default_reranking_model")]
116+
pub reranking_model: String,
117+
#[serde(default = "JinaEmbeddingConfig::default_reranking_top_n")]
118+
pub reranking_top_n: usize,
119+
}
120+
121+
impl JinaEmbeddingConfig {
122+
fn default_api_base() -> String {
123+
"https://api.jina.ai/v1".to_string()
124+
}
125+
126+
fn default_max_retries() -> u32 {
127+
3
128+
}
129+
130+
fn default_timeout_secs() -> u64 {
131+
30
132+
}
133+
134+
fn default_task() -> String {
135+
"code.query".to_string()
136+
}
137+
138+
fn default_late_chunking() -> bool {
139+
true
140+
}
141+
142+
fn default_enable_reranking() -> bool {
143+
true
144+
}
145+
146+
fn default_reranking_model() -> String {
147+
"jina-reranker-v3".to_string()
148+
}
149+
150+
fn default_reranking_top_n() -> usize {
151+
10
152+
}
153+
}
154+
155+
impl Default for JinaEmbeddingConfig {
156+
fn default() -> Self {
157+
Self {
158+
model: "jina-embeddings-v4".to_string(),
159+
api_key_env: "JINA_API_KEY".to_string(),
160+
api_base: Self::default_api_base(),
161+
max_retries: Self::default_max_retries(),
162+
timeout_secs: Self::default_timeout_secs(),
163+
task: Self::default_task(),
164+
late_chunking: Self::default_late_chunking(),
165+
enable_reranking: Self::default_enable_reranking(),
166+
reranking_model: Self::default_reranking_model(),
167+
reranking_top_n: Self::default_reranking_top_n(),
168+
}
169+
}
170+
}
171+
98172
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
99173
pub struct EmbeddingModelConfig {
100174
#[serde(default)]
@@ -109,6 +183,9 @@ pub struct EmbeddingModelConfig {
109183
#[serde(default)]
110184
pub local: Option<LocalEmbeddingConfig>,
111185

186+
#[serde(default)]
187+
pub jina: Option<JinaEmbeddingConfig>,
188+
112189
#[serde(default)]
113190
pub custom_config: HashMap<String, serde_json::Value>,
114191

@@ -175,6 +252,22 @@ impl EmbeddingModelConfig {
175252
anyhow::ensure!(config.batch_size > 0, "Batch size must be greater than 0");
176253
}
177254
}
255+
EmbeddingProvider::Jina => {
256+
anyhow::ensure!(
257+
self.jina.is_some(),
258+
"Jina configuration required when using Jina provider"
259+
);
260+
if let Some(config) = &self.jina {
261+
anyhow::ensure!(
262+
!config.model.is_empty(),
263+
"Jina model name cannot be empty"
264+
);
265+
anyhow::ensure!(
266+
!config.api_key_env.is_empty(),
267+
"Jina API key environment variable name cannot be empty"
268+
);
269+
}
270+
}
178271
_ => {}
179272
}
180273

@@ -190,6 +283,7 @@ impl EmbeddingModelConfig {
190283
..Default::default()
191284
}),
192285
local: None,
286+
jina: None,
193287
custom_config: HashMap::new(),
194288
cache_enabled: Self::default_cache_enabled(),
195289
cache_ttl_secs: Self::default_cache_ttl_secs(),
@@ -206,6 +300,24 @@ impl EmbeddingModelConfig {
206300
model_path: model_path.to_string(),
207301
..Default::default()
208302
}),
303+
jina: None,
304+
custom_config: HashMap::new(),
305+
cache_enabled: Self::default_cache_enabled(),
306+
cache_ttl_secs: Self::default_cache_ttl_secs(),
307+
normalize_embeddings: Self::default_normalize_embeddings(),
308+
}
309+
}
310+
311+
pub fn for_jina(model: &str, dimension: usize) -> Self {
312+
Self {
313+
provider: EmbeddingProvider::Jina,
314+
dimension,
315+
openai: None,
316+
local: None,
317+
jina: Some(JinaEmbeddingConfig {
318+
model: model.to_string(),
319+
..Default::default()
320+
}),
209321
custom_config: HashMap::new(),
210322
cache_enabled: Self::default_cache_enabled(),
211323
cache_ttl_secs: Self::default_cache_ttl_secs(),
@@ -221,6 +333,7 @@ impl Default for EmbeddingModelConfig {
221333
dimension: Self::default_dimension(),
222334
openai: None,
223335
local: Some(LocalEmbeddingConfig::default()),
336+
jina: None,
224337
custom_config: HashMap::new(),
225338
cache_enabled: Self::default_cache_enabled(),
226339
cache_ttl_secs: Self::default_cache_ttl_secs(),
@@ -277,13 +390,22 @@ impl EmbeddingPreset {
277390
}
278391
}
279392

393+
pub fn jina_v4() -> Self {
394+
Self {
395+
name: "jina-v4".to_string(),
396+
description: "Jina embeddings-v4 model (multimodal, code-optimized with reranking)".to_string(),
397+
config: EmbeddingModelConfig::for_jina("jina-embeddings-v4", 1024),
398+
}
399+
}
400+
280401
pub fn all_presets() -> Vec<Self> {
281402
vec![
282403
Self::openai_small(),
283404
Self::openai_large(),
284405
Self::openai_ada(),
285406
Self::local_minilm(),
286407
Self::local_mpnet(),
408+
Self::jina_v4(),
287409
]
288410
}
289411

crates/codegraph-vector/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ local-embeddings = [
7575
onnx = ["dep:ort", "dep:tokenizers", "dep:hf-hub", "dep:ndarray"]
7676
onnx-coreml = ["onnx"]
7777
ollama = ["dep:reqwest"]
78+
jina = ["dep:reqwest"]
7879

7980
[[example]]
8081
name = "rag_demo"

0 commit comments

Comments
 (0)