@@ -487,21 +487,23 @@ impl Qwen3Model {
                 let seq_length = end - start;
                 input_lengths.push(seq_length);
 
-                for j in start..end {
-                    input_ids.push(batch.input_ids[j]);
-                    position_ids.push(batch.position_ids[j]);
-                    attention_bias.push(0.0);
-                }
-
+                // Left padding for Qwen3-Embedding (pad at the beginning)
                 let padding = max_length - seq_length;
                 if padding > 0 {
                     masking = true;
                     for _ in 0..padding {
-                        input_ids.insert(start, self.pad_token_id);
-                        position_ids.insert(start, 0);
-                        attention_bias.insert(start, f32::MIN);
+                        input_ids.push(self.pad_token_id);
+                        position_ids.push(0);
+                        attention_bias.push(f32::MIN);
                     }
                 }
+
+                // Then add the actual sequence
+                for j in start..end {
+                    input_ids.push(batch.input_ids[j]);
+                    position_ids.push(batch.position_ids[j]);
+                    attention_bias.push(0.0);
+                }
             }
 
             let input_ids = Tensor::from_vec(input_ids, shape, &self.device)?;
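
[Note, not part of the diff] The hunk above switches from inserting pad tokens after each sequence to pushing them before it, so every padded row ends with its real tokens. A minimal sketch of that left-padding layout over plain vectors, assuming a made-up PAD id and toy sequences:

// A candle-free sketch of the left-padding layout; PAD, the token ids, and the
// sequences are illustrative only.
fn main() {
    const PAD: u32 = 0;
    let sequences: Vec<Vec<u32>> = vec![vec![11, 12, 13], vec![21, 22]];
    let max_length = sequences.iter().map(|s| s.len()).max().unwrap();

    let mut input_ids = Vec::new();
    let mut attention_bias = Vec::new();
    for seq in &sequences {
        let padding = max_length - seq.len();
        // Pad slots come first and are masked out with f32::MIN.
        for _ in 0..padding {
            input_ids.push(PAD);
            attention_bias.push(f32::MIN);
        }
        // Real tokens follow, so the last real token lands at max_length - 1.
        for &id in seq {
            input_ids.push(id);
            attention_bias.push(0.0);
        }
    }

    assert_eq!(input_ids, vec![11, 12, 13, PAD, 21, 22]);
    assert_eq!(attention_bias.len(), sequences.len() * max_length);
}

The payoff is that the last real token of every row sits at a fixed index, max_length - 1, which the last-token pooling hunk further down relies on.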
@@ -533,7 +535,15 @@ impl Qwen3Model {
             )?;
             let input_lengths = vec![batch.input_ids.len()];
 
-            (input_ids, position_ids, input_lengths, None)
+            let seq_len = batch.input_ids.len();
+            // Create attention bias for causal masking even for single sequences
+            let attention_bias = Tensor::zeros(
+                (1, self.num_attention_heads, seq_len, seq_len),
+                candle::DType::F32,
+                &self.device,
+            )?;
+
+            (input_ids, position_ids, input_lengths, Some(attention_bias))
         };
 
         let attention_bias = if let Some(attn_bias) = attention_bias {
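
[Note, not part of the diff] The single-sequence branch now returns a zero-filled bias of shape (1, num_attention_heads, seq_len, seq_len) instead of None, so the code below can add the causal mask on both paths. A candle-free sketch of the lower-triangular pattern that causal masking produces; the causal_bias helper and the nested-Vec representation are illustrative only:

// Illustrative only: a lower-triangular causal bias built over plain Vecs.
// The real code applies an equivalent mask to the candle tensor created above.
fn causal_bias(seq_len: usize) -> Vec<Vec<f32>> {
    (0..seq_len)
        .map(|q| {
            (0..seq_len)
                .map(|k| if k <= q { 0.0 } else { f32::MIN })
                .collect()
        })
        .collect()
}

fn main() {
    let bias = causal_bias(3);
    // Query position q may only attend to key positions k <= q.
    assert_eq!(bias[0], vec![0.0, f32::MIN, f32::MIN]);
    assert_eq!(bias[2], vec![0.0, 0.0, 0.0]);
}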
@@ -597,14 +607,16 @@ impl Qwen3Model {
                     .iter()
                     .map(|&i| {
                         let i = i as usize;
+                        // With left padding, the last token is always at max_length - 1
                         let last_token_idx = max_length - 1;
                         outputs.i((i, last_token_idx))?.unsqueeze(0)
                     })
                     .collect();
 
                 Some(Tensor::cat(&results?, 0)?)
             } else {
-                let last_idx = input_lengths[0] - 1;
+                // For single inference, use the actual last token position from cumulative_seq_lengths
+                let last_idx = batch.cumulative_seq_lengths[1] as usize - 1;
                 Some(outputs.i((0, last_idx))?.unsqueeze(0)?)
             }
         }
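
[Note, not part of the diff] A toy sketch of last-token pooling under the new layout, with made-up hidden values: the batched path can read a fixed index because of left padding, while the unpadded single-sequence path takes its index from cumulative_seq_lengths.

// hidden stands in for the model outputs ([batch, max_length], one scalar per
// token here); the values and cumulative_seq_lengths are made up.
fn main() {
    let max_length: usize = 4;
    let hidden: Vec<Vec<f32>> = vec![
        vec![0.0, 1.0, 2.0, 3.0], // one pad slot at the front, seq_length = 3
        vec![0.0, 0.0, 5.0, 6.0], // two pad slots at the front, seq_length = 2
    ];

    // Batched path: after left padding, every row's last real token is at max_length - 1.
    let pooled: Vec<f32> = hidden.iter().map(|row| row[max_length - 1]).collect();
    assert_eq!(pooled, vec![3.0, 6.0]);

    // Single-sequence path: nothing is padded, so the index comes from the
    // cumulative sequence lengths instead of max_length.
    let cumulative_seq_lengths: [u32; 2] = [0, 3];
    let last_idx = cumulative_seq_lengths[1] as usize - 1;
    assert_eq!(last_idx, 2);
}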
@@ -617,7 +629,9 @@ impl Qwen3Model {
                         let i = i as usize;
                         let length = input_lengths[i];
 
-                        let embeddings = outputs.i((i, ..length))?;
+                        // With left padding, actual tokens are at the end
+                        let padding = max_length - length;
+                        let embeddings = outputs.i((i, padding..))?;
                         let sum = embeddings.sum_keepdim(0)?;
                         sum / (length as f64)
                     })
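
[Note, not part of the diff] Mean pooling under left padding sums only the trailing length positions of each row and divides by length, as in the hunk above. A toy sketch with made-up values:

// Candle-free sketch: only the last `length` positions of each row are real
// tokens, so the slice starts at max_length - length. Values are illustrative.
fn main() {
    let max_length: usize = 4;
    let rows: Vec<(Vec<f32>, usize)> = vec![
        (vec![0.0, 9.0, 1.0, 5.0], 3), // one pad slot, real tokens 9, 1, 5
        (vec![0.0, 0.0, 4.0, 8.0], 2), // two pad slots, real tokens 4, 8
    ];

    let means: Vec<f32> = rows
        .iter()
        .map(|(row, length)| {
            let padding = max_length - length;
            row[padding..].iter().sum::<f32>() / *length as f32
        })
        .collect();

    assert_eq!(means, vec![5.0, 6.0]);
}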