Update FillMaskPipeline types and default

xenova · xenova · commit e60e9da17e53 · 2025-12-07T15:27:51.000-05:00
diff --git a/src/pipelines.js b/src/pipelines.js
@@ -109,9 +109,8 @@ const SUPPORTED_TASKS = Object.freeze({
         pipeline: FillMaskPipeline,
         model: AutoModelForMaskedLM,
         default: {
-            // TODO: replace with original
-            // "model": "bert-base-uncased",
-            model: 'Xenova/bert-base-uncased',
+            model: 'onnx-community/ettin-encoder-32m-ONNX',
+            dtype: 'fp32',
         },
         type: 'text',
     },
@@ -445,9 +444,9 @@ export async function pipeline(
     if (!model) {
         model = pipelineInfo.default.model;
         console.log(`No model specified. Using default model: "${model}".`);
-    }
-    if (!dtype && pipelineInfo.default.dtype) {
-        dtype = pipelineInfo.default.dtype;
+        if (!dtype && pipelineInfo.default.dtype) {
+            dtype = pipelineInfo.default.dtype;
+        }
     }
 
     const pretrainedOptions = {
diff --git a/src/pipelines/fill-mask.js b/src/pipelines/fill-mask.js
@@ -19,53 +19,70 @@ import { softmax } from '../utils/maths.js';
  * @typedef {Object} FillMaskPipelineOptions Parameters specific to fill mask pipelines.
  * @property {number} [top_k=5] When passed, overrides the number of predictions to return.
  *
- * @callback FillMaskPipelineCallback Fill the masked token in the text(s) given as inputs.
- * @param {string|string[]} texts One or several texts (or one list of prompts) with masked tokens.
+ * @callback FillMaskPipelineCallbackSingle Fill the masked token in the text given as input.
+ * @param {string} texts The text with masked tokens.
  * @param {FillMaskPipelineOptions} [options] The options to use for masked language modelling.
- * @returns {Promise<FillMaskOutput|FillMaskOutput[]>} An array of objects containing the score, predicted token, predicted token string,
- * and the sequence with the predicted token filled in, or an array of such arrays (one for each input text).
- * If only one input text is given, the output will be an array of objects.
- * @throws {Error} When the mask token is not found in the input text.
+ * @returns {Promise<FillMaskOutput>} An array of objects containing the score, predicted token, predicted token string,
+ * and the sequence with the predicted token filled in.
+ *
+ * @callback FillMaskPipelineCallbackBatch Fill the masked token in the texts given as inputs.
+ * @param {string[]} texts A list of texts with masked tokens.
+ * @param {FillMaskPipelineOptions} [options] The options to use for masked language modelling.
+ * @returns {Promise<FillMaskOutput[]>} An array where each entry corresponds to the predictions for an input text.
+ *
+ * @typedef {FillMaskPipelineCallbackSingle & FillMaskPipelineCallbackBatch} FillMaskPipelineCallback
  *
  * @typedef {TextPipelineConstructorArgs & FillMaskPipelineCallback & Disposable} FillMaskPipelineType
  */
 
 /**
  * Masked language modeling prediction pipeline using any `ModelWithLMHead`.
  *
+ * **Example:** Perform masked language modelling (a.k.a. "fill-mask") with `onnx-community/ettin-encoder-32m-ONNX`.
+ * ```javascript
+ * import { pipeline } from '@huggingface/transformers';
+ *
+ * const unmasker = await pipeline('fill-mask', 'onnx-community/ettin-encoder-32m-ONNX');
+ * const output = await unmasker('The capital of France is [MASK].');
+ * // [
+ * //   { score: 0.5151872038841248, token: 7785, token_str: ' Paris', sequence: 'The capital of France is Paris.' },
+ * //   { score: 0.033725105226039886, token: 42268, token_str: ' Lyon', sequence: 'The capital of France is Lyon.' },
+ * //   { score: 0.031234024092555046, token: 23397, token_str: ' Nancy', sequence: 'The capital of France is Nancy.' },
+ * //   { score: 0.02075139433145523, token: 30167, token_str: ' Brussels', sequence: 'The capital of France is Brussels.' },
+ * //   { score: 0.018962178379297256, token: 31955, token_str: ' Geneva', sequence: 'The capital of France is Geneva.' }
+ * // ]
+ * ```
+ *
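- * **Example:** Perform masked language modelling (a.k.a. "fill-mask") with `Xenova/bert-base-uncased`.
+ * **Example:** Perform masked language modelling (a.k.a. "fill-mask") with `Xenova/bert-base-cased`.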
  * ```javascript
+ * import { pipeline } from '@huggingface/transformers';
+ *
  * const unmasker = await pipeline('fill-mask', 'Xenova/bert-base-cased');
  * const output = await unmasker('The goal of life is [MASK].');
  * // [
- * //   { token_str: 'survival', score: 0.06137419492006302, token: 8115, sequence: 'The goal of life is survival.' },
- * //   { token_str: 'love', score: 0.03902450203895569, token: 1567, sequence: 'The goal of life is love.' },
- * //   { token_str: 'happiness', score: 0.03253183513879776, token: 9266, sequence: 'The goal of life is happiness.' },
- * //   { token_str: 'freedom', score: 0.018736306577920914, token: 4438, sequence: 'The goal of life is freedom.' },
- * //   { token_str: 'life', score: 0.01859794743359089, token: 1297, sequence: 'The goal of life is life.' }
+ * //   { score: 0.11368396878242493, sequence: "The goal of life is survival.", token: 8115, token_str: "survival" },
+ * //   { score: 0.053510840982198715, sequence: "The goal of life is love.", token: 1567, token_str: "love" },
+ * //   { score: 0.05041185021400452, sequence: "The goal of life is happiness.", token: 9266, token_str: "happiness" },
+ * //   { score: 0.033218126744031906, sequence: "The goal of life is freedom.", token: 4438, token_str: "freedom" },
+ * //   { score: 0.03301157429814339, sequence: "The goal of life is success.", token: 2244, token_str: "success" },
  * // ]
  * ```
  *
  * **Example:** Perform masked language modelling (a.k.a. "fill-mask") with `Xenova/bert-base-cased` (and return top result).
  * ```javascript
+ * import { pipeline } from '@huggingface/transformers';
+ *
  * const unmasker = await pipeline('fill-mask', 'Xenova/bert-base-cased');
  * const output = await unmasker('The Milky Way is a [MASK] galaxy.', { top_k: 1 });
- * // [{ token_str: 'spiral', score: 0.6299987435340881, token: 14061, sequence: 'The Milky Way is a spiral galaxy.' }]
+ * // [{ score: 0.5982972383499146, sequence: "The Milky Way is a spiral galaxy.", token: 14061, token_str: "spiral" }]
  * ```
  */
 export class FillMaskPipeline
     extends /** @type {new (options: TextPipelineConstructorArgs) => FillMaskPipelineType} */ (Pipeline)
 {
-    /**
-     * Create a new FillMaskPipeline.
-     * @param {TextPipelineConstructorArgs} options An object used to instantiate the pipeline.
-     */
-    constructor(options) {
-        super(options);
-    }
-
-    /** @type {FillMaskPipelineCallback} */
     async _call(texts, { top_k = 5 } = {}) {
+        const { mask_token_id, mask_token } = this.tokenizer;
+
         // Run tokenization
         const model_inputs = this.tokenizer(texts, {
             padding: true,
@@ -84,11 +101,11 @@ export class FillMaskPipeline
             const mask_token_index = ids.findIndex(
                 (x) =>
                     // We use == to match bigint with number
-                    // @ts-ignore
-                    x == this.tokenizer.mask_token_id,
+                    // @ts-expect-error TS2367
+                    x == mask_token_id,
             );
             if (mask_token_index === -1) {
-                throw Error(`Mask token (${this.tokenizer.mask_token}) not found in text.`);
+                throw Error(`Mask token (${mask_token}) not found in text.`);
             }
             const itemLogits = logits[i][mask_token_index];