
Commit df88fc5

Optimize llama3.2 webgpu demo
1 parent 8188d75 commit df88fc5

4 files changed: 21 additions & 20 deletions


llama-3.2-webgpu/package-lock.json

Lines changed: 15 additions & 13 deletions
Some generated files are not rendered by default.

llama-3.2-webgpu/package.json

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
     "preview": "vite preview"
   },
   "dependencies": {
-    "@huggingface/transformers": "3.7.1",
+    "@huggingface/transformers": "3.8.0",
     "dompurify": "^3.1.2",
     "marked": "^12.0.2",
     "react": "^18.3.1",

llama-3.2-webgpu/src/App.jsx

Lines changed: 2 additions & 2 deletions
@@ -209,15 +209,15 @@ function App() {
               <br />
               You are about to load{" "}
               <a
-                href="https://huggingface.co/onnx-community/Llama-3.2-1B-Instruct-q4f16"
+                href="https://huggingface.co/onnx-community/Llama-3.2-1B-Instruct-ONNX"
                 target="_blank"
                 rel="noreferrer"
                 className="font-medium underline"
               >
                 Llama-3.2-1B-Instruct
               </a>
               , a 1.24 billion parameter LLM that is optimized for inference on
-              the web. Once downloaded, the model (1.15&nbsp;GB) will be cached
+              the web. Once downloaded, the model (1.01&nbsp;GB) will be cached
               and reused when you revisit the page.
               <br />
               <br />

llama-3.2-webgpu/src/worker.js

Lines changed: 3 additions & 4 deletions
@@ -9,7 +9,7 @@ import {
  * This class uses the Singleton pattern to enable lazy-loading of the pipeline
  */
 class TextGenerationPipeline {
-  static model_id = "onnx-community/Llama-3.2-1B-Instruct-q4f16";
+  static model_id = "onnx-community/Llama-3.2-1B-Instruct-ONNX";
 
   static async getInstance(progress_callback = null) {
     this.tokenizer ??= AutoTokenizer.from_pretrained(this.model_id, {
@@ -69,8 +69,7 @@ async function generate(messages) {
 
   const { past_key_values, sequences } = await model.generate({
     ...inputs,
-    // TODO: Add when model is fixed
-    // past_key_values: past_key_values_cache,
+    past_key_values: past_key_values_cache,
 
     // Sampling
     do_sample: false,
@@ -80,7 +79,7 @@ async function generate(messages) {
     stopping_criteria,
     return_dict_in_generate: true,
   });
-  // past_key_values_cache = past_key_values;
+  past_key_values_cache = past_key_values;
 
   const decoded = tokenizer.batch_decode(sequences, {
     skip_special_tokens: true,
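
Taken together, the worker.js changes re-enable key/value-cache reuse across chat turns: the past_key_values returned by one generate() call are fed back into the next, so each turn only processes its newly appended tokens instead of re-encoding the whole conversation. Below is a minimal standalone sketch of that pattern, assuming the Transformers.js v3 API visible in the diff; the chat() helper and the dtype/device/max_new_tokens options are illustrative assumptions, not taken from this commit.

import { AutoTokenizer, AutoModelForCausalLM } from "@huggingface/transformers";

class TextGenerationPipeline {
  static model_id = "onnx-community/Llama-3.2-1B-Instruct-ONNX";

  static async getInstance(progress_callback = null) {
    // ??= runs from_pretrained only on the first call; later calls
    // reuse the same cached promises (lazy singleton).
    this.tokenizer ??= AutoTokenizer.from_pretrained(this.model_id, {
      progress_callback,
    });
    this.model ??= AutoModelForCausalLM.from_pretrained(this.model_id, {
      dtype: "q4f16", // assumption: quantization matching the old repo suffix
      device: "webgpu", // assumption: the demo's target backend
      progress_callback,
    });
    return Promise.all([this.tokenizer, this.model]);
  }
}

// Module-level cache shared across turns, as in worker.js.
let past_key_values_cache = null;

async function chat(messages) {
  const [tokenizer, model] = await TextGenerationPipeline.getInstance();

  const inputs = tokenizer.apply_chat_template(messages, {
    add_generation_prompt: true,
    return_dict: true,
  });

  const { past_key_values, sequences } = await model.generate({
    ...inputs,
    // Reuse attention key/value states from the previous turn.
    past_key_values: past_key_values_cache,
    do_sample: false,
    max_new_tokens: 512, // assumption: not shown in the diff
    return_dict_in_generate: true,
  });

  // Persist the updated cache for the next call.
  past_key_values_cache = past_key_values;

  return tokenizer.batch_decode(sequences, { skip_special_tokens: true });
}

Note that a cache like this stays valid only while the conversation grows append-only; any path that edits or clears earlier messages also needs to reset past_key_values_cache to null.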
