
Commit df88fc5

Optimize llama3.2 webgpu demo
1 parent 8188d75 commit df88fc5

4 files changed: 21 additions & 20 deletions


llama-3.2-webgpu/package-lock.json

Lines changed: 15 additions & 13 deletions
Some generated files are not rendered by default.

llama-3.2-webgpu/package.json

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
     "preview": "vite preview"
   },
   "dependencies": {
-    "@huggingface/transformers": "3.7.1",
+    "@huggingface/transformers": "3.8.0",
     "dompurify": "^3.1.2",
     "marked": "^12.0.2",
     "react": "^18.3.1",

llama-3.2-webgpu/src/App.jsx

Lines changed: 2 additions & 2 deletions
@@ -209,15 +209,15 @@ function App() {
               <br />
               You are about to load{" "}
               <a
-                href="https://huggingface.co/onnx-community/Llama-3.2-1B-Instruct-q4f16"
+                href="https://huggingface.co/onnx-community/Llama-3.2-1B-Instruct-ONNX"
                 target="_blank"
                 rel="noreferrer"
                 className="font-medium underline"
               >
                 Llama-3.2-1B-Instruct
               </a>
               , a 1.24 billion parameter LLM that is optimized for inference on
-              the web. Once downloaded, the model (1.15&nbsp;GB) will be cached
+              the web. Once downloaded, the model (1.01&nbsp;GB) will be cached
               and reused when you revisit the page.
               <br />
               <br />

llama-3.2-webgpu/src/worker.js

Lines changed: 3 additions & 4 deletions
@@ -9,7 +9,7 @@ import {
  * This class uses the Singleton pattern to enable lazy-loading of the pipeline
  */
 class TextGenerationPipeline {
-  static model_id = "onnx-community/Llama-3.2-1B-Instruct-q4f16";
+  static model_id = "onnx-community/Llama-3.2-1B-Instruct-ONNX";
 
   static async getInstance(progress_callback = null) {
     this.tokenizer ??= AutoTokenizer.from_pretrained(this.model_id, {
@@ -69,8 +69,7 @@ async function generate(messages) {
 
   const { past_key_values, sequences } = await model.generate({
     ...inputs,
-    // TODO: Add when model is fixed
-    // past_key_values: past_key_values_cache,
+    past_key_values: past_key_values_cache,
 
     // Sampling
     do_sample: false,
@@ -80,7 +79,7 @@ async function generate(messages) {
     stopping_criteria,
     return_dict_in_generate: true,
   });
-  // past_key_values_cache = past_key_values;
+  past_key_values_cache = past_key_values;
 
   const decoded = tokenizer.batch_decode(sequences, {
     skip_special_tokens: true,
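
Taken together, the worker.js changes re-enable key/value-cache reuse across chat turns: the past_key_values returned by one generate() call are fed back into the next, so each turn only processes its newly appended tokens instead of re-encoding the whole conversation. Below is a minimal standalone sketch of that pattern, assuming the Transformers.js v3 API visible in the diff; the chat() helper and the dtype/device/max_new_tokens options are illustrative assumptions, not taken from this commit.

import { AutoTokenizer, AutoModelForCausalLM } from "@huggingface/transformers";

class TextGenerationPipeline {
  static model_id = "onnx-community/Llama-3.2-1B-Instruct-ONNX";

  static async getInstance(progress_callback = null) {
    // ??= runs from_pretrained only on the first call; later calls
    // reuse the same cached promises (lazy singleton).
    this.tokenizer ??= AutoTokenizer.from_pretrained(this.model_id, {
      progress_callback,
    });
    this.model ??= AutoModelForCausalLM.from_pretrained(this.model_id, {
      dtype: "q4f16", // assumption: quantization matching the old repo suffix
      device: "webgpu", // assumption: the demo's target backend
      progress_callback,
    });
    return Promise.all([this.tokenizer, this.model]);
  }
}

// Module-level cache shared across turns, as in worker.js.
let past_key_values_cache = null;

async function chat(messages) {
  const [tokenizer, model] = await TextGenerationPipeline.getInstance();

  const inputs = tokenizer.apply_chat_template(messages, {
    add_generation_prompt: true,
    return_dict: true,
  });

  const { past_key_values, sequences } = await model.generate({
    ...inputs,
    // Reuse attention key/value states from the previous turn.
    past_key_values: past_key_values_cache,
    do_sample: false,
    max_new_tokens: 512, // assumption: not shown in the diff
    return_dict_in_generate: true,
  });

  // Persist the updated cache for the next call.
  past_key_values_cache = past_key_values;

  return tokenizer.batch_decode(sequences, { skip_special_tokens: true });
}

Note that a cache like this stays valid only while the conversation grows append-only; any path that edits or clears earlier messages also needs to reset past_key_values_cache to null.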
