From d66bdcfcf5266418aa17bccfba002208650d84a0 Mon Sep 17 00:00:00 2001
From: Felix Fu <841808303@qq.com>
Date: Fri, 5 Sep 2025 06:38:49 -0700
Subject: [PATCH] The "cancel" package in MathJax is buggy; avoid using it.
---
_posts/2025-08-18-diff-distill.md | 13 ++++++++++---
1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/_posts/2025-08-18-diff-distill.md b/_posts/2025-08-18-diff-distill.md
index f1ead0a..61a4418 100644
--- a/_posts/2025-08-18-diff-distill.md
+++ b/_posts/2025-08-18-diff-distill.md
@@ -161,15 +161,22 @@ Based on the MeanFlow identity, we can compute the target as follows:
$$
\require{physics}
-\require{cancel}
\begin{align*}
F_{t\to s}^{\text{tgt}}(\mathbf{x}_t, t, s\vert\mathbf{x}_0) &= \dv{\mathbf{x}_t}{t} - (t-s)\dv{F_{t\to s}(\mathbf{x}_t, t, s)}{t} \\
-& = \dv{\mathbf{x}_t}{t} - (t-s)\left(\nabla_{\mathbf{x}_t} F_{t\to s}(\mathbf{x}_t, t, s) \dv{\mathbf{x}_t}{t} + \partial_t F_{t\to s}(\mathbf{x}_t, t, s) + \cancel{\partial_s F_{t\to s}(\mathbf{x}_t, t, s) \dv{s}{t}}\right) \\
+& = \dv{\mathbf{x}_t}{t} - (t-s)\left(\nabla_{\mathbf{x}_t} F_{t\to s}(\mathbf{x}_t, t, s) \dv{\mathbf{x}_t}{t} + \partial_t F_{t\to s}(\mathbf{x}_t, t, s) + \underbrace{\partial_s F_{t\to s}(\mathbf{x}_t, t, s) \dv{s}{t}}_{=0}\right) \\
& = v - (t-s)\left(v \nabla_{\mathbf{x}_t} F_{t\to s}(\mathbf{x}_t, t, s) + \partial_t F_{t\to s}(\mathbf{x}_t, t, s)\right). \\
\end{align*}
$$
-Note that in MeanFlow $$\dv{\mathbf{x}_t}{t} = v(\mathbf{x}_t, t\vert \mathbf{x}_0)$$ and $$\dv{s}{t}=0$$ since $s$ is independent of $t$.
+Note that in MeanFlow
+
+$$\require{physics}\dv{\mathbf{x}_t}{t} = v(\mathbf{x}_t, t\vert \mathbf{x}_0)$$
+
+and
+
+$$\require{physics}\dv{s}{t}=0$$
+
+since $s$ is independent of $t$.
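For concreteness, here is a minimal JAX-style sketch of forming this target with a single Jacobian-vector product. The function name `meanflow_target`, the toy stand-in for `f_theta`, and the stop-gradient on the target are illustrative assumptions, not code from MeanFlow itself.

```python
import jax
import jax.numpy as jnp

def meanflow_target(f_theta, x_t, s, t, v):
    # Tangents (v, 0, 1) encode dx_t/dt = v, ds/dt = 0, dt/dt = 1, so the
    # jvp output equals the total derivative dF_{t->s}/dt
    #   = v . grad_x F_{t->s} + 0 * d_s F_{t->s} + 1 * d_t F_{t->s}.
    F, dFdt = jax.jvp(
        f_theta, (x_t, s, t), (v, jnp.zeros_like(s), jnp.ones_like(t))
    )
    # MeanFlow target: F_tgt = dx_t/dt - (t - s) dF/dt = v - (t - s) dF/dt.
    target = v - (t - s) * dFdt
    # Treating the target as a constant in the regression loss (stop-gradient)
    # is an assumption about the training setup, not stated above.
    return F, jax.lax.stop_gradient(target)

# Toy usage with a hypothetical stand-in for the real network:
f_theta = lambda x, s, t: (t - s) * x
x_t, v = jnp.ones((4, 8)), jnp.ones((4, 8))
s, t = jnp.zeros(()), jnp.ones(())
F, tgt = meanflow_target(f_theta, x_t, s, t, v)
```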
In practice, evaluating $$F_{t\to s}(\mathbf{x}_t, t, s)$$ and computing its total derivative can be done in a single function call: `f, dfdt = jvp(f_theta, (xt, s, t), (v, 0, 1))`. Although the `jvp` operation adds only one extra backward pass, it still introduces instability and slows down training. Moreover, the `jvp` operation is currently incompatible with the latest attention architecture. SplitMeanFlow circumvents this issue by enforcing another consistency identity $$(t-s)F_{t\to s} = (t-r)F_{t\to r}+(r-s)F_{r\to s}$$ where $$s