From 356206451aacb9a41783094a35010f2930072f9e Mon Sep 17 00:00:00 2001
From: jammychiou1 <jammy.chiou1@gmail.com>
Date: Fri, 21 Nov 2025 18:31:39 +0800
Subject: [PATCH 1/2] Fix some minor details in comments for AVX2 decompose

- The floor() in floor((f + 127) >> 7) was somewhat unecessary as
  the usual semantic for the right-shift operator (>>) has integer
  output anyway. Seeing as the right-shift operator is not used in other
  explanation comments, we decided to rewrite it as division by 2^7 for
  better consistency.
- The bound of f1'' is correct but the proof was misleading. The new
  proof should be clearer.

Signed-off-by: jammychiou1 <jammy.chiou1@gmail.com>
---
 dev/x86_64/src/poly_decompose_32_avx2.c              | 6 ++++--
 dev/x86_64/src/poly_decompose_88_avx2.c              | 6 ++++--
 mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c | 6 ++++--
 mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c | 6 ++++--
 4 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/dev/x86_64/src/poly_decompose_32_avx2.c b/dev/x86_64/src/poly_decompose_32_avx2.c
index b4266ce95..eeb93cb1f 100644
--- a/dev/x86_64/src/poly_decompose_32_avx2.c
+++ b/dev/x86_64/src/poly_decompose_32_avx2.c
@@ -61,7 +61,7 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      * range: 0 <= f <= Q-1 = 32*GAMMA2 = 16*128*B
      */
 
-    /* Compute f1' = ceil(f / 128) as floor((f + 127) >> 7) */
+    /* Compute f1' = ceil(f / 128) as floor((f + 127) / 2^7) */
     f1 = _mm256_add_epi32(f, off);
     f1 = _mm256_srli_epi32(f1, 7);
     /*
@@ -87,7 +87,9 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      */
     f1 = _mm256_mulhi_epu16(f1, v);
     /*
-     * range: 0 <= f1'' < floor(2^16 * 1025 / 2^16) = 1025
+     * range: 0 <= f1''  = floor(f1' * 1025 / 2^16)
+     *                  <= f1' * 1025 / 2^16
+     *                   < 2^16 * 1025 / 2^16 = 1025
      *
      * Because 0 <= f1'' < 2^15, the multiplication in mulhrs is unsigned, that
      * is, no erroneous sign-extension occurs.
diff --git a/dev/x86_64/src/poly_decompose_88_avx2.c b/dev/x86_64/src/poly_decompose_88_avx2.c
index 4461b06af..d34595c20 100644
--- a/dev/x86_64/src/poly_decompose_88_avx2.c
+++ b/dev/x86_64/src/poly_decompose_88_avx2.c
@@ -62,7 +62,7 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      * range: 0 <= f <= Q-1 = 88*GAMMA2 = 44*128*B
      */
 
-    /* Compute f1' = ceil(f / 128) as floor((f + 127) >> 7) */
+    /* Compute f1' = ceil(f / 128) as floor((f + 127) / 2^7) */
     f1 = _mm256_add_epi32(f, off);
     f1 = _mm256_srli_epi32(f1, 7);
     /*
@@ -88,7 +88,9 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      */
     f1 = _mm256_mulhi_epu16(f1, v);
     /*
-     * range: 0 <= f1'' < floor(2^16 * 11275 / 2^16) = 11275
+     * range: 0 <= f1''  = floor(f1' * 11275 / 2^16)
+     *                  <= f1' * 11275 / 2^16
+     *                   < 2^16 * 11275 / 2^16 = 11275
      *
      * Because 0 <= f1'' < 2^15, the multiplication in mulhrs is unsigned, that
      * is, no erroneous sign-extension occurs.
diff --git a/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c b/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c
index b4266ce95..eeb93cb1f 100644
--- a/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c
+++ b/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c
@@ -61,7 +61,7 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      * range: 0 <= f <= Q-1 = 32*GAMMA2 = 16*128*B
      */
 
-    /* Compute f1' = ceil(f / 128) as floor((f + 127) >> 7) */
+    /* Compute f1' = ceil(f / 128) as floor((f + 127) / 2^7) */
     f1 = _mm256_add_epi32(f, off);
     f1 = _mm256_srli_epi32(f1, 7);
     /*
@@ -87,7 +87,9 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      */
     f1 = _mm256_mulhi_epu16(f1, v);
     /*
-     * range: 0 <= f1'' < floor(2^16 * 1025 / 2^16) = 1025
+     * range: 0 <= f1''  = floor(f1' * 1025 / 2^16)
+     *                  <= f1' * 1025 / 2^16
+     *                   < 2^16 * 1025 / 2^16 = 1025
      *
      * Because 0 <= f1'' < 2^15, the multiplication in mulhrs is unsigned, that
      * is, no erroneous sign-extension occurs.
diff --git a/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c b/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c
index 4461b06af..d34595c20 100644
--- a/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c
+++ b/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c
@@ -62,7 +62,7 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      * range: 0 <= f <= Q-1 = 88*GAMMA2 = 44*128*B
      */
 
-    /* Compute f1' = ceil(f / 128) as floor((f + 127) >> 7) */
+    /* Compute f1' = ceil(f / 128) as floor((f + 127) / 2^7) */
     f1 = _mm256_add_epi32(f, off);
     f1 = _mm256_srli_epi32(f1, 7);
     /*
@@ -88,7 +88,9 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      */
     f1 = _mm256_mulhi_epu16(f1, v);
     /*
-     * range: 0 <= f1'' < floor(2^16 * 11275 / 2^16) = 11275
+     * range: 0 <= f1''  = floor(f1' * 11275 / 2^16)
+     *                  <= f1' * 11275 / 2^16
+     *                   < 2^16 * 11275 / 2^16 = 11275
      *
      * Because 0 <= f1'' < 2^15, the multiplication in mulhrs is unsigned, that
      * is, no erroneous sign-extension occurs.

From 326d7312784dbb00300da4285ead0b2d7f1843f1 Mon Sep 17 00:00:00 2001
From: jammychiou1 <jammy.chiou1@gmail.com>
Date: Mon, 1 Dec 2025 12:01:24 +0800
Subject: [PATCH 2/2] [DRAFT] Better explanation for round() vs round-() in
 decompose

Only the AVX2 one is updated currently. The NEON one will also be
updated if this looks good.

Signed-off-by: jammychiou1 <jammy.chiou1@gmail.com>
---
 dev/x86_64/src/poly_decompose_32_avx2.c              | 9 ++++++++-
 dev/x86_64/src/poly_decompose_88_avx2.c              | 9 ++++++++-
 mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c | 9 ++++++++-
 mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c | 9 ++++++++-
 4 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/dev/x86_64/src/poly_decompose_32_avx2.c b/dev/x86_64/src/poly_decompose_32_avx2.c
index eeb93cb1f..a28a9157e 100644
--- a/dev/x86_64/src/poly_decompose_32_avx2.c
+++ b/dev/x86_64/src/poly_decompose_32_avx2.c
@@ -72,10 +72,17 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      * _mm256_mulhi_epu16() below.
      */
 
+    /* check-magic: 2046 == intdiv(4092, 2) */
     /*
      * Compute f1 = round-(f1' / B) ≈ round(f1' * 1025 / 2^22). This is exact
      * for 0 <= f1' < 2^16. Note that half is rounded down since 1025 / 2^22 ≲
-     * 1 / 4092.
+     * 1 / 4092, so (for example) f1' = B / 2 = 2046 is mapped to
+     *
+     *   round(2046 * 1025 / 2^22) = round(2046 * (1 / 4092 - epsilon))
+     *                             = round(1 / 2 - epsilon') = 0,
+     *
+     * where epsilon = 1 / 4092 - 1025 / 2^22 and epsilon' = 2046 * eps are both
+     * tiny but positive numbers.
      *
      * round(f1' * 1025 / 2^22) is in turn computed in 2 steps as
      * round(floor(f1' * 1025 / 2^16) / 2^6). The mulhi computes f1'' =
diff --git a/dev/x86_64/src/poly_decompose_88_avx2.c b/dev/x86_64/src/poly_decompose_88_avx2.c
index d34595c20..60a54829c 100644
--- a/dev/x86_64/src/poly_decompose_88_avx2.c
+++ b/dev/x86_64/src/poly_decompose_88_avx2.c
@@ -73,10 +73,17 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      * _mm256_mulhi_epu16() below.
      */
 
+    /* check-magic: 744 == intdiv(1488, 2) */
     /*
      * Compute f1 = round-(f1' / B) ≈ round(f1' * 11275 / 2^24). This is exact
      * for 0 <= f1' < 2^16. Note that half is rounded down since 11275 / 2^24 ≲
-     * 1 / 1488.
+     * 1 / 1488, so (for example) f1' = B / 2 = 744 is mapped to
+     *
+     *   round(744 * 11275 / 2^24) = round(744 * (1 / 1488 - epsilon))
+     *                             = round(1 / 2 - epsilon') = 0,
+     *
+     * where epsilon = 1 / 1488 - 11275 / 2^24 and epsilon' = 744 * eps are both
+     * tiny but positive numbers.
      *
      * round(f1' * 11275 / 2^24) is in turn computed in 2 steps as
      * round(floor(f1' * 11275 / 2^16) / 2^8). The mulhi computes f1'' =
diff --git a/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c b/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c
index eeb93cb1f..a28a9157e 100644
--- a/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c
+++ b/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c
@@ -72,10 +72,17 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      * _mm256_mulhi_epu16() below.
      */
 
+    /* check-magic: 2046 == intdiv(4092, 2) */
     /*
      * Compute f1 = round-(f1' / B) ≈ round(f1' * 1025 / 2^22). This is exact
      * for 0 <= f1' < 2^16. Note that half is rounded down since 1025 / 2^22 ≲
-     * 1 / 4092.
+     * 1 / 4092, so (for example) f1' = B / 2 = 2046 is mapped to
+     *
+     *   round(2046 * 1025 / 2^22) = round(2046 * (1 / 4092 - epsilon))
+     *                             = round(1 / 2 - epsilon') = 0,
+     *
+     * where epsilon = 1 / 4092 - 1025 / 2^22 and epsilon' = 2046 * eps are both
+     * tiny but positive numbers.
      *
      * round(f1' * 1025 / 2^22) is in turn computed in 2 steps as
      * round(floor(f1' * 1025 / 2^16) / 2^6). The mulhi computes f1'' =
diff --git a/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c b/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c
index d34595c20..60a54829c 100644
--- a/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c
+++ b/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c
@@ -73,10 +73,17 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      * _mm256_mulhi_epu16() below.
      */
 
+    /* check-magic: 744 == intdiv(1488, 2) */
     /*
      * Compute f1 = round-(f1' / B) ≈ round(f1' * 11275 / 2^24). This is exact
      * for 0 <= f1' < 2^16. Note that half is rounded down since 11275 / 2^24 ≲
-     * 1 / 1488.
+     * 1 / 1488, so (for example) f1' = B / 2 = 744 is mapped to
+     *
+     *   round(744 * 11275 / 2^24) = round(744 * (1 / 1488 - epsilon))
+     *                             = round(1 / 2 - epsilon') = 0,
+     *
+     * where epsilon = 1 / 1488 - 11275 / 2^24 and epsilon' = 744 * eps are both
+     * tiny but positive numbers.
      *
      * round(f1' * 11275 / 2^24) is in turn computed in 2 steps as
      * round(floor(f1' * 11275 / 2^16) / 2^8). The mulhi computes f1'' =