Fix some minor details in comments for AVX2 decompose

jammychiou1 · jammychiou1 · commit 993120374f0d · 2025-12-01T11:32:57.000+08:00
- The floor() in floor((f + 127) >> 7) was somewhat unnecessary, as
  the usual semantics of the right-shift operator (>>) yield an integer
  output anyway. Since the right-shift operator is not used in the other
  explanatory comments, we decided to rewrite it as division by 2^7 for
  better consistency.
- The bound on f1'' was correct, but its proof was misleading. The new
  proof spells out each inequality step and should be clearer.

Signed-off-by: jammychiou1 <jammy.chiou1@gmail.com>
diff --git a/dev/x86_64/src/poly_decompose_32_avx2.c b/dev/x86_64/src/poly_decompose_32_avx2.c
@@ -61,7 +61,7 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      * range: 0 <= f <= Q-1 = 32*GAMMA2 = 16*128*B
      */
 
-    /* Compute f1' = ceil(f / 128) as floor((f + 127) >> 7) */
+    /* Compute f1' = ceil(f / 128) as floor((f + 127) / 2^7) */
     f1 = _mm256_add_epi32(f, off);
     f1 = _mm256_srli_epi32(f1, 7);
     /*
@@ -87,7 +87,9 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      */
     f1 = _mm256_mulhi_epu16(f1, v);
     /*
-     * range: 0 <= f1'' < floor(2^16 * 1025 / 2^16) = 1025
+     * range: 0 <= f1''  = floor(f1' * 1025 / 2^16)
+     *                  <= f1' * 1025 / 2^16
+     *                   < 2^16 * 1025 / 2^16 = 1025
      *
      * Because 0 <= f1'' < 2^15, the multiplication in mulhrs is unsigned, that
      * is, no erroneous sign-extension occurs.
diff --git a/dev/x86_64/src/poly_decompose_88_avx2.c b/dev/x86_64/src/poly_decompose_88_avx2.c
@@ -62,7 +62,7 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      * range: 0 <= f <= Q-1 = 88*GAMMA2 = 44*128*B
      */
 
-    /* Compute f1' = ceil(f / 128) as floor((f + 127) >> 7) */
+    /* Compute f1' = ceil(f / 128) as floor((f + 127) / 2^7) */
     f1 = _mm256_add_epi32(f, off);
     f1 = _mm256_srli_epi32(f1, 7);
     /*
@@ -88,7 +88,9 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      */
     f1 = _mm256_mulhi_epu16(f1, v);
     /*
-     * range: 0 <= f1'' < floor(2^16 * 11275 / 2^16) = 11275
+     * range: 0 <= f1''  = floor(f1' * 11275 / 2^16)
+     *                  <= f1' * 11275 / 2^16
+     *                   < 2^16 * 11275 / 2^16 = 11275
      *
      * Because 0 <= f1'' < 2^15, the multiplication in mulhrs is unsigned, that
      * is, no erroneous sign-extension occurs.
diff --git a/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c b/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c
@@ -61,7 +61,7 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      * range: 0 <= f <= Q-1 = 32*GAMMA2 = 16*128*B
      */
 
-    /* Compute f1' = ceil(f / 128) as floor((f + 127) >> 7) */
+    /* Compute f1' = ceil(f / 128) as floor((f + 127) / 2^7) */
     f1 = _mm256_add_epi32(f, off);
     f1 = _mm256_srli_epi32(f1, 7);
     /*
@@ -87,7 +87,9 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      */
     f1 = _mm256_mulhi_epu16(f1, v);
     /*
-     * range: 0 <= f1'' < floor(2^16 * 1025 / 2^16) = 1025
+     * range: 0 <= f1''  = floor(f1' * 1025 / 2^16)
+     *                  <= f1' * 1025 / 2^16
+     *                   < 2^16 * 1025 / 2^16 = 1025
      *
      * Because 0 <= f1'' < 2^15, the multiplication in mulhrs is unsigned, that
      * is, no erroneous sign-extension occurs.
diff --git a/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c b/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c
@@ -62,7 +62,7 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      * range: 0 <= f <= Q-1 = 88*GAMMA2 = 44*128*B
      */
 
-    /* Compute f1' = ceil(f / 128) as floor((f + 127) >> 7) */
+    /* Compute f1' = ceil(f / 128) as floor((f + 127) / 2^7) */
     f1 = _mm256_add_epi32(f, off);
     f1 = _mm256_srli_epi32(f1, 7);
     /*
@@ -88,7 +88,9 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      */
     f1 = _mm256_mulhi_epu16(f1, v);
     /*
-     * range: 0 <= f1'' < floor(2^16 * 11275 / 2^16) = 11275
+     * range: 0 <= f1''  = floor(f1' * 11275 / 2^16)
+     *                  <= f1' * 11275 / 2^16
+     *                   < 2^16 * 11275 / 2^16 = 11275
      *
      * Because 0 <= f1'' < 2^15, the multiplication in mulhrs is unsigned, that
      * is, no erroneous sign-extension occurs.