pq-code-package · jammychiou1 · Nov 21, 2025 · Dec 1, 2025 · jammychiou1 · Dec 1, 2025
@@ -61,7 +61,7 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      * range: 0 <= f <= Q-1 = 32*GAMMA2 = 16*128*B
      */
 
-    /* Compute f1' = ceil(f / 128) as floor((f + 127) >> 7) */
+    /* Compute f1' = ceil(f / 128) as floor((f + 127) / 2^7) */
     f1 = _mm256_add_epi32(f, off);
     f1 = _mm256_srli_epi32(f1, 7);
     /*
@@ -72,10 +72,17 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      * _mm256_mulhi_epu16() below.
      */
 
+    /* check-magic: 2046 == intdiv(4092, 2) */
     /*
      * Compute f1 = round-(f1' / B) ≈ round(f1' * 1025 / 2^22). This is exact
      * for 0 <= f1' < 2^16. Note that half is rounded down since 1025 / 2^22 ≲
-     * 1 / 4092.
+     * 1 / 4092, so (for example) f1' = B / 2 = 2046 is mapped to
+     *
+     *   round(2046 * 1025 / 2^22) = round(2046 * (1 / 4092 - epsilon))
+     *                             = round(1 / 2 - epsilon') = 0,
+     *
+     * where epsilon = 1 / 4092 - 1025 / 2^22 and epsilon' = 2046 * eps are both
+     * tiny but positive numbers.
      *
      * round(f1' * 1025 / 2^22) is in turn computed in 2 steps as
      * round(floor(f1' * 1025 / 2^16) / 2^6). The mulhi computes f1'' =
@@ -87,7 +94,9 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      */
     f1 = _mm256_mulhi_epu16(f1, v);
     /*
-     * range: 0 <= f1'' < floor(2^16 * 1025 / 2^16) = 1025
+     * range: 0 <= f1''  = floor(f1' * 1025 / 2^16)
+     *                  <= f1' * 1025 / 2^16
+     *                   < 2^16 * 1025 / 2^16 = 1025
      *
      * Because 0 <= f1'' < 2^15, the multiplication in mulhrs is unsigned, that
      * is, no erroneous sign-extension occurs.

@@ -62,7 +62,7 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      * range: 0 <= f <= Q-1 = 88*GAMMA2 = 44*128*B
      */
 
-    /* Compute f1' = ceil(f / 128) as floor((f + 127) >> 7) */
+    /* Compute f1' = ceil(f / 128) as floor((f + 127) / 2^7) */
     f1 = _mm256_add_epi32(f, off);
     f1 = _mm256_srli_epi32(f1, 7);
     /*
@@ -73,10 +73,17 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      * _mm256_mulhi_epu16() below.
      */
 
+    /* check-magic: 744 == intdiv(1488, 2) */
     /*
      * Compute f1 = round-(f1' / B) ≈ round(f1' * 11275 / 2^24). This is exact
      * for 0 <= f1' < 2^16. Note that half is rounded down since 11275 / 2^24 ≲
-     * 1 / 1488.
+     * 1 / 1488, so (for example) f1' = B / 2 = 744 is mapped to
+     *
+     *   round(744 * 11275 / 2^24) = round(744 * (1 / 1488 - epsilon))
+     *                             = round(1 / 2 - epsilon') = 0,
+     *
+     * where epsilon = 1 / 1488 - 11275 / 2^24 and epsilon' = 744 * eps are both
+     * tiny but positive numbers.
      *
      * round(f1' * 11275 / 2^24) is in turn computed in 2 steps as
      * round(floor(f1' * 11275 / 2^16) / 2^8). The mulhi computes f1'' =
@@ -88,7 +95,9 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      */
     f1 = _mm256_mulhi_epu16(f1, v);
     /*
-     * range: 0 <= f1'' < floor(2^16 * 11275 / 2^16) = 11275
+     * range: 0 <= f1''  = floor(f1' * 11275 / 2^16)
+     *                  <= f1' * 11275 / 2^16
+     *                   < 2^16 * 11275 / 2^16 = 11275
      *
      * Because 0 <= f1'' < 2^15, the multiplication in mulhrs is unsigned, that
      * is, no erroneous sign-extension occurs.

@@ -61,7 +61,7 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      * range: 0 <= f <= Q-1 = 32*GAMMA2 = 16*128*B
      */
 
-    /* Compute f1' = ceil(f / 128) as floor((f + 127) >> 7) */
+    /* Compute f1' = ceil(f / 128) as floor((f + 127) / 2^7) */
     f1 = _mm256_add_epi32(f, off);
     f1 = _mm256_srli_epi32(f1, 7);
     /*
@@ -72,10 +72,17 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      * _mm256_mulhi_epu16() below.
      */
 
+    /* check-magic: 2046 == intdiv(4092, 2) */
     /*
      * Compute f1 = round-(f1' / B) ≈ round(f1' * 1025 / 2^22). This is exact
      * for 0 <= f1' < 2^16. Note that half is rounded down since 1025 / 2^22 ≲
-     * 1 / 4092.
+     * 1 / 4092, so (for example) f1' = B / 2 = 2046 is mapped to
+     *
+     *   round(2046 * 1025 / 2^22) = round(2046 * (1 / 4092 - epsilon))
+     *                             = round(1 / 2 - epsilon') = 0,
+     *
+     * where epsilon = 1 / 4092 - 1025 / 2^22 and epsilon' = 2046 * eps are both
+     * tiny but positive numbers.
      *
      * round(f1' * 1025 / 2^22) is in turn computed in 2 steps as
      * round(floor(f1' * 1025 / 2^16) / 2^6). The mulhi computes f1'' =
@@ -87,7 +94,9 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      */
     f1 = _mm256_mulhi_epu16(f1, v);
     /*
-     * range: 0 <= f1'' < floor(2^16 * 1025 / 2^16) = 1025
+     * range: 0 <= f1''  = floor(f1' * 1025 / 2^16)
+     *                  <= f1' * 1025 / 2^16
+     *                   < 2^16 * 1025 / 2^16 = 1025
      *
      * Because 0 <= f1'' < 2^15, the multiplication in mulhrs is unsigned, that
      * is, no erroneous sign-extension occurs.

@@ -62,7 +62,7 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      * range: 0 <= f <= Q-1 = 88*GAMMA2 = 44*128*B
      */
 
-    /* Compute f1' = ceil(f / 128) as floor((f + 127) >> 7) */
+    /* Compute f1' = ceil(f / 128) as floor((f + 127) / 2^7) */
     f1 = _mm256_add_epi32(f, off);
     f1 = _mm256_srli_epi32(f1, 7);
     /*
@@ -73,10 +73,17 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      * _mm256_mulhi_epu16() below.
      */
 
+    /* check-magic: 744 == intdiv(1488, 2) */
     /*
      * Compute f1 = round-(f1' / B) ≈ round(f1' * 11275 / 2^24). This is exact
      * for 0 <= f1' < 2^16. Note that half is rounded down since 11275 / 2^24 ≲
-     * 1 / 1488.
+     * 1 / 1488, so (for example) f1' = B / 2 = 744 is mapped to
+     *
+     *   round(744 * 11275 / 2^24) = round(744 * (1 / 1488 - epsilon))
+     *                             = round(1 / 2 - epsilon') = 0,
+     *
+     * where epsilon = 1 / 1488 - 11275 / 2^24 and epsilon' = 744 * eps are both
+     * tiny but positive numbers.
      *
      * round(f1' * 11275 / 2^24) is in turn computed in 2 steps as
      * round(floor(f1' * 11275 / 2^16) / 2^8). The mulhi computes f1'' =
@@ -88,7 +95,9 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      */
     f1 = _mm256_mulhi_epu16(f1, v);
     /*
-     * range: 0 <= f1'' < floor(2^16 * 11275 / 2^16) = 11275
+     * range: 0 <= f1''  = floor(f1' * 11275 / 2^16)
+     *                  <= f1' * 11275 / 2^16
+     *                   < 2^16 * 11275 / 2^16 = 11275
      *
      * Because 0 <= f1'' < 2^15, the multiplication in mulhrs is unsigned, that
      * is, no erroneous sign-extension occurs.