From 356206451aacb9a41783094a35010f2930072f9e Mon Sep 17 00:00:00 2001 From: jammychiou1 Date: Fri, 21 Nov 2025 18:31:39 +0800 Subject: [PATCH 1/2] Fix some minor details in comments for AVX2 decompose - The floor() in floor((f + 127) >> 7) was somewhat unecessary as the usual semantic for the right-shift operator (>>) has integer output anyway. Seeing as the right-shift operator is not used in other explanation comments, we decided to rewrite it as division by 2^7 for better consistency. - The bound of f1'' is correct but the proof was misleading. The new proof should be clearer. Signed-off-by: jammychiou1 --- dev/x86_64/src/poly_decompose_32_avx2.c | 6 ++++-- dev/x86_64/src/poly_decompose_88_avx2.c | 6 ++++-- mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c | 6 ++++-- mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c | 6 ++++-- 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/dev/x86_64/src/poly_decompose_32_avx2.c b/dev/x86_64/src/poly_decompose_32_avx2.c index b4266ce95..eeb93cb1f 100644 --- a/dev/x86_64/src/poly_decompose_32_avx2.c +++ b/dev/x86_64/src/poly_decompose_32_avx2.c @@ -61,7 +61,7 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a) * range: 0 <= f <= Q-1 = 32*GAMMA2 = 16*128*B */ - /* Compute f1' = ceil(f / 128) as floor((f + 127) >> 7) */ + /* Compute f1' = ceil(f / 128) as floor((f + 127) / 2^7) */ f1 = _mm256_add_epi32(f, off); f1 = _mm256_srli_epi32(f1, 7); /* @@ -87,7 +87,9 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a) */ f1 = _mm256_mulhi_epu16(f1, v); /* - * range: 0 <= f1'' < floor(2^16 * 1025 / 2^16) = 1025 + * range: 0 <= f1'' = floor(f1' * 1025 / 2^16) + * <= f1' * 1025 / 2^16 + * < 2^16 * 1025 / 2^16 = 1025 * * Because 0 <= f1'' < 2^15, the multiplication in mulhrs is unsigned, that * is, no erroneous sign-extension occurs. diff --git a/dev/x86_64/src/poly_decompose_88_avx2.c b/dev/x86_64/src/poly_decompose_88_avx2.c index 4461b06af..d34595c20 100644 --- a/dev/x86_64/src/poly_decompose_88_avx2.c +++ b/dev/x86_64/src/poly_decompose_88_avx2.c @@ -62,7 +62,7 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a) * range: 0 <= f <= Q-1 = 88*GAMMA2 = 44*128*B */ - /* Compute f1' = ceil(f / 128) as floor((f + 127) >> 7) */ + /* Compute f1' = ceil(f / 128) as floor((f + 127) / 2^7) */ f1 = _mm256_add_epi32(f, off); f1 = _mm256_srli_epi32(f1, 7); /* @@ -88,7 +88,9 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a) */ f1 = _mm256_mulhi_epu16(f1, v); /* - * range: 0 <= f1'' < floor(2^16 * 11275 / 2^16) = 11275 + * range: 0 <= f1'' = floor(f1' * 11275 / 2^16) + * <= f1' * 11275 / 2^16 + * < 2^16 * 11275 / 2^16 = 11275 * * Because 0 <= f1'' < 2^15, the multiplication in mulhrs is unsigned, that * is, no erroneous sign-extension occurs. diff --git a/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c b/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c index b4266ce95..eeb93cb1f 100644 --- a/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c +++ b/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c @@ -61,7 +61,7 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a) * range: 0 <= f <= Q-1 = 32*GAMMA2 = 16*128*B */ - /* Compute f1' = ceil(f / 128) as floor((f + 127) >> 7) */ + /* Compute f1' = ceil(f / 128) as floor((f + 127) / 2^7) */ f1 = _mm256_add_epi32(f, off); f1 = _mm256_srli_epi32(f1, 7); /* @@ -87,7 +87,9 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a) */ f1 = _mm256_mulhi_epu16(f1, v); /* - * range: 0 <= f1'' < floor(2^16 * 1025 / 2^16) = 1025 + * range: 0 <= f1'' = floor(f1' * 1025 / 2^16) + * <= f1' * 1025 / 2^16 + * < 2^16 * 1025 / 2^16 = 1025 * * Because 0 <= f1'' < 2^15, the multiplication in mulhrs is unsigned, that * is, no erroneous sign-extension occurs. diff --git a/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c b/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c index 4461b06af..d34595c20 100644 --- a/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c +++ b/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c @@ -62,7 +62,7 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a) * range: 0 <= f <= Q-1 = 88*GAMMA2 = 44*128*B */ - /* Compute f1' = ceil(f / 128) as floor((f + 127) >> 7) */ + /* Compute f1' = ceil(f / 128) as floor((f + 127) / 2^7) */ f1 = _mm256_add_epi32(f, off); f1 = _mm256_srli_epi32(f1, 7); /* @@ -88,7 +88,9 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a) */ f1 = _mm256_mulhi_epu16(f1, v); /* - * range: 0 <= f1'' < floor(2^16 * 11275 / 2^16) = 11275 + * range: 0 <= f1'' = floor(f1' * 11275 / 2^16) + * <= f1' * 11275 / 2^16 + * < 2^16 * 11275 / 2^16 = 11275 * * Because 0 <= f1'' < 2^15, the multiplication in mulhrs is unsigned, that * is, no erroneous sign-extension occurs. From 326d7312784dbb00300da4285ead0b2d7f1843f1 Mon Sep 17 00:00:00 2001 From: jammychiou1 Date: Mon, 1 Dec 2025 12:01:24 +0800 Subject: [PATCH 2/2] [DRAFT] Better explanation for round() vs round-() in decompose Only the AVX2 one is updated currently. The NEON one will also be updated if this looks good. Signed-off-by: jammychiou1 --- dev/x86_64/src/poly_decompose_32_avx2.c | 9 ++++++++- dev/x86_64/src/poly_decompose_88_avx2.c | 9 ++++++++- mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c | 9 ++++++++- mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c | 9 ++++++++- 4 files changed, 32 insertions(+), 4 deletions(-) diff --git a/dev/x86_64/src/poly_decompose_32_avx2.c b/dev/x86_64/src/poly_decompose_32_avx2.c index eeb93cb1f..a28a9157e 100644 --- a/dev/x86_64/src/poly_decompose_32_avx2.c +++ b/dev/x86_64/src/poly_decompose_32_avx2.c @@ -72,10 +72,17 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a) * _mm256_mulhi_epu16() below. */ + /* check-magic: 2046 == intdiv(4092, 2) */ /* * Compute f1 = round-(f1' / B) ≈ round(f1' * 1025 / 2^22). This is exact * for 0 <= f1' < 2^16. Note that half is rounded down since 1025 / 2^22 ≲ - * 1 / 4092. + * 1 / 4092, so (for example) f1' = B / 2 = 2046 is mapped to + * + * round(2046 * 1025 / 2^22) = round(2046 * (1 / 4092 - epsilon)) + * = round(1 / 2 - epsilon') = 0, + * + * where epsilon = 1 / 4092 - 1025 / 2^22 and epsilon' = 2046 * eps are both + * tiny but positive numbers. * * round(f1' * 1025 / 2^22) is in turn computed in 2 steps as * round(floor(f1' * 1025 / 2^16) / 2^6). The mulhi computes f1'' = diff --git a/dev/x86_64/src/poly_decompose_88_avx2.c b/dev/x86_64/src/poly_decompose_88_avx2.c index d34595c20..60a54829c 100644 --- a/dev/x86_64/src/poly_decompose_88_avx2.c +++ b/dev/x86_64/src/poly_decompose_88_avx2.c @@ -73,10 +73,17 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a) * _mm256_mulhi_epu16() below. */ + /* check-magic: 744 == intdiv(1488, 2) */ /* * Compute f1 = round-(f1' / B) ≈ round(f1' * 11275 / 2^24). This is exact * for 0 <= f1' < 2^16. Note that half is rounded down since 11275 / 2^24 ≲ - * 1 / 1488. + * 1 / 1488, so (for example) f1' = B / 2 = 744 is mapped to + * + * round(744 * 11275 / 2^24) = round(744 * (1 / 1488 - epsilon)) + * = round(1 / 2 - epsilon') = 0, + * + * where epsilon = 1 / 1488 - 11275 / 2^24 and epsilon' = 744 * eps are both + * tiny but positive numbers. * * round(f1' * 11275 / 2^24) is in turn computed in 2 steps as * round(floor(f1' * 11275 / 2^16) / 2^8). The mulhi computes f1'' = diff --git a/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c b/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c index eeb93cb1f..a28a9157e 100644 --- a/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c +++ b/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c @@ -72,10 +72,17 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a) * _mm256_mulhi_epu16() below. */ + /* check-magic: 2046 == intdiv(4092, 2) */ /* * Compute f1 = round-(f1' / B) ≈ round(f1' * 1025 / 2^22). This is exact * for 0 <= f1' < 2^16. Note that half is rounded down since 1025 / 2^22 ≲ - * 1 / 4092. + * 1 / 4092, so (for example) f1' = B / 2 = 2046 is mapped to + * + * round(2046 * 1025 / 2^22) = round(2046 * (1 / 4092 - epsilon)) + * = round(1 / 2 - epsilon') = 0, + * + * where epsilon = 1 / 4092 - 1025 / 2^22 and epsilon' = 2046 * eps are both + * tiny but positive numbers. * * round(f1' * 1025 / 2^22) is in turn computed in 2 steps as * round(floor(f1' * 1025 / 2^16) / 2^6). The mulhi computes f1'' = diff --git a/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c b/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c index d34595c20..60a54829c 100644 --- a/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c +++ b/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c @@ -73,10 +73,17 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a) * _mm256_mulhi_epu16() below. */ + /* check-magic: 744 == intdiv(1488, 2) */ /* * Compute f1 = round-(f1' / B) ≈ round(f1' * 11275 / 2^24). This is exact * for 0 <= f1' < 2^16. Note that half is rounded down since 11275 / 2^24 ≲ - * 1 / 1488. + * 1 / 1488, so (for example) f1' = B / 2 = 744 is mapped to + * + * round(744 * 11275 / 2^24) = round(744 * (1 / 1488 - epsilon)) + * = round(1 / 2 - epsilon') = 0, + * + * where epsilon = 1 / 1488 - 11275 / 2^24 and epsilon' = 744 * eps are both + * tiny but positive numbers. * * round(f1' * 11275 / 2^24) is in turn computed in 2 steps as * round(floor(f1' * 11275 / 2^16) / 2^8). The mulhi computes f1'' =