diff --git a/dev/x86_64/src/poly_decompose_32_avx2.c b/dev/x86_64/src/poly_decompose_32_avx2.c index b4266ce95..a28a9157e 100644 --- a/dev/x86_64/src/poly_decompose_32_avx2.c +++ b/dev/x86_64/src/poly_decompose_32_avx2.c @@ -61,7 +61,7 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a) * range: 0 <= f <= Q-1 = 32*GAMMA2 = 16*128*B */ - /* Compute f1' = ceil(f / 128) as floor((f + 127) >> 7) */ + /* Compute f1' = ceil(f / 128) as floor((f + 127) / 2^7) */ f1 = _mm256_add_epi32(f, off); f1 = _mm256_srli_epi32(f1, 7); /* @@ -72,10 +72,17 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a) * _mm256_mulhi_epu16() below. */ + /* check-magic: 2046 == intdiv(4092, 2) */ /* * Compute f1 = round-(f1' / B) ≈ round(f1' * 1025 / 2^22). This is exact * for 0 <= f1' < 2^16. Note that half is rounded down since 1025 / 2^22 ≲ - * 1 / 4092. + * 1 / 4092, so (for example) f1' = B / 2 = 2046 is mapped to + * + * round(2046 * 1025 / 2^22) = round(2046 * (1 / 4092 - epsilon)) + * = round(1 / 2 - epsilon') = 0, + * + * where epsilon = 1 / 4092 - 1025 / 2^22 and epsilon' = 2046 * eps are both + * tiny but positive numbers. * * round(f1' * 1025 / 2^22) is in turn computed in 2 steps as * round(floor(f1' * 1025 / 2^16) / 2^6). The mulhi computes f1'' = @@ -87,7 +94,9 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a) */ f1 = _mm256_mulhi_epu16(f1, v); /* - * range: 0 <= f1'' < floor(2^16 * 1025 / 2^16) = 1025 + * range: 0 <= f1'' = floor(f1' * 1025 / 2^16) + * <= f1' * 1025 / 2^16 + * < 2^16 * 1025 / 2^16 = 1025 * * Because 0 <= f1'' < 2^15, the multiplication in mulhrs is unsigned, that * is, no erroneous sign-extension occurs. diff --git a/dev/x86_64/src/poly_decompose_88_avx2.c b/dev/x86_64/src/poly_decompose_88_avx2.c index 4461b06af..60a54829c 100644 --- a/dev/x86_64/src/poly_decompose_88_avx2.c +++ b/dev/x86_64/src/poly_decompose_88_avx2.c @@ -62,7 +62,7 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a) * range: 0 <= f <= Q-1 = 88*GAMMA2 = 44*128*B */ - /* Compute f1' = ceil(f / 128) as floor((f + 127) >> 7) */ + /* Compute f1' = ceil(f / 128) as floor((f + 127) / 2^7) */ f1 = _mm256_add_epi32(f, off); f1 = _mm256_srli_epi32(f1, 7); /* @@ -73,10 +73,17 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a) * _mm256_mulhi_epu16() below. */ + /* check-magic: 744 == intdiv(1488, 2) */ /* * Compute f1 = round-(f1' / B) ≈ round(f1' * 11275 / 2^24). This is exact * for 0 <= f1' < 2^16. Note that half is rounded down since 11275 / 2^24 ≲ - * 1 / 1488. + * 1 / 1488, so (for example) f1' = B / 2 = 744 is mapped to + * + * round(744 * 11275 / 2^24) = round(744 * (1 / 1488 - epsilon)) + * = round(1 / 2 - epsilon') = 0, + * + * where epsilon = 1 / 1488 - 11275 / 2^24 and epsilon' = 744 * eps are both + * tiny but positive numbers. * * round(f1' * 11275 / 2^24) is in turn computed in 2 steps as * round(floor(f1' * 11275 / 2^16) / 2^8). The mulhi computes f1'' = @@ -88,7 +95,9 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a) */ f1 = _mm256_mulhi_epu16(f1, v); /* - * range: 0 <= f1'' < floor(2^16 * 11275 / 2^16) = 11275 + * range: 0 <= f1'' = floor(f1' * 11275 / 2^16) + * <= f1' * 11275 / 2^16 + * < 2^16 * 11275 / 2^16 = 11275 * * Because 0 <= f1'' < 2^15, the multiplication in mulhrs is unsigned, that * is, no erroneous sign-extension occurs. diff --git a/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c b/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c index b4266ce95..a28a9157e 100644 --- a/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c +++ b/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c @@ -61,7 +61,7 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a) * range: 0 <= f <= Q-1 = 32*GAMMA2 = 16*128*B */ - /* Compute f1' = ceil(f / 128) as floor((f + 127) >> 7) */ + /* Compute f1' = ceil(f / 128) as floor((f + 127) / 2^7) */ f1 = _mm256_add_epi32(f, off); f1 = _mm256_srli_epi32(f1, 7); /* @@ -72,10 +72,17 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a) * _mm256_mulhi_epu16() below. */ + /* check-magic: 2046 == intdiv(4092, 2) */ /* * Compute f1 = round-(f1' / B) ≈ round(f1' * 1025 / 2^22). This is exact * for 0 <= f1' < 2^16. Note that half is rounded down since 1025 / 2^22 ≲ - * 1 / 4092. + * 1 / 4092, so (for example) f1' = B / 2 = 2046 is mapped to + * + * round(2046 * 1025 / 2^22) = round(2046 * (1 / 4092 - epsilon)) + * = round(1 / 2 - epsilon') = 0, + * + * where epsilon = 1 / 4092 - 1025 / 2^22 and epsilon' = 2046 * eps are both + * tiny but positive numbers. * * round(f1' * 1025 / 2^22) is in turn computed in 2 steps as * round(floor(f1' * 1025 / 2^16) / 2^6). The mulhi computes f1'' = @@ -87,7 +94,9 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a) */ f1 = _mm256_mulhi_epu16(f1, v); /* - * range: 0 <= f1'' < floor(2^16 * 1025 / 2^16) = 1025 + * range: 0 <= f1'' = floor(f1' * 1025 / 2^16) + * <= f1' * 1025 / 2^16 + * < 2^16 * 1025 / 2^16 = 1025 * * Because 0 <= f1'' < 2^15, the multiplication in mulhrs is unsigned, that * is, no erroneous sign-extension occurs. diff --git a/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c b/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c index 4461b06af..60a54829c 100644 --- a/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c +++ b/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c @@ -62,7 +62,7 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a) * range: 0 <= f <= Q-1 = 88*GAMMA2 = 44*128*B */ - /* Compute f1' = ceil(f / 128) as floor((f + 127) >> 7) */ + /* Compute f1' = ceil(f / 128) as floor((f + 127) / 2^7) */ f1 = _mm256_add_epi32(f, off); f1 = _mm256_srli_epi32(f1, 7); /* @@ -73,10 +73,17 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a) * _mm256_mulhi_epu16() below. */ + /* check-magic: 744 == intdiv(1488, 2) */ /* * Compute f1 = round-(f1' / B) ≈ round(f1' * 11275 / 2^24). This is exact * for 0 <= f1' < 2^16. Note that half is rounded down since 11275 / 2^24 ≲ - * 1 / 1488. + * 1 / 1488, so (for example) f1' = B / 2 = 744 is mapped to + * + * round(744 * 11275 / 2^24) = round(744 * (1 / 1488 - epsilon)) + * = round(1 / 2 - epsilon') = 0, + * + * where epsilon = 1 / 1488 - 11275 / 2^24 and epsilon' = 744 * eps are both + * tiny but positive numbers. * * round(f1' * 11275 / 2^24) is in turn computed in 2 steps as * round(floor(f1' * 11275 / 2^16) / 2^8). The mulhi computes f1'' = @@ -88,7 +95,9 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a) */ f1 = _mm256_mulhi_epu16(f1, v); /* - * range: 0 <= f1'' < floor(2^16 * 11275 / 2^16) = 11275 + * range: 0 <= f1'' = floor(f1' * 11275 / 2^16) + * <= f1' * 11275 / 2^16 + * < 2^16 * 11275 / 2^16 = 11275 * * Because 0 <= f1'' < 2^15, the multiplication in mulhrs is unsigned, that * is, no erroneous sign-extension occurs.