cp-algorithms
diff --git a/cp-algo/linalg/matrix.hpp b/cp-algo/linalg/matrix.hpp
Lines changed: 3 additions & 0 deletions
diff --git a/cp-algo/linalg/vector.hpp b/cp-algo/linalg/vector.hpp
Lines changed: 3 additions & 0 deletions
diff --git a/cp-algo/math/cvector.hpp b/cp-algo/math/cvector.hpp
Lines changed: 19 additions & 16 deletions
diff --git a/cp-algo/math/factorials.hpp b/cp-algo/math/factorials.hpp
Lines changed: 4 additions & 1 deletion
diff --git a/cp-algo/math/fft.hpp b/cp-algo/math/fft.hpp
Lines changed: 22 additions & 19 deletions
diff --git a/cp-algo/math/fft64.hpp b/cp-algo/math/fft64.hpp
Lines changed: 5 additions & 2 deletions
@@ -1,5 +1,7 @@
 #ifndef CP_ALGO_LINALG_MATRIX_HPP
 #define CP_ALGO_LINALG_MATRIX_HPP
+#pragma GCC push_options
+#pragma GCC target("avx2")
 #include "../random/rng.hpp"
 #include "../math/common.hpp"
 #include "vector.hpp"
@@ -304,4 +306,5 @@ namespace cp_algo::linalg {
     template<typename base_t>
     auto operator *(base_t t, matrix<base_t> const& A) {return A * t;}
 }
+#pragma GCC pop_options
 #endif // CP_ALGO_LINALG_MATRIX_HPP
@@ -1,5 +1,7 @@
 #ifndef CP_ALGO_LINALG_VECTOR_HPP
 #define CP_ALGO_LINALG_VECTOR_HPP
+#pragma GCC push_options
+#pragma GCC target("avx2")
 #include "../random/rng.hpp"
 #include "../number_theory/modint.hpp"
 #include "../util/big_alloc.hpp"
@@ -152,4 +154,5 @@ namespace cp_algo::linalg {
         size_t counter = 0;
     };
 }
+#pragma GCC pop_options
 #endif // CP_ALGO_LINALG_VECTOR_HPP
@@ -1,5 +1,7 @@
 #ifndef CP_ALGO_MATH_CVECTOR_HPP
 #define CP_ALGO_MATH_CVECTOR_HPP
+#pragma GCC push_options
+#pragma GCC target("avx2")
 #include "../util/simd.hpp"
 #include "../util/complex.hpp"
 #include "../util/checkpoint.hpp"
@@ -15,7 +17,7 @@ namespace cp_algo::math::fft {
     using point = complex<ftype>;
     using vpoint = complex<vftype>;
     static constexpr vftype vz = {};
-    simd_target vpoint vi(vpoint const& r) {
+    vpoint vi(vpoint const& r) {
         return {-imag(r), real(r)};
     }
 
@@ -30,7 +32,7 @@ namespace cp_algo::math::fft {
         vpoint& at(size_t k) {return r[k / flen];}
         vpoint at(size_t k) const {return r[k / flen];}
         template<class pt = point>
-        simd_inline void set(size_t k, pt const& t) {
+        inline void set(size_t k, pt const& t) {
             if constexpr(std::is_same_v<pt, point>) {
                 real(r[k / flen])[k % flen] = real(t);
                 imag(r[k / flen])[k % flen] = imag(t);
@@ -39,7 +41,7 @@ namespace cp_algo::math::fft {
             }
         }
         template<class pt = point>
-        simd_inline pt get(size_t k) const {
+        inline pt get(size_t k) const {
             if constexpr(std::is_same_v<pt, point>) {
                 return {real(r[k / flen])[k % flen], imag(r[k / flen])[k % flen]};
             } else {
@@ -79,18 +81,18 @@ namespace cp_algo::math::fft {
             return roots[std::bit_width(n)];
         }
         template<int step>
-        simd_target static void exec_on_eval(size_t n, size_t k, auto &&callback) {
+        static void exec_on_eval(size_t n, size_t k, auto &&callback) {
             callback(k, root(4 * step * n) * eval_point(step * k));
         }
         template<int step>
-        simd_target static void exec_on_evals(size_t n, auto &&callback) {
+        static void exec_on_evals(size_t n, auto &&callback) {
             point factor = root(4 * step * n);
             for(size_t i = 0; i < n; i++) {
                 callback(i, factor * eval_point(step * i));
             }
         }
 
-        simd_target static void do_dot_iter(point rt, vpoint& Bv, vpoint const& Av, vpoint& res) {
+        static void do_dot_iter(point rt, vpoint& Bv, vpoint const& Av, vpoint& res) {
             res += Av * Bv;
             real(Bv) = rotate_right(real(Bv));
             imag(Bv) = rotate_right(imag(Bv));
@@ -99,9 +101,9 @@ namespace cp_algo::math::fft {
             imag(Bv)[0] = x * imag(rt) + y * real(rt);
         }
 
-        simd_target void dot(cvector const& t) {
+        void dot(cvector const& t) {
             size_t n = this->size();
-            exec_on_evals<1>(n / flen, [&](size_t k, point rt) {
+            exec_on_evals<1>(n / flen, [&](size_t k, point rt) __attribute__((always_inline)) {
                 k *= flen;
                 auto [Ax, Ay] = at(k);
                 auto Bv = t.at(k);
@@ -115,11 +117,11 @@ namespace cp_algo::math::fft {
             checkpoint("dot");
         }
         template<bool partial = true>
-        simd_target void ifft() {
+        void ifft() {
             size_t n = size();
             if constexpr (!partial) {
                 point pi(0, 1);
-                exec_on_evals<4>(n / 4, [&](size_t k, point rt) {
+                exec_on_evals<4>(n / 4, [&](size_t k, point rt) __attribute__((always_inline)) {
                     k *= 4;
                     point v1 = conj(rt);
                     point v2 = v1 * v1;
@@ -136,7 +138,7 @@ namespace cp_algo::math::fft {
             }
             bool parity = std::countr_zero(n) % 2;
             if(parity) {
-                exec_on_evals<2>(n / (2 * flen), [&](size_t k, point rt) {
+                exec_on_evals<2>(n / (2 * flen), [&](size_t k, point rt) __attribute__((always_inline)) {
                     k *= 2 * flen;
                     vpoint cvrt = {vz + real(rt), vz - imag(rt)};
                     auto B = at(k) - at(k + flen);
@@ -149,7 +151,7 @@ namespace cp_algo::math::fft {
                 size_t level = std::countr_one(leaf + 3);
                 for(size_t lvl = 4 + parity; lvl <= level; lvl += 2) {
                     size_t i = (1 << lvl) / 4;
-                    exec_on_eval<4>(n >> lvl, leaf >> lvl, [&](size_t k, point rt) {
+                    exec_on_eval<4>(n >> lvl, leaf >> lvl, [&](size_t k, point rt) __attribute__((always_inline)) {
                         k <<= lvl;
                         vpoint v1 = {vz + real(rt), vz - imag(rt)};
                         vpoint v2 = v1 * v1;
@@ -177,15 +179,15 @@ namespace cp_algo::math::fft {
             }
         }
         template<bool partial = true>
-        simd_target void fft() {
+        void fft() {
             size_t n = size();
             bool parity = std::countr_zero(n) % 2;
             for(size_t leaf = 0; leaf < n; leaf += 4 * flen) {
                 size_t level = std::countr_zero(n + leaf);
                 level -= level % 2 != parity;
                 for(size_t lvl = level; lvl >= 4; lvl -= 2) {
                     size_t i = (1 << lvl) / 4;
-                    exec_on_eval<4>(n >> lvl, leaf >> lvl, [&](size_t k, point rt) {
+                    exec_on_eval<4>(n >> lvl, leaf >> lvl, [&](size_t k, point rt) __attribute__((always_inline)) {
                         k <<= lvl;
                         vpoint v1 = {vz + real(rt), vz + imag(rt)};
                         vpoint v2 = v1 * v1;
@@ -204,7 +206,7 @@ namespace cp_algo::math::fft {
                 }
             }
             if(parity) {
-                exec_on_evals<2>(n / (2 * flen), [&](size_t k, point rt) {
+                exec_on_evals<2>(n / (2 * flen), [&](size_t k, point rt) __attribute__((always_inline)) {
                     k *= 2 * flen;
                     vpoint vrt = {vz + real(rt), vz + imag(rt)};
                     auto t = at(k + flen) * vrt;
@@ -214,7 +216,7 @@ namespace cp_algo::math::fft {
             }
             if constexpr (!partial) {
                 point pi(0, 1);
-                exec_on_evals<4>(n / 4, [&](size_t k, point rt) {
+                exec_on_evals<4>(n / 4, [&](size_t k, point rt) __attribute__((always_inline)) {
                     k *= 4;
                     point v1 = rt;
                     point v2 = v1 * v1;
@@ -252,4 +254,5 @@ namespace cp_algo::math::fft {
         return res;
     }();
 }
+#pragma GCC pop_options
 #endif // CP_ALGO_MATH_CVECTOR_HPP
@@ -1,5 +1,7 @@
 #ifndef CP_ALGO_MATH_FACTORIALS_HPP
 #define CP_ALGO_MATH_FACTORIALS_HPP
+#pragma GCC push_options
+#pragma GCC target("avx2")
 #include "../util/checkpoint.hpp"
 #include "../util/bump_alloc.hpp"
 #include "../util/simd.hpp"
@@ -9,7 +11,7 @@
 
 namespace cp_algo::math {
     template<bool use_bump_alloc = false, int maxn = -1>
-    simd_target auto facts(auto const& args) {
+    auto facts(auto const& args) {
         static_assert(!use_bump_alloc || maxn > 0, "maxn must be set if use_bump_alloc is true");
         constexpr int max_mod = 1'000'000'000;
         constexpr int accum = 4;
@@ -93,4 +95,5 @@ namespace cp_algo::math {
         return res;
     }
 }
+#pragma GCC pop_options
 #endif // CP_ALGO_MATH_FACTORIALS_HPP
@@ -1,5 +1,7 @@
 #ifndef CP_ALGO_MATH_FFT_HPP
 #define CP_ALGO_MATH_FFT_HPP
+#pragma GCC push_options
+#pragma GCC target("avx2")
 #include "../number_theory/modint.hpp"
 #include "../util/checkpoint.hpp"
 #include "../random/rng.hpp"
@@ -29,7 +31,7 @@ namespace cp_algo::math::fft {
             }
         }
 
-        simd_target static std::pair<vftype, vftype> 
+        static std::pair<vftype, vftype> 
         do_split(auto const& a, size_t idx, u64x4 mul) {
             if(idx >= std::size(a)) {
                 return std::pair{vftype(), vftype()};
@@ -48,7 +50,7 @@ namespace cp_algo::math::fft {
         }
 
         dft(size_t n): A(n), B(n) {init();}
-        simd_target dft(auto const& a, size_t n, bool partial = true): A(n), B(n) {
+        dft(auto const& a, size_t n, bool partial = true): A(n), B(n) {
             init();
             base b2x32 = bpow(base(2), 32);
             u64x4 cur = {
@@ -77,7 +79,7 @@ namespace cp_algo::math::fft {
                 }
             }
         }
-        simd_target static void do_dot_iter(point rt, vpoint& Cv, vpoint& Dv, vpoint const& Av, vpoint const& Bv, vpoint& AC, vpoint& AD, vpoint& BC, vpoint& BD) {
+        static void do_dot_iter(point rt, vpoint& Cv, vpoint& Dv, vpoint const& Av, vpoint const& Bv, vpoint& AC, vpoint& AD, vpoint& BC, vpoint& BD) {
             AC += Av * Cv; AD += Av * Dv;
             BC += Bv * Cv; BD += Bv * Dv;
             real(Cv) = rotate_right(real(Cv));
@@ -93,8 +95,8 @@ namespace cp_algo::math::fft {
         }
 
         template<bool overwrite = true, bool partial = true>
-        simd_target void dot(auto const& C, auto const& D, auto &Aout, auto &Bout, auto &Cout) const {
-            cvector::exec_on_evals<1>(A.size() / flen, [&](size_t k, point rt) {
+        void dot(auto const& C, auto const& D, auto &Aout, auto &Bout, auto &Cout) const {
+            cvector::exec_on_evals<1>(A.size() / flen, [&](size_t k, point rt) __attribute__((always_inline)) {
                 k *= flen;
                 vpoint AC, AD, BC, BD;
                 AC = AD = BC = BD = vz;
@@ -125,11 +127,11 @@ namespace cp_algo::math::fft {
             checkpoint("dot");
         }
 
-        [[gnu::target("avx2")]] void dot(auto &&C, auto const& D) {
+        void dot(auto &&C, auto const& D) {
             dot(C, D, A, B, C);
         }
 
-        simd_target static void do_recover_iter(size_t idx, auto A, auto B, auto C, auto mul, uint64_t splitsplit, auto &res) {
+        static void do_recover_iter(size_t idx, auto A, auto B, auto C, auto mul, uint64_t splitsplit, auto &res) {
             auto A0 = lround(A), A1 = lround(C), A2 = lround(B);
             auto Ai = A0 + A1 * split() + A2 * splitsplit + uint64_t(base::modmod());
             auto Au = montgomery_reduce(u64x4(Ai), mod, imod);
@@ -140,7 +142,7 @@ namespace cp_algo::math::fft {
             }
         }
 
-        simd_target void recover_mod(auto &&C, auto &res, size_t k) {
+        void recover_mod(auto &&C, auto &res, size_t k) {
             size_t check = (k + flen - 1) / flen * flen;
             assert(res.size() >= check);
             size_t n = A.size();
@@ -168,7 +170,7 @@ namespace cp_algo::math::fft {
             checkpoint("recover mod");
         }
 
-        simd_target void mul(auto &&C, auto const& D, auto &res, size_t k) {
+        void mul(auto &&C, auto const& D, auto &res, size_t k) {
             assert(A.size() == C.size());
             size_t n = A.size();
             if(!n) {
@@ -181,10 +183,10 @@ namespace cp_algo::math::fft {
             C.ifft();
             recover_mod(C, res, k);
         }
-        simd_target void mul_inplace(auto &&B, auto& res, size_t k) {
+        void mul_inplace(auto &&B, auto& res, size_t k) {
             mul(B.A, B.B, res, k);
         }
-        simd_target void mul(auto const& B, auto& res, size_t k) {
+        void mul(auto const& B, auto& res, size_t k) {
             mul(cvector(B.A), B.B, res, k);
         }
         big_vector<base> operator *= (dft &B) {
@@ -209,7 +211,7 @@ namespace cp_algo::math::fft {
     template<modint_type base> uint32_t dft<base>::mod = {};
     template<modint_type base> uint32_t dft<base>::imod = {};
 
-    [[gnu::target("avx2")]] void mul_slow(auto &a, auto const& b, size_t k) {
+    void mul_slow(auto &a, auto const& b, size_t k) {
         if(std::empty(a) || std::empty(b)) {
             a.clear();
         } else {
@@ -230,7 +232,7 @@ namespace cp_algo::math::fft {
         }
         return std::max(flen, std::bit_ceil(as + bs - 1) / 2);
     }
-    [[gnu::target("avx2")]] void mul_truncate(auto &a, auto const& b, size_t k) {
+    void mul_truncate(auto &a, auto const& b, size_t k) {
         using base = std::decay_t<decltype(a[0])>;
         if(std::min({k, std::size(a), std::size(b)}) < magic) {
             mul_slow(a, b, k);
@@ -247,7 +249,7 @@ namespace cp_algo::math::fft {
     }
 
     // store mod x^n-k in first half, x^n+k in second half
-    simd_target void mod_split(auto &&x, size_t n, auto k) {
+    void mod_split(auto &&x, size_t n, auto k) {
         using base = std::decay_t<decltype(k)>;
         dft<base>::init();
         assert(std::size(x) == 2 * n);
@@ -279,7 +281,7 @@ namespace cp_algo::math::fft {
         }
         cp_algo::checkpoint("mod split");
     }
-    [[gnu::target("avx2")]] void cyclic_mul(auto &a, auto &&b, size_t k) {
+    void cyclic_mul(auto &a, auto &&b, size_t k) {
         assert(std::popcount(k) == 1);
         assert(std::size(a) == std::size(b) && std::size(a) == k);
         using base = std::decay_t<decltype(a[0])>;
@@ -312,13 +314,13 @@ namespace cp_algo::math::fft {
         }
         cp_algo::checkpoint("mod join");
     }
-    [[gnu::target("avx2")]] auto make_copy(auto &&x) {
+    auto make_copy(auto &&x) {
         return x;
     }
-    [[gnu::target("avx2")]] void cyclic_mul(auto &a, auto const& b, size_t k) {
+    void cyclic_mul(auto &a, auto const& b, size_t k) {
         return cyclic_mul(a, make_copy(b), k);
     }
-    [[gnu::target("avx2")]] void mul(auto &a, auto &&b) {
+    void mul(auto &a, auto &&b) {
         size_t N = size(a) + size(b);
         if(N > (1 << 20)) {
             N--;
@@ -331,7 +333,7 @@ namespace cp_algo::math::fft {
             mul_truncate(a, b, N - 1);
         }
     }
-    [[gnu::target("avx2")]] void mul(auto &a, auto const& b) {
+    void mul(auto &a, auto const& b) {
         size_t N = size(a) + size(b);
         if(N > (1 << 20)) {
             mul(a, make_copy(b));
@@ -340,4 +342,5 @@ namespace cp_algo::math::fft {
         }
     }
 }
+#pragma GCC pop_options
 #endif // CP_ALGO_MATH_FFT_HPP
@@ -1,5 +1,7 @@
 #ifndef CP_ALGO_MATH_FFT64_HPP
 #define CP_ALGO_MATH_FFT64_HPP
+#pragma GCC push_options
+#pragma GCC target("avx2")
 #include "../random/rng.hpp"
 #include "../math/common.hpp"
 #include "../math/cvector.hpp"
@@ -46,7 +48,7 @@ namespace cp_algo::math::fft {
             }
         }
 
-        simd_target static void do_dot_iter(point rt, std::array<vpoint, 4>& B, std::array<vpoint, 4> const& A, std::array<vpoint, 4>& C) {
+        static void do_dot_iter(point rt, std::array<vpoint, 4>& B, std::array<vpoint, 4> const& A, std::array<vpoint, 4>& C) {
             for(size_t k = 0; k < 4; k++) {
                 for(size_t i = 0; i <= k; i++) {
                     C[k] += A[i] * B[k - i];
@@ -63,7 +65,7 @@ namespace cp_algo::math::fft {
 
         void dot(dft64 const& t) {
             size_t N = cv[0].size();
-            cvector::exec_on_evals<1>(N / flen, [&](size_t k, point rt) {
+            cvector::exec_on_evals<1>(N / flen, [&](size_t k, point rt) __attribute__((always_inline)) {
                 k *= flen;
                 auto [A0x, A0y] = cv[0].at(k);
                 auto [A1x, A1y] = cv[1].at(k);
@@ -127,4 +129,5 @@ namespace cp_algo::math::fft {
         A.recover_mod(a, n + m - 1);
     }
 }
+#pragma GCC pop_options
 #endif // CP_ALGO_MATH_FFT64_HPP
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,7 @@`
`1`	`1`	`#ifndef CP_ALGO_MATH_FFT_HPP`
`2`	`2`	`#define CP_ALGO_MATH_FFT_HPP`
	`3`	`+#pragma GCC push_options`
	`4`	`+#pragma GCC target("avx2")`
`3`	`5`	`#include "../number_theory/modint.hpp"`
`4`	`6`	`#include "../util/checkpoint.hpp"`
`5`	`7`	`#include "../random/rng.hpp"`
`@@ -29,7 +31,7 @@ namespace cp_algo::math::fft {`
`29`	`31`	`}`
`30`	`32`	`}`
`31`	`33`
`32`		`- simd_target static std::pair<vftype, vftype>`
	`34`	`+ static std::pair<vftype, vftype>`
`33`	`35`	`do_split(auto const& a, size_t idx, u64x4 mul) {`
`34`	`36`	`if(idx >= std::size(a)) {`
`35`	`37`	`return std::pair{vftype(), vftype()};`
`@@ -48,7 +50,7 @@ namespace cp_algo::math::fft {`
`48`	`50`	`}`
`49`	`51`
`50`	`52`	`dft(size_t n): A(n), B(n) {init();}`
`51`		`- simd_target dft(auto const& a, size_t n, bool partial = true): A(n), B(n) {`
	`53`	`+ dft(auto const& a, size_t n, bool partial = true): A(n), B(n) {`
`52`	`54`	`init();`
`53`	`55`	`base b2x32 = bpow(base(2), 32);`
`54`	`56`	`u64x4 cur = {`
`@@ -77,7 +79,7 @@ namespace cp_algo::math::fft {`
`77`	`79`	`}`
`78`	`80`	`}`
`79`	`81`	`}`
`80`		`- simd_target static void do_dot_iter(point rt, vpoint& Cv, vpoint& Dv, vpoint const& Av, vpoint const& Bv, vpoint& AC, vpoint& AD, vpoint& BC, vpoint& BD) {`
	`82`	`+ static void do_dot_iter(point rt, vpoint& Cv, vpoint& Dv, vpoint const& Av, vpoint const& Bv, vpoint& AC, vpoint& AD, vpoint& BC, vpoint& BD) {`
`81`	`83`	`AC += Av * Cv; AD += Av * Dv;`
`82`	`84`	`BC += Bv * Cv; BD += Bv * Dv;`
`83`	`85`	`real(Cv) = rotate_right(real(Cv));`
`@@ -93,8 +95,8 @@ namespace cp_algo::math::fft {`
`93`	`95`	`}`
`94`	`96`
`95`	`97`	`template<bool overwrite = true, bool partial = true>`
`96`		`- simd_target void dot(auto const& C, auto const& D, auto &Aout, auto &Bout, auto &Cout) const {`
`97`		`- cvector::exec_on_evals<1>(A.size() / flen, [&](size_t k, point rt) {`
	`98`	`+ void dot(auto const& C, auto const& D, auto &Aout, auto &Bout, auto &Cout) const {`
	`99`	`+ cvector::exec_on_evals<1>(A.size() / flen, [&](size_t k, point rt) __attribute__((always_inline)) {`
`98`	`100`	`k *= flen;`
`99`	`101`	`vpoint AC, AD, BC, BD;`
`100`	`102`	`AC = AD = BC = BD = vz;`
`@@ -125,11 +127,11 @@ namespace cp_algo::math::fft {`
`125`	`127`	`checkpoint("dot");`
`126`	`128`	`}`
`127`	`129`
`128`		`- [[gnu::target("avx2")]] void dot(auto &&C, auto const& D) {`
	`130`	`+ void dot(auto &&C, auto const& D) {`
`129`	`131`	`dot(C, D, A, B, C);`
`130`	`132`	`}`
`131`	`133`
`132`		`- simd_target static void do_recover_iter(size_t idx, auto A, auto B, auto C, auto mul, uint64_t splitsplit, auto &res) {`
	`134`	`+ static void do_recover_iter(size_t idx, auto A, auto B, auto C, auto mul, uint64_t splitsplit, auto &res) {`
`133`	`135`	`auto A0 = lround(A), A1 = lround(C), A2 = lround(B);`
`134`	`136`	`auto Ai = A0 + A1 * split() + A2 * splitsplit + uint64_t(base::modmod());`
`135`	`137`	`auto Au = montgomery_reduce(u64x4(Ai), mod, imod);`
`@@ -140,7 +142,7 @@ namespace cp_algo::math::fft {`
`140`	`142`	`}`
`141`	`143`	`}`
`142`	`144`
`143`		`- simd_target void recover_mod(auto &&C, auto &res, size_t k) {`
	`145`	`+ void recover_mod(auto &&C, auto &res, size_t k) {`
`144`	`146`	`size_t check = (k + flen - 1) / flen * flen;`
`145`	`147`	`assert(res.size() >= check);`
`146`	`148`	`size_t n = A.size();`
`@@ -168,7 +170,7 @@ namespace cp_algo::math::fft {`
`168`	`170`	`checkpoint("recover mod");`
`169`	`171`	`}`
`170`	`172`
`171`		`- simd_target void mul(auto &&C, auto const& D, auto &res, size_t k) {`
	`173`	`+ void mul(auto &&C, auto const& D, auto &res, size_t k) {`
`172`	`174`	`assert(A.size() == C.size());`
`173`	`175`	`size_t n = A.size();`
`174`	`176`	`if(!n) {`
`@@ -181,10 +183,10 @@ namespace cp_algo::math::fft {`
`181`	`183`	`C.ifft();`
`182`	`184`	`recover_mod(C, res, k);`
`183`	`185`	`}`
`184`		`- simd_target void mul_inplace(auto &&B, auto& res, size_t k) {`
	`186`	`+ void mul_inplace(auto &&B, auto& res, size_t k) {`
`185`	`187`	`mul(B.A, B.B, res, k);`
`186`	`188`	`}`
`187`		`- simd_target void mul(auto const& B, auto& res, size_t k) {`
	`189`	`+ void mul(auto const& B, auto& res, size_t k) {`
`188`	`190`	`mul(cvector(B.A), B.B, res, k);`
`189`	`191`	`}`
`190`	`192`	`big_vector<base> operator *= (dft &B) {`
`@@ -209,7 +211,7 @@ namespace cp_algo::math::fft {`
`209`	`211`	`template<modint_type base> uint32_t dft<base>::mod = {};`
`210`	`212`	`template<modint_type base> uint32_t dft<base>::imod = {};`
`211`	`213`
`212`		`- [[gnu::target("avx2")]] void mul_slow(auto &a, auto const& b, size_t k) {`
	`214`	`+ void mul_slow(auto &a, auto const& b, size_t k) {`
`213`	`215`	`if(std::empty(a) || std::empty(b)) {`
`214`	`216`	`a.clear();`
`215`	`217`	`} else {`
`@@ -230,7 +232,7 @@ namespace cp_algo::math::fft {`
`230`	`232`	`}`
`231`	`233`	`return std::max(flen, std::bit_ceil(as + bs - 1) / 2);`
`232`	`234`	`}`
`233`		`- [[gnu::target("avx2")]] void mul_truncate(auto &a, auto const& b, size_t k) {`
	`235`	`+ void mul_truncate(auto &a, auto const& b, size_t k) {`
`234`	`236`	`using base = std::decay_t<decltype(a[0])>;`
`235`	`237`	`if(std::min({k, std::size(a), std::size(b)}) < magic) {`
`236`	`238`	`mul_slow(a, b, k);`
`@@ -247,7 +249,7 @@ namespace cp_algo::math::fft {`
`247`	`249`	`}`
`248`	`250`
`249`	`251`	`// store mod x^n-k in first half, x^n+k in second half`
`250`		`- simd_target void mod_split(auto &&x, size_t n, auto k) {`
	`252`	`+ void mod_split(auto &&x, size_t n, auto k) {`
`251`	`253`	`using base = std::decay_t<decltype(k)>;`
`252`	`254`	`dft<base>::init();`
`253`	`255`	`assert(std::size(x) == 2 * n);`
`@@ -279,7 +281,7 @@ namespace cp_algo::math::fft {`
`279`	`281`	`}`
`280`	`282`	`cp_algo::checkpoint("mod split");`
`281`	`283`	`}`
`282`		`- [[gnu::target("avx2")]] void cyclic_mul(auto &a, auto &&b, size_t k) {`
	`284`	`+ void cyclic_mul(auto &a, auto &&b, size_t k) {`
`283`	`285`	`assert(std::popcount(k) == 1);`
`284`	`286`	`assert(std::size(a) == std::size(b) && std::size(a) == k);`
`285`	`287`	`using base = std::decay_t<decltype(a[0])>;`
`@@ -312,13 +314,13 @@ namespace cp_algo::math::fft {`
`312`	`314`	`}`
`313`	`315`	`cp_algo::checkpoint("mod join");`
`314`	`316`	`}`
`315`		`- [[gnu::target("avx2")]] auto make_copy(auto &&x) {`
	`317`	`+ auto make_copy(auto &&x) {`
`316`	`318`	`return x;`
`317`	`319`	`}`
`318`		`- [[gnu::target("avx2")]] void cyclic_mul(auto &a, auto const& b, size_t k) {`
	`320`	`+ void cyclic_mul(auto &a, auto const& b, size_t k) {`
`319`	`321`	`return cyclic_mul(a, make_copy(b), k);`
`320`	`322`	`}`
`321`		`- [[gnu::target("avx2")]] void mul(auto &a, auto &&b) {`
	`323`	`+ void mul(auto &a, auto &&b) {`
`322`	`324`	`size_t N = size(a) + size(b);`
`323`	`325`	`if(N > (1 << 20)) {`
`324`	`326`	`N--;`
`@@ -331,7 +333,7 @@ namespace cp_algo::math::fft {`
`331`	`333`	`mul_truncate(a, b, N - 1);`
`332`	`334`	`}`
`333`	`335`	`}`
`334`		`- [[gnu::target("avx2")]] void mul(auto &a, auto const& b) {`
	`336`	`+ void mul(auto &a, auto const& b) {`
`335`	`337`	`size_t N = size(a) + size(b);`
`336`	`338`	`if(N > (1 << 20)) {`
`337`	`339`	`mul(a, make_copy(b));`
`@@ -340,4 +342,5 @@ namespace cp_algo::math::fft {`
`340`	`342`	`}`
`341`	`343`	`}`
`342`	`344`	`}`
	`345`	`+#pragma GCC pop_options`
`343`	`346`	`#endif // CP_ALGO_MATH_FFT_HPP`