Skip to content

Commit 5c4d741

Browse files
chore: update minified library versions
1 parent 671b2cb commit 5c4d741

File tree

8 files changed

+28
-10
lines changed

8 files changed

+28
-10
lines changed

cp-algo/min/math/cvector.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,5 +6,5 @@
66
#include "../util/big_alloc.hpp"
77
#include <ranges>
88
#include <bit>
9-
namespace stdx=std::experimental;namespace cp_algo::math::fft{static constexpr size_t flen=4;using ftype=double;using vftype=dx4;using point=complex<ftype>;using vpoint=complex<vftype>;static constexpr vftype vz={};[[gnu::target("avx2")]]vpoint vi(vpoint const&r){return{-imag(r),real(r)};}struct cvector{big_vector<vpoint>r;cvector(size_t n){n=std::max(flen,std::bit_ceil(n));r.resize(n/flen);checkpoint("cvector create");}vpoint&at(size_t k){return r[k/flen];}vpoint at(size_t k)const{return r[k/flen];}template<class pt=point>void set(size_t k,pt const&t){if constexpr(std::is_same_v<pt,point>){real(r[k/flen])[k%flen]=real(t);imag(r[k/flen])[k%flen]=imag(t);}else{at(k)=t;}}template<class pt=point>[[gnu::target("avx2")]]pt get(size_t k)const{if constexpr(std::is_same_v<pt,point>){return{real(r[k/flen])[k%flen],imag(r[k/flen])[k%flen]};}else{return at(k);}}size_t size()const{return flen*r.size();}static constexpr size_t eval_arg(size_t n){if(n<pre_evals){return eval_args[n];}else{return eval_arg(n/2)|(n&1)<<(std::bit_width(n)-1);}}static constexpr point eval_point(size_t n){if(n%2){return-eval_point(n-1);}else if(n%4){return eval_point(n-2)*point(0,1);}else if(n/4<pre_evals){return evalp[n/4];}else{return polar<ftype>(1.,std::numbers::pi/(ftype)std::bit_floor(n)*(ftype)eval_arg(n));}}static constexpr std::array<point,32>roots=[](){std::array<point,32>res;for(size_t i=2;i<32;i++){res[i]=polar<ftype>(1.,std::numbers::pi/(1ull<<(i-2)));}return res;}();static constexpr point root(size_t n){return roots[std::bit_width(n)];}template<int step>[[gnu::target("avx2")]]static void exec_on_eval(size_t n,size_t k,auto&&callback){callback(k,root(4*step*n)*eval_point(step*k));}template<int step>[[gnu::target("avx2")]]static void exec_on_evals(size_t n,auto&&callback){point factor=root(4*step*n);for(size_t i=0;i<n;i++){callback(i,factor*eval_point(step*i));}}[[gnu::target("avx2")]]static void do_dot_iter(point rt,vpoint&Bv,vpoint const&Av,vpoint&res){res+=Av*Bv;real(Bv)=rotate_right(real(Bv));imag(Bv)=rotate_right(imag(Bv));auto x=real(Bv)[0],y=imag(Bv)[0];real(Bv)[0]=x*real(rt)-y*imag(rt);imag(Bv)[0]=x*imag(rt)+y*real(rt);}[[gnu::target("avx2")]]void dot(cvector const&t){size_t n=this->size();exec_on_evals<1>(n/flen,[&](size_t k,point rt){k*=flen;auto[Ax,Ay]=at(k);auto Bv=t.at(k);vpoint res=vz;for(size_t i=0;i<flen;i++){vpoint Av=vpoint(vz+Ax[i],vz+Ay[i]);do_dot_iter(rt,Bv,Av,res);}set(k,res);});checkpoint("dot");}template<bool partial=true>[[gnu::target("avx2")]]void ifft(){size_t n=size();if constexpr(!partial){point pi(0,1);exec_on_evals<4>(n/4,[&](size_t k,point rt){k*=4;point v1=conj(rt);point v2=v1*v1;point v3=v1*v2;auto A=get(k);auto B=get(k+1);auto C=get(k+2);auto D=get(k+3);set(k,(A+B)+(C+D));set(k+2,((A+B)-(C+D))*v2);set(k+1,((A-B)-pi*(C-D))*v1);set(k+3,((A-B)+pi*(C-D))*v3);});}bool parity=std::countr_zero(n)%2;if(parity){exec_on_evals<2>(n/(2*flen),[&](size_t k,point rt){k*=2*flen;vpoint cvrt={vz+real(rt),vz-imag(rt)};auto B=at(k)-at(k+flen);at(k)+=at(k+flen);at(k+flen)=B*cvrt;});}for(size_t leaf=3*flen;leaf<n;leaf+=4*flen){size_t level=std::countr_one(leaf+3);for(size_t lvl=4+parity;lvl<=level;lvl+=2){size_t i=(1<<lvl)/4;exec_on_eval<4>(n>>lvl,leaf>>lvl,[&](size_t k,point rt){k<<=lvl;vpoint v1={vz+real(rt),vz-imag(rt)};vpoint v2=v1*v1;vpoint v3=v1*v2;for(size_t j=k;j<k+i;j+=flen){auto A=at(j);auto B=at(j+i);auto C=at(j+2*i);auto D=at(j+3*i);at(j)=((A+B)+(C+D));at(j+2*i)=((A+B)-(C+D))*v2;at(j+i)=((A-B)-vi(C-D))*v1;at(j+3*i)=((A-B)+vi(C-D))*v3;}});}}checkpoint("ifft");for(size_t k=0;k<n;k+=flen){if constexpr(partial){set(k,get<vpoint>(k)/=vz+ftype(n/flen));}else{set(k,get<vpoint>(k)/=vz+ftype(n));}}}template<bool partial=true>[[gnu::target("avx2")]]void fft(){size_t n=size();bool parity=std::countr_zero(n)%2;for(size_t leaf=0;leaf<n;leaf+=4*flen){size_t level=std::countr_zero(n+leaf);level-=level%2!=parity;for(size_t lvl=level;lvl>=4;lvl-=2){size_t i=(1<<lvl)/4;exec_on_eval<4>(n>>lvl,leaf>>lvl,[&](size_t k,point rt){k<<=lvl;vpoint v1={vz+real(rt),vz+imag(rt)};vpoint v2=v1*v1;vpoint v3=v1*v2;for(size_t j=k;j<k+i;j+=flen){auto A=at(j);auto B=at(j+i)*v1;auto C=at(j+2*i)*v2;auto D=at(j+3*i)*v3;at(j)=(A+C)+(B+D);at(j+i)=(A+C)-(B+D);at(j+2*i)=(A-C)+vi(B-D);at(j+3*i)=(A-C)-vi(B-D);}});}}if(parity){exec_on_evals<2>(n/(2*flen),[&](size_t k,point rt){k*=2*flen;vpoint vrt={vz+real(rt),vz+imag(rt)};auto t=at(k+flen)*vrt;at(k+flen)=at(k)-t;at(k)+=t;});}if constexpr(!partial){point pi(0,1);exec_on_evals<4>(n/4,[&](size_t k,point rt){k*=4;point v1=rt;point v2=v1*v1;point v3=v1*v2;auto A=get(k);auto B=get(k+1)*v1;auto C=get(k+2)*v2;auto D=get(k+3)*v3;set(k,(A+C)+(B+D));set(k+1,(A+C)-(B+D));set(k+2,(A-C)+pi*(B-D));set(k+3,(A-C)-pi*(B-D));});}checkpoint("fft");}static constexpr size_t pre_evals=1<<16;static const std::array<size_t,pre_evals>eval_args;static const std::array<point,pre_evals>evalp;};const std::array<size_t,cvector::pre_evals>cvector::eval_args=[](){std::array<size_t,pre_evals>res={};for(size_t i=1;i<pre_evals;i++){res[i]=res[i>>1]|(i&1)<<(std::bit_width(i)-1);}return res;}();const std::array<point,cvector::pre_evals>cvector::evalp=[](){std::array<point,pre_evals>res={};res[0]=1;for(size_t n=1;n<pre_evals;n++){res[n]=polar<ftype>(1.,std::numbers::pi*ftype(eval_args[n])/ftype(4*std::bit_floor(n)));}return res;}();}
9+
namespace stdx=std::experimental;namespace cp_algo::math::fft{static constexpr size_t flen=4;using ftype=double;using vftype=dx4;using point=complex<ftype>;using vpoint=complex<vftype>;static constexpr vftype vz={};simd_target vpoint vi(vpoint const&r){return{-imag(r),real(r)};}struct cvector{big_vector<vpoint>r;cvector(size_t n){n=std::max(flen,std::bit_ceil(n));r.resize(n/flen);checkpoint("cvector create");}vpoint&at(size_t k){return r[k/flen];}vpoint at(size_t k)const{return r[k/flen];}template<class pt=point>simd_inline void set(size_t k,pt const&t){if constexpr(std::is_same_v<pt,point>){real(r[k/flen])[k%flen]=real(t);imag(r[k/flen])[k%flen]=imag(t);}else{at(k)=t;}}template<class pt=point>simd_inline pt get(size_t k)const{if constexpr(std::is_same_v<pt,point>){return{real(r[k/flen])[k%flen],imag(r[k/flen])[k%flen]};}else{return at(k);}}size_t size()const{return flen*r.size();}static constexpr size_t eval_arg(size_t n){if(n<pre_evals){return eval_args[n];}else{return eval_arg(n/2)|(n&1)<<(std::bit_width(n)-1);}}static constexpr point eval_point(size_t n){if(n%2){return-eval_point(n-1);}else if(n%4){return eval_point(n-2)*point(0,1);}else if(n/4<pre_evals){return evalp[n/4];}else{return polar<ftype>(1.,std::numbers::pi/(ftype)std::bit_floor(n)*(ftype)eval_arg(n));}}static constexpr std::array<point,32>roots=[](){std::array<point,32>res;for(size_t i=2;i<32;i++){res[i]=polar<ftype>(1.,std::numbers::pi/(1ull<<(i-2)));}return res;}();static constexpr point root(size_t n){return roots[std::bit_width(n)];}template<int step>simd_target static void exec_on_eval(size_t n,size_t k,auto&&callback){callback(k,root(4*step*n)*eval_point(step*k));}template<int step>simd_target static void exec_on_evals(size_t n,auto&&callback){point factor=root(4*step*n);for(size_t i=0;i<n;i++){callback(i,factor*eval_point(step*i));}}simd_target static void do_dot_iter(point rt,vpoint&Bv,vpoint const&Av,vpoint&res){res+=Av*Bv;real(Bv)=rotate_right(real(Bv));imag(Bv)=rotate_right(imag(Bv));auto x=real(Bv)[0],y=imag(Bv)[0];real(Bv)[0]=x*real(rt)-y*imag(rt);imag(Bv)[0]=x*imag(rt)+y*real(rt);}simd_target void dot(cvector const&t){size_t n=this->size();exec_on_evals<1>(n/flen,[&](size_t k,point rt){k*=flen;auto[Ax,Ay]=at(k);auto Bv=t.at(k);vpoint res=vz;for(size_t i=0;i<flen;i++){vpoint Av=vpoint(vz+Ax[i],vz+Ay[i]);do_dot_iter(rt,Bv,Av,res);}set(k,res);});checkpoint("dot");}template<bool partial=true>simd_target void ifft(){size_t n=size();if constexpr(!partial){point pi(0,1);exec_on_evals<4>(n/4,[&](size_t k,point rt){k*=4;point v1=conj(rt);point v2=v1*v1;point v3=v1*v2;auto A=get(k);auto B=get(k+1);auto C=get(k+2);auto D=get(k+3);set(k,(A+B)+(C+D));set(k+2,((A+B)-(C+D))*v2);set(k+1,((A-B)-pi*(C-D))*v1);set(k+3,((A-B)+pi*(C-D))*v3);});}bool parity=std::countr_zero(n)%2;if(parity){exec_on_evals<2>(n/(2*flen),[&](size_t k,point rt){k*=2*flen;vpoint cvrt={vz+real(rt),vz-imag(rt)};auto B=at(k)-at(k+flen);at(k)+=at(k+flen);at(k+flen)=B*cvrt;});}for(size_t leaf=3*flen;leaf<n;leaf+=4*flen){size_t level=std::countr_one(leaf+3);for(size_t lvl=4+parity;lvl<=level;lvl+=2){size_t i=(1<<lvl)/4;exec_on_eval<4>(n>>lvl,leaf>>lvl,[&](size_t k,point rt){k<<=lvl;vpoint v1={vz+real(rt),vz-imag(rt)};vpoint v2=v1*v1;vpoint v3=v1*v2;for(size_t j=k;j<k+i;j+=flen){auto A=at(j);auto B=at(j+i);auto C=at(j+2*i);auto D=at(j+3*i);at(j)=((A+B)+(C+D));at(j+2*i)=((A+B)-(C+D))*v2;at(j+i)=((A-B)-vi(C-D))*v1;at(j+3*i)=((A-B)+vi(C-D))*v3;}});}}checkpoint("ifft");for(size_t k=0;k<n;k+=flen){if constexpr(partial){set(k,get<vpoint>(k)/=vz+ftype(n/flen));}else{set(k,get<vpoint>(k)/=vz+ftype(n));}}}template<bool partial=true>simd_target void fft(){size_t n=size();bool parity=std::countr_zero(n)%2;for(size_t leaf=0;leaf<n;leaf+=4*flen){size_t level=std::countr_zero(n+leaf);level-=level%2!=parity;for(size_t lvl=level;lvl>=4;lvl-=2){size_t i=(1<<lvl)/4;exec_on_eval<4>(n>>lvl,leaf>>lvl,[&](size_t k,point rt){k<<=lvl;vpoint v1={vz+real(rt),vz+imag(rt)};vpoint v2=v1*v1;vpoint v3=v1*v2;for(size_t j=k;j<k+i;j+=flen){auto A=at(j);auto B=at(j+i)*v1;auto C=at(j+2*i)*v2;auto D=at(j+3*i)*v3;at(j)=(A+C)+(B+D);at(j+i)=(A+C)-(B+D);at(j+2*i)=(A-C)+vi(B-D);at(j+3*i)=(A-C)-vi(B-D);}});}}if(parity){exec_on_evals<2>(n/(2*flen),[&](size_t k,point rt){k*=2*flen;vpoint vrt={vz+real(rt),vz+imag(rt)};auto t=at(k+flen)*vrt;at(k+flen)=at(k)-t;at(k)+=t;});}if constexpr(!partial){point pi(0,1);exec_on_evals<4>(n/4,[&](size_t k,point rt){k*=4;point v1=rt;point v2=v1*v1;point v3=v1*v2;auto A=get(k);auto B=get(k+1)*v1;auto C=get(k+2)*v2;auto D=get(k+3)*v3;set(k,(A+C)+(B+D));set(k+1,(A+C)-(B+D));set(k+2,(A-C)+pi*(B-D));set(k+3,(A-C)-pi*(B-D));});}checkpoint("fft");}static constexpr size_t pre_evals=1<<16;static const std::array<size_t,pre_evals>eval_args;static const std::array<point,pre_evals>evalp;};const std::array<size_t,cvector::pre_evals>cvector::eval_args=[](){std::array<size_t,pre_evals>res={};for(size_t i=1;i<pre_evals;i++){res[i]=res[i>>1]|(i&1)<<(std::bit_width(i)-1);}return res;}();const std::array<point,cvector::pre_evals>cvector::evalp=[](){std::array<point,pre_evals>res={};res[0]=1;for(size_t n=1;n<pre_evals;n++){res[n]=polar<ftype>(1.,std::numbers::pi*ftype(eval_args[n])/ftype(4*std::bit_floor(n)));}return res;}();}
1010
#endif

cp-algo/min/math/factorials.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,5 +6,5 @@
66
#include "../math/combinatorics.hpp"
77
#include "../number_theory/modint.hpp"
88
#include <ranges>
9-
namespace cp_algo::math{template<bool use_bump_alloc=false,int maxn=-1>[[gnu::target("avx2")]]auto facts(auto const&args){static_assert(!use_bump_alloc||maxn>0,"maxn must be set if use_bump_alloc is true");constexpr int max_mod=1'000'000'000;constexpr int accum=4;constexpr int simd_size=8;constexpr int block=1<<18;constexpr int subblock=block/simd_size;using base=std::decay_t<decltype(args[0])>;static_assert(modint_type<base>,"Base type must be a modint type");using T=std::array<int,2>;using alloc=std::conditional_t<use_bump_alloc,bump_alloc<T,30*maxn>,big_alloc<T>>;std::basic_string<T,std::char_traits<T>,alloc>odd_args_per_block[max_mod/subblock];std::basic_string<T,std::char_traits<T>,alloc>reg_args_per_block[max_mod/subblock];constexpr int limit_reg=max_mod/64;int limit_odd=0;big_vector<base>res(size(args),1);const int mod=base::mod();const int imod=-math::inv2(mod);for(auto[i,xy]:std::views::zip(args,res)|std::views::enumerate){auto[x,y]=xy;int t=x.getr();if(t>=mod/2){t=mod-t-1;y=t%2?1:mod-1;}auto pw=32ull*(t+1);while(t>limit_reg){limit_odd=std::max(limit_odd,(t-1)/2);odd_args_per_block[(t-1)/2/subblock].push_back({int(i),(t-1)/2});t/=2;pw+=t;}reg_args_per_block[t/subblock].push_back({int(i),t});y*=pow_fixed<base,2>(int(pw%(mod-1)));}checkpoint("init");base bi2x32=pow_fixed<base,2>(32).inv();auto process=[&](int limit,auto&args_per_block,auto step,auto&&proj){base fact=1;for(int b=0;b<=limit;b+=accum*block){u32x8 cur[accum];static std::array<u32x8,subblock>prods[accum];for(int z=0;z<accum;z++){for(int j=0;j<simd_size;j++){cur[z][j]=uint32_t(b+z*block+j*subblock);cur[z][j]=proj(cur[z][j]);prods[z][0][j]=cur[z][j]+!cur[z][j];prods[z][0][j]=uint32_t(uint64_t(prods[z][0][j])*bi2x32.getr()%mod);}}for(int i=1;i<block/simd_size;i++){for(int z=0;z<accum;z++){cur[z]+=step;prods[z][i]=montgomery_mul(prods[z][i-1],cur[z],mod,imod);}}checkpoint("inner loop");for(int z=0;z<accum;z++){for(int j=0;j<simd_size;j++){int bl=b+z*block+j*subblock;for(auto[i,x]:args_per_block[bl/subblock]){res[i]*=fact*prods[z][x-bl][j];}fact*=base(prods[z].back()[j]);}}checkpoint("mul ans");}};process(limit_reg,reg_args_per_block,1,std::identity{});process(limit_odd,odd_args_per_block,2,[](uint32_t x){return 2*x+1;});auto invs=bulk_invs<base>(res);for(auto[i,x]:res|std::views::enumerate){if(args[i]>=mod/2){x=invs[i];}}checkpoint("inv ans");return res;}}
9+
namespace cp_algo::math{template<bool use_bump_alloc=false,int maxn=-1>simd_target auto facts(auto const&args){static_assert(!use_bump_alloc||maxn>0,"maxn must be set if use_bump_alloc is true");constexpr int max_mod=1'000'000'000;constexpr int accum=4;constexpr int simd_size=8;constexpr int block=1<<18;constexpr int subblock=block/simd_size;using base=std::decay_t<decltype(args[0])>;static_assert(modint_type<base>,"Base type must be a modint type");using T=std::array<int,2>;using alloc=std::conditional_t<use_bump_alloc,bump_alloc<T,30*maxn>,big_alloc<T>>;std::basic_string<T,std::char_traits<T>,alloc>odd_args_per_block[max_mod/subblock];std::basic_string<T,std::char_traits<T>,alloc>reg_args_per_block[max_mod/subblock];constexpr int limit_reg=max_mod/64;int limit_odd=0;big_vector<base>res(size(args),1);const int mod=base::mod();const int imod=-math::inv2(mod);for(auto[i,xy]:std::views::zip(args,res)|std::views::enumerate){auto[x,y]=xy;int t=x.getr();if(t>=mod/2){t=mod-t-1;y=t%2?1:mod-1;}auto pw=32ull*(t+1);while(t>limit_reg){limit_odd=std::max(limit_odd,(t-1)/2);odd_args_per_block[(t-1)/2/subblock].push_back({int(i),(t-1)/2});t/=2;pw+=t;}reg_args_per_block[t/subblock].push_back({int(i),t});y*=pow_fixed<base,2>(int(pw%(mod-1)));}checkpoint("init");base bi2x32=pow_fixed<base,2>(32).inv();auto process=[&](int limit,auto&args_per_block,auto step,auto&&proj){base fact=1;for(int b=0;b<=limit;b+=accum*block){u32x8 cur[accum];static std::array<u32x8,subblock>prods[accum];for(int z=0;z<accum;z++){for(int j=0;j<simd_size;j++){cur[z][j]=uint32_t(b+z*block+j*subblock);cur[z][j]=proj(cur[z][j]);prods[z][0][j]=cur[z][j]+!cur[z][j];prods[z][0][j]=uint32_t(uint64_t(prods[z][0][j])*bi2x32.getr()%mod);}}for(int i=1;i<block/simd_size;i++){for(int z=0;z<accum;z++){cur[z]+=step;prods[z][i]=montgomery_mul(prods[z][i-1],cur[z],mod,imod);}}checkpoint("inner loop");for(int z=0;z<accum;z++){for(int j=0;j<simd_size;j++){int bl=b+z*block+j*subblock;for(auto[i,x]:args_per_block[bl/subblock]){res[i]*=fact*prods[z][x-bl][j];}fact*=base(prods[z].back()[j]);}}checkpoint("mul ans");}};process(limit_reg,reg_args_per_block,1,std::identity{});process(limit_odd,odd_args_per_block,2,[](uint32_t x){return 2*x+1;});auto invs=bulk_invs<base>(res);for(auto[i,x]:res|std::views::enumerate){if(args[i]>=mod/2){x=invs[i];}}checkpoint("inv ans");return res;}}
1010
#endif

0 commit comments

Comments
 (0)