从洛谷P5309 Ynoi2011 初始化看卡常

一般情况下，程序运行消耗时间主要与时间复杂度有关，超时与否取决于算法是否正确。

但对于某些题目，时间复杂度正确的程序也无法通过，这时我们就需要卡常数，即通过优化一些操作的常数因子减少时间消耗。

比如这道题 P5309 [Ynoi2011] 初始化 。

这道题目的做法我写在另一篇博客里，这里主要研究卡常方式。

#include<bits/stdc++.h>

using namespace std;

const int h=200010;

inline int read() {

    int s = 0, w = 1;

    char ch = getchar();

    while(ch < '0' || ch > '9') {

        if(ch == '-') w= -1;

        ch = getchar();

    }

    while(ch >= '0' && ch <= '9') {

        s = s * 10 + ch - '0';

        ch = getchar();

    }

    return s * w;

}

int mod=1e9+7;

inline void update(int &x, int y) {

    x += y;

    if(x >= mod) x -= mod;

}

int n,m;

int a[h];

int b_sum[h];

int len;

int pre[2010][2010];

int suf[2010][2010];

int get_pos(int x){

    return (x-1)/len+1;

}

int main(){

    n=read(),m=read();

    len=135;

    for(int i=1;i<=n;i++)

        a[i]=read(),update(b_sum[get_pos(i)],a[i]);

    int op,x,y,z;

    for(int i=1;i<=m;i++){

        op=read(),x=read(),y=read();

        if(op==1){

            z=read();

            if(x>=len)

                for(int j=y;j<=n;j+=x)

                    update(a[j],z),update(b_sum[get_pos(j)],z);

            else{

                for(int j=y;j<=x;j++)

                    update(pre[x][j],z);

                for(int j=1;j<=y;j++)

                    update(suf[x][j],z);

            }

        }

        else{

            int l=x,r=y,lb=get_pos(x),rb=get_pos(y);

            int ans=0;

            if(lb==rb)

                for(int j=l;j<=r;j++)

                    update(ans,a[j]);

            else{

                for(int j=l;j<=lb*len;j++)

                    update(ans,a[j]);

                for(int j=lb+1;j<=rb-1;j++)

                    update(ans,b_sum[j]);

                for(int j=(rb-1)*len+1;j<=r;j++)

                    update(ans,a[j]);

            }

            for(int j=1;j<len;j++){

                lb=(l-1)/j+1,rb=(r-1)/j+1;

                if(lb==rb)

                    ans-=pre[j][(l-1)%j],ans%=mod,ans+=pre[j][(r-1)%j+1],ans%=mod;

                else

                    ans=(ans+1ll*(rb-lb-1)*pre[j][j]+pre[j][(r-1)%j+1]+suf[j][(l-1)%j+1])%mod;

            }

            ans = ans % mod + mod;

            ans = (ans >= mod) ? ans - mod : ans;

            printf("%d\n",ans);

        }

    }

    return 0;

}

20分超时代码

技巧一：手写常数因子大的运算

c++中取模运算常数因子很大，于是我们改用更快的运算组合表示取模。

inline void update(int &x, int y) {

    x += y;

    if(x >= mod) x -= mod;

}

经此修改，可以拿到95分。

#include<bits/stdc++.h>

using namespace std;

const int h=200010;

inline int read() {

    int s = 0, w = 1;

    char ch = getchar();

    while(ch < '0' || ch > '9') {

        if(ch == '-') w= -1;

        ch = getchar();

    }

    while(ch >= '0' && ch <= '9') {

        s = s * 10 + ch - '0';

        ch = getchar();

    }

    return s * w;

}

int mod=1e9+7;

inline void update(int &x, int y) {

    x += y;

    if(x >= mod) x -= mod;

}

int n,m;

int a[h];

int b_sum[h];

int len;

int pre[2010][2010];

int suf[2010][2010];

int get_pos(int x){

    return (x-1)/len+1;

}

int main(){

    n=read(),m=read();

    len=120;

    for(int i=1;i<=n;i++)

        a[i]=read(),update(b_sum[get_pos(i)],a[i]);

    int op,x,y,z;

    for(int i=1;i<=m;i++){

        op=read(),x=read(),y=read();

        if(op==1){

            z=read();

            if(x>=len)

                for(int j=y;j<=n;j+=x)

                    update(a[j],z),update(b_sum[get_pos(j)],z);

            else{

                for(int j=y;j<=x;j++)

                    update(pre[x][j],z);

                for(int j=1;j<=y;j++)

                    update(suf[x][j],z);

            }

        }

        else{

            int l=x,r=y,lb=get_pos(x),rb=get_pos(y);

            int ans=0;

            if(lb==rb)

                for(int j=l;j<=r;j++)

                    update(ans,a[j]);

            else{

                for(int j=l;j<=lb*len;j++)

                    update(ans,a[j]);

                for(int j=lb+1;j<=rb-1;j++)

                    update(ans,b_sum[j]);

                for(int j=(rb-1)*len+1;j<=r;j++)

                    update(ans,a[j]);

            }

            for(int j=1;j<len;j++){

                lb=(l-1)/j+1,rb=(r-1)/j+1;

                if(lb==rb)

                    ans-=pre[j][(l-1)%j],ans%=mod,ans+=pre[j][(r-1)%j+1],ans%=mod;

                else

                    ans=(ans+1ll*(rb-lb-1)*pre[j][j]+pre[j][(r-1)%j+1]+suf[j][(l-1)%j+1])%mod;

            }

            ans = ans % mod + mod;

            ans = (ans >= mod) ? ans - mod : ans;

            printf("%d\n",ans);

        }

    }

    return 0;

}

95分优化取模

技巧二：块长乱搞

这道题目需要使用分块。

而分块的时间复杂度往往随着块长而剧烈波动。

于是我们不断尝试新的块长。（为了节约评测机资源可以二分寻找）

得出最优块长在135左右，通过了这道题，耗时为5.93s。

#include<bits/stdc++.h>

using namespace std;

const int h=200010;

inline int read() {

    int s = 0, w = 1;

    char ch = getchar();

    while(ch < '0' || ch > '9') {

        if(ch == '-') w= -1;

        ch = getchar();

    }

    while(ch >= '0' && ch <= '9') {

        s = s * 10 + ch - '0';

        ch = getchar();

    }

    return s * w;

}

int mod=1e9+7;

inline void update(int &x, int y) {

    x += y;

    if(x >= mod) x -= mod;

}

int n,m;

int a[h];

int b_sum[h];

int len;

int pre[2010][2010];

int suf[2010][2010];

int get_pos(int x){

    return (x-1)/len+1;

}

int main(){

    n=read(),m=read();

    len=135;

    for(int i=1;i<=n;i++)

        a[i]=read(),update(b_sum[get_pos(i)],a[i]);

    int op,x,y,z;

    for(int i=1;i<=m;i++){

        op=read(),x=read(),y=read();

        if(op==1){

            z=read();

            if(x>=len)

                for(int j=y;j<=n;j+=x)

                    update(a[j],z),update(b_sum[get_pos(j)],z);

            else{

                for(int j=y;j<=x;j++)

                    update(pre[x][j],z);

                for(int j=1;j<=y;j++)

                    update(suf[x][j],z);

            }

        }

        else{

            int l=x,r=y,lb=get_pos(x),rb=get_pos(y);

            int ans=0;

            if(lb==rb)

                for(int j=l;j<=r;j++)

                    update(ans,a[j]);

            else{

                for(int j=l;j<=lb*len;j++)

                    update(ans,a[j]);

                for(int j=lb+1;j<=rb-1;j++)

                    update(ans,b_sum[j]);

                for(int j=(rb-1)*len+1;j<=r;j++)

                    update(ans,a[j]);

            }

            for(int j=1;j<len;j++){

                lb=(l-1)/j+1,rb=(r-1)/j+1;

                if(lb==rb)

                    ans-=pre[j][(l-1)%j],ans%=mod,ans+=pre[j][(r-1)%j+1],ans%=mod;

                else

                    ans=(ans+1ll*(rb-lb-1)*pre[j][j]+pre[j][(r-1)%j+1]+suf[j][(l-1)%j+1])%mod;

            }

            ans = ans % mod + mod;

            ans = (ans >= mod) ? ans - mod : ans;

            printf("%d\n",ans);

        }

    }

    return 0;

}

100分块长优化

技巧三：按照题目内容实行特定优化

以下内容参考自原题目讨论版。

不同的题目有不同的步骤，这道题里这样一个步骤耗时巨大。

for(int j=1;j<len;j++){
    lb=(l-1)/j+1,rb=(r-1)/j+1;
    if(lb==rb)
        ans-=pre[j][(l-1)%j],ans%=mod,ans+=pre[j][(r-1)%j+1],ans%=mod;
    else
        ans=(ans+1ll*(rb-lb-1)*pre[j][j]+pre[j][(r-1)%j+1]+suf[j][(l-1)%j+1])%mod;
}

这个地方运算很慢（因为一些原因套不了取模优化），而且当这个因数没有修改的时候，这些运算完全没有必要。

于是，如果这个因数没有进行修改，我们跳过后面的运算步骤。

for(int j=1;j<len;j++){

    if(!pre[j][j])

        continue;

    lb=(l-1)/j+1,rb=(r-1)/j+1;

    if(lb==rb)

        ans-=pre[j][(l-1)%j],ans%=mod,ans+=pre[j][(r-1)%j+1],ans%=mod;

    else

        ans=(ans+1ll*(rb-lb-1)*pre[j][j]+pre[j][(r-1)%j+1]+suf[j][(l-1)%j+1])%mod;

}

以及在没有修改的情况下，弄一个前缀和，查询区间和直接调用。

耗时3.06s。

#include<bits/stdc++.h>

using namespace std;

const int h=200010;

inline int read() {

    int s = 0, w = 1;

    char ch = getchar();

    while(ch < '0' || ch > '9') {

        if(ch == '-') w= -1;

        ch = getchar();

    }

    while(ch >= '0' && ch <= '9') {

        s = s * 10 + ch - '0';

        ch = getchar();

    }

    return s * w;

}

int mod=1e9+7;

inline void update(int &x, int y) {

    x += y;

    if(x >= mod) x -= mod;

}

int n,m;

int a[h];

int b_sum[h];

int len;

int pre[2010][2010];

int suf[2010][2010];

int sum[h];

int get_pos(int x){

    return (x-1)/len+1;

}

int main(){

    n=read(),m=read();

    len=150;

    for(int i=1;i<=n;i++)

        a[i]=read(),update(b_sum[get_pos(i)],a[i]),update(sum[i],a[i]+sum[i-1]);

    int op,x,y,z;

    bool fl=0;

    for(int i=1;i<=m;i++){

        op=read(),x=read(),y=read();

        if(op==1){

            z=read();

            if(x>=len)

                for(int j=y;j<=n;j+=x)

                    fl|=1,update(a[j],z),update(b_sum[get_pos(j)],z);

            else{

                for(int j=y;j<=x;j++)

                    update(pre[x][j],z);

                for(int j=1;j<=y;j++)

                    update(suf[x][j],z);

            }

        }

        else{

            int l=x,r=y,lb=get_pos(x),rb=get_pos(y);

            int ans=0;

            if(!fl)

                ans+=sum[r]-sum[l-1];

            else

                if(lb==rb)

                    for(int j=l;j<=r;j++)

                        update(ans,a[j]);

                else{

                    for(int j=l;j<=lb*len;j++)

                        update(ans,a[j]);

                    for(int j=lb+1;j<=rb-1;j++)

                        update(ans,b_sum[j]);

                    for(int j=(rb-1)*len+1;j<=r;j++)

                        update(ans,a[j]);

                }

            for(int j=1;j<len;j++){

                if(!pre[j][j])

                    continue;

                lb=(l-1)/j+1,rb=(r-1)/j+1;

                if(lb==rb)

                    ans-=pre[j][(l-1)%j],ans%=mod,ans+=pre[j][(r-1)%j+1],ans%=mod;

                else

                    ans=(ans+1ll*(rb-lb-1)*pre[j][j]+pre[j][(r-1)%j+1]+suf[j][(l-1)%j+1])%mod;

            }

            ans = ans % mod + mod;

            ans = (ans >= mod) ? ans - mod : ans;

            printf("%d\n",ans);

        }

    }

    return 0;

}

技巧四：节约空间

开数组也是要消耗很多时间的，我们进行优化。

int b_sum[1510];

int len;

int pre[161][161];

int suf[161][161];

时间优化至3.00s。

技巧五：奇技淫巧

到这一步，接下来的卡常技巧就没有学习的必要了。

比如把mod定义成成const可以快到2.52s。

将提交语言改为c++98可以快到2.4s。

然后是...然后自己看吧，没什么意义。

(主要由@Foofish大佬贡献）

//#pragma G++ target("avx")

//#pragma G++ optimize("Ofast")

#include<bits/stdc++.h>

using namespace std;

const int h=200010;

inline int read() {

    register int s = 0, w = 1;

    char ch = getchar();

    while(ch < '0' || ch > '9') {

        if(ch == '-') w= -1;

        ch = getchar();

    }

    while(ch >= '0' && ch <= '9') {

        s = s * 10 + ch - '0';

        ch = getchar();

    }

    return s * w;

}

bool flag=0;

const int mod=1e9+7;

int n,m;

int a[h];

int b_sum[2010];

int len;

int pre[132][132];//pre[x][y]即modify(x,1)+modify(x,2)+...+modify(x,y)

int suf[132][132];

int sum[h];

int b_len;

int POS[201000];

#define get_pos(x) POS[x]

inline void write(int x) {

    if(x >= 10) write(x / 10);

    putchar(x % 10 + '0');

}

inline void update(int &x, int y) {

    x += y;

    if(x >= mod) x -= mod;

}

int main(){

    //ios::sync_with_stdio(0);

    n=read(),m=read();

    len=130;

    int Bnum = n / len;

    for(int i = 1; i <= Bnum + 1; ++i)

        for(int j = (i - 1) * len + 1; j <= i * len; ++j)

            POS[j] = i;

    for(int i=1;i<=n;i++)

        a[i]=read(),update(b_sum[get_pos(i)], a[i]),update(sum[i], sum[i - 1] + a[i]);

    int op,x,y,z;

    for(int i=1;i<=m;i++){

        op=read(),x=read(),y=read();

        if(op==1){

            z=read();

            if(x>=len)

                for(int j=y;j<=n;j+=x)

                    flag|=1,update(a[j], z),update(b_sum[get_pos(j)], z);

            else{

                for(int j=y;j<=x;j++)

                    update(pre[x][j], z);

                for(int j=1;j<=y;j++)

                    update(suf[x][j], z);

            }

        }

        else{

            int l=x,r=y,lb=get_pos(x),rb=get_pos(y);

            int ans=0;

            if(!flag)

                ans+=sum[r]-sum[l-1];

            else

                if(lb==rb)

                    for(int j=l;j<=r;j++)

                        update(ans, a[j]);

                else{

                    for(int j=l;j<=lb*len;j++)

                        update(ans, a[j]);

                    for(int j=lb+1;j<=rb-1;j++)

                        update(ans, b_sum[j]);

                    for(int j=(rb-1)*len+1;j<=r;j++)

                        update(ans, a[j]);

                }

            for(int j=1;j<len;j++){

                if(!pre[j][j])

                    continue;

                lb=(l-1)/j+1,rb=(r-1)/j+1;

                if(lb==rb) {

                    ans-=pre[j][(l-1)%j],ans%=mod,ans+=pre[j][(r-1)%j+1],ans%=mod;

//                    ans = (ans - pre[j][(l - 1) % j] + pre[j][(r - 1) % j + 1] + mod) % mod;

                }

                else {

                    ans=(ans+1ll*(rb-lb-1)*pre[j][j]+pre[j][(r-1)%j+1]+suf[j][(l-1)%j+1])%mod;

                }

            }

            ans = ans % mod + mod;

            ans = (ans >= mod) ? ans - mod : ans;

            printf("%d",ans); puts("");

        }

    }

    return 0;

}

耗时2.38s

#define getchar() (p1==p2&&(p2=(p1=buf)+fread(buf,1,1<<21,stdin),p1==p2)?EOF:*p1++)

char buf[1<<21],*p1=buf,*p2=buf;

template<typename T>inline void read(T& t){

    int c=getchar(),f=1;t=0;

    while(!isdigit(c)){if(c=='-')f=-1;c=getchar();}

    while(isdigit(c))t=t*10+c-48,c=getchar();t*=f;

}

template<typename T,typename ...Args>inline void read(T& t,Args&... args){

    read(t),read(args...);

}

*某位大佬提供的快读，安装之后时间达到了2.23s。

目前能达到的最短时间：c++ 14 无O2优化 2.21s

#pragma G++ target("avx")

#pragma G++ optimize("Ofast")

#include<bits/stdc++.h>

using namespace std;

const int h=200010;

#define getchar() (p1==p2&&(p2=(p1=buf)+fread(buf,1,1<<21,stdin),p1==p2)?EOF:*p1++)

char buf[1<<21],*p1=buf,*p2=buf;

template<typename T>inline void read(T& t){

    int c=getchar(),f=1;t=0;

    while(!isdigit(c)){if(c=='-')f=-1;c=getchar();}

    while(isdigit(c))t=t*10+c-48,c=getchar();t*=f;

}

template<typename T,typename ...Args>inline void read(T& t,Args&... args){

    read(t),read(args...);

}

bool flag=0;

const int mod=1e9+7;

int n,m;

int a[h];

int b_sum[2010];

int len;

int pre[132][132];//pre[x][y]即modify(x,1)+modify(x,2)+...+modify(x,y)

int suf[132][132];

int sum[h];

int b_len;

int POS[201000];

#define get_pos(x) POS[x]

inline void write(int x) {

    if(x >= 10) write(x / 10);

    putchar(x % 10 + '0');

}

inline void update(int &x, int y) {

    x += y;

    if(x >= mod) x -= mod;

}

int main(){

    //ios::sync_with_stdio(0);

    read(n,m);

    len=130;

    int Bnum = n / len;

    for(int i = 1; i <= Bnum + 1; ++i)

        for(int j = (i - 1)* len + 1,ran= i * len; j <=ran ; ++j)

            POS[j] = i;

    for(int i=1;i<=n;i++)

        read(a[i]),update(b_sum[get_pos(i)], a[i]),update(sum[i], sum[i - 1] + a[i]);

    int op,x,y,z;

    for(int i=1;i<=m;i++){

        read(op,x,y);

        if(op==1){

            read(z);

            if(x>=len)

                for(int j=y;j<=n;j+=x)

                    flag|=1,update(a[j], z),update(b_sum[get_pos(j)], z);

            else{

                for(int j=y;j<=x;j++)

                    update(pre[x][j], z);

                for(int j=1;j<=y;j++)

                    update(suf[x][j], z);

            }

        }

        else{

            int l=x,r=y,lb=get_pos(x),rb=get_pos(y);

            int ans=0;

            if(!flag)

                ans+=sum[r]-sum[l-1];

            else

                if(lb==rb)

                    for(int j=l;j<=r;j++)

                        update(ans, a[j]);

                else{

                    for(int j=l,ran=lb*len;j<=ran;j++)

                        update(ans, a[j]);

                    for(int j=lb+1;j<=rb-1;j++)

                        update(ans, b_sum[j]);

                    for(int j=(rb-1)*len+1;j<=r;j++)

                        update(ans, a[j]);

                }

            for(int j=1;j<len;j++){

                if(!pre[j][j])

                    continue;

                lb=(l-1)/j+1,rb=(r-1)/j+1;

                if(lb==rb) {

                    ans-=pre[j][(l-1)%j],ans%=mod,ans+=pre[j][(r-1)%j+1],ans%=mod;

//                    ans = (ans - pre[j][(l - 1) % j] + pre[j][(r - 1) % j + 1] + mod) % mod;

                }

                else {

                    ans=(ans+1ll*(rb-lb-1)*pre[j][j]+pre[j][(r-1)%j+1]+suf[j][(l-1)%j+1])%mod;

                }

            }

            ans = ans % mod + mod;

            ans = (ans >= mod) ? ans - mod : ans;

            printf("%d",ans); puts("");

        }

    }

    return 0;

}

Code