后缀自动机专题(hihocoder)

#1445 : 后缀自动机二·重复旋律5

题意：

给出字符串$s$，询问字符串$s$中有多少不同的子串。

思路：

考虑对$s$建后缀自动机，那么$\sum (len[i]-len[fa[i]])$即为答案。

还可以考虑$dp$，设$dp[i]$为从状态$i$出发能走出的不同路径（含空路径）的个数，那么$dp[i]=1+\sum_{(i,j)\in Edge}dp[j]$。由于$dp[1]$把空串也算了进去，所以$dp[1]-1$即为答案。

#include <bits/stdc++.h>

using namespace std;

typedef long long ll;

const int N = 1000006;

char s[N];

// One state of the suffix automaton.
//   ch[c] : state reached by appending letter ('a' + c); 0 means "no transition".
//   len   : length of the longest substring represented by this state.
//   fa    : suffix-link parent; 0 means "none" (the root, state 1, keeps 0).
// All fields carry in-class initializers, so a node is fully zeroed wherever it
// is created (the previous constructor set ch and len but never touched fa).
struct node{
    int ch[26] = {};
    int len = 0;
    int fa = 0;
}dian[N << 1];  // state pool; index 0 acts as the null state, real states start at 1

int last, tot;

// Extends the suffix automaton by one character.
// c is the 0-based letter index (the caller passes s[i] - 'a').
// Uses and updates the globals `last` (state of the whole string read so far)
// and `tot` (number of states allocated so far).
void add(int c) {
    int walker = last;
    const int fresh = ++tot;
    last = fresh;
    dian[fresh].len = dian[walker].len + 1;

    // Follow suffix links; every state that has no c-edge yet gets one to the new state.
    while (walker != 0 && dian[walker].ch[c] == 0) {
        dian[walker].ch[c] = fresh;
        walker = dian[walker].fa;
    }

    // Ran past the root: the new state's suffix link is the root.
    if (walker == 0) {
        dian[fresh].fa = 1;
        return;
    }

    const int target = dian[walker].ch[c];
    if (dian[target].len == dian[walker].len + 1) {
        dian[fresh].fa = target;
        return;
    }

    // target also represents longer strings: copy it into a clone with the
    // shorter length and redirect the matching c-edges to the clone.
    const int clone = ++tot;
    dian[clone] = dian[target];
    dian[clone].len = dian[walker].len + 1;
    dian[target].fa = clone;
    dian[fresh].fa = clone;
    while (walker != 0 && dian[walker].ch[c] == target) {
        dian[walker].ch[c] = clone;
        walker = dian[walker].fa;
    }
}

int n;

// Reads one lowercase string, builds its suffix automaton and prints the sum of
// len[i] - len[fa[i]] over all states (the number of distinct substrings).
int main() {
    ios::sync_with_stdio(false);
    cin.tie(0);

    // State 1 is the root.
    tot = 1;
    last = 1;

    scanf("%s", s + 1);
    n = strlen(s + 1);

    int pos = 1;
    while (pos <= n) {
        add(s[pos] - 'a');
        ++pos;
    }

    ll ans = 0;
    for (int state = 1; state <= tot; ++state) {
        const int parent = dian[state].fa;
        ans += dian[state].len - dian[parent].len;
    }

    cout << ans;
    return 0;
}

#1449 : 后缀自动机三·重复旋律6

题意：

对于给定字符串$s$，求出所有长度为$k$的子串中，出现次数最多的子串的出现次数。$k$取遍$1,2,\cdots,len(s)$。

思路：

考虑最朴素的解法：对于后缀自动机上的每个结点$i$，该类中子串的长度区间为$[min[i],max[i]]$，它对这段长度的答案的贡献为$|endpos(i)|$。那么线段树区间更新，维护最大值即可。

但这样可能会比较慢，考虑问题的性质。

注意到$ans[1,2,\cdots,len(s)]$是单调不增的。因为若一个长度为$L$的子串出现了$k$次，那么它长度为$L-1$的后缀（在同一个类中，或在$parent$树上的父亲类中）的出现次数肯定不会小于$k$。
那么考虑在$max[i]$处打上一个标记，表示长度小于等于它时答案应该更大，否则更小。之后倒着扫一遍维护后缀最大值即可。

为什么这样答案是正确的？

因为答案只有可能在这些$endpos(i)$处产生，并且取最大值符合限制条件。

#include <bits/stdc++.h>

using namespace std;

typedef long long ll;

const int N = 1000006;

char s[N];

// One state of the suffix automaton.
//   ch[c] : state reached by appending letter ('a' + c); 0 means "no transition".
//   len   : length of the longest substring represented by this state.
//   fa    : suffix-link parent; 0 means "none" (the root, state 1, keeps 0).
// All fields carry in-class initializers, so a node is fully zeroed wherever it
// is created (the previous constructor set ch and len but never touched fa).
struct node{
    int ch[26] = {};
    int len = 0;
    int fa = 0;
}dian[N << 1];  // state pool; index 0 acts as the null state, real states start at 1

int last = 1, tot = 1;

ll ans[N], f[N << 1];

// Appends one character (0-based letter index c) to the suffix automaton.
// Besides the usual construction it sets f[new state] = 1; topsort() later sums
// these seeds up the suffix-link tree.
void add(int c) {
    int cursor = last;
    const int created = ++tot;
    last = created;
    dian[created].len = dian[cursor].len + 1;
    f[created] = 1;

    // Hand a c-edge to every state on the suffix-link chain that lacks one.
    for (; cursor != 0; cursor = dian[cursor].fa) {
        if (dian[cursor].ch[c] != 0) break;
        dian[cursor].ch[c] = created;
    }

    if (cursor == 0) {
        // The chain ended without finding a c-edge: link to the root.
        dian[created].fa = 1;
        return;
    }

    const int hit = dian[cursor].ch[c];
    if (dian[hit].len == dian[cursor].len + 1) {
        dian[created].fa = hit;
        return;
    }

    // Split: the copy takes the shorter length; f[copy] stays 0.
    const int copy = ++tot;
    dian[copy] = dian[hit];
    dian[copy].len = dian[cursor].len + 1;
    dian[hit].fa = copy;
    dian[created].fa = copy;
    for (; cursor != 0 && dian[cursor].ch[c] == hit; cursor = dian[cursor].fa) {
        dian[cursor].ch[c] = copy;
    }
}

int q[N << 1], in[N << 1];

// Accumulates f[] from the leaves of the suffix-link tree towards the root:
// each state's f is added to its parent's f once all of the parent's children
// have been processed. Uses q[] as a 1-based queue and in[] as the child count.
void topsort() {
    int head = 1;
    int tail = 0;

    // Count, for every state, how many states name it as their parent.
    for (int v = 1; v <= tot; ++v) {
        ++in[dian[v].fa];
    }

    // States that are nobody's parent start the queue.
    for (int v = 1; v <= tot; ++v) {
        if (in[v] == 0) {
            ++tail;
            q[tail] = v;
        }
    }

    for (; head <= tail; ++head) {
        const int cur = q[head];
        const int parent = dian[cur].fa;
        f[parent] += f[cur];
        --in[parent];
        if (in[parent] == 0) {
            ++tail;
            q[tail] = parent;
        }
    }
}

int n;

int main() {

    ios::sync_with_stdio(false); cin.tie(0);

    scanf("%s", s + 1);

    n = strlen(s + 1);

    for(int i = 1; i <= n; i++) add(s[i] - 'a');

    topsort();

    for(int i = 1; i <= tot; i++) ans[dian[i].len] = max(ans[dian[i].len], f[i]);

    for(int i = n; i; i--) ans[i] = max(ans[i], ans[i + 1]);

    for(int i = 1; i <= n; i++) cout << ans[i] << '\n';

    return 0;

}

#1457 : 后缀自动机四·重复旋律7

题意：

现在给出$n$个十进制数字串，要求它们所有不同子串（看作十进制数）的和，答案对$10^9+7$取模。

思路：

因为后缀自动机的$next$指针代表在当前类中的串后面加上一个字符，并且每条路径对应唯一子串，每个结点代表一个等价类。

所以此问题可以考虑递推来求解：

首先将多个串用分隔符拼接起来，然后统一处理。
记当前结点的答案为$dp_i$，那么根据拓扑序来$dp$，转移方程有：$dp_i=\sum_{(j,i)\in Edge}\left(dp_j\cdot 10+cnt_j\cdot w(j,i)\right)$，其中$w(j,i)$为边上的数字，$cnt_j$为从根到达$j$的合法路径个数，即不会经过分隔符的路径个数（$cnt_i=\sum_{(j,i)\in Edge}cnt_j$，$cnt_1=1$），这个我们可以一边$dp$一边处理。

为什么要不经过分隔符？只要没有分隔，我们就是兄弟~

之后将所有值累加即为答案。

#include <bits/stdc++.h>

using namespace std;

typedef long long ll;

const int N = 2000006, MOD = 1e9 + 7;

ll ans;

// Suffix automaton over the joined input "s1:s2:...", where characters are
// mapped by (ch - '0'): digits become 0..9 and the separator ':' becomes 10.
// After all add() calls, run gogogo() and then topsort(); getans() then returns
// the sum (mod MOD) of the values of all digit-only strings spelled from the root.
struct SAM{

    // One automaton state.
    //   ch[c] : target of the transition on symbol c; 0 means "no transition".
    //   len   : length of the longest substring represented by this state.
    //   fa    : suffix-link parent; 0 means "none" (only the root, state 1).
    // In-class initializers zero every field, including fa, which the previous
    // constructor left unset.
    struct node{
        int ch[26] = {};
        int len = 0;
        int fa = 0;
    }dian[N];

    int q[N];      // 1-based work queue shared by gogogo() and topsort()
    int in[N];     // number of digit edges entering a state from reachable states
    ll f[N];       // sum (mod MOD) of the values of digit strings ending in a state
    ll cnt[N];     // number of digit-only paths from the root to a state (not reduced)
    bool vis[N];   // set by gogogo() once a state has been queued

    int last = 1, tot = 1;  // state 1 is the root

    // Appends symbol c to the automaton.
    void add(int c) {
        int p = last;
        int np = last = ++tot;
        dian[np].len = dian[p].len + 1;

        // Give every suffix state without a c-edge an edge to the new state.
        for(; p && !dian[p].ch[c]; p = dian[p].fa) dian[p].ch[c] = np;

        if(!p) {
            dian[np].fa = 1;
            return;
        }

        // Named `target` so it no longer shadows the member queue q.
        int target = dian[p].ch[c];
        if(dian[target].len == dian[p].len + 1) {
            dian[np].fa = target;
            return;
        }

        // Split target: the clone keeps the edges and takes the shorter length.
        int nq = ++tot;
        dian[nq] = dian[target];
        dian[nq].len = dian[p].len + 1;
        dian[target].fa = dian[np].fa = nq;
        for(; p && dian[p].ch[c] == target; p = dian[p].fa) dian[p].ch[c] = nq;
    }

    // Breadth-first walk from the root along digit edges only (symbols 0..9, so
    // the separator edge 10 is never followed); fills in[] for reachable states.
    void gogogo() {
        int l = 1, r = 0;
        q[++r] = 1;
        while(l <= r) {
            int u = q[l++];
            for(int d = 0; d < 10; d++) {
                int v = dian[u].ch[d];
                if(!v) continue;
                ++in[v];
                if(!vis[v]) q[++r] = v;
                vis[v] = 1;
            }
        }
    }

    // Processes states in topological order of the digit-edge DAG and pushes
    // path counts and value sums along each edge u --d--> v:
    //   cnt[v] += cnt[u];  f[v] += f[u] * 10 + d * cnt[u]  (mod MOD).
    // NOTE(review): cnt is kept exact; this assumes the number of digit-only
    // paths fits in 64 bits - confirm against the input limits.
    void topsort() {
        int l = 1, r = 0;
        q[++r] = 1;
        cnt[1] = 1;  // the empty path
        while(l <= r) {
            int u = q[l++];
            for(int d = 0; d < 10; d++) {
                int v = dian[u].ch[d];
                if(!v) continue;
                cnt[v] += cnt[u];
                f[v] = (f[v] + (f[u] * 10 + d * cnt[u]) % MOD) % MOD;
                if(--in[v] == 0) q[++r] = v;
            }
        }
    }

    // Returns the sum of f over all states, modulo MOD.
    ll getans() {
        ll res = 0;
        for(int i = 1; i <= tot; i++) res = (res + f[i]) % MOD;
        return res;
    }

}A;

int n;

string s;

int main() {

    ios::sync_with_stdio(false); cin.tie(0);

    cin >> n;

    string res = "";

    for(int i = 1; i <= n; i++) cin >> s, res += s, res += ":";

    int len = res.length();

    for(int i = 0; i < len - 1; i++) A.add(res[i] - '0');

    A.gogogo();

    A.topsort();

    cout << A.getans();

    return 0;

}

#1465 : 后缀自动机五·重复旋律8

题意：

给出一个字符串$S$，然后有多个询问，每个询问给出一个字符串$T$，回答$T$的循环同构在$S$中出现了多少次。

思路：

还是考察后缀自动机的性质。

考虑枚举每一个位置，依次统计答案。

我们将$T$倍长，预处理出$f[i]$，表示以$T[i]$结尾、且在$S$中出现过的最长子串的长度。
假设$f(i)\geq len(T)$，那么则代表以$i$结尾、长度为$len(T)$的子串（即$T$的一个循环同构）肯定出现在$S$中。
如果知道当前对应的状态为$u$，那么此时对答案的贡献就是$|endpos(u)|$。
那么我们定义一个变量$u$记录当前状态，随着$i$的增加，$u$也不断地在后缀自动机上面跑。
但最终找到的状态$u$不一定为最优，所以可以沿着后缀链接往回跳，找到一个位置$u$满足$max[u]\geq len(T)\geq min[u]$，此时的$u$即为最优情况。
也可能存在找不到的情况，那么$u$肯定变为$0$，判断一下即可。

注意考虑特殊情况，就是我们找到的状态$u$已经计入答案的时候，这时我们不会重复计入答案。

那干嘛往回跳？不往回跳就行了嘛。就算这里不往回跳，那当前类中的所有串也是包含最优状态中的子串，它不能匹配，你也不能。

#include <bits/stdc++.h>

using namespace std;

typedef long long ll;

const int N = 200005;

// One state of the suffix automaton.
//   ch[c] : state reached by appending letter ('a' + c); 0 means "no transition".
//   len   : length of the longest substring represented by this state.
//   fa    : suffix-link parent; 0 means "none" (the root, state 1, keeps 0).
// All fields carry in-class initializers, so a node is fully zeroed wherever it
// is created (the previous constructor set ch and len but never touched fa).
struct node{
    int ch[26] = {};
    int len = 0;
    int fa = 0;
}dian[N];  // state pool; index 0 acts as the null state, real states start at 1

int last = 1, tot = 1;

ll f[N], g[N], h[N];

// Appends one character (0-based letter index c) to the suffix automaton.
// The new state gets h = 1; topsort() later sums these seeds up the
// suffix-link tree. Cloned states keep h = 0.
void add(int c) {
    int up = last;
    const int tip = ++tot;
    last = tip;
    dian[tip].len = dian[up].len + 1;
    h[tip] = 1;

    // Every state on the suffix-link chain without a c-edge gets one to tip.
    while (up != 0 && dian[up].ch[c] == 0) {
        dian[up].ch[c] = tip;
        up = dian[up].fa;
    }

    if (up == 0) {
        dian[tip].fa = 1;
    } else {
        const int old = dian[up].ch[c];
        if (dian[old].len != dian[up].len + 1) {
            // old is too long to serve as the suffix link: split off a clone.
            const int dup = ++tot;
            dian[dup] = dian[old];
            dian[dup].len = dian[up].len + 1;
            dian[old].fa = dup;
            dian[tip].fa = dup;
            while (up != 0 && dian[up].ch[c] == old) {
                dian[up].ch[c] = dup;
                up = dian[up].fa;
            }
        } else {
            dian[tip].fa = old;
        }
    }
}

int q[N], in[N];

// Accumulates h[] from the leaves of the suffix-link tree towards the root:
// a state's h is added to its parent's h, and the parent is queued once all of
// its children have been processed. q[] is a 1-based queue, in[] the child count.
void topsort() {
    int head = 1;
    int tail = 0;

    for (int v = 1; v <= tot; ++v) {
        ++in[dian[v].fa];
    }

    for (int v = 1; v <= tot; ++v) {
        if (in[v] != 0) continue;
        q[++tail] = v;
    }

    for (; head <= tail; ++head) {
        const int cur = q[head];
        const int parent = dian[cur].fa;
        h[parent] += h[cur];
        --in[parent];
        if (in[parent] == 0) q[++tail] = parent;
    }
}

int n;

string s;

int chk[N];

// Builds the automaton of S, then for each query T prints the summed h values of
// the distinct states that match a length-|T| window of T+T.
int main() {
//    freopen("input.in", "r", stdin);

    ios::sync_with_stdio(false);
    cin.tie(0);

    cin >> s;
    int len = s.length();
    for (int pos = 0; pos < len; ++pos) {
        add(s[pos] - 'a');
    }
    topsort();

    cin >> n;
    for (int query = 1; query <= n; ++query) {
        cin >> s;
        s += s;  // doubling makes every rotation a window of the doubled string
        len = s.length();
        const int need = len / 2;  // length of the original query

        for (int j = 0; j < len; ++j) {
            f[j] = 0;
            g[j] = 0;
        }

        // Pass 1: f[j] = current matched length after reading s[j],
        //         g[j] = automaton state reached (0 when the match was reset).
        int state = 1;
        int matched = 0;
        for (int j = 0; j < len; ++j) {
            const int c = s[j] - 'a';
            while (state != 0 && dian[state].ch[c] == 0) {
                state = dian[state].fa;
                matched = dian[state].len;
            }
            if (state == 0) {
                state = 1;
                matched = 0;
                f[j] = 0;
                continue;
            }
            // Do not assign the state's len here: len is the longest length of
            // the class, whereas the match grew by exactly one character.
            ++matched;
            f[j] = matched;
            state = dian[state].ch[c];
            g[j] = state;
        }

        // Pass 2: for each position with a long enough match, climb to the
        // highest ancestor whose parent is shorter than `need` and count it once.
        ll ans = 0;
        for (int j = 0; j < len; ++j) {
            if (f[j] < need) continue;
            int top = g[j];
            while (dian[dian[top].fa].len >= need) {
                top = dian[top].fa;
            }
            if (chk[top] == query) continue;  // already counted for this query
            chk[top] = query;
            ans += h[top];
        }

        cout << ans << '\n';
    }
}

#1466 : 后缀自动机六·重复旋律9

题意：

给出两个字符串$A$和$B$，现在两个人玩游戏，先手会给出两个子串，然后先手开始，每个人依次往其中一个串后面添加任意一个字符。当一个人添加字符后，所得到的串还应为原串的子串。当某个人不能满足条件时，即判为失败。

现问在所有先手必胜的开始局面中（子串可以为空串），字典序第$k$大（即按字典序从小到大排列后的第$k$个）的局面是什么；若不存在这样的局面则输出NO。

此处比较字典序先比较第一个字符串，再比较第二个字符串。

思路：

易知这是两个有向图游戏，那么最终的局面与两个状态的$sg$值相关。如果$sg[x]\ xor\ sg[y]=0$，则为先手必败局面，否则先手必胜。

知道这一点那么我们肯定会把每个结点的$sg$值求出来，因为每个结点的出边不会超过$26$条，所以$sg$值最大为$26$，那么可以直接暴力来求。

接下来考虑如何求字典序第$k$大：

首先肯定先求出第一个字符串。那么对于每个位置，判断此时第二个有向图中所有的状态数是否大于$k$即可知道第一个字符串能不能中止[1]；如果不能，则枚举后面的字符，贪心确定下一位[2]；
第一个字符串确定后，来确定第二个字符串。同上面的思路，我们首先看目前状态能否产生贡献，如果$k>0$，那么则继续枚举下一位，贪心确定[3]。

大体思路就是这样，那么具体条件是什么？根据之前的分析，肯定和$sg$函数有关。

我们对每个状态$u$，求出$cnt[u][0,1,\cdots,26]$，表示从$u$出发，能经过结点的$sg$值为$0,1,\cdots,26$的个数；并求出$sum[u]=\sum cnt[u][i]$

那么上面的三个条件判断如下(我们只需要统计必胜局面)：

[1] $sum[1]-cnt[1][sg[u]]$与$k$；
[2] $\sum_i cnt_A[v][i]\cdot(sum_B[1]-cnt_B[1][i])$与$k$（其中下标$A,B$分别表示第一、第二个串的自动机）；
[3] $sum[v]-cnt[v][sg[u]]$与$k$。

至于为什么这样，脑补一下就行了~

详细见代码：

#include <bits/stdc++.h>

using namespace std;

typedef long long ll;

const int N = 2e5 + 5, MAX = 26;

// Suffix automaton of one lowercase string, plus the Sprague-Grundy value of
// every state for the "append a character" game played on its transition DAG.
struct SAM{

    // One automaton state.
    //   ch[c] : target of the transition on letter ('a' + c); 0 means "none".
    //   len   : length of the longest substring represented by this state.
    //   fa    : suffix-link parent; 0 means "none" (only the root, state 1).
    // In-class initializers zero every field, including fa, which the previous
    // constructor left unset.
    struct node{
        int ch[MAX + 5] = {};
        int len = 0;
        int fa = 0;
    }dian[N];

    int last = 1, tot = 1;  // state 1 is the root

    // cnt[u][g]: number of paths starting at u (the empty path included) that
    //            end in a state whose sg value is g.
    // sum[u]   : total number of such paths, i.e. the sum of cnt[u][*].
    ll cnt[N][MAX + 5], sum[N];

    // sg[u]     : Grundy value of u, or -1 while not yet computed.
    // flag[u][g]: 1 if some successor of u has sg value g.
    int sg[N], flag[N][MAX + 5];

    // Appends letter index c to the automaton.
    void add(int c) {
        int p = last;
        int np = last = ++tot;
        dian[np].len = dian[p].len + 1;

        // Give every suffix state without a c-edge an edge to the new state.
        for(; p && !dian[p].ch[c]; p = dian[p].fa) dian[p].ch[c] = np;

        if(!p) {
            dian[np].fa = 1;
            return;
        }

        int q = dian[p].ch[c];
        if(dian[q].len == dian[p].len + 1) {
            dian[np].fa = q;
            return;
        }

        // Split q: the clone keeps the edges and takes the shorter length.
        int nq = ++tot;
        dian[nq] = dian[q];
        dian[nq].len = dian[p].len + 1;
        dian[q].fa = dian[np].fa = nq;
        for(; p && dian[p].ch[c] == q; p = dian[p].fa) dian[p].ch[c] = nq;
    }

    // Memoized recursion over the transition DAG: computes sg[u] as the mex of
    // the successors' sg values and fills cnt[u][*] and sum[u] on the way.
    // Requires sg[] to be filled with -1 beforehand (build() does this).
    int get_sg(int u) {
        if(~sg[u]) return sg[u];

        for(int c = 0; c < MAX; c++) {
            int v = dian[u].ch[c];
            if(!v) continue;
            flag[u][get_sg(v)] = 1;
            // Every path out of v is also a path out of u (prefixed by letter c).
            for(int j = 0; j <= MAX; j++) cnt[u][j] += cnt[v][j];
        }

        // mex of the successor values
        int mex = 0;
        while(flag[u][mex]) ++mex;
        sg[u] = mex;
        ++cnt[u][mex];  // the empty path ends in u itself

        for(int j = 0; j <= MAX; j++) sum[u] += cnt[u][j];
        return sg[u];
    }

    // Builds the automaton from the 1-indexed, NUL-terminated string s
    // (characters s[1..n]) and computes sg/cnt/sum for every state reachable
    // from the root.
    void build(char *s) {
        int n = strlen(s + 1);
        for(int i = 1; i <= n; i++) add(s[i] - 'a');
        memset(sg, -1, sizeof(sg));
        get_sg(1);
    }

}A, B;

vector <char> res1, res2;

ll k;

char s[N];

int now;

// Counts the winning start positions whose first string continues from state u
// of A: the sum over sg values i of
//     A.cnt[u][i] * (B.sum[1] - B.cnt[1][i]),
// i.e. paths out of u ending with sg value i, paired with every substring of B
// whose sg value differs from i.
//
// Both factors can be in the billions for long inputs, so the exact sum may not
// fit in a signed 64-bit integer. The result therefore saturates at LLONG_MAX
// instead of overflowing; callers only compare it with k, so "at least
// LLONG_MAX" is all they need to know. For sums that fit, the value is exact.
ll getnow(int u) {
    ll res = 0;

    for(int i = 0; i <= MAX; i++) {
        // Both factors are counts and therefore non-negative.
        const ll pathsFromU = A.cnt[u][i];
        const ll partnersInB = B.sum[1] - B.cnt[1][i];

        if(pathsFromU == 0 || partnersInB == 0) continue;

        // pathsFromU * partnersInB > LLONG_MAX - res  <=>  the addition overflows.
        if(pathsFromU > (LLONG_MAX - res) / partnersInB) return LLONG_MAX;

        res += pathsFromU * partnersInB;
    }

    return res;
}

int dfsa(int u) {

    ll sum = B.sum[1] - B.cnt[1][A.sg[u]];

    if(sum >= k) return u ;

    else k -= sum;

    for(int i = 0; i < MAX; i++) {

        int v = A.dian[u].ch[i];

        if(!v) continue;

        if(getnow(v) < k) {

            k -= getnow(v);

        } else {

            res1.push_back(i + 'a');

            return dfsa(v);

        }

    }

    return -1;

}

void dfsb(int u) {

    k -= A.sg[now] != B.sg[u];

    if(k <= 0) return;

    for(int i = 0; i < MAX; i++) {

        int v = B.dian[u].ch[i];

        if(!v) continue;

        ll sum = B.sum[v] - B.cnt[v][A.sg[now]];

        if(sum < k) k -= sum;

        else {

            res2.push_back(i + 'a');

            dfsb(v);

            return;

        }

    }

}

// Reads k and the two strings, builds both automata, then prints the chosen
// first string, a newline and the chosen second string - or "NO" when dfsa()
// reports that no such position exists.
int main() {
    ios::sync_with_stdio(false);
    cin.tie(0);

    cin >> k;

    // Both strings are read 1-indexed into the shared buffer.
    cin >> (s + 1);
    A.build(s);
    cin >> (s + 1);
    B.build(s);

    now = dfsa(1);
    if (now != -1) {
        dfsb(1);

        for (size_t idx = 0; idx < res1.size(); ++idx) cout << res1[idx];
        cout << '\n';
        for (size_t idx = 0; idx < res2.size(); ++idx) cout << res2[idx];
    } else {
        cout << "NO";
    }

    return 0;
}

总结

最近学了后缀自动机，练了几道题，感觉后缀自动机十分的强大，能做许多的事情（毕竟天生DAG），但同时也十分灵活，需要一定的技巧性。

比如$next$指针，我们一般就需要一位一位来分析；跳后缀链接，那就是不断跳到长度更小的且具有相同后缀的子串集合中。

重复旋律6，感觉就是对$parent$树的理解，同时还具有一定的思维难度。

重复旋律7，将多个串拼在一起的技巧，其实很多字符串的题也会遇到，但放在后缀自动机中，总感觉有一些陌生...此时的每个$endpos$等价类，不带分隔符的子串，就是所有字符串的那些子串了，是不是感觉很巧妙，很神奇？

重复旋律8，里面跳后缀链接的操作，有点类似于字符串的匹配问题，因为每跳一次，就有了更多可选集合；同时最后还有个操作，也是为了保证答案的正确性。

重复旋律9，next指针上面跑sg函数，则主要用了后缀自动机天生DAG的性质，并且有向图游戏在博弈领域有很多的定理。同时对两个字典序求第$k$大，也蕴含了贪心的思想。

总之，还需要多加练习才行~