【模版 Luogu P3808/P3796/P5357】AC自动机（简论）

　　浙江集训Day9，没有出任何实质性成果，只好把昨天打完的板子记一下。

　　该博客基于luogu的三道模版题。只有一个大致的讲解，主要提供代码给自己参考。

-----------------------------------------------------------------------

(7.14)

一、AC自动机

　　AC自动机，一个有着令人容易误会的名字的有限状态自动机结构，主要被应用在多模式串的文本匹配问题中。理解AC自动机，首先要熟悉KMP算法和字典树。使用KMP可以分开对每个模式串进行计数，但是对目标串的扫描次数会爆炸。实际上，KMP算法本身也可以从有限状态自动机的角度来理解（简单理解大概就是跳转能够达到的状态是有限个）。AC自动机与KMP都含有类似的fail指针结构。通俗理解fail的意义，我们先建立原串的trie树（KMP则就是原串），然后预处理出每个节点“在当前点a匹配不下去了，我要找一个字典树的前缀串是该串的后缀串”代表的b（这样的意义是，匹配到a就一定匹配到了b）。这就是构建自动机的过程。平凡的ACAM在匹配时，我们沿着字典树往下走，下一个字符失配就沿着fail边去跳转它的后缀代表的匹配状态，直到找到一个可以匹配文本串的下个字符的后缀状态为止。同时，每找到一个串，我们就要沿着fail边翻出它的后缀串，因为这些后缀也都被匹配到了。这就导致AC自动机的匹配复杂度有了可优化的空间。

　　所谓的trie图优化，就是在建立AC自动机时直接把失配的那个字符对应的边连到目标后缀上，这样可以省去每次失配跳fail边的麻烦。同时，一个串有很多后缀，但是并没有都出现在模式串中，中间空状态的跳转没有意义；那么我们就新开一个数组记录下它的第一个是结束节点的fail目标状态（即这个后缀存在于模式串中）的位置，然后每次沿着这个边跳转即可。

　　代码明天放，顺便安利让我学会AC自动机的dalao的两篇博客，受益匪浅。

　　https://www.cnblogs.com/sclbgw7/p/9260756.html　　（AC自动机的构建）

　　https://www.cnblogs.com/sclbgw7/p/9875671.html　　（AC自动机的两种优化）

---------------------------------------------------

(7.15)

二、代码

　　模版一：统计出现模式串的个数

#include <iostream>
#include <cstdio>
#include <cstring>
#include <queue>
// Debug helper: expands to a call that prints the marker line "findone".
#define BUG puts("findone")
// Capacity used for the text buffer and the per-node automaton arrays.
// Parenthesized so the macro expands to a single value inside larger
// expressions (e.g. `2 * maxn`).
#define maxn (1000000 + 10)
// Reads one (optionally negative) decimal integer from stdin into `x`.
//
// Leading non-digit characters are skipped; if a '-' appears anywhere in the
// skipped prefix the result is negated. The first character after the digit
// run is consumed as well. If end-of-file is reached before any digit is
// seen, `x` is left as 0 instead of spinning forever.
template <typename T>
void read(T &x) {
    x = 0;
    int sign = 1;
    // Keep the result of getchar() in an int so EOF stays distinguishable
    // from every real character value.
    int ch = getchar();
    while (ch != EOF && (ch < '0' || ch > '9')) {
        if (ch == '-')
            sign = -1;
        ch = getchar();
    }
    while (ch >= '0' && ch <= '9') {
        x = x * 10 + (ch - '0');
        ch = getchar();
    }
    x *= sign;
}
using namespace std;
char s[maxn];
namespace ACAM {
int trie[26][maxn], pi[maxn], cnt[maxn], last[maxn];
const int root(1);
int tot = 1;
void Insert(char *s) {
int nd = root, len = strlen(s);
for (int i = 0; i < len; ++i) {
int c = s[i] - 'a';
if (!trie[c][nd])
trie[c][nd] = ++tot;
nd = trie[c][nd];
}
++cnt[nd];
}
void Build_ACAM() {
for (int i = 0; i < 26; ++i)
trie[i][0] = root;
pi[root] = 0;
queue<int> que;
que.push(root);
while (!que.empty()) {
int nd = que.front();
que.pop();
for (int c = 0; c < 26; ++c) {
if (!trie[c][nd]) {
// trie[c][nd] = trie[c][pi[nd]]; //这句话就是trie图优化，这题不用它反而跑得更快……
continue;
}
int son = trie[c][nd], nxt = pi[nd];
while (nxt && !trie[c][nxt])
nxt = pi[nxt];
pi[son] = trie[c][nxt];
last[son] = cnt[pi[son]] ? pi[son] : last[pi[son]]; //last优化，它在三道题中都很优秀
que.push(son);
}
}
}
int Match(char *s) {
int len = strlen(s), nd = root, ans = 0;
for (int i = 0; i < len; ++i) {
int c = s[i] - 'a';
while (nd && !trie[c][nd]) { //如果加了trie图优化就不用每次跳fail边来找后缀，因为trie图优化直接记录可匹配的下一个后缀
nd = pi[nd];
}
nd = trie[c][nd];
for (int t = nd; t && ~cnt[t]; t = last[t])
ans += cnt[t], cnt[t] = -1;
}
return ans;
}
} using namespace ACAM;
int main() {
// freopen("testdata.txt", "r", stdin);
// freopen("ans.txt", "w", stdout);
int n;
scanf("%d", &n);
for (int i = 1; i <= n; ++i) {
scanf("%s", s);
Insert(s);
}
Build_ACAM();
scanf("%s", s);
cout << Match(s);
return 0;
}

模版二：AC自动机（加强版）：多组数据，输出出现最多的串的出现次数，按输入顺序输出这些串。

#include <iostream>
#include <cstdio>
#include <cstring>
#include <queue>
#include <vector>
// Debug helper: expands to a call that prints the marker line "findone".
#define BUG puts("findone")
// Capacity of the per-node automaton arrays. Parenthesized so the macro
// expands to a single value inside larger expressions (e.g. `2 * maxn`).
#define maxn (70 * 150 + 10)
// Reads one (optionally negative) decimal integer from stdin into `x`.
//
// Leading non-digit characters are skipped; if a '-' appears anywhere in the
// skipped prefix the result is negated. The first character after the digit
// run is consumed as well. If end-of-file is reached before any digit is
// seen, `x` is left as 0 instead of spinning forever.
template <typename T>
void read(T &x) {
    x = 0;
    int sign = 1;
    // Keep the result of getchar() in an int so EOF stays distinguishable
    // from every real character value.
    int ch = getchar();
    while (ch != EOF && (ch < '0' || ch > '9')) {
        if (ch == '-')
            sign = -1;
        ch = getchar();
    }
    while (ch >= '0' && ch <= '9') {
        x = x * 10 + (ch - '0');
        ch = getchar();
    }
    x *= sign;
}
using namespace std;
char s[1000010], T[151][80];
// Aho-Corasick automaton over 'a'..'z' that counts how often each pattern
// occurs in a text and reports the most frequent ones.
//
//   trie[c][u] : transition from node u on letter c; after Build_ACAM() every
//                transition is defined (trie-graph form)
//   pi[u]      : failure link of node u
//   cnt[u]     : number of inserted patterns ending at u
//   last[u]    : nearest node on u's failure chain where a pattern ends
//   sum[u]     : occurrences counted for the pattern ending at u
//   id[u]      : input position of the pattern ending at u
// Node 1 is the root; node 0 is a sentinel whose 26 edges all lead to the root.
namespace ACAM {
int trie[26][maxn], pi[maxn], cnt[maxn], last[maxn], sum[maxn], id[maxn];
const int root(1);
int tot = 1;

// Adds the NUL-terminated pattern `s` and remembers that it is pattern number
// `pos` of the input. `pos` must be non-negative.
void Insert(char *s, int pos) {
    int nd = root;
    for (const char *p = s; *p != '\0'; ++p) {
        int &child = trie[*p - 'a'][nd];
        if (child == 0)
            child = ++tot;
        nd = child;
    }
    ++cnt[nd];
    id[nd] = pos;
}

// Computes pi[] and last[] breadth-first and fills every missing transition
// with the transition of the failure target (trie-graph optimization).
void Build_ACAM() {
    for (int c = 0; c < 26; ++c)
        trie[c][0] = root;
    pi[root] = 0;
    std::queue<int> que;
    que.push(root);
    while (!que.empty()) {
        const int nd = que.front();
        que.pop();
        for (int c = 0; c < 26; ++c) {
            const int son = trie[c][nd];
            if (son == 0) {
                trie[c][nd] = trie[c][pi[nd]];
                continue;
            }
            // pi[nd] was dequeued before nd, so all of its transitions are
            // already filled in and no failure-chain walk is needed.
            pi[son] = trie[c][pi[nd]];
            last[son] = cnt[pi[son]] ? pi[son] : last[pi[son]];
            que.push(son);
        }
    }
}

// Runs the text `s` through the automaton, accumulating sum[] for every
// pattern occurrence, then prints the highest occurrence count followed by
// every pattern that reaches it, in input order.
void Match(char *s) {
    int nd = root;
    for (const char *p = s; *p != '\0'; ++p) {
        nd = trie[*p - 'a'][nd];
        for (int t = nd; t; t = last[t])
            if (cnt[t])
                ++sum[t];
    }

    // Only nodes where a pattern ends are candidates; interior trie nodes
    // carry no pattern and must never be reported.
    int maxPos = 0;
    for (int i = 1; i <= tot; ++i)
        if (cnt[i] && id[i] > maxPos)
            maxPos = id[i];

    // Index the end nodes by input position so the winners come out in the
    // order the patterns were read, not in node-creation order.
    std::vector<int> nodeOf(maxPos + 1, 0);
    int best = 0;
    for (int i = 1; i <= tot; ++i) {
        if (!cnt[i] || id[i] < 0)
            continue;
        nodeOf[id[i]] = i;
        if (sum[i] > best)
            best = sum[i];
    }

    printf("%d\n", best);
    for (int pos = 0; pos <= maxPos; ++pos)
        if (nodeOf[pos] && sum[nodeOf[pos]] == best)
            puts(T[pos]);
}
}  // namespace ACAM
using namespace ACAM;
// Driver for the multi-test-case problem: each case gives a pattern count n,
// n patterns and one text; a count of 0 ends the input.
int main() {
    // Unsynchronized iostreams are used for input only; output goes through
    // printf/puts inside Match().
    ios::sync_with_stdio(0);
    cin.tie(0);
    for (;;) {
        int n = 0;
        // Stop on the terminating 0 and also when the stream ends or is
        // malformed, so a missing terminator cannot cause an endless loop.
        if (!(cin >> n) || n == 0)
            break;
        // Reset the automaton for this test case.
        tot = 1;
        memset(trie, 0, sizeof(trie));
        memset(sum, 0, sizeof(sum));
        memset(cnt, 0, sizeof(cnt));
        memset(pi, 0, sizeof(pi));
        memset(last, 0, sizeof(last));
        memset(id, 0, sizeof(id));
        for (int i = 1; i <= n; ++i) {
            // Bound the extraction to the destination buffer.
            cin.width(sizeof(T[i]));
            cin >> T[i];
            Insert(T[i], i);
        }
        Build_ACAM();
        cin.width(sizeof(s));
        cin >> s;
        Match(s);
    }
    return 0;
}

模版三：AC自动机（二次加强版）：统计每个模式串出现的次数。一开始的策略是每到一个位置就暴力跳last边来找后缀，但是时间只有1000ms，T掉了几个点。参看题解给出的解法是：先统计文本串匹配过程中每个状态被经过的次数，然后从fail[u]向u连边，构成一棵树。这棵树被称作fail树，满足每个节点的祖先都是它所代表的串的后缀。这样，每个模式串的出现次数就是它自己的结束状态被经过的次数+以它为后缀的状态被经过的次数，也就是fail树上以它为根的子树内所有状态的经过次数之和（注意不是子树的节点个数）。与之相对，trie树上某状态的祖先则是它的前缀。fail树的性质很好，也具有广泛的应用。

#include <iostream>
#include <cstdio>
#include <cstring>
#include <queue>
// Capacity of the text buffer `s`.
#define maxs 2000010
// Capacity of the pattern buffer `T` and of the adjacency-list arrays below.
#define maxn 200010
using namespace std;
char T[maxn], s[maxs];

// Forward-star adjacency list: head[u] is the index of the most recently added
// edge leaving u (0 = none); `top` is the number of edges stored so far.
int head[maxn], top;
struct E {
    int to, nxt;  // destination node, and index of the next edge of the same source
} edge[maxn];

// Appends the directed edge u -> v at the front of u's edge list.
void Insert_edge(int u, int v) {
    ++top;
    // Plain member assignment replaces the C99 compound literal, which is
    // only accepted in C++ as a compiler extension.
    edge[top].to = v;
    edge[top].nxt = head[u];
    head[u] = top;
}
namespace ACAM {
int trie[26][maxn], tot = 1, cnt[maxn], pi[maxn], last[maxn], end[maxn];
int id[maxn];
const int root(1);
void Insert(char *s, int k) {
int nd = root, len = strlen(s);
for (int i = 0; i < len; ++i) {
int c = s[i] - 'a';
if (!trie[c][nd])
trie[c][nd] = ++tot;
nd = trie[c][nd];
}
++end[nd];
id[k] = nd;
}
void Build() {
for (int c = 0; c < 26; ++c)
trie[c][0] = root;
queue<int> que;
que.push(root);
while (!que.empty()) {
int nd = que.front(); que.pop();
for (int c = 0; c < 26; ++c) {
int son = trie[c][nd];
if (!son) {
trie[c][nd] = trie[c][pi[nd]];
continue;
}
int nxt = pi[nd];
while (nxt && !trie[c][nxt])
nxt = pi[nxt];
pi[son] = trie[c][nxt];
last[son] = end[pi[son]] ? pi[son] : last[pi[son]];
que.push(son);
}
}
}
void dfs(int u) {
for (int i = head[u]; i; i = edge[i].nxt) {
int v = edge[i].to;
dfs(v);
cnt[u] += cnt[v];
}
}
void Match(char *s) {
register int nd = root; int len = strlen(s);
for (int i = 0; i < len; ++i) {
int c = s[i] - 'a';
nd = trie[c][nd];
++cnt[nd];
}
for (int i = 2; i <= tot; ++i)
Insert_edge(pi[i], i); //建fail树
dfs(root); //统计子树大小
}
} using namespace ACAM;
int main() {
ios::sync_with_stdio(0);
cin.tie(0);
int n;
cin >> n;
for (int i = 1; i <= n; ++i) {
cin >> T;
Insert(T, i);
}
Build();
cin >> s;
Match(s);
for (int i = 1; i <= n; ++i)
printf("%d\n", cnt[id[i]]);
return 0;
}

【模版 Luogu P3808/P3796/P5357】AC自动机（简论）的更多相关文章

AC自动机例题
P3808 [模板]AC自动机(简单版) [题目描述] 给定n个模式串和1个文本串,求有多少个模式串在文本串里出现过. #include<bits/stdc++.h> using name ...
洛谷P3808 & P3796 AC自动机模板
题目:P3808:https://www.luogu.org/problemnew/show/P3808 P3796:https://www.luogu.org/problemnew/show/P37 ...
P3808 【模版】AC自动机（简单版）
题目背景这是一道简单的AC自动机模版题. 用于检测正确性以及算法常数. 为了防止卡OJ,在保证正确的基础上只有两组数据,请不要恶意提交. 题目描述给定n个模式串和1个文本串,求有多少个模式串在文本 ...
luogu P3796【模板】AC自动机（加强版）
嘟嘟嘟这个和某谷的AC自动机模板简单版差不多. 但还是要注意几点的: 1.这个是统计出现次数,而不是是否出现,所以在查询的时候加上这个节点的val后,不能把val标记为-1.那么也就可以说查询的时间 ...
luogu P3808 【模板】AC自动机（简单版）
题目背景这是一道简单的AC自动机模板题. 用于检测正确性以及算法常数. 为了防止卡OJ,在保证正确的基础上只有两组数据,请不要恶意提交. 管理员提示:本题数据内有重复的单词,且重复单词应该计算多次, ...
AC自动机（模板） LUOGU P3808
传送门解题思路 AC自动机,是解决多模匹配问题的算法,是字典树与kmp结合的算法,可以解决许多子串在文本串中出现的次数等信息.关键是实现一个fail指针,是指向更靠上的前缀相同字母,从而可以实现在文 ...
P3796 【模板】AC自动机（加强版）
P3796 [模板]AC自动机(加强版) https://www.luogu.org/problemnew/show/P3796 题目描述有NN个由小写字母组成的模式串以及一个文本串TT.每个模式串 ...
[算法模版]AC自动机
[算法模版]AC自动机基础内容板子不再赘述,OI-WIKI有详细讲解. $query$函数则是遍历文本串的所有位置,在文本串的每个位置都沿着$fail$跳到根,将沿途所有元素答案++.意义 ...
洛谷 P3796 【模板】AC自动机（加强版）（AC自动机）
题目链接:https://www.luogu.com.cn/problem/P3796 AC自动机:复杂度$O( (N+M)\times L )$,N为模式串个数,L为平均长度,M为文章长度. ins ...

随机推荐

【6】TensorFlow光速入门-python模型转换为tfjs模型并使用
本文地址:https://www.cnblogs.com/tujia/p/13862365.html 系列文章: [0]TensorFlow光速入门-序 [1]TensorFlow光速入门-tenso ...
微服务通信之ribbon实现原理
前言上一篇我们知道了feign调用实现负载均衡是通过集成ribbon实现的.也较为详细的了解到了集成的过程.现在我们看一下ribbo是如何实现负载均衡的.写到这里我尚未去阅读源代码,我在这里盲猜一下 ...
联发科Mediatek工业路由芯片上网稳定低功耗的Router模块WiFi中继——无线AP定制方案
Router模块又名路由器模块,是指将路由器的接口类型及部分扩展功能是可以根据实际需求来进行无线接入服务,允许其他无线设备接入,通过局域无线端或联网远程端,进行数据访问,对无线设备进行远程控制.常见的 ...
Magic Method
Python 的 Magic Method 在 Python 中,所有以 "__" 双下划线包起来的方法,都统称为"魔术方法".比如我们接触最多的 __init ...
记录一些API（持续更新）
//对response进行编解码URLEncoder.encode(string,"UTF-8");//ts检查checkbox是否为选中状态$event.target.check ...
使用git处理github中提交有冲突的pull request
前言: 为什么要写这篇文章,因为前段时间有一个开源的github中的项目有一个朋友提交了一个pr看了下是帮忙优化了下代码(十分感谢这位网友).但是他提交的pr刚好和我的项目有许多的冲突导致无法自动合并 ...
理解js参数
<!DOCTYPE html><html><head> <meta charset="utf-8" /> <title> ...
layui系列学习第一天
新开一个博客系列记录下layui学习过程及感受今天受到很大的打击 ...希望自己能坚持做好到1.12号可以完成这个博客一.基础说明 layui css 命名规范:1.layui-模块名- ...
Visual Studio空格变成点的快捷键切换
[Ctrl + R + W] 效果如下图
git clone 出现"error: RPC failed; curl 56 GnuTLS recv error (-9): A TLS packet with unexpected length was received."
1. 最近用git pull几个大项目,总是报如下错误: error: RPC failed; curl 56 GnuTLS recv error (-9): A TLS packet with un ...

【模版 Luogu P3808/P3796/P5357】AC自动机（简论）

【模版 Luogu P3808/P3796/P5357】AC自动机（简论）的更多相关文章

随机推荐

热门专题