Suffix Tree（后缀树）

　　这篇简单的谈谈后缀树原理及实现。

　　如前缀树原理一般，后缀trie树是将字符串的每个后缀使用trie树的算法来构造。例如banana的所有后缀：

0: banana

1:  anana

2:   nana

3:    ana

4:     na

5:      a

　　按字典序排列后：

5: a

3:  ana

1:     anana

0: banana

4: na

2:   nana

　　形成一个树形结构。

　　代码：

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

// banana中不重复的字符有：a b n

/*

 *   a   b   n

 *  n $  a   a

 *  a    n  n $

 * n $   a  a

 * a     n  $

 * $     a

         $*/

/* Number of child slots per node: one per lowercase letter (see Index) plus
 * one extra slot, index SIZE - 1, used for the '$' end-of-suffix node. */
#define SIZE 27

/* Maps a lowercase letter to its child-slot index ('a' -> 0). */
#define Index(c) ((c) - 'a')

/* Counting loop over the half-open range [a, b). `i` must be an integer
 * variable that is already declared. Every argument is parenthesized so that
 * expression arguments such as `x & 3` or `cond ? p : q` expand with the
 * intended precedence. */
#define rep(i, a, b) for ((i) = (a); (i) < (b); (i)++)

/* One node of the suffix trie. */
struct BaseNode {
	/* Child links; letters use slot Index(ch), the last slot holds the '$' node. */
	struct BaseNode *next[SIZE];
	/* Character stored at this node (0 for the root, '$' for an end marker). */
	char c;
	/* On a '$' node: the value passed to insert() as k; -1 on every other node. */
	int num;
};

/* Node type. */
typedef struct BaseNode suffix_tree;
/* Pointer to a node. */
typedef struct BaseNode *strie;

/*
 * Allocates the root node of an empty trie and stores it in *root.
 *
 * The root holds no character (c == 0), is not an end marker (num == -1) and
 * starts with every child slot set to NULL.
 *
 * root: out-parameter receiving the new node; the call is a no-op when it is
 *       NULL.
 *
 * If malloc() fails, *root is set to NULL rather than dereferencing the
 * failed allocation, so the caller can detect the error.
 */
void initialize(strie* root)
{
	suffix_tree *node;
	int slot;

	if (root == NULL)
		return;

	node = (strie)malloc(sizeof(suffix_tree));
	if (node != NULL)
	{
		node->c = 0;
		node->num = -1;
		for (slot = 0; slot < SIZE; slot++)
			node->next[slot] = NULL;
	}
	*root = node;
}

void insert(strie*root, const char*str, int k)

{

	suffix_tree*node = *root, *tail;

	int i, j;

	for (i = 0; str[i] != '\0'; i++)

	{

		if (node->next[Index(str[i])] == NULL)

		{

			tail = (strie)malloc(sizeof(suffix_tree));

			tail->c = str[i];

			tail->num = -1;

			rep(j, 0, SIZE) tail->next[j] = NULL;

			node->next[Index(str[i])] = tail;

		}

		node = node->next[Index(str[i])];

	}

	tail = (strie)malloc(sizeof(suffix_tree));

	tail->c = '$';

	tail->num = k;

	rep(i, 0, SIZE) tail->next[i] = NULL;

	node->next[SIZE - 1] = tail;

}

/*
 * Prints the trie rooted at `root` in post-order: all children are printed
 * first (slot 0 up to SIZE - 1), then this node's character on its own line,
 * followed by its num on a second line when num is not negative.
 * A NULL `root` prints nothing.
 */
void show(suffix_tree*root)
{
	int slot;

	if (root == NULL)
		return;

	for (slot = 0; slot < SIZE; slot++)
		show(root->next[slot]);

	printf("%c\n", root->c);
	if (root->num >= 0)
		printf("%d\n", root->num);
}

/*
 * Frees the whole subtree referenced by *root (children first, then the node
 * itself) and resets *root to NULL. Does nothing when *root is already NULL.
 */
void destory(strie*root)
{
	suffix_tree *node = *root;
	int slot;

	if (node == NULL)
		return;

	for (slot = 0; slot < SIZE; slot++)
		destory(&node->next[slot]);

	free(node);
	*root = NULL;
}

/*
 * Demo driver: inserts every suffix of "banana" into a fresh trie, tagging
 * each with its starting offset, prints the trie and releases it.
 */
int main()
{
	const char text[] = "banana";
	suffix_tree*root;
	int offset;

	initialize(&root);

	/* The suffix starting at `offset` is simply the tail of the text. */
	for (offset = 0; text[offset] != '\0'; offset++)
		insert(&root, text + offset, offset);

	show(root);
	destory(&root);
	return 0;
}

　　时间复杂度分析：对一个长度为m的字符串建树，需要外层m次循环（枚举每个后缀），每次外层循环内嵌套一个至多m次的内层循环（逐字符插入该后缀），再加上一些常数操作，于是建立一棵后缀字典树所需的时间为O(m²)；每个节点初始化时的27次循环在这里可看作常数。

　　空间复杂度分析：长度为1的字符串需要1个字符节点 + 1个根节点 + 1个\$节点的空间；长度为2的字符串需要3个字符节点 + 1个根节点 + 2个\$节点的空间……以此类推，总是含有1个根节点和m个\$节点（\$的个数等于字符串长度m），而在最坏情况下（各后缀之间没有公共前缀，例如字符串中没有重复字符）存储各后缀字符所需的空间有如下规律：

$$ \begin{aligned} O(s_1) &= 1 \\ O(s_2) &= 1+2 \\ O(s_3) &= 1+2+3 \\ &\;\;\vdots \\ O(s_m) &= 1+2+\cdots+m \end{aligned} $$

　　设以长为m的字符串s建立后缀树T，于是有：

$$ O(T) = O(\frac{(1 + m)m}{2} + 1 + m) = O(m^2) $$

　　由于上面算法对于无重复的字符串来说空间复杂度比较大，所以使用路径压缩以节省空间，这样的树就称为后缀树，也可以通过下标来存储，如图：

　　p.s.写压缩路径的后缀树时，脑子犯傻了...错了，改天再把正确的补上。。。

　　路径压缩版后缀树：

#include <iostream>

using namespace std;

// Counting loop over the half-open range [a, b) that declares its own int
// index `i`. The bounds are parenthesized so expression arguments such as
// `x & 3` or `cond ? p : q` expand with the intended precedence.
#define rep(i, a, b) for (int i = (a); i < (b); i++)

// Maps a lowercase letter to its 0-based alphabet index ('a' -> 0). The
// argument is parenthesized for the same precedence reason as above.
#define trans(c) ((c) - 'a')

// Number of child slots per node, one per lowercase letter.
#define SIZE 26

// Element count of BaseNode::pos.
#define MAX (100010 << 2)

struct BaseNode {

	int len;

	const char*s;

	int pos[MAX];

	BaseNode*next[SIZE];

	BaseNode()

	{

		len = 0;

		rep(i, 0, MAX) pos[i] = 0;

		rep(i, 0, SIZE) next[i] = nullptr;

	}

	BaseNode(const char*s, int p)

	{

		this->s = s, this->len = p;

		rep(i, 0, MAX) pos[i] = 0;

		rep(i, 0, SIZE) next[i] = nullptr;

	}

};

class SuffixTree {

private:

	BaseNode*root;

	/**/

	void add(const char*s, int p);

	void print(BaseNode*r);

	void destory(BaseNode*&r);

public:

	SuffixTree()

	{

		root = nullptr;

	}

	void insert(const char*s);

	void insert(string s)

	{

		insert(s.c_str());

	}

	void remove(const char*s)

	{

	}

	void visual()

	{

		print(root);

	}

	bool match(const char*s);

	bool match(string s)

	{

		match(s.c_str());

	}

	~SuffixTree()

	{

		destory(root);

	}

};

void SuffixTree::add(const char*s, int p)

{

	int i = 0; while (s[i]) i++;

	if (!root->next[p]) root->next[p] = new BaseNode(s, i);

	root->next[p]->pos[i] = i;

}

void SuffixTree::insert(const char*s)

{

	root = new BaseNode();

	while (*s)

	{

		add(s, trans(*s));

		s++;

	}

}

bool SuffixTree::match(const char*s)

{

	const char* ps = root->next[trans(*s)]->s;

	while (*s) if (*ps++ != *s++) return false;

	return true;

}

// Dumps the direct children of `r` to standard output.
//
// For every non-empty child slot, in ascending order, a header line
// "<slot>:" is written. Then, for each index 0..len of that child whose pos
// entry is non-zero, the first pos[index] characters of the child's text are
// written, followed by '$' and a line break. A null `r` prints nothing.
void SuffixTree::print(BaseNode*r)
{
	if (!r) return;

	for (int slot = 0; slot < SIZE; ++slot)
	{
		const BaseNode* child = r->next[slot];
		if (!child) continue;

		std::cout << slot << ':' << std::endl;

		for (int index = 0; index <= child->len; ++index)
		{
			const int count = child->pos[index];
			if (!count) continue;

			for (int k = 0; k < count; ++k)
				std::cout << child->s[k];

			std::cout << '$' << std::endl;
		}
	}
}

// Deletes the subtree rooted at `r`, children first, then resets `r` to
// nullptr. The pointer is taken by reference so that the caller's copy does
// not keep pointing at freed memory after the call; previously it was left
// dangling. Does nothing when `r` is already null.
void SuffixTree::destory(BaseNode*&r)
{
	if (!r) return;

	for (int slot = 0; slot < SIZE; ++slot)
		destory(r->next[slot]);

	delete r;
	r = nullptr;
}

int main()

{

	SuffixTree st;

	st.insert("banana");

	st.visual();

	if (st.match("na")) cout << "Yes" << endl;

	else cout << "No" << endl;

	return 0;

}

　　上面的后缀树都是对于一个字符串的处理方法，而广义后缀树将算法推广到了不同的字符串上，但我还没写过，改天补上。。。

　　参考：https://en.wikipedia.org/wiki/Suffix_tree

Suffix Tree（后缀树）的更多相关文章

后缀树(suffix tree)
参考: 从前缀树谈到后缀树后缀树 Suffix Tree-后缀树字典树(trie树).后缀树一.前缀树简述:又名单词查找树,tries树,一种多路树形结构,常用来操作字符串(但不限于字符串), ...
Trie / Radix Tree / Suffix Tree
Trie (字典树) "A", "to", "tea", "ted", "ten", "i ...
Trie树(代码)，后缀树（代码）
Trie树系列 Trie字典树压缩的Trie 后缀树Suffix tree 后缀树--ukkonen算法 Trie是通过对字符串进行预先处理,达到加快搜索速度的算法.即把文本中的字符串转换为树结构, ...
后缀树（Suffix Tree）
问题描述: 后缀树(Suffix Tree) 参考资料: http://www.cppblog.com/yuyang7/archive/2009/03/29 ...
后缀树的建立-Ukkonen算法
参考: Ukkonen算法讲解 Ukkonen算法动画 Ukkonen算法,以字符串abcabxabcd为例,先介绍一下运算过程,最后讨论一些我自己的理解. 需要维护以下三个变量: 当前扫描位置# 三 ...
笔试算法题（40）：后缀数组 & 后缀树（Suffix Array & Suffix Tree）
议题:后缀数组(Suffix Array) 分析: 后缀树和后缀数组都是处理字符串的有效工具,前者较为常见,但后者更容易编程实现,空间耗用更少:后缀数组可用于解决最长公共子串问题,多模式匹配问题,最长 ...
Suffix树，后缀树
body, table{font-family: 微软雅黑; font-size: 13.5pt} table{border-collapse: collapse; border: solid gra ...
后缀树(Suffix Trie)子串匹配结构
Suffix Trie 又称后缀Trie或后缀树.它与Trie树的最大不同在于,后缀Trie的字符串集合是由指定字符串的后缀子串构成的.比如.完整字符串"minimize"的后缀子 ...
CF504E Misha and LCP on Tree 后缀自动机+树链剖分+倍增
求树上两条路径的 LCP (树上每个节点代表一个字符) 总共写+调了6个多小时,终于过了~ 绝对是我写过的最复杂的数据结构了我们对这棵树进行轻重链剖分,然后把所有的重链分正串,反串插入到广义后缀自动 ...

随机推荐

java is 和 == ，以及equal
package string; public class MemAddrChange { public static void main(String[] args) { // const 常量区,
Python - 标准库部分函数、类的大致实现(持续更新)
all() def all(iterable): for element in iterbale: if not element: return False return True any() def ...
安装ipython[win/linux]
首先以win7 64位系统, python2.7.9为例,linux见底部 1.下载材料http://files.cnblogs.com/files/smileyes/ipython-win64.z ...
CDH安装时，无法纳管全部的节点的一个bug
问题描述: 使用CDH 5.2版本安装时,agent节点有12个.按照安装说明,在各个节点启动cm-agent之后,发现只有6个节点能被纳管.其它的节点总是无法加入纳管中. 在确认防火墙已经关闭后 ...
10day 系统安全优化
系统安全相关优化(将一些安全服务进行关闭) 1. 防火墙服务程序 centos6 查看防护墙服务状态 /etc/init.d/iptables status 临时关闭防火墙服务 /etc/init.d ...
Nuxt配置动态路由以及参数校验
动态路由就是带参数的路由.比如我们商品列表里很多商品详细页,这时候就需要动态路由的帮助了. 比如我们新建一个commodity文件夹,新建一个index.vue 文件,然后新建一个_id.vue (以 ...
Servlet转发
可以使用ServletContext中的getRequestDispatcher(url).forward(request, response)方法进行转发 myservlet2.java publi ...
Go并发介绍
1. 进程.线程.协程进程(Process),线程(Thread),协程(Coroutine,也叫轻量级线程) 进程进程是一个程序在一个数据集中的一次动态执行过程,可以简单理解为“正在执行的程序” ...
samba文件共享及账户映射
samba文件共享及账户映射实验介绍:在虚拟机Linux系统上安装sanmba服务,并在另外一台虚拟机的win7系统上访问共享文件夹,主要分为:匿名访问.身份验证访问.以及添加白名单和为了保护服务器 ...
每日扫盲(二)：xxx.dll文件的作用
DLL,dynamic-link library 动态链接库.我们看他的说明,是应用程序扩展.DLL内是一些程序的功能.由于使用静态链接库(static LIBrary,LIB)会使主程序变得臃肿,并 ...

Suffix Tree（后缀树）

Suffix Tree（后缀树）的更多相关文章

随机推荐

热门专题