本文使用压缩trie树实现字符串检索的功能。首先将字符串通过编码转化为二进制串,随后将二进制串插入到trie树中,在插入过程中同时实现压缩的功能。

字符编码采用Huffman,但最终测试发现不采用Huffman的方法不仅省下了编码时间,同时trie树的插入时间也有所减少。

 /**
程序主函数与编码
*/
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "huffman.h"
#include "compress_trie.h"
//#include <time.h> #define NUM_OF_HUFFMAN 81
#define LENGTH_OF_LINE 10000
#define RESULT_OF_HUFFMAN "result_of_HUFFMAN.dat"
//#define EMAIL "strpool.dat"
//#define CHECKED_EMAIL "checkedemail.dat"
#define RESULT "result.dat" void str_to_bin(char buf[],char binary[],huffman_node hufm[]); int main(int argc, char *argv[])
{
//time_t time_start,time_end;
//time_start = time(NULL); char* EMAIL = argv[];
char* CHECKED_EMAIL = argv[]; huffman_node hufm[NUM_OF_HUFFMAN];
hufm_init(hufm,NUM_OF_HUFFMAN);
char buf[LENGTH_OF_LINE];
char binary[LENGTH_OF_LINE]; FILE* fin_of_huffman;
fin_of_huffman = fopen(RESULT_OF_HUFFMAN,"r");
if(fin_of_huffman == NULL)
{
hufm_init(hufm,NUM_OF_HUFFMAN);
int i;
for(i=;i<(NUM_OF_HUFFMAN+)/;i++)
{
hufm[i].num_of_ch = NUM_OF_HUFFMAN - i;
}
huffman_coding(hufm,NUM_OF_HUFFMAN);
}
else
{
char temp_char;
int i;
for(i=;i<(NUM_OF_HUFFMAN+)/;i++)
{
fgets(buf,sizeof(buf),fin_of_huffman);
sscanf(buf,"%c %d %s",&temp_char,&hufm[i].num_of_ch,hufm[i].code);
}
}
fclose(fin_of_huffman); printf("building trie...");
FILE* fin_of_email;
fin_of_email = fopen(EMAIL,"r");
trie_node *root;
root = (trie_node*)malloc(sizeof(trie_node));
trie_node_init(&root); while(fgets(buf,sizeof(buf),fin_of_email)!=NULL)
{
str_to_bin(buf,binary,hufm);
trie_insert(&root,binary);
}
fclose(fin_of_email);
printf("\r");
printf("build trie success.\n"); FILE *fin_of_checked,*fout_of_result;
fin_of_checked = fopen(CHECKED_EMAIL,"r");
fout_of_result = fopen(RESULT,"w");
int num_yes = ;
int num_no = ;
while(fgets(buf,sizeof(buf),fin_of_checked)!=NULL)
{
str_to_bin(buf,binary,hufm);
if(trie_search(root,binary))
{
fprintf(fout_of_result,"YES\n");
num_yes++;
}
else
{
fprintf(fout_of_result,"NO\n");
num_no++;
}
}
fprintf(fout_of_result,"num of YES is:%d\n",num_yes);
fprintf(fout_of_result,"num of NO is:%d\n",num_no);
printf("search success!\n");
fclose(fin_of_checked);
fclose(fout_of_result);
//time_end = time(NULL);
//printf("用时:%.0lfs\n", difftime(time_end, time_start));
return ;
} void str_to_bin(char buf[],char binary[],huffman_node hufm[])
{
int i;
binary[] = '\0';
for(i=strlen(buf)-;i>=;i--)
{
if(buf[i]>='a' && buf[i]<='z')
{
strcat(binary,hufm[buf[i]-'a'].code);
}
else if(buf[i]>='A' && buf[i]<='Z')
{
strcat(binary,hufm[buf[i]-'A'].code);
}
else if(buf[i]>='' && buf[i]<='')
{
strcat(binary,hufm[+buf[i]-''].code);
}
else if(buf[i]=='_')
{
strcat(binary,hufm[].code);
}
else if(buf[i]=='-')
{
strcat(binary,hufm[].code);
}
else if(buf[i]=='.')
{
strcat(binary,hufm[].code);
}
else if(buf[i]=='@')
{
strcat(binary,hufm[].code);
}
else
{
strcat(binary,hufm[].code);
}
}
}
 /**
 * Compressed binary trie over strings of '0'/'1' characters: insertion,
 * lookup and destruction.
 *
 * Node layout: a node first owns a run of num_of_bit "path" bits (the
 * compressed part), then the is_str flag (a stored string ends right after
 * that run), then two child edges that each consume one further bit.
 *
 * NOTE(review): the numeric literals of this listing were lost when the
 * article was extracted.  32 (inline threshold), 8 (bits per byte) and the
 * 0/1 flag and return values are reconstructed from context - confirm them
 * against the original project.
 */

/* Runs of at most this many bits are stored directly in the pointer field
 * compress_of_bit instead of in a heap array.  Requires pointers of at
 * least 32 bits. */
#define TRIE_INLINE_BITS 32
/* Largest run length that is guaranteed to fit in unsigned short. */
#define TRIE_MAX_BITS 65535u

typedef struct TRIE_NODE
{
    char is_str;                    /* 1 if a stored string ends at this node */
    unsigned short num_of_bit;      /* length of the compressed path, in bits */
    /* num_of_bit <= TRIE_INLINE_BITS: the bits themselves, kept in the
     * pointer value.  Otherwise: heap array of ceil(num_of_bit / 8) bytes.
     * Bit j lives at byte j/8, position j%8; the FIRST path bit has the
     * highest index (num_of_bit - 1), the last one index 0. */
    unsigned char *compress_of_bit;
    struct TRIE_NODE *point_of_zero, *point_of_one; /* children for '0' / '1' */
} trie_node;

void trie_node_init(trie_node **root);
int trie_insert(trie_node **root,char* bit_of_insert);
int trie_search(trie_node *root,char* bit_of_insert);
void trie_delete(trie_node *root);
void compress(trie_node *root,char* bit_of_insert);
int compare_of_bit(trie_node *root,char* bit_of_insert);
void pop_bit(trie_node *root,char* bit_of_pop,int len_of_pop);

/* malloc that terminates the program on failure.  The original dereferenced
 * unchecked malloc results, so out-of-memory already meant a crash; this
 * makes the policy explicit. */
static void *trie_xmalloc(size_t size)
{
    void *mem = malloc(size);
    if (mem == NULL)
    {
        fprintf(stderr, "compress_trie: out of memory\n");
        exit(EXIT_FAILURE);
    }
    return mem;
}

/* Number of bytes needed to hold `bits` bits. */
static size_t trie_bytes_for_bits(int bits)
{
    return ((size_t)bits + 7u) / 8u;
}

/* Read path bit j (0 = last bit of the run) of a node as 0 or 1. */
static int trie_node_bit(const trie_node *node, int j)
{
    if (node->num_of_bit <= TRIE_INLINE_BITS)
    {
        return (int)(((uintptr_t)node->compress_of_bit >> j) & 1u);
    }
    return (node->compress_of_bit[j / 8] >> (j % 8)) & 1;
}

/* Allocate and initialise an empty node. */
static trie_node *trie_node_new(void)
{
    trie_node *node = trie_xmalloc(sizeof *node);
    trie_node_init(&node);
    return node;
}

/* Return the address of the child link selected by `bit` ('0' -> zero child,
 * anything else -> one child), creating an empty child if there is none. */
static trie_node **trie_child_slot(trie_node *node, char bit)
{
    trie_node **slot = (bit == '0') ? &node->point_of_zero : &node->point_of_one;
    if (*slot == NULL)
    {
        *slot = trie_node_new();
    }
    return slot;
}

/**
 * Reset an already allocated node to the empty state (no path bits, not a
 * string end, no children).  Does nothing when root or *root is NULL.
 */
void trie_node_init(trie_node **root)
{
    trie_node *node;

    if (root == NULL || *root == NULL)
    {
        return;
    }
    node = *root;
    node->is_str = (char)0;
    node->num_of_bit = 0;
    node->compress_of_bit = NULL;
    node->point_of_zero = NULL;
    node->point_of_one = NULL;
}

/**
 * Store the whole of bit_of_insert as the compressed path of root.
 *
 * Every character other than '0' is stored as a 1 bit.  Preconditions: the
 * node currently has no path bits (any previous heap array is not freed
 * here) and strlen(bit_of_insert) <= TRIE_MAX_BITS.
 */
void compress(trie_node *root,char* bit_of_insert)
{
    int len_of_insert = (int)strlen(bit_of_insert);
    int i, j;

    root->num_of_bit = (unsigned short)len_of_insert;
    if (len_of_insert <= TRIE_INLINE_BITS)
    {
        /* Start from zero: the original packed into an uninitialised int. */
        uintptr_t packed = 0;
        for (i = len_of_insert - 1, j = 0; i >= 0; i--, j++)
        {
            if (bit_of_insert[i] != '0')
            {
                packed |= (uintptr_t)1 << j;
            }
        }
        root->compress_of_bit = (unsigned char *)packed;
    }
    else
    {
        size_t num_of_byte = trie_bytes_for_bits(len_of_insert);
        unsigned char *bytes = trie_xmalloc(num_of_byte);

        memset(bytes, 0, num_of_byte);
        for (i = len_of_insert - 1, j = 0; i >= 0; i--, j++)
        {
            if (bit_of_insert[i] != '0')
            {
                bytes[j / 8] |= (unsigned char)(1u << (j % 8));
            }
        }
        root->compress_of_bit = bytes;
    }
}

/**
 * Insert the bit string bit_of_insert below *root.
 *
 * *root may be replaced by a newly created parent when the node's compressed
 * path has to be split.
 *
 * @return 1 when the string is stored; 0 when root, *root or bit_of_insert
 *         is NULL, or when a run longer than TRIE_MAX_BITS would be needed.
 */
int trie_insert(trie_node **root,char* bit_of_insert)
{
    trie_node *node;
    trie_node *father;
    trie_node **sibling;
    char *bit_of_pop;
    char first_popped;
    int node_bits, matched, ret;

    if (root == NULL || *root == NULL || bit_of_insert == NULL)
    {
        return 0;
    }
    node = *root;

    /* ---- Node without compressed path ---- */
    if (node->num_of_bit == 0)
    {
        if (*bit_of_insert == '\0')
        {
            node->is_str = (char)1;
            return 1;
        }
        if (node->is_str == 0
            && node->point_of_zero == NULL
            && node->point_of_one == NULL)
        {
            /* Completely empty leaf: absorb the whole remainder. */
            if (strlen(bit_of_insert) > TRIE_MAX_BITS)
            {
                return 0; /* would not fit in num_of_bit */
            }
            compress(node, bit_of_insert);
            node->is_str = (char)1;
            return 1;
        }
        return trie_insert(trie_child_slot(node, *bit_of_insert), bit_of_insert + 1);
    }

    /* ---- Node with compressed path ---- */
    node_bits = node->num_of_bit;
    matched = compare_of_bit(node, bit_of_insert);

    if (matched == node_bits)
    {
        /* Whole path matches: stop here or continue in a child. */
        bit_of_insert += matched;
        if (*bit_of_insert == '\0')
        {
            node->is_str = (char)1;
            return 1;
        }
        return trie_insert(trie_child_slot(node, *bit_of_insert), bit_of_insert + 1);
    }

    /* Path and string diverge after `matched` bits (matched < node_bits), or
     * the string ends inside the path.  Split: a new parent takes the common
     * prefix, the next path bit becomes the edge to the old node, and the old
     * node keeps the rest.  The scratch buffer is sized exactly; the
     * original's fixed array size was lost with the other literals. */
    bit_of_pop = trie_xmalloc((size_t)matched + 2u);
    pop_bit(node, bit_of_pop, matched + 1);
    first_popped = bit_of_pop[matched];
    bit_of_pop[matched] = '\0';

    father = trie_node_new();
    if (matched > 0)
    {
        compress(father, bit_of_pop);
    }
    free(bit_of_pop);

    if (first_popped == '0')
    {
        father->point_of_zero = node;
        sibling = &father->point_of_one;
    }
    else
    {
        father->point_of_one = node;
        sibling = &father->point_of_zero;
    }

    bit_of_insert += matched;
    if (*bit_of_insert == '\0')
    {
        /* The inserted string is a proper prefix of the old path.  The
         * original left the return value uninitialised on this branch. */
        father->is_str = (char)1;
        ret = 1;
    }
    else
    {
        /* The differing bit is consumed by the edge to the new sibling. */
        *sibling = trie_node_new();
        ret = trie_insert(sibling, bit_of_insert + 1);
    }
    *root = father;
    return ret;
}

/**
 * Look up a bit string.
 *
 * @return the is_str flag (1) of the node where the string ends exactly, or
 *         0 when the string is not stored, contains a character other than
 *         '0'/'1', or an argument is NULL.
 */
int trie_search(trie_node *root,char *bit_of_search)
{
    trie_node *p = root;

    if (bit_of_search == NULL)
    {
        return 0;
    }
    while (p != NULL)
    {
        if (p->num_of_bit != 0)
        {
            /* The whole compressed path must match.  This also rejects a
             * query that ends inside the path, including the empty query on
             * a root that owns path bits (which the original accepted). */
            if (compare_of_bit(p, bit_of_search) != (int)p->num_of_bit)
            {
                return 0;
            }
            bit_of_search += p->num_of_bit;
        }
        if (*bit_of_search == '\0')
        {
            return p->is_str;
        }
        if (*bit_of_search == '0')
        {
            p = p->point_of_zero;
        }
        else if (*bit_of_search == '1')
        {
            p = p->point_of_one;
        }
        else
        {
            return 0; /* the original never advanced here and could spin forever */
        }
        bit_of_search++;
    }
    return 0;
}

/**
 * Free a whole subtree, including heap-held path bits (which the original
 * leaked).  Accepts NULL.
 */
void trie_delete(trie_node *root)
{
    if (root == NULL)
    {
        return;
    }
    trie_delete(root->point_of_zero);
    trie_delete(root->point_of_one);
    if (root->num_of_bit > TRIE_INLINE_BITS)
    {
        free(root->compress_of_bit); /* inline runs own no heap memory */
    }
    free(root);
}

/**
 * Count how many leading characters of bit_of_insert equal the leading path
 * bits of root.
 *
 * @return a value in 0 .. min(num_of_bit, strlen(bit_of_insert)).
 */
int compare_of_bit(trie_node *root,char* bit_of_insert)
{
    int node_bits = root->num_of_bit;
    int i = 0;

    /* Stop at the string end instead of calling strlen on every node. */
    while (i < node_bits
           && bit_of_insert[i] != '\0'
           && bit_of_insert[i] - '0' == trie_node_bit(root, node_bits - 1 - i))
    {
        i++;
    }
    return i;
}

/**
 * Remove the first len_of_pop path bits of root and return them as text.
 *
 * @param root       Node whose compressed path is shortened.
 * @param bit_of_pop Receives the removed bits as '0'/'1' characters plus a
 *                   terminating NUL; needs len_of_pop + 1 bytes.
 * @param len_of_pop Number of bits to remove; clamped to 0 .. num_of_bit.
 *
 * The storage follows the new length: freed when no bits remain, moved from
 * the heap into the pointer field when the rest fits inline, and reallocated
 * when the byte count shrinks.
 */
void pop_bit(trie_node *root,char* bit_of_pop,int len_of_pop)
{
    int old_bits = root->num_of_bit;
    int new_bits;
    int i, j;

    if (len_of_pop < 0)
    {
        len_of_pop = 0;
    }
    if (len_of_pop > old_bits)
    {
        len_of_pop = old_bits;
    }
    /* int, not short: a short overflows for runs longer than 32767 bits. */
    new_bits = old_bits - len_of_pop;

    for (i = 0, j = old_bits - 1; i < len_of_pop; i++, j--)
    {
        bit_of_pop[i] = (char)('0' + trie_node_bit(root, j));
    }
    bit_of_pop[i] = '\0';

    if (old_bits > TRIE_INLINE_BITS)
    {
        unsigned char *old_bytes = root->compress_of_bit;

        if (new_bits == 0)
        {
            free(old_bytes);
            root->compress_of_bit = NULL; /* do not leave a dangling pointer */
        }
        else if (new_bits <= TRIE_INLINE_BITS)
        {
            /* The remainder now fits inline: pack it and drop the array. */
            uintptr_t packed = 0;
            for (j = 0; j < new_bits; j++)
            {
                if ((old_bytes[j / 8] >> (j % 8)) & 1)
                {
                    packed |= (uintptr_t)1 << j;
                }
            }
            free(old_bytes);
            root->compress_of_bit = (unsigned char *)packed;
        }
        else if (trie_bytes_for_bits(new_bits) != trie_bytes_for_bits(old_bits))
        {
            /* The remaining bits are the low-indexed ones, i.e. the first
             * bytes of the array, so a prefix copy is enough. */
            size_t num_of_byte = trie_bytes_for_bits(new_bits);
            unsigned char *shrunk = trie_xmalloc(num_of_byte);

            memcpy(shrunk, old_bytes, num_of_byte);
            free(old_bytes);
            root->compress_of_bit = shrunk;
        }
    }
    else if (new_bits == 0)
    {
        root->compress_of_bit = NULL;
    }
    /* Otherwise inline storage stays as it is: the removed bits sit above
     * index new_bits - 1 and are never read again. */

    root->num_of_bit = (unsigned short)new_bits;
}

大规模字符串检索-压缩trie树的更多相关文章

  1. 835. 字符串统计(Trie树模板题)

    维护一个字符串集合,支持两种操作: “I x”向集合中插入一个字符串x: “Q x”询问一个字符串在集合中出现了多少次. 共有N个操作,输入的字符串总长度不超过 $10^5$,字符串仅包含小写英文字母 ...

  2. Trie树|字典树(字符串排序)

    有时,我们会碰到对字符串的排序,若采用一些经典的排序算法,则时间复杂度一般为O(n*lgn),但若采用Trie树,则时间复杂度仅为O(n). Trie树又名字典树,从字面意思即可理解,这种树的结构像英 ...

  3. Trie树及其应用

    Trie树及其应用 Trie树 Trie树,又称单词查找树.字典树,是一种树形结构,是一种哈希树的变种,是一种用于快速检索的多叉树结构.典型应用是用于统计和排序大量的字符串(但不仅限于字符串),所以经 ...

  4. Trie树(代码),后缀树(代码)

    Trie树系列 Trie字典树 压缩的Trie 后缀树Suffix tree 后缀树--ukkonen算法 Trie是通过对字符串进行预先处理,达到加快搜索速度的算法.即把文本中的字符串转换为树结构, ...

  5. [转]双数组TRIE树原理

    原文名称: An Efficient Digital Search Algorithm by Using a Double-Array Structure 作者: JUN-ICHI AOE 译文: 使 ...

  6. 【动画】看动画轻松理解「Trie树」

    Trie树 Trie这个名字取自“retrieval”,检索,因为Trie可以只用一个前缀便可以在一部字典中找到想要的单词. 虽然发音与「Tree」一致,但为了将这种 字典树 与 普通二叉树 以示区别 ...

  7. Trie树(Prefix Tree)介绍

    本文用尽量简洁的语言介绍一种树形数据结构 -- Trie树. 一.什么是Trie树 Trie树,又叫字典树.前缀树(Prefix Tree).单词查找树 或 键树,是一种多叉树结构.如下图: 上图是一 ...

  8. 数据结构与算法—Trie树

    Trie,又经常叫前缀树,字典树等等.它有很多变种,如后缀树,Radix Tree/Trie,PATRICIA tree,以及bitwise版本的crit-bit tree.当然很多名字的意义其实有交 ...

  9. trie树(前缀树)详解——PHP代码实现

    trie树常用于搜索提示.如当输入一个网址,可以自动搜索出可能的选择.当没有完全匹配的搜索结果,可以返回前缀最相似的可能. 一.Trie树的基本性质 根节点不包含字符,除根节点外每一个节点都只包含一个 ...

随机推荐

  1. bzoj2064

    这道题初看真的毫无思路,又是合并又是分裂的 但实际上我们知道,当两组和相等的时候才能由一组变成另一组 我们将初始状态和最终状态划分成若干对,每对中的两组元素和相等的 不难发现,最少步骤=n+m-2*对 ...

  2. bzoj1853 bzoj2393

    两题是类似的,这里说一下bzoj1853 首先我们求出所有的幸运号码,注意如果存在x是y的倍数则x不算在内,避免之后重复计算 下面我们就要统计幸运号码的倍数了,这显然是要用到容斥原理的 但是幸运号码很 ...

  3. Node.js权威指南 (1) - Node.js介绍

    1.1 Node.js概述 / 2 1.1.1 使用Node.js能够解决什么问题 / 2 1.1.2 实现高性能服务器 / 2 1.1.3 非阻塞型I/O及事件环机制 / 2 1.1.4 Node. ...

  4. 两次fopen不同的文件返回相同的FILE* 地址

    最近接触一个垃圾程序,出现一个奇怪的bug,现象是两次fopen不同的文件返回相同的FILE*地址,第二次返回的FILE*有时候无端端的就被关闭了.以下代码是对这个bug的概括: auto fp1 = ...

  5. 【转】XCode、Cocoa、Objective-C 的关系区别

    原文网址:http://blog.sina.com.cn/s/blog_5e89e1ff0100z4k1.html Object-Ciphone开发用的编程语言不是c,c++,java 而是objec ...

  6. (转载)JavaScript中定义变量

    (转载)http://blog.163.com/xuxiaoqianhz@126/blog/static/165190577201061594421870/ JavaScript中定义变量有两种方式: ...

  7. Apache 整合 Tomcat (首先Apache 发布的是PHP项目,占用端口80,tomcat 发布的是Java 项目,占用端口8080)

    情况简介: Apache 整合 Tomcat (首先Apache 发布的是PHP项目,占用端口80,tomcat 发布的是Java 项目,占用端口8080),而现在是虚拟出来两个域名(希望这两个域名都 ...

  8. Eclipse中svn图标不显示

    在菜单栏中:windows ->preferences->General->Appearance->Lable Decorations 勾选其中的 SVN 项,最后应用确认之后 ...

  9. KiCad中层定义

    5.2.1. Paired layers The Adhesives layers (Copper and Component):    These are used in the applicati ...

  10. The equation - SGU 106(扩展欧几里得)

    题目大意:有一个二元一次方程,给出系数值和x与y的取值范围,求出来总共有多少对整数解. 分析:有以下几点情况. 1,系数a=0, b=0, 当c != 0的时候结果很明显是无解,当c=0的时候x,y可 ...