SQL分词器1.10版
处理SQL及分词效果:
select * from ( select rownum as rn,tb1.stuid,tb1.summary from ( select stuid,sum(score) as summary from gk_score group by stuid order by summary desc ) tb1 order by tb1.summary desc ) tb2 where rn<11
Index Type No Text Type Desc
------------------------------------------------------------------------------------
0 1 select KW:select
1 2 * Text
2 4 from KW:from
3 18 ( (
4 1 select KW:select
5 2 rownum Text
6 13 as KW:as
7 2 rn Text
8 3 , Comma
9 2 tb1.stuid Text
10 3 , Comma
11 2 tb1.summary Text
12 4 from KW:from
13 18 ( (
14 1 select KW:select
15 2 stuid Text
16 3 , Comma
17 2 sum Text
18 18 ( (
19 2 score Text
20 19 ) )
21 13 as KW:as
22 2 summary Text
23 4 from KW:from
24 2 gk_score Text
25 14 group KW:group
26 10 by KW:by
27 2 stuid Text
28 9 order KW:order
29 10 by KW:by
30 2 summary Text
31 11 desc KW:asc
32 19 ) )
33 2 tb1 Text
34 9 order KW:order
35 10 by KW:by
36 2 tb1.summary Text
37 11 desc KW:asc
38 19 ) )
39 2 tb2 Text
40 5 where KW:where
41 2 rn Text
42 16 < <
43 2 11 Text
程序:
package com.heyang.easysql.lex10; import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * One lexical token produced by {@link Lexer}: the matched text plus an integer type code.
 *
 * <p>The type codes are the {@code TYPE_*} constants below. Anything that is neither a known
 * keyword nor a known operator is classified as {@link #TYPE_TEXT}.
 */
class Token {
    static final int TYPE_SELECT = 1;
    static final int TYPE_TEXT = 2;
    static final int TYPE_COMMA = 3;
    static final int TYPE_FROM = 4;
    static final int TYPE_WHERE = 5;
    static final int TYPE_AND = 6;
    static final int TYPE_EQUAL = 7;
    static final int TYPE_OR = 8;
    static final int TYPE_ORDER = 9;
    static final int TYPE_BY = 10;
    static final int TYPE_ASC = 11;
    static final int TYPE_DESC = 12;
    static final int TYPE_AS = 13;
    static final int TYPE_GROUP = 14;
    static final int TYPE_HAVING = 15;
    static final int TYPE_LESSTHAN = 16;
    static final int TYPE_GREATERTHAN = 17;
    static final int TYPE_OPEN_PARENTHESIS = 18;
    static final int TYPE_CLOSE_PARENTHESIS = 19;
    static final int TYPE_CONNECT = 20;
    static final int TYPE_LESSTHAN_OR_EQUAL = 21;
    static final int TYPE_GREATERTHAN_OR_EQUAL = 22;
    static final int TYPE_LESSTHAN_OR_GREATERTHAN = 23;
    static final int TYPE_CASE = 24;
    static final int TYPE_WHEN = 25;
    static final int TYPE_THEN = 26;
    static final int TYPE_ELSE = 27;
    static final int TYPE_END = 28;
    static final int TYPE_IS = 29;
    static final int TYPE_NULL = 30;
    static final int TYPE_TRUE = 31;
    static final int TYPE_FALSE = 32;
    static final int TYPE_PLUS = 33;
    static final int TYPE_MINUS = 34;
    // Code 35 was TYPE_MULTI and is currently disabled: the lexer emits '*' as TYPE_TEXT.
    static final int TYPE_DEVIDE = 36;
    static final int TYPE_DISTINCT = 37;
    static final int TYPE_OVER = 38;
    static final int TYPE_STRING_CONCAT = 39;
    static final int TYPE_ON = 40;
    static final int TYPE_JOIN = 41;
    static final int TYPE_INNER = 42;
    static final int TYPE_LEFT = 43;
    static final int TYPE_RIGHT = 44;
    static final int TYPE_OUTER = 45;
    static final int TYPE_FULL = 46;
    static final int TYPE_WITHIN = 47;
    static final int TYPE_PARTITION = 48;

    /** One of the {@code TYPE_*} codes. */
    int type;

    /** The token text exactly as it appeared in the SQL. */
    String text;

    /**
     * Creates a token for a single-character symbol.
     *
     * @param c the symbol character
     * @param type one of the {@code TYPE_*} codes
     */
    public Token(char c, int type) {
        this.text = String.valueOf(c);
        this.type = type;
    }

    /**
     * Creates a token for a word or a multi-character operator.
     *
     * @param word the token text
     * @param type one of the {@code TYPE_*} codes
     */
    public Token(String word, int type) {
        this.text = word;
        this.type = type;
    }

    /**
     * Returns the display label of this token's type.
     *
     * @return {@code "KW:<keyword>"} for keywords, the operator itself for operators,
     *     {@code "Comma"} / {@code "Text"} for those two types, or {@code null} when the
     *     type code is not one of the known {@code TYPE_*} constants
     */
    public String getTypeStr() {
        switch (type) {
            case TYPE_SELECT:
                return "KW:select";
            case TYPE_FROM:
                return "KW:from";
            case TYPE_COMMA:
                return "Comma";
            case TYPE_TEXT:
                return "Text";
            case TYPE_WHERE:
                return "KW:where";
            case TYPE_AND:
                return "KW:and";
            case TYPE_EQUAL:
                return "=";
            case TYPE_OR:
                return "KW:or";
            case TYPE_ORDER:
                return "KW:order";
            case TYPE_BY:
                return "KW:by";
            case TYPE_ASC:
                return "KW:asc";
            case TYPE_DESC:
                return "KW:desc";
            case TYPE_AS:
                return "KW:as";
            case TYPE_GROUP:
                return "KW:group";
            case TYPE_HAVING:
                return "KW:having";
            case TYPE_LESSTHAN:
                return "<";
            case TYPE_GREATERTHAN:
                return ">";
            case TYPE_OPEN_PARENTHESIS:
                return "(";
            case TYPE_CLOSE_PARENTHESIS:
                return ")";
            case TYPE_CONNECT:
                return "KW:connect";
            case TYPE_LESSTHAN_OR_EQUAL:
                return "<=";
            case TYPE_GREATERTHAN_OR_EQUAL:
                return ">=";
            case TYPE_LESSTHAN_OR_GREATERTHAN:
                return "<>";
            case TYPE_CASE:
                return "KW:case";
            case TYPE_WHEN:
                return "KW:when";
            case TYPE_THEN:
                return "KW:then";
            case TYPE_ELSE:
                return "KW:else";
            case TYPE_END:
                return "KW:end";
            case TYPE_IS:
                return "KW:is";
            case TYPE_NULL:
                return "KW:null";
            case TYPE_TRUE:
                return "KW:true";
            case TYPE_FALSE:
                return "KW:false";
            case TYPE_PLUS:
                return "+";
            case TYPE_MINUS:
                return "-";
            case TYPE_DEVIDE:
                return "/";
            case TYPE_DISTINCT:
                return "KW:distinct";
            case TYPE_OVER:
                return "KW:over";
            case TYPE_STRING_CONCAT:
                return "||";
            case TYPE_ON:
                return "KW:on";
            case TYPE_JOIN:
                return "KW:join";
            case TYPE_INNER:
                return "KW:inner";
            case TYPE_LEFT:
                return "KW:left";
            case TYPE_RIGHT:
                return "KW:right";
            case TYPE_OUTER:
                return "KW:outer";
            case TYPE_FULL:
                return "KW:full";
            case TYPE_WITHIN:
                return "KW:within";
            case TYPE_PARTITION:
                return "KW:partition";
            default:
                return null;
        }
    }
}

/**
 * Splits a SQL string into a flat list of {@link Token}s.
 *
 * <p>Whitespace separates tokens. The symbols {@code , = < > ( ) + - /} and the two-character
 * operators {@code <= <> >= ||} are tokens of their own and also terminate the word before
 * them. Every other run of characters is looked up case-insensitively in the keyword table and
 * becomes either a keyword token or a {@link Token#TYPE_TEXT} token.
 *
 * <p>Limitations: quoted string literals are not recognised (whitespace or symbols inside
 * quotes still split the text), and {@code *} is emitted as plain text.
 */
public class Lexer {
    /** Returned by the operator classifiers when the input is not an operator. */
    private static final int NO_OPERATOR = -1;

    /** A keyword spelling and the token type it maps to. */
    private static final class Keyword {
        final String word;
        final int type;

        Keyword(String word, int type) {
            this.word = word;
            this.type = type;
        }
    }

    /** Keyword table, built once instead of on every lookup. */
    private static final Keyword[] KEYWORDS = {
        new Keyword("select", Token.TYPE_SELECT),
        new Keyword("from", Token.TYPE_FROM),
        new Keyword("where", Token.TYPE_WHERE),
        new Keyword("and", Token.TYPE_AND),
        new Keyword("or", Token.TYPE_OR),
        new Keyword("order", Token.TYPE_ORDER),
        new Keyword("by", Token.TYPE_BY),
        new Keyword("asc", Token.TYPE_ASC),
        // Previously "desc" was mapped to TYPE_ASC, so descending sorts were reported as KW:asc.
        new Keyword("desc", Token.TYPE_DESC),
        new Keyword("as", Token.TYPE_AS),
        new Keyword("group", Token.TYPE_GROUP),
        new Keyword("having", Token.TYPE_HAVING),
        new Keyword("connect", Token.TYPE_CONNECT),
        new Keyword("case", Token.TYPE_CASE),
        new Keyword("when", Token.TYPE_WHEN),
        new Keyword("then", Token.TYPE_THEN),
        new Keyword("else", Token.TYPE_ELSE),
        new Keyword("end", Token.TYPE_END),
        new Keyword("is", Token.TYPE_IS),
        new Keyword("null", Token.TYPE_NULL),
        new Keyword("true", Token.TYPE_TRUE),
        new Keyword("false", Token.TYPE_FALSE),
        new Keyword("distinct", Token.TYPE_DISTINCT),
        new Keyword("over", Token.TYPE_OVER),
        new Keyword("on", Token.TYPE_ON),
        new Keyword("join", Token.TYPE_JOIN),
        new Keyword("inner", Token.TYPE_INNER),
        new Keyword("left", Token.TYPE_LEFT),
        new Keyword("right", Token.TYPE_RIGHT),
        new Keyword("outer", Token.TYPE_OUTER),
        new Keyword("full", Token.TYPE_FULL),
        new Keyword("within", Token.TYPE_WITHIN),
        new Keyword("partition", Token.TYPE_PARTITION),
    };

    private List<Token> tokenList;

    /**
     * Tokenizes {@code inputSql}; the result is available through {@link #getTokenList()} and
     * {@link #printTokenList()}.
     *
     * @param inputSql the SQL text to tokenize
     * @throws NullPointerException if {@code inputSql} is {@code null}
     */
    public Lexer(String inputSql) {
        if (inputSql == null) {
            throw new NullPointerException("inputSql must not be null");
        }
        final String sql = pretreat(inputSql);
        final StringBuilder pending = new StringBuilder();
        tokenList = new ArrayList<Token>();

        for (int i = 0; i < sql.length(); i++) {
            final char c = sql.charAt(i);

            if (Character.isWhitespace(c)) {
                flushPending(pending);
                continue;
            }

            // Two-character operators must be tried before their one-character prefixes.
            if (i + 1 < sql.length()) {
                final int pairType = twoCharOperatorType(c, sql.charAt(i + 1));
                if (pairType != NO_OPERATOR) {
                    flushPending(pending);
                    tokenList.add(new Token(sql.substring(i, i + 2), pairType));
                    i++; // the second operator character has been consumed as well
                    continue;
                }
            }

            final int singleType = singleCharOperatorType(c);
            if (singleType != NO_OPERATOR) {
                flushPending(pending);
                tokenList.add(new Token(c, singleType));
                continue;
            }

            // Ordinary character. A lone '|' also lands here and is kept as text; it used to be
            // dropped silently, which fused the words on either side of it.
            pending.append(c);
        }
        // pretreat() guarantees trailing whitespace, so this is only a safety net.
        flushPending(pending);
    }

    /**
     * Returns a read-only view of the tokens in source order.
     *
     * @return an unmodifiable list of the tokens produced by the constructor
     */
    public List<Token> getTokenList() {
        return Collections.unmodifiableList(tokenList);
    }

    /** Emits the accumulated word (if any) as a keyword/text token and clears the buffer. */
    private void flushPending(StringBuilder pending) {
        addTextToList(pending.toString());
        pending.setLength(0);
    }

    /** Returns the token type of the two-character operator {@code first+second}, or NO_OPERATOR. */
    private static int twoCharOperatorType(char first, char second) {
        if (first == '<' && second == '=') {
            return Token.TYPE_LESSTHAN_OR_EQUAL;
        }
        if (first == '<' && second == '>') {
            return Token.TYPE_LESSTHAN_OR_GREATERTHAN;
        }
        if (first == '>' && second == '=') {
            return Token.TYPE_GREATERTHAN_OR_EQUAL;
        }
        if (first == '|' && second == '|') {
            return Token.TYPE_STRING_CONCAT;
        }
        return NO_OPERATOR;
    }

    /** Returns the token type of the one-character symbol {@code c}, or NO_OPERATOR. */
    private static int singleCharOperatorType(char c) {
        switch (c) {
            case ',':
                return Token.TYPE_COMMA;
            case '=':
                return Token.TYPE_EQUAL;
            case '<':
                return Token.TYPE_LESSTHAN;
            case '>':
                return Token.TYPE_GREATERTHAN;
            case '(':
                return Token.TYPE_OPEN_PARENTHESIS;
            case ')':
                return Token.TYPE_CLOSE_PARENTHESIS;
            case '+':
                return Token.TYPE_PLUS;
            case '-':
                return Token.TYPE_MINUS;
            case '/':
                return Token.TYPE_DEVIDE;
            default:
                return NO_OPERATOR;
        }
    }

    /**
     * Classifies a word as a keyword or plain text.
     *
     * @param text the word to classify
     * @return the keyword's type code when {@code text} matches a keyword ignoring case,
     *     otherwise {@link Token#TYPE_TEXT}
     */
    private static int findTypeByText(String text) {
        for (Keyword keyword : KEYWORDS) {
            if (keyword.word.equalsIgnoreCase(text)) {
                return keyword.type;
            }
        }
        return Token.TYPE_TEXT;
    }

    /** Classifies {@code text} and appends it to the token list unless it is blank. */
    private void addTextToList(String text) {
        addToken2List(text, findTypeByText(text));
    }

    /** Appends a token with the given text and type; blank text is ignored. */
    private void addToken2List(String text, int type) {
        if (text.trim().isEmpty()) {
            return;
        }
        tokenList.add(new Token(text, type));
    }

    /**
     * Prints the token list to standard output as a table with the columns
     * Index, Type No, Text and Type Desc.
     */
    public void printTokenList() {
        final String separator = createRepeatedStr("-", 84);
        final String layout = "%-20s %-20s %-20s %-20s %s";
        StringBuilder sb = new StringBuilder();

        sb.append(String.format(layout, "Index", "Type No", "Text", "Type Desc", "\n"));
        sb.append(separator + "\n");
        int index = 0;
        for (Token token : tokenList) {
            sb.append(String.format(layout, String.valueOf(index), String.valueOf(token.type),
                    token.text, token.getTypeStr(), "\n"));
            index++;
        }
        System.out.println(sb.toString());
    }

    /** Returns {@code seed} repeated {@code n} times. */
    private static String createRepeatedStr(String seed, int n) {
        return String.join("", Collections.nCopies(n, seed));
    }

    /** Trims the input and appends one space so the final word is always terminated. */
    private String pretreat(String raw) {
        return raw.trim() + " ";
    }

    /**
     * Reads a SQL file, echoes the normalised SQL and prints its token table.
     *
     * @param args optional; {@code args[0]} is the path of the SQL file. When absent the
     *     historical default {@code c:\temp\12.sql} is used.
     * @throws Exception if the file cannot be read
     */
    public static void main(String[] args) throws Exception {
        String path = args.length > 0 ? args[0] : "c:\\temp\\12.sql";
        String sql = removeExtraSpace(readSqlFromFile(path));
        System.out.println(sql);
        new Lexer(sql).printTokenList();
    }

    /**
     * Reads a UTF-8 text file into a single string, joining its lines with one space.
     *
     * <p>The separator matters: without it the last word of one line and the first word of the
     * next would be fused into a single token.
     *
     * @param filePath path of the file to read
     * @return the file content with line breaks replaced by single spaces
     * @throws Exception if the file cannot be opened or read
     */
    private static String readSqlFromFile(String filePath) throws Exception {
        StringBuilder sb = new StringBuilder();
        // try-with-resources closes the reader even when readLine() throws.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(
                new FileInputStream(filePath), java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                if (sb.length() > 0) {
                    sb.append(' ');
                }
                sb.append(line);
            }
        }
        return sb.toString();
    }

    /** Collapses every run of two or more whitespace characters into one space. */
    private static String removeExtraSpace(String raw) {
        return raw.replaceAll("\\s{2,}", " ");
    }
}
--2020年5月13日 16点07分--
SQL分词器1.10版的更多相关文章
- elasticsearch安装ik分词器(极速版)
简介:下面讲有我已经打包并且编辑过的zip包,你可以在下面下载即可. 1.下载zip包.elasticsearch-analysis-ik-1.8.0.jar下面有附件链接[ik-安装包.zip],下 ...
- elasticsearch 6.2.4 安装 elasticsearch-analysis-ik 分词器 (windows 10下)
访问 https://github.com/medcl/elasticsearch-analysis-ik 找 releases 找到对应的 es 版本 下载 elasticsearch-analy ...
- 用lucene4.10.2分词器进行分词
import java.util.Iterator; import java.util.LinkedList; import java.util.List; import org.apache.luc ...
- Solr4.10与tomcat整合并安装中文分词器
1.solr Solr 是Apache下的一个顶级开源项目,采用Java开发,它是基于Lucene的全文搜索服务器.Solr提供了比Lucene更为丰富的查询语言,同时实现了可配置.可扩展,并对索引. ...
- Elasticsearch教程(三),IK分词器安装 (极速版)
如果只想快速安装IK,本教程管用.下面看经过. 简介: 下面讲有我已经打包并且编辑过的zip包,你可以在下面下载即可. 当前讲解的IK分词器 包的 version 为1.8. 一.下载zip包. 下面 ...
- 5.Solr4.10.3中配置中文分词器
转载请出自出处:http://www.cnblogs.com/hd3013779515/ 1.下载IK Analyzer 2012FF_hf1.zip并上传到/home/test 2.按照如下命令安装 ...
- Elasticsearch(10) --- 内置分词器、中文分词器
Elasticsearch(10) --- 内置分词器.中文分词器 这篇博客主要讲:分词器概念.ES内置分词器.ES中文分词器. 一.分词器概念 1.Analysis 和 Analyzer Analy ...
- 11大Java开源中文分词器的使用方法和分词效果对比,当前几个主要的Lucene中文分词器的比较
本文的目标有两个: 1.学会使用11大Java开源中文分词器 2.对比分析11大Java开源中文分词器的分词效果 本文给出了11大Java开源中文分词的使用方法以及分词结果对比代码,至于效果哪个好,那 ...
- Lucene.Net+盘古分词器(详细介绍)(转)
出处:http://www.cnblogs.com/magicchaiy/archive/2013/06/07/LuceneNet%E7%9B%98%E5%8F%A4%E5%88%86%E8%AF%8 ...
随机推荐
- NIO(一):Buffer缓冲区
一.NIO与IO: IO: 一般泛指进行input/output操作(读写操作),Java IO其核心是字节流(InputStream/OutputStream)和字符流(Reader/Writer ...
- ES6 面向对象笔记
JS面向对象两大编程思想 面向过程 面向对象 面向过程编程POP 面向过程就是分析出问题的需要步骤,然后用函数一步一步的实现,使用的时候一个一个调用就可以了 面向对象编程OOP ...
- VMWare虚拟机问题总结
windows7提示:在该系统上全局禁用了虚拟打印功能,不会为该虚拟机启用此功能,虚拟设备:'serial0'将断开连接. 解决:打开虚拟机前选择 编辑-->首选项-->设备--> ...
- 【LeetCode/LintCode】 题解丨微软面试题:大楼轮廓
水平面上有 N 座大楼,每座大楼都是矩阵的形状,可以用一个三元组表示 (start, end, height),分别代表其在x轴上的起点,终点和高度.大楼之间从远处看可能会重叠,求出 N 座大楼的外轮 ...
- C#LeetCode刷题之#521-最长特殊序列 Ⅰ(Longest Uncommon Subsequence I)
问题 该文章的最新版本已迁移至个人博客[比特飞],单击链接 https://www.byteflying.com/archives/3949 访问. 给定两个字符串,你需要从这两个字符串中找出最长的特 ...
- 滴滴推理引擎IFX:千万规模设备下AI部署实践
桔妹导读:「滴滴技术」将于本月开始,联合各技术团队为大家带来精彩分享.你想了解的技术干货,深度专访,团队及招聘将于每周三与你准时见面.本月为「滴滴云平台事业群分享月」,在今天的内容中,云平台事业群-机 ...
- day2 变量
变量是在程序中表现为不重复的名字,只需定义一个名字,给这个名字变量赋值即可 作用 在内存中开辟一块空间.起了一个别名,用了访问和存储空间中的数据 在编写 Python 程序过程中, 经常需要给标识 ...
- Spring Security报异常 Encoded password does not look like BCrypt
控制台报错: Encoded password does not look like BCrypt 意思是前端传回去的密码格式与数据库里的密码格式不匹配,这样即使密码正确也无法校验.自然也就无法登录. ...
- linux手动安装python
前提:你的linux服务器必须有gcc编译器,gcc查看方法:linux命令行>gcc -v 如果返回版本信息证明已经安装, 如果找不到命令,跳到这篇手动安装gcc >>> l ...
- linux root用户下没有.ssh目录
.ssh 是记录密码信息的文件夹,如果没有登录过root的话,就没有 .ssh 文件夹,因此登录 localhost ,并输入密码就会生成了 ssh localhost