SQL分词器1.10版
处理SQL及分词效果:
- select * from ( select rownum as rn,tb1.stuid,tb1.summary from ( select stuid,sum(score) as summary from gk_score group by stuid order by summary desc ) tb1 order by tb1.summary desc ) tb2 where rn<11
- Index Type No Text Type Desc
- ------------------------------------------------------------------------------------
- 0 1 select KW:select
- 1 2 * Text
- 2 4 from KW:from
- 3 18 ( (
- 4 1 select KW:select
- 5 2 rownum Text
- 6 13 as KW:as
- 7 2 rn Text
- 8 3 , Comma
- 9 2 tb1.stuid Text
- 10 3 , Comma
- 11 2 tb1.summary Text
- 12 4 from KW:from
- 13 18 ( (
- 14 1 select KW:select
- 15 2 stuid Text
- 16 3 , Comma
- 17 2 sum Text
- 18 18 ( (
- 19 2 score Text
- 20 19 ) )
- 21 13 as KW:as
- 22 2 summary Text
- 23 4 from KW:from
- 24 2 gk_score Text
- 25 14 group KW:group
- 26 10 by KW:by
- 27 2 stuid Text
- 28 9 order KW:order
- 29 10 by KW:by
- 30 2 summary Text
- 31 11 desc KW:asc
- 32 19 ) )
- 33 2 tb1 Text
- 34 9 order KW:order
- 35 10 by KW:by
- 36 2 tb1.summary Text
- 37 11 desc KW:asc
- 38 19 ) )
- 39 2 tb2 Text
- 40 5 where KW:where
- 41 2 rn Text
- 42 16 < <
- 43 2 11 Text
程序:
- package com.heyang.easysql.lex10;
- import java.io.BufferedReader;
- import java.io.FileInputStream;
- import java.io.InputStreamReader;
- import java.util.ArrayList;
- import java.util.Collections;
- import java.util.List;
- class Token{
- static final int TYPE_SELECT=1;
- static final int TYPE_TEXT=2;
- static final int TYPE_COMMA=3;
- static final int TYPE_FROM=4;
- static final int TYPE_WHERE=5;
- static final int TYPE_AND=6;
- static final int TYPE_EQUAL=7;
- static final int TYPE_OR=8;
- static final int TYPE_ORDER=9;
- static final int TYPE_BY=10;
- static final int TYPE_ASC=11;
- static final int TYPE_DESC=12;
- static final int TYPE_AS=13;
- static final int TYPE_GROUP=14;
- static final int TYPE_HAVING=15;
- static final int TYPE_LESSTHAN=16;
- static final int TYPE_GREATERTHAN=17;
- static final int TYPE_OPEN_PARENTHESIS=18;
- static final int TYPE_CLOSE_PARENTHESIS=19;
- static final int TYPE_CONNECT=20;
- static final int TYPE_LESSTHAN_OR_EQUAL=21;
- static final int TYPE_GREATERTHAN_OR_EQUAL=22;
- static final int TYPE_LESSTHAN_OR_GREATERTHAN=23;
- static final int TYPE_CASE=24;
- static final int TYPE_WHEN=25;
- static final int TYPE_THEN=26;
- static final int TYPE_ELSE=27;
- static final int TYPE_END=28;
- static final int TYPE_IS=29;
- static final int TYPE_NULL=30;
- static final int TYPE_TRUE=31;
- static final int TYPE_FALSE=32;
- static final int TYPE_PLUS=33;
- static final int TYPE_MINUS=34;
- //static final int TYPE_MULTI=35;
- static final int TYPE_DEVIDE=36;
- static final int TYPE_DISTINCT=37;
- static final int TYPE_OVER=38;
- static final int TYPE_STRING_CONCAT=39;
- static final int TYPE_ON=40;
- static final int TYPE_JOIN=41;
- static final int TYPE_INNER=42;
- static final int TYPE_LEFT=43;
- static final int TYPE_RIGHT=44;
- static final int TYPE_OUTER=45;
- static final int TYPE_FULL=46;
- static final int TYPE_WITHIN=47;
- static final int TYPE_PARTITION=48;
- int type;
- String text;
- public Token(char c,int type) {
- this.text=String.valueOf(c);
- this.type=type;
- }
- public Token(String word,int type) {
- this.text=word;
- this.type=type;
- }
- public String getTypeStr() {
- if(type==TYPE_SELECT) {
- return "KW:select";
- }else if(type==TYPE_FROM) {
- return "KW:from";
- }else if(type==TYPE_COMMA) {
- return "Comma";
- }else if(type==TYPE_TEXT) {
- return "Text";
- }else if(type==TYPE_WHERE) {
- return "KW:where";
- }else if(type==TYPE_AND) {
- return "KW:and";
- }else if(type==TYPE_EQUAL) {
- return "=";
- }else if(type==TYPE_OR) {
- return "KW:or";
- }else if(type==TYPE_ORDER) {
- return "KW:order";
- }else if(type==TYPE_BY) {
- return "KW:by";
- }else if(type==TYPE_ASC) {
- return "KW:asc";
- }else if(type==TYPE_DESC) {
- return "KW:desc";
- }else if(type==TYPE_AS) {
- return "KW:as";
- }else if(type==TYPE_GROUP) {
- return "KW:group";
- }else if(type==TYPE_HAVING) {
- return "KW:having";
- }else if(type==TYPE_LESSTHAN) {
- return "<";
- }else if(type==TYPE_GREATERTHAN) {
- return ">";
- }else if(type==TYPE_OPEN_PARENTHESIS) {
- return "(";
- }else if(type==TYPE_CLOSE_PARENTHESIS) {
- return ")";
- }else if(type==TYPE_CONNECT) {
- return "KW:connect";
- }else if(type==TYPE_LESSTHAN_OR_EQUAL) {
- return "<=";
- }else if(type==TYPE_GREATERTHAN_OR_EQUAL) {
- return ">=";
- }else if(type==TYPE_LESSTHAN_OR_GREATERTHAN) {
- return "<>";
- }else if(type==TYPE_CASE) {
- return "KW:case";
- }else if(type==TYPE_WHEN) {
- return "KW:when";
- }else if(type==TYPE_THEN) {
- return "KW:then";
- }else if(type==TYPE_ELSE) {
- return "KW:else";
- }else if(type==TYPE_END) {
- return "KW:end";
- }else if(type==TYPE_IS) {
- return "KW:is";
- }else if(type==TYPE_NULL) {
- return "KW:null";
- }else if(type==TYPE_TRUE) {
- return "KW:true";
- }else if(type==TYPE_FALSE) {
- return "KW:false";
- }else if(type==TYPE_PLUS) {
- return "+";
- }else if(type==TYPE_MINUS) {
- return "-";
- }else if(type==TYPE_DEVIDE) {
- return "/";
- }else if(type==TYPE_DISTINCT) {
- return "KW:distinct";
- }else if(type==TYPE_OVER) {
- return "KW:over";
- }else if(type==TYPE_STRING_CONCAT) {
- return "||";
- }else if(type==TYPE_ON) {
- return "KW:on";
- }else if(type==TYPE_JOIN) {
- return "KW:join";
- }else if(type==TYPE_INNER) {
- return "KW:inner";
- }else if(type==TYPE_LEFT) {
- return "KW:left";
- }else if(type==TYPE_RIGHT) {
- return "KW:right";
- }else if(type==TYPE_OUTER) {
- return "KW:outer";
- }else if(type==TYPE_FULL) {
- return "KW:full";
- }else if(type==TYPE_WITHIN) {
- return "KW:within";
- }else if(type==TYPE_PARTITION) {
- return "KW:partition";
- }
- return null;
- }
- }
- public class Lexer {
- private List<Token> tokenList;
- public Lexer(String inputSql) {
- String sql=pretreat(inputSql);
- String swallowed="";
- tokenList=new ArrayList<Token>();
- for(int i=0;i<sql.length();i++){
- char c=sql.charAt(i);
- if(Character.isWhitespace(c)){
- addTextToList(swallowed);
- swallowed="";
- }else if(c==','){
- addTextToList(swallowed);
- swallowed="";
- tokenList.add(new Token(c,Token.TYPE_COMMA));
- }else if(c=='='){
- addTextToList(swallowed);
- swallowed="";
- tokenList.add(new Token(c,Token.TYPE_EQUAL));
- }else if(c=='<'){
- int next=i+1;
- if(next<sql.length() && sql.charAt(next)=='=') {
- addTextToList(swallowed);
- swallowed="";
- tokenList.add(new Token("<=",Token.TYPE_LESSTHAN_OR_EQUAL));
- i++;
- }else if(next<sql.length() && sql.charAt(next)=='>') {
- addTextToList(swallowed);
- swallowed="";
- tokenList.add(new Token("<>",Token.TYPE_LESSTHAN_OR_GREATERTHAN));
- i++;
- }else {
- addTextToList(swallowed);
- swallowed="";
- tokenList.add(new Token(c,Token.TYPE_LESSTHAN));
- }
- }else if(c=='>'){
- int next=i+1;
- if(next<sql.length() && sql.charAt(next)=='=') {
- addTextToList(swallowed);
- swallowed="";
- tokenList.add(new Token(">=",Token.TYPE_GREATERTHAN_OR_EQUAL));
- i++;
- }else {
- addTextToList(swallowed);
- swallowed="";
- tokenList.add(new Token(c,Token.TYPE_GREATERTHAN));
- }
- }else if(c=='|'){
- int next=i+1;
- if(next<sql.length() && sql.charAt(next)=='|') {
- addTextToList(swallowed);
- swallowed="";
- tokenList.add(new Token("||",Token.TYPE_STRING_CONCAT));
- i++;
- }
- }else if(c=='('){
- addTextToList(swallowed);
- swallowed="";
- tokenList.add(new Token(c,Token.TYPE_OPEN_PARENTHESIS));
- }else if(c==')'){
- addTextToList(swallowed);
- swallowed="";
- tokenList.add(new Token(c,Token.TYPE_CLOSE_PARENTHESIS));
- }else if(c=='+'){
- addTextToList(swallowed);
- swallowed="";
- tokenList.add(new Token(c,Token.TYPE_PLUS));
- }else if(c=='-'){
- addTextToList(swallowed);
- swallowed="";
- tokenList.add(new Token(c,Token.TYPE_MINUS));
- }else if(c=='/'){
- addTextToList(swallowed);
- swallowed="";
- tokenList.add(new Token(c,Token.TYPE_DEVIDE));
- }else {
- swallowed+=c;
- }
- }
- }
- private int findTypeByText(String text) {
- Object[][] arr= {
- {"select", Token.TYPE_SELECT},
- {"from", Token.TYPE_FROM},
- {"where", Token.TYPE_WHERE},
- {"and", Token.TYPE_AND},
- {"or", Token.TYPE_OR},
- {"order", Token.TYPE_ORDER},
- {"by", Token.TYPE_BY},
- {"asc", Token.TYPE_ASC},
- {"desc", Token.TYPE_ASC},
- {"asc", Token.TYPE_DESC},
- {"as", Token.TYPE_AS},
- {"group", Token.TYPE_GROUP},
- {"having", Token.TYPE_HAVING},
- {"connect", Token.TYPE_CONNECT},
- {"case", Token.TYPE_CASE},
- {"when", Token.TYPE_WHEN},
- {"then", Token.TYPE_THEN},
- {"else", Token.TYPE_ELSE},
- {"end", Token.TYPE_END},
- {"is", Token.TYPE_IS},
- {"null", Token.TYPE_NULL},
- {"true", Token.TYPE_TRUE},
- {"false", Token.TYPE_FALSE},
- {"distinct", Token.TYPE_DISTINCT},
- {"over", Token.TYPE_OVER},
- {"on", Token.TYPE_ON},
- {"join", Token.TYPE_JOIN},
- {"inner", Token.TYPE_INNER},
- {"left", Token.TYPE_LEFT},
- {"right", Token.TYPE_RIGHT},
- {"outer", Token.TYPE_OUTER},
- {"full", Token.TYPE_FULL},
- {"within", Token.TYPE_WITHIN},
- {"partition", Token.TYPE_PARTITION},
- };
- for(Object[] arrInner:arr) {
- String keyword=String.valueOf(arrInner[0]);
- if(keyword.equalsIgnoreCase(text)) {
- return Integer.parseInt(arrInner[1].toString());
- }
- }
- return Token.TYPE_TEXT;
- }
- private void addTextToList(String text) {
- int type=findTypeByText(text);
- addToken2List(text,type);
- }
- private void addToken2List(String text,int type) {
- if(text.trim().length()>0) {
- tokenList.add(new Token(text,type));
- }
- }
- public void printTokenList() {
- final String continuousStar = createRepeatedStr("-", 84);
- final String layout = "%-20s %-20s %-20s %-20s %s";
- StringBuilder sb = new StringBuilder();
- sb.append(String.format(layout, "Index", "Type No","Text","Type Desc","\n"));
- sb.append(continuousStar + "\n");
- int index=0;
- for(Token token:tokenList) {
- sb.append(String.format(layout, String.valueOf(index),String.valueOf(token.type), token.text,token.getTypeStr(),"\n"));
- index++;
- }
- System.out.println(sb.toString());
- }
- private static String createRepeatedStr(String seed, int n) {
- return String.join("", Collections.nCopies(n, seed));
- }
- private String pretreat(String raw) {
- return raw.trim()+" ";
- }
- public static void main(String[] args) throws Exception{
- String sql=removeExtraSpace(readSqlFromFile("c:\\temp\\12.sql"));
- System.out.println(sql);
- new Lexer(sql).printTokenList();;
- }
- private static String readSqlFromFile(String filePath) throws Exception{
- StringBuilder sb=new StringBuilder();
- BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(filePath), "UTF-8"));
- String line = null;
- while( ( line = br.readLine() ) != null ) {
- sb.append(line);
- }
- br.close();
- return sb.toString();
- }
- private static String removeExtraSpace(String raw) {
- return raw.replaceAll("\\s{2,}", " ");
- }
- }
--2020年5月13日 16点07分--
SQL分词器1.10版的更多相关文章
- elasticsearch安装ik分词器(极速版)
简介:下面讲有我已经打包并且编辑过的zip包,你可以在下面下载即可. 1.下载zip包.elasticsearch-analysis-ik-1.8.0.jar下面有附件链接[ik-安装包.zip],下 ...
- elasticsearch 6.2.4 安装 elasticsearch-analysis-ik 分词器 (windows 10下)
访问 https://github.com/medcl/elasticsearch-analysis-ik 找 releases 找到对应的 es 版本 下载 elasticsearch-analy ...
- 用lucene4.10.2分词器进行分词
import java.util.Iterator; import java.util.LinkedList; import java.util.List; import org.apache.luc ...
- Solr4.10与tomcat整合并安装中文分词器
1.solr Solr 是Apache下的一个顶级开源项目,采用Java开发,它是基于Lucene的全文搜索服务器.Solr提供了比Lucene更为丰富的查询语言,同时实现了可配置.可扩展,并对索引. ...
- Elasticsearch教程(三),IK分词器安装 (极速版)
如果只想快速安装IK,本教程管用.下面看经过. 简介: 下面讲有我已经打包并且编辑过的zip包,你可以在下面下载即可. 当前讲解的IK分词器 包的 version 为1.8. 一.下载zip包. 下面 ...
- 5.Solr4.10.3中配置中文分词器
转载请出自出处:http://www.cnblogs.com/hd3013779515/ 1.下载IK Analyzer 2012FF_hf1.zip并上传到/home/test 2.按照如下命令安装 ...
- Elasticsearch(10) --- 内置分词器、中文分词器
Elasticsearch(10) --- 内置分词器.中文分词器 这篇博客主要讲:分词器概念.ES内置分词器.ES中文分词器. 一.分词器概念 1.Analysis 和 Analyzer Analy ...
- 11大Java开源中文分词器的使用方法和分词效果对比,当前几个主要的Lucene中文分词器的比较
本文的目标有两个: 1.学会使用11大Java开源中文分词器 2.对比分析11大Java开源中文分词器的分词效果 本文给出了11大Java开源中文分词的使用方法以及分词结果对比代码,至于效果哪个好,那 ...
- Lucene.Net+盘古分词器(详细介绍)(转)
出处:http://www.cnblogs.com/magicchaiy/archive/2013/06/07/LuceneNet%E7%9B%98%E5%8F%A4%E5%88%86%E8%AF%8 ...
随机推荐
- CSS变化、过渡与动画
CSS变换用于在空间中移动物体,而CSS过渡和CSS关键帧动画用于控制元素随时间推移的变化. 变换.过渡和关键帧动画的规范仍然在制定中.尽管如此,其中大多数特性已经在常用浏览器中实现了. 1.二维变换 ...
- Redis教程——检视阅读
Redis教程--检视阅读 参考 Redis教程--菜鸟--蓝本--3.2.100 Redis教程--w3c--3.2.100 Redis教程--w3c--Redis开发运维实践指南 Redis教程- ...
- ~/.ssh/目录找不到解决方法
执行 cd ~/.ssh发现.ssh目录找不到 原因是因为没有用root用户ssh登录过,执行一下ssh操作就会自动生成了
- [leetcode/lintcode 题解] Google面试题:合法组合
给一个单词s,和一个字符串集合str.这个单词每次去掉一个字母,直到剩下最后一个字母.求验证是否存在一种删除的顺序,这个顺序下所有的单词都在str中.例如单词是’abc’,字符串集合是{‘a’,’ab ...
- html定时跳转页面
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/ ...
- Solon Ioc 的注解对比Spring及JSR330
注解对比 Solon 1.0.10 Spring JSR 330 @XInject * @Autowired @Inject 字段或参数注入 @XBean * @Component @Named Be ...
- Android SDK 环境的搭建 --图形界面模式和命令行模式
Android 开发首先就是要搭建开发环境,没有用过Eclipse(ADT)开发过,直接用的Android Studio,其中最主要的就是 Android SDK的安装和搭建,所以这里只是总结下And ...
- xpath和css选择器对比
基本语法对比 都可以在html中提取内容,但xpath可以提取xml的内容.
- JavaScript对象、函数、变量、字符串的处理、运算符
一.对象 使用一种抽象的概念去描述,人{属性,方法} var car={type:"BYD",model:500,color:white,do:function(){"可 ...
- Ubuntu LNMP环境的搭建
一.安装nginx Step1:安装: sudo apt-get install nginx Step2:查看ngnix 运行状态 : service nginx status 查看80端口是否开启: ...