处理SQL及分词效果:

  1. select * from ( select rownum as rn,tb1.stuid,tb1.summary from ( select stuid,sum(score) as summary from gk_score group by stuid order by summary desc ) tb1 order by tb1.summary desc ) tb2 where rn<11
  2. Index Type No Text Type Desc
  3. ------------------------------------------------------------------------------------
  4. 0 1 select KW:select
  5. 1 2 * Text
  6. 2 4 from KW:from
  7. 3 18 ( (
  8. 4 1 select KW:select
  9. 5 2 rownum Text
  10. 6 13 as KW:as
  11. 7 2 rn Text
  12. 8 3 , Comma
  13. 9 2 tb1.stuid Text
  14. 10 3 , Comma
  15. 11 2 tb1.summary Text
  16. 12 4 from KW:from
  17. 13 18 ( (
  18. 14 1 select KW:select
  19. 15 2 stuid Text
  20. 16 3 , Comma
  21. 17 2 sum Text
  22. 18 18 ( (
  23. 19 2 score Text
  24. 20 19 ) )
  25. 21 13 as KW:as
  26. 22 2 summary Text
  27. 23 4 from KW:from
  28. 24 2 gk_score Text
  29. 25 14 group KW:group
  30. 26 10 by KW:by
  31. 27 2 stuid Text
  32. 28 9 order KW:order
  33. 29 10 by KW:by
  34. 30 2 summary Text
  35. 31 11 desc KW:asc
  36. 32 19 ) )
  37. 33 2 tb1 Text
  38. 34 9 order KW:order
  39. 35 10 by KW:by
  40. 36 2 tb1.summary Text
  41. 37 11 desc KW:asc
  42. 38 19 ) )
  43. 39 2 tb2 Text
  44. 40 5 where KW:where
  45. 41 2 rn Text
  46. 42 16 < <
  47. 43 2 11 Text

程序:

  1. package com.heyang.easysql.lex10;
  2.  
  3. import java.io.BufferedReader;
  4. import java.io.FileInputStream;
  5. import java.io.InputStreamReader;
  6. import java.util.ArrayList;
  7. import java.util.Collections;
  8. import java.util.List;
  9.  
  /**
   * One lexical token of a SQL statement: the matched text plus a numeric type code.
   *
   * The TYPE_* constants are the type codes stored in {@link #type};
   * {@link #getTypeStr()} turns a code into the label shown in the token table.
   */
  class Token {
      static final int TYPE_SELECT = 1;
      static final int TYPE_TEXT = 2;
      static final int TYPE_COMMA = 3;
      static final int TYPE_FROM = 4;
      static final int TYPE_WHERE = 5;
      static final int TYPE_AND = 6;
      static final int TYPE_EQUAL = 7;
      static final int TYPE_OR = 8;
      static final int TYPE_ORDER = 9;
      static final int TYPE_BY = 10;
      static final int TYPE_ASC = 11;
      static final int TYPE_DESC = 12;
      static final int TYPE_AS = 13;
      static final int TYPE_GROUP = 14;
      static final int TYPE_HAVING = 15;
      static final int TYPE_LESSTHAN = 16;
      static final int TYPE_GREATERTHAN = 17;
      static final int TYPE_OPEN_PARENTHESIS = 18;
      static final int TYPE_CLOSE_PARENTHESIS = 19;
      static final int TYPE_CONNECT = 20;
      static final int TYPE_LESSTHAN_OR_EQUAL = 21;
      static final int TYPE_GREATERTHAN_OR_EQUAL = 22;
      static final int TYPE_LESSTHAN_OR_GREATERTHAN = 23;
      static final int TYPE_CASE = 24;
      static final int TYPE_WHEN = 25;
      static final int TYPE_THEN = 26;
      static final int TYPE_ELSE = 27;
      static final int TYPE_END = 28;
      static final int TYPE_IS = 29;
      static final int TYPE_NULL = 30;
      static final int TYPE_TRUE = 31;
      static final int TYPE_FALSE = 32;
      static final int TYPE_PLUS = 33;
      static final int TYPE_MINUS = 34;
      // Code 35 (TYPE_MULTI) is intentionally left undefined, as in the original listing.
      // NOTE(review): presumably because '*' is ambiguous between "all columns" and multiplication - confirm.
      static final int TYPE_DEVIDE = 36; // spelling kept as-is: the name is referenced by other classes
      static final int TYPE_DISTINCT = 37;
      static final int TYPE_OVER = 38;
      static final int TYPE_STRING_CONCAT = 39;
      static final int TYPE_ON = 40;
      static final int TYPE_JOIN = 41;
      static final int TYPE_INNER = 42;
      static final int TYPE_LEFT = 43;
      static final int TYPE_RIGHT = 44;
      static final int TYPE_OUTER = 45;
      static final int TYPE_FULL = 46;
      static final int TYPE_WITHIN = 47;
      static final int TYPE_PARTITION = 48;

      /** Type code of this token (one of the TYPE_* constants). */
      int type;

      /** Source text of this token exactly as it was handed to the constructor. */
      String text;

      /**
       * Creates a token from a single character.
       *
       * @param c    the character; stored as a one-character string
       * @param type the type code
       */
      public Token(char c, int type) {
          this.text = String.valueOf(c);
          this.type = type;
      }

      /**
       * Creates a token from a word or multi-character operator.
       *
       * @param word the token text
       * @param type the type code
       */
      public Token(String word, int type) {
          this.text = word;
          this.type = type;
      }

      /**
       * Returns the display label for this token's type.
       *
       * Keywords are labelled "KW:&lt;keyword&gt;", operators and parentheses by their symbol,
       * and the remaining types by a short name ("Text", "Comma").
       *
       * @return the label, or {@code null} when {@link #type} is not a known type code
       */
      public String getTypeStr() {
          switch (type) {
              case TYPE_SELECT:                  return "KW:select";
              case TYPE_FROM:                    return "KW:from";
              case TYPE_COMMA:                   return "Comma";
              case TYPE_TEXT:                    return "Text";
              case TYPE_WHERE:                   return "KW:where";
              case TYPE_AND:                     return "KW:and";
              case TYPE_EQUAL:                   return "=";
              case TYPE_OR:                      return "KW:or";
              case TYPE_ORDER:                   return "KW:order";
              case TYPE_BY:                      return "KW:by";
              case TYPE_ASC:                     return "KW:asc";
              case TYPE_DESC:                    return "KW:desc";
              case TYPE_AS:                      return "KW:as";
              case TYPE_GROUP:                   return "KW:group";
              case TYPE_HAVING:                  return "KW:having";
              case TYPE_LESSTHAN:                return "<";
              case TYPE_GREATERTHAN:             return ">";
              case TYPE_OPEN_PARENTHESIS:        return "(";
              case TYPE_CLOSE_PARENTHESIS:       return ")";
              case TYPE_CONNECT:                 return "KW:connect";
              case TYPE_LESSTHAN_OR_EQUAL:       return "<=";
              case TYPE_GREATERTHAN_OR_EQUAL:    return ">=";
              case TYPE_LESSTHAN_OR_GREATERTHAN: return "<>";
              case TYPE_CASE:                    return "KW:case";
              case TYPE_WHEN:                    return "KW:when";
              case TYPE_THEN:                    return "KW:then";
              case TYPE_ELSE:                    return "KW:else";
              case TYPE_END:                     return "KW:end";
              case TYPE_IS:                      return "KW:is";
              case TYPE_NULL:                    return "KW:null";
              case TYPE_TRUE:                    return "KW:true";
              case TYPE_FALSE:                   return "KW:false";
              case TYPE_PLUS:                    return "+";
              case TYPE_MINUS:                   return "-";
              case TYPE_DEVIDE:                  return "/";
              case TYPE_DISTINCT:                return "KW:distinct";
              case TYPE_OVER:                    return "KW:over";
              case TYPE_STRING_CONCAT:           return "||";
              case TYPE_ON:                      return "KW:on";
              case TYPE_JOIN:                    return "KW:join";
              case TYPE_INNER:                   return "KW:inner";
              case TYPE_LEFT:                    return "KW:left";
              case TYPE_RIGHT:                   return "KW:right";
              case TYPE_OUTER:                   return "KW:outer";
              case TYPE_FULL:                    return "KW:full";
              case TYPE_WITHIN:                  return "KW:within";
              case TYPE_PARTITION:               return "KW:partition";
              default:
                  // Unknown code: callers that format the label will see "null".
                  return null;
          }
      }
  }
  173.  
  174. public class Lexer {
  175. private List<Token> tokenList;
  176.  
  177. public Lexer(String inputSql) {
  178. String sql=pretreat(inputSql);
  179. String swallowed="";
  180.  
  181. tokenList=new ArrayList<Token>();
  182. for(int i=0;i<sql.length();i++){
  183. char c=sql.charAt(i);
  184.  
  185. if(Character.isWhitespace(c)){
  186. addTextToList(swallowed);
  187. swallowed="";
  188. }else if(c==','){
  189. addTextToList(swallowed);
  190. swallowed="";
  191. tokenList.add(new Token(c,Token.TYPE_COMMA));
  192. }else if(c=='='){
  193. addTextToList(swallowed);
  194. swallowed="";
  195. tokenList.add(new Token(c,Token.TYPE_EQUAL));
  196. }else if(c=='<'){
  197. int next=i+1;
  198. if(next<sql.length() && sql.charAt(next)=='=') {
  199. addTextToList(swallowed);
  200. swallowed="";
  201. tokenList.add(new Token("<=",Token.TYPE_LESSTHAN_OR_EQUAL));
  202. i++;
  203. }else if(next<sql.length() && sql.charAt(next)=='>') {
  204. addTextToList(swallowed);
  205. swallowed="";
  206. tokenList.add(new Token("<>",Token.TYPE_LESSTHAN_OR_GREATERTHAN));
  207. i++;
  208. }else {
  209. addTextToList(swallowed);
  210. swallowed="";
  211. tokenList.add(new Token(c,Token.TYPE_LESSTHAN));
  212. }
  213. }else if(c=='>'){
  214. int next=i+1;
  215. if(next<sql.length() && sql.charAt(next)=='=') {
  216. addTextToList(swallowed);
  217. swallowed="";
  218. tokenList.add(new Token(">=",Token.TYPE_GREATERTHAN_OR_EQUAL));
  219. i++;
  220. }else {
  221. addTextToList(swallowed);
  222. swallowed="";
  223. tokenList.add(new Token(c,Token.TYPE_GREATERTHAN));
  224. }
  225. }else if(c=='|'){
  226. int next=i+1;
  227. if(next<sql.length() && sql.charAt(next)=='|') {
  228. addTextToList(swallowed);
  229. swallowed="";
  230. tokenList.add(new Token("||",Token.TYPE_STRING_CONCAT));
  231. i++;
  232. }
  233. }else if(c=='('){
  234. addTextToList(swallowed);
  235. swallowed="";
  236. tokenList.add(new Token(c,Token.TYPE_OPEN_PARENTHESIS));
  237. }else if(c==')'){
  238. addTextToList(swallowed);
  239. swallowed="";
  240. tokenList.add(new Token(c,Token.TYPE_CLOSE_PARENTHESIS));
  241. }else if(c=='+'){
  242. addTextToList(swallowed);
  243. swallowed="";
  244. tokenList.add(new Token(c,Token.TYPE_PLUS));
  245. }else if(c=='-'){
  246. addTextToList(swallowed);
  247. swallowed="";
  248. tokenList.add(new Token(c,Token.TYPE_MINUS));
  249. }else if(c=='/'){
  250. addTextToList(swallowed);
  251. swallowed="";
  252. tokenList.add(new Token(c,Token.TYPE_DEVIDE));
  253. }else {
  254. swallowed+=c;
  255. }
  256. }
  257. }
  258.  
  259. private int findTypeByText(String text) {
  260. Object[][] arr= {
  261. {"select", Token.TYPE_SELECT},
  262. {"from", Token.TYPE_FROM},
  263. {"where", Token.TYPE_WHERE},
  264. {"and", Token.TYPE_AND},
  265. {"or", Token.TYPE_OR},
  266. {"order", Token.TYPE_ORDER},
  267. {"by", Token.TYPE_BY},
  268. {"asc", Token.TYPE_ASC},
  269. {"desc", Token.TYPE_ASC},
  270. {"asc", Token.TYPE_DESC},
  271. {"as", Token.TYPE_AS},
  272. {"group", Token.TYPE_GROUP},
  273. {"having", Token.TYPE_HAVING},
  274. {"connect", Token.TYPE_CONNECT},
  275. {"case", Token.TYPE_CASE},
  276. {"when", Token.TYPE_WHEN},
  277. {"then", Token.TYPE_THEN},
  278. {"else", Token.TYPE_ELSE},
  279. {"end", Token.TYPE_END},
  280. {"is", Token.TYPE_IS},
  281. {"null", Token.TYPE_NULL},
  282. {"true", Token.TYPE_TRUE},
  283. {"false", Token.TYPE_FALSE},
  284. {"distinct", Token.TYPE_DISTINCT},
  285. {"over", Token.TYPE_OVER},
  286. {"on", Token.TYPE_ON},
  287. {"join", Token.TYPE_JOIN},
  288. {"inner", Token.TYPE_INNER},
  289. {"left", Token.TYPE_LEFT},
  290. {"right", Token.TYPE_RIGHT},
  291. {"outer", Token.TYPE_OUTER},
  292. {"full", Token.TYPE_FULL},
  293. {"within", Token.TYPE_WITHIN},
  294. {"partition", Token.TYPE_PARTITION},
  295. };
  296.  
  297. for(Object[] arrInner:arr) {
  298. String keyword=String.valueOf(arrInner[0]);
  299. if(keyword.equalsIgnoreCase(text)) {
  300. return Integer.parseInt(arrInner[1].toString());
  301. }
  302. }
  303.  
  304. return Token.TYPE_TEXT;
  305. }
  306.  
  307. private void addTextToList(String text) {
  308. int type=findTypeByText(text);
  309. addToken2List(text,type);
  310. }
  311.  
  312. private void addToken2List(String text,int type) {
  313. if(text.trim().length()>0) {
  314. tokenList.add(new Token(text,type));
  315. }
  316. }
  317.  
  318. public void printTokenList() {
  319. final String continuousStar = createRepeatedStr("-", 84);
  320. final String layout = "%-20s %-20s %-20s %-20s %s";
  321. StringBuilder sb = new StringBuilder();
  322.  
  323. sb.append(String.format(layout, "Index", "Type No","Text","Type Desc","\n"));
  324. sb.append(continuousStar + "\n");
  325. int index=0;
  326. for(Token token:tokenList) {
  327. sb.append(String.format(layout, String.valueOf(index),String.valueOf(token.type), token.text,token.getTypeStr(),"\n"));
  328. index++;
  329. }
  330.  
  331. System.out.println(sb.toString());
  332. }
  333.  
  334. private static String createRepeatedStr(String seed, int n) {
  335. return String.join("", Collections.nCopies(n, seed));
  336. }
  337.  
  338. private String pretreat(String raw) {
  339. return raw.trim()+" ";
  340. }
  341.  
  342. public static void main(String[] args) throws Exception{
  343. String sql=removeExtraSpace(readSqlFromFile("c:\\temp\\12.sql"));
  344. System.out.println(sql);
  345. new Lexer(sql).printTokenList();;
  346. }
  347.  
  348. private static String readSqlFromFile(String filePath) throws Exception{
  349. StringBuilder sb=new StringBuilder();
  350. BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(filePath), "UTF-8"));
  351. String line = null;
  352. while( ( line = br.readLine() ) != null ) {
  353. sb.append(line);
  354. }
  355. br.close();
  356. return sb.toString();
  357. }
  358.  
  359. private static String removeExtraSpace(String raw) {
  360. return raw.replaceAll("\\s{2,}", " ");
  361. }
  362. }

--2020年5月13日 16点07分--

SQL分词器1.10版的更多相关文章

  1. elasticsearch安装ik分词器(极速版)

    简介:下面讲有我已经打包并且编辑过的zip包,你可以在下面下载即可. 1.下载zip包.elasticsearch-analysis-ik-1.8.0.jar下面有附件链接[ik-安装包.zip],下 ...

  2. elasticsearch 6.2.4 安装 elasticsearch-analysis-ik 分词器 (windows 10下)

    访问 https://github.com/medcl/elasticsearch-analysis-ik  找 releases 找到对应的 es 版本 下载 elasticsearch-analy ...

  3. 用lucene4.10.2分词器进行分词

    import java.util.Iterator; import java.util.LinkedList; import java.util.List; import org.apache.luc ...

  4. Solr4.10与tomcat整合并安装中文分词器

    1.solr Solr 是Apache下的一个顶级开源项目,采用Java开发,它是基于Lucene的全文搜索服务器.Solr提供了比Lucene更为丰富的查询语言,同时实现了可配置.可扩展,并对索引. ...

  5. Elasticsearch教程(三),IK分词器安装 (极速版)

    如果只想快速安装IK,本教程管用.下面看经过. 简介: 下面讲有我已经打包并且编辑过的zip包,你可以在下面下载即可. 当前讲解的IK分词器 包的 version 为1.8. 一.下载zip包. 下面 ...

  6. 5.Solr4.10.3中配置中文分词器

    转载请出自出处:http://www.cnblogs.com/hd3013779515/ 1.下载IK Analyzer 2012FF_hf1.zip并上传到/home/test 2.按照如下命令安装 ...

  7. Elasticsearch(10) --- 内置分词器、中文分词器

    Elasticsearch(10) --- 内置分词器.中文分词器 这篇博客主要讲:分词器概念.ES内置分词器.ES中文分词器. 一.分词器概念 1.Analysis 和 Analyzer Analy ...

  8. 11大Java开源中文分词器的使用方法和分词效果对比,当前几个主要的Lucene中文分词器的比较

    本文的目标有两个: 1.学会使用11大Java开源中文分词器 2.对比分析11大Java开源中文分词器的分词效果 本文给出了11大Java开源中文分词的使用方法以及分词结果对比代码,至于效果哪个好,那 ...

  9. Lucene.Net+盘古分词器(详细介绍)(转)

    出处:http://www.cnblogs.com/magicchaiy/archive/2013/06/07/LuceneNet%E7%9B%98%E5%8F%A4%E5%88%86%E8%AF%8 ...

随机推荐

  1. CSS变化、过渡与动画

    CSS变换用于在空间中移动物体,而CSS过渡和CSS关键帧动画用于控制元素随时间推移的变化. 变换.过渡和关键帧动画的规范仍然在制定中.尽管如此,其中大多数特性已经在常用浏览器中实现了. 1.二维变换 ...

  2. Redis教程——检视阅读

    Redis教程--检视阅读 参考 Redis教程--菜鸟--蓝本--3.2.100 Redis教程--w3c--3.2.100 Redis教程--w3c--Redis开发运维实践指南 Redis教程- ...

  3. ~/.ssh/目录找不到解决方法

    执行 cd ~/.ssh发现.ssh目录找不到 原因是因为没有用root用户ssh登录过,执行一下ssh操作就会自动生成了

  4. [leetcode/lintcode 题解] Google面试题:合法组合

    给一个单词s,和一个字符串集合str.这个单词每次去掉一个字母,直到剩下最后一个字母.求验证是否存在一种删除的顺序,这个顺序下所有的单词都在str中.例如单词是’abc’,字符串集合是{‘a’,’ab ...

  5. html定时跳转页面

    <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/ ...

  6. Solon Ioc 的注解对比Spring及JSR330

    注解对比 Solon 1.0.10 Spring JSR 330 @XInject * @Autowired @Inject 字段或参数注入 @XBean * @Component @Named Be ...

  7. Android SDK 环境的搭建 --图形界面模式和命令行模式

    Android 开发首先就是要搭建开发环境,没有用过Eclipse(ADT)开发过,直接用的Android Studio,其中最主要的就是 Android SDK的安装和搭建,所以这里只是总结下And ...

  8. xpath和css选择器对比

    基本语法对比 都可以在html中提取内容,但xpath可以提取xml的内容.

  9. JavaScript对象、函数、变量、字符串的处理、运算符

    一.对象 使用一种抽象的概念去描述,人{属性,方法} var car={type:"BYD",model:500,color:white,do:function(){"可 ...

  10. Ubuntu LNMP环境的搭建

    一.安装nginx Step1:安装: sudo apt-get install nginx Step2:查看ngnix 运行状态 : service nginx status 查看80端口是否开启: ...