
1 增加测试用例
2 修复中文查找可能导致越界的bug
3 strstrs改为不使用二分查找(效率会慢一些,但匹配结果相对可控),推荐使用strstrs_ext








  1. #include <stdio.h>
  2. #include <windows.h>
  4. #ifndef IN
  5. #define IN
  6. #endif
  // Multi-keyword substring search: locate an occurrence of any one of several
  // keywords inside a string.
  //
  // Parameters (identical for every overload):
  //   strToFind    string to search in; must not be NULL
  //   strKeywords  array of keyword strings; the array and each element must not
  //                be NULL, but an element may be the empty string ""
  //   nCnt         number of keywords in strKeywords
  //   pFound       receives the 0-based index, within strKeywords, of the keyword
  //                that matched; must not be NULL
  // Return value:
  //   1. strToFind itself when any keyword is the empty string
  //   2. NULL when no keyword occurs in strToFind
  //   3. otherwise a pointer to the match inside strToFind, with *pFound set to
  //      the index of the matched keyword

  // First-character table plus a linear scan over the keyword list.
  const char *strstrs(const char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound);
  // First-character table with one linked list of keywords per character; the
  // recommended variant.
  const char *strstrs_ext(const char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound);
  // Tries the keywords one after another with strstr().
  const char *strstrs_normal(const char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound);

  // Convenience overloads covering the remaining const / non-const combinations
  // of the first two parameters.
  char *strstrs(IN char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound);
  char *strstrs_ext(IN char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound);
  char *strstrs_normal(IN char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound);

  char *strstrs(IN char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound);
  char *strstrs_ext(IN char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound);
  char *strstrs_normal(IN char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound);

  const char *strstrs(const char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound);
  // pFound is a pointer here as in every other overload; the previous
  // declaration took it by value and therefore matched no definition.
  const char *strstrs_ext(const char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound);
  const char *strstrs_normal(const char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound);

  // Accuracy check followed by a timing run of the three implementations.
  // nStep is the amount by which the searched string grows between runs.
  // In the printed RESULT lines, 0 = strstrs, 1 = strstrs_ext, 2 = strstrs_normal.
  void tets_strstrs(int nStep);


  1. // stdafx.cpp : source file that includes just the standard includes
  2. // sqlite_test.pch will be the pre-compiled header
  3. // stdafx.obj will contain the pre-compiled type information
  5. #include "stdafx.h"
  6. #include <assert.h>
  7. #include <stdlib.h>
  8. #include <time.h>
  9. #include <stdio.h>
  11. // TODO: reference any additional headers you need in STDAFX.H
  12. // and not in this file
  // Read-only overload of strstrs(): strips the const from the haystack pointer
  // so the call resolves to the (char *, const char *[]) overload, and returns
  // that overload's result as a pointer to const.
  const char *strstrs(const char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound)
  {
      char *const pHaystack = const_cast<char *>(strToFind);
      return strstrs(pHaystack, strKeywords, nCnt, pFound);
  }
  // Read-only overload of strstrs_ext(): strips the const from the haystack
  // pointer so the call resolves to the (char *, const char *[]) overload, and
  // returns that overload's result as a pointer to const.
  const char *strstrs_ext(const char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound)
  {
      char *const pHaystack = const_cast<char *>(strToFind);
      return strstrs_ext(pHaystack, strKeywords, nCnt, pFound);
  }
  // Read-only overload of strstrs_normal(): strips the const from the haystack
  // pointer so the call resolves to the (char *, const char *[]) overload, and
  // returns that overload's result as a pointer to const.
  const char *strstrs_normal(const char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound)
  {
      char *const pHaystack = const_cast<char *>(strToFind);
      return strstrs_normal(pHaystack, strKeywords, nCnt, pFound);
  }
  29. const char *strstrs(const char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound)
  30. {
  31. return strstrs(const_cast<char *>(strToFind), (const char **)strKeywords, nCnt, pFound);
  32. }
  34. const char *strstrs_ext(const char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound)
  35. {
  36. return strstrs_ext(const_cast<char *>(strToFind), (const char **)strKeywords, nCnt, pFound);
  37. }
  39. const char *strstrs_normal(const char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound)
  40. {
  41. return strstrs_normal(const_cast<char *>(strToFind), (const char **)strKeywords, nCnt, pFound);
  42. }
  44. char *strstrs(IN char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound)
  45. {
  46. return strstrs(const_cast<char *>(strToFind), (const char **)strKeywords, nCnt, pFound);
  47. }
  49. char *strstrs_ext(IN char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound)
  50. {
  51. return strstrs_ext(const_cast<char *>(strToFind), (const char **)strKeywords, nCnt, pFound);
  52. }
  54. char *strstrs_normal(IN char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound)
  55. {
  56. return strstrs_normal(const_cast<char *>(strToFind), (const char **)strKeywords, nCnt, pFound);
  57. }
  // One keyword record as used by strstrs(): the keyword text, its position in
  // the caller's keyword array, and its cached length.
  struct tagKeyPos
  {
      const char *m_str;  // keyword text; in a search key, the current haystack position
      size_t m_nIdx;      // index of the keyword in the caller's keyword array
      size_t m_strLen;    // length of the keyword in bytes
  };
  typedef struct tagKeyPos KeyPos;
  66. int __strstrs_cmp(const void *p1, const void *p2)
  67. {
  68. const KeyPos *pLeft = (KeyPos *)p1, *pRight = (KeyPos *)p2;
  69. int nCmp = strcmp(pLeft->m_str, pRight->m_str);
  70. if (nCmp == )
  71. {
  72. return pLeft->m_nIdx - pRight->m_nIdx;
  73. }
  75. return nCmp;
  76. }
  78. /*
  79. //lower_bound
  80. KeyPos *__strstrs_find_first(KeyPos *pRealBeg, KeyPos *pRealEnd, size_t *pKeyLenArr, KeyPos *pKey)
  81. {
  82. KeyPos *pBeg = pRealBeg;
  83. KeyPos *pEnd = pRealEnd;
  85. KeyPos *pEqal = NULL;
  86. while (pBeg != pEnd)
  87. {
  88. pEqal = pBeg + (pEnd - pBeg) / 2;
  89. int nCmp = memcmp( pEqal->m_str, pKey->m_str, pEqal->m_strLen );
  90. if (nCmp == 0)
  91. {
  92. //若相等,则往前找,直至找到最后一个相等的元素
  93. while (pEqal != pBeg)
  94. {
  95. pEqal--;
  96. if (memcmp( pEqal->m_str, pKey->m_str, pEqal->m_strLen ))
  97. {
  98. return pEqal + 1;
  99. }
  100. }
  102. return pBeg;
  103. }
  104. else if (nCmp > 0)
  105. {
  106. //中值比目标值大
  107. pEnd = pEqal;
  108. }
  109. else
  110. {
  111. //中值比目标值小
  112. pBeg = pEqal + 1;
  113. }
  115. }
  117. return pRealEnd;
  118. }
  119. */
  121. KeyPos *__strstrs_find_first(KeyPos *pRealBeg, KeyPos *pRealEnd, size_t *pKeyLenArr, KeyPos *pKey)
  122. {
  123. KeyPos *pBeg = pRealBeg;
  124. KeyPos *pEnd = pRealEnd;
  126. while (pBeg != pEnd)
  127. {
  128. int nCmp = memcmp( pBeg->m_str, pKey->m_str, pBeg->m_strLen );
  129. if (nCmp == )
  130. {
  131. return pBeg;
  132. }
  134. ++pBeg;
  135. }
  137. return pRealEnd;
  138. }
  140. char *strstrs(char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound)
  141. {
  142. //作者:皇家救星 创建于:2016-10-19
  143. //有bug请发送邮件至89475049@qq.com 邮件主题注明:strstrs问题反馈
  144. //异常参数判断
  145. assert(strToFind != NULL);
  146. assert(strKeywords != NULL);
  147. assert(pFound != NULL);
  148. assert(nCnt > );
  150. //记录各个关键字首字符到集合中 后面判断用
  151. bool mpFirstChar[] = {}; //这里如果用位图,可以节省不少空间
  152. for (size_t i = ; i < nCnt; i++)
  153. {
  154. //linux和win的char类型定义不一样 这里统一强制转换一下
  155. assert(strKeywords[i] != NULL);
  156. //使用unsigned char 确保char类型是负数时强制转换不会超过256而越界
  157. mpFirstChar[(unsigned char)strKeywords[i][]] = true;
  158. if (strKeywords[i][] == '\0')
  159. {
  160. *pFound = i;
  161. return strToFind;
  162. }
  163. }
  165. KeyPos *sortKeywords = new KeyPos[nCnt];
  166. for (size_t i = ; i < nCnt; ++i)
  167. {
  168. sortKeywords[i].m_str = strKeywords[i];
  169. sortKeywords[i].m_strLen = strlen(strKeywords[i]);
  170. sortKeywords[i].m_nIdx = i;
  171. }
  172. //不能排序,会导致关键字位置混乱
  173. //qsort(sortKeywords, nCnt, sizeof(KeyPos), __strstrs_cmp);
  175. //使用unsigned char 确保char类型是负数时强制转换不会超过256而越界
  176. unsigned char *p = (unsigned char *)strToFind;
  177. KeyPos key;
  178. KeyPos *pEnd = sortKeywords + nCnt;
  179. KeyPos *pResult = NULL;
  180. while (*p)
  181. {
  182. //判断当前字符是否在关键串首字符集中
  183. if (mpFirstChar[*p])
  184. {
  185. key.m_str = (char *)p;
  186. pResult = __strstrs_find_first(sortKeywords, pEnd, NULL, &key);
  187. if (pResult != pEnd)
  188. {
  189. *pFound = pResult->m_nIdx;
  190. delete []sortKeywords;
  191. return reinterpret_cast<char *>(p);
  192. }
  193. }
  195. p++;
  196. }
  198. delete []sortKeywords;
  199. return NULL;
  200. }
  // List node used by strstrs_ext(): one node per keyword, chained per first byte.
  typedef struct tagKeyPosExt
  {
      size_t m_strLen;                // length of the keyword in bytes
      size_t m_strIdx;                // index of the keyword in the caller's array
      struct tagKeyPosExt *m_next;    // next keyword starting with the same byte
  }KeyPosExt;

  // Finds the earliest position in strToFind at which any keyword starts, using
  // a 256-entry table of keyword lists indexed by first byte.
  //
  // strToFind    string to search; must not be NULL
  // strKeywords  keyword array; neither the array nor any element may be NULL
  // nCnt         number of keywords; must be greater than 0
  // pFound       receives the 0-based index of the matched keyword; must not be NULL
  //
  // Returns strToFind if any keyword is empty (*pFound = index of the first empty
  // keyword), a pointer to the match inside strToFind if a keyword occurs, or
  // NULL otherwise. When several keywords match at the same position, the one
  // with the lowest index is reported.
  //
  // Author: 皇家救星, created 2016-10-19.
  // Bug reports: mail 89475049@qq.com with the subject "strstrs问题反馈".
  // 2019-05-22: fixed invalid memory access when the strings contain Chinese text.
  char *strstrs_ext(char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound)
  {
      // Argument checks.
      assert(strToFind != NULL);
      assert(strKeywords != NULL);
      assert(pFound != NULL);
      assert(nCnt > 0);

      // An empty keyword matches at the very start. Scanning in ascending order
      // reports the lowest such index, and nothing has been allocated yet.
      for (size_t i = 0; i < nCnt; ++i)
      {
          assert(strKeywords[i] != NULL);
          if (strKeywords[i][0] == '\0')
          {
              *pFound = static_cast<int>(i);
              return strToFind;
          }
      }

      // All list nodes come from one allocation instead of one new per keyword.
      // Note: new throws if the allocation fails.
      KeyPosExt *memPool = new KeyPosExt[nCnt];
      memset(memPool, 0, nCnt * sizeof(KeyPosExt));
      size_t nUsed = 0;

      // mpFirstChar[c].m_next heads the list of keywords whose first byte is c.
      KeyPosExt mpFirstChar[256];
      memset(mpFirstChar, 0, sizeof(mpFirstChar));

      // Insert from the last keyword to the first; each node goes to the front
      // of its list, so every list ends up in ascending keyword-index order.
      for (size_t i = nCnt; i-- > 0; )
      {
          KeyPosExt *pPos = &memPool[nUsed++];
          pPos->m_strIdx = i;
          pPos->m_strLen = strlen(strKeywords[i]);

          // char may be signed; index through unsigned char so the subscript
          // always stays inside 0..255.
          KeyPosExt *pHead = &mpFirstChar[(unsigned char)strKeywords[i][0]];
          pPos->m_next = pHead->m_next;
          pHead->m_next = pPos;
      }

      // Walk the haystack as unsigned char for the same reason as above.
      for (unsigned char *p = (unsigned char *)strToFind; *p; p++)
      {
          // Try every keyword that starts with the current byte.
          for (KeyPosExt *pPos = mpFirstChar[*p].m_next; pPos != NULL; pPos = pPos->m_next)
          {
              // strncmp stops at the haystack's terminating NUL, so a keyword
              // longer than the remaining text cannot cause a read past the end.
              if (strncmp((const char *)p, strKeywords[pPos->m_strIdx], pPos->m_strLen) == 0)
              {
                  *pFound = static_cast<int>(pPos->m_strIdx);
                  delete []memPool;
                  return reinterpret_cast<char *>(p);
              }
          }
      }

      delete []memPool;
      return NULL;
  }
  // Reference implementation: tries the keywords one after another with strstr().
  //
  // strToFind    string to search; must not be NULL
  // strKeywords  keyword array; neither the array nor any element may be NULL
  // nCnt         number of keywords; must be greater than 0
  // pFound       receives the 0-based index of the matched keyword; must not be NULL
  //
  // Returns strToFind if any keyword is empty (*pFound = index of the first empty
  // keyword), otherwise the strstr() result for the first keyword (in array
  // order) that occurs anywhere in strToFind, or NULL if none occurs.
  // Keyword order takes priority over position here: a later keyword that occurs
  // earlier in the string is not reported.
  //
  // Author: 皇家救星, created 2016-10-19.
  // Bug reports: mail 89475049@qq.com with the subject "strstrs问题反馈".
  // 2019-05-22: fixed invalid memory access when the strings contain Chinese text.
  char *strstrs_normal(char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound)
  {
      // Argument checks.
      assert(strToFind != NULL);
      assert(strKeywords != NULL);
      assert(pFound != NULL);
      assert(nCnt > 0);

      // An empty keyword matches at the very start; check all keywords for that
      // before any real search is attempted.
      for (size_t i = 0; i < nCnt; i++)
      {
          assert(strKeywords[i] != NULL);
          if (strKeywords[i][0] == '\0')
          {
              *pFound = static_cast<int>(i);
              return strToFind;
          }
      }

      for (size_t i = 0; i < nCnt; i++)
      {
          char *pHit = strstr(strToFind, strKeywords[i]);
          if (pHit != NULL)
          {
              *pFound = static_cast<int>(i);
              return pHit;
          }
      }

      return NULL;
  }
  307. //准确性测试
  308. int tets_strstrs1()
  309. {
  310. const char *strKeywords[] = {"", "select", "union", "or", "customer", "subsid",
  311. "", "group_id", "test", "from", "truncate", "s", "english1", "", "皇家"};
  312. const char *strSqls[] = {
  313. "select * from dual",
  314. "drop table",
  315. "truncate",
  316. "english",
  317. "goodby",
  318. "get 123",
  319. "123 get",
  320. " from"
  321. "D",
  322. "s",
  323. "89sfs89",
  324. "or",
  325. "sor",
  326. "orunion",
  327. "unionor",
  328. "83eejr3r9r9r33302002013345331224312343",
  329. "去9999给",
  330. "去皇家救星给"
  331. };
  333. for (int i = ; i < sizeof(strSqls) / sizeof(strSqls[]); ++i)
  334. {
  335. bool bFoundNormal = false;
  336. int nFoundNormal = ;
  337. if (NULL !=
  338. strstrs_normal(strSqls[i], strKeywords, sizeof(strKeywords) / sizeof(strKeywords[]), &nFoundNormal))
  339. {
  340. bFoundNormal = true;
  341. }
  343. bool bFoundExt = false;
  344. int nFoundExt = ;
  345. if (NULL !=
  346. strstrs_ext(strSqls[i], strKeywords, sizeof(strKeywords) / sizeof(strKeywords[]), &nFoundExt))
  347. {
  348. bFoundExt = true;
  349. }
  351. bool bFound = false;
  352. int nFound = ;
  353. if (NULL !=
  354. strstrs(strSqls[i], strKeywords, sizeof(strKeywords) / sizeof(strKeywords[]), &nFound))
  355. {
  356. bFound = true;
  357. }
  359. if ((bFound != bFoundExt || bFound != bFoundNormal)
  360. || (nFound != nFoundExt /*|| nFound != nFoundNormal*/))
  361. {
  362. printf("error! strSqls[i] = [%s]\n", strSqls[i]);
  363. printf("bFound = %d nFound = %d\n", bFound, nFound);
  364. printf("bFoundNormal = %d nFoundNormal = %d\n", bFoundNormal, nFoundNormal);
  365. printf("bFoundExt = %d nFoundExt = %d\n", bFoundExt, nFoundExt);
  366. return - - i * ;
  367. }
  368. }
  370. return ;
  371. }
  373. //效率比较及准确性测试函数
  374. void tets_strstrs(int nStep)
  375. {
  376. const int max_length = ; //max_length必须大于1024
  377. const int max_keyword = ;
  378. char *strToFound = new char[max_length + ]; //待查找的字符串
  379. char *strBackup = new char[max_length + ];
  380. char *strKeywords[max_keyword]; //关键字数组
  381. const char strBase64[] = {"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"};
  383. //为避免结果全是找不到关键字,随机将一个关键字复制到strToFound中
  384. //这样肯定会有找到关键字的情况,结果更有意义
  385. bool arrayFoundFlags[max_keyword] = {}; //标记是否把关键字复制到strToFound中
  386. int arrayFoundIdxs[max_keyword] = {}; //待替换的关键字(序号)
  387. int arrayFoundBeg[max_keyword] = {}; //在strToFound替换关键字的起始位置
  389. if (tets_strstrs1() != )
  390. {
  391. printf("函数功能验证失败\n");
  392. return;
  393. }
  395. srand((int)time(NULL));
  396. //初始化要查询的字符串
  397. for (int i = ; i < max_length; i++)
  398. {
  399. strToFound[i] = strBase64[rand() % ];
  400. }
  401. strToFound[max_length] = '\0';
  402. fprintf(stderr, "strToFound = [%s]\n", strToFound);
  404. //初始化查询关键字
  405. for (int i = ; i < max_keyword; i++)
  406. {
  407. size_t nKeyLen = max_length / ;
  408. size_t nKeyLenMin = ;
  409. strKeywords[i] = new char[nKeyLen + ];
  411. if (nKeyLen < nKeyLenMin)
  412. {
  413. fprintf(stderr, "max_length is too small\n");
  414. exit();
  415. }
  416. int nLen = rand() % (nKeyLen - nKeyLenMin) + nKeyLenMin;
  417. for (int j = ; j < nLen; j++)
  418. {
  419. strKeywords[i][j] = strBase64[rand() % ];
  420. }
  421. strKeywords[i][nLen] = '\0';
  423. //为避免随机结果都是查不到的情况,这里增加一些干预
  424. //if (0 != (rand() % 10))
  425. // {
  426. // //随机抽取约9/10的关键字 复制到待查字符串中
  427. // arrayFoundFlags[i] = true;
  428. // arrayFoundIdxs[i] = rand() % (i + 1);
  429. // arrayFoundBeg[i] = 0;
  430. // }
  432. fprintf(stderr, "strKeywords[%d] = [%s]\n", i, strKeywords[i]);
  433. fprintf(stderr, "%d: %d %d %d\n", i, arrayFoundFlags[i], arrayFoundIdxs[i], arrayFoundBeg[i]);
  434. }
  435. fflush(stderr);
  436. printf("RESULT: 函数类型 关键字总数 总耗时 总共找到次数\n");
  437. for (int cmpType = ; cmpType < ; cmpType++)
  438. {
  439. int nSn = ;
  440. double total_start = GetTickCount();
  441. for (size_t nCnt = ; nCnt < max_keyword; nCnt++)
  442. {
  443. bool bSetFound = arrayFoundFlags[nCnt];
  444. int nBeg = ;
  445. int nChange = ;
  446. int idxKeyword = ;
  447. if (bSetFound)
  448. {
  449. //把关键字替换到字符串中 这样能保证字符串肯定包含想要的字符串
  450. idxKeyword = arrayFoundIdxs[nCnt];
  451. nChange = strlen(strKeywords[idxKeyword]);
  452. nBeg = arrayFoundBeg[nCnt];
  453. memcpy(strBackup, strToFound + nBeg, nChange);
  454. strBackup[nChange] = '\0';
  455. memcpy(strToFound + nBeg, strKeywords[idxKeyword], nChange);
  456. }
  458. double start = GetTickCount();
  459. int nFoundCnt = ;
  461. //待查字符串从短到长
  462. for (int nStrlen = ; nStrlen < max_length; nStrlen += nStep)
  463. {
  464. //末尾要有\0 所以这里行把末尾字符备份起来 用\0覆盖 后面调用strstrs后再替换回去
  465. char cBak = strToFound[nStrlen];
  466. strToFound[nStrlen] = '\0';
  467. int nFound = -;
  468. const char *p;
  469. switch (cmpType)
  470. {
  471. case :
  472. p = strstrs(strToFound, strKeywords, nCnt + , &nFound);
  473. break;
  474. case :
  475. p = strstrs_ext(strToFound, strKeywords, nCnt + , &nFound);
  476. break;
  477. default:
  478. p = strstrs_normal(strToFound, strKeywords, nCnt + , &nFound);
  479. break;
  480. }
  482. //fprintf(stderr, "cmpType %d %d %d\n", cmpType, nSn, nFound);
  483. nSn++;
  484. if (p != NULL)
  485. {
  486. nFoundCnt++;
  487. }
  488. else
  489. {
  490. //假设明明有把关键字拷进去但还是返回找不到,说明结果有问题
  491. if (bSetFound && ((nBeg + nChange) <= nStrlen))
  492. {
  493. printf("cmpType = %d ###############################error!\n", cmpType);
  494. printf("strToFound = [%s], nStrlen = %d, nCnt = %d\n", strToFound, nStrlen, nCnt);
  495. printf("strKeywords[arrayFoundIdxs[nCnt]] = [%s], nBeg = %d, nChange = %d\n",
  496. strKeywords[arrayFoundIdxs[nCnt]], nBeg, nChange);
  497. exit();
  498. // switch (cmpType)
  499. // {
  500. // case 0:
  501. // p = strstrs(strToFound, strKeywords, nCnt + 1, &nFound);
  502. // break;
  503. // case 1:
  504. // p = strstrs_ext(strToFound, strKeywords, nCnt + 1, &nFound);
  505. // break;
  506. // default:
  507. // p = strstrs_normal(strToFound, strKeywords, nCnt + 1, &nFound);
  508. // break;
  509. // }
  510. }
  511. }
  513. strToFound[nStrlen] = cBak;
  514. }
  515. double end = GetTickCount();
  516. //函数类型 关键字序列 耗时 总共找到次数
  517. printf("RESULT: %d %d %f %d\n",
  518. cmpType, nCnt + , end - start, nFoundCnt);
  519. fflush(stdout);
  520. fflush(stderr);
  522. // if (nFoundCnt == 499)
  523. // {
  524. // printf("pre strToFound = [%s], strBackup = [%s], nCnt = %d nBeg %d nChange %d idxKeyword %d strKeywords[idxKeyword] %s\n",
  525. // strToFound, strBackup, nCnt, nBeg, nChange, idxKeyword, strKeywords[idxKeyword]);
  526. // }
  528. if (bSetFound)
  529. {
  530. memcpy(strToFound + nBeg, strBackup, nChange);
  531. }
  532. //
  533. // if (nFoundCnt == 499)
  534. // {
  535. // printf("strToFound = [%s], nCnt = %d nBeg %d nChange %d idxKeyword %d\n", strToFound, nCnt, nBeg, nChange, idxKeyword);
  536. // }
  537. }
  539. double total_end = GetTickCount();
  540. fprintf(stderr, "总共耗时[%f]\n", total_end - total_start);
  541. }
  543. //TODO: 此处应该要释放内存
  544. delete []strToFound;
  545. delete []strBackup;
  546. for (int i = ; i < max_keyword; i++)
  547. {
  548. delete []strKeywords[i];
  549. }
  550. }


0 代表strstrs

1 代表strstrs_ext

2 代表strstrs_normal




在任何情况下strstrs_ext都表现最好


