最近参与一个小project,需要编写一个针对英文单词的stem 算法。

1. 最为常见的stem 算法 就是The English (Porter2) stemming algorithm http://snowball.tartarus.org/algorithms/english/stemmer.html

// This file was generated automatically by the Snowball to Java compiler

package org.tartarus.snowball.ext;

import org.tartarus.snowball.Among;

 /**
* English (Porter2) stemmer. This class was automatically generated by a Snowball to Java compiler
* and implements the stemming algorithm defined by a Snowball script; the "line N" remarks in the
* method bodies are emitted by the generator and appear to refer to lines of that script.
* <p>
* {@link #stem()} is the only public entry point. All routines work on the inherited
* cursor / limit / limit_backward / bra / ket state and edit the current word in place through the
* inherited slice_del, slice_from and insert helpers.
* <p>
* Because the code is generated, the exact statement order and the cursor save/restore pairs matter;
* prefer regenerating from the Snowball script over hand-editing the logic.
*/ public class englishStemmer extends org.tartarus.snowball.SnowballStemmer { private static final long serialVersionUID = 1L; private final static englishStemmer methodObject = new englishStemmer (); private final static Among a_0[] = {
// The line above also declares serialVersionUID and methodObject, the shared instance that is
// passed to every Among entry below.
// NOTE(review): the Among arguments look like (text, index of a shorter entry of the same table
// or -1, result code, method name, method object) — confirm against org.tartarus.snowball.Among.
// In every caller a return value of 0 from find_among/find_among_b is treated as "no match";
// entries whose result code is -1 match but select no switch case, so they cause no edit.
// a_0: word prefixes looked up (forwards) by r_mark_regions before it sets the p1 mark.
new Among ( "arsen", -1, -1, "", methodObject ),
new Among ( "commun", -1, -1, "", methodObject ),
new Among ( "gener", -1, -1, "", methodObject )
}; private final static Among a_1[] = {
// a_1: apostrophe endings deleted at the start of r_Step_1a.
new Among ( "'", -1, 1, "", methodObject ),
new Among ( "'s'", 0, 1, "", methodObject ),
new Among ( "'s", -1, 1, "", methodObject )
}; private final static Among a_2[] = {
// a_2: plural-style endings handled by r_Step_1a (1: -> "ss", 2: -> "i"/"ie", 3: delete "s").
new Among ( "ied", -1, 2, "", methodObject ),
new Among ( "s", -1, 3, "", methodObject ),
new Among ( "ies", 1, 2, "", methodObject ),
new Among ( "sses", 1, 1, "", methodObject ),
new Among ( "ss", 1, -1, "", methodObject ),
new Among ( "us", 1, -1, "", methodObject )
}; private final static Among a_3[] = {
// a_3: endings inspected by r_Step_1b after it has removed an ed/ing-type suffix
// (1: append "e", 2: drop the last letter, 3: the empty entry, i.e. any other ending).
new Among ( "", -1, 3, "", methodObject ),
new Among ( "bb", 0, 2, "", methodObject ),
new Among ( "dd", 0, 2, "", methodObject ),
new Among ( "ff", 0, 2, "", methodObject ),
new Among ( "gg", 0, 2, "", methodObject ),
new Among ( "bl", 0, 1, "", methodObject ),
new Among ( "mm", 0, 2, "", methodObject ),
new Among ( "nn", 0, 2, "", methodObject ),
new Among ( "pp", 0, 2, "", methodObject ),
new Among ( "rr", 0, 2, "", methodObject ),
new Among ( "at", 0, 1, "", methodObject ),
new Among ( "tt", 0, 2, "", methodObject ),
new Among ( "iz", 0, 1, "", methodObject )
}; private final static Among a_4[] = {
// a_4: suffixes handled by r_Step_1b (1: replace with "ee" when in R1, 2: delete when a vowel precedes).
new Among ( "ed", -1, 2, "", methodObject ),
new Among ( "eed", 0, 1, "", methodObject ),
new Among ( "ing", -1, 2, "", methodObject ),
new Among ( "edly", -1, 2, "", methodObject ),
new Among ( "eedly", 3, 1, "", methodObject ),
new Among ( "ingly", -1, 2, "", methodObject )
}; private final static Among a_5[] = {
// a_5: suffixes handled by r_Step_2; the result code selects the replacement in its switch.
new Among ( "anci", -1, 3, "", methodObject ),
new Among ( "enci", -1, 2, "", methodObject ),
new Among ( "ogi", -1, 13, "", methodObject ),
new Among ( "li", -1, 16, "", methodObject ),
new Among ( "bli", 3, 12, "", methodObject ),
new Among ( "abli", 4, 4, "", methodObject ),
new Among ( "alli", 3, 8, "", methodObject ),
new Among ( "fulli", 3, 14, "", methodObject ),
new Among ( "lessli", 3, 15, "", methodObject ),
new Among ( "ousli", 3, 10, "", methodObject ),
new Among ( "entli", 3, 5, "", methodObject ),
new Among ( "aliti", -1, 8, "", methodObject ),
new Among ( "biliti", -1, 12, "", methodObject ),
new Among ( "iviti", -1, 11, "", methodObject ),
new Among ( "tional", -1, 1, "", methodObject ),
new Among ( "ational", 14, 7, "", methodObject ),
new Among ( "alism", -1, 8, "", methodObject ),
new Among ( "ation", -1, 7, "", methodObject ),
new Among ( "ization", 17, 6, "", methodObject ),
new Among ( "izer", -1, 6, "", methodObject ),
new Among ( "ator", -1, 7, "", methodObject ),
new Among ( "iveness", -1, 11, "", methodObject ),
new Among ( "fulness", -1, 9, "", methodObject ),
new Among ( "ousness", -1, 10, "", methodObject )
}; private final static Among a_6[] = {
// a_6: suffixes handled by r_Step_3.
new Among ( "icate", -1, 4, "", methodObject ),
new Among ( "ative", -1, 6, "", methodObject ),
new Among ( "alize", -1, 3, "", methodObject ),
new Among ( "iciti", -1, 4, "", methodObject ),
new Among ( "ical", -1, 4, "", methodObject ),
new Among ( "tional", -1, 1, "", methodObject ),
new Among ( "ational", 5, 2, "", methodObject ),
new Among ( "ful", -1, 5, "", methodObject ),
new Among ( "ness", -1, 5, "", methodObject )
}; private final static Among a_7[] = {
// a_7: suffixes deleted by r_Step_4 when they lie in R2 ("ion" additionally needs a preceding s or t).
new Among ( "ic", -1, 1, "", methodObject ),
new Among ( "ance", -1, 1, "", methodObject ),
new Among ( "ence", -1, 1, "", methodObject ),
new Among ( "able", -1, 1, "", methodObject ),
new Among ( "ible", -1, 1, "", methodObject ),
new Among ( "ate", -1, 1, "", methodObject ),
new Among ( "ive", -1, 1, "", methodObject ),
new Among ( "ize", -1, 1, "", methodObject ),
new Among ( "iti", -1, 1, "", methodObject ),
new Among ( "al", -1, 1, "", methodObject ),
new Among ( "ism", -1, 1, "", methodObject ),
new Among ( "ion", -1, 2, "", methodObject ),
new Among ( "er", -1, 1, "", methodObject ),
new Among ( "ous", -1, 1, "", methodObject ),
new Among ( "ant", -1, 1, "", methodObject ),
new Among ( "ent", -1, 1, "", methodObject ),
new Among ( "ment", 15, 1, "", methodObject ),
new Among ( "ement", 16, 1, "", methodObject )
}; private final static Among a_8[] = {
// a_8: the final letters examined by r_Step_5.
new Among ( "e", -1, 1, "", methodObject ),
new Among ( "l", -1, 2, "", methodObject )
}; private final static Among a_9[] = {
// a_9: whole words recognised by r_exception2; stem() skips Step_1b..Step_5 for them.
new Among ( "succeed", -1, -1, "", methodObject ),
new Among ( "proceed", -1, -1, "", methodObject ),
new Among ( "exceed", -1, -1, "", methodObject ),
new Among ( "canning", -1, -1, "", methodObject ),
new Among ( "inning", -1, -1, "", methodObject ),
new Among ( "earring", -1, -1, "", methodObject ),
new Among ( "herring", -1, -1, "", methodObject ),
new Among ( "outing", -1, -1, "", methodObject )
}; private final static Among a_10[] = {
// a_10: whole words recognised by r_exception1; codes 1..11 pick a fixed replacement,
// -1 leaves the word as it is.
new Among ( "andes", -1, -1, "", methodObject ),
new Among ( "atlas", -1, -1, "", methodObject ),
new Among ( "bias", -1, -1, "", methodObject ),
new Among ( "cosmos", -1, -1, "", methodObject ),
new Among ( "dying", -1, 3, "", methodObject ),
new Among ( "early", -1, 9, "", methodObject ),
new Among ( "gently", -1, 7, "", methodObject ),
new Among ( "howe", -1, -1, "", methodObject ),
new Among ( "idly", -1, 6, "", methodObject ),
new Among ( "lying", -1, 4, "", methodObject ),
new Among ( "news", -1, -1, "", methodObject ),
new Among ( "only", -1, 10, "", methodObject ),
new Among ( "singly", -1, 11, "", methodObject ),
new Among ( "skies", -1, 2, "", methodObject ),
new Among ( "skis", -1, 1, "", methodObject ),
new Among ( "sky", -1, -1, "", methodObject ),
new Among ( "tying", -1, 5, "", methodObject ),
new Among ( "ugly", -1, 8, "", methodObject )
// The next line closes a_10 and declares the character groupings passed to the
// in_grouping/out_grouping helpers together with a min and max character code.
// NOTE(review): assuming the usual Snowball bitmap encoding (8 characters per entry, bit 0 = min
// character), g_v (97..121) decodes to a e i o u y, g_v_WXY (89..121) to Y a e i o u w x y, and
// g_valid_LI (99..116) to c d e g h k m n r t — confirm against the runtime's in_grouping.
// B_Y_found records whether r_prelude rewrote any "y" as "Y"; r_postlude only runs its loop if it is set.
// I_p1 / I_p2 are the region marks computed by r_mark_regions and tested by r_R1 / r_R2.
}; private static final char g_v[] = {17, 65, 16, 1 }; private static final char g_v_WXY[] = {1, 17, 65, 208, 1 }; private static final char g_valid_LI[] = {55, 141, 2 }; private boolean B_Y_found;
private int I_p2;
private int I_p1; private void copy_from(englishStemmer other) {
// Copies this class's three state fields from other, then delegates to super.copy_from(other).
B_Y_found = other.B_Y_found;
I_p2 = other.I_p2;
I_p1 = other.I_p1;
super.copy_from(other);
} private boolean r_prelude() {
// Prelude (forward mode): clears B_Y_found, deletes a leading apostrophe, and rewrites as "Y"
// an initial "y" as well as every "y" found right after a g_v character, setting B_Y_found
// whenever such a rewrite happens. The cursor is restored after each part. Always returns true.
int v_1;
int v_2;
int v_3;
int v_4;
int v_5;
// (, line 25
// unset Y_found, line 26
B_Y_found = false;
// do, line 27
v_1 = cursor;
lab0: do {
// (, line 27
// [, line 27
bra = cursor;
// literal, line 27
if (!(eq_s(1, "'")))
{
break lab0;
}
// ], line 27
ket = cursor;
// delete, line 27
slice_del();
} while (false);
cursor = v_1;
// do, line 28
v_2 = cursor;
lab1: do {
// (, line 28
// [, line 28
bra = cursor;
// literal, line 28
if (!(eq_s(1, "y")))
{
break lab1;
}
// ], line 28
ket = cursor;
// <-, line 28
slice_from("Y");
// set Y_found, line 28
B_Y_found = true;
} while (false);
cursor = v_2;
// do, line 29
v_3 = cursor;
lab2: do {
// repeat, line 29
// Each pass of replab3 finds the next g_v character followed by "y" and rewrites that "y";
// the loop ends when the scan in golab5 reaches limit without a match.
replab3: while(true)
{
v_4 = cursor;
lab4: do {
// (, line 29
// goto, line 29
golab5: while(true)
{
v_5 = cursor;
lab6: do {
// (, line 29
if (!(in_grouping(g_v, 97, 121)))
{
break lab6;
}
// [, line 29
bra = cursor;
// literal, line 29
if (!(eq_s(1, "y")))
{
break lab6;
}
// ], line 29
ket = cursor;
cursor = v_5;
break golab5;
} while (false);
cursor = v_5;
if (cursor >= limit)
{
break lab4;
}
cursor++;
}
// <-, line 29
slice_from("Y");
// set Y_found, line 29
B_Y_found = true;
continue replab3;
} while (false);
cursor = v_4;
break replab3;
}
} while (false);
cursor = v_3;
return true;
} private boolean r_mark_regions() {
// Computes the region marks I_p1 and I_p2; both start at limit (empty regions).
// p1 is set right after a matched a_0 prefix or, failing that, after the first character
// outside g_v that follows a g_v character. p2 is set after the next such
// "g_v character, then character outside g_v" sequence beyond p1.
// If limit is reached while scanning, the marks not yet assigned keep the value limit.
// The cursor is restored before returning; always returns true.
int v_1;
int v_2;
// (, line 32
I_p1 = limit;
I_p2 = limit;
// do, line 35
v_1 = cursor;
lab0: do {
// (, line 35
// or, line 41
lab1: do {
v_2 = cursor;
lab2: do {
// among, line 36
if (find_among(a_0, 3) == 0)
{
break lab2;
}
break lab1;
} while (false);
cursor = v_2;
// (, line 41
// gopast, line 41
golab3: while(true)
{
lab4: do {
if (!(in_grouping(g_v, 97, 121)))
{
break lab4;
}
break golab3;
} while (false);
if (cursor >= limit)
{
break lab0;
}
cursor++;
}
// gopast, line 41
golab5: while(true)
{
lab6: do {
if (!(out_grouping(g_v, 97, 121)))
{
break lab6;
}
break golab5;
} while (false);
if (cursor >= limit)
{
break lab0;
}
cursor++;
}
} while (false);
// setmark p1, line 42
I_p1 = cursor;
// gopast, line 43
golab7: while(true)
{
lab8: do {
if (!(in_grouping(g_v, 97, 121)))
{
break lab8;
}
break golab7;
} while (false);
if (cursor >= limit)
{
break lab0;
}
cursor++;
}
// gopast, line 43
golab9: while(true)
{
lab10: do {
if (!(out_grouping(g_v, 97, 121)))
{
break lab10;
}
break golab9;
} while (false);
if (cursor >= limit)
{
break lab0;
}
cursor++;
}
// setmark p2, line 43
I_p2 = cursor;
} while (false);
cursor = v_1;
return true;
} private boolean r_shortv() {
// Backward test used by r_Step_1b and r_Step_5. It succeeds when the characters before the
// cursor are, reading right to left, either
//   (a) one outside g_v_WXY, then one in g_v, then one outside g_v; or
//   (b) one outside g_v, then one in g_v, with the cursor then at limit_backward.
// The cursor is left where matching stopped; both callers save and restore it themselves.
int v_1;
// (, line 49
// or, line 51
lab0: do {
v_1 = limit - cursor;
lab1: do {
// (, line 50
if (!(out_grouping_b(g_v_WXY, 89, 121)))
{
break lab1;
}
if (!(in_grouping_b(g_v, 97, 121)))
{
break lab1;
}
if (!(out_grouping_b(g_v, 97, 121)))
{
break lab1;
}
break lab0;
} while (false);
cursor = limit - v_1;
// (, line 52
if (!(out_grouping_b(g_v, 97, 121)))
{
return false;
}
if (!(in_grouping_b(g_v, 97, 121)))
{
return false;
}
// atlimit, line 52
if (cursor > limit_backward)
{
return false;
}
} while (false);
return true;
} private boolean r_R1() {
// True when the cursor is at or after the p1 mark (I_p1 <= cursor).
if (!(I_p1 <= cursor))
{
return false;
}
return true;
} private boolean r_R2() {
// True when the cursor is at or after the p2 mark (I_p2 <= cursor).
if (!(I_p2 <= cursor))
{
return false;
}
return true;
} private boolean r_Step_1a() {
// Step 1a (backward mode). First tries to delete a trailing a_1 apostrophe ending, then looks
// up a_2: "sses" -> "ss"; "ied"/"ies" -> "i" when at least two characters precede the suffix,
// otherwise "ie"; "s" is deleted only if a g_v character occurs somewhere before the character
// that immediately precedes it; "ss"/"us" are left alone.
// Returns false when no a_2 ending matches or the "s" condition fails, true otherwise.
int among_var;
int v_1;
int v_2;
// (, line 58
// try, line 59
v_1 = limit - cursor;
lab0: do {
// (, line 59
// [, line 60
ket = cursor;
// substring, line 60
among_var = find_among_b(a_1, 3);
if (among_var == 0)
{
cursor = limit - v_1;
break lab0;
}
// ], line 60
bra = cursor;
switch(among_var) {
// case 0 cannot be reached after the check above; it is part of the generated pattern.
case 0:
cursor = limit - v_1;
break lab0;
case 1:
// (, line 62
// delete, line 62
slice_del();
break;
}
} while (false);
// [, line 65
ket = cursor;
// substring, line 65
among_var = find_among_b(a_2, 6);
if (among_var == 0)
{
return false;
}
// ], line 65
bra = cursor;
switch(among_var) {
case 0:
return false;
case 1:
// (, line 66
// <-, line 66
slice_from("ss");
break;
case 2:
// (, line 68
// or, line 68
lab1: do {
v_2 = limit - cursor;
lab2: do {
// (, line 68
// hop, line 68
{
// Moving the cursor two characters back only tests the remaining length; bra and ket
// are already set, so the replacement below still targets the matched suffix.
int c = cursor - 2;
if (limit_backward > c || c > limit)
{
break lab2;
}
cursor = c;
}
// <-, line 68
slice_from("i");
break lab1;
} while (false);
cursor = limit - v_2;
// <-, line 68
slice_from("ie");
} while (false);
break;
case 3:
// (, line 69
// next, line 69
if (cursor <= limit_backward)
{
return false;
}
cursor--;
// gopast, line 69
golab3: while(true)
{
lab4: do {
if (!(in_grouping_b(g_v, 97, 121)))
{
break lab4;
}
break golab3;
} while (false);
if (cursor <= limit_backward)
{
return false;
}
cursor--;
}
// delete, line 69
slice_del();
break;
}
return true;
} private boolean r_Step_1b() {
// Step 1b (backward mode), driven by a_4.
// Code 1 ("eed", "eedly"): replaced by "ee" if the suffix starts at or after p1.
// Code 2 ("ed", "edly", "ing", "ingly"): deleted if a g_v character occurs before the suffix;
// the new ending is then looked up in a_3: code 1 appends "e", code 2 removes the last
// letter, code 3 appends "e" only when the cursor is exactly at p1 and r_shortv() holds.
// Returns false as soon as any of these conditions fails.
int among_var;
int v_1;
int v_3;
int v_4;
// (, line 74
// [, line 75
ket = cursor;
// substring, line 75
among_var = find_among_b(a_4, 6);
if (among_var == 0)
{
return false;
}
// ], line 75
bra = cursor;
switch(among_var) {
case 0:
return false;
case 1:
// (, line 77
// call R1, line 77
if (!r_R1())
{
return false;
}
// <-, line 77
slice_from("ee");
break;
case 2:
// (, line 79
// test, line 80
v_1 = limit - cursor;
// gopast, line 80
golab0: while(true)
{
lab1: do {
if (!(in_grouping_b(g_v, 97, 121)))
{
break lab1;
}
break golab0;
} while (false);
if (cursor <= limit_backward)
{
return false;
}
cursor--;
}
cursor = limit - v_1;
// delete, line 80
slice_del();
// test, line 81
v_3 = limit - cursor;
// substring, line 81
among_var = find_among_b(a_3, 13);
if (among_var == 0)
{
return false;
}
cursor = limit - v_3;
switch(among_var) {
case 0:
return false;
case 1:
// (, line 83
// <+, line 83
{
int c = cursor;
insert(cursor, cursor, "e");
cursor = c;
}
break;
case 2:
// (, line 86
// [, line 86
ket = cursor;
// next, line 86
if (cursor <= limit_backward)
{
return false;
}
cursor--;
// ], line 86
bra = cursor;
// delete, line 86
slice_del();
break;
case 3:
// (, line 87
// atmark, line 87
if (cursor != I_p1)
{
return false;
}
// test, line 87
v_4 = limit - cursor;
// call shortv, line 87
if (!r_shortv())
{
return false;
}
cursor = limit - v_4;
// <+, line 87
{
int c = cursor;
insert(cursor, cursor, "e");
cursor = c;
}
break;
}
break;
}
return true;
} private boolean r_Step_1c() {
// Step 1c (backward mode): replaces a final "y" or "Y" by "i" when the character before it is
// outside g_v and that character is not the first one of the word (the cursor must still be
// beyond limit_backward after consuming it). Returns false when no replacement is made.
int v_1;
int v_2;
// (, line 93
// [, line 94
ket = cursor;
// or, line 94
lab0: do {
v_1 = limit - cursor;
lab1: do {
// literal, line 94
if (!(eq_s_b(1, "y")))
{
break lab1;
}
break lab0;
} while (false);
cursor = limit - v_1;
// literal, line 94
if (!(eq_s_b(1, "Y")))
{
return false;
}
} while (false);
// ], line 94
bra = cursor;
if (!(out_grouping_b(g_v, 97, 121)))
{
return false;
}
// not, line 95
{
v_2 = limit - cursor;
lab2: do {
// atlimit, line 95
if (cursor > limit_backward)
{
break lab2;
}
return false;
} while (false);
cursor = limit - v_2;
}
// <-, line 96
slice_from("i");
return true;
} private boolean r_Step_2() {
// Step 2 (backward mode): finds the longest a_5 suffix, requires it to start at or after p1,
// and rewrites it according to its result code (see the switch). Code 13 ("ogi") additionally
// needs a preceding "l"; code 16 ("li") needs a preceding g_valid_LI character and is deleted.
// Returns false when nothing matches or a condition fails.
int among_var;
// (, line 99
// [, line 100
ket = cursor;
// substring, line 100
among_var = find_among_b(a_5, 24);
if (among_var == 0)
{
return false;
}
// ], line 100
bra = cursor;
// call R1, line 100
if (!r_R1())
{
return false;
}
switch(among_var) {
case 0:
return false;
case 1:
// (, line 101
// <-, line 101
slice_from("tion");
break;
case 2:
// (, line 102
// <-, line 102
slice_from("ence");
break;
case 3:
// (, line 103
// <-, line 103
slice_from("ance");
break;
case 4:
// (, line 104
// <-, line 104
slice_from("able");
break;
case 5:
// (, line 105
// <-, line 105
slice_from("ent");
break;
case 6:
// (, line 107
// <-, line 107
slice_from("ize");
break;
case 7:
// (, line 109
// <-, line 109
slice_from("ate");
break;
case 8:
// (, line 111
// <-, line 111
slice_from("al");
break;
case 9:
// (, line 112
// <-, line 112
slice_from("ful");
break;
case 10:
// (, line 114
// <-, line 114
slice_from("ous");
break;
case 11:
// (, line 116
// <-, line 116
slice_from("ive");
break;
case 12:
// (, line 118
// <-, line 118
slice_from("ble");
break;
case 13:
// (, line 119
// literal, line 119
if (!(eq_s_b(1, "l")))
{
return false;
}
// <-, line 119
slice_from("og");
break;
case 14:
// (, line 120
// <-, line 120
slice_from("ful");
break;
case 15:
// (, line 121
// <-, line 121
slice_from("less");
break;
case 16:
// (, line 122
if (!(in_grouping_b(g_valid_LI, 99, 116)))
{
return false;
}
// delete, line 122
slice_del();
break;
}
return true;
} private boolean r_Step_3() {
// Step 3 (backward mode): finds an a_6 suffix that starts at or after p1 and rewrites it per
// result code; code 5 ("ful", "ness") is deleted, and code 6 ("ative") is deleted only if it
// also starts at or after p2. Returns false when nothing matches or a condition fails.
int among_var;
// (, line 126
// [, line 127
ket = cursor;
// substring, line 127
among_var = find_among_b(a_6, 9);
if (among_var == 0)
{
return false;
}
// ], line 127
bra = cursor;
// call R1, line 127
if (!r_R1())
{
return false;
}
switch(among_var) {
case 0:
return false;
case 1:
// (, line 128
// <-, line 128
slice_from("tion");
break;
case 2:
// (, line 129
// <-, line 129
slice_from("ate");
break;
case 3:
// (, line 130
// <-, line 130
slice_from("al");
break;
case 4:
// (, line 132
// <-, line 132
slice_from("ic");
break;
case 5:
// (, line 134
// delete, line 134
slice_del();
break;
case 6:
// (, line 136
// call R2, line 136
if (!r_R2())
{
return false;
}
// delete, line 136
slice_del();
break;
}
return true;
} private boolean r_Step_4() {
// Step 4 (backward mode): deletes an a_7 suffix that starts at or after p2; for "ion" (code 2)
// the character before the suffix must also be "s" or "t".
// Returns false when nothing matches or a condition fails.
int among_var;
int v_1;
// (, line 140
// [, line 141
ket = cursor;
// substring, line 141
among_var = find_among_b(a_7, 18);
if (among_var == 0)
{
return false;
}
// ], line 141
bra = cursor;
// call R2, line 141
if (!r_R2())
{
return false;
}
switch(among_var) {
case 0:
return false;
case 1:
// (, line 144
// delete, line 144
slice_del();
break;
case 2:
// (, line 145
// or, line 145
lab0: do {
v_1 = limit - cursor;
lab1: do {
// literal, line 145
if (!(eq_s_b(1, "s")))
{
break lab1;
}
break lab0;
} while (false);
cursor = limit - v_1;
// literal, line 145
if (!(eq_s_b(1, "t")))
{
return false;
}
} while (false);
// delete, line 145
slice_del();
break;
}
return true;
} private boolean r_Step_5() {
// Step 5 (backward mode), driven by a_8.
// Final "e": deleted if it is at or after p2, or if it is at or after p1 and r_shortv() does
// not hold for the text before it.
// Final "l": deleted if it is at or after p2 and preceded by another "l".
// Returns false when nothing is deleted.
int among_var;
int v_1;
int v_2;
// (, line 149
// [, line 150
ket = cursor;
// substring, line 150
among_var = find_among_b(a_8, 2);
if (among_var == 0)
{
return false;
}
// ], line 150
bra = cursor;
switch(among_var) {
case 0:
return false;
case 1:
// (, line 151
// or, line 151
lab0: do {
v_1 = limit - cursor;
lab1: do {
// call R2, line 151
if (!r_R2())
{
break lab1;
}
break lab0;
} while (false);
cursor = limit - v_1;
// (, line 151
// call R1, line 151
if (!r_R1())
{
return false;
}
// not, line 151
{
v_2 = limit - cursor;
lab2: do {
// call shortv, line 151
if (!r_shortv())
{
break lab2;
}
return false;
} while (false);
cursor = limit - v_2;
}
} while (false);
// delete, line 151
slice_del();
break;
case 2:
// (, line 152
// call R2, line 152
if (!r_R2())
{
return false;
}
// literal, line 152
if (!(eq_s_b(1, "l")))
{
return false;
}
// delete, line 152
slice_del();
break;
}
return true;
} private boolean r_exception2() {
// Backward test: true when an a_9 entry matches and the match reaches limit_backward,
// i.e. the whole remaining word is one of those entries. Makes no edit.
// (, line 156
// [, line 158
ket = cursor;
// substring, line 158
if (find_among_b(a_9, 8) == 0)
{
return false;
}
// ], line 158
bra = cursor;
// atlimit, line 158
if (cursor > limit_backward)
{
return false;
}
return true;
} private boolean r_exception1() {
// Forward test for the a_10 special cases: the match must extend to limit (the whole word).
// Codes 1..11 replace the word by a fixed stem; entries with code -1 are accepted unchanged.
// Returns true when the word was handled here, false otherwise.
int among_var;
// (, line 168
// [, line 170
bra = cursor;
// substring, line 170
among_var = find_among(a_10, 18);
if (among_var == 0)
{
return false;
}
// ], line 170
ket = cursor;
// atlimit, line 170
if (cursor < limit)
{
return false;
}
switch(among_var) {
case 0:
return false;
case 1:
// (, line 174
// <-, line 174
slice_from("ski");
break;
case 2:
// (, line 175
// <-, line 175
slice_from("sky");
break;
case 3:
// (, line 176
// <-, line 176
slice_from("die");
break;
case 4:
// (, line 177
// <-, line 177
slice_from("lie");
break;
case 5:
// (, line 178
// <-, line 178
slice_from("tie");
break;
case 6:
// (, line 182
// <-, line 182
slice_from("idl");
break;
case 7:
// (, line 183
// <-, line 183
slice_from("gentl");
break;
case 8:
// (, line 184
// <-, line 184
slice_from("ugli");
break;
case 9:
// (, line 185
// <-, line 185
slice_from("earli");
break;
case 10:
// (, line 186
// <-, line 186
slice_from("onli");
break;
case 11:
// (, line 187
// <-, line 187
slice_from("singl");
break;
}
return true;
} private boolean r_postlude() {
// Postlude (forward mode): returns false immediately if B_Y_found is unset; otherwise
// repeatedly scans forward for "Y" and rewrites each one as "y", undoing the marking done by
// r_prelude, and returns true.
int v_1;
int v_2;
// (, line 203
// Boolean test Y_found, line 203
if (!(B_Y_found))
{
return false;
}
// repeat, line 203
replab0: while(true)
{
v_1 = cursor;
lab1: do {
// (, line 203
// goto, line 203
golab2: while(true)
{
v_2 = cursor;
lab3: do {
// (, line 203
// [, line 203
bra = cursor;
// literal, line 203
if (!(eq_s(1, "Y")))
{
break lab3;
}
// ], line 203
ket = cursor;
cursor = v_2;
break golab2;
} while (false);
cursor = v_2;
if (cursor >= limit)
{
break lab1;
}
cursor++;
}
// <-, line 203
slice_from("y");
continue replab0;
} while (false);
cursor = v_1;
break replab0;
}
return true;
} public boolean stem() {
// Entry point: stems the current word in place and always returns true.
// Order of work:
//   1. r_exception1(): if the whole word is a listed special case, nothing else runs.
//   2. If the cursor cannot hop 3 characters forward (cursor + 3 > limit), the word is left unchanged.
//   3. Otherwise: r_prelude, r_mark_regions, then in backward mode r_Step_1a followed by either
//      r_exception2 (stop) or r_Step_1b, 1c, 2, 3, 4, 5; finally r_postlude in forward mode.
// Each step's boolean result is ignored ("do"), and the cursor is restored around every call.
int v_1;
int v_2;
int v_3;
int v_4;
int v_5;
int v_6;
int v_7;
int v_8;
int v_9;
int v_10;
int v_11;
int v_12;
int v_13;
// (, line 205
// or, line 207
lab0: do {
v_1 = cursor;
lab1: do {
// call exception1, line 207
if (!r_exception1())
{
break lab1;
}
break lab0;
} while (false);
cursor = v_1;
lab2: do {
// not, line 208
{
v_2 = cursor;
lab3: do {
// hop, line 208
{
int c = cursor + 3;
if (0 > c || c > limit)
{
break lab3;
}
cursor = c;
}
// The hop succeeded, so the word is long enough: leave lab2 and run the full algorithm.
break lab2;
} while (false);
cursor = v_2;
}
// The hop failed: finish without modifying the word.
break lab0;
} while (false);
cursor = v_1;
// (, line 208
// do, line 209
v_3 = cursor;
lab4: do {
// call prelude, line 209
if (!r_prelude())
{
break lab4;
}
} while (false);
cursor = v_3;
// do, line 210
v_4 = cursor;
lab5: do {
// call mark_regions, line 210
if (!r_mark_regions())
{
break lab5;
}
} while (false);
cursor = v_4;
// backwards, line 211
// Switch to backward processing: the current position becomes the backward limit and the
// cursor starts at the end of the word.
limit_backward = cursor; cursor = limit;
// (, line 211
// do, line 213
v_5 = limit - cursor;
lab6: do {
// call Step_1a, line 213
if (!r_Step_1a())
{
break lab6;
}
} while (false);
cursor = limit - v_5;
// or, line 215
lab7: do {
v_6 = limit - cursor;
lab8: do {
// call exception2, line 215
if (!r_exception2())
{
break lab8;
}
break lab7;
} while (false);
cursor = limit - v_6;
// (, line 215
// do, line 217
v_7 = limit - cursor;
lab9: do {
// call Step_1b, line 217
if (!r_Step_1b())
{
break lab9;
}
} while (false);
cursor = limit - v_7;
// do, line 218
v_8 = limit - cursor;
lab10: do {
// call Step_1c, line 218
if (!r_Step_1c())
{
break lab10;
}
} while (false);
cursor = limit - v_8;
// do, line 220
v_9 = limit - cursor;
lab11: do {
// call Step_2, line 220
if (!r_Step_2())
{
break lab11;
}
} while (false);
cursor = limit - v_9;
// do, line 221
v_10 = limit - cursor;
lab12: do {
// call Step_3, line 221
if (!r_Step_3())
{
break lab12;
}
} while (false);
cursor = limit - v_10;
// do, line 222
v_11 = limit - cursor;
lab13: do {
// call Step_4, line 222
if (!r_Step_4())
{
break lab13;
}
} while (false);
cursor = limit - v_11;
// do, line 224
v_12 = limit - cursor;
lab14: do {
// call Step_5, line 224
if (!r_Step_5())
{
break lab14;
}
} while (false);
cursor = limit - v_12;
} while (false);
// Back to forward processing for the postlude.
cursor = limit_backward; // do, line 227
v_13 = cursor;
lab15: do {
// call postlude, line 227
if (!r_postlude())
{
break lab15;
}
} while (false);
cursor = v_13;
} while (false);
return true;
} public boolean equals( Object o ) {
// Any englishStemmer instance is considered equal to any other; the argument's state is not inspected.
return o instanceof englishStemmer;
} public int hashCode() {
// Same value for every instance (hash of the class name), matching equals above.
return englishStemmer.class.getName().hashCode();
} }

porter2 stemming algorithm

然而,Porter stemming 仅仅是一种基于后缀的词干提取技术,它只定义了一些基本的后缀规则,能识别出 "books"->"book" 等规则变化;但对于 "bought"->"buy"、"brought"->"bring" 等不规则形式,它并不能识别出来。

2. The dragon toolkit (http://dragon.ischool.drexel.edu/download.asp)

后来发现了上面这个 NLP 处理工具包,其中的 EngLemmatiser 类就是它的 stem 类,能提取出单词的词干。

它首先定义一些基本的后缀规则(只有十几条),然后定义一些独立于这些规则的异常词库(master-slave 的形式),这样就能基本实现单词词干的正确提取,解决了 Porter stemming 存在的问题。

// Path handed to EngLemmatiser as its dictionary location (a relative path here).
String dictionaryPath = "lemmatiser";
// NOTE(review): the meaning of the two boolean constructor flags is not visible in this
// snippet — check the Dragon Toolkit documentation before changing them.
EngLemmatiser lemmatiser = new EngLemmatiser(dictionaryPath, false, true); String a = "brought";
// Lemmatise the sample word and print whatever the lemmatiser returns.
String lemmatizedWord = lemmatiser.lemmatize(a);
System.out.println(lemmatizedWord);

然而我还是觉得,在规则基础之上附加词典的技术过于死板,不够灵活。

3. Stanford CoreNLP

后来发现了斯坦福大学的一个 NLP 工具(Stanford CoreNLP),其提取词干的技术是:针对大量语料库进行机器学习,利用有限自动机提炼并生成规则(不必附加词典)。它能很好地解决词干的提取问题,准确率很高;虽然对地名、人名等专有名词识别不出来,但已经满足了基本的需求。

    // Sample word to be stemmed.
    String word="magnificus";
// Morphology is the stemming class of the Stanford NLP tool described in the surrounding text.
Morphology morph=new Morphology();
// Print the stem returned for the sample word.
System.out.println(morph.stem(word));

English Morphology的更多相关文章

  1. How to Write a Spelling Corrector

    http://norvig.com/spell-correct.html Feb 2007to August 2016 How to Write a Spelling Corrector One we ...

  2. Lesson 14 Do you speak English?

    Text I had an amusing experience last year. After I had left a small village in the south of France. ...

  3. 运行nltk示例 Resource u'tokenizers punkt english.pickle' not found解决

    nltk安装完毕后,编写如下示例程序并运行,报Resource u'tokenizers/punkt/english.pickle' not found错误 import nltk sentence ...

  4. [LeetCode] Reconstruct Original Digits from English 从英文中重建数字

    Given a non-empty string containing an out-of-order English representation of digits 0-9, output the ...

  5. [LeetCode] Integer to English Words 整数转为英文单词

    Convert a non-negative integer to its english words representation. Given input is guaranteed to be ...

  6. leetcode-【hard】273. Integer to English Words

    题目: 273. Integer to English Words Convert a non-negative integer to its english words representation ...

  7. [LeetCode] 423 Reconstruct Original Digits from English

    Given a non-empty string containing an out-of-order English representation of digits 0-9, output the ...

  8. [CareerCup] 17.7 English Phrase Describe Integer 英文单词表示数字

    17.7 Given any integer, print an English phrase that describes the integer (e.g., "One Thousand ...

  9. English随笔1

    英语中的基本五大句型  1.Subject (主语) + Verb (谓语) Li Ming works The accident happened 2.Subject (主语) + Link. V( ...

随机推荐

  1. PHP5 session 详解

    http协议是WEB服务器与客户端(浏览器)相互通信的协议,它是一种无状态协议.所谓无状态,指的是不会维护http请求数据,http请求是独立的,非持久的.而越来越复杂的WEB应用,需要保存一些用户状 ...

  2. Codeforces Round #240 (Div. 2)(A -- D)

    点我看题目 A. Mashmokh and Lights time limit per test:1 secondmemory limit per test:256 megabytesinput:st ...

  3. CodeForces 299B Ksusha the Squirrel

    http://codeforces.com/problemset/problem/299/B 题意 :这个题挺简单的,就是说这个姑娘不喜欢走有石头的扇形,所以给你一个k的值,代表她一次可以跳多少扇形. ...

  4. hdu 1525 Euclid's Game 博弈论

    思路:两个数a和b,总会出现的一个局面是b,a%b,这是必然的,如果a>=b&&a<2*b,那么只有一种情况,直接到b,a%b.否则有多种情况. 对于a/b==1这种局面, ...

  5. c缺陷与陷阱笔记-第二章 语法陷阱

    1.函数的调用和番薯返回值是函数指针的声明 定义一个函数指针,例如  int (*fp)(float),这个函数的返回值是Int,参数是1个float类型,调用这个函数的方法是 (*fp)(),还有f ...

  6. 简单Sql语句统计每年每个月的数据,每个月为数据的每列,简单SQL练习

    有一张表,数据如下 请写出结果为以下的SQL语句. 在mysql中创建表 CREATE TABLE `aa` (  `id` int(10) NOT NULL AUTO_INCREMENT COMME ...

  7. 李洪强iOS开发之【零基础学习iOS开发】【02-C语言】08-基本运算

    计算机的基本能力就是计算,所以一门程序设计语言的计算能力是非常重要的.C语言之所以无所不能,是因为它不仅有丰富的数据类型,还有强大的计算能力.C语言一共有34种运算符,包括了常见的加减乘除运算.这讲就 ...

  8. Cobalt Strike

    http://www.77169.com/hack/201512/222080.shtm

  9. 函数fsp_seg_inode_page_get_nth_inode

    #define FSEG_ARR_OFFSET (FSEG_PAGE_DATA + FLST_NODE_SIZE) #define FSEG_PAGE_DATA FIL_PAGE_DATA #defi ...

  10. WEB架构师成长之路-架构师都要懂哪些知识 转

    Web架构师究竟都要学些什么?具备哪些能力呢?先网上查查架构师的大概的定义,参见架构师修炼之道这篇文章,写的还不错,再查查公司招聘Web架构师的要求. 总结起来大概有下面几点技能要求: 一. 架构师有 ...