/* Suffix-stripping stemmer for English words. */
'use strict';

/*
 * Define a few standard suffix manipulations.
 *
 * Both maps are keyed by a suffix; the value is the text that replaces
 * that suffix (an empty string removes the suffix altogether).
 */

var step2list = {
  'ational': 'ate',
  'tional': 'tion',
  'enci': 'ence',
  'anci': 'ance',
  'izer': 'ize',
  'bli': 'ble',
  'alli': 'al',
  'entli': 'ent',
  'eli': 'e',
  'ousli': 'ous',
  'ization': 'ize',
  'ation': 'ate',
  'ator': 'ate',
  'alism': 'al',
  'iveness': 'ive',
  'fulness': 'ful',
  'ousness': 'ous',
  'aliti': 'al',
  'iviti': 'ive',
  'biliti': 'ble',
  'logi': 'log'
};

var step3list = {
  'icate': 'ic',
  'ative': '',
  'alize': 'al',
  'iciti': 'ic',
  'ical': 'ic',
  'ful': '',
  'ness': ''
};
/*
 * Define a few consonant-vowel sequences.
 *
 * The first four values are regular-expression source fragments; they are
 * concatenated into the expressions defined after them.
 */

/* Any single character other than `a`, `e`, `i`, `o`, or `u`. */
var consonant = '[^aeiou]';

/* One of `a`, `e`, `i`, `o`, `u`, or `y`. */
var vowel = '[aeiouy]';

/* A consonant followed by any run of characters outside `aeiouy`. */
var consonantSequence = '(' + consonant + '[^aeiouy]*)';

/* A vowel followed by any run of `a`, `e`, `i`, `o`, or `u`. */
var vowelSequence = '(' + vowel + '[aeiou]*)';

/*
 * Starts with an optional consonant sequence, then at least one
 * vowel-sequence/consonant-sequence pair.
 */

var EXPRESSION_MEASURE_GREATER_THAN_0 = new RegExp(
  '^' + consonantSequence + '?' + vowelSequence + consonantSequence
);

/*
 * The whole string is an optional consonant sequence, exactly one
 * vowel-sequence/consonant-sequence pair, and an optional trailing
 * vowel sequence.
 */

var EXPRESSION_MEASURE_EQUAL_TO_1 = new RegExp(
  '^' + consonantSequence + '?' + vowelSequence + consonantSequence +
  vowelSequence + '?$'
);

/*
 * Starts with an optional consonant sequence, then at least two
 * vowel-sequence/consonant-sequence pairs.
 */

var EXPRESSION_MEASURE_GREATER_THAN_1 = new RegExp(
  '^' + consonantSequence + '?' + '(' + vowelSequence +
  consonantSequence + '){2,}'
);

/*
 * Starts with an optional consonant sequence that is followed by a vowel.
 */

var EXPRESSION_VOWEL_IN_STEM = new RegExp(
  '^' + consonantSequence + '?' + vowel
);

/*
 * The whole string is a consonant sequence, one vowel, and one final
 * character that is neither a vowel nor `w`, `x`, or `y`.
 */

var EXPRESSION_CONSONANT_LIKE = new RegExp(
  '^' + consonantSequence + vowel + '[^aeiouwxy]$'
);
/*
 * Define a few exception-expressions.
 *
 * Where an expression has a first capture group, that group holds the
 * part of the word that precedes the matched suffix.
 */

/* Ends in `ll`. */
var EXPRESSION_SUFFIX_LL = /ll$/;

/* Ends in `e`; group 1 is everything before it. */
var EXPRESSION_SUFFIX_E = /^(.+?)e$/;

/* Ends in `y`; group 1 is everything before it. */
var EXPRESSION_SUFFIX_Y = /^(.+?)y$/;

/* Ends in `sion` or `tion`; group 1 keeps the `s`/`t`, dropping `ion`. */
var EXPRESSION_SUFFIX_ION = /^(.+?(s|t))(ion)$/;

/* Ends in `ed` or `ing`; group 1 is everything before it. */
var EXPRESSION_SUFFIX_ED_OR_ING = /^(.+?)(ed|ing)$/;

/* Ends in `at`, `bl`, or `iz`. */
var EXPRESSION_SUFFIX_AT_OR_BL_OR_IZ = /(at|bl|iz)$/;

/* Ends in `eed`; group 1 is everything before it. */
var EXPRESSION_SUFFIX_EED = /^(.+?)eed$/;

/* Ends in a single `s` (the character before it is not `s`). */
var EXPRESSION_SUFFIX_S = /^.+?[^s]s$/;

/* Ends in `sses` or `ies`. */
var EXPRESSION_SUFFIX_SSES_OR_IES = /^.+?(ss|i)es$/;

/*
 * Ends in a doubled character that is not a vowel, `y`, `l`, `s`,
 * or `z`.
 */

var EXPRESSION_SUFFIX_MULTI_CONSONANT_LIKE = /([^aeiouylsz])\1$/;

/* Group 1 is the stem, group 2 one of the step-2 suffixes. */
var EXPRESSION_STEP_2 = new RegExp(
  '^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|' +
  'ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|' +
  'biliti|logi)$'
);

/* Group 1 is the stem, group 2 one of the step-3 suffixes. */
var EXPRESSION_STEP_3 = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;

/* Group 1 is the stem, group 2 one of the step-4 suffixes. */
var EXPRESSION_STEP_4 = new RegExp(
  '^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|' +
  'iti|ous|ive|ize)$'
);
/*
 * Detect the character code for `y`.
 */

var CHARACTER_CODE_Y = 'y'.charCodeAt(0);
/**
 * Stem `value`.
 *
 * The input is coerced to a string and lower-cased, then passed through
 * the suffix-stripping steps below in order. Inputs shorter than three
 * characters are returned lower-cased without further processing.
 *
 * @param {string} value - Word to stem.
 * @return {string} - Stem corresponding to `value`.
 */
function stemmer(value) {
  var firstCharacterWasLowerCaseY = false;
  var match;

  value = String(value).toLowerCase();

  /*
   * Exit early.
   */

  if (value.length < 3) {
    return value;
  }

  /*
   * Detect initial `y`, make sure it never matches: it is swapped for
   * `Y`, which the lower-case vowel character classes do not contain,
   * and swapped back just before returning.
   */

  if (value.charCodeAt(0) === CHARACTER_CODE_Y) {
    firstCharacterWasLowerCaseY = true;
    value = 'Y' + value.slice(1);
  }

  /*
   * Step 1a: `sses` becomes `ss` and `ies` becomes `i`; otherwise a
   * single trailing `s` is dropped.
   */

  if (EXPRESSION_SUFFIX_SSES_OR_IES.test(value)) {
    /*
     * Remove last two characters.
     */

    value = value.slice(0, -2);
  } else if (EXPRESSION_SUFFIX_S.test(value)) {
    /*
     * Remove last character.
     */

    value = value.slice(0, -1);
  }

  /*
   * Step 1b: `eed` becomes `ee` when the part before it has a measure
   * greater than zero. Words that do not end in `eed` lose `ed` / `ing`
   * when the remaining stem contains a vowel, after which the stem's
   * ending is tidied up.
   */

  if ((match = EXPRESSION_SUFFIX_EED.exec(value))) {
    if (EXPRESSION_MEASURE_GREATER_THAN_0.test(match[1])) {
      /*
       * Remove last character.
       */

      value = value.slice(0, -1);
    }
  } else if (
    (match = EXPRESSION_SUFFIX_ED_OR_ING.exec(value)) &&
    EXPRESSION_VOWEL_IN_STEM.test(match[1])
  ) {
    value = match[1];

    if (EXPRESSION_SUFFIX_AT_OR_BL_OR_IZ.test(value)) {
      /*
       * Append `e`.
       */

      value += 'e';
    } else if (EXPRESSION_SUFFIX_MULTI_CONSONANT_LIKE.test(value)) {
      /*
       * Remove last character (undo the doubled final letter).
       */

      value = value.slice(0, -1);
    } else if (EXPRESSION_CONSONANT_LIKE.test(value)) {
      /*
       * Append `e`.
       */

      value += 'e';
    }
  }

  /*
   * Step 1c: a trailing `y` becomes `i` when the part before it
   * contains a vowel.
   */

  if (
    (match = EXPRESSION_SUFFIX_Y.exec(value)) &&
    EXPRESSION_VOWEL_IN_STEM.test(match[1])
  ) {
    /*
     * Remove suffixing `y` and append `i`.
     */

    value = match[1] + 'i';
  }

  /*
   * Step 2: replace a suffix using `step2list` when the stem before it
   * has a measure greater than zero.
   */

  if (
    (match = EXPRESSION_STEP_2.exec(value)) &&
    EXPRESSION_MEASURE_GREATER_THAN_0.test(match[1])
  ) {
    value = match[1] + step2list[match[2]];
  }

  /*
   * Step 3: replace a suffix using `step3list` when the stem before it
   * has a measure greater than zero.
   */

  if (
    (match = EXPRESSION_STEP_3.exec(value)) &&
    EXPRESSION_MEASURE_GREATER_THAN_0.test(match[1])
  ) {
    value = match[1] + step3list[match[2]];
  }

  /*
   * Step 4: drop a step-4 suffix when the stem before it has a measure
   * greater than one. The `ion` rule is only considered when no step-4
   * suffix matched at all.
   */

  if ((match = EXPRESSION_STEP_4.exec(value))) {
    if (EXPRESSION_MEASURE_GREATER_THAN_1.test(match[1])) {
      value = match[1];
    }
  } else if (
    (match = EXPRESSION_SUFFIX_ION.exec(value)) &&
    EXPRESSION_MEASURE_GREATER_THAN_1.test(match[1])
  ) {
    value = match[1];
  }

  /*
   * Step 5: drop a trailing `e` when the stem's measure is greater than
   * one, or is exactly one and the stem is not consonant-like; then
   * reduce a trailing `ll` to `l` when the measure is greater than one.
   */

  if (
    (match = EXPRESSION_SUFFIX_E.exec(value)) &&
    (
      EXPRESSION_MEASURE_GREATER_THAN_1.test(match[1]) ||
      (
        EXPRESSION_MEASURE_EQUAL_TO_1.test(match[1]) &&
        !EXPRESSION_CONSONANT_LIKE.test(match[1])
      )
    )
  ) {
    value = match[1];
  }

  if (
    EXPRESSION_SUFFIX_LL.test(value) &&
    EXPRESSION_MEASURE_GREATER_THAN_1.test(value)
  ) {
    value = value.slice(0, -1);
  }

  /*
   * Turn initial `Y` back to `y`.
   */

  if (firstCharacterWasLowerCaseY) {
    value = 'y' + value.slice(1);
  }

  return value;
}
/*
 * Expose `stemmer`.
 */

module.exports = stemmer;