1C-Bitrix 25.700.0
Загрузка...
Поиск...
Не найдено
stemming.php
См. документацию.
1<?php
4 'TIONAL' => 'TION', 'ENCI' => 'ENCE', 'ANCI' => 'ANCE', 'ABLI' => 'ABLE', 'ENTLI' => 'ENT',
5 'IZER' => 'IZE', 'IZATION' => 'IZE', 'ATIONAL' => 'ATE', 'ATION' => 'ATE', 'ATOR' => 'ATE',
6 'ALISM' => 'AL', 'ALITI' => 'AL', 'ALLI' => 'AL', 'FULNESS' => 'FUL', 'OUSLI' => 'OUS',
7 'OUSNESS' => 'OUS', 'IVENESS' => 'IVE', 'IVITI' => 'IVE', 'BILITI' => 'BLE', 'BLI' => 'BLE',
8 'FULLI' => 'FUL', 'LESSLI' => 'LESS'
9];
11$STEMMING_EN_STEP2 = '/(' . implode('|', array_keys($STEMMING_EN_STEP2A)) . '|OGI|LI)$/';
14 'TIONAL' => 'TION', 'ATIONAL' => 'ATE', 'ALIZE' => 'AL', 'ICATE' => 'IC', 'ICITI' => 'IC',
15 'ICAL' => 'IC', 'FUL' => '', 'NESS' => ''
16];
18$STEMMING_EN_STEP3 = '/(' . implode('|', array_keys($STEMMING_EN_STEP3A)) . '|ATIVE)$/';
21 'AL', 'ANCE', 'ENCE', 'ER', 'IC',
22 'ABLE', 'IBLE', 'ANT', 'EMENT', 'MENT',
23 'ENT', 'ISM', 'ATE', 'ITI', 'OUS',
24 'IVE', 'IZE'
25];
27$STEMMING_EN_STEP4 = '/(' . implode('|', $STEMMING_EN_STEP4A) . '|ION)$/';
30 'SKIS' => 'SKI',
31 'SKIES' => 'SKY',
32 'DYING' => 'DIE',
33 'LYING' => 'LIE',
34 'TYING' => 'TIE',
35 'IDLY' => 'IDL',
36 'GENTLY' => 'GENTL',
37 'UGLY' => 'UGLI',
38 'EARLY' => 'EARLI',
39 'ONLY' => 'ONLI',
40 'SINGLY' => 'SINGL',
41 'SKY' => 'SKY',
42 'NEWS' => 'NEWS',
43 'HOWE' => 'HOWE',
44 'ATLAS' => 'ATLAS',
45 'COSMOS' => 'COSMOS',
46 'BIAS' => 'BIAS',
47 'ANDES' => 'ANDES',
48];
51 'INNING' => 1,
52 'OUTING' => 1,
53 'CANNING' => 1,
54 'HERRING' => 1,
55 'EARRING' => 1,
56 'PROCEED' => 1,
57 'EXCEED' => 1,
58 'SUCCEED' => 1,
59];
60
/**
 * Returns the alphabet of the English stemmer as a single string.
 *
 * The string holds the 26 lower-case ASCII Latin letters followed by the
 * 26 upper-case ones, both in QWERTY keyboard order.
 * NOTE(review): presumably the caller uses this string to decide which
 * characters belong to a word — confirm against the caller.
 *
 * @return string The 52 letters.
 */
function stemming_letter_en()
{
    return 'qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM';
}
65
/**
 * Tells whether a word should be kept, i.e. is NOT an English stop word.
 *
 * Note the polarity: the function returns false for words that must be
 * dropped and true for words that may be processed further.
 *
 * A word is dropped when it is shorter than two characters or when it is a
 * key of the stop list. The lookup is an exact, case-sensitive key match and
 * every built-in entry is upper-case.
 *
 * The stop list is built once per process and cached in a static variable.
 * If the STEMMING_STOP_EN constant is defined, it is read as a comma-separated
 * list; each non-empty item is trimmed and added verbatim (no case folding).
 *
 * @param string $sWord Word to check.
 *
 * @return bool false if the word is too short or is a stop word, true otherwise.
 */
function stemming_stop_en($sWord)
{
    if (mb_strlen($sWord) < 2)
    {
        return false;
    }

    // null means "not built yet"; the list itself is never empty.
    static $stop_list = null;
    if ($stop_list === null)
    {
        $stop_list = [
            'QUOTE' => 0, 'HTTP' => 0, 'WWW' => 0, 'RU' => 0, 'IMG' => 0, 'GIF' => 0, 'A' => 0, 'THE' => 0, 'IS' => 0,
            'ARE' => 0, 'OFF' => 0, 'ON' => 0, 'AND' => 0, 'IN' => 0, 'FOR' => 0, 'OF' => 0, 'BY' => 0, 'WITH' => 0,
            'BE' => 0, 'WAS' => 0, 'IT' => 0,
        ];

        // Optional site-specific additions.
        if (defined('STEMMING_STOP_EN'))
        {
            foreach (explode(',', STEMMING_STOP_EN) as $extra)
            {
                $extra = trim($extra);
                if ($extra !== '')
                {
                    $stop_list[$extra] = 0;
                }
            }
        }
    }

    return !array_key_exists($sWord, $stop_list);
}
94
/**
 * Converts text to upper case with the multibyte-aware mb_strtoupper().
 *
 * @param string $sText Text to convert.
 *
 * @return string Upper-cased text.
 */
function stemming_upper_en($sText)
{
    return mb_strtoupper($sText);
}
99
/**
 * Reduces an upper-case English word to its stem.
 *
 * The steps follow the layout of the Porter2 (Snowball "english") stemmer:
 * exception lookup, y-marking, computation of regions R1/R2, then steps
 * 1a, 1b, 1c, 2, 3, 4 and 5. All suffix patterns are upper-case, so the
 * input is expected to be upper-cased already; a lower-case "y" is used
 * internally to mark a Y that acts as a consonant and is turned back into
 * "Y" before returning.
 *
 * Reads these file-level globals:
 *  - $STEMMING_EN_EX1    map: whole word => ready stem (checked first);
 *  - $STEMMING_EN_EX2    set: words returned unchanged after step 1a;
 *  - $STEMMING_EN_STEP2A map: step 2 suffix => replacement;
 *  - $STEMMING_EN_STEP2  regex matching any step 2 suffix;
 *  - $STEMMING_EN_STEP3A map: step 3 suffix => replacement;
 *  - $STEMMING_EN_STEP3  regex matching any step 3 suffix;
 *  - $STEMMING_EN_STEP4  regex matching any step 4 suffix.
 *
 * @param string $word Upper-case word.
 *
 * @return string The stem; words of two characters or fewer are returned as is.
 */
function stemming_en($word)
{
    global $STEMMING_EN_STEP2A;
    global $STEMMING_EN_STEP2;
    global $STEMMING_EN_STEP3A;
    global $STEMMING_EN_STEP3;
    global $STEMMING_EN_STEP4;
    global $STEMMING_EN_EX1;
    global $STEMMING_EN_EX2;

    // If the word has two letters or less, leave it as it is.
    $word_len = mb_strlen($word);
    if ($word_len <= 2)
    {
        return $word;
    }

    // Whole-word exceptions with a predefined stem.
    if (array_key_exists($word, $STEMMING_EN_EX1))
    {
        return $STEMMING_EN_EX1[$word];
    }

    // Mark an initial Y, or a Y after a vowel, as a consonant by lower-casing it.
    // The length of the word does not change, so $word_len stays valid through step 1a.
    $vowels = 'AEIOUY';
    $word = preg_replace('/^Y/', 'y', $word);
    $word = preg_replace('/([' . $vowels . '])(Y)/', "\\1y", $word);

    // R1 is the region after the first non-vowel following a vowel,
    // or the end of the word if there is no such non-vowel.
    $R1 = 0;
    while (($R1 < $word_len) && (mb_strpos($vowels, mb_substr($word, $R1, 1)) === false))
    {
        $R1++;
    }
    while (($R1 < $word_len) && (mb_strpos($vowels, mb_substr($word, $R1, 1)) !== false))
    {
        $R1++;
    }
    if ($R1 < $word_len)
    {
        $R1++;
    }

    // Prefixes with a fixed R1 start.
    if (preg_match('/^COMMUN/', $word))
    {
        $R1 = 6;
    }
    if (preg_match('/^GENER/', $word))
    {
        $R1 = 5;
    }

    // R2 is computed by the same rule, starting from R1.
    $R2 = $R1;
    while (($R2 < $word_len) && (mb_strpos($vowels, mb_substr($word, $R2, 1)) === false))
    {
        $R2++;
    }
    while (($R2 < $word_len) && (mb_strpos($vowels, mb_substr($word, $R2, 1)) !== false))
    {
        $R2++;
    }
    if ($R2 < $word_len)
    {
        $R2++;
    }

    // Step 1a:
    // the pattern is anchored at the end, so the leftmost match is the longest suffix.
    $found = [];
    if (preg_match('/(SSES|IED|IES|US|SS|S)$/', $word, $found))
    {
        switch ($found[0])
        {
            // sses - replace by ss
            case 'SSES':
                $word = mb_substr($word, 0, $word_len - 4) . 'SS';
                break;

            // ied, ies - replace by i if preceded by more than one letter,
            // otherwise by ie (so ties -> tie, cries -> cri)
            case 'IED':
            case 'IES':
                if (mb_strlen($word) > 4)
                {
                    $word = mb_substr($word, 0, $word_len - 3) . 'I';
                }
                else
                {
                    $word = mb_substr($word, 0, $word_len - 3) . 'IE';
                }
                break;

            // s - delete if the preceding word part contains a vowel not immediately
            // before the s (so gas and this retain the s, gaps and kiwis lose it)
            case 'S':
                if (preg_match('/([' . $vowels . '].*.)(S)$/', $word))
                {
                    $word = mb_substr($word, 0, $word_len - 1);
                }
                break;

            // us, ss - do nothing
        }
    }

    // Words that must not be stemmed any further after step 1a.
    if (array_key_exists($word, $STEMMING_EN_EX2))
    {
        return $word;
    }

    // Step 1b:
    if (preg_match('/(EEDLY|INGLY|EDLY|EED|ING|ED)$/', $word, $found))
    {
        switch ($found[0])
        {
            // eed, eedly - replace by ee if in R1
            case 'EEDLY':
            case 'EED':
                if (preg_match('/' . $found[0] . '$/', mb_substr($word, $R1)))
                {
                    $word = mb_substr($word, 0, mb_strlen($word) - mb_strlen($found[0])) . 'EE';
                }
                break;

            // ed, edly, ing, ingly
            default:
                // delete if the preceding word part contains a vowel, and then
                $step1b = preg_replace('/([' . $vowels . '].*)(ED|EDLY|ING|INGLY)$/', "\\1", $word);
                if ($step1b !== $word)
                {
                    // if the word ends at, bl or iz add e (so luxuriat -> luxuriate), or
                    $step1b1 = preg_replace('/(AT|BL|IZ)$/', "\\1E", $step1b);
                    if ($step1b1 === $step1b)
                    {
                        // if the word ends with a double remove the last letter (so hopp -> hop), or
                        if (preg_match('/(BB|DD|FF|GG|MM|NN|PP|RR|TT)$/', $step1b))
                        {
                            $step1b1 = mb_substr($step1b, 0, mb_strlen($step1b) - 1);
                        }
                        // if the word is short, add e (so hop -> hope).
                        // Short here means: consonants + vowel + a non-vowel other than W, X
                        // or marked y; or a vowel at the start followed by one non-vowel.
                        elseif (
                            preg_match('/^[^' . $vowels . ']+[' . $vowels . '][^WXy' . $vowels . ']$/', $step1b)
                            || preg_match('/^[' . $vowels . '][^' . $vowels . ']$/', $step1b)
                        )
                        {
                            $step1b1 = $step1b . 'E';
                        }
                    }
                    $step1b = $step1b1;
                }
                $word = $step1b;
        }
    }

    // Step 1c:
    // replace suffix y or Y by i if preceded by a non-vowel which is not the first
    // letter of the word (so cry -> cri, by -> by, say -> say)
    $word = preg_replace('/^(.+[^' . $vowels . '])([yY])$/', "\\1I", $word);

    // Step 2:
    // find the longest listed suffix and, if it lies in R1, perform its action.
    if (
        preg_match($STEMMING_EN_STEP2, $word, $found)
        && preg_match('/' . $found[0] . '$/', mb_substr($word, $R1))
    )
    {
        switch ($found[0])
        {
            // ogi - replace by og if preceded by l
            case 'OGI':
                if (preg_match('/LOGI$/', $word))
                {
                    $word = mb_substr($word, 0, mb_strlen($word) - 3) . 'OG';
                }
                break;

            // li - delete if preceded by one of c d e g h k m n r t
            case 'LI':
                if (preg_match('/[CDEGHKMNRT]LI$/', $word))
                {
                    $word = mb_substr($word, 0, mb_strlen($word) - 2);
                }
                break;

            // every other suffix has a fixed replacement in the table
            default:
                $word = mb_substr($word, 0, mb_strlen($word) - mb_strlen($found[0])) . $STEMMING_EN_STEP2A[$found[0]];
        }
    }

    // Step 3:
    // find the longest listed suffix and, if it lies in R1, perform its action.
    if (
        preg_match($STEMMING_EN_STEP3, $word, $found)
        && preg_match('/' . $found[0] . '$/', mb_substr($word, $R1))
    )
    {
        switch ($found[0])
        {
            // ative - delete only if it also lies in R2
            case 'ATIVE':
                if (preg_match('/ATIVE$/', mb_substr($word, $R2)))
                {
                    $word = mb_substr($word, 0, mb_strlen($word) - 5);
                }
                break;

            default:
                $word = mb_substr($word, 0, mb_strlen($word) - mb_strlen($found[0])) . $STEMMING_EN_STEP3A[$found[0]];
        }
    }

    // Step 4:
    // find the longest listed suffix and, if it lies in R2, delete it.
    if (
        preg_match($STEMMING_EN_STEP4, $word, $found)
        && preg_match('/' . $found[0] . '$/', mb_substr($word, $R2))
    )
    {
        switch ($found[0])
        {
            // ion - delete only if preceded by s or t
            case 'ION':
                if (preg_match('/[ST]ION$/', $word))
                {
                    $word = mb_substr($word, 0, mb_strlen($word) - mb_strlen($found[0]));
                }
                break;

            default:
                $word = mb_substr($word, 0, mb_strlen($word) - mb_strlen($found[0]));
        }
    }

    // Step 5:
    // delete a final e if it is in R2, or in R1 and not preceded by a short syllable;
    // delete a final l if it is in R2 and preceded by l.
    if (
        preg_match('/E$/', mb_substr($word, $R2))
        || (
            preg_match('/E$/', mb_substr($word, $R1))
            // A short syllable is either (a) a vowel followed by a non-vowel other than
            // W, X or marked y and preceded by a non-vowel, or (b) a vowel at the
            // beginning of the word followed by a non-vowel. The trailing "." is the e.
            && !(
                preg_match('/[^' . $vowels . '][' . $vowels . '][^WXy' . $vowels . '].$/', $word)
                || preg_match('/^[' . $vowels . '][^' . $vowels . '].$/', $word)
            )
        )
    )
    {
        $word = mb_substr($word, 0, mb_strlen($word) - 1);
    }
    elseif (preg_match('/L$/', mb_substr($word, $R2)) && preg_match('/LL$/', $word))
    {
        $word = mb_substr($word, 0, mb_strlen($word) - 1);
    }

    // Undo the consonant-y marking.
    return str_replace('y', 'Y', $word);
}
global $STEMMING_EN_STEP3
Определения stemming.php:17
global $STEMMING_EN_STEP3A
Определения stemming.php:12
stemming_letter_en()
Определения stemming.php:61
stemming_upper_en($sText)
Определения stemming.php:95
global $STEMMING_EN_EX2
Определения stemming.php:49
global $STEMMING_EN_STEP2
Определения stemming.php:10
global $STEMMING_EN_STEP4
Определения stemming.php:26
stemming_en($word)
Определения stemming.php:100
global $STEMMING_EN_STEP4A
Определения stemming.php:19
global $STEMMING_EN_EX1
Определения stemming.php:28
global $STEMMING_EN_STEP2A
Определения stemming.php:2
stemming_stop_en($sWord)
Определения stemming.php:66
if( $daysToExpire >=0 &&$daysToExpire< 60 elseif)( $daysToExpire< 0)
Определения prolog_main_admin.php:393