1C-Bitrix 25.700.0
Загрузка...
Поиск...
Не найдено
stemming.php
См. документацию.
1<?php
// Suffix tables used by stemming_ru(). All endings are upper-case with Ё already folded to Е.
// Each table is declared global so that stemming_ru(), which reads them through `global`,
// still finds them when this file is included from inside a function.

global $STEMMING_RU_VOWELS;
$STEMMING_RU_VOWELS = 'АЕИОУЫЭЮЯ';

global $STEMMING_RU_PERFECTIVE_GERUND;
$STEMMING_RU_PERFECTIVE_GERUND = '/(ЫВШИСЬ|ИВШИСЬ|ЯВШИСЬ|АВШИСЬ|ЫВШИ|ИВШИ|ЯВШИ|АВШИ|ЫВ|ИВ|ЯВ|АВ)$/u';

// Adjective endings: ending => length of the ending in characters.
global $STEMMING_RU_ADJECTIVE;
$STEMMING_RU_ADJECTIVE = ['ЕЕ' => 2, 'ИЕ' => 2, 'ЫЕ' => 2, 'ОЕ' => 2, 'ИМИ' => 3, 'ЫМИ' => 3, 'ЕЙ' => 2, 'ИЙ' => 2, 'ЫЙ' => 2, 'ОЙ' => 2, 'ЕМ' => 2, 'ИМ' => 2, 'ЫМ' => 2, 'ОМ' => 2, 'ЕГО' => 3, 'ОГО' => 3, 'ЕМУ' => 3, 'ОМУ' => 3, 'ИХ' => 2, 'ЫХ' => 2, 'УЮ' => 2, 'ЮЮ' => 2, 'АЯ' => 2, 'ЯЯ' => 2, 'ОЮ' => 2, 'ЕЮ' => 2];

// Participle endings. Group 1 is only stripped when preceded by А or Я (see the
// ADJECTIVAL1 pattern below); group 2 is stripped unconditionally.
global $STEMMING_RU_PARTICIPLE_GR1;
$STEMMING_RU_PARTICIPLE_GR1 = ['ЕМ' => 2, 'НН' => 2, 'ВШ' => 2, 'ЮЩ' => 2, 'Щ' => 1];
global $STEMMING_RU_PARTICIPLE_GR2;
$STEMMING_RU_PARTICIPLE_GR2 = ['ИВШ' => 3, 'ЫВШ' => 3, 'УЮЩ' => 3];

// ADJECTIVAL endings are every "participle + adjective" combination, keyed by the
// combined ending with the combined length as value.
global $STEMMING_RU_ADJECTIVAL_GR1;
$STEMMING_RU_ADJECTIVAL_GR1 = [];
global $STEMMING_RU_ADJECTIVAL_GR2;
$STEMMING_RU_ADJECTIVAL_GR2 = [];
foreach ($STEMMING_RU_ADJECTIVE as $i => $il)
{
	foreach ($STEMMING_RU_PARTICIPLE_GR1 as $j => $jl)
	{
		$STEMMING_RU_ADJECTIVAL_GR1[$j . $i] = $jl + $il;
	}
	foreach ($STEMMING_RU_PARTICIPLE_GR2 as $j => $jl)
	{
		$STEMMING_RU_ADJECTIVAL_GR2[$j . $i] = $jl + $il;
	}
}

// Group 1: capture 1 is the А/Я that stays in the stem, capture 2 is the ending to remove.
global $STEMMING_RU_ADJECTIVAL1;
$STEMMING_RU_ADJECTIVAL1 = '/([АЯ])(' . implode('|', array_keys($STEMMING_RU_ADJECTIVAL_GR1)) . ')$/u';

// A bare adjective ending (no participle in front of it) is an ADJECTIVAL ending too.
// Without these entries ordinary adjectives would never be stripped by ADJECTIVAL2.
foreach ($STEMMING_RU_ADJECTIVE as $i => $il)
{
	$STEMMING_RU_ADJECTIVAL_GR2[$i] = $il;
}

// Group 2: the whole match is removed.
global $STEMMING_RU_ADJECTIVAL2;
$STEMMING_RU_ADJECTIVAL2 = '/(' . implode('|', array_keys($STEMMING_RU_ADJECTIVAL_GR2)) . ')$/u';

// Verb endings, group 1: only after А or Я, which is kept (capture 2 is removed).
global $STEMMING_RU_VERB1;
$STEMMING_RU_VERB1 = '/([АЯ])(ННО|ЕТЕ|ЙТЕ|ЕШЬ|ЛА|НА|ЛИ|ЕМ|ЛО|НО|ЕТ|ЮТ|НЫ|ТЬ|Й|Л|Н)$/u';

// Verb endings, group 2: removed unconditionally.
global $STEMMING_RU_VERB2;
$STEMMING_RU_VERB2 = '/(ЕЙТЕ|УЙТЕ|ИЛА|ЫЛА|ЕНА|ИТЕ|ИЛИ|ЫЛИ|ИЛО|ЫЛО|ЕНО|УЕТ|УЮТ|ЕНЫ|ИТЬ|ЫТЬ|ИШЬ|ЕЙ|УЙ|ИЛ|ЫЛ|ИМ|ЫМ|ЕН|ЯТ|ИТ|ЫТ|УЮ|Ю)$/u';

// Noun endings.
global $STEMMING_RU_NOUN;
$STEMMING_RU_NOUN = '/(ИЯМИ|ИЯХ|ИЕМ|ИЯМ|АМИ|ЯМИ|ЬЯ|ИЯ|ЬЮ|ИЮ|ЯХ|АХ|ОМ|АМ|ЕМ|ЯМ|ИЙ|ОЙ|ЕЙ|ИЕЙ|ИИ|ЕИ|ЬЕ|ИЕ|ОВ|ЕВ|Ю|Ь|Ы|У|О|Й|И|Е|Я|А)$/u';
42
/**
 * Returns every letter of the Russian alphabet, lower case first and then upper case,
 * in keyboard-layout order (ё/Ё included).
 *
 * NOTE(review): presumably consumed by the search tokenizer to decide which characters
 * belong to a word; confirm against the callers.
 *
 * @return string
 */
function stemming_letter_ru()
{
	return 'ёйцукенгшщзхъфывапролджэячсмитьбюЁЙЦУКЕНГШЩЗХЪФЫВАПРОЛДЖЭЯЧСМИТЬБЮ';
}
47
/**
 * Comparison callback that orders strings by character length, longest first.
 *
 * @param string $a
 * @param string $b
 * @return int 0 for equal lengths, 1 when $a is shorter than $b, -1 when $a is longer.
 */
function stemming_ru_sort($a, $b)
{
	// Operands are swapped on purpose: a longer string must sort earlier.
	return mb_strlen($b) <=> mb_strlen($a);
}
65
/**
 * Tells whether a token should be kept, i.e. it is long enough and is not a stop word.
 *
 * The built-in list holds upper-case entries, several of them truncated forms
 * (e.g. 'ДАЖ', 'ЕСЛ'), and the lookup is an exact, case-sensitive key match.
 * NOTE(review): this suggests the caller passes upper-cased, already stemmed tokens; confirm.
 *
 * Extra stop words can be supplied through the optional STEMMING_STOP_RU constant
 * as a comma-separated list; surrounding whitespace is trimmed and empty items are skipped.
 *
 * @param string $sWord Token to check.
 * @return bool false when the token is shorter than two characters or is a stop word, true otherwise.
 */
function stemming_stop_ru($sWord)
{
	// Built on the first call that gets past the length check, then reused.
	static $stop_list = false;

	if (mb_strlen($sWord) < 2)
	{
		return false;
	}

	if ($stop_list === false)
	{
		$stop_list = array_fill_keys(
			[
				'QUOTE', 'HTTP', 'WWW', 'RU', 'IMG', 'GIF', 'БЕЗ', 'БЫ', 'БЫЛ',
				'БЫТ', 'ВАМ', 'ВАШ', 'ВО', 'ВОТ', 'ВСЕ', 'ВЫ', 'ГДЕ', 'ДА',
				'ДАЖ', 'ДЛЯ', 'ДО', 'ЕГ', 'ЕСЛ', 'ЕСТ', 'ЕЩ', 'ЖЕ', 'ЗА',
				'ИЗ', 'ИЛИ', 'ИМ', 'ИХ', 'КАК', 'КОГД', 'КТО', 'ЛИ', 'ЛИБ',
				'МЕН', 'МНЕ', 'МО', 'МЫ', 'НА', 'НАД', 'НЕ', 'НЕТ', 'НИ',
				'НО', 'НУ', 'ОБ', 'ОН', 'ОТ', 'ОЧЕН', 'ПО', 'ПОД', 'ПРИ',
				'ПРО', 'САМ', 'СЕБ', 'СВО', 'ТАК', 'ТАМ', 'ТЕБ', 'ТО', 'ТОЖ',
				'ТОЛЬК', 'ТУТ', 'ТЫ', 'УЖ', 'ХОТ', 'ЧЕГ', 'ЧЕМ', 'ЧТО', 'ЧТОБ',
				'ЭТ', 'ЭТОТ',
			],
			0
		);

		if (defined('STEMMING_STOP_RU'))
		{
			$customWords = array_map('trim', explode(',', STEMMING_STOP_RU));
			foreach ($customWords as $customWord)
			{
				if ($customWord !== '')
				{
					$stop_list[$customWord] = 0;
				}
			}
		}
	}

	$isStopWord = array_key_exists($sWord, $stop_list);

	return !$isStopWord;
}
100
/**
 * Upper-cases text and folds the letter Ё into Е.
 *
 * @param string $sText
 * @return string Upper-cased text in which every Ё has been replaced by Е.
 */
function stemming_upper_ru($sText)
{
	$upperCased = mb_strtoupper($sText);

	// After upper-casing only the capital form can occur, so one replacement covers ё and Ё.
	return str_replace('Ё', 'Е', $upperCased);
}
105
/**
 * Reduces a Russian word to its stem by stripping suffixes.
 *
 * The inline comments follow the steps of the Snowball Russian stemming algorithm:
 * the word is split after its first vowel, and only the part after it (RV) is modified.
 * All suffix tables are upper-case, so the word must be upper-cased beforehand
 * (for example with stemming_upper_ru()); lower-case input matches no vowel and
 * is returned unchanged.
 *
 * @param string $word  Upper-case word to stem.
 * @param int    $flags Bit mask. Bit 1 turns on the surname heuristic: for words ending in
 *                      ОВ/ЕВ (optionally followed by А, У, ЫМ or Е) several candidate stems
 *                      are returned instead of one.
 * @return string|array The stem. With ($flags & 1) and a surname-like ending, a list of
 *                      candidate stems (each produced by a plain stemming_ru() call).
 */
function stemming_ru($word, $flags = 0)
{
	// Every table used below lives in the global scope. The gerund and adjectival
	// patterns must be imported as well, otherwise they are undefined here and the
	// corresponding preg_match() calls can never match.
	global $STEMMING_RU_VOWELS;
	global $STEMMING_RU_PERFECTIVE_GERUND;
	global $STEMMING_RU_ADJECTIVAL1;
	global $STEMMING_RU_ADJECTIVAL2;
	global $STEMMING_RU_VERB1;
	global $STEMMING_RU_VERB2;
	global $STEMMING_RU_NOUN;
	//There is a 33rd letter, ё (?), but it is rarely used, and we assume it is mapped into е (e).
	$word = str_replace('Ё', 'Е', $word);
	//Exceptions: words whose stem is fixed by hand instead of being derived by the rules below.
	static $STEMMING_RU_EX = [
		'БЕЗЕ' => 'БЕЗЕ',
		'БЫЛЬ' => 'БЫЛЬ',
		'МЕНЮ' => 'МЕНЮ',
		'ГРАНАТ' => 'ГРАНАТ',
		'ГРАНИТ' => 'ГРАНИТ',
		'ТЕРМИНАЛ' => 'ТЕРМИНАЛ',
		'ИЛИ' => 'ИЛИ',
		'РУКАВ' => 'РУКАВ',
		'ПРИЕМ' => 'ПРИЕМ',
		'ОХРАНА' => 'ОХРАН',
		'ОХРАНЫ' => 'ОХРАН',
		'ЗАЖИМ' => 'ЗАЖИМ',
	];
	if (isset($STEMMING_RU_EX[$word]))
	{
		return $STEMMING_RU_EX[$word];
	}

	//HERE IS AN ATTEMPT TO STEM RUSSIAN SECOND NAMES BEGINS
	//http://www.gramma.ru/SPR/?id=2.8
	if ($flags & 1)
	{
		if (preg_match('/(ОВ|ЕВ)$/', $word))
		{
			// Candidates: the word with "А" appended, the word itself, and the word without ОВ/ЕВ.
			return [
				stemming_ru($word . 'А'),
				stemming_ru($word),
				stemming_ru(mb_substr($word, 0, -2)),
			];
		}
		$found = [];
		if (preg_match('/(ОВ|ЕВ)(А|У|ЫМ|Е)$/', $word, $found))
		{
			// Candidates: the word itself and the word without the trailing А/У/ЫМ/Е.
			return [
				stemming_ru($word),
				stemming_ru(mb_substr($word, 0, -mb_strlen($found[2]))),
			];
		}
	}
	//HERE IS AN ATTEMPT TO STEM RUSSIAN SECOND NAMES ENDS

	//In any word, RV is the region after the first vowel, or the end of the word if it contains no vowel.
	//All tests take place in the RV part of the word.
	$found = [];
	if (preg_match('/^(.*?[' . $STEMMING_RU_VOWELS . '])(.+)$/u', $word, $found))
	{
		// From here on $word is the untouched prefix (up to and including the first vowel)
		// and $rv is the tail that the steps below shorten.
		$rv = $found[2];
		$word = $found[1];
	}
	else
	{
		// No vowel, or nothing after the first vowel: there is nothing to strip.
		return $word;
	}

	//Do each of steps 1, 2, 3 and 4.
	//Step 1: Search for a PERFECTIVE GERUND ending. If one is found remove it, and that is then the end of step 1.

	if (preg_match($STEMMING_RU_PERFECTIVE_GERUND, $rv, $found))
	{
		switch ($found[0])
		{
			case 'АВ':
			case 'АВШИ':
			case 'АВШИСЬ':
			case 'ЯВ':
			case 'ЯВШИ':
			case 'ЯВШИСЬ':
				// The leading А/Я belongs to the stem: drop everything after it.
				$rv = mb_substr($rv, 0, 1 - mb_strlen($found[0]));
				break;
			default:
				$rv = mb_substr($rv, 0, -mb_strlen($found[0]));
		}
	}
	//Otherwise try and remove a REFLEXIVE ending, and then search in turn for
	// (1) an ADJECTIVE,
	// (2) a VERB or (3)
	// a NOUN ending.
	// As soon as one of the endings (1) to (3) is found remove it, and terminate step 1.
	else
	{
		$rv = preg_replace('/(СЯ|СЬ)$/u', '', $rv);
		//ADJECTIVAL
		// For the "1" patterns capture 1 is the А/Я that stays and capture 2 is removed;
		// for the "2" patterns the whole match is removed.
		if (preg_match($STEMMING_RU_ADJECTIVAL1, $rv, $found))
		{
			$rv = mb_substr($rv, 0, -mb_strlen($found[2]));
		}
		elseif (preg_match($STEMMING_RU_ADJECTIVAL2, $rv, $found))
		{
			$rv = mb_substr($rv, 0, -mb_strlen($found[0]));
		}
		elseif (preg_match($STEMMING_RU_VERB1, $rv, $found))
		{
			$rv = mb_substr($rv, 0, -mb_strlen($found[2]));
		}
		elseif (preg_match($STEMMING_RU_VERB2, $rv, $found))
		{
			$rv = mb_substr($rv, 0, -mb_strlen($found[0]));
		}
		else
		{
			$rv = preg_replace($STEMMING_RU_NOUN, '', $rv);
		}
	}

	//Step 2: If the word ends with и (i), remove it.
	if (mb_substr($rv, -1) == 'И')
	{
		$rv = mb_substr($rv, 0, -1);
	}
	//Step 3: Search for a DERIVATIONAL ending in R2 (i.e. the entire ending must lie in R2), and if one is found, remove it.
	//R1 is the region after the first non-vowel following a vowel, or the end of the word if there is no such non-vowel.
	if (preg_match('/(ОСТЬ|ОСТ)$/u', $rv))
	{
		// $R1 and $R2 are character offsets into $rv (which starts right after the first vowel).
		$R1 = 0;
		$rv_len = mb_strlen($rv);
		// Skip any further vowels, then step over one non-vowel.
		while ( ($R1 < $rv_len) && (mb_strpos($STEMMING_RU_VOWELS, mb_substr($rv, $R1, 1)) !== false) )
		{
			$R1++;
		}
		if ($R1 < $rv_len)
		{
			$R1++;
		}
		//R2 is the region after the first non-vowel following a vowel in R1, or the end of the word if there is no such non-vowel.
		$R2 = $R1;
		// Skip non-vowels up to the next vowel, skip that run of vowels, then step over one non-vowel.
		while ( ($R2 < $rv_len) && (mb_strpos($STEMMING_RU_VOWELS, mb_substr($rv, $R2, 1)) === false) )
		{
			$R2++;
		}
		while ( ($R2 < $rv_len) && (mb_strpos($STEMMING_RU_VOWELS, mb_substr($rv, $R2, 1)) !== false) )
		{
			$R2++;
		}
		if ($R2 < $rv_len)
		{
			$R2++;
		}
		//"ОСТЬ", "ОСТ"
		// The ending is removed only when it fits entirely at or after offset $R2.
		if ((mb_substr($rv, -4) == 'ОСТЬ') && ($rv_len >= ($R2 + 4)))
		{
			$rv = mb_substr($rv, 0, $rv_len - 4);
		}
		elseif ((mb_substr($rv, -3) == 'ОСТ') && ($rv_len >= ($R2 + 3)))
		{
			$rv = mb_substr($rv, 0, $rv_len - 3);
		}
	}
	//Step 4: (1) Undouble н (n), or, (2) if the word ends with a SUPERLATIVE ending, remove it and undouble н (n), or (3) if the word ends ь (') (soft sign) remove it.
	$rv = preg_replace('/(ЕЙШЕ|ЕЙШ)$/u', '', $rv);
	$r = preg_replace('/НН$/u', 'Н', $rv);
	if ($r == $rv)
	{
		// No double Н was collapsed, so a trailing soft sign may be dropped instead.
		$rv = preg_replace('/Ь$/u', '', $rv);
	}
	else
	{
		$rv = $r;
	}

	return $word . $rv;
}
if( $daysToExpire >=0 &&$daysToExpire< 60 elseif)( $daysToExpire< 0)
Определения prolog_main_admin.php:393
$i
Определения factura.php:643
$STEMMING_RU_PARTICIPLE_GR1
Определения stemming.php:8
global $STEMMING_RU_NOUN
Определения stemming.php:40
global $STEMMING_RU_VERB1
Определения stemming.php:35
global $STEMMING_RU_PERFECTIVE_GERUND
Определения stemming.php:4
stemming_upper_ru($sText)
Определения stemming.php:101
global $STEMMING_RU_VOWELS
Определения stemming.php:2
stemming_ru($word, $flags=0)
Определения stemming.php:106
$STEMMING_RU_PARTICIPLE_GR2
Определения stemming.php:9
stemming_stop_ru($sWord)
Определения stemming.php:66
stemming_letter_ru()
Определения stemming.php:43
global $STEMMING_RU_VERB2
Определения stemming.php:38
global $STEMMING_RU_ADJECTIVAL2
Определения stemming.php:27
foreach($STEMMING_RU_ADJECTIVE as $i=> $il) global $STEMMING_RU_ADJECTIVAL1
Определения stemming.php:12
$STEMMING_RU_ADJECTIVE
Определения stemming.php:7
$STEMMING_RU_ADJECTIVAL_GR1
Определения stemming.php:10
$STEMMING_RU_ADJECTIVAL_GR2
Определения stemming.php:11
stemming_ru_sort($a, $b)
Определения stemming.php:48
else $a
Определения template.php:137