1C-Bitrix 25.700.0
Загрузка...
Поиск...
Не найдено
stemming.php
См. документацию.
1<?php
2
4{
5 return 'qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNMÄÖÜäöüß';
6}
7
/**
 * Tells whether a word should be kept for indexing, i.e. it is NOT a German stop word.
 *
 * The lookup is case-insensitive and umlaut-insensitive: both the stop list and the
 * probed word are upper-cased and have Ä/Ö/Ü folded to A/O/U before comparison.
 * Previously the German entries were stored in lower case and compared verbatim, so an
 * upper-cased word such as 'ABER' or 'FUR' was never recognised as a stop word.
 *
 * Additional stop words may be supplied through the STEMMING_STOP_DE constant as a
 * comma-separated list; they are folded the same way.
 *
 * @param string $sWord Word to test.
 * @return bool False when the word is shorter than 2 characters or is a stop word, true otherwise.
 */
function stemming_stop_de($sWord)
{
	if (mb_strlen($sWord) < 2)
	{
		return false;
	}

	// Same folding as stemming_upper_de(): upper case, then strip the umlaut accents.
	$normalize = static function ($text)
	{
		return str_replace(['Ä', 'Ö', 'Ü'], ['A', 'O', 'U'], mb_strtoupper($text));
	};

	// Built once per request and reused on subsequent calls.
	static $stop_list = null;
	if ($stop_list === null)
	{
		$builtin = [
			'QUOTE' => 0, 'HTTP' => 0, 'WWW' => 0, 'RU' => 0, 'IMG' => 0, 'GIF' => 0, 'aber' => 0,
			'alle' => 0, 'allem' => 0, 'allen' => 0, 'aller' => 0, 'alles' => 0,
			'als' => 0, 'also' => 0, 'am' => 0, 'an' => 0,
			'ander' => 0, 'andere' => 0, 'anderem' => 0, 'anderen' => 0, 'anderer' => 0,
			'anderes' => 0, 'anderm' => 0, 'andern' => 0, 'anderr' => 0, 'anders' => 0,
			'auch' => 0, 'auf' => 0, 'aus' => 0, 'bei' => 0, 'bin' => 0,
			'bis' => 0, 'bist' => 0, 'da' => 0, 'damit' => 0, 'dann' => 0,
			'der' => 0, 'den' => 0, 'des' => 0, 'dem' => 0, 'die' => 0, 'das' => 0,
			'daß' => 0,
			'derselbe' => 0, 'derselben' => 0, 'denselben' => 0, 'desselben' => 0, 'demselben' => 0,
			'dieselbe' => 0, 'dieselben' => 0, 'dasselbe' => 0,
			'dazu' => 0,
			'dein' => 0, 'deine' => 0, 'deinem' => 0, 'deinen' => 0, 'deiner' => 0, 'deines' => 0,
			'denn' => 0,
			'derer' => 0, 'dessen' => 0,
			'dich' => 0, 'dir' => 0, 'du' => 0,
			'dies' => 0, 'diese' => 0, 'diesem' => 0, 'diesen' => 0, 'dieser' => 0, 'dieses' => 0,
			'doch' => 0, 'dort' => 0,
			'durch' => 0,
			'ein' => 0, 'eine' => 0, 'einem' => 0, 'einen' => 0, 'einer' => 0, 'eines' => 0,
			'einig' => 0, 'einige' => 0, 'einigem' => 0, 'einigen' => 0, 'einiger' => 0, 'einiges' => 0,
			'einmal' => 0,
			'er' => 0, 'ihn' => 0, 'ihm' => 0,
			'es' => 0, 'etwas' => 0,
			'euer' => 0, 'eure' => 0, 'eurem' => 0, 'euren' => 0, 'eurer' => 0, 'eures' => 0,
			'für' => 0, 'gegen' => 0, 'gewesen' => 0, 'hab' => 0, 'habe' => 0,
			'haben' => 0, 'hat' => 0, 'hatte' => 0, 'hatten' => 0, 'hier' => 0,
			'hin' => 0, 'hinter' => 0,
			'ich' => 0, 'mich' => 0, 'mir' => 0,
			'ihr' => 0, 'ihre' => 0, 'ihrem' => 0, 'ihren' => 0, 'ihrer' => 0, 'ihres' => 0, 'euch' => 0,
			'im' => 0, 'in' => 0, 'indem' => 0, 'ins' => 0, 'ist' => 0,
			'jede' => 0, 'jedem' => 0, 'jeden' => 0, 'jeder' => 0, 'jedes' => 0,
			'jene' => 0, 'jenem' => 0, 'jenen' => 0, 'jener' => 0, 'jenes' => 0,
			'jetzt' => 0, 'kann' => 0,
			'kein' => 0, 'keine' => 0, 'keinem' => 0, 'keinen' => 0, 'keiner' => 0, 'keines' => 0,
			'können' => 0, 'könnte' => 0, 'machen' => 0, 'man' => 0,
			'manche' => 0, 'manchem' => 0, 'manchen' => 0, 'mancher' => 0, 'manches' => 0,
			'mein' => 0, 'meine' => 0, 'meinem' => 0, 'meinen' => 0, 'meiner' => 0, 'meines' => 0,
			'mit' => 0, 'muss' => 0, 'musste' => 0, 'nach' => 0, 'nicht' => 0,
			'nichts' => 0, 'noch' => 0, 'nun' => 0, 'nur' => 0, 'ob' => 0,
			'oder' => 0, 'ohne' => 0, 'sehr' => 0,
			'sein' => 0, 'seine' => 0, 'seinem' => 0, 'seinen' => 0, 'seiner' => 0, 'seines' => 0,
			'selbst' => 0, 'sich' => 0,
			'sie' => 0, 'ihnen' => 0,
			'sind' => 0, 'so' => 0,
			'solche' => 0, 'solchem' => 0, 'solchen' => 0, 'solcher' => 0, 'solches' => 0,
			'soll' => 0, 'sollte' => 0, 'sondern' => 0, 'sonst' => 0, 'über' => 0, 'um' => 0, 'und' => 0,
			'uns' => 0, 'unse' => 0, 'unsem' => 0, 'unsen' => 0, 'unser' => 0, 'unses' => 0,
			'unter' => 0, 'viel' => 0, 'vom' => 0, 'von' => 0, 'vor' => 0,
			'während' => 0, 'war' => 0, 'waren' => 0, 'warst' => 0, 'was' => 0,
			'weg' => 0, 'weil' => 0, 'weiter' => 0,
			'welche' => 0, 'welchem' => 0, 'welchen' => 0, 'welcher' => 0, 'welches' => 0,
			'wenn' => 0, 'werde' => 0, 'werden' => 0, 'wie' => 0, 'wieder' => 0,
			'will' => 0, 'wir' => 0, 'wird' => 0, 'wirst' => 0, 'wo' => 0,
			'wollen' => 0, 'wollte' => 0, 'würde' => 0, 'würden' => 0, 'zu' => 0,
			'zum' => 0, 'zur' => 0, 'zwar' => 0, 'zwischen' => 0,
		];

		// Re-key the list in folded form so lookups are independent of case and umlauts.
		$stop_list = [];
		foreach (array_keys($builtin) as $entry)
		{
			$stop_list[$normalize((string)$entry)] = 0;
		}

		// Optional project-specific additions: comma-separated words in STEMMING_STOP_DE.
		if (defined('STEMMING_STOP_DE'))
		{
			foreach (explode(',', STEMMING_STOP_DE) as $word)
			{
				$word = trim($word);
				if ($word !== '')
				{
					$stop_list[$normalize($word)] = 0;
				}
			}
		}
	}

	return !array_key_exists($normalize($sWord), $stop_list);
}
89
/**
 * Reduces an upper-case German word to its stem.
 *
 * The steps mirror the published Snowball German stemming algorithm, transposed to
 * upper case: prelude (ß -> SS, mark U/Y between vowels), computation of the R1/R2
 * regions, three suffix-stripping steps, and a postlude that restores U/Y and drops
 * the umlaut accents.
 *
 * Patterns that look at multibyte text use the PCRE "u" modifier so that "letters"
 * are counted as characters, not bytes (e.g. the "at least 3 letters" rule of step 2b).
 *
 * @param string $word Upper-case word; may still contain Ä, Ö, Ü or ß.
 * @return string The stem, upper case, without umlaut accents.
 */
function stemming_de($word)
{
	$vowels = 'AEIOUYÄÖÜ';

	//First, replace ß by ss
	$word = str_replace('ß', 'SS', $word); //Actually ß in uppercase is already SS

	//Put u and y between vowels into lower case, so they are treated as consonants below.
	foreach (['U' => 'u', 'Y' => 'y'] as $upper => $lower)
	{
		$marked = preg_replace(
			'/([' . $vowels . '])' . $upper . '([' . $vowels . '])/u',
			'${1}' . $lower . '${2}',
			$word
		);
		//preg_replace yields null on malformed UTF-8; keep the word untouched in that case.
		if ($marked !== null)
		{
			$word = $marked;
		}
	}
	$word_len = mb_strlen($word);

	//In any word, R1 is the region after the first non-vowel following a vowel,
	//or the end of the word if it contains no such a non-vowel.
	$R1 = 0;
	while (($R1 < $word_len) && (mb_strpos($vowels, mb_substr($word, $R1, 1)) === false))
	{
		$R1++;
	}
	while (($R1 < $word_len) && (mb_strpos($vowels, mb_substr($word, $R1, 1)) !== false))
	{
		$R1++;
	}
	if ($R1 < $word_len)
	{
		$R1++;
	}

	//R2 is the region after the first non-vowel following a vowel in R1,
	//or is the null region at the end of the word if there is no such non-vowel.
	//Note: R2 is derived from R1 before R1 gets adjusted below.
	$R2 = $R1;
	while (($R2 < $word_len) && (mb_strpos($vowels, mb_substr($word, $R2, 1)) === false))
	{
		$R2++;
	}
	while (($R2 < $word_len) && (mb_strpos($vowels, mb_substr($word, $R2, 1)) !== false))
	{
		$R2++;
	}
	if ($R2 < $word_len)
	{
		$R2++;
	}

	//R1 is adjusted so that the region before it contains at least 3 letters.
	if ($R1 < 3)
	{
		$R1 = 3;
	}

	//Define a valid s-ending as one of b, d, f, g, h, k, l, m, n, r or t.
	$s_ending = 'BDFGHKLMNRT';

	//Define a valid st-ending as the same list, excluding letter r.
	$st_ending = 'BDFGHKLMNT';

	$word_r1 = mb_substr($word, $R1);

	//Step 1:
	//Search for the longest among the following suffixes
	//(a) em ern er
	//and delete if in R1
	if (preg_match('/(ERN|EM|ER)$/', $word_r1, $match))
	{
		$word = mb_substr($word, 0, -mb_strlen($match[1]));
	}
	//(b) e en es
	//If an ending of group (b) is deleted, and the ending is preceded by niss, delete the final s
	elseif (preg_match('/(ES|EN|E)$/', $word_r1, $match))
	{
		$word = mb_substr($word, 0, -mb_strlen($match[1]));
		if (preg_match('/NISS$/', $word))
		{
			$word = mb_substr($word, 0, -1);
		}
	}
	//(c) s (preceded by a valid s-ending)
	//the letter of the valid s-ending is not necessarily in R1
	elseif (mb_substr($word_r1, -1) === 'S' && preg_match('/[' . $s_ending . ']S$/', $word))
	{
		$word = mb_substr($word, 0, -1);
	}

	//The word may have been shortened, so the R1 tail is recomputed.
	$word_r1 = mb_substr($word, $R1);

	//Step 2:
	//Search for the longest among the following suffixes,
	//(a) en er est
	//and delete if in R1.
	if (preg_match('/(EST|EN|ER)$/', $word_r1, $match))
	{
		$word = mb_substr($word, 0, -mb_strlen($match[1]));
	}
	//(b) st (preceded by a valid st-ending, itself preceded by at least 3 letters)
	//The "u" modifier makes ".{3,}" count letters; without it an umlaut counted as two.
	elseif (preg_match('/ST$/', $word_r1) && preg_match('/.{3,}[' . $st_ending . ']ST$/u', $word))
	{
		$word = mb_substr($word, 0, -2);
	}

	//Step 3: d-suffixes (*)
	//Search for the longest among the following suffixes, and perform the action indicated.

	$word_r2 = mb_substr($word, $R2);
	//keit
	//	delete if in R2
	//	if preceded by lich or ig, delete if in R2
	if (preg_match('/KEIT$/', $word_r2))
	{
		$word = mb_substr($word, 0, -4);
		$word_r2 = mb_substr($word, $R2);
		if (preg_match('/(LICH|IG)$/', $word_r2, $match))
		{
			$word = mb_substr($word, 0, -mb_strlen($match[1]));
		}
	}
	//lich heit
	//	delete if in R2
	//	if preceded by er or en, delete if in R1
	elseif (preg_match('/(LICH|HEIT)$/', $word_r2))
	{
		$word = mb_substr($word, 0, -4);
		$word_r1 = mb_substr($word, $R1);
		if (preg_match('/(ER|EN)$/', $word_r1))
		{
			$word = mb_substr($word, 0, -2);
		}
	}
	//end ung
	//	delete if in R2
	//	if preceded by ig, delete if in R2 and not preceded by e
	elseif (preg_match('/(END|UNG)$/', $word_r2))
	{
		$word = mb_substr($word, 0, -3);
		$word_r2 = mb_substr($word, $R2);
		if (preg_match('/(^|[^E])(IG)$/', $word_r2))
		{
			$word = mb_substr($word, 0, -2);
		}
	}
	//ig ik isch
	//	delete if in R2 and not preceded by e
	elseif (preg_match('/(^|[^E])(IG|IK|ISCH)$/', $word_r2, $match))
	{
		//Only the suffix itself (group 2) is removed, not the guarding character.
		$word = mb_substr($word, 0, -mb_strlen($match[2]));
	}

	//Finally,
	//turn the marked u and y back into upper case, and remove the umlaut accent from a, o and u.
	$word = str_replace(['u', 'y', 'Ä', 'Ö', 'Ü'], ['U', 'Y', 'A', 'O', 'U'], $word);

	return $word;
}
239
/**
 * Upper-cases the text and then replaces the upper-case umlauts Ä, Ö, Ü with A, O, U.
 *
 * @param string $sText Text to convert.
 * @return string Upper-cased text without umlaut accents.
 */
function stemming_upper_de($sText)
{
	$upperCased = mb_strtoupper($sText);

	return strtr($upperCased, [
		'Ä' => 'A',
		'Ö' => 'O',
		'Ü' => 'U',
	]);
}
stemming_upper_de($sText)
Определения stemming.php:240
stemming_stop_de($sWord)
Определения stemming.php:8
stemming_de($word)
Определения stemming.php:90
stemming_letter_de()
Определения stemming.php:3
if ($daysToExpire >= 0 && $daysToExpire < 60) elseif ($daysToExpire < 0)
Определения prolog_main_admin.php:393