Bitrix-D7 23.9
 
Загрузка...
Поиск...
Не найдено
htmlparser.php
1<?php
2namespace Bitrix\Main\Web\DOM;
3
4use \Bitrix\Main\Text\HtmlFilter;
5
6class HtmlParser extends Parser
7{
8 public $debugTime = 0;
9
10 protected $tagsMustBeClosed = array('SCRIPT', 'STYLE');
11
12
13 public $storePhpCode = true;
14
15 protected static $objectCounter = 0;
18 protected $storedPHP = array();
19
/**
 * Gives the parser its own sequence number and installs the default HTML config.
 *
 * The sequence number is taken from the class-wide counter and is later embedded
 * into the PHP placeholder names built by storePHP().
 */
public function __construct()
{
	$this->currentObjectNumber = ++static::$objectCounter;
	$this->storedItemCounter = 0;

	$this->setConfig(new HtmlParserConfig());
}
28
/**
 * Serializes a DOM node back to its HTML source.
 *
 * @param Node $node Node to serialize.
 * @return string HTML for the node; an empty string for node types that are not handled here.
 */
public function getSource(Node $node)
{
	// NOTE(review): the ELEMENT / ATTRIBUTE / COMMENT / DOCUMENT_TYPE case labels were absent
	// from the reviewed listing and have been restored from the @var hints of each branch.
	// Confirm the constant names against the Node class.
	$source = '';
	switch($node->getNodeType())
	{
		case Node::ELEMENT_NODE:

			/*@var $node Element*/
			$source = $this->getSourceElement($node);
			break;

		case Node::ATTRIBUTE_NODE:

			/*@var $node Attr*/
			$source = $this->getSourceAttr($node);
			break;

		case Node::TEXT_NODE:

			/*@var Text $node*/
			$parent = $node->getParentNode();
			if($parent && in_array($parent->getNodeName(), $this->tagsMustBeClosed))
			{
				// Content of SCRIPT/STYLE is emitted verbatim: escaping it would corrupt the code.
				$source = $node->getNodeValue();
			}
			else
			{
				$source = HtmlFilter::encode($node->getNodeValue(), ENT_QUOTES);
			}

			break;

		case Node::COMMENT_NODE:

			/*@var Comment $node*/
			$source = '<!--' . $node->getNodeValue() . '-->';
			if($this->storePhpCode)
			{
				// PHP fragments were swapped for comment placeholders by storePHP(); put them back.
				$source = $this->restorePHP($source);
			}
			break;

		case Node::DOCUMENT_TYPE_NODE:

			/*@var DocumentType $node*/
			$source = $this->getSourceDocType($node);
			break;
	}

	return $source;
}
83
/**
 * Renders one attribute as name="value" with the value HTML-encoded.
 *
 * @param Attr $node Attribute to serialize.
 * @return string
 */
protected function getSourceAttr(Attr $node)
{
	$attrName = $node->getName();
	$escapedValue = HtmlFilter::encode($node->getValue());

	return $attrName . '="' . $escapedValue . '"';
}
88
/**
 * Serializes an element together with its attributes and children.
 *
 * The tag name is written in lower case. An element without child nodes is
 * written in the self-closing form "<name />".
 *
 * @param Element $node Element to serialize.
 * @return string
 */
protected function getSourceElement(Element $node)
{
	$nodeName = mb_strtolower($node->getNodeName());
	$source = '<' . $nodeName;
	if($node->hasAttributes())
	{
		$attrList = $node->getAttributesArray();
		foreach($attrList as $attr)
		{
			$source .= ' ' . $this->getSource($attr);
		}
	}

	if($node->hasChildNodes())
	{
		$source .= '>';

		// NOTE(review): the condition of this if/else was absent from the reviewed listing.
		// It is restored as Node::$isNodeListAsArray because the listing references that
		// static (node.php:19) — confirm against the original file.
		if(Node::$isNodeListAsArray)
		{
			$childNodes = $node->getChildNodesArray();
			foreach ($childNodes as $child)
			{
				$source .= $this->getSource($child);
			}
		}
		else
		{
			// Fetch the list once instead of on every iteration.
			$childList = $node->getChildNodes();
			for($i = 0; $i < $childList->getLength(); $i++)
			{
				$source .= $this->getSource($childList->item($i));
			}
		}


		$source .= '</' . $nodeName . '>';
	}
	else
	{
		$source .= ' />';
	}

	return $source;
}
132
/**
 * Returns the doctype declaration; the node itself is not inspected and
 * the HTML5 doctype is always produced.
 *
 * @param DocumentType $node Doctype node (unused).
 * @return string
 */
protected function getSourceDocType(DocumentType $node)
{
	return '<!DOCTYPE html>';
}
138
/**
 * Splits markup into chunks at '<' and '>' and hands every chunk to getNextNode().
 *
 * PHP fragments are first replaced by placeholders (storePhpCode on) or turned
 * into HTML comments (storePhpCode off).
 *
 * @param string $text Markup to parse.
 * @param Node $node Node the first chunk is attached to.
 * @return Node|null Node returned by the last getNextNode() call, or null as soon as
 *                   getNextNode() stops returning a node.
 */
public function parse($text = "", Node $node)
{
	$text = $this->storePhpCode ? $this->storePHP($text) : $this->commentPHP($text);

	$insideTag = true;
	$chunk = '';

	for($pos = 0, $length = strlen($text); $pos < $length; $pos++)
	{
		$char = substr($text, $pos, 1);
		switch($char)
		{
			case '<':
				// Everything collected so far is complete; a new chunk starts with this '<'.
				$node = $this->getNextNode($chunk, $node);
				$chunk = $char;
				$insideTag = true;
				break;

			case '>':
				$chunk .= $char;
				if($insideTag)
				{
					$node = $this->getNextNode($chunk, $node);
					$chunk = '';
				}
				// A '>' that does not close an opened '<' stays part of the current chunk.
				$insideTag = false;
				break;

			default:
				$chunk .= $char;
				break;
		}

		if(!$node)
		{
			return null;
		}
	}

	// Flush whatever follows the last delimiter.
	if($chunk != '')
	{
		$node = $this->getNextNode($chunk, $node);
	}

	return $node;
}
196
/**
 * Splits the inside of a tag (without the angle brackets) into name and attributes.
 *
 * @param string $text Tag content, e.g. 'a href="/"'.
 * @return array Array with keys 'NAME' (upper-cased tag name) and 'ATTRIBUTES' (name => value map).
 */
protected function parseElement($text)
{
	$result = array('NAME' => '', 'ATTRIBUTES' => array());

	if(preg_match('/[ \t\r\n]/S', $text, $matches, PREG_OFFSET_CAPTURE))
	{
		// PREG_OFFSET_CAPTURE reports a BYTE offset, so the string has to be cut with the
		// byte-based substr(); mb_substr() would cut at the wrong place when multibyte
		// characters precede the first whitespace.
		$delimiterPosition = $matches[0][1];
		$result['NAME'] = mb_strtoupper(substr($text, 0, $delimiterPosition));
		// The cast keeps an empty string when nothing follows the delimiter (substr() may return false before PHP 8).
		$textAttr = (string)substr($text, $delimiterPosition + 1);
		$result['ATTRIBUTES'] = $this->parseAttributes($textAttr);
	}
	else
	{
		$result['NAME'] = mb_strtoupper($text);
	}

	return $result;
}
215
/**
 * Placeholder for doctype parsing: no fields are extracted yet.
 *
 * @param string $text Doctype declaration content (currently ignored).
 * @return array Always an empty array.
 */
protected function parseDocType($text)
{
	$fields = array();

	return $fields;
}
220
/**
 * Extracts attributes from the attribute part of a tag.
 *
 * Quoted values are returned without the quotes; attributes without a quoted
 * value get an empty string. The entities for ", <, > and & are decoded, and only
 * when the text contains an ampersand at all.
 *
 * @param string $text Attribute part of a tag, e.g. 'class="a" id="b"'.
 * @return array Map of attribute name => value.
 */
protected function parseAttributes($text)
{
	static $search = array(
		"'&(quot|#34);'i",
		"'&(lt|#60);'i",
		"'&(gt|#62);'i",
		"'&(amp|#38);'i",
	);

	static $replace = array(
		"\"",
		"<",
		">",
		"&",
	);

	if ($text === "")
	{
		return array();
	}

	preg_match_all("/(?'name'[\w\-_:?&]+)(?'eq'\s*=\s*)?(?(eq)([\"'])(?'val'.*?)\g{-2})/s", $text, $found);

	// Without an ampersand there is nothing to decode, so the regex pass is skipped.
	$hasEntities = (strpos($text, "&") !== false);

	$attributes = array();
	foreach($found['name'] as $index => $name)
	{
		$value = $found['val'][$index];
		$attributes[$name] = $hasEntities ? preg_replace($search, $replace, $value) : $value;
	}

	return $attributes;
}
259
/**
 * Older attribute parser: finds name=value pairs and strips the surrounding quotes.
 *
 * @param string $text Attribute part of a tag.
 * @return array Map of attribute name => value.
 */
protected function parseAttributesOld($text)
{
	preg_match_all("/\b([\w_-]+\s*=\s*([\"']*)[^\\2]+?\\2)/", $text, $pairs);

	$attributeList = array();
	foreach($pairs[0] as $pair)
	{
		// Split on the FIRST '=' only: a value may itself contain '=' (e.g. a query string)
		// and must not be cut at that point.
		$parts = preg_split("/\s*=\s*/", $pair, 2);

		$name = preg_replace("/(^['\"]|['\"]$)/", "", $parts[0]);
		$value = preg_replace("/(^['\"]|['\"]$)/", "", $parts[1]);

		$attributeList[$name] = $value;
	}

	return $attributeList;
}
282
/**
 * Turns one chunk of markup into a node (or closes an open one) and attaches it to the tree.
 *
 * A chunk is either a complete "<...>" tag, the beginning of a comment, or plain text.
 * parse() passes the returned node back in as $parentNode for the following chunk.
 *
 * @param string $tag Chunk of markup.
 * @param Node $parentNode Node that is currently open.
 * @return Node|null Node to use as parent for the next chunk.
 */
protected function getNextNode($tag, Node $parentNode)
{
	$node = null;
	$isSingleTag = true;

	static $tagsWithoutClose = array('INPUT'=>1, 'IMG'=>1, 'BR'=>1, 'HR'=>1, 'META'=>1, 'AREA'=>1, 'BASE'=>1, 'COL'=>1, 'EMBED'=>1, 'KEYGEN'=>1, 'LINK'=>1, 'PARAM'=>1, 'SOURCE'=>1, 'TRACK'=>1, 'WBR'=>1);
	$tagsCantHaveNestedTags = array();

	$document = $parentNode->getOwnerDocument();

	if($parentNode->getNodeType() === Node::COMMENT_NODE)
	{
		// An unfinished comment is open: keep appending until '-->' shows up.
		$commentClosePosition = mb_strpos($tag, '-->');
		if($commentClosePosition !== false)
		{
			$clean = mb_substr($tag, 0, $commentClosePosition);
			$parentNode->setNodeValue($parentNode->getNodeValue() . $clean);
			$parentNode->bxNodeFoundCloseTag = true;

			$tag = mb_substr($tag, $commentClosePosition + 3);
			// Compare with '' explicitly: a plain truthiness check would also discard
			// a remainder consisting of the single character "0".
			if((string)$tag === '')
			{
				return $parentNode->getParentNode();
			}

			// The remainder after '-->' is processed below as a child of the comment's parent.
			$parentNode = $parentNode->getParentNode();
		}
		else
		{
			$parentNode->setNodeValue($parentNode->getNodeValue() . $tag);
			return $parentNode;
		}
	}
	elseif(in_array($parentNode->getNodeName(), $this->tagsMustBeClosed, true))
	{
		// Inside SCRIPT/STYLE everything is raw text until the matching close tag.
		// The compared length is derived from the tag name instead of a fixed 9 ('</SCRIPT>').
		$closeTag = '</' . $parentNode->getNodeName() . '>';
		if(mb_strtoupper(mb_substr($tag, -mb_strlen($closeTag))) === $closeTag)
		{
			$parentNode->bxNodeFoundCloseTag = true;
			$parentNode = $parentNode->getParentNode();
		}
		else
		{
			$firstChild = $parentNode->getFirstChild();
			if(!$firstChild)
			{
				$parentNode->appendChild($document->createTextNode($tag));
			}
			else
			{
				$firstChild->setNodeValue($firstChild->getNodeValue() . $tag);
			}

			$parentNode->bxNodeFoundCloseTag = false;
			return $parentNode;
		}
	}

	if(mb_substr($tag, 0, 2) === '</')
	{
		// closed tag
		//TODO: find closest opened parent with same nodeName and return it
		$cleaned = mb_strtoupper(mb_substr($tag, 2, -mb_strlen('>')));
		$searchableNode = $parentNode;
		$isSearchableNodeFound = false;

		// Walk up the ancestors looking for the element this tag closes,
		// remembering every node on the way that has not been closed yet.
		$unclosedNodes = array();
		do
		{
			if(!$searchableNode->bxNodeFoundCloseTag)
			{
				$unclosedNodes[] = $searchableNode;
			}

			if($searchableNode->getNodeName() === $cleaned)
			{
				$isSearchableNodeFound = true;
				break;
			}
		}while($searchableNode = $searchableNode->getParentNode());

		if($isSearchableNodeFound)
		{
			foreach($unclosedNodes as $unclosedNode)
			{
				/* @var $unclosedNode Node */
				if(in_array($unclosedNode->getNodeName(), $tagsCantHaveNestedTags, true))
				{
					if($unclosedNode->hasChildNodes())
					{
						foreach ($unclosedNode->getChildNodesArray() as $childNode)
						{
							$unclosedNode->getParentNode()->appendChild($unclosedNode->removeChild($childNode));
						}
					}
				}

				$unclosedNode->bxNodeFoundCloseTag = true;
			}

			return $searchableNode->getParentNode();
		}
		else
		{
			// A close tag without a matching open tag is tolerated: stay on the current node.
			if ($parentNode->getParentNode())
			{
				$parentNode->getParentNode()->bxNodeFoundCloseTag = true;
			}
			return $parentNode;
		}
	}
	elseif(mb_substr($tag, 0, 4) === '<!--')
	{
		// Comment
		$cleaned = mb_substr($tag, 4);
		if(mb_substr($tag, -3) == '-->')
		{
			$cleaned = mb_substr($cleaned, 0, -3);
			$parentNode->bxNodeFoundCloseTag = true;
		}
		else
		{
			// The comment continues in the following chunks, so it becomes the open node.
			$isSingleTag = false;
		}


		//$parentNode->bxNodeFoundCloseTag = false;
		$node = $document->createComment($cleaned);
	}
	elseif(mb_substr($tag, 0, 1) === '<')
	{

		// Element
		if(mb_substr($tag, -2) === '/>')
		{
			// empty tag
			$cleaned = mb_substr($tag, 1, -2);
			$bxNodeWithCloseTag = false;
		}
		else
		{
			$cleaned = mb_substr($tag, 1, -1);
			$isSingleTag = false;
			$bxNodeWithCloseTag = true;
		}

		$list = $this->parseElement($cleaned);

		$isDocType = mb_substr($list['NAME'], 0, mb_strlen('!DOCTYPE')) === '!DOCTYPE';

		// Tags from $tagsWithoutClose and the doctype never get children, whatever the markup says.
		if(isset($tagsWithoutClose[$list['NAME']]) || $isDocType)
		{
			$bxNodeWithCloseTag = false;
			$isSingleTag = true;
		}

		if($isDocType)
		{
			$list = $this->parseDocType($cleaned);
			//TODO: set doctype fields
		}
		else
		{
			$node = $document->createElement($list['NAME']);
			foreach($list['ATTRIBUTES'] as $attrName => $attrValue)
			{
				$nodeAttr = $document->createAttribute($attrName, $attrValue);
				$node->setAttributeNode($nodeAttr);
			}
			$node->bxNodeWithCloseTag = $bxNodeWithCloseTag;
		}
	}
	else
	{
		// Text
		$cleaned = html_entity_decode($tag, ENT_QUOTES, (defined("BX_UTF") ? "UTF-8" : "ISO-8859-1"));
		$node = $document->createTextNode($cleaned);
	}

	if($node && $parentNode)
	{
		$parentNode->appendChild($node);
		if(!$isSingleTag)
		{
			// The new node stays open and receives the following chunks.
			return $node;
		}
	}

	return $parentNode;
}
481
/**
 * Turns PHP open/close tags into HTML comment delimiters.
 *
 * @param string $html Markup that may contain PHP fragments.
 * @return string Markup with '<?' replaced by '<!--' and '?>' replaced by '-->'.
 */
public function commentPHP($html)
{
	$html = str_replace('<?', '<!--', $html);
	$html = str_replace('?>', '-->', $html);

	return $html;
}
491
/**
 * Replaces every PHP fragment with an HTML comment placeholder and remembers the fragment.
 *
 * Each match is replaced exactly where it was found. Replacing by fragment text
 * afterwards could hit the same text inside a longer fragment and leave that
 * longer fragment in the markup as raw PHP.
 *
 * @param string $html Markup that may contain PHP fragments.
 * @return string Markup with placeholders; the input unchanged if the regex engine fails.
 */
public function storePHP($html)
{
	$prefix = 'BX_DOM_DOCUMENT_PHP_SLICE_PLACEHOLDER_' . $this->currentObjectNumber . '_';

	$replaced = preg_replace_callback(
		'/(<\?[\W\w\n]*?\?>)/i',
		function ($match) use ($prefix)
		{
			$this->storedItemCounter++;
			$placeholder = '<!--' . $prefix . $this->storedItemCounter . '-->';
			$this->storedPHP[$placeholder] = $match[0];

			return $placeholder;
		},
		$html
	);

	// preg_replace_callback() returns null on a PCRE error (e.g. backtrack limit hit).
	return ($replaced === null ? $html : $replaced);
}
515
/**
 * Puts the stored PHP fragments back in place of their placeholders.
 *
 * @param string $html Markup containing placeholders created by storePHP().
 * @return string Markup with the original PHP fragments.
 */
public function restorePHP($html)
{
	$placeholders = array_keys($this->storedPHP);
	$phpFragments = array_values($this->storedPHP);

	return str_replace($placeholders, $phpFragments, $html);
}
530}
static encode($string, $flags=ENT_COMPAT, $doubleEncode=true)
getSourceDocType(DocumentType $node)
parse($text="", Node $node)
getNextNode($tag, Node $parentNode)
static $isNodeListAsArray
Definition node.php:19
appendChild(Node $newChild)
Definition node.php:264
setConfig(ParserConfig $config)
Definition parser.php:22