Bitrix-D7 23.9
 
Загрузка...
Поиск...
Не найдено
schemaorg.php
1<?php
2
4
9
/**
 * URL-preview parser that reads schema.org microdata (itemscope / itemprop
 * attributes) from an HTML document and uses it to fill in the document's
 * title, description and image when those are still empty.
 */
class SchemaOrg extends Parser
{
    /** @var \DOMDocument|null DOM tree built by initializeDom() for the document being handled. */
    protected $dom;

    /** @var array Collected microdata: lower-cased itemprop name => extracted value. */
    protected $schemaMetadata = array();

    // NOTE(review): $this->documentEncoding is assigned in handle() but is not declared
    // in the visible part of this class - presumably it is declared elsewhere (e.g. on
    // Parser); confirm, because dynamic properties are deprecated since PHP 8.2.

    /**
     * Completes the empty title / description / image fields of the document
     * from its microdata markup.
     *
     * @param HtmlDocument $document Document to read the HTML from and to update.
     * @return null Always null; the results are written to $document.
     */
    public function handle(HtmlDocument $document)
    {
        // Forget whatever a previous call collected, so a reused parser instance
        // never leaks properties of one document into another.
        $this->schemaMetadata = array();
        $this->documentEncoding = $document->getEncoding();

        // Cheap pre-check: without the substring there is no microdata to parse.
        if (strpos($document->getHtml(), 'itemscope') === false)
        {
            return null;
        }

        if (!$this->initializeDom($document))
        {
            return null;
        }

        if (!$this->getSchemaMetadata())
        {
            return null;
        }

        if ($document->getTitle() == '' && isset($this->schemaMetadata['name']))
        {
            $document->setTitle($this->schemaMetadata['name']);
        }

        if ($document->getDescription() == '' && isset($this->schemaMetadata['description']))
        {
            $document->setDescription($this->schemaMetadata['description']);
        }

        if ($document->getImage() == '' && isset($this->schemaMetadata['image']))
        {
            $document->setImage($this->schemaMetadata['image']);
        }

        return null;
    }

    /**
     * Locates the first element with an itemscope attribute and collects the
     * properties found beneath it into $this->schemaMetadata.
     *
     * @return bool False when no itemscope element could be found, true otherwise.
     */
    protected function getSchemaMetadata()
    {
        // Starting with the first node with the itemscope attribute, to prevent
        // walking over the full document.
        $xpath = new \DOMXPath($this->dom);
        $itemScopeNodes = $xpath->query('//*[@itemscope]');

        // query() yields false on failure, hence the type check before ->length.
        if (!($itemScopeNodes instanceof \DOMNodeList) || $itemScopeNodes->length < 1)
        {
            return false;
        }

        $mainNode = $itemScopeNodes->item(0);
        if (!($mainNode instanceof \DOMElement))
        {
            return false;
        }

        $this->walkDomTree($mainNode);

        return true;
    }

    /**
     * Recursively visits the element and its descendant elements, passing each
     * one to handleNode().
     *
     * @param \DOMElement $currentNode Element to process.
     * @param int $currentDepth Recursion depth; 0 for the node the walk starts from.
     * @return void
     */
    protected function walkDomTree(\DOMElement $currentNode, $currentDepth = 0)
    {
        $this->handleNode($currentNode);

        // Children are visited when the node is the starting node carrying itemscope,
        // or a deeper node without itemscope. A nested itemscope therefore is not
        // descended into. The condition does not depend on the child, so it is
        // evaluated once instead of once per child.
        $descend = (($currentDepth == 0) === $currentNode->hasAttribute('itemscope'));
        if (!$descend)
        {
            return;
        }

        foreach ($currentNode->childNodes as $childNode)
        {
            if ($childNode instanceof \DOMElement)
            {
                $this->walkDomTree($childNode, $currentDepth + 1);
            }
        }
    }

    /**
     * Extracts the value of a property element; which attribute or content is
     * used depends on the tag name.
     *
     * @param \DOMElement $node Element carrying the itemprop attribute.
     * @return string|null Trimmed value converted to the document encoding, or
     *                     null when the tag is not supported or the value is empty.
     */
    protected function getSchemaPropertyValue(\DOMElement $node)
    {
        switch ($node->tagName)
        {
            case 'img':
                $result = $node->getAttribute('src');
                break;
            case 'meta':
                $result = $node->getAttribute('content');
                break;
            case 'a':
                $result = $node->getAttribute('href');
                break;
            case 'time':
                $result = $node->hasAttribute('datetime')
                    ? $node->getAttribute('datetime')
                    : $node->textContent;
                break;
            case 'div':
                $result = $this->getNodeInnerHtml($node);
                break;
            case 'p':
            case 'span':
            case 'h1':
            case 'h2':
            case 'h3':
            case 'h4':
            case 'h5':
            case 'h6':
                $result = $node->textContent;
                break;
            default:
                // Unsupported tag: nothing to extract. Returning here avoids handing
                // null to convertEncoding() / trim() (deprecated since PHP 8.1).
                return null;
        }

        // dom extension's internal encoding is always utf-8
        $result = Encoding::convertEncoding($result, 'utf-8', $this->documentEncoding);
        $result = trim((string)$result);

        return ($result !== '' ? $result : null);
    }

    /**
     * Stores the value of a property element in $this->schemaMetadata.
     * Elements without itemprop, and elements that also open a nested scope
     * (itemscope), are ignored.
     *
     * @param \DOMElement $node Element to inspect.
     * @return void
     */
    protected function handleNode(\DOMElement $node)
    {
        if (!$node->hasAttribute('itemprop') || $node->hasAttribute('itemscope'))
        {
            return;
        }

        $propertyValue = $this->getSchemaPropertyValue($node);
        if ($propertyValue === null)
        {
            // Do not let an empty or unsupported node wipe out a value that an
            // earlier node with the same property name has already provided.
            return;
        }

        // itemprop is a space-separated list of names; the value applies to each of them.
        $propertyNames = preg_split(
            '/\s+/',
            mb_strtolower($node->getAttribute('itemprop')),
            -1,
            PREG_SPLIT_NO_EMPTY
        );

        foreach ($propertyNames as $propertyName)
        {
            $this->schemaMetadata[$propertyName] = $propertyValue;
        }
    }

    /**
     * Serializes the children of the element to HTML.
     *
     * @param \DOMElement $element Element whose content is serialized.
     * @return string Concatenated HTML of all child nodes ("" when there are none).
     */
    protected function getNodeInnerHtml(\DOMElement $element)
    {
        $ownerDocument = $element->ownerDocument;
        $innerHtml = '';

        foreach ($element->childNodes as $child)
        {
            $innerHtml .= $ownerDocument->saveHTML($child);
        }

        return $innerHtml;
    }

    /**
     * Parses the HTML of the document into $this->dom.
     *
     * @param HtmlDocument $document Document providing the HTML and its encoding.
     * @return bool False when the DOM extension is unavailable or parsing failed.
     */
    protected function initializeDom(HtmlDocument $document)
    {
        if (!class_exists('DOMDocument'))
        {
            return false;
        }

        $this->dom = new \DOMDocument();

        // Prevents parsing errors bubbling. The previous mode is remembered so the
        // process-wide libxml setting can be put back afterwards.
        $previousErrorMode = libxml_use_internal_errors(true);
        try
        {
            // The leading pseudo-declaration tells libxml which encoding the markup is in.
            $result = $this->dom->loadHTML(
                '<?xml encoding="'.$document->getEncoding().'">'.$document->getHtml(),
                LIBXML_COMPACT
            );
        }
        finally
        {
            // Buffered errors are never read here; drop them so they do not pile up.
            libxml_clear_errors();
            libxml_use_internal_errors($previousErrorMode);
        }

        return $result;
    }
}
getNodeInnerHtml(\DOMElement $element)
initializeDom(HtmlDocument $document)
getSchemaPropertyValue(\DOMElement $node)
Definition schemaorg.php:94
walkDomTree(\DOMElement $currentNode, $currentDepth=0)
Definition schemaorg.php:78