Bitrix-D7 23.9
 
Указатель Классы Пространства имен Функции Переменные
Загрузка...
Поиск...
Не найдено
htmldocument.php
1<?php
2
4
11
13{
/** Upper bound on the number of entries stored in the EXTRA/IMAGES list. */
const MAX_IMAGES = 4;
/** Number of bytes of HTML kept by the constructor (1 MB). */
const MAX_HTML_LENGTH = 1048576;

/** @var Uri URI handed to the constructor. */
protected $uri;

/** @var string HTML source, cut to MAX_HTML_LENGTH bytes by the constructor. */
protected $html;

/** @var string|null Encoding set via setEncoding() or cached by getEncoding(). */
protected $htmlEncoding;

/**
 * @var array Collected metadata. An 'EXTRA' sub-array is created on demand
 *            by setExtraField().
 */
protected $metadata = [
    'TITLE' => null,
    'DESCRIPTION' => null,
    'IMAGE' => null,
    'EMBED' => null,
    'DATE_EXPIRE' => null,
];

/** @var array Cached result of extractElementAttributes('meta'). */
protected $metaElements = [];

/** @var array Cached result of extractElementAttributes('link'). */
protected $linkElements = [];
43
/**
 * Wraps an HTML source together with the URI it belongs to.
 *
 * @param string $html HTML source; only the first MAX_HTML_LENGTH bytes are kept.
 * @param Uri $uri URI associated with the document.
 */
public function __construct($html, Uri $uri)
{
    $this->uri = $uri;
    // Byte-wise truncation: the encoding is not known at this point.
    $this->html = substr($html, 0, self::MAX_HTML_LENGTH);
}
55
/**
 * Returns the URI that was passed to the constructor.
 *
 * @return Uri
 */
public function getUri()
{
    return $this->uri;
}
65
/**
 * Returns the stored HTML source (already truncated by the constructor).
 *
 * @return string
 */
public function getHtml()
{
    return $this->html;
}
75
/**
 * Tells whether all required metadata fields are filled.
 *
 * TITLE, DESCRIPTION and IMAGE are always required; EMBED is additionally
 * required when isEmbeddingAllowed() returns true.
 *
 * @return bool
 */
public function checkMetadata()
{
    $hasBasicFields = !(
        $this->metadata['TITLE'] == ''
        || $this->metadata['DESCRIPTION'] == ''
        || $this->metadata['IMAGE'] == ''
    );

    if (!$this->isEmbeddingAllowed())
    {
        return $hasBasicFields;
    }

    return $hasBasicFields && $this->metadata['EMBED'] != '';
}
94
/**
 * Returns the whole metadata array collected so far.
 *
 * @return array
 */
public function getMetadata()
{
    return $this->metadata;
}
105
/**
 * Returns the TITLE metadata entry.
 *
 * @return string|null Null until a non-empty title has been set.
 */
public function getTitle()
{
    return $this->metadata['TITLE'];
}
115
/**
 * Stores the title after passing it through filterString().
 * Empty values are ignored and leave the current title untouched.
 *
 * @param string $title Raw title text.
 * @return void
 */
public function setTitle($title)
{
    if ($title == '')
    {
        return;
    }

    $this->metadata['TITLE'] = $this->filterString($title);
}
129
/**
 * Returns the DESCRIPTION metadata entry.
 *
 * @return string|null Null until a non-empty description has been set.
 */
public function getDescription()
{
    return $this->metadata['DESCRIPTION'];
}
137
/**
 * Stores the description after passing it through filterString().
 * Empty values are ignored and leave the current description untouched.
 *
 * @param string $description Raw description text.
 * @return void
 */
public function setDescription($description)
{
    if ($description == '')
    {
        return;
    }

    $this->metadata['DESCRIPTION'] = $this->filterString($description);
}
151
/**
 * Returns the IMAGE metadata entry.
 *
 * @return string|null Null until an image URL has been accepted by setImage().
 */
public function getImage()
{
    return $this->metadata['IMAGE'];
}
159
/**
 * Sets the main image of the document.
 *
 * The value is normalized with normalizeImageUrl() and stored only when the
 * normalized URL is not null and validateImage() accepts it.
 *
 * @param string $image Image URL as found in the document.
 * @return void
 */
public function setImage($image)
{
    if ($image == '')
    {
        return;
    }

    $imageUrl = $this->normalizeImageUrl($image);
    if ($imageUrl === null)
    {
        return;
    }

    if ($this->validateImage($imageUrl, true))
    {
        $this->metadata['IMAGE'] = $imageUrl;
    }
}
177
/**
 * Returns the EMBED metadata entry.
 *
 * Note: the method name is misspelled ("Emdbed"); it is kept as is because
 * it is part of the public interface.
 *
 * @return mixed Value previously stored by setEmbed(), or null.
 */
public function getEmdbed()
{
    return $this->metadata['EMBED'];
}
185
/**
 * Stores the embed value, but only when isEmbeddingAllowed() returns true.
 * The value is stored as given, without filtering.
 *
 * @param mixed $embed Embed value to store.
 * @return void
 */
public function setEmbed($embed)
{
    if (!$this->isEmbeddingAllowed())
    {
        return;
    }

    $this->metadata['EMBED'] = $embed;
}
199
/**
 * Stores an additional metadata field under $this->metadata['EXTRA'].
 *
 * - 'FAVICON': the value is converted to an absolute URI.
 * - 'IMAGES': an array value replaces the stored list with the normalized
 *   URLs, at most MAX_IMAGES of them; a non-array value is ignored.
 * - any other name: the value is passed through filterString().
 *
 * @param string $fieldName Name of the extra field.
 * @param mixed $fieldValue Value of the extra field.
 * @return void
 */
public function setExtraField($fieldName, $fieldValue)
{
    if ($fieldName == 'FAVICON')
    {
        $this->metadata['EXTRA'][$fieldName] = $this->convertRelativeUriToAbsolute($fieldValue);
        return;
    }

    if ($fieldName != 'IMAGES')
    {
        $this->metadata['EXTRA'][$fieldName] = $this->filterString($fieldValue);
        return;
    }

    if (!is_array($fieldValue))
    {
        return;
    }

    $this->metadata['EXTRA']['IMAGES'] = [];
    foreach ($fieldValue as $candidate)
    {
        $normalized = $this->normalizeImageUrl($candidate);
        if ($normalized)
        {
            $this->metadata['EXTRA']['IMAGES'][] = $normalized;
        }

        // Stop as soon as the list is full.
        if (count($this->metadata['EXTRA']['IMAGES']) >= self::MAX_IMAGES)
        {
            break;
        }
    }
}
240
/**
 * Returns an extra field previously stored with setExtraField().
 *
 * @param string $fieldName Name of the extra field.
 * @return mixed Stored value, or null when the field has not been set.
 */
public function getExtraField($fieldName)
{
    return $this->metadata['EXTRA'][$fieldName] ?? null;
}
250
/**
 * Stores the expiration date when none is stored yet, or when the given date
 * is earlier than the stored one (the earliest date wins).
 *
 * @param DateTime $dateExpire Candidate expiration date.
 * @return void
 */
public function setDateExpire(DateTime $dateExpire)
{
    $current = $this->metadata['DATE_EXPIRE'] ?? null;

    if ($current === null || $current->getTimestamp() > $dateExpire->getTimestamp())
    {
        $this->metadata['DATE_EXPIRE'] = $dateExpire;
    }
}
263
/**
 * Returns the DATE_EXPIRE metadata entry.
 *
 * @return DateTime|null Null when no expiration date has been set.
 */
public function getDateExpire(): ?DateTime
{
    return $this->metadata['DATE_EXPIRE'];
}
273
/**
 * Sets the document encoding explicitly.
 * Surrounding whitespace, NUL bytes and quote characters are stripped first.
 *
 * @param string $encoding Encoding name.
 * @return void
 */
public function setEncoding($encoding)
{
    $this->htmlEncoding = trim($encoding, " \t\n\r\0\x0B'\"");
}
285
/**
 * Returns the document encoding.
 * When no encoding is stored yet, the result of detectEncoding() is cached
 * and returned.
 *
 * @return string Encoding name; may be empty when nothing was detected.
 */
public function getEncoding()
{
    if ($this->htmlEncoding == '')
    {
        $this->htmlEncoding = $this->detectEncoding();
    }

    return $this->htmlEncoding;
}
299
/**
 * Detects the document encoding from its <meta> elements.
 *
 * Elements are scanned in document order. The first
 * <meta http-equiv="content-type"> whose content holds "charset=..." or the
 * first <meta charset="..."> wins.
 *
 * @return string Detected encoding name, or an empty string when none is found.
 */
public function detectEncoding()
{
    if (empty($this->metaElements))
    {
        $this->metaElements = $this->extractElementAttributes('meta');
    }

    foreach ($this->metaElements as $metaElement)
    {
        if (isset($metaElement['http-equiv']) && mb_strtolower($metaElement['http-equiv']) == 'content-type')
        {
            // The content attribute may be absent; fall back to an empty
            // string to avoid an "Undefined array key" warning and passing
            // null to preg_match().
            $content = $metaElement['content'] ?? '';
            if (preg_match('/charset=([\w\-]+)/', $content, $matches))
            {
                return $matches[1];
            }
        }
        elseif (isset($metaElement['charset']))
        {
            return $metaElement['charset'];
        }
    }

    return '';
}
332
/**
 * Extracts the quoted attributes of every element with the given tag name.
 *
 * Only attributes written as name="value" or name='value' are recognized.
 * Attribute names are lower-cased; values are returned as found in the HTML.
 *
 * @param string $tagName Tag name to look for (matched case-insensitively).
 * @return array List of [attributeName => attributeValue] maps, one per element.
 */
public function extractElementAttributes($tagName)
{
    $results = [];

    // The tag name is escaped so it cannot alter the pattern. The lookahead
    // requires the name to end here, so "meta" does not match "<metadata ...>".
    // [^>]* also lets an attribute-less "<tag>" match on its own instead of
    // swallowing the next tag.
    $tagPattern = '/<' . preg_quote($tagName, '/') . '(?=[\s\/>])[^>]*>/i';
    if (!preg_match_all($tagPattern, $this->html, $elements))
    {
        return $results;
    }

    foreach ($elements[0] as $element)
    {
        // \g{-2} refers back to the opening quote so that the value ends at
        // the matching quote character.
        preg_match_all('/(?:([\w\-_]+)=([\'"])(.*?)\g{-2}\s*)/mis', $element, $matches);

        $elementAttributes = [];
        foreach (($matches[1] ?? []) as $k => $attributeName)
        {
            $elementAttributes[mb_strtolower($attributeName)] = $matches[3][$k];
        }

        $results[] = $elementAttributes;
    }

    return $results;
}
361
/**
 * Returns the content of the first <meta> element whose "name" or "property"
 * attribute equals the given name (case-insensitively) and whose content is
 * not empty.
 *
 * @param string $name Value of the name/property attribute to look for.
 * @return string|null Content attribute value, or null when nothing matches.
 */
public function getMetaContent($name)
{
    if (empty($this->metaElements))
    {
        $this->metaElements = $this->extractElementAttributes('meta');
    }
    $name = mb_strtolower($name);

    foreach ($this->metaElements as $metaElement)
    {
        $matchesName = isset($metaElement['name']) && mb_strtolower($metaElement['name']) === $name;
        $matchesProperty = isset($metaElement['property']) && mb_strtolower($metaElement['property']) === $name;
        if (!$matchesName && !$matchesProperty)
        {
            continue;
        }

        // A matching element may have no content attribute at all; treat that
        // like an empty one instead of reading an undefined array key.
        $content = $metaElement['content'] ?? '';
        if ($content !== '')
        {
            return $content;
        }
    }

    return null;
}
388
/**
 * Returns the href of the first <link> element whose "rel" attribute equals
 * the given value (case-insensitively) and whose href is not empty.
 *
 * @param string $rel Value of the rel attribute to look for.
 * @return string|null Href attribute value, or null when nothing matches.
 */
public function getLinkHref($rel)
{
    if (empty($this->linkElements))
    {
        $this->linkElements = $this->extractElementAttributes('link');
    }
    $rel = mb_strtolower($rel);

    foreach ($this->linkElements as $linkElement)
    {
        // Strict comparison, consistent with getMetaContent().
        if (!isset($linkElement['rel']) || mb_strtolower($linkElement['rel']) !== $rel)
        {
            continue;
        }

        // A matching element may have no href attribute at all; treat that
        // like an empty one instead of reading an undefined array key.
        $href = $linkElement['href'] ?? '';
        if ($href !== '')
        {
            return $href;
        }
    }

    return null;
}
415
/**
 * Cleans a text value taken from the document.
 *
 * Steps, in order: decode HTML entities using the document encoding, convert
 * from the document encoding to the charset of the current culture, trim
 * surrounding whitespace, strip tags.
 *
 * @param string $str Raw text.
 * @return string Filtered text.
 */
protected function filterString($str)
{
    $documentEncoding = $this->getEncoding();

    $decoded = html_entity_decode($str, ENT_QUOTES, $documentEncoding);
    $converted = Encoding::convertEncoding(
        $decoded,
        $documentEncoding,
        Context::getCurrent()->getCulture()->getCharset()
    );

    return strip_tags(trim($converted));
}
431
438 {
439 if (strpos($uri, '//') === 0)
440 {
441 $uri = $this->uri->getScheme().":".$uri;
442 }
443
444 if (preg_match('#^https?://#', $uri))
445 {
446 return $uri;
447 }
448
449 $pars = parse_url($uri);
450 if ($pars === false)
451 {
452 return null;
453 }
454
455 if (isset($pars['host']))
456 {
457 $result = $uri;
458 }
459 elseif (isset($pars['path']))
460 {
461 if (mb_substr($pars['path'], 0, 1) !== '/')
462 {
463 $pathPrefix = preg_replace('/^(.+?)([^\/]*)$/', '$1', $this->uri->getPath());
464 $pars['path'] = $pathPrefix.$pars['path'];
465 }
466
467 $uriPort = '';
468 if ($this->uri->getScheme() === 'http' && $this->uri->getPort() != '80'
469 || $this->uri->getScheme() === 'https' && $this->uri->getPort() != '443')
470 {
471 $uriPort = ':'.$this->uri->getPort();
472 }
473
474 $result = $this->uri->getScheme().'://'
475 .$this->uri->getHost()
476 .$uriPort
477 .$pars['path']
478 .(isset($pars['query']) ? '?'.$pars['query'] : '')
479 .(isset($pars['fragment']) ? '#'.$pars['fragment'] : '');
480 }
481 else
482 {
483 $result = null;
484 }
485
486 return $result;
487 }
488
/**
 * Converts an image URL to absolute form and rejects overly long URLs.
 *
 * @param string $url Image URL as found in the document.
 * @return string|null Absolute URL, or null when it could not be built or is
 *                     longer than MAX_IMAGE_URL_LENGTH characters.
 */
protected function normalizeImageUrl($url): ?string
{
    $absoluteUrl = $this->convertRelativeUriToAbsolute($url);

    // convertRelativeUriToAbsolute() may return null; check that first so
    // null is never passed to mb_strlen() (deprecated since PHP 8.1).
    if ($absoluteUrl === null || mb_strlen($absoluteUrl) > self::MAX_IMAGE_URL_LENGTH)
    {
        return null;
    }

    return $absoluteUrl;
}
503
/**
 * Checks with a HEAD request that the URL points to an image.
 *
 * @param string $url Absolute image URL.
 * @param bool $skipForPrivateIp When true, a failed request whose first error
 *                               code is 'PRIVATE_IP' counts as valid.
 * @return bool True when the response status is 200 and MimeType::isImage()
 *              accepts its content type (or the private-IP exception applies).
 */
protected function validateImage($url, $skipForPrivateIp = false)
{
    $client = new HttpClient();
    $client->setTimeout(5);
    $client->setStreamTimeout(5);
    // NOTE(review): presumably forbids requests to private IP addresses;
    // the resulting 'PRIVATE_IP' error is handled below - confirm.
    $client->setPrivateIp(false);
    $client->setHeader('User-Agent', UrlPreview::USER_AGENT);

    $requestSucceeded = $client->query('HEAD', $url);
    if (!$requestSucceeded)
    {
        $firstErrorCode = array_key_first($client->getError());

        return ($skipForPrivateIp && $firstErrorCode === 'PRIVATE_IP');
    }

    if ($client->getStatus() !== 200)
    {
        return false;
    }

    return MimeType::isImage($client->getHeaders()->getContentType());
}
532
/**
 * Tells whether embed data may be stored for this document, i.e. whether
 * UrlPreview::isHostTrusted() accepts the document URI.
 *
 * @return bool
 */
protected function isEmbeddingAllowed()
{
    return UrlPreview::isHostTrusted($this->uri);
}
541}
static getCurrent()
Definition context.php:241
validateImage($url, $skipForPrivateIp=false)
setExtraField($fieldName, $fieldValue)
setDateExpire(DateTime $dateExpire)
static isImage($mime)
Definition mimetype.php:269