page: strip metadata tags from article preface
author: Mischa POSLAWSKY <perl@shiar.org>
Fri, 12 Jul 2019 02:00:57 +0000 (04:00 +0200)
committer: Mischa POSLAWSKY <perl@shiar.org>
Fri, 12 Jul 2019 02:18:16 +0000 (04:18 +0200)
Search the top of the html body for all <meta /> tags intended as page data,
and make them available through the object's $meta property instead of showing
them directly as contents.  Matches for description were already returned as
teaser, but were not removed from the contents.  Similar overrides are now
considered for the title and image methods as well.

article.inc.php

index a6a4ef800e9504f7440a5d34b42d774606c2fe96..3c65a97d871b7fb8245023785a40f74646dc73ad 100644 (file)
@@ -17,6 +17,7 @@ function showdate($parts)
 class ArchiveArticle
 {
        public $raw, $preface, $title, $body;
+       public $meta = [];
 
        function __construct($path)
        {
@@ -24,6 +25,15 @@ class ArchiveArticle
                $this->link = preg_replace('{(?:/index)?\.html$}', '', $path);
                if (file_exists($this->page)) {
                        $this->raw = file_get_contents($this->page);
+
+                       if (preg_match_all('{
+                               \G <meta \s+ property="( [^"]+ )" \s+ content="( [^"]* )" > \s*
+                       }x', $this->raw, $meta)) {
+                               $matchlen = array_sum(array_map('strlen', $meta[0]));
+                               $this->raw = substr($this->raw, $matchlen); # delete matched contents
+                               $this->meta = array_combine($meta[1], $meta[2]); # [property => content]
+                       }
+
                        @list ($this->preface, $this->title, $this->body) =
                                preg_split('{<h2>(.*?)</h2>\s*}', $this->raw, 2, PREG_SPLIT_DELIM_CAPTURE);
                }
@@ -36,7 +46,7 @@ class ArchiveArticle
 
        function safetitle()
        {
-               return trim(strip_tags($this->title));
+               return trim($this->meta['og:title'] ?? strip_tags($this->title));
        }
        function name()
        {
@@ -80,14 +90,12 @@ class ArchiveArticle
                }
                return $this->body;
        }
+
        function teaser()
        {
-               if (preg_match('{
-                       <meta \s+ name="description" [^>]* content="([^">]*)"
-               }x', $this->preface, $meta)) {
-                       # prefer specific page description if found (assume before title)
-                       #TODO: strip from body contents
-                       return $meta[1];
+               if ($override = @$this->meta['og:description']) {
+                       # prefer specific page description if found in metadata
+                       return $override;
                }
 
                if (preg_match('{
@@ -115,6 +123,11 @@ class ArchiveArticle
        }
        function image()
        {
+               if ($override = @$this->meta['og:image']) {
+                       # prefer specific page image if found in metadata
+                       return $override;
+               }
+
                if ( preg_match('/\bsrc="([^"]*)"/', $this->img, $src) ) {
                        return $src[1];
                }