From 304d3a0b88052d16aca0230f67532acd6822e482 Mon Sep 17 00:00:00 2001
From: Andrew Dolgov
Date: Wed, 20 Nov 2019 18:56:34 +0300
Subject: tag-related fixes

1. move tag sanitization to feedparser common item class
2. enforce length limit on tags when parsing
3. support multiple tags passed via one dc:subject and other such elements, parse them as a comma-separated list
4. sort resulting tag list to prevent different order between feed updates
5. remove some duplicate code related to tag validation
6. allow + symbol in tags
---
 classes/feeditem/atom.php   |  8 ++++----
 classes/feeditem/common.php | 31 +++++++++++++++++++++++++++++++
 classes/feeditem/rss.php    |  8 ++++----
 3 files changed, 39 insertions(+), 8 deletions(-)

(limited to 'classes/feeditem')

diff --git a/classes/feeditem/atom.php b/classes/feeditem/atom.php
index a962b59f2..a03080981 100755
--- a/classes/feeditem/atom.php
+++ b/classes/feeditem/atom.php
@@ -103,20 +103,20 @@ class FeedItem_Atom extends FeedItem_Common {
 
 	function get_categories() {
 		$categories = $this->elem->getElementsByTagName("category");
-		$cats = array();
+		$cats = [];
 
 		foreach ($categories as $cat) {
 			if ($cat->hasAttribute("term"))
-				array_push($cats, trim($cat->getAttribute("term")));
+				array_push($cats, $cat->getAttribute("term"));
 		}
 
 		$categories = $this->xpath->query("dc:subject", $this->elem);
 
 		foreach ($categories as $cat) {
-			array_push($cats, clean(trim($cat->nodeValue)));
+			array_push($cats, $cat->nodeValue);
 		}
 
-		return $cats;
+		return $this->normalize_categories($cats);
 	}
 
 	function get_enclosures() {
diff --git a/classes/feeditem/common.php b/classes/feeditem/common.php
index 3193ed273..f208f4a48 100755
--- a/classes/feeditem/common.php
+++ b/classes/feeditem/common.php
@@ -162,4 +162,35 @@ abstract class FeedItem_Common extends FeedItem {
 		}
 	}
 
+	static function normalize_categories($cats) {
+
+		$tmp = [];
+
+		foreach ($cats as $rawcat) {
+			$tmp = array_merge($tmp, explode(",", $rawcat));
+		}
+
+		$tmp = array_map(function($srccat) {
+			$cat = clean(trim(mb_strtolower($srccat)));
+
+			// we don't support numeric tags
+			if (is_numeric($cat))
+				$cat = 't:' . $cat;
+
+			$cat = preg_replace('/[,\'\"]/', "", $cat);
+
+			if (DB_TYPE == "mysql") {
+				$cat = preg_replace('/[\x{10000}-\x{10FFFF}]/u', "\xEF\xBF\xBD", $cat);
+			}
+
+			if (mb_strlen($cat) > 250)
+				$cat = mb_substr($cat, 0, 250);
+
+			return $cat;
+		}, $tmp);
+
+		asort($tmp);
+
+		return array_unique($tmp);
+	}
 }
diff --git a/classes/feeditem/rss.php b/classes/feeditem/rss.php
index 916c73ec4..1f7953c51 100755
--- a/classes/feeditem/rss.php
+++ b/classes/feeditem/rss.php
@@ -97,19 +97,19 @@ class FeedItem_RSS extends FeedItem_Common {
 
 	function get_categories() {
 		$categories = $this->elem->getElementsByTagName("category");
-		$cats = array();
+		$cats = [];
 
 		foreach ($categories as $cat) {
-			array_push($cats, trim($cat->nodeValue));
+			array_push($cats, $cat->nodeValue);
 		}
 
 		$categories = $this->xpath->query("dc:subject", $this->elem);
 
 		foreach ($categories as $cat) {
-			array_push($cats, clean(trim($cat->nodeValue)));
+			array_push($cats, $cat->nodeValue);
 		}
 
-		return $cats;
+		return $this->normalize_categories($cats);
 	}
 
 	function get_enclosures() {
-- 
cgit v1.2.3-54-g00ecf