Update ComicPress logic

This updates the logic to work across a variety of additional sites. Additionally, it grabs the author's comments from comics, such as the text on Buttersafe. This does not update the list of supported comics. There are too many comic sites to enumerate all of them anyway.
author: vjkcxl <349-vjkcxl@users.noreply.gitlab.tt-rss.org> 2025-09-01 14:06:10 -0500
committer: vjkcxl <349-vjkcxl@users.noreply.gitlab.tt-rss.org> 2025-09-01 14:06:10 -0500
commit: efe6fbd3fa97d4dcf22cdaa72561cc91421bcfe2 (patch)
tree: 340b8454c1aa4bfa38384819e2f554b9a2a60c45
parent: 98dbf4973380a525cbdeb542786838de10ad6d08 (diff)
1 files changed, 46 insertions, 39 deletions
diff --git a/plugins/af_comics/filters/af_comics_comicpress.php b/plugins/af_comics/filters/af_comics_comicpress.php
index 0afa19906..54ad515d7 100755
--- a/plugins/af_comics/filters/af_comics_comicpress.php
+++ b/plugins/af_comics/filters/af_comics_comicpress.php
@@ -9,59 +9,66 @@ class Af_Comics_ComicPress extends Af_ComicFilter {
 
 	function process(&$article) {
 		if (str_contains($article["guid"], "bunicomic.com") ||
-				str_contains($article["guid"], "buttersafe.com") ||
-				str_contains($article["guid"], "extrafabulouscomics.com") ||
-				str_contains($article["guid"], "danbydraws.com") ||
-				str_contains($article["guid"], "theduckwebcomics.com/Powerup_Comics") ||
-				str_contains($article["guid"], "happyjar.com") ||
-				str_contains($article["guid"], "nedroid.com") ||
-				str_contains($article["guid"], "stonetoss.com") ||
-				str_contains($article["guid"], "csectioncomics.com")) {
+			str_contains($article["guid"], "buttersafe.com") ||
+			str_contains($article["guid"], "extrafabulouscomics.com") ||
+			str_contains($article["guid"], "danbydraws.com") ||
+			str_contains($article["guid"], "theduckwebcomics.com/Powerup_Comics") ||
+			str_contains($article["guid"], "happyjar.com") ||
+			str_contains($article["guid"], "nedroid.com") ||
+			str_contains($article["guid"], "stonetoss.com") ||
+			str_contains($article["guid"], "csectioncomics.com")) { // only articles whose GUID contains one of these site strings are handled
 
-				// lol at people who block clients by user agent
-				// oh noes my ad revenue Q_Q
+			$res = UrlHelper::fetch(["url" => $article["link"]]); // fetch the page the article links to
 
-				$res = UrlHelper::fetch(["url" => $article["link"],
-					"useragent" => "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)"]);
+			$doc = new DOMDocument();
 
-				$doc = new DOMDocument();
+			if ($res && $doc->loadHTML($res)) { // continue only when the fetch returned something and it loaded as HTML
+				$xpath = new DOMXPath($doc);
+				$img = $xpath->query('//div[@id="comic"]')->item(0); // first <div id="comic"> in the page, if any
+				$text = $xpath->query('//div[@class="entry" or @class="entry-content"]')->item(0); // first <div> whose class attribute is exactly "entry" or "entry-content", if any
 
-				if ($res && $doc->loadHTML($res)) {
-					$xpath = new DOMXPath($doc);
-					$basenode = $xpath->query('//div[@id="comic"]|//img[contains(@class, "wp-post-image")]/..')->item(0);
+				if ($img || $text) { // replace the article content when at least one of the two nodes was found
+					$article["content"] = ''; // rebuilt below: comic markup first, then the entry text
 
-					if ($basenode) {
-						$article["content"] = $doc->saveHTML($basenode);
-						return true;
+					if ($img) {
+						$this->cleanup($xpath, $img); // clean the node in place before serialising it
+						$article["content"] .= $doc->saveHTML($img);
 					}
 
-					/** @var DOMElement|null $webtoon_link (buni specific) */
-					$webtoon_link = $xpath->query("//a[contains(@href,'www.webtoons.com')]")->item(0);
+					if ($text) {
+						$this->cleanup($xpath, $text); // same clean-up for the text block
+						$article["content"] .= $doc->saveHTML($text);
+					}
 
-					if ($webtoon_link) {
+					return true; // content was replaced
+				}
+			}
+		}
 
-						$res = UrlHelper::fetch(["url" => $webtoon_link->getAttribute("href"),
-							"useragent" => "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)"]);
+		return false; // GUID not matched, fetch/load failed, or neither node was found
+	}
 
-						if (@$doc->loadHTML($res)) {
-							$xpath = new DOMXPath($doc);
-							$basenode = $xpath->query('//div[@id="_viewerBox"]')->item(0);
+	private function cleanup($xpath, $content_node) { // normalise lazy-load image attributes and strip unwanted elements inside $content_node (mutates the DOM in place)
+		$toUpdates = $xpath->query('.//img[@data-src]', $content_node); // leading "." keeps the query relative to $content_node; a bare "//" would match the whole document
+		$this->move_all_attributes($toUpdates, 'data-src', 'src');
 
-							if ($basenode) {
-								$imgs = $xpath->query("//img[@data-url]", $basenode);
+		$toUpdates = $xpath->query('.//img[@data-srcset]', $content_node); // descendant images only
+		$this->move_all_attributes($toUpdates, 'data-srcset', 'srcset');
 
-								foreach ($imgs as $img) {
-									$img->setAttribute("src", $img->getAttribute("data-url"));
-								}
+		$toUpdates = $xpath->query('.//img[@data-sizes]', $content_node); // descendant images only
+		$this->move_all_attributes($toUpdates, 'data-sizes', 'sizes');
 
-								$article["content"] = $doc->saveHTML($basenode);
-								return true;
-							}
-						}
-					}
-				}
+		$toRemoves = $xpath->query('.//*[contains(@class, "sharedaddy") or contains(@class, "relatedposts") or contains(@class, "donation_table") or contains(@class, "above-comic") or contains(@class, "oli_")]', $content_node); // descendants whose class attribute contains any of these substrings
+		foreach ($toRemoves as $toRemove) { // detach each matched element from its parent
+			$toRemove->parentNode->removeChild($toRemove);
 		}
+	}
 
-		return false;
+	private function move_all_attributes($toUpdates, $srcName, $dstName) { // on every node in $toUpdates, move the value of attribute $srcName into attribute $dstName
+		foreach ($toUpdates as $toUpdate) {
+			$attributeValue = $toUpdate->getAttribute($srcName); // value to carry over
+			$toUpdate->setAttribute($dstName, $attributeValue); // sets $dstName, replacing any value it already had
+			$toUpdate->removeAttribute($srcName); // drop the source attribute afterwards
+		}
 	}
 }
author	vjkcxl <349-vjkcxl@users.noreply.gitlab.tt-rss.org>	2025-09-01 14:06:10 -0500
committer	vjkcxl <349-vjkcxl@users.noreply.gitlab.tt-rss.org>	2025-09-01 14:06:10 -0500
commit	efe6fbd3fa97d4dcf22cdaa72561cc91421bcfe2 (patch)
tree	340b8454c1aa4bfa38384819e2f554b9a2a60c45
parent	98dbf4973380a525cbdeb542786838de10ad6d08 (diff)