From 88484a246597407c9891a642cfbaed33f999c6d8 Mon Sep 17 00:00:00 2001 From: Steph Enders Date: Fri, 8 Mar 2024 07:50:33 -0500 Subject: Initial commit porting over gemparse.py into its own repo This was originally a part of, and built specifically for, thewomaninmyeye-org capsule and site. It uses a very narrow subset of semantic html. That said, with some more minor tweaks I can see this being usable for senders-io as well as a generally sharable python script. --- README.md | 9 ++++++ gemparse.py | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 1 + 3 files changed, 102 insertions(+) create mode 100644 README.md create mode 100755 gemparse.py create mode 100644 requirements.txt diff --git a/README.md b/README.md new file mode 100644 index 0000000..db6f678 --- /dev/null +++ b/README.md @@ -0,0 +1,9 @@ +# gemparse + +gemparse is a python script that takes in semantic HTML and outputs gemtext. + +## usage + +`./gemparse.py source.html destination.gmi [http://example.com/source.html]` + +Optionally passing in the http url will insert a link to it on the web if desired. 
#!/usr/bin/env python3
"""Convert a narrow subset of semantic HTML into gemtext.

Usage:
    ./gemparse.py source.html destination.gmi [http://example.com/source.html]

The optional third argument is the web URL of the same page; when it is
given, a "view on http" link to it is appended to the output.

Requires beautifulsoup4 >= 4.1.
"""
import sys
import re


def strip(line):
    """Return *line* without surrounding whitespace; pass ``None`` through."""
    return line if line is None else line.strip()


def _node_text(node):
    """Return the stripped text of a parsed node, never ``None``.

    ``node.string`` is ``None`` when a tag does not have exactly one string
    to offer (several children, or none at all).  Formatting ``None`` would
    write the literal text "None" into the gemtext, so in that case the
    pieces yielded by ``node.strings`` are joined instead.
    """
    text = node.string
    if text is None:
        text = "".join(node.strings)
    return text.strip()


def handleNestedTag(tag, links, linkCnt, dest_f):
    """Write the inline children of *tag* to *dest_f* as one gemtext line.

    Each child's text is written followed by a single space.  ``<a>``
    children with an ``href`` that does not start with "#" are written as
    ``text[n]`` and recorded in *links* under the number ``n``; fragment
    links ("#...") keep only their text; ``<a>`` children without an
    ``href`` write nothing.

    Args:
        tag: parsed element whose ``contents`` are rendered.
        links: dict mapping link number -> href; updated in place.
        linkCnt: number to assign to the next recorded link.
        dest_f: writable text file object receiving the output.

    Returns:
        The next unused link number.  Callers must carry it into the next
        call; otherwise numbering restarts and earlier entries in *links*
        are overwritten.
    """
    for child in tag.contents:
        if child.name == "a":
            href = child.get("href")
            if href is None:
                # No href: most likely a bare anchor target, nothing to emit.
                continue
            if href.startswith("#"):
                print(_node_text(child), end=" ", file=dest_f)
            else:
                print("{}[{}]".format(_node_text(child), linkCnt),
                      end=" ", file=dest_f)
                links[linkCnt] = href
                linkCnt += 1
        elif child.string == "\n":
            # A lone newline between inline elements contributes no text.
            continue
        else:
            print(_node_text(child), end=" ", file=dest_f)
    print("", file=dest_f)
    return linkCnt


def _find_headings(article):
    """Return ``(heading, subheading)`` for *article*; either may be None.

    The heading is the ``<h2>`` inside ``<header>`` when a header exists,
    otherwise the article's own ``<h2>``.  The subheading is the header's
    ``<p>``, when present.
    """
    header = article.header
    if header is not None:
        title = header.h2
        heading = strip(title.string) if title is not None else None
        lead = header.p
        subheading = strip(lead.string) if lead is not None else None
        return heading, subheading
    title = article.h2
    return (strip(title.string) if title is not None else None), None


def _write_gemtext(article, heading, subheading, http_url, dest_f):
    """Write the gemtext rendering of *article* to *dest_f*."""
    print("# {}".format(heading), file=dest_f)
    if subheading is not None:
        print("\n{}".format(subheading), file=dest_f)

    links = {}
    link_count = 0

    for tag in article.children:
        if tag.name == "p":
            only_child = tag.contents[0] if len(tag.contents) == 1 else None
            if only_child is not None and only_child.name is None:
                # The paragraph is a single run of plain text.
                print("\n{}".format(strip(tag.string)), file=dest_f)
            else:
                # Anything else (including a lone <a>) goes through the
                # inline renderer so that links are not lost.
                print("\n", end="", file=dest_f)
                link_count = handleNestedTag(tag, links, link_count, dest_f)
        elif tag.name in ("ul", "ol"):
            for item in tag.contents:
                if item.name == "li":
                    print("*", end=" ", file=dest_f)
                    link_count = handleNestedTag(
                        item, links, link_count, dest_f)
        elif tag.name == "figure":
            if tag.blockquote is not None:
                print(">", end=" ", file=dest_f)
                link_count = handleNestedTag(
                    tag.blockquote, links, link_count, dest_f)
        elif tag.name == "h3":
            print("\n## {}".format(_node_text(tag)), file=dest_f)
        elif tag.name == "h4":
            print("\n### {}".format(_node_text(tag)), file=dest_f)

    print("\n## Links\n", file=dest_f)
    for link_n, link in links.items():
        print("=> {} [{}] {}".format(link, link_n, link), file=dest_f)

    if http_url is not None:
        print("=> {} view on http".format(http_url), file=dest_f)

    print("=> /log/ back", file=dest_f)
    print("=> / capsule", file=dest_f)


def main(argv):
    """Run the converter for the command line *argv*; return the exit status.

    Returns 0 on success, 1 when the document has no ``<article>`` or no
    heading, and 2 when the required arguments are missing.
    """
    if len(argv) < 3:
        prog = argv[0] if argv else "gemparse.py"
        print("usage: {} source.html destination.gmi [http_url]".format(prog),
              file=sys.stderr)
        return 2

    source, dest = argv[1], argv[2]
    # The web URL is optional, so it must not be read unconditionally.
    http_url = argv[3] if len(argv) > 3 else None

    # Imported here so the helpers above stay importable (and testable)
    # without the third-party dependency installed.
    from bs4 import BeautifulSoup

    with open(source, "r") as source_f:
        html = BeautifulSoup(source_f, 'html.parser')

    article = html.article
    if article is None:
        print("No article found", file=sys.stderr)
        return 1

    heading, subheading = _find_headings(article)
    if heading is None:
        print("No heading found", file=sys.stderr)
        return 1

    # Open the destination only once the input is known to be usable, so a
    # failed run does not truncate an existing file.
    with open(dest, "w") as dest_f:
        _write_gemtext(article, heading, subheading, http_url, dest_f)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))