From 88484a246597407c9891a642cfbaed33f999c6d8 Mon Sep 17 00:00:00 2001
From: Steph Enders <steph@senders.io>
Date: Fri, 8 Mar 2024 07:50:33 -0500
Subject: Initial commit porting over gemparse.py into its own repo

This was originally a part of, and built specifically for, the
thewomaninmyeye-org capsule and site. It uses a very narrow subset of
semantic HTML. That said, with some more minor tweaks I can see this
being usable for senders-io as well as a generally sharable Python script.
---
 README.md        |  9 ++++++
 gemparse.py      | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |  1 +
 3 files changed, 102 insertions(+)
 create mode 100644 README.md
 create mode 100755 gemparse.py
 create mode 100644 requirements.txt

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..db6f678
--- /dev/null
+++ b/README.md
@@ -0,0 +1,9 @@
+# gemparse
+
+gemparse is a Python script that takes in semantic HTML and outputs gemtext.
+
+## usage
+
+`./gemparse.py source.html destination.gmi [http://example.com/source.html]`
+
+Optionally, pass the page's HTTP URL as a third argument to insert a link to its web version.
diff --git a/gemparse.py b/gemparse.py
new file mode 100755
index 0000000..52aae82
--- /dev/null
+++ b/gemparse.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+import sys
+import re
+from bs4 import BeautifulSoup
+
+def strip(line):
+    """Return line without surrounding whitespace; None is passed through."""
+    if line is not None:
+        return line.strip()
+    return None
+
+def handleNestedTag(tag, links, linkCnt, dest_f):
+    """Write tag's children to dest_f as one line; return next link number.
+
+    External <a href> targets are stored in links and cited inline as [n]."""
+    for child in tag.contents:
+        href = child.get("href") if child.name == "a" else None
+        if child.name == "a" and href is None:
+            continue  # an <a> without href is an anchor target, not a link
+        # .string is None when a tag has several children; use its full text.
+        text = strip(child.get_text() if child.string is None else child.string)
+        if href is not None and not href.startswith("#"):
+            while linkCnt in links:  # stale caller counter: never overwrite
+                linkCnt += 1
+            text = "{}[{}]".format(text, linkCnt)
+            links[linkCnt] = href
+            linkCnt += 1
+        newline_only = child.name != "a" and child.string == "\n"
+        print(text, end="" if newline_only else " ", file=dest_f)
+    print("", file=dest_f)
+    return linkCnt
+
+# Usage: gemparse.py source.html destination.gmi [http_url]
+source = sys.argv[1]
+dest = sys.argv[2]
+# The URL is optional; reading argv[3] unconditionally raised IndexError.
+http_url = sys.argv[3] if len(sys.argv) > 3 else None
+
+with open(source, "r") as source_f:  # closed as soon as parsing is done
+    html = BeautifulSoup(source_f, 'html.parser')
+
+# A missing <article> or <h2> yields heading None instead of AttributeError.
+article = html.article
+header = article.header if article is not None else None
+h2 = header.h2 if header is not None else getattr(article, "h2", None)
+intro = header.p if header is not None else None
+heading = strip(h2.string) if h2 is not None else None
+subheading = strip(intro.string) if intro is not None else None
+if heading is None:
+    print("No heading found", file=sys.stderr)
+    sys.exit(1)
+
+links = dict()  # link number -> href; len(links) is the next unused number
+# Opened only after validation so a failed run leaves no empty output file.
+with open(dest, "w") as dest_f:
+    print("# {}".format(heading), file=dest_f)
+    if subheading is not None:
+        print("\n{}".format(subheading), file=dest_f)
+
+    # Pass len(links) as the counter: a plain int would restart at 0 each call.
+    for tag in article.children:
+        if tag.name == "p":
+            if len(tag.contents) == 1 and tag.contents[0].name != "a":
+                # it's just text
+                print("\n{}".format(strip(tag.string)), file=dest_f)
+            else:
+                # mixed content, or a lone link whose target must be kept
+                print("\n", end="", file=dest_f)
+                handleNestedTag(tag, links, len(links), dest_f)
+        elif tag.name == "ul" or tag.name == "ol":
+            for child in tag.contents:
+                if child.name == "li":
+                    print("*", end=" ", file=dest_f)
+                    handleNestedTag(child, links, len(links), dest_f)
+        elif tag.name == "figure":
+            if tag.blockquote is not None:
+                print(">", end=" ", file=dest_f)
+                handleNestedTag(tag.blockquote, links, len(links), dest_f)
+        elif tag.name == "h3":
+            print("\n## {}".format(strip(tag.string)), file=dest_f)
+        elif tag.name == "h4":
+            print("\n### {}".format(strip(tag.string)), file=dest_f)
+
+    print("\n## Links\n", file=dest_f)
+    for link_n, link in links.items():
+        print("=> {} [{}] {}".format(link, link_n, link), file=dest_f)
+
+    if http_url is not None:
+        print("=> {} view on http".format(http_url), file=dest_f)
+
+    print("=> /log/ back", file=dest_f)
+    print("=> / capsule", file=dest_f)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..1e42172
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+beautifulsoup4 >= 4.1
-- 
cgit v1.2.3-54-g00ecf