summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSteph Enders <steph@senders.io>2024-03-08 07:50:33 -0500
committerSteph Enders <steph@senders.io>2024-03-08 07:50:33 -0500
commit88484a246597407c9891a642cfbaed33f999c6d8 (patch)
tree858114b539cb5811e375f55409b0fbb0d8638837
Initial commit porting over gemparse.py into its own repomain
This was originally a part of, and built specifically for, the thewomaninmyeye-org capsule and site. It uses a very narrow subset of semantic HTML. That said, with some more minor tweaks I can see this being usable for senders-io as well as a generally shareable Python script.
-rw-r--r--README.md9
-rwxr-xr-xgemparse.py92
-rw-r--r--requirements.txt1
3 files changed, 102 insertions, 0 deletions
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..db6f678
--- /dev/null
+++ b/README.md
@@ -0,0 +1,9 @@
+# gemparse
+
+gemparse is a Python script that takes in semantic HTML and outputs gemtext.
+
+## usage
+
+`./gemparse.py source.html destination.gmi [http://example.com/source.html]`
+
+Optionally, pass the HTTP URL of the source page as a third argument to append a link to its web version.
diff --git a/gemparse.py b/gemparse.py
new file mode 100755
index 0000000..52aae82
--- /dev/null
+++ b/gemparse.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+import sys
+import re
+from bs4 import BeautifulSoup
+
+def strip(line):
+    """Return ``line`` with surrounding whitespace removed.
+
+    ``None`` is passed through unchanged so callers can hand over a
+    possibly-missing string without checking it first.
+    """
+    return None if line is None else line.strip()
+
+def handleNestedTag(tag, links, linkCnt, dest_f):
+    """Write the inline children of ``tag`` to ``dest_f`` as one line of text.
+
+    Each child's text is written stripped and followed by a single space; a
+    child that is exactly a newline contributes nothing.  ``<a>`` children are
+    handled specially:
+
+    * ``href`` starting with ``#`` (in-page link): only the text is written;
+    * any other ``href``: the text is written as ``text[n]`` and the target is
+      stored in ``links[n]``;
+    * no ``href`` at all: the element is skipped.
+
+    The line is terminated with a newline once all children are written.
+
+    Args:
+        tag: parsed element whose ``contents`` are rendered.
+        links: dict mapping link number to href; updated in place.
+        linkCnt: first link number to try.  Numbers already present in
+            ``links`` are skipped so an earlier entry is never overwritten,
+            even when the caller passes a stale counter.
+        dest_f: writable text file object receiving the output.
+
+    Returns:
+        The next unused link number.  Callers that ignore the return value
+        keep working because of the collision check described above.
+    """
+    for child in tag.contents:
+        text = child.string
+        if text is None:
+            # ``.string`` is None when an element has several children (or
+            # none); fall back to its combined text instead of printing the
+            # literal word "None".
+            text = child.get_text()
+        text = strip(text)
+
+        if child.name == "a":
+            href = child.get("href")
+            if href is None:
+                # no href: most likely a named anchor, nothing to render
+                continue
+            if href.startswith("#"):
+                print(text, end=" ", file=dest_f)
+            else:
+                # ints are passed by value, so the caller's counter may lag
+                # behind; never reuse a number that is already taken
+                while linkCnt in links:
+                    linkCnt += 1
+                print("{}[{}]".format(text, linkCnt), end=" ", file=dest_f)
+                links[linkCnt] = href
+                linkCnt += 1
+        elif child.string == "\n":
+            # a bare newline between elements: no text and no separator
+            print(text, end="", file=dest_f)
+        else:
+            print(text, end=" ", file=dest_f)
+    print("", file=dest_f)
+    return linkCnt
+
+# Command line: gemparse.py SOURCE DEST [HTTP_URL]
+if len(sys.argv) < 3:
+    print("usage: {} source.html destination.gmi [http_url]".format(sys.argv[0]),
+          file=sys.stderr)
+    sys.exit(2)
+
+source = sys.argv[1]
+dest = sys.argv[2]
+# The URL is optional; reading argv[3] unconditionally raised IndexError
+# whenever it was left out.
+http_url = sys.argv[3] if len(sys.argv) > 3 else None
+
+# Parse the whole input up front so the source file is closed promptly.
+with open(source, "r") as source_f:
+    html = BeautifulSoup(source_f, 'html.parser')
+
+article = html.article
+if article is None:
+    print("No article found", file=sys.stderr)
+    sys.exit(1)
+
+# The title is the <h2> inside <article><header> when a header exists,
+# otherwise the first <h2> of the article.  An optional <p> in the header
+# is used as the subheading.
+heading = None
+subheading = None
+header = article.header
+title_parent = header if header is not None else article
+if title_parent.h2 is not None:
+    heading = strip(title_parent.h2.string)
+if header is not None and header.p is not None:
+    subheading = strip(header.p.string)
+
+# Validate before opening the destination so a bad input does not leave a
+# truncated output file behind.
+if heading is None:
+    print("No heading found", file=sys.stderr)
+    sys.exit(1)
+
+# link number -> href, filled in by handleNestedTag
+links = dict()
+
+with open(dest, "w") as dest_f:
+    print("# {}".format(heading), file=dest_f)
+    if subheading is not None:
+        print("\n{}".format(subheading), file=dest_f)
+
+    for tag in article.children:
+        if tag.name == "p":
+            if len(tag.contents) == 1 and tag.string is not None:
+                # it's just text
+                print("\n{}".format(strip(tag.string)), file=dest_f)
+            else:
+                print("\n", end="", file=dest_f)
+                # len(links) is always the next free link number; passing a
+                # counter that never advanced made links overwrite each other
+                handleNestedTag(tag, links, len(links), dest_f)
+        elif tag.name == "ul" or tag.name == "ol":
+            for child in tag.contents:
+                if child.name == "li":
+                    print("*", end=" ", file=dest_f)
+                    handleNestedTag(child, links, len(links), dest_f)
+        elif tag.name == "figure":
+            if tag.blockquote is not None:
+                print(">", end=" ", file=dest_f)
+                handleNestedTag(tag.blockquote, links, len(links), dest_f)
+        elif tag.name == "h3" or tag.name == "h4":
+            # .string is None for headings with mixed children; use the
+            # combined text rather than printing "None"
+            text = tag.string if tag.string is not None else tag.get_text()
+            prefix = "##" if tag.name == "h3" else "###"
+            print("\n{} {}".format(prefix, strip(text)), file=dest_f)
+
+    print("\n## Links\n", file=dest_f)
+    for link_n, link in links.items():
+        print("=> {} [{}] {}".format(link, link_n, link), file=dest_f)
+
+    if http_url is not None:
+        print("=> {} view on http".format(http_url), file=dest_f)
+
+    print("=> /log/ back", file=dest_f)
+    print("=> / capsule", file=dest_f)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..1e42172
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+beautifulsoup4 >= 4.1