1 files changed, 68 insertions, 0 deletions
diff --git a/gemparse.py b/gemparse.py
new file mode 100755
index 0000000..28ac91a
--- /dev/null
+++ b/gemparse.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+import sys
+import re
+from bs4 import BeautifulSoup
+
+def strip(string):
+    """Return *string* with leading whitespace removed from each line."""
+    # Only leading whitespace goes; every "\n" separator is kept.
+    pieces = (piece.lstrip() for piece in string.split("\n"))
+    return "\n".join(pieces)
+
+source = sys.argv[1]
+dest = sys.argv[2]
+http_url = sys.argv[3]
+
+def _text(node):
+    """Return node's text, left-stripped per line; unlike .string, never None."""
+    return strip(node.get_text() if node.name else str(node))
+
+# Parse the input up front and close it as soon as the tree is built.
+with open(source, "r") as source_f:
+    html = BeautifulSoup(source_f, 'html.parser')
+
+# The title is the <h2> inside <article><header>, or inside a header-less <article>.
+article = html.article
+header = article.header if article is not None else None
+scope = header if header is not None else article
+title = scope.h2 if scope is not None else None
+if title is None:
+    # Checked before dest is opened, so a failed run does not truncate it.
+    print("No heading found", file=sys.stderr)
+    sys.exit(1)
+heading = _text(title)
+subheading = _text(header.p) if header is not None and header.p is not None else None
+
+links = dict()
+linkCnt = 0
+with open(dest, "w") as dest_f:
+    print("# {}\n".format(heading), file=dest_f)
+    if subheading is not None:
+        print(subheading, file=dest_f)
+    for tag in article.children:
+        if tag.name != "p":
+            continue
+        if len(tag.contents) == 1 and tag.contents[0].name != "a":
+            # a single non-link child: emit its text as one line
+            print(_text(tag), file=dest_f)
+            continue
+        # mixed content: number each link inline and remember its target
+        for child in tag.contents:
+            if child.name == "a":
+                print("{}[{}]".format(_text(child), linkCnt), end=" ", file=dest_f)
+                links[linkCnt] = child.get("href")
+                linkCnt += 1
+            elif child.string == "\n":
+                print(child.string, end="", file=dest_f)
+            else:
+                print(_text(child), end=" ", file=dest_f)
+        if len(tag.contents) == 1:
+            print(file=dest_f)  # a lone link still ends its line
+
+    print("\n## Links\n", file=dest_f)
+    for link_n, link in links.items():
+        print("=> {} [{}] {}".format(link, link_n, link), file=dest_f)
+    print("=> https://thewomaninmyeye.org/{} view on http".format(http_url), file=dest_f)
+    print("=> /log/ back", file=dest_f)
+    print("=> / capsule", file=dest_f)
+