summary | refs | log | tree | commit | diff
path: root/gemparse.py
diff options
context:
space:
mode:
Diffstat (limited to 'gemparse.py')
-rwxr-xr-x gemparse.py 68
1 files changed, 68 insertions, 0 deletions
diff --git a/gemparse.py b/gemparse.py
new file mode 100755
index 0000000..28ac91a
--- /dev/null
+++ b/gemparse.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+import sys
+import re
+from bs4 import BeautifulSoup
+
+def strip(string):
+    """Return string with leading whitespace removed from every line."""
+    lines = string.split("\n")
+    stripped = (line.lstrip() for line in lines)
+    return "\n".join(stripped)
+
+# Usage: gemparse.py SOURCE DEST HTTP_URL
+# Writes the <article> of the HTML file SOURCE to DEST as gemtext.
+if len(sys.argv) < 4:
+    sys.exit("usage: gemparse.py SOURCE DEST HTTP_URL")
+source, dest, http_url = sys.argv[1:4]
+
+def _text_of(node):
+    """Return node's text passed through strip(), even for nested markup."""
+    raw = node.string  # None when a tag has no children or several
+    return strip(raw if raw is not None else node.get_text())
+
+with open(source, "r") as source_f:
+    html = BeautifulSoup(source_f, 'html.parser')
+
+article = html.article
+header = article.header if article is not None else None
+# Heading: <h2> of the article's <header>, else of the article (if any).
+h2 = header.h2 if header is not None else getattr(article, "h2", None)
+if h2 is None:
+    sys.exit("No heading found")  # message on stderr, exit status 1
+heading = _text_of(h2)
+sub_p = header.p if header is not None else None
+subheading = _text_of(sub_p) if sub_p is not None else None
+
+# Open the output only after validation so a failed run leaves DEST intact.
+with open(dest, "w") as dest_f:
+    print("# {}\n".format(heading), file=dest_f)
+    if subheading is not None:
+        print(subheading, file=dest_f)
+
+    links = []  # hrefs in order of appearance; the index is the [n] marker
+    for tag in article.children:
+        if tag.name != "p":
+            continue
+        if len(tag.contents) == 1:
+            # a single child: emit the paragraph as one line of text
+            print(_text_of(tag), file=dest_f)
+            continue
+        # several children: number each link inline, remember its target
+        for child in tag.contents:
+            if child.name == "a":
+                label = "{}[{}]".format(_text_of(child), len(links))
+                print(label, end=" ", file=dest_f)
+                links.append(child.get("href"))
+            elif child.string == "\n":
+                print(child.string, end="", file=dest_f)
+            else:
+                print(_text_of(child), end=" ", file=dest_f)
+
+    print("\n## Links\n", file=dest_f)
+    for link_n, link in enumerate(links):
+        print("=> {} [{}] {}".format(link, link_n, link), file=dest_f)
+
+    print("=> https://thewomaninmyeye.org/{} view on http".format(http_url), file=dest_f)
+    print("=> /log/ back", file=dest_f)
+    print("=> / capsule", file=dest_f)
+