diff options
-rw-r--r-- | README.md | 9 | ||||
-rwxr-xr-x | gemparse.py | 92 | ||||
-rw-r--r-- | requirements.txt | 1 |
3 files changed, 102 insertions, 0 deletions
#!/usr/bin/env python3
"""gemparse: take in a semantic HTML article and output gemtext.

Usage:
    ./gemparse.py source.html destination.gmi [http://example.com/source.html]

Optionally passing in the http url will insert a link to it on the web if
desired.

Requires beautifulsoup4 >= 4.1 (see requirements.txt).
"""
import sys
import re


def strip(line):
    """Return *line* with surrounding whitespace removed; pass None through."""
    if line is None:
        return line
    return line.strip()


def _text(node):
    """Return the text of a parsed node.

    ``.string`` is None for a tag that has several children (for example
    ``<em>a <b>b</b></em>``); fall back to the concatenated text in that case
    so the literal word "None" is never written to the output.
    """
    text = node.string
    if text is None:
        text = node.get_text()
    return text


def handleNestedTag(tag, links, linkCnt, dest_f):
    """Write the children of *tag* to *dest_f* as a single gemtext line.

    Anchors with an external ``href`` are written as ``text[n]`` and recorded
    in *links* under ``n``; anchors pointing at a fragment (``#...``) are
    written as plain text; anchors without ``href`` are skipped.

    Args:
        tag: parsed element whose ``contents`` are rendered.
        links: dict mapping link number -> href; updated in place.
        linkCnt: number to assign to the first new link found.
        dest_f: writable text file object.

    Returns:
        The next unused link number. The counter is an int and therefore not
        shared with the caller, so the caller must carry this value into the
        next call to keep link numbers unique across the document.
    """
    for child in tag.contents:
        if child.name == "a":
            href = child.get("href")
            if href is None:
                # Not a href link, so it is likely a named anchor: skip it.
                continue
            label = strip(_text(child))
            if href.startswith("#"):
                print(label, end=" ", file=dest_f)
            else:
                print("{}[{}]".format(label, linkCnt), end=" ", file=dest_f)
                links[linkCnt] = href
                linkCnt += 1
        else:
            text = _text(child)
            # A bare newline between elements must not add a stray space.
            separator = "" if text == "\n" else " "
            print(strip(text), end=separator, file=dest_f)
    print("", file=dest_f)
    return linkCnt


def _find_headings(article):
    """Return ``(heading, subheading)`` for *article*; either may be None."""
    header = article.header
    scope = article if header is None else header
    title = scope.h2
    heading = None if title is None else strip(_text(title))
    subheading = None
    if header is not None and header.p is not None:
        subheading = strip(_text(header.p))
    return heading, subheading


def _write_gemtext(article, heading, subheading, http_url, dest_f):
    """Write the gemtext rendering of *article* to *dest_f*."""
    print("# {}".format(heading), file=dest_f)
    if subheading is not None:
        print("\n{}".format(subheading), file=dest_f)

    links = {}
    link_cnt = 0

    for tag in article.children:
        if tag.name == "p":
            only_text = len(tag.contents) == 1 and tag.contents[0].name is None
            if only_text:
                print("\n{}".format(strip(_text(tag))), file=dest_f)
            else:
                # Includes a paragraph whose single child is a link, which
                # must go through handleNestedTag so the link is recorded.
                print("\n", end="", file=dest_f)
                link_cnt = handleNestedTag(tag, links, link_cnt, dest_f)
        elif tag.name in ("ul", "ol"):
            for child in tag.contents:
                if child.name == "li":
                    print("*", end=" ", file=dest_f)
                    link_cnt = handleNestedTag(child, links, link_cnt, dest_f)
        elif tag.name == "figure":
            if tag.blockquote is not None:
                print(">", end=" ", file=dest_f)
                link_cnt = handleNestedTag(
                    tag.blockquote, links, link_cnt, dest_f
                )
        elif tag.name == "h3":
            print("\n## {}".format(strip(_text(tag))), file=dest_f)
        elif tag.name == "h4":
            print("\n### {}".format(strip(_text(tag))), file=dest_f)

    print("\n## Links\n", file=dest_f)
    for link_n, link in links.items():
        print("=> {} [{}] {}".format(link, link_n, link), file=dest_f)

    if http_url is not None:
        print("=> {} view on http".format(http_url), file=dest_f)

    print("=> /log/ back", file=dest_f)
    print("=> / capsule", file=dest_f)


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: argument vector including the program name; defaults to
            ``sys.argv``.

    Exits with status 2 on bad usage and 1 when the source has no
    ``<article>`` or no heading.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        print(
            "usage: {} source.html destination.gmi [http_url]".format(argv[0]),
            file=sys.stderr,
        )
        sys.exit(2)

    source = argv[1]
    dest = argv[2]
    # The URL is optional, so it must not be read unconditionally.
    http_url = argv[3] if len(argv) > 3 else None

    # Imported here so the helper functions above stay importable without
    # the third-party dependency installed.
    from bs4 import BeautifulSoup

    with open(source, "r") as source_f:
        html = BeautifulSoup(source_f, "html.parser")

    article = html.article
    if article is None:
        print("No article found", file=sys.stderr)
        sys.exit(1)

    heading, subheading = _find_headings(article)
    if heading is None:
        print("No heading found", file=sys.stderr)
        sys.exit(1)

    # Open the destination only after validation so that a failed run does
    # not truncate an existing file.
    with open(dest, "w") as dest_f:
        _write_gemtext(article, heading, subheading, http_url, dest_f)


if __name__ == "__main__":
    main()