diff options
Diffstat (limited to 'gemparse.py')
-rwxr-xr-x | gemparse.py | 92 |
1 files changed, 92 insertions, 0 deletions
#!/usr/bin/env python3
"""Convert the <article> element of an HTML page into a gemtext-style file.

Usage: gemparse.py SOURCE_HTML DEST_FILE [HTTP_URL]

The article heading (and optional sub-heading) is written first, followed by
paragraphs, list items, block quotes and h3/h4 headings.  External links are
replaced inline by numbered markers such as ``text[3]`` and listed in a
"Links" section at the end of the output.
"""
import sys
import re

from bs4 import BeautifulSoup


def strip(line):
    """Return *line* without surrounding whitespace; ``None`` passes through."""
    if line is None:
        return line
    return line.strip()


def _node_text(node):
    """Return the stripped text content of a parsed node.

    ``node.string`` is ``None`` when a tag has several children (or none), and
    printing that would put the literal text "None" into the output, so the
    concatenated text of all descendants is used in that case instead.
    """
    text = node.string
    if text is None:
        text = node.get_text()
    return strip(text)


def handleNestedTag(tag, links, linkCnt, dest_f):
    """Write the children of *tag* on one output line, numbering its links.

    Args:
        tag: parsed element whose direct children are rendered.
        links: dict mapping link number -> href; updated in place with every
            non-fragment link found.
        linkCnt: number to assign to the next link found.
        dest_f: writable text file the line is printed to.

    Returns:
        The next unused link number.  Integers are passed by value, so the
        caller must rebind its counter to this result; otherwise numbering
        restarts on every call and earlier ``links`` entries get overwritten.
    """
    for child in tag.contents:
        if child.name == "a":
            href = child.get("href")
            if href is None:
                # No href: most likely a named anchor, nothing to render.
                continue
            if href.startswith("#"):
                # In-page fragment link: keep the text, drop the target.
                print(_node_text(child), end=" ", file=dest_f)
            else:
                print("{}[{}]".format(_node_text(child), linkCnt), end=" ", file=dest_f)
                links[linkCnt] = href
                linkCnt += 1
        elif child.string == "\n":
            # A bare newline between elements contributes no output.
            continue
        else:
            print(_node_text(child), end=" ", file=dest_f)
    print("", file=dest_f)
    return linkCnt


def main(argv):
    """Run the conversion described by *argv* and return a process exit code.

    Args:
        argv: ``[program, source_html, dest_file]`` with an optional fourth
            element holding the HTTP URL of the original page.

    Returns:
        0 on success, 1 when the page has no article or heading, 2 on a
        usage error.
    """
    if len(argv) < 3:
        print("usage: {} SOURCE DEST [HTTP_URL]".format(argv[0]), file=sys.stderr)
        return 2
    source = argv[1]
    dest = argv[2]
    http_url = argv[3] if len(argv) > 3 else None

    # BeautifulSoup consumes the whole file in its constructor, so the source
    # can be closed as soon as parsing is done.
    with open(source, "r") as source_f:
        html = BeautifulSoup(source_f, 'html.parser')

    article = html.article
    if article is None:
        print("No article found", file=sys.stderr)
        return 1

    header = article.header
    heading_tag = header.h2 if header is not None else article.h2
    heading = _node_text(heading_tag) if heading_tag is not None else None
    subheading = None
    if header is not None and header.p is not None:
        subheading = _node_text(header.p)

    if not heading:
        print("No heading found", file=sys.stderr)
        return 1

    links = {}
    link_count = 0

    # The destination is opened only after validation so that a page without
    # a heading does not truncate an existing output file.
    with open(dest, "w") as dest_f:
        print("# {}".format(heading), file=dest_f)
        if subheading:
            print("\n{}".format(subheading), file=dest_f)

        for tag in article.children:
            if tag.name == "p":
                if len(tag.contents) == 1:
                    # A single child is rendered as plain text.
                    print("\n{}".format(_node_text(tag)), file=dest_f)
                else:
                    print("\n", end="", file=dest_f)
                    link_count = handleNestedTag(tag, links, link_count, dest_f)
            elif tag.name in ("ul", "ol"):
                for child in tag.contents:
                    if child.name == "li":
                        print("*", end=" ", file=dest_f)
                        link_count = handleNestedTag(child, links, link_count, dest_f)
            elif tag.name == "figure":
                if tag.blockquote is not None:
                    print(">", end=" ", file=dest_f)
                    link_count = handleNestedTag(tag.blockquote, links, link_count, dest_f)
            elif tag.name == "h3":
                print("\n## {}".format(_node_text(tag)), file=dest_f)
            elif tag.name == "h4":
                print("\n### {}".format(_node_text(tag)), file=dest_f)

        print("\n## Links\n", file=dest_f)
        for link_n, link in links.items():
            print("=> {} [{}] {}".format(link, link_n, link), file=dest_f)

        if http_url:
            print("=> {} view on http".format(http_url), file=dest_f)

        print("=> /log/ back", file=dest_f)
        print("=> / capsule", file=dest_f)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))