#!/usr/bin/env python3
"""Convert the <article> of an HTML page into a Gemini text (gemtext) file.

Usage: script SOURCE DEST [HTTP_URL]

SOURCE is the HTML file to read and DEST is the gemtext file to write.
When HTTP_URL is given, a "view on http" link pointing at it is appended.
External links found in paragraphs, lists and block quotes are replaced by
numbered markers ``text[n]`` and listed under a "Links" heading at the end.
"""
import sys
import re

from bs4 import BeautifulSoup


def strip(line):
    """Return *line* without surrounding whitespace; ``None`` is passed through."""
    if line is None:
        return line
    return line.strip()


def _node_text(node):
    """Return the stripped text of a parsed node as a string (never ``None``)."""
    text = node.string
    if text is None:
        # .string is None for a tag that does not have exactly one string
        # inside it; fall back to the concatenated text so the literal word
        # "None" is never written to the output.
        text = node.get_text()
    return strip(text)


def handleNestedTag(tag, links, linkCnt, dest_f):
    """Write the children of *tag* on one output line, numbering external links.

    Args:
        tag: parsed element whose ``contents`` are written.
        links: dict updated in place, mapping link number -> href.
        linkCnt: number to assign to the next external link.
        dest_f: writable text file receiving the output.

    Returns:
        The next unused link number. The caller must carry it over to the
        following call, otherwise numbering restarts and entries in *links*
        are overwritten.
    """
    for child in tag.contents:
        if child.name == "a":
            href = child.get("href")
            if href is None:
                # No href: likely a named anchor, nothing to print.
                continue
            if href.startswith("#"):
                # In-page reference: keep the text, drop the link.
                print(_node_text(child), end=" ", file=dest_f)
            else:
                print("{}[{}]".format(_node_text(child), linkCnt),
                      end=" ", file=dest_f)
                links[linkCnt] = href
                linkCnt += 1
        elif child.string != "\n":
            # Bare newlines between elements produce no output.
            print(_node_text(child), end=" ", file=dest_f)
    print("", file=dest_f)
    return linkCnt


if len(sys.argv) < 3:
    print("usage: {} SOURCE DEST [HTTP_URL]".format(sys.argv[0]),
          file=sys.stderr)
    sys.exit(1)

source = sys.argv[1]
dest = sys.argv[2]
# The http link is optional; without it the "view on http" line is omitted.
http_url = sys.argv[3] if len(sys.argv) > 3 else None

with open(source, "r") as source_f:
    html = BeautifulSoup(source_f, 'html.parser')

article = html.article
if article is None:
    print("No article found", file=sys.stderr)
    sys.exit(1)

heading = None
subheading = None
header = article.header
if header is not None:
    if header.h2 is not None:
        heading = _node_text(header.h2)
    if header.p is not None:
        subheading = _node_text(header.p)
elif article.h2 is not None:
    heading = _node_text(article.h2)

if not heading:
    print("No heading found", file=sys.stderr)
    sys.exit(1)

links = dict()
linkCnt = 0

# The destination is opened only after the input has been validated, so a
# failed run does not leave behind an empty or truncated output file.
with open(dest, "w") as dest_f:
    print("# {}".format(heading), file=dest_f)
    if subheading:
        print("\n{}".format(subheading), file=dest_f)

    for tag in article.children:
        if tag.name == "p":
            only_child = tag.contents[0] if len(tag.contents) == 1 else None
            if only_child is not None and only_child.name != "a":
                # A single non-link child: treat the paragraph as plain text.
                print("\n{}".format(_node_text(tag)), file=dest_f)
            else:
                # Mixed content, or a lone link whose URL must be recorded.
                print("\n", end="", file=dest_f)
                linkCnt = handleNestedTag(tag, links, linkCnt, dest_f)
        elif tag.name == "ul" or tag.name == "ol":
            for child in tag.contents:
                if child.name == "li":
                    print("*", end=" ", file=dest_f)
                    linkCnt = handleNestedTag(child, links, linkCnt, dest_f)
        elif tag.name == "figure":
            if tag.blockquote is not None:
                print(">", end=" ", file=dest_f)
                linkCnt = handleNestedTag(tag.blockquote, links, linkCnt,
                                          dest_f)
        elif tag.name == "h3":
            print("\n## {}".format(_node_text(tag)), file=dest_f)
        elif tag.name == "h4":
            print("\n### {}".format(_node_text(tag)), file=dest_f)

    print("\n## Links\n", file=dest_f)
    for link_n, link in links.items():
        print("=> {} [{}] {}".format(link, link_n, link), file=dest_f)

    if http_url is not None:
        print("=> {} view on http".format(http_url), file=dest_f)
    print("=> /log/ back", file=dest_f)
    print("=> / capsule", file=dest_f)