#!/usr/bin/env python3
"""Convert an HTML blog article to Gemini gemtext.

Usage: <script> SOURCE_HTML DEST_GMI HTTP_URL_PATH

Reads the first <article> from SOURCE_HTML, writes a gemtext rendering to
DEST_GMI, and appends footer links: an HTTP mirror link built from
HTTP_URL_PATH plus "back" / "capsule" navigation links.
"""
import sys
import re  # NOTE(review): appears unused in this file -- kept deliberately, verify before removing
from bs4 import BeautifulSoup


def strip(string):
    """Return *string* with leading whitespace removed from every line."""
    return "\n".join(line.lstrip() for line in string.split("\n"))


# Robustness fix: fail with a usage message instead of an IndexError
# when the script is invoked with too few arguments.
if len(sys.argv) != 4:
    print("usage: {} SOURCE_HTML DEST_GMI HTTP_URL".format(sys.argv[0]),
          file=sys.stderr)
    sys.exit(1)

source = sys.argv[1]
dest = sys.argv[2]
http_url = sys.argv[3]

# Fix: the original opened both files and never closed them (leaked handles,
# and the destination might not be flushed on an abnormal exit).  Context
# managers close them on every path, including the sys.exit() below.
# Encoding is pinned rather than depending on the locale.
with open(source, "r", encoding="utf-8") as source_f, \
        open(dest, "w", encoding="utf-8") as dest_f:
    html = BeautifulSoup(source_f, 'html.parser')

    # The article heading (<h2>) may live inside an optional <header>;
    # a subheading (<p>) is only looked for inside that <header>.
    heading = None
    subheading = None
    if html.article.header:
        heading = strip(html.article.header.h2.string)
        if html.article.header.p:
            subheading = strip(html.article.header.p.string)
    else:
        heading = strip(html.article.h2.string)

    if heading is None:
        print("No heading found", file=sys.stderr)
        sys.exit(1)  # fix: sys.exit() instead of the site-module exit()

    print("# {}\n".format(heading), file=dest_f)
    if subheading is not None:
        print(subheading, file=dest_f)

    # Hyperlinks become numbered references ("text[n]") in the body and are
    # emitted as gemtext link lines in the "## Links" section afterwards.
    links = dict()
    link_cnt = 0
    for tag in html.article.children:
        if tag.name == "p":
            if len(tag.contents) == 1:
                # Single child: it's just text.
                print(strip(tag.string), file=dest_f)
            else:
                # Mixed content: scan the children for <a> tags.
                for child in tag.contents:
                    if child.name == "a":
                        href = child.get("href")
                        print("{}[{}]".format(strip(child.string), link_cnt),
                              end=" ", file=dest_f)
                        links[link_cnt] = href
                        link_cnt += 1
                    elif child.string == "\n":
                        # Preserve bare newlines between inline elements.
                        print(child.string, end="", file=dest_f)
                    else:
                        print(strip(child.string), end=" ", file=dest_f)

    print("\n## Links\n", file=dest_f)
    for link_n, link in links.items():
        # Gemtext link line: "=> URL label"; the URL doubles as the label.
        print("=> {} [{}] {}".format(link, link_n, link), file=dest_f)
    print("=> https://thewomaninmyeye.org/{} view on http".format(http_url),
          file=dest_f)
    print("=> /log/ back", file=dest_f)
    print("=> / capsule", file=dest_f)