diff options
-rwxr-xr-x | gemparse.py | 56 |
1 files changed, 33 insertions, 23 deletions
diff --git a/gemparse.py b/gemparse.py index 28ac91a..54dd990 100755 --- a/gemparse.py +++ b/gemparse.py @@ -3,11 +3,25 @@ import sys import re from bs4 import BeautifulSoup -def strip(string): - ret = [] - for s in string.split("\n"): - ret.append(s.lstrip()) - return "\n".join(ret) +def strip(line): + if line is None: + return line + else: + return line.strip() + +def handleNestedTag(tag, links, linkCnt, dest_f): + for child in tag.contents: + if child.name == "a": + href = child.get("href") + print("{}[{}]".format(strip(child.string), linkCnt), end=" ", file=dest_f) + links[linkCnt] = href + linkCnt += 1 + else: + if child.string == "\n": + print(strip(child.string), end="", file=dest_f) + else: + print(strip(child.string), end=" ", file=dest_f) + print("", file=dest_f) source = sys.argv[1] dest = sys.argv[2] @@ -30,9 +44,9 @@ else: if heading is None: print("No heading found", file=sys.stderr) exit(1) -print("# {}\n".format(heading), file=dest_f) +print("# {}".format(heading), file=dest_f) if subheading is not None: - print(subheading, file=dest_f) + print("\n{}".format(subheading), file=dest_f) links = dict() linkCnt = 0 @@ -41,22 +55,19 @@ for tag in html.article.children: if tag.name == "p": if len(tag.contents) == 1: # it's just text - print(strip(tag.string), file=dest_f) + print("\n{}".format(strip(tag.string)), file=dest_f) else: - # contains multiple tags - # we should check for links - for child in tag.contents: - if child.name == "a": - href = child.get("href") - print("{}[{}]".format(strip(child.string), linkCnt), end=" ", file=dest_f) - links[linkCnt] = href - linkCnt += 1 - else: - if child.string == "\n": - print(child.string, end="", file=dest_f) - else: - print(strip(child.string), end=" ", file=dest_f) - + print("\n", end="", file=dest_f) + handleNestedTag(tag, links, linkCnt, dest_f) + if tag.name == "ul" or tag.name == "ol": + for child in tag.contents: + if child.name == "li": + print("*", end=" ", file=dest_f) + handleNestedTag(child, links, linkCnt, dest_f) + if tag.name == "h3": + print("\n## {}".format(strip(tag.string)), file=dest_f) + if tag.name == "h4": + print("\n### {}".format(strip(tag.string)), file=dest_f) print("\n## Links\n", file=dest_f) for link_n, link in links.items(): @@ -65,4 +76,3 @@ for link_n, link in links.items(): print("=> https://thewomaninmyeye.org/{} view on http".format(http_url), file=dest_f) print("=> /log/ back", file=dest_f) print("=> / capsule", file=dest_f) - |