From 8178dd95210373495ffe9dd6ca333819eec882c1 Mon Sep 17 00:00:00 2001 From: Steph Enders Date: Fri, 14 Jul 2023 17:06:08 -0400 Subject: Support more tags in gemparse Gemparse now supports parsing: - h3 - h4 - ul, ol, li - a tags as anchors (a tags with no href) - a tags as anchor refs (a tags whose href starts with "#") --- gemparse.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/gemparse.py b/gemparse.py index 54dd990..dd83c89 100755 --- a/gemparse.py +++ b/gemparse.py @@ -13,9 +13,16 @@ def handleNestedTag(tag, links, linkCnt, dest_f): for child in tag.contents: if child.name == "a": href = child.get("href") - print("{}[{}]".format(strip(child.string), linkCnt), end=" ", file=dest_f) - links[linkCnt] = href - linkCnt += 1 + if href is not None: + if href.startswith("#"): + print(strip(child.string), end=" ", file=dest_f) + else: + print("{}[{}]".format(strip(child.string), linkCnt), end=" ", file=dest_f) + links[linkCnt] = href + linkCnt += 1 + else: + # it's not a href link then its likely an anchor + pass else: if child.string == "\n": print(strip(child.string), end="", file=dest_f) @@ -59,14 +66,18 @@ for tag in html.article.children: else: print("\n", end="", file=dest_f) handleNestedTag(tag, links, linkCnt, dest_f) - if tag.name == "ul" or tag.name == "ol": + elif tag.name == "ul" or tag.name == "ol": for child in tag.contents: if child.name == "li": print("*", end=" ", file=dest_f) handleNestedTag(child, links, linkCnt, dest_f) - if tag.name == "h3": + if tag.name == "figure": + if tag.blockquote is not None: + print(">", end=" ", file=dest_f) + handleNestedTag(tag.blockquote, links, linkCnt, dest_f) + elif tag.name == "h3": print("\n## {}".format(strip(tag.string)), file=dest_f) - if tag.name == "h4": + elif tag.name == "h4": print("\n### {}".format(strip(tag.string)), file=dest_f) print("\n## Links\n", file=dest_f) -- cgit v1.2.3-54-g00ecf