Fix extra lines issue and support lists

This now supports lists and removes all the extra lines! I kept strip as a function, but it is now just a None-safe proxy for .strip().
author: Steph Enders <steph@senders.io> 2023-07-06 11:29:32 -0400
committer: Steph Enders <steph@senders.io> 2023-07-06 11:29:32 -0400
commit: 8c75b82a28f79f3adb74fa392cbef222d8228320 (patch)
tree: 8b4bfde69003612aa74615c44988dee913f3d958
parent: 52dc371b42fe103d81054706346cbc68f19a160c (diff)
1 files changed, 33 insertions, 23 deletions
diff --git a/gemparse.py b/gemparse.py
index 28ac91a..54dd990 100755
--- a/gemparse.py
+++ b/gemparse.py
@@ -3,11 +3,25 @@ import sys
 import re
 from bs4 import BeautifulSoup
 
-def strip(string):
-    ret = []
-    for s in string.split("\n"):
-        ret.append(s.lstrip())
-    return "\n".join(ret)
+def strip(line):
+    if line is None:
+        return line
+    else:
+        return line.strip()
+
+def handleNestedTag(tag, links, linkCnt, dest_f):
+    for child in tag.contents:
+        if child.name == "a":
+            href = child.get("href")
+            print("{}[{}]".format(strip(child.string), linkCnt), end=" ", file=dest_f)
+            links[linkCnt] = href
+            linkCnt += 1
+        else:
+            if child.string == "\n":
+                print(strip(child.string), end="", file=dest_f)
+            else:
+                print(strip(child.string), end=" ", file=dest_f)
+    print("", file=dest_f)
 
 source = sys.argv[1]
 dest = sys.argv[2]
@@ -30,9 +44,9 @@ else:
 if heading is None:
     print("No heading found", file=sys.stderr)
     exit(1)
-print("# {}\n".format(heading), file=dest_f)
+print("# {}".format(heading), file=dest_f)
 if subheading is not None:
-    print(subheading, file=dest_f)
+    print("\n{}".format(subheading), file=dest_f)
 
 links = dict()
 linkCnt = 0
@@ -41,22 +55,19 @@ for tag in html.article.children:
     if tag.name == "p":
         if len(tag.contents) == 1:
             # it's just text
-            print(strip(tag.string), file=dest_f)
+            print("\n{}".format(strip(tag.string)), file=dest_f)
         else:
-            # contains multiple tags
-            # we should check for links
-            for child in tag.contents:
-                if child.name == "a":
-                    href = child.get("href")
-                    print("{}[{}]".format(strip(child.string), linkCnt), end=" ", file=dest_f)
-                    links[linkCnt] = href
-                    linkCnt += 1
-                else:
-                    if child.string == "\n":
-                        print(child.string, end="", file=dest_f)
-                    else:
-                        print(strip(child.string), end=" ", file=dest_f)
-
+            print("\n", end="", file=dest_f)
+            handleNestedTag(tag, links, linkCnt, dest_f)
+    if tag.name == "ul" or tag.name == "ol":
+        for child in tag.contents:
+            if child.name == "li":
+                print("*", end=" ", file=dest_f)
+                handleNestedTag(child, links, linkCnt, dest_f)
+    if tag.name == "h3":
+        print("\n## {}".format(strip(tag.string)), file=dest_f)
+    if tag.name == "h4":
+        print("\n### {}".format(strip(tag.string)), file=dest_f)
 
 print("\n## Links\n", file=dest_f)
 for link_n, link in links.items():
@@ -65,4 +76,3 @@ for link_n, link in links.items():
 print("=> https://thewomaninmyeye.org/{} view on http".format(http_url), file=dest_f)
 print("=> /log/ back", file=dest_f)
 print("=> / capsule", file=dest_f)
-
author	Steph Enders <steph@senders.io>	2023-07-06 11:29:32 -0400
committer	Steph Enders <steph@senders.io>	2023-07-06 11:29:32 -0400
commit	8c75b82a28f79f3adb74fa392cbef222d8228320 (patch)
tree	8b4bfde69003612aa74615c44988dee913f3d958
parent	52dc371b42fe103d81054706346cbc68f19a160c (diff)