summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSteph Enders <steph@senders.io>2023-07-06 11:29:32 -0400
committerSteph Enders <steph@senders.io>2023-07-06 11:29:32 -0400
commit8c75b82a28f79f3adb74fa392cbef222d8228320 (patch)
tree8b4bfde69003612aa74615c44988dee913f3d958
parent52dc371b42fe103d81054706346cbc68f19a160c (diff)
Fix extra lines issue and support lists
This now supports lists and removes all the extra lines! I kept strip as a function, but made it just a proxy for .strip() — it's None-safe.
-rwxr-xr-xgemparse.py56
1 files changed, 33 insertions, 23 deletions
diff --git a/gemparse.py b/gemparse.py
index 28ac91a..54dd990 100755
--- a/gemparse.py
+++ b/gemparse.py
@@ -3,11 +3,25 @@ import sys
import re
from bs4 import BeautifulSoup
-def strip(string):
- ret = []
- for s in string.split("\n"):
- ret.append(s.lstrip())
- return "\n".join(ret)
+def strip(line):
+ if line is None:
+ return line
+ else:
+ return line.strip()
+
+def handleNestedTag(tag, links, linkCnt, dest_f):
+ for child in tag.contents:
+ if child.name == "a":
+ href = child.get("href")
+ print("{}[{}]".format(strip(child.string), linkCnt), end=" ", file=dest_f)
+ links[linkCnt] = href
+ linkCnt += 1
+ else:
+ if child.string == "\n":
+ print(strip(child.string), end="", file=dest_f)
+ else:
+ print(strip(child.string), end=" ", file=dest_f)
+ print("", file=dest_f)
source = sys.argv[1]
dest = sys.argv[2]
@@ -30,9 +44,9 @@ else:
if heading is None:
print("No heading found", file=sys.stderr)
exit(1)
-print("# {}\n".format(heading), file=dest_f)
+print("# {}".format(heading), file=dest_f)
if subheading is not None:
- print(subheading, file=dest_f)
+ print("\n{}".format(subheading), file=dest_f)
links = dict()
linkCnt = 0
@@ -41,22 +55,19 @@ for tag in html.article.children:
if tag.name == "p":
if len(tag.contents) == 1:
# it's just text
- print(strip(tag.string), file=dest_f)
+ print("\n{}".format(strip(tag.string)), file=dest_f)
else:
- # contains multiple tags
- # we should check for links
- for child in tag.contents:
- if child.name == "a":
- href = child.get("href")
- print("{}[{}]".format(strip(child.string), linkCnt), end=" ", file=dest_f)
- links[linkCnt] = href
- linkCnt += 1
- else:
- if child.string == "\n":
- print(child.string, end="", file=dest_f)
- else:
- print(strip(child.string), end=" ", file=dest_f)
-
+ print("\n", end="", file=dest_f)
+ handleNestedTag(tag, links, linkCnt, dest_f)
+ if tag.name == "ul" or tag.name == "ol":
+ for child in tag.contents:
+ if child.name == "li":
+ print("*", end=" ", file=dest_f)
+ handleNestedTag(child, links, linkCnt, dest_f)
+ if tag.name == "h3":
+ print("\n## {}".format(strip(tag.string)), file=dest_f)
+ if tag.name == "h4":
+ print("\n### {}".format(strip(tag.string)), file=dest_f)
print("\n## Links\n", file=dest_f)
for link_n, link in links.items():
@@ -65,4 +76,3 @@ for link_n, link in links.items():
print("=> https://thewomaninmyeye.org/{} view on http".format(http_url), file=dest_f)
print("=> /log/ back", file=dest_f)
print("=> / capsule", file=dest_f)
-