gemparse.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89

#!/usr/bin/env python3
import sys
import re
from bs4 import BeautifulSoup

def strip(line):
    """Return ``line`` without leading/trailing whitespace.

    ``None`` is passed through unchanged so callers can hand over the
    ``.string`` of a parsed node without checking it first.
    """
    return None if line is None else line.strip()

def _node_text(node):
    """Return the whitespace-trimmed text of a parsed node, never ``None``."""
    text = node.string
    if text is None:
        # ``.string`` is None when a tag wraps several children (for example
        # ``<a><b>x</b> y</a>``).  Fall back to the concatenated text so the
        # literal word "None" never leaks into the output.
        get_text = getattr(node, "get_text", None)
        text = get_text() if callable(get_text) else ""
    return text.strip()


def handleNestedTag(tag, links, linkCnt, dest_f):
    """Write the children of ``tag`` to ``dest_f`` as one line of text.

    External links are rendered as ``text[n]`` and recorded in ``links``
    under the number ``n``; in-page links (``href`` starting with ``#``) are
    rendered as plain text; ``<a>`` elements without ``href`` are skipped.

    Args:
        tag: parsed element whose ``contents`` are rendered.
        links: dict mapping footnote number -> href; updated in place.
        linkCnt: first footnote number the caller would like to use.
        dest_f: writable text file object.

    Returns:
        The next unused footnote number.  ``linkCnt`` is a plain int, so the
        increment is invisible to the caller unless it is returned.
    """
    for child in tag.contents:
        if child.name == "a":
            href = child.get("href")
            if href is None:
                # No href: most likely a bare named anchor, nothing to print.
                continue
            if href.startswith("#"):
                print(_node_text(child), end=" ", file=dest_f)
                continue
            # Never reuse a number that is already taken.  Callers that keep
            # passing the same starting value would otherwise overwrite
            # earlier entries and emit duplicate footnote numbers.
            while linkCnt in links:
                linkCnt += 1
            print("{}[{}]".format(_node_text(child), linkCnt), end=" ", file=dest_f)
            links[linkCnt] = href
            linkCnt += 1
        elif child.string == "\n":
            # Pure line-break filler between elements: contributes no text.
            continue
        else:
            print(_node_text(child), end=" ", file=dest_f)
    print("", file=dest_f)
    return linkCnt

# Command-line entry point: convert the <article> of an HTML page into a
# gemtext-style document ("#" headings, "=>" link lines).
#   argv[1]  path of the HTML file to read
#   argv[2]  path of the output file to write (truncated on open)
#   argv[3]  path appended to the site URL for the "view on http" link
if len(sys.argv) < 4:
    print("usage: {} SOURCE DEST HTTP_URL".format(sys.argv[0]), file=sys.stderr)
    sys.exit(2)

source = sys.argv[1]
dest = sys.argv[2]
http_url = sys.argv[3]

# The context manager guarantees both files are flushed and closed, including
# on the early-exit path below.
with open(source, "r") as source_f, open(dest, "w") as dest_f:
    html = BeautifulSoup(source_f, 'html.parser')
    article = html.article
    heading = None
    subheading = None

    if article is not None:
        header = article.header
        # Title: the <h2> inside <header> when there is one, otherwise the
        # first <h2> of the article.  The optional subtitle is <header><p>.
        title_tag = (header if header is not None else article).h2
        if title_tag is not None:
            heading = strip(title_tag.string)
        if header is not None and header.p is not None:
            subheading = strip(header.p.string)

    # Also reached when <article> or <h2> is missing entirely, instead of
    # dying earlier with an AttributeError.
    if heading is None:
        print("No heading found", file=sys.stderr)
        sys.exit(1)

    print("# {}".format(heading), file=dest_f)
    if subheading is not None:
        print("\n{}".format(subheading), file=dest_f)

    # Footnote number -> href, filled in by handleNestedTag.
    links = dict()

    # handleNestedTag cannot update an int owned by this scope, so every call
    # is given len(links) as its starting number; this keeps the footnote
    # numbers unique and contiguous across the whole document.
    for tag in article.children:
        if tag.name == "p":
            if len(tag.contents) == 1 and tag.contents[0].name is None:
                # A single bare string: it's just text.
                print("\n{}".format(strip(tag.string)), file=dest_f)
            else:
                # Mixed content, or a lone child element such as <a> whose
                # link must still be collected.
                print("\n", end="", file=dest_f)
                handleNestedTag(tag, links, len(links), dest_f)
        elif tag.name == "ul" or tag.name == "ol":
            for child in tag.contents:
                if child.name == "li":
                    print("*", end=" ", file=dest_f)
                    handleNestedTag(child, links, len(links), dest_f)
        elif tag.name == "figure":
            if tag.blockquote is not None:
                print(">", end=" ", file=dest_f)
                handleNestedTag(tag.blockquote, links, len(links), dest_f)
        elif tag.name == "h3":
            print("\n## {}".format(strip(tag.string)), file=dest_f)
        elif tag.name == "h4":
            print("\n### {}".format(strip(tag.string)), file=dest_f)

    # Footer: one link line per collected footnote, then fixed navigation.
    print("\n## Links\n", file=dest_f)
    for link_n, link in links.items():
        print("=> {} [{}] {}".format(link, link_n, link), file=dest_f)

    print("=> https://thewomaninmyeye.org/{} view on http".format(http_url), file=dest_f)
    print("=> /log/ back", file=dest_f)
    print("=> / capsule", file=dest_f)