gemparse.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68

#!/usr/bin/env python3
import sys
import re
from bs4 import BeautifulSoup

def strip(string):
    """Return *string* with leading whitespace removed from every line.

    The text is split on "\n", each piece is left-stripped, and the pieces
    are joined back together with "\n". Trailing whitespace on a line and
    the number of lines are left as they were.
    """
    return "\n".join(line.lstrip() for line in string.split("\n"))

def _node_text(node):
    """Return the text of a parsed node, including nodes with nested markup.

    BeautifulSoup's ``.string`` is None for a tag that does not wrap exactly
    one string (for example ``<em>a <b>b</b></em>`` or an empty tag); in that
    case the concatenated text of all descendants is used instead.
    """
    text = node.string
    if text is None:
        text = node.get_text()
    return text


def main(argv=None):
    """Convert the <article> of an HTML file into a Gemtext file.

    Args:
        argv: Argument vector laid out like ``sys.argv``:
            ``[prog, SOURCE, DEST, HTTP_URL]``. Defaults to ``sys.argv``.
            SOURCE is the HTML input path, DEST the Gemtext output path and
            HTTP_URL the path appended to the site's https address in the
            "view on http" footer link.

    Exits with a non-zero status (message on stderr) when arguments are
    missing, when the document has no <article>, or when no <h2> heading can
    be found. DEST is only opened once the input has been validated, so a
    failed run does not truncate an existing file.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 4:
        prog = argv[0] if argv else "gemparse.py"
        sys.exit("usage: {} SOURCE DEST HTTP_URL".format(prog))
    source, dest, http_url = argv[1:4]

    # Parse inside the with-block so the input handle is closed right away.
    with open(source, "r") as source_f:
        html = BeautifulSoup(source_f, 'html.parser')

    article = html.article
    if article is None:
        print("No article found", file=sys.stderr)
        sys.exit(1)

    # The heading is looked up inside <header> when the article has one,
    # otherwise anywhere in the article; the subheading only exists in <header>.
    header = article.header
    title_scope = article if header is None else header
    heading = None
    subheading = None
    if title_scope.h2 is not None:
        heading = strip(_node_text(title_scope.h2))
    if header is not None and header.p is not None:
        subheading = strip(_node_text(header.p))

    if heading is None:
        print("No heading found", file=sys.stderr)
        sys.exit(1)

    # hrefs in order of appearance; the list index is the number printed
    # in square brackets after the link text.
    links = []

    with open(dest, "w") as dest_f:
        print("# {}\n".format(heading), file=dest_f)
        if subheading is not None:
            print(subheading, file=dest_f)

        for tag in article.children:
            if tag.name != "p":
                continue

            children = tag.contents
            # A paragraph with a single child is written as one full line;
            # mixed content is written piece by piece on the same line.
            sole_child = len(children) == 1

            for child in children:
                text = _node_text(child)
                # Only anchors that actually carry an href become numbered
                # links; an <a> without href is emitted as plain text.
                href = child.get("href") if child.name == "a" else None

                if href is not None:
                    fragment = "{}[{}]".format(strip(text), len(links))
                    links.append(href)
                else:
                    fragment = strip(text)

                if sole_child:
                    end = "\n"
                elif href is None and text == "\n":
                    # A bare newline between inline elements is passed
                    # through without the separating space.
                    end = ""
                else:
                    end = " "
                print(fragment, end=end, file=dest_f)

        print("\n## Links\n", file=dest_f)
        for link_n, link in enumerate(links):
            print("=> {} [{}] {}".format(link, link_n, link), file=dest_f)

        print("=> https://thewomaninmyeye.org/{} view on http".format(http_url), file=dest_f)
        print("=> /log/ back", file=dest_f)
        print("=> / capsule", file=dest_f)


if __name__ == "__main__":
    main()