1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
|
#!/usr/bin/env python3
import sys
import re
from bs4 import BeautifulSoup
def strip(line):
    """Return *line* without leading/trailing whitespace.

    ``None`` is passed through unchanged so callers can hand over a value
    that may be missing without checking it first.
    """
    return line if line is None else line.strip()
def _node_text(node):
    """Return the whitespace-trimmed text of a parsed node, never ``None``."""
    text = node.string
    if text is None:
        # BeautifulSoup's ``.string`` is None for a tag with no children or
        # with several children; use the concatenated text instead of
        # letting the literal word "None" reach the output.
        text = node.get_text()
    return text.strip()


def handleNestedTag(tag, links, linkCnt, dest_f):
    """Write the children of *tag* to *dest_f* as one line of text.

    Each direct child is written followed by a single space, and the line is
    terminated with a newline once all children have been processed:

    * ``<a>`` with an ``href`` starting with ``#``: only the link text.
    * ``<a>`` with any other ``href``: ``text[N]``, and ``links[N] = href``
      is recorded, where N is the running link number.
    * ``<a>`` without ``href``: skipped (likely a named anchor).
    * a child whose string is exactly ``"\\n"``: skipped.
    * anything else: its trimmed text.

    Args:
        tag: parsed element whose ``contents`` are rendered.
        links: dict mapping link number -> href; updated in place.
        linkCnt: first link number the caller would like to use. Numbering
            never starts below one past the highest number already in
            *links*, so a stale counter cannot overwrite earlier entries.
        dest_f: writable text file object.

    Returns:
        The next unused link number.
    """
    # ints are passed by value, so the caller's counter is not advanced by
    # this function; re-derive a safe starting point from what was recorded.
    linkCnt = max(linkCnt, max(links, default=-1) + 1)

    for child in tag.contents:
        if child.name == "a":
            href = child.get("href")
            if href is None:
                # it's not a href link then its likely an anchor
                continue
            if href.startswith("#"):
                # In-page reference: keep the text, drop the target.
                print(_node_text(child), end=" ", file=dest_f)
            else:
                print("{}[{}]".format(_node_text(child), linkCnt), end=" ", file=dest_f)
                links[linkCnt] = href
                linkCnt += 1
        elif child.string == "\n":
            # A bare newline between elements contributes nothing.
            continue
        else:
            print(_node_text(child), end=" ", file=dest_f)

    # Terminate the line that the children were written on.
    print("", file=dest_f)
    return linkCnt
# Command line: SOURCE (HTML file to read), DEST (text file to write),
# HTTP_URL (path appended to the site's https address in the footer).
if len(sys.argv) < 4:
    print("usage: {} SOURCE DEST HTTP_URL".format(sys.argv[0]), file=sys.stderr)
    sys.exit(2)

source = sys.argv[1]
dest = sys.argv[2]
http_url = sys.argv[3]

# Parse the whole document up front so the input file is closed right away.
with open(source, "r") as source_f:
    html = BeautifulSoup(source_f, 'html.parser')

article = html.article
if article is None:
    print("No article found", file=sys.stderr)
    sys.exit(1)

# The heading comes from <header><h2> when the article has a <header>
# (with an optional <p> subheading), otherwise from the article's first <h2>.
heading = None
subheading = None
header = article.header
if header is not None:
    if header.h2 is not None:
        heading = strip(header.h2.string)
    if header.p is not None:
        subheading = strip(header.p.string)
elif article.h2 is not None:
    heading = strip(article.h2.string)

# Validate before opening DEST so a failed run does not truncate it.
if heading is None:
    print("No heading found", file=sys.stderr)
    sys.exit(1)

# Link number -> href, filled in by handleNestedTag and listed at the end.
links = dict()

with open(dest, "w") as dest_f:
    print("# {}".format(heading), file=dest_f)
    if subheading is not None:
        print("\n{}".format(subheading), file=dest_f)

    for tag in article.children:
        if tag.name == "p":
            only_child = tag.contents[0] if len(tag.contents) == 1 else None
            if only_child is not None and only_child.name != "a":
                # it's just text
                print("\n{}".format(strip(tag.string)), file=dest_f)
            else:
                # Mixed content, or a paragraph that is a single link: the
                # link must go through handleNestedTag to be numbered.
                print("\n", end="", file=dest_f)
                # len(links) is the next free number; a plain int counter
                # would not be advanced by the call.
                handleNestedTag(tag, links, len(links), dest_f)
        elif tag.name == "ul" or tag.name == "ol":
            for child in tag.contents:
                if child.name == "li":
                    print("*", end=" ", file=dest_f)
                    handleNestedTag(child, links, len(links), dest_f)
        elif tag.name == "figure":
            if tag.blockquote is not None:
                print(">", end=" ", file=dest_f)
                handleNestedTag(tag.blockquote, links, len(links), dest_f)
        elif tag.name == "h3":
            print("\n## {}".format(strip(tag.string)), file=dest_f)
        elif tag.name == "h4":
            print("\n### {}".format(strip(tag.string)), file=dest_f)

    # Footer: every numbered link, then the fixed navigation lines.
    print("\n## Links\n", file=dest_f)
    for link_n, link in links.items():
        print("=> {} [{}] {}".format(link, link_n, link), file=dest_f)
    print("=> https://thewomaninmyeye.org/{} view on http".format(http_url), file=dest_f)
    print("=> /log/ back", file=dest_f)
    print("=> / capsule", file=dest_f)
|