1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
|
#!/usr/bin/env python3
import sys
import re
from bs4 import BeautifulSoup
def strip(string):
    """Return *string* with leading whitespace removed from every line.

    Unlike str.strip(), this operates per-line: the input is split on
    newlines, each line is lstrip()-ed, and the lines are rejoined.
    Line count and trailing whitespace are preserved.
    """
    return "\n".join(line.lstrip() for line in string.split("\n"))
# CLI: <source html> <dest gemtext> <http path suffix>
source = sys.argv[1]
dest = sys.argv[2]
http_url = sys.argv[3]

# Use context managers so dest_f is flushed and closed even if we
# bail out early via sys.exit (the originals were never closed).
with open(source, "r") as source_f, open(dest, "w") as dest_f:
    html = BeautifulSoup(source_f, 'html.parser')

    # Extract the article heading (and optional subheading) from either
    # <article><header><h2> or a bare <article><h2>.
    heading = None
    subheading = None
    if html.article.header:
        heading = strip(html.article.header.h2.string)
        if html.article.header.p:
            subheading = strip(html.article.header.p.string)
    else:
        heading = strip(html.article.h2.string)
    if heading is None:
        print("No heading found", file=sys.stderr)
        sys.exit(1)  # sys.exit is the reliable form; bare exit() is a site-module convenience

    print("# {}\n".format(heading), file=dest_f)
    if subheading is not None:
        print(subheading, file=dest_f)

    # Collect hyperlinks as we emit paragraphs; gemtext puts links on
    # their own lines, so inline anchors become "[n]" footnote markers.
    links = dict()
    link_count = 0
    for tag in html.article.children:
        if tag.name == "p":
            if len(tag.contents) == 1:
                # Plain paragraph: just text, no inline markup.
                print(strip(tag.string), file=dest_f)
            else:
                # Mixed content: interleave text runs and <a> footnotes.
                for child in tag.contents:
                    if child.name == "a":
                        href = child.get("href")
                        print("{}[{}]".format(strip(child.string), link_count), end=" ", file=dest_f)
                        links[link_count] = href
                        link_count += 1
                    else:
                        if child.string == "\n":
                            # Preserve bare newlines between inline nodes verbatim.
                            print(child.string, end="", file=dest_f)
                        else:
                            print(strip(child.string), end=" ", file=dest_f)

    # Footer: the collected links as gemtext "=>" lines, then nav links.
    print("\n## Links\n", file=dest_f)
    for link_n, link in links.items():
        print("=> {} [{}] {}".format(link, link_n, link), file=dest_f)
    print("=> https://thewomaninmyeye.org/{} view on http".format(http_url), file=dest_f)
    print("=> /log/ back", file=dest_f)
    print("=> / capsule", file=dest_f)
|