path: root/gemfeed.py
blob: a3cfb255c3f0491c8d7260bee2d876d80c872f35 (plain)
#!/usr/bin/env python3
import argparse
import datetime
import glob
import os
import os.path
import re
import stat
import urllib.parse
from fnmatch import fnmatch

from feedgen.feed import FeedGenerator

def is_world_readable(filename):
    """
    Return True if the named file is world readable, otherwise return False.
    """
    st = os.stat(filename)
    return bool(st.st_mode & stat.S_IROTH)
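
# A minimal illustration (hypothetical file and modes):
#
#     >>> os.chmod("page.gmi", 0o644)   # rw-r--r--: S_IROTH bit set
#     >>> is_world_readable("page.gmi")
#     True
#     >>> os.chmod("page.gmi", 0o640)   # rw-r-----: not world readable
#     >>> is_world_readable("page.gmi")
#     False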

def extract_first_heading(filename, default=""):
    """
    Open a file which is presumed to contain text/gemini content and return
    the contents of the first heading line (regardless of heading level).
    If no heading lines are found, return the specified default.
    """
    with open(filename) as fp:
        for line in fp:
            if line.startswith("#"):
                # Strip the leading "#"s (any heading level) and whitespace;
                # lstrip avoids an IndexError when a bare "#" line has no
                # trailing newline
                return line.lstrip("#").strip()
    return default
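
# Illustrative example (hypothetical file): given a post.gmi whose first
# heading line is "## My First Post", the leading "#"s are stripped:
#
#     >>> extract_first_heading("post.gmi", default="post")
#     'My First Post'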

def get_feed_title(directory):
    """
    If an index.gmi or index.gemini file exists and is world readable, return
    the content of its first heading line; otherwise return a default feed
    title derived from the directory name.
    """
    # By default, use the deepest directory name as a feed title
    # This needs a little care, as os.path.basename will return an empty
    # string if `directory` ends in a trailing slash...
    head, default = os.path.split(directory)
    if not default:
        default = os.path.basename(head)
    # Check for index files which may override the default
    for index_file in ("index.gmi", "index.gemini"):
        index_file = os.path.join(directory, index_file)
        if os.path.exists(index_file) and is_world_readable(index_file):
            return extract_first_heading(index_file, default)
    return default
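
# Illustrative example (hypothetical layout): with no readable index file,
# the deepest directory name is used as the title:
#
#     >>> get_feed_title("/var/gemini/gemlog/")
#     'gemlog'
#
# If /var/gemini/gemlog/index.gmi exists, is world readable and starts
# with "# My Gemlog", the same call returns 'My Gemlog' instead.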

def find_files(directory, time_func, n=10):
    """
    Return the n most recently created world readable files with extensions of
    .gmi or .gemini, as a list sorted from most to least recent.
    """
    gemini_files = []
    for root, dirs, files in os.walk(directory):
        path = root.split(os.sep)
        for f in files:
            full_path = os.path.join(root, f)
            for extension in ("gmi", "gemini"):
                if not fnmatch(f, 'index.{}'.format(extension)) and \
                   fnmatch(f, '*.{}'.format(extension)) and \
                   is_world_readable(full_path):
                    gemini_files.append(full_path)
    return sorted(gemini_files, key=time_func, reverse=True)[0:n]
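
# Illustrative call (hypothetical tree): return at most five of the newest
# posts under "gemlog/", skipping index files and non-world-readable files:
#
#     >>> find_files("gemlog", os.path.getmtime, n=5)
#     ['gemlog/2021-03-14-pi-day.gmi', ...]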

def urljoin(base, url):
    """
    Return an absolute URL formed by combining the provided base and relative
    URLs.

    This is necessary because the URL-joining functions in Python's urllib
    do not behave as expected when the URL scheme is not recognised, which
    of course gemini:// is not.  Thus, we need to do a little dance: we
    transform gemini URLs into https URLs, join them, and then undo the
    transformation.
    """
    base = urllib.parse.urlsplit(base)
    base = base._replace(scheme="https")
    base = urllib.parse.urlunsplit(base)
    joined = urllib.parse.urljoin(base, url)
    joined = urllib.parse.urlsplit(joined)
    joined = joined._replace(scheme="gemini")
    return urllib.parse.urlunsplit(joined)
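
# A worked example of the dance described above:
#
#     >>> urljoin("gemini://example.com/gemlog/", "2021-03-14-pi-day.gmi")
#     'gemini://example.com/gemlog/2021-03-14-pi-day.gmi'
#
# Joining the gemini URLs directly would not work: urllib.parse.urljoin
# leaves relative references unresolved for schemes it does not know to be
# hierarchical, which is why we borrow https above.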

def populate_entry_from_file(directory, filename, base_url, entry, time_func):
    """
    Set the id, title, updated and link attributes of the provided
    FeedGenerator entry object according to the contents of the named
    Gemini file and the base URL.
    """
    relative_filename = os.path.relpath(filename, start=directory)
    url = urljoin(base_url, relative_filename)
    entry.guid(url)
    entry.link(href=url, rel="alternate")
    updated = get_update_time(filename, time_func)
    entry.updated(updated)
    default_title = os.path.splitext(os.path.basename(filename))[0]
    title = extract_first_heading(filename, default_title)
    entry.title(title)

def get_update_time(filename, time_func):
    """
    Return an update time for a Gemini file.

    If the filename begins with an ISO 8601 date stamp, that date
    (with a time of midnight) will be used.  Otherwise, the file
    "creation time" (which in unix is actually the time of last
    metadata update) will be used instead as a best estimate.
    """
    # Check for leading YYYY-MM-DD
    basename = os.path.basename(filename)
    if re.search("^[0-9]{4}-[01][0-9]-[0-3][0-9]", basename):
        date = basename[0:10] + " Z" # Add UTC marker
        return datetime.datetime.strptime(date, "%Y-%m-%d %z")
    else:
        updated = time_func(filename)
        return datetime.datetime.fromtimestamp(updated, tz=datetime.timezone.utc)
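
# Illustrative examples (hypothetical filenames): a date-stamped name takes
# priority, otherwise time_func supplies the timestamp:
#
#     >>> get_update_time("gemlog/2021-03-14-pi-day.gmi", os.path.getctime)
#     datetime.datetime(2021, 3, 14, 0, 0, tzinfo=datetime.timezone.utc)
#
# For "gemlog/hello.gmi" the ctime (or mtime, with --mtime) would be
# converted to a UTC datetime instead.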

def build_feed(directory, time_func, base_url, output="atom.xml", n=10,
        title="", subtitle="", author="", email="", verbose=False):
    """
    Build an Atom feed for all world readable Gemini files in the current
    directory, and write it to atom.xml.
    """
    # If a title hasn't been provided, try to get one from an index page
    if not title:
        title = get_feed_title(directory)

    # Let user know feed title and URL
    feed_url = urljoin(base_url, output)
    if verbose:
        print('Generating feed "{}", which should be served from {}'.format(title, feed_url))

    # Setup feed
    feed = FeedGenerator()
    feed.id(base_url)
    feed.title(title)
    if subtitle:
        feed.subtitle(subtitle)
    author_details = {}
    if author:
        author_details["name"] = author
    if email:
        author_details["email"] = email
    if author_details:
        feed.author(author_details)
    feed.link(href=feed_url, rel='self')
    feed.link(href=base_url, rel='alternate')

    # Add one entry per Gemini file
    files = find_files(directory, time_func, n)
    if not files:
        if verbose:
            print("No world-readable Gemini content found! :(")
        return
    for i, filename in enumerate(files):
        entry = feed.add_entry()
        populate_entry_from_file(directory, filename, base_url, entry, time_func)
        if i == 0:
            # Files are sorted newest first, so the first entry sets the
            # feed-level updated time
            feed.updated(entry.updated())
        if verbose:
            print("Adding {} with title '{}'...".format(os.path.basename(filename),
                entry.title()))

    # Write file
    output = os.path.join(directory, output)
    feed.atom_file(output, pretty=True)
    if verbose:
        print("Wrote Atom feed to {}.".format(output))

def main():
    """
    Parse command line arguments, do some minor processing, and then invoke
    the build_feed command with the provided settings.
    """

    # Get cwd as default value for --directory
    cwd = os.getcwd()

    # Parse arguments
    parser = argparse.ArgumentParser(description='Generate an Atom feed for Gemini content.')
    parser.add_argument('-a', '--author', dest='author', type=str,
            help="feed author's name")
    parser.add_argument('-b', '--base', dest='base_url', type=str,
            required=True, help='base URL for feed and entries')
    parser.add_argument('-d', '--directory', dest='directory', type=str,
            default=cwd, help='directory to find content and save feed to')
    parser.add_argument('-e', '--email', dest='email', type=str,
            help="feed author's email address")
    parser.add_argument('-n', dest='n', type=int, default=10,
            help='include N most recently created files in feed (default 10)')
    parser.add_argument('-o', '--output', dest='output', type=str,
            default="atom.xml", help='output filename')
    parser.add_argument('-q', '--quiet', dest='verbose', action="store_false",
            help='write nothing to stdout under non-error conditions')
    parser.add_argument('-s', '--subtitle', dest='subtitle', type=str,
            help='feed subtitle')
    parser.add_argument('-t', '--title', dest='title', type=str,
            help='feed title')
    parser.add_argument('--mtime', action="store_true",
            help='use file modification time (mtime), not file change time (ctime), in feeds')
    args = parser.parse_args()

    # Normalise base URL
    base_url = urllib.parse.urlsplit(args.base_url)
    if not base_url.netloc and base_url.path:
        # Handle a naked domain, which urlsplit will interpret as a local path
        base_url = base_url._replace(netloc=base_url.path, path="")
    base_url = base_url._replace(scheme="gemini")
    args.base_url = urllib.parse.urlunsplit(base_url)
    if not args.base_url.endswith("/"):
        args.base_url += "/"

    # Build the feed
    time_function = os.path.getmtime if args.mtime else os.path.getctime
    build_feed(args.directory, time_function, args.base_url, args.output,
            args.n, args.title, args.subtitle, args.author, args.email,
            args.verbose)

if __name__ == "__main__":
    main()