zachwalk - Feed date scraper for Gemini (protocol)


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55

#!/usr/bin/env python3
"""load links from a gemini file and output the latest date for each one"""

import sys
import socket
import ssl
from dateutil.parser import parse

# Fallback returned by gnd() when no link line yields a parseable date.
# NOTE: this is a str, while a successful lookup returns the result of
# dateutil's parse(...).date(); callers only format it into output.
DEFAULT = '1970-01-01'

def getnewestdate(url):
    """Fetch *url* over Gemini and return the date found in its link list.

    Opens a TLS connection to the URL's host (port taken from the URL,
    defaulting to 1965), sends the request line, discards the response
    header and hands the body to gnd().

    Certificate and hostname verification are disabled on purpose, as in
    the original code (Gemini capsules commonly use self-signed certs).

    Returns whatever gnd() returns: a date, or DEFAULT if none was found.
    Raises ValueError if *url* has no host part, and OSError (including
    ssl.SSLError) on connection or TLS failures.
    """
    # Function-scope import so the block stays self-contained.
    from urllib.parse import urlsplit

    # TODO: outsource to pre-installed cli program?
    parts = urlsplit(url)
    hostname = parts.hostname
    if not hostname:
        raise ValueError(f'no hostname in url: {url!r}')
    # Honour an explicit port (gemini://host:1966/...) instead of treating
    # "host:1966" as the hostname.
    port = parts.port or 1965

    # PROTOCOL_TLS_CLIENT replaces the deprecated argument-less SSLContext().
    # check_hostname must be cleared before verify_mode can be CERT_NONE.
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = False
    context.verify_mode = ssl.CERT_NONE

    # Context managers make sure the socket, the TLS wrapper and the file
    # object are closed even when parsing or the connection fails.
    with socket.create_connection((hostname, port)) as raw:
        with context.wrap_socket(raw, server_hostname=hostname) as s:
            s.sendall((url + '\r\n').encode("UTF-8"))
            with s.makefile("rb") as fp:
                # Skip the response header line; the body follows it.
                # TODO: something special if status is not 2x
                fp.readline()
                return gnd(fp)
def gnd(fp):
    """Return the date of the first link line in *fp* that parses as a date.

    fp: iterable of bytes lines (the body of a Gemini response).

    Each line starting with '=>' (after leading whitespace) has its
    description extracted with getdesc() and fed to dateutil's fuzzy
    parser. The first successful parse wins; link lines whose text does
    not parse are skipped.

    Returns the parsed date, or DEFAULT when no link line yields one.
    """
    for raw in fp:
        # Replace undecodable bytes rather than aborting the whole scan on
        # one bad line.
        line = raw.decode('UTF-8', errors='replace')
        if not line.lstrip().startswith('=>'):
            continue
        desc = getdesc(line)
        desc = desc.replace(':', ' :')  # quirk for parse on drew's capsule
        try:
            # todo: read lots of these and compare them
            return parse(desc, fuzzy=True).date()
        except (ValueError, OverflowError, TypeError):
            # Not a date: try the next link. Unlike a bare except, this
            # lets KeyboardInterrupt / SystemExit propagate.
            continue
    return DEFAULT

def main(argv):
    """Print each link in the gemtext file argv[1], annotated with a date.

    For every line starting with '=>', the second space-separated field is
    taken as the URL, getnewestdate() is called on it, and a line of the
    form '=> URL DATE - DESCRIPTION' is printed to stdout.

    Link lines without a usable URL are skipped, and a capsule that cannot
    be fetched falls back to DEFAULT; both are reported on stderr so one
    bad entry does not abort the whole run.

    Raises SystemExit with a usage message when no file argument is given.
    """
    if len(argv) < 2:
        prog = argv[0] if argv else 'zachwalk'
        raise SystemExit(f'usage: {prog} FILE')

    with open(argv[1], encoding='utf-8') as f:
        for line in f:
            if not line.startswith('=>'):
                continue
            # plz don't use multiple spaces.
            fields = line.split(' ')
            # strip() drops the newline that sticks to the URL when the
            # link has no description.
            url = fields[1].strip() if len(fields) > 1 else ''
            if not url:
                print(f'skipping malformed link line: {line!r}',
                      file=sys.stderr)
                continue
            # desc keeps the line's trailing newline, as before.
            desc = getdesc(line)
            try:
                newestdate = getnewestdate(url)
            except (OSError, ValueError, IndexError) as err:
                # Network/TLS failure or unusable URL: keep going.
                print(f'could not fetch {url}: {err}', file=sys.stderr)
                newestdate = DEFAULT
            print(f'=> {url} {newestdate} - {desc}')

def getdesc(line):
    """Return everything after the second single space in *line*.

    For a link line '=> URL TEXT' this is TEXT (including any trailing
    newline); an empty string is returned when there is no third field.
    """
    pieces = line.split(' ', 2)
    if len(pieces) < 3:
        return ''
    return pieces[2]

# Script entry point: pass the full argument vector to main(), which reads
# the link file named by argv[1].
if __name__ == '__main__':
  main(sys.argv)