Feed date scraper for Gemini (protocol)
getnewestdate: load url and parse first date from a link
Zach DeCook 2021-02-04
parent c5d4f0e · commit fb1a7fc
-rwxr-xr-x zachwalk.py 25
1 files changed, 25 insertions, 0 deletions
diff --git a/zachwalk.py b/zachwalk.py
index cd3e9f3..ad16746 100755
--- a/zachwalk.py
+++ b/zachwalk.py
@@ -2,9 +2,34 @@
"""load links from a gemini file and output the latest date for each one"""
import sys
+import socket
+import ssl
+from dateutil.parser import parse
def getnewestdate(url):
    """Load a Gemini page and return the first date found in one of its links.

    Opens a TLS connection to the host named in *url* (using the port given
    in the URL, or 1965 when none is given), sends the URL as the request
    line, skips the response header line, and scans the body for link lines
    (lines starting with ``=>``).  The first link line from which dateutil
    can fuzzily parse a date determines the result.

    Args:
        url: Absolute URL including scheme and host, e.g.
            ``gemini://example.org/feed.gmi``.

    Returns:
        A ``datetime.date`` for the first link line containing a parseable
        date, or the string ``'1970-01-01'`` when no link line yields one.
        (The string fallback is kept as-is for existing callers.)

    Raises:
        ValueError: if no host can be extracted from *url*.
        OSError: on connection or TLS failures.
    """
    # Imported locally so this function does not depend on a file-level
    # import that may not exist.
    from urllib.parse import urlsplit

    parts = urlsplit(url)
    hostname = parts.hostname
    if not hostname:
        raise ValueError("cannot determine host from URL: {!r}".format(url))
    port = parts.port or 1965

    # TODO: outsource to pre-installed cli program?
    # Certificate verification is deliberately disabled.  check_hostname must
    # be switched off before verify_mode can be set to CERT_NONE.
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = False
    context.verify_mode = ssl.CERT_NONE

    # Context managers make sure the socket, the TLS wrapper and the file
    # object are closed on every exit path, including the early return.
    with socket.create_connection((hostname, port)) as raw_sock:
        with context.wrap_socket(raw_sock, server_hostname=hostname) as tls_sock:
            tls_sock.sendall((url + '\r\n').encode("UTF-8"))
            with tls_sock.makefile("rb") as fp:
                # Consume the response header so it is not scanned as body.
                # TODO: something special if status is not 2x
                fp.readline()
                for raw_line in fp:
                    # Undecodable bytes must not abort the whole scan.
                    line = raw_line.decode('UTF-8', errors='replace')
                    if not line.strip().startswith('=>'):
                        continue
                    try:
                        # todo: read lots of these and compare them
                        return parse(line, fuzzy=True).date()
                    except (ValueError, OverflowError):
                        # No usable date in this link; try the next one.
                        continue
    return '1970-01-01'
with open(sys.argv[1]) as f: