Feed date scraper for Gemini (protocol)
-rwxr-xr-x test_zachwalk.py 19
-rwxr-xr-x zachwalk.py 32
2 files changed, 21 insertions, 30 deletions
diff --git a/test_zachwalk.py b/test_zachwalk.py
index 6a3b579..a535ebc 100755
--- a/test_zachwalk.py
+++ b/test_zachwalk.py
@@ -1,20 +1,19 @@
#!/usr/bin/env python3
import zachwalk
-from dateutil.parser import parse
def main():
assert zachwalk.gnd([b'']) == zachwalk.DEFAULT
assert zachwalk.gnd([b'2021-01-31']) == zachwalk.DEFAULT
- assert zachwalk.gnd([b'=> path.gmi 2021-01-31 - my post']) == parse('2021-01-31').date()
- assert zachwalk.gnd([b'=> 2020/11/25/hello-gemini.gmi 2020-11-25 - Hello, Gemini!']) == parse('2020-11-25').date()
- assert zachwalk.gnd([b'=> gemini://drewdevault.com/2020/09/21/Gemini-TOFU.gmi 2020-09-21: TOFU recommendations for Gemini']) == parse('2020-09-21').date()
- assert zachwalk.gnd(['=> gemini://drewdevault.com/2021/02/15/Status-update-February-2021.gmi 2021-02-15: Status update, February 2021']) == parse('2021-02-15').date()
- assert zachwalk.gnd(['=>2021-01-28.gmi 2021-01-28 - RE ew0k: Your Gemini Browser and Server are Probably Doing Certificates Wrong']) == parse('2021-01-28').date()
+ assert zachwalk.gnd([b'=> path.gmi 2021-01-31 - my post']) == '2021-01-31'
+ assert zachwalk.gnd([b'=> 2020/11/25/hello-gemini.gmi 2020-11-25 - Hello, Gemini!']) == '2020-11-25'
+ assert zachwalk.gnd([b'=> gemini://drewdevault.com/2020/09/21/Gemini-TOFU.gmi 2020-09-21: TOFU recommendations for Gemini']) == '2020-09-21'
+ assert zachwalk.gnd(['=> gemini://drewdevault.com/2021/02/15/Status-update-February-2021.gmi 2021-02-15: Status update, February 2021']) == '2021-02-15'
+ assert zachwalk.gnd(['=>2021-01-28.gmi 2021-01-28 - RE ew0k: Your Gemini Browser and Server are Probably Doing Certificates Wrong']) == '2021-01-28'
assert zachwalk.gnd(['=> atom.xml Atom Feed']) == zachwalk.DEFAULT
- assert zachwalk.gnd(['=> geminitoepub.gmi 2021-02-27 Gemini to Epub']) == parse('2021-02-27').date()
- assert zachwalk.gnd(['=> m5paper.gmi 2021-01-31 M5Paper']) == parse('2021-01-31').date()
- assert zachwalk.gnd(['=> gemini://fossphones.com/03-29-22.gmi 2022-03-29 Linux Phone News - March 29, 2022']) == parse('2022-03-29').date()
- assert zachwalk.gnd(['=> pizza.gmi 1999-01-01 yum', '=> crepes.gmi 2099-01-01 cool']) == parse('2099-01-01').date()
+ assert zachwalk.gnd(['=> geminitoepub.gmi 2021-02-27 Gemini to Epub']) == '2021-02-27'
+ assert zachwalk.gnd(['=> m5paper.gmi 2021-01-31 M5Paper']) == '2021-01-31'
+ assert zachwalk.gnd(['=> gemini://fossphones.com/03-29-22.gmi 2022-03-29 Linux Phone News - March 29, 2022']) == '2022-03-29'
+ assert zachwalk.gnd(['=> pizza.gmi 1999-01-01 yum', '=> crepes.gmi 2099-01-01 cool']) == '2099-01-01'
assert zachwalk.getdesc('=> m5paper.gmi 2021-01-31 M5Paper') == '2021-01-31 M5Paper'
diff --git a/zachwalk.py b/zachwalk.py
index da03aaf..a77acad 100755
--- a/zachwalk.py
+++ b/zachwalk.py
@@ -2,7 +2,7 @@
"""load links from a gemini file and output the latest date for each one"""
__author__ = "Zach DeCook"
__email__ = "zachdecook@librem.one"
-__copyright__ = "Copyright (C) 2021 Zach DeCook"
+__copyright__ = "Copyright (C) 2021-2022 Zach DeCook"
__license__ = "AGPL"
__version__ = "3"
@@ -10,9 +10,8 @@ import sys
import socket
import ssl
import fileinput
-from dateutil.parser import parse
-DEFAULT = parse('1970-01-01').date()
+DEFAULT = "1970-01-01"
def getnewestdate(url):
"""load the url, and find the newest date listed in a link"""
@@ -39,28 +38,18 @@ def gnd(fp):
line=line.decode('UTF-8')
if line.strip()[0:2] == '=>':
desc =getdesc(line)
- try:
- date = parse(desc[0:10],fuzzy=True).date()
- if date > nd:
- nd = date
- except:
- try:
- date = parse(desc,fuzzy=True).date()
- if date > nd:
- nd = date
- except:
- pass
- pass
+ date = desc[0:10]
+ # basic check if it is a date
+ if isDate(date) and date > nd:
+ nd = date
return nd
def replaceDateIfNewer(desc, newestdate):
- try:
- tup = parse(desc, fuzzy_with_tokens=True)
- date = tup[0].date()
- except:
+ date = desc[0:10]
+ if not isDate(date):
return f'{newestdate} - {desc}'
if newestdate > date:
- return str(newestdate) + ' '.join(tup[1])
+ return str(newestdate) + desc[10:]
return desc
def main():
@@ -80,6 +69,9 @@ def main():
def isAbsGeminiUrl(url):
    """Tell whether *url* begins with the literal ``gemini://`` prefix.

    The comparison is case-sensitive and looks only at the leading
    characters of *url*; nothing after the prefix is inspected.
    """
    prefix = 'gemini://'
    head = url[:len(prefix)]
    return head == prefix
def isDate(date):
    """Return True if *date* is shaped exactly like ``YYYY-MM-DD``.

    This is a format check only: ten characters, ASCII digits in the
    year/month/day positions, and ``-`` at indexes 4 and 7.  Month and
    day ranges are not validated.

    The strictness matters because dates in this module are compared as
    plain strings (``date > nd``), and that ordering is chronological
    only when every value uses the same zero-padded, dash-separated
    ASCII layout.  Anything else (e.g. ``1234567890``, ``2021/01/31`` or
    non-ASCII numerals) is rejected.
    """
    if len(date) != 10 or date[4] != '-' or date[7] != '-':
        return False
    digits = date[0:4] + date[5:7] + date[8:10]
    # Explicit ASCII membership: str.isnumeric()/isdigit() also accept
    # characters such as full-width digits or superscripts.
    return all(ch in '0123456789' for ch in digits)
+
def getdesc(line):
    """Return the description part of a gemtext link line.

    A link line has the form ``=>[ws]URL[ws DESCRIPTION]``.  The ``=>``
    marker and the URL are dropped and the remaining text is returned
    without leading or trailing whitespace.  An empty string is returned
    when the line has no description (or no URL at all).

    Whitespace inside the description is preserved as-is.
    """
    # Strip before removing the marker so that an indented link line
    # (which gnd() accepts via line.strip()[0:2] == '=>') is handled too.
    rest = line.strip()[2:].strip()
    # Split at the first run of whitespace (spaces or tabs) so a
    # tab-separated URL and label are recognised as well.
    parts = rest.split(None, 1)
    if len(parts) < 2:
        return ''
    return parts[1]