Feed date scraper for Gemini (protocol)
| -rwxr-xr-x | test_zachwalk.py | 19 |
| -rwxr-xr-x | zachwalk.py | 32 |
2 files changed, 21 insertions, 30 deletions
diff --git a/test_zachwalk.py b/test_zachwalk.py index 6a3b579..a535ebc 100755 --- a/test_zachwalk.py +++ b/test_zachwalk.py @@ -1,20 +1,19 @@ #!/usr/bin/env python3 import zachwalk -from dateutil.parser import parse def main(): assert zachwalk.gnd([b'']) == zachwalk.DEFAULT assert zachwalk.gnd([b'2021-01-31']) == zachwalk.DEFAULT - assert zachwalk.gnd([b'=> path.gmi 2021-01-31 - my post']) == parse('2021-01-31').date() - assert zachwalk.gnd([b'=> 2020/11/25/hello-gemini.gmi 2020-11-25 - Hello, Gemini!']) == parse('2020-11-25').date() - assert zachwalk.gnd([b'=> gemini://drewdevault.com/2020/09/21/Gemini-TOFU.gmi 2020-09-21: TOFU recommendations for Gemini']) == parse('2020-09-21').date() - assert zachwalk.gnd(['=> gemini://drewdevault.com/2021/02/15/Status-update-February-2021.gmi 2021-02-15: Status update, February 2021']) == parse('2021-02-15').date() - assert zachwalk.gnd(['=>2021-01-28.gmi 2021-01-28 - RE ew0k: Your Gemini Browser and Server are Probably Doing Certificates Wrong']) == parse('2021-01-28').date() + assert zachwalk.gnd([b'=> path.gmi 2021-01-31 - my post']) == '2021-01-31' + assert zachwalk.gnd([b'=> 2020/11/25/hello-gemini.gmi 2020-11-25 - Hello, Gemini!']) == '2020-11-25' + assert zachwalk.gnd([b'=> gemini://drewdevault.com/2020/09/21/Gemini-TOFU.gmi 2020-09-21: TOFU recommendations for Gemini']) == '2020-09-21' + assert zachwalk.gnd(['=> gemini://drewdevault.com/2021/02/15/Status-update-February-2021.gmi 2021-02-15: Status update, February 2021']) == '2021-02-15' + assert zachwalk.gnd(['=>2021-01-28.gmi 2021-01-28 - RE ew0k: Your Gemini Browser and Server are Probably Doing Certificates Wrong']) == '2021-01-28' assert zachwalk.gnd(['=> atom.xml Atom Feed']) == zachwalk.DEFAULT - assert zachwalk.gnd(['=> geminitoepub.gmi 2021-02-27 Gemini to Epub']) == parse('2021-02-27').date() - assert zachwalk.gnd(['=> m5paper.gmi 2021-01-31 M5Paper']) == parse('2021-01-31').date() - assert zachwalk.gnd(['=> gemini://fossphones.com/03-29-22.gmi 2022-03-29 
Linux Phone News - March 29, 2022']) == parse('2022-03-29').date() - assert zachwalk.gnd(['=> pizza.gmi 1999-01-01 yum', '=> crepes.gmi 2099-01-01 cool']) == parse('2099-01-01').date() + assert zachwalk.gnd(['=> geminitoepub.gmi 2021-02-27 Gemini to Epub']) == '2021-02-27' + assert zachwalk.gnd(['=> m5paper.gmi 2021-01-31 M5Paper']) == '2021-01-31' + assert zachwalk.gnd(['=> gemini://fossphones.com/03-29-22.gmi 2022-03-29 Linux Phone News - March 29, 2022']) == '2022-03-29' + assert zachwalk.gnd(['=> pizza.gmi 1999-01-01 yum', '=> crepes.gmi 2099-01-01 cool']) == '2099-01-01' assert zachwalk.getdesc('=> m5paper.gmi 2021-01-31 M5Paper') == '2021-01-31 M5Paper' diff --git a/zachwalk.py b/zachwalk.py index da03aaf..a77acad 100755 --- a/zachwalk.py +++ b/zachwalk.py @@ -2,7 +2,7 @@ """load links from a gemini file and output the latest date for each one""" __author__ = "Zach DeCook" __email__ = "zachdecook@librem.one" -__copyright__ = "Copyright (C) 2021 Zach DeCook" +__copyright__ = "Copyright (C) 2021-2022 Zach DeCook" __license__ = "AGPL" __version__ = "3" @@ -10,9 +10,8 @@ import sys import socket import ssl import fileinput -from dateutil.parser import parse -DEFAULT = parse('1970-01-01').date() +DEFAULT = "1970-01-01" def getnewestdate(url): """load the url, and find the newest date listed in a link""" @@ -39,28 +38,18 @@ def gnd(fp): line=line.decode('UTF-8') if line.strip()[0:2] == '=>': desc =getdesc(line) - try: - date = parse(desc[0:10],fuzzy=True).date() - if date > nd: - nd = date - except: - try: - date = parse(desc,fuzzy=True).date() - if date > nd: - nd = date - except: - pass - pass + date = desc[0:10] + # basic check if it is a date + if isDate(date) and date > nd: + nd = date return nd def replaceDateIfNewer(desc, newestdate): - try: - tup = parse(desc, fuzzy_with_tokens=True) - date = tup[0].date() - except: + date = desc[0:10] + if not isDate(date): return f'{newestdate} - {desc}' if newestdate > date: - return str(newestdate) + ' '.join(tup[1]) + 
return str(newestdate) + desc[10:] return desc def main(): @@ -80,6 +69,9 @@ def main(): def isAbsGeminiUrl(url): return url[0:9] == 'gemini://' +def isDate(date): + return date[0:4].isnumeric() and date[5:7].isnumeric() and date[8:].isnumeric() + def getdesc(line): return ' '.join(line[2:].strip().replace(' ',' ').split(' ')[1:]).lstrip() |