Feed date scraper for Gemini (protocol)
* fix parsing link lines with tabs
Zach DeCook 2021-04-08
parent f54a46a · commit 1c18fc1
-rwxr-xr-x test_zachwalk.py 3
-rwxr-xr-x zachwalk.py 3
2 files changed, 5 insertions, 1 deletions
diff --git a/test_zachwalk.py b/test_zachwalk.py
index 0576ac5..454dc56 100755
--- a/test_zachwalk.py
+++ b/test_zachwalk.py
@@ -12,6 +12,9 @@ def main():
assert zachwalk.gnd(['=>2021-01-28.gmi 2021-01-28 - RE ew0k: Your Gemini Browser and Server are Probably Doing Certificates Wrong']) == parse('2021-01-28').date()
assert zachwalk.gnd(['=> atom.xml Atom Feed']) == zachwalk.DEFAULT
assert zachwalk.gnd(['=> geminitoepub.gmi 2021-02-27 Gemini to Epub']) == parse('2021-02-27').date()
+ assert zachwalk.gnd(['=> m5paper.gmi 2021-01-31 M5Paper']) == parse('2021-01-31').date()
+
+ assert zachwalk.getdesc('=> m5paper.gmi 2021-01-31 M5Paper') == '2021-01-31 M5Paper'
if __name__ == '__main__':
main()
diff --git a/zachwalk.py b/zachwalk.py
index b65df8c..145c174 100755
--- a/zachwalk.py
+++ b/zachwalk.py
@@ -60,6 +60,7 @@ def replaceDateIfNewer(desc, newestdate):
def main():
for line in fileinput.input(): #stdin or file from argv
if line[0:2] == '=>':
+ # don't use tabs
url = line[2:].strip().split(' ')[0]
if isAbsGeminiUrl(url):
desc = getdesc(line)
@@ -74,7 +75,7 @@ def isAbsGeminiUrl(url):
return url[0:9] == 'gemini://'
def getdesc(line):
- return ' '.join(line[2:].strip().split(' ')[1:])
+ return ' '.join(line[2:].strip().replace(' ',' ').split(' ')[1:]).lstrip()
if __name__ == '__main__':
main()