usfm2gmi - Convert USFM Bibles into gemtext (Python library/utility)


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155

#!/usr/bin/env python3
"""Convert usfm line-by-line into gemtext"""
__author__ = "Zach DeCook"
__email__ = "zachdecook@librem.one"
__copyright__ = "Copyright (C) 2021 Zach DeCook"
__license__ = "AGPL"
__version__ = "3"
import fileinput

def printf(string):
  """Print *string* to standard output without a trailing newline."""
  terminator = ''
  print(string, end=terminator)

def smallcaps(word):
  """Return *word* with ASCII lowercase letters replaced by small-capital glyphs.

  Characters outside 'a'..'z' (uppercase letters, digits, punctuation,
  non-ASCII text) are copied through unchanged.
  """
  # The glyphs are listed in alphabetical order, so a letter's distance
  # from 'a' is its index into the table.
  glyphs = 'ᴀʙᴄᴅᴇғɢʜɪᴊᴋʟᴍɴᴏᴘǫʀsᴛᴜᴠᴡxʏᴢ'
  first = ord('a')
  return ''.join(
    glyphs[ord(ch) - first] if 'a' <= ch <= 'z' else ch
    for ch in word
  )

def superscript(word):
  """Return *word* with characters in the ASCII range '0'..'W' superscripted.

  The replacement table covers the digits, the punctuation between '9'
  and 'A' (which maps to itself) and the capitals A-W; letters with no
  superscript glyph in the table (C, F, Q, S) also map to themselves.
  Everything outside that range is copied through unchanged.
  """
  #TODO: also superscript lowercase letters
  glyphs = '⁰¹²³⁴⁵⁶⁷⁸⁹:;<=>?@ᴬᴮCᴰᴱFᴳᴴᶦᴶᴷᴸᴹᴺᴼᴾQᴿSᵀᵁⱽᵂ'
  # Key the table by code point, starting at '0', so str.translate can
  # do the per-character lookup.
  start = ord('0')
  table = {start + offset: glyph for offset, glyph in enumerate(glyphs)}
  return word.translate(table)

def convert(line, printStrongs=False):
  """Convert one line of USFM into a gemtext string.

  The line is split into whitespace-separated tokens (after padding the
  backslash markers so each becomes its own token). Every token is then
  either translated into gemtext markup, dropped, or appended to the
  output as a word followed by a single space.

  Args:
    line: One line of USFM text.
    printStrongs: When true, the value of a word's ``strong="..."``
      attribute is appended to that word in superscript.

  Returns:
    The converted text as a string. Empty lines and ``\\rem`` lines give
    an empty string; heading markers give a line starting with a newline
    and ``#``, ``##`` or ``###``.
  """
  # TODO: preserve the lack of whitespace before a backslash.
  # Pad markers with spaces so that str.split() isolates each of them.
  padded = line.replace('\\', ' \\')
  for closer in ('\\nd*', '\\+nd*', '\\f*', '\\wj*'):
    padded = padded.replace(closer, closer + ' ')
  padded = padded.replace('\\w*', ' \\w* ').replace('\\+w*', '\\+w* ')
  split = padded.split()

  out = ''
  if not split:
    return out

  # Whole-line markers: headings, blank line, remarks.
  rest = ' '.join(split[1:])
  if split[0] in ('\\mt1', '\\mt', '\\ms', '\\h'):
    return '\n# ' + convert(rest, printStrongs)
  # TODO: parse as word for title tags in title line
  if split[0] in ('\\mt2', '\\s', '\\s1'):
    return '\n## ' + convert(rest, printStrongs)
  if split[0] in ('\\mt3', '\\d', '\\sp'):
    return '\n### ' + convert(rest, printStrongs)
  if split[0] == '\\b':
    return '\n'
  if split[0] == '\\rem':
    return out

  nd = False      # inside \nd ... \nd*: words are rendered in small caps
  superS = False  # inside \+sup ... \+sup*: words are superscripted
  skip = 0        # number of upcoming tokens to drop
  for word in split:
    if skip > 0:
      skip -= 1
    elif word in ('\\id', '\\ide'):
      skip = 1
    elif word in ('\\v', '\\c'):
      skip = 1  # drop the verse/chapter number that follows
    elif word in ('\\p', '\\m'):
      out += '\n'
    elif word in ('\\pi', '\\pi1', '\\mi'):
      out += '\n\t'
    elif word == '\\li1':
      out += '\n* '
    elif word in ('\\q', '\\q1'):
      out += '\n> '
    elif word in ('\\q2', '\\q22'): # \q22 is bad input
      out += '\n>\t'
    elif word == '\\q3':
      out += '\n>\t\t'
    elif word == '\\qs':
      out += '\t'
    elif word == '\\r':
      out += '\n> '
    elif word in ('\\em', '\\it'):
      out += '*'
    elif word in ('\\em*', '\\it*'):
      # Close the emphasis right after the last word, not after its space.
      out = out.rstrip() + '*'
    elif word in ('\\nd', '\\+nd'):
      nd = True
    elif word in ('\\nd*', '\\+nd*'):
      nd = False
    # Footnotes (https://ubsicap.github.io/usfm/notes_basic/fnotes.html)
    elif word == '\\f':
      out += '['
      skip = 1 # the next character is the footnote caller
    elif word == '\\fr':
      skip = 1 # verse reference not necessary for inline fn
    elif word == '\\f*':
      out += ']'
    # Cross-references (https://ubsicap.github.io/usfm/notes_basic/xrefs.html)
    elif word == '\\x':
      out += '('
      skip = 1 # next character is xref caller
    elif word == '\\xo':
      skip = 1 # verse reference not necessary for inline xref
    elif word == '\\x*':
      out += ')'
    # TODO: support Endnotes (\fe and \fe*)
    # TODO: fancy formatting of more types (\ft)
    # Markers that produce no output of their own; \w and \+w wrap words
    # which appear in the glossary.
    elif word in ('\\qs*', '\\wj', '\\wj*', '\\xt', '\\ft',
                  '\\w', '\\w*', '\\+w', '\\+w*', '\\nb'):
      continue
    elif word == '\\+sup':
      superS = True
    elif word == '\\+sup*':
      superS = False
    elif '|strong="' in word:
      # Token looks like  word|strong="H1234"  -- keep the word and
      # optionally the attribute value (text between the quotes).
      spl = word.split('|')
      out += spl[0]
      if printStrongs:
        out += superscript(spl[1][8:-1])
      out += ' '
    elif 'x-morph="' in word:
      continue
    # Remove those extra spaces that sneak in.
    elif word in (',', '.', ';', '”', ',”', '.”', '?”', ')', ':', '!', '?', '.’', '.’”', '?’”', '?’', ';”', '!”', ');', '),', '’s', '.)'):
      # endswith() is safe on an empty string, unlike out[-1], so a line
      # that starts with punctuation no longer raises IndexError.
      if out.endswith(' '):
        out = out[:-1]
      out += word + ' '
    elif word in ('“', '(', '‘'):
      out += word
    # Ordinary word: emit it exactly once. These must be elif branches,
    # otherwise a word inside \nd is written both in small caps and plain.
    elif nd:
      out += smallcaps(word) + ' '
    elif superS:
      out += superscript(word) + ' '
    else:
      out += word + ' '
  return out

def main():
  """Stream USFM through the converter, one line at a time.

  Input lines come from fileinput (files named on the command line, or
  stdin when none are given); the converted text goes to stdout.
     ./usfm2gmi <in.usfm >out.md
  """
  for gmi in map(convert, fileinput.input()):
    printf(gmi)

# Run the converter only when this file is executed as a script, so that
# importing it as a library has no side effects.
if __name__ == '__main__':
  main()