Text.Lexer


   0 | module Text.Lexer
   1 | 
   2 | import Data.Bool
   3 | import Data.List
   4 | import Data.Nat
   5 | 
   6 | import public Text.Lexer.Core
   7 | import public Text.Quantity
   8 | import public Text.Token
   9 | 
  10 | %default total
  11 | 
  12 | export
  13 | toTokenMap : List (Lexer, k) -> TokenMap (Token k)
  14 | toTokenMap = map $ \(l, kind) => (l, Tok kind)
  15 | 
  16 | ||| Recognise any character.
  17 | ||| /./
  18 | export
  19 | any : Lexer
  20 | any = pred (const True)
  21 | 
  22 | ||| Recognise a lexer or recognise no input. This is not guaranteed
  23 | ||| to consume input.
  24 | ||| /`l`?/
  25 | export
  26 | opt : (l : Lexer) -> Recognise False
  27 | opt l = l <|> empty
  28 | 
  29 | ||| Recognise any character if the sub-lexer `l` fails.
  30 | ||| /(?!`l`)./
  31 | export
  32 | non : (l : Lexer) -> Lexer
  33 | non l = reject l <+> any
  34 | 
  35 | ||| Produce recognisers by applying a function to elements of a container, and
  36 | ||| recognise the first match. Consumes input if the function produces consuming
  37 | ||| recognisers. Fails if the container is empty.
  38 | export
  39 | choiceMap : {c : Bool} ->
  40 |             Foldable t => (a -> Recognise c) -> t a -> Recognise c
  41 | choiceMap {c} f xs = foldr (\x, acc => rewrite sym (andSameNeutral c) in
  42 |                                                f x <|> acc)
  43 |                            fail xs
  44 | 
  45 | ||| Recognise the first matching recogniser in a container. Consumes input if
  46 | ||| recognisers in the list consume. Fails if the container is empty.
  47 | export
  48 | choice : {c : _} ->
  49 |          Foldable t => t (Recognise c) -> Recognise c
  50 | choice = choiceMap id
  51 | 
  52 | ||| Sequence a list of recognisers. Guaranteed to consume input if the list is
  53 | ||| non-empty and the recognisers consume.
  54 | export
  55 | concat : {c : _} ->
  56 |          (xs : List (Recognise c)) -> Recognise (isCons xs && c)
  57 | concat = Lexer.Core.concatMap id
  58 | 
  59 | ||| Recognise a specific character.
  60 | ||| /[`x`]/
  61 | export
  62 | is : (x : Char) -> Lexer
  63 | is x = pred (==x)
  64 | 
  65 | ||| Recognise anything but the given character.
  66 | ||| /[\^`x`]/
  67 | export
  68 | isNot : (x : Char) -> Lexer
  69 | isNot x = pred (/=x)
  70 | 
  71 | ||| Recognise a specific character (case-insensitive).
  72 | ||| /[`x`]/i
  73 | export
  74 | like : (x : Char) -> Lexer
  75 | like x = pred (\y => toUpper x == toUpper y)
  76 | 
  77 | ||| Recognise anything but the given character (case-insensitive).
  78 | ||| /[\^`x`]/i
  79 | export
  80 | notLike : (x : Char) -> Lexer
  81 | notLike x = pred (\y => toUpper x /= toUpper y)
  82 | 
  83 | ||| Recognise a specific string.
  84 | ||| Fails if the string is empty.
  85 | ||| /`str`/
  86 | export
  87 | exact : (str : String) -> Lexer
  88 | exact str = case unpack str of
  89 |                  [] => fail
  90 |                  (x :: xs) => concatMap is (x :: xs)
  91 | 
  92 | ||| Recognise a specific string (case-insensitive).
  93 | ||| Fails if the string is empty.
  94 | ||| /`str`/i
  95 | export
  96 | approx : (str : String) -> Lexer
  97 | approx str = case unpack str of
  98 |                   [] => fail
  99 |                   (x :: xs) => concatMap like (x :: xs)
 100 | 
 101 | ||| Recognise any of the characters in the given string.
 102 | ||| /[`chars`]/
 103 | export
 104 | oneOf : (chars : String) -> Lexer
 105 | oneOf chars = pred (\x => x `elem` unpack chars)
 106 | 
 107 | ||| Recognise a character range. Also works in reverse!
 108 | ||| /[`start`-`end`]/
 109 | export
 110 | range : (start : Char) -> (end : Char) -> Lexer
 111 | range start end = pred (\x => (x >= min start end)
 112 |                            && (x <= max start end))
 113 | 
 114 | mutual
 115 |   ||| Recognise a sequence of at least one sub-lexers
 116 |   ||| /`l`+/
 117 |   export
 118 |   some : Lexer -> Lexer
 119 |   some l = l <+> many l
 120 | 
 121 |   ||| Recognise a sequence of at zero or more sub-lexers. This is not
 122 |   ||| guaranteed to consume input
 123 |   ||| /`l`\*/
 124 |   export
 125 |   many : Lexer -> Recognise False
 126 |   many l = opt (some l)
 127 | 
 128 | ||| Repeat the sub-lexer `l` one or more times until the lexer
 129 | ||| `stopBefore` is encountered. `stopBefore` will not be consumed.
 130 | ||| /((?!`stopBefore`)`l`)\+/
 131 | export
 132 | someUntil : (stopBefore : Recognise c) -> (l : Lexer) -> Lexer
 133 | someUntil stopBefore l = some (reject stopBefore <+> l)
 134 | 
 135 | ||| Repeat the sub-lexer `l` zero or more times until the lexer
 136 | ||| `stopBefore` is encountered. `stopBefore` will not be consumed.
 137 | ||| Not guaranteed to consume input.
 138 | ||| /((?!`stopBefore`)`l`)\*/
 139 | export
 140 | manyUntil : (stopBefore : Recognise c) -> (l : Lexer) -> Recognise False
 141 | manyUntil stopBefore l = many (reject stopBefore <+> l)
 142 | 
 143 | ||| Repeat the sub-lexer `l` zero or more times until the lexer
 144 | ||| `stopAfter` is encountered, and consume it. Guaranteed to
 145 | ||| consume if `stopAfter` consumes.
 146 | ||| /`l`\*?`stopAfter`/
 147 | export
 148 | manyThen : (stopAfter : Recognise c) -> (l : Lexer) -> Recognise c
 149 | manyThen stopAfter l = manyUntil stopAfter l <+> stopAfter
 150 | 
 151 | ||| Recognise a sub-lexer repeated as specified by `q`. Fails if `q` has
 152 | ||| `min` and `max` in the wrong order. Consumes input unless `min q` is zero.
 153 | ||| /`l`{`q`}/
 154 | export
 155 | count : (q : Quantity) -> (l : Lexer) -> Recognise (isSucc (min q))
 156 | count (Qty Z Nothing) l = many l
 157 | count (Qty Z (Just Z)) _ = empty
 158 | count (Qty Z (Just (S max))) l = opt $ l <+> count (atMost max) l
 159 | count (Qty (S min) Nothing) l = l <+> count (atLeast min) l
 160 | count (Qty (S min) (Just Z)) _ = fail
 161 | count (Qty (S min) (Just (S max))) l = l <+> count (between min max) l
 162 | 
 163 | ||| Recognise a single digit 0-9
 164 | ||| /[0-9]/
 165 | export
 166 | digit : Lexer
 167 | digit = pred isDigit
 168 | 
 169 | ||| Recognise one or more digits
 170 | ||| /[0-9]+/
 171 | export
 172 | digits : Lexer
 173 | digits = some digit
 174 | 
 175 | ||| Recognise a single binary digit
 176 | ||| /[0-1]/
 177 | export
 178 | binDigit : Lexer
 179 | binDigit = pred (\c => c == '0' || c == '1')
 180 | 
 181 | ||| Recognise one or more binary digits
 182 | ||| /[0-1]+/
 183 | export
 184 | binDigits : Lexer
 185 | binDigits = some binDigit
 186 | 
 187 | ||| Recognise a single hexadecimal digit
 188 | ||| /[0-9A-Fa-f]/
 189 | export
 190 | hexDigit : Lexer
 191 | hexDigit = pred isHexDigit
 192 | 
 193 | ||| Recognise one or more hexadecimal digits
 194 | ||| /[0-9A-Fa-f]+/
 195 | export
 196 | hexDigits : Lexer
 197 | hexDigits = some hexDigit
 198 | 
 199 | ||| Recognise a single octal digit
 200 | ||| /[0-8]/
 201 | export
 202 | octDigit : Lexer
 203 | octDigit = pred isOctDigit
 204 | 
 205 | ||| Recognise one or more octal digits
 206 | ||| /[0-8]+/
 207 | export
 208 | octDigits : Lexer
 209 | octDigits = some octDigit
 210 | 
 211 | ||| Recognise a single alpha character
 212 | ||| /[A-Za-z]/
 213 | export
 214 | alpha : Lexer
 215 | alpha = pred isAlpha
 216 | 
 217 | ||| Recognise one or more alpha characters
 218 | ||| /[A-Za-z]+/
 219 | export
 220 | alphas : Lexer
 221 | alphas = some alpha
 222 | 
 223 | ||| Recognise a lowercase alpha character
 224 | ||| /[a-z]/
 225 | export
 226 | lower : Lexer
 227 | lower = pred isLower
 228 | 
 229 | ||| Recognise one or more lowercase alpha characters
 230 | ||| /[a-z]+/
 231 | export
 232 | lowers : Lexer
 233 | lowers = some lower
 234 | 
 235 | ||| Recognise an uppercase alpha character
 236 | ||| /[A-Z]/
 237 | export
 238 | upper : Lexer
 239 | upper = pred isUpper
 240 | 
 241 | ||| Recognise one or more uppercase alpha characters
 242 | ||| /[A-Z]+/
 243 | export
 244 | uppers : Lexer
 245 | uppers = some upper
 246 | 
 247 | ||| Recognise an alphanumeric character
 248 | ||| /[A-Za-z0-9]/
 249 | export
 250 | alphaNum : Lexer
 251 | alphaNum = pred isAlphaNum
 252 | 
 253 | ||| Recognise one or more alphanumeric characters
 254 | ||| /[A-Za-z0-9]+/
 255 | export
 256 | alphaNums : Lexer
 257 | alphaNums = some alphaNum
 258 | 
 259 | ||| Recognise a single whitespace character
 260 | ||| /\\s/
 261 | export
 262 | space : Lexer
 263 | space = pred isSpace
 264 | 
 265 | ||| Recognise one or more whitespace characters
 266 | ||| /\\s+/
 267 | export
 268 | spaces : Lexer
 269 | spaces = some space
 270 | 
 271 | ||| Recognise a single newline sequence. Understands CRLF, CR, and LF
 272 | ||| /\\r\\n|[\\r\\n]/
 273 | export
 274 | newline : Lexer
 275 | newline = let crlf = "\r\n" in
 276 |               exact crlf <|> oneOf crlf
 277 | 
 278 | ||| Recognise one or more newline sequences. Understands CRLF, CR, and LF
 279 | ||| /(\\r\\n|[\\r\\n])+)/
 280 | export
 281 | newlines : Lexer
 282 | newlines = some newline
 283 | 
 284 | ||| Recognise a single non-whitespace, non-alphanumeric character
 285 | ||| /[\^\\sA-Za-z0-9]/
 286 | export
 287 | symbol : Lexer
 288 | symbol = pred (\x => not (isSpace x || isAlphaNum x))
 289 | 
 290 | ||| Recognise one or more non-whitespace, non-alphanumeric characters
 291 | ||| /[\^\\sA-Za-z0-9]+/
 292 | export
 293 | symbols : Lexer
 294 | symbols = some symbol
 295 | 
 296 | ||| Recognise a single control character
 297 | ||| /[\\x00-\\x1f\\x7f-\\x9f]/
 298 | export
 299 | control : Lexer
 300 | control = pred isControl
 301 | 
 302 | ||| Recognise one or more control characters
 303 | ||| /[\\x00-\\x1f\\x7f-\\x9f]+/
 304 | export
 305 | controls : Lexer
 306 | controls = some control
 307 | 
 308 | ||| Recognise zero or more occurrences of a sub-lexer between
 309 | ||| delimiting lexers
 310 | ||| /`start`(`l`)\*?`end`/
 311 | export
 312 | surround : (start : Lexer) -> (end : Lexer) -> (l : Lexer) -> Lexer
 313 | surround start end l = start <+> manyThen end l
 314 | 
 315 | ||| Recognise zero or more occurrences of a sub-lexer surrounded
 316 | ||| by the same quote lexer on both sides (useful for strings)
 317 | ||| /`q`(`l`)\*?`q`/
 318 | export
 319 | quote : (q : Lexer) -> (l : Lexer) -> Lexer
 320 | quote q l = surround q q l
 321 | 
 322 | ||| Recognise an escape sub-lexer (often '\\') followed by
 323 | ||| another sub-lexer
 324 | ||| /[`esc`]`l`/
 325 | export
 326 | escape : (esc : Lexer) -> Lexer -> Lexer
 327 | escape esc l = esc <+> l
 328 | 
 329 | ||| Recognise a string literal, including escaped characters.
 330 | ||| (Note: doesn't yet handle escape sequences such as \123)
 331 | ||| /"(\\\\.|.)\*?"/
 332 | export
 333 | stringLit : Lexer
 334 | stringLit = quote (is '"') (escape (is '\\') any <|> any)
 335 | 
 336 | ||| Recognise a character literal, including escaped characters.
 337 | ||| (Note: doesn't yet handle escape sequences such as \123)
 338 | ||| /'(\\\\.|[\^'])'/
 339 | export
 340 | charLit : Lexer
 341 | charLit = let q = '\'' in
 342 |               is q <+> (escape (is '\\') (control <|> any) <|> isNot q) <+> is q
 343 |   where
 344 |     lexStr : List String -> Lexer
 345 |     lexStr [] = fail
 346 |     lexStr (t :: ts) = exact t <|> lexStr ts
 347 | 
 348 |     control : Lexer
 349 |     control = lexStr ["NUL", "SOH", "STX", "ETX", "EOT", "ENQ", "ACK", "BEL",
 350 |                       "BS",  "HT",  "LF",  "VT",  "FF",  "CR",  "SO",  "SI",
 351 |                       "DLE", "DC1", "DC2", "DC3", "DC4", "NAK", "SYN", "ETB",
 352 |                       "CAN", "EM",  "SUB", "ESC", "FS",  "GS",  "RS",  "US",
 353 |                       "SP",  "DEL"]
 354 |                 <|> (is 'x' <+> hexDigits)
 355 |                 <|> (is 'o' <+> octDigits)
 356 |                 <|> digits
 357 | 
 358 | ||| Recognise an integer literal (possibly with a '-' prefix)
 359 | ||| /-?[0-9]+/
 360 | export
 361 | intLit : Lexer
 362 | intLit = opt (is '-') <+> digits
 363 | 
 364 | ||| Recognise a binary literal, prefixed by "0b"
 365 | ||| /0b[0-1]+/
 366 | export
 367 | binLit : Lexer
 368 | binLit = exact "0b" <+> binDigits
 369 | 
 370 | ||| Recognise a hexadecimal literal, prefixed by "0x" or "0X"
 371 | ||| /0[Xx][0-9A-Fa-f]+/
 372 | export
 373 | hexLit : Lexer
 374 | hexLit = approx "0x" <+> hexDigits
 375 | 
 376 | ||| Recognise an octal literal, prefixed by "0o"
 377 | ||| /0o[0-9A-Fa-f]+/
 378 | export
 379 | octLit : Lexer
 380 | octLit = exact "0o" <+> octDigits
 381 | 
 382 | export
 383 | digitsUnderscoredLit : Lexer
 384 | digitsUnderscoredLit = digits <+> many (is '_' <+> digits)
 385 | 
 386 | export
 387 | binUnderscoredLit : Lexer
 388 | binUnderscoredLit = binLit <+> many (is '_' <+> binDigits)
 389 | 
 390 | export
 391 | hexUnderscoredLit : Lexer
 392 | hexUnderscoredLit = hexLit <+> many (is '_' <+> hexDigits)
 393 | 
 394 | export
 395 | octUnderscoredLit : Lexer
 396 | octUnderscoredLit = octLit <+> many (is '_' <+> octDigits)
 397 | 
 398 | ||| Recognise `start`, then recognise all input until a newline is encountered,
 399 | ||| and consume the newline. Will succeed if end-of-input is encountered before
 400 | ||| a newline.
 401 | ||| /`start`[\^\\r\\n]+(\\r\\n|[\\r\\n])?/
 402 | export
 403 | lineComment : (start : Lexer) -> Lexer
 404 | lineComment start = start <+> manyUntil newline any <+> opt newline
 405 | 
 406 | ||| Recognise all input between `start` and `end` lexers.
 407 | ||| Supports balanced nesting.
 408 | |||
 409 | ||| For block comments that don't support nesting (such as C-style comments),
 410 | ||| use `surround`
 411 | export
 412 | blockComment : (start : Lexer) -> (end : Lexer) -> Lexer
 413 | blockComment start end = start <+> middle <+> end
 414 |   where
 415 |     middle : Recognise False
 416 |     middle = manyUntil end (blockComment start end <|> any)
 417 |