0 | module Text.Lexer.Core
5 | import public Control.Delayed
6 | import public Text.Bounded
14 | data Recognise : (consumes : Bool) -> Type where
15 | Empty : Recognise False
17 | Lookahead : (positive : Bool) -> Recognise c -> Recognise False
18 | Pred : (Char -> Bool) -> Recognise True
19 | SeqEat : Recognise True -> Inf (Recognise e) -> Recognise True
20 | SeqEmpty : Recognise e1 -> Recognise e2 -> Recognise (e1 || e2)
21 | SeqSame : Recognise e -> Recognise e -> Recognise e
22 | Alt : Recognise e1 -> Recognise e2 -> Recognise (e1 && e2)
27 | Lexer = Recognise True
32 | (<+>) : {c1 : Bool} ->
33 | Recognise c1 -> inf c1 (Recognise c2) -> Recognise (c1 || c2)
34 | (<+>) {c1 = False} = SeqEmpty
35 | (<+>) {c1 = True} = SeqEat
40 | (<|>) : Recognise c1 -> Recognise c2 -> Recognise (c1 && c2)
50 | empty : Recognise False
55 | pred : (Char -> Bool) -> Lexer
60 | expect : Recognise c -> Recognise False
61 | expect = Lookahead True
65 | reject : Recognise c -> Recognise False
66 | reject = Lookahead False
72 | concatMap : (a -> Recognise c) -> (xs : List a) -> Recognise (isCons xs && c)
73 | concatMap _ [] = Empty
74 | concatMap f (x :: []) = f x
75 | concatMap f (x :: xs@(_ :: _)) = SeqSame (f x) (concatMap f xs)
77 | data StrLen : Type where
78 | MkStrLen : String -> Nat -> StrLen
80 | getString : StrLen -> String
81 | getString (MkStrLen str n) = str
83 | strIndex : StrLen -> Nat -> Maybe Char
84 | strIndex (MkStrLen str len) i
85 | = if cast {to = Integer} i >= cast len then Nothing
86 | else Just (assert_total (prim__strIndex str (cast i)))
88 | mkStr : String -> StrLen
89 | mkStr str = MkStrLen str (length str)
91 | strTail : Nat -> StrLen -> StrLen
92 | strTail start (MkStrLen str len)
93 | = MkStrLen (substr start len str) (minus len start)
98 | scan : Recognise c -> List Char -> List Char -> Maybe (List Char, List Char)
99 | scan Empty tok str = pure (tok, str)
100 | scan Fail tok str = Nothing
101 | scan (Lookahead positive r) tok str
102 | = if isJust (scan r tok str) == positive
103 | then pure (tok, str)
105 | scan (Pred f) tok [] = Nothing
106 | scan (Pred f) tok (c :: str)
108 | then Just (c :: tok, str)
110 | scan (SeqEat r1 r2) tok str
111 | = do (tok', rest) <- scan r1 tok str
113 | assert_total (scan r2 tok' rest)
114 | scan (SeqEmpty r1 r2) tok str
115 | = do (tok', rest) <- scan r1 tok str
117 | scan (SeqSame r1 r2) tok str
118 | = do (tok', rest) <- scan r1 tok str
120 | scan (Alt r1 r2) tok str
121 | = maybe (scan r2 tok str) Just (scan r1 tok str)
128 | TokenMap : (tokenType : Type) -> Type
129 | TokenMap tokenType = List (Lexer, String -> tokenType)
131 | tokenise : (a -> Bool) ->
132 | (line : Int) -> (col : Int) ->
133 | List (WithBounds a) -> TokenMap a ->
134 | List Char -> (List (WithBounds a), (Int, Int, List Char))
135 | tokenise pred line col acc tmap str
136 | = case getFirstToken tmap str of
137 | Just (tok, line', col', rest) =>
140 | then (reverse acc, (line, col, []))
141 | else assert_total (tokenise pred line' col' (tok :: acc) tmap rest)
142 | Nothing => (reverse acc, (line, col, str))
144 | countNLs : List Char -> Nat
145 | countNLs str = List.length (filter (== '\n') str)
147 | getCols : List Char -> Int -> Int
149 | = case span (/= '\n') x of
150 | (incol, []) => c + cast (length incol)
151 | (incol, _) => cast (length incol)
153 | getFirstToken : TokenMap a -> List Char ->
154 | Maybe (WithBounds a, Int, Int, List Char)
155 | getFirstToken [] str = Nothing
156 | getFirstToken ((lex, fn) :: ts) str
157 | = case scan lex [] str of
158 | Just (tok, rest) =>
159 | let line' = line + cast (countNLs tok)
160 | col' = getCols tok col in
161 | Just (MkBounded (fn (pack (reverse tok))) False (MkBounds line col line' col'),
163 | Nothing => getFirstToken ts str
170 | lex : TokenMap a -> String -> (List (WithBounds a), (Int, Int, String))
172 | = let (ts, (l, c, str')) = tokenise (const False) 0 0 [] tmap (unpack str) in
173 | (ts, (l, c, pack str'))
176 | lexTo : (a -> Bool) ->
177 | TokenMap a -> String -> (List (WithBounds a), (Int, Int, String))
178 | lexTo pred tmap str
179 | = let (ts, (l, c, str')) = tokenise pred 0 0 [] tmap (unpack str) in
180 | (ts, (l, c, pack str'))