Text.Literate


   0 | ||| A simple module to process 'literate' documents.
   1 | |||
   2 | ||| The module uses a lexer to split the document into code blocks,
   3 | ||| delineated by user-defined markers, and code lines that are
   4 | ||| indicated be a line marker. The lexer returns a document stripped
   5 | ||| of non-code elements but preserving the original document's line
   6 | ||| count. Column numbering of code lines are not preserved.
   7 | |||
   8 | ||| The underlying tokeniser is greedy.
   9 | |||
  10 | ||| Once it identifies a line marker it reads a prettifying space then
  11 | ||| consumes until the end of line. Once identifies a starting code
  12 | ||| block marker, the lexer will consume input until the next
  13 | ||| identifiable end block is encountered. Any other content is
  14 | ||| treated as part of the original document.
  15 | |||
  16 | ||| Thus, the input literate files *must* be well-formed w.r.t
  17 | ||| to code line markers and code blocks.
  18 | |||
  19 | ||| A further restriction is that literate documents cannot contain
  20 | ||| the markers within the document's main text: This will confuse the
  21 | ||| lexer.
  22 | |||
  23 | module Text.Literate
  24 | 
  25 | import Text.Lexer
  26 | 
  27 | import Data.List
  28 | import Data.List1
  29 | import Data.List.Views
  30 | import Data.String
  31 | 
  32 | %default total
  33 | 
  34 | untilEOL : Recognise False
  35 | untilEOL = manyUntil newline any
  36 | 
  37 | line : String -> Lexer
  38 | line s = exact s <+> (newline <|> space <+> untilEOL)
  39 | 
  40 | block : String -> String -> Lexer
  41 | block s e = surround (exact s <+> untilEOL) (exact e <+> untilEOL) any
  42 | 
  43 | notCodeLine : Lexer
  44 | notCodeLine = newline
  45 |            <|> any <+> untilEOL
  46 | 
  47 | data Token = CodeBlock String String String
  48 |            | Any String
  49 |            | CodeLine String String
  50 | 
  51 | Show Token where
  52 |   showPrec d (CodeBlock l r x) = showCon d "CodeBlock" $ showArg l ++ showArg r ++ showArg x
  53 |   showPrec d (Any x)           = showCon d "Any" $ showArg x
  54 |   showPrec d (CodeLine m x)    = showCon d "CodeLine" $ showArg m ++ showArg x
  55 | 
  56 | rawTokens : (delims  : List (String, String))
  57 |          -> (markers : List String)
  58 |          -> TokenMap (Token)
  59 | rawTokens delims ls =
  60 |           map (\(l,r) => (block l r, CodeBlock (trim l) (trim r))) delims
  61 |        ++ map (\m => (line m, CodeLine (trim m))) ls
  62 |        ++ [(notCodeLine, Any)]
  63 | 
  64 | namespace Compat
  65 |   -- `reduce` below was depending on the old behaviour of `lines` before #1585
  66 |   -- was merged. That old `lines` function is added here to preserve behaviour
  67 |   -- of `reduce`.
  68 |   lines' : List Char -> List1 (List Char)
  69 |   lines' [] = singleton []
  70 |   lines' s  = case break isNL s of
  71 |                    (l, s') => l ::: case s' of
  72 |                                          [] => []
  73 |                                          _ :: s'' => forget $ lines' (assert_smaller s s'')
  74 |   export
  75 |   lines : String -> List1 String
  76 |   lines s = map pack (lines' (unpack s))
  77 | 
  78 | ||| Merge the tokens into a single source file.
  79 | reduce : List (WithBounds Token) -> List String -> String
  80 | reduce [] acc = concat (reverse acc)
  81 | reduce (MkBounded (Any x) _ _ :: rest) acc =
  82 |   -- newline will always be tokenized as a single token
  83 |   if x == "\n"
  84 |   then reduce rest ("\n"::acc)
  85 |   else reduce rest acc
  86 | 
  87 | reduce (MkBounded (CodeLine m src) _ _ :: rest) acc =
  88 |     if m == trim src
  89 |     then reduce rest ("\n"::acc)
  90 |     else reduce rest ((substr (length m + 1) -- remove space to right of marker.
  91 |                               (length src)
  92 |                               src
  93 |                       )::acc)
  94 | 
  95 | reduce (MkBounded (CodeBlock l r src) _ _ :: rest) acc with (Compat.lines src) -- Strip the deliminators surrounding the block.
  96 |   reduce (MkBounded (CodeBlock l r src) _ _ :: rest) acc | (s ::: ys) with (snocList ys)
  97 |     reduce (MkBounded (CodeBlock l r src) _ _ :: rest) acc | (s ::: []) | Empty = reduce rest acc -- 2
  98 |     reduce (MkBounded (CodeBlock l r src) _ _ :: rest) acc | (s ::: (srcs ++ [f])) | (Snoc f srcs rec) =
  99 |         -- the "\n" counts for the open deliminator; the closing deliminator should always be followed by a (Any "\n"), so we don't add a newline
 100 |         reduce rest (((unlines srcs) ++ "\n") :: "\n" :: acc)
 101 | 
 102 | -- [ NOTE ] 1 & 2 shouldn't happen as code blocks are well formed i.e. have two deliminators.
 103 | 
 104 | 
 105 | public export
 106 | record LiterateError where
 107 |   constructor MkLitErr
 108 |   line   : Int
 109 |   column : Int
 110 |   input  : String
 111 | 
 112 | ||| Description of literate styles.
 113 | |||
 114 | ||| A 'literate' style comprises of
 115 | |||
 116 | ||| + a list of code block deliminators (`deliminators`);
 117 | ||| + a list of code line markers (`line_markers`); and
 118 | ||| + a list of known file extensions `file_extensions`.
 119 | |||
 120 | ||| Some example specifications:
 121 | |||
 122 | ||| + Bird Style
 123 | |||
 124 | |||```
 125 | |||MkLitStyle Nil [">"] [".lidr"]
 126 | |||```
 127 | |||
 128 | ||| + Literate Haskell (for LaTeX)
 129 | |||
 130 | |||```
 131 | |||MkLitStyle [("\\begin{code}", "\\end{code}"),("\\begin{spec}","\\end{spec}")]
 132 | |||           Nil
 133 | |||           [".lhs", ".tex"]
 134 | |||```
 135 | |||
 136 | ||| + OrgMode
 137 | |||
 138 | |||```
 139 | |||MkLitStyle [("#+BEGIN_SRC idris","#+END_SRC"), ("#+COMMENT idris","#+END_COMMENT")]
 140 | |||           ["#+IDRIS:"]
 141 | |||           [".org"]
 142 | |||```
 143 | |||
 144 | ||| + Common Mark
 145 | |||
 146 | |||```
 147 | |||MkLitStyle [("```idris","```"), ("<!-- idris","--!>")]
 148 | |||           Nil
 149 | |||           [".md", ".markdown"]
 150 | |||```
 151 | |||
 152 | public export
 153 | record LiterateStyle where
 154 |   constructor MkLitStyle
 155 |   ||| The pairs of start and end tags for code blocks.
 156 |   deliminators : List (String, String)
 157 | 
 158 |   ||| Line markers that indicate a line contains code.
 159 |   line_markers : List String
 160 | 
 161 |   ||| Recognised file extensions. Not used by the module, but will be
 162 |   ||| of use when connecting to code that reads in the original source
 163 |   ||| files.
 164 |   file_extensions : List String
 165 | 
 166 | ||| Given a 'literate specification' extract the code from the
 167 | ||| literate source file (`litStr`) that follows the presented style.
 168 | |||
 169 | ||| @specification The literate specification to use.
 170 | ||| @litStr  The literate source file.
 171 | |||
 172 | ||| Returns a `LiterateError` if the literate file contains malformed
 173 | ||| code blocks or code lines.
 174 | |||
 175 | export
 176 | extractCode : (specification : LiterateStyle)
 177 |            -> (litStr        : String)
 178 |            -> Either LiterateError String
 179 | extractCode (MkLitStyle delims markers exts) str =
 180 |       case lex (rawTokens delims markers) str of
 181 |         (toks, (_,_,"")) => Right (reduce toks Nil)
 182 |         (_, (l,c,i))     => Left (MkLitErr l c i)
 183 | 
 184 | ||| Synonym for `extractCode`.
 185 | export
 186 | unlit : (specification : LiterateStyle)
 187 |      -> (litStr        : String)
 188 |      -> Either LiterateError String
 189 | unlit = extractCode
 190 | 
 191 | ||| Is the provided line marked up using a line marker?
 192 | |||
 193 | ||| If the line is suffixed by any one of the style's set of line
 194 | ||| markers then return length of literate line marker, and the code stripped from the line
 195 | ||| marker. Otherwise, return Nothing and the unmarked line.
 196 | export
 197 | isLiterateLine : (specification : LiterateStyle)
 198 |               -> (str : String)
 199 |               -> Pair (Maybe String) String
 200 | isLiterateLine (MkLitStyle delims markers _) str with (lex (rawTokens delims markers) str)
 201 |   isLiterateLine (MkLitStyle delims markers _) str | ([MkBounded (CodeLine m str') _ _], (_,_, "")) = (Just m, str')
 202 |   isLiterateLine (MkLitStyle delims markers _) str | (_, _) = (Nothing, str)
 203 | 
 204 | ||| Given a 'literate specification' embed the given code using the
 205 | ||| literate style provided.
 206 | |||
 207 | ||| If the style uses deliminators to denote code blocks use the first
 208 | ||| pair of deliminators in the style. Otherwise use first linemarker
 209 | ||| in the style. If there is **no style** return the presented code
 210 | ||| string unembedded.
 211 | |||
 212 | |||
 213 | ||| @specification The literate specification to use.
 214 | ||| @code  The code to embed,
 215 | |||
 216 | |||
 217 | export
 218 | embedCode : (specification : LiterateStyle)
 219 |          -> (code : String)
 220 |          -> String
 221 | embedCode (MkLitStyle ((s,e)::delims) _            _) str = unlines [s,str,e]
 222 | embedCode (MkLitStyle Nil             (m::markers) _) str = unwords [m, str]
 223 | embedCode (MkLitStyle _               _            _) str = str
 224 | 
 225 | ||| Synonym for `embedCode`
 226 | export
 227 | relit : (specification : LiterateStyle)
 228 |      -> (code : String)
 229 |      -> String
 230 | relit = embedCode
 231 | 
 232 | -- --------------------------------------------------------------------- [ EOF ]
 233 |