-- | This module provides conversions between various different markup -- formats. In principle, it provides two different conversions: -- -- 1. Converting the Superdoc markup language to HTML. -- -- 2. Converting Unicode streams to HTML. -- -- Conversion 1 is exposed by the function 'markup'. This is used by -- the post-Haddock hook. -- -- Conversion 2 is provided by the 'to_html' function. It is used by -- the post-HsColour hook. module Distribution.Superdoc.Markup where import Distribution.Superdoc.UTF8 import Data.Char import qualified Data.Set as Set import Data.Set (Set) import System.IO import Text.ParserCombinators.ReadP -- ---------------------------------------------------------------------- -- * Format definitions -- $ The Superdoc markup language provides tags for superscripts, -- subscripts, and more. The following markup is recognized: -- -- \[bold Markup:] -- -- * [bold [literal [super /text/]]]: superscript. -- -- * [bold [literal [sup /text/]]]: superscript. A synonym for -- [literal [super /text/]]. -- -- * [bold [literal [sub /text/]]]: subscript. -- -- * [bold [literal [exp /text/]]]: exponential function. -- -- * [bold [literal [bold /text/]]]: bold. -- -- * [bold [literal [center /text/]]]: centered. -- -- * [bold [literal [nobr /text/]]]: inhibit line breaks. -- -- * [bold [literal [image /filename/]]]: insert image. -- -- * [bold [literal [uni /nnnn/]]]: Unicode character. -- -- * [bold [literal [literal text]]]: literal text. Brackets \'[\' and -- \']\' may only occur in nested pairs. -- ---------------------------------------------------------------------- -- * Filters -- | A filter is basically a function from strings to strings. Ideally -- a filter is lazy, so that the input string is consumed -- incrementally; however, this is not strictly necessary. A filter -- may also return another result in addition to a string. type Filter a = String -> (String, a) -- | The identity filter. filter_id :: Filter () filter_id s = (s, ()) -- | Run a filter by reading from one handle and writing to another. -- The handles are set to binary mode. filter_handles :: Filter a -> Handle -> Handle -> IO a filter_handles filter fin fout = do hSetBinaryMode fin True hSetBinaryMode fout True input <- hGetContents fin let (output, a) = filter input hPutStr fout output return a -- | Run a filter by reading from a file and writing to another file. -- We do not assume that the two files are necessarily distinct, so -- special care is taken not to overwrite the output file until after -- the input file has been read. filter_file :: Filter a -> FilePath -> FilePath -> IO a filter_file filter infile outfile = do h <- openBinaryFile infile ReadMode input <- hGetContents h -- make sure we read it all case length input of 0 -> return () _ -> return () hClose h let (output, a) = filter input withBinaryFile outfile WriteMode $ \h -> do hPutStr h output return a -- | Run a filter on a number of files, overwriting each file in -- place. filter_files :: Filter a -> [FilePath] -> IO [a] filter_files filter files = do sequence [ filter_file filter f f | f <- files ] -- ---------------------------------------------------------------------- -- * Markup parser -- $ This section defines a simple grammar and parser for the Superdoc -- markup language, translating it to HTML. This is used to -- post-process Haddock's output. -- ---------------------------------------------------------------------- -- ** Top-level function -- | The top-level parser for Superdoc markup, expressed as a -- filter. In addition to producing HTML output, this filter also -- returns the set of all image files that were linked to. markup :: Filter (Set FilePath) markup input = case readP_to_S markup_top input of ((output, images), "") : _ -> (output, images) _ -> error "markup: parse error" -- this should not happen -- ---------------------------------------------------------------------- -- ** Grammar definition -- | Top-level parser for Superdoc markup. -- -- /top/ ::= (/other/ | /tag/ | /char/)*. markup_top :: ReadP (String, Set FilePath) markup_top = do lst <- many (lift markup_other <++ markup_tag <++ lift markup_char) eof return (concat (map fst lst), Set.unions (map snd lst)) -- | Lift a parser returning a string to a parser returning a string and an empty set. lift :: ReadP String -> ReadP (String, Set FilePath) lift p = do s <- p return (s, Set.empty) -- | Like 'markup', but only permit brackets \"[\" and \"]\" to occur -- in nested pairs. -- -- /nested/ ::= (/other/ | /tag/ | /bracketed/ | /underscore/)*. markup_nested :: ReadP (String, Set FilePath) markup_nested = do lst <- many (lift markup_other <++ markup_tag <++ markup_bracketed <++ lift markup_nonbracket) return (concat (map fst lst), Set.unions (map snd lst)) -- | Parse bracketed text. -- -- /bracketed/ ::= \"[\" /nested/ \"]\". markup_bracketed :: ReadP (String, Set FilePath) markup_bracketed = do char '[' (s, images) <- markup_nested char ']' return ("[" ++ s ++ "]", images) -- | Parse any single character except \'[\' and \']\'. -- -- /nonbracket/ ::= any character besides \'[\', \']\'. markup_nonbracket :: ReadP String markup_nonbracket = do c <- satisfy (\x -> x /= '[' && x /= ']') return [c] -- | Parse any sequence of non-special characters: anything but \'[\' and \']\'. -- -- /other/ ::= (any character besides \'[\', \']\')+. markup_other :: ReadP String markup_other = do s <- munch1 (\x -> not (x `elem` ['[', ']'])) return s -- | Parse any one character. -- -- /char/ ::= any character. markup_char :: ReadP String markup_char = do c <- get return [c] -- | Parse a tag. -- -- /tag/ ::= \"[\" /keyword/ /body/ \"]\". markup_tag :: ReadP (String, Set FilePath) markup_tag = do char '[' name <- markup_keyword munch1 isSpace res <- markup_body name char ']' return res -- | Parse a keyword. -- -- /keyword/ ::= \"sup\" | \"super\" | \"sub\" | \"exp\" | \"bold\" | \"center\" | \"nobr\" | \"image\" | \"uni\" | \"literal\". markup_keyword :: ReadP String markup_keyword = choice [ string name | name <- keywords ] where keywords = ["sup", "super", "sub", "exp", "bold", "center", "nobr", "image", "uni", "literal"] -- | Parse any text with balanced brackets. -- -- /literal/ ::= (/nonbracket/ | /bracketed_literal/)*. markup_literal :: ReadP String markup_literal = do lst <- many (markup_nonbracket <++ markup_bracketed_literal) return (concat lst) -- | Parse any bracketed text with balanced brackets. -- -- /bracketed_literal/ ::= \"[\" /literal/ \"]\". markup_bracketed_literal :: ReadP String markup_bracketed_literal = do char '[' s <- markup_literal char ']' return ("[" ++ s ++ "]") -- | Parse a tag's body. What to do depends on the tag name. -- -- /body/ ::= /nested/ (for /keyword/ = sup, super, sub, exp, bold, center, nobr), -- -- /body/ ::= /filename/ (for /keyword/ = image), -- -- /body/ ::= /digit/+ (for /keyword/ = uni). -- -- /body/ ::= /literal/ (for /keyword/ = literal). markup_body :: String -> ReadP (String, Set FilePath) markup_body "sup" = do (s, images) <- markup_nested return ("<sup>" ++ s ++ "</sup>", images) markup_body "super" = do (s, images) <- markup_nested return ("<sup>" ++ s ++ "</sup>", images) markup_body "sub" = do (s, images) <- markup_nested return ("<sub>" ++ s ++ "</sub>", images) markup_body "exp" = do (s, images) <- markup_nested return ("<i>e</i><sup>" ++ s ++ "</sup>", images) markup_body "bold" = do (s, images) <- markup_nested return ("<b>" ++ s ++ "</b>", images) markup_body "center" = do (s, images) <- markup_nested return ("<center>" ++ s ++ "</center>", images) markup_body "nobr" = do (s, images) <- markup_nested return ("<nobr>" ++ s ++ "</nobr>", images) markup_body "image" = do filename <- munch1 (\x -> x /= ']' && not (isSpace x)) return ("<img src=\"" ++ filename ++ "\">", Set.singleton filename) markup_body "uni" = do n <- munch1 isDigit skipSpaces return ("&#" ++ n ++ ";", Set.empty) markup_body "literal" = do n <- markup_literal return (n, Set.empty) markup_body _ = do error "markup_body: unknown tag" -- ---------------------------------------------------------------------- -- * Unicode to HTML conversion -- | Convert a tokenized Unicode stream into HTML entities. Non-ASCII -- characters are represented as HTML entities of the form @&#@/nnnn/@;@. -- Any invalid characters are simply copied to the output. to_html :: [Token] -> String to_html [] = [] to_html (Unicode c:cs) | isAscii c = c : to_html cs | otherwise = ("&#" ++ (show (ord c)) ++ ";") ++ to_html cs to_html (Invalid c:cs) = c : to_html cs