-- |
-- Module      : Data.String.Interpolate.Parse
-- Copyright   : (c) William Yao, 2019-2020
-- License     : BSD-3
-- Maintainer  : williamyaoh@gmail.com
-- Stability   : experimental
-- Portability : POSIX
--
-- YOU SHOULD NOT USE THIS MODULE.
--
-- This is exported mainly so tests can introspect on the implementation.

{-# LANGUAGE PackageImports #-}

module Data.String.Interpolate.Parse
  ( InterpSegment(..), parseInterpSegments, dosToUnix )
where

import           Data.Char
import qualified "base" Numeric as N

data InterpSegment
  = Expression String
  | Verbatim String
  | Newline
  | Spaces Int
  | Tabs Int
  deriving (Eq, Show)

-- |
-- Given the raw input from a quasiquote, parse it into the information
-- we need to output the actual expression.
--
-- Returns an error message if parsing fails.
parseInterpSegments :: String -> Either String [InterpSegment]
parseInterpSegments = switch
  -- Given how complicated this is getting, it might be worth switching
  -- to megaparsec instead of hand-rolling this.
  where switch :: String -> Either String [InterpSegment]
        switch ""             = pure []
        switch ('#':'{':rest) = expr rest
        switch ('#':_)        = Left "unescaped # symbol without interpolation brackets"
        switch ('\n':rest)    = newline rest  -- CRLF handled by `dosToUnix'
        switch (' ':rest)     = spaces 1 rest
        switch ('\t':rest)    = tabs 1 rest
        switch other          = verbatim "" other

        verbatim :: String -> String -> Either String [InterpSegment]
        verbatim acc parsee = case parsee of
          "" ->
            ((Verbatim . reverse) acc :) <$> switch parsee
          (c:_) | c `elem` ['#', ' ', '\t', '\n'] ->
            ((Verbatim . reverse) acc :) <$> switch parsee
          ('\\':'#':rest) ->
            verbatim ('#':acc) rest
          ('\\':_) -> case unescapeChar parsee of
            (FoundChar c, rest) -> verbatim (c:acc) rest
            (EscapeEmpty, rest) -> verbatim acc rest
            (EscapeUnterminated, _) -> Left "unterminated backslash escape at end of string"
            (UnknownEscape esc, _) -> Left ("unknown escape character: " ++ [esc])
          c:cs ->
            verbatim (c:acc) cs

        expr :: String -> Either String [InterpSegment]
        expr parsee = case span (/= '}') parsee of
          (_, "")        -> Left "unterminated #{...} interpolation"
          (expr, _:rest) -> (Expression expr :) <$> switch rest

        newline :: String -> Either String [InterpSegment]
        newline parsee = (Newline :) <$> switch parsee

        spaces :: Int -> String -> Either String [InterpSegment]
        spaces n (' ':rest) = spaces (n+1) rest
        spaces n other      = (Spaces n :) <$> switch other

        tabs :: Int -> String -> Either String [InterpSegment]
        tabs n ('\t':rest) = tabs (n+1) rest
        tabs n other       = (Tabs n :) <$> switch other

dosToUnix :: String -> String
dosToUnix = go
  where go xs = case xs of
          '\r' : '\n' : ys -> '\n' : go ys
          y : ys           -> y : go ys
          []               -> []

data EscapeResult
  = FoundChar Char
  | EscapeEmpty         -- ^ Haskell's lexical syntax has \& as an escape that produces an empty string
  | EscapeUnterminated
  | UnknownEscape Char

-- |
-- Haskell 2010 character unescaping, see:
-- <http://www.haskell.org/onlinereport/haskell2010/haskellch2.html#x7-200002.6>
--
-- Unescape the very first backslashed character of the string, if it's a known
-- escape.
unescapeChar :: String -> (EscapeResult, String)
unescapeChar input = case input of
  "" -> (EscapeEmpty, input)
  '\\' : 'x' : x : xs | isHexDigit x -> case span isHexDigit xs of
    (ys, zs) -> ((FoundChar . chr . readHex $ x:ys), zs)
  '\\' : 'o' : x : xs | isOctDigit x -> case span isOctDigit xs of
    (ys, zs) -> ((FoundChar . chr . readOct $ x:ys), zs)
  '\\' : x : xs | isDigit x -> case span isDigit xs of
    (ys, zs) -> ((FoundChar . chr . read $ x:ys), zs)
  '\\' : input_ -> case input_ of
    '\\' : xs        -> (FoundChar ('\\'), xs)
    'a' : xs         -> (FoundChar ('\a'), xs)
    'b' : xs         -> (FoundChar ('\b'), xs)
    'f' : xs         -> (FoundChar ('\f'), xs)
    'n' : xs         -> (FoundChar ('\n'), xs)
    'r' : xs         -> (FoundChar ('\r'), xs)
    't' : xs         -> (FoundChar ('\t'), xs)
    'v' : xs         -> (FoundChar ('\v'), xs)
    '&' : xs         -> (EscapeEmpty, xs)
    'N':'U':'L' : xs -> (FoundChar ('\NUL'), xs)
    'S':'O':'H' : xs -> (FoundChar ('\SOH'), xs)
    'S':'T':'X' : xs -> (FoundChar ('\STX'), xs)
    'E':'T':'X' : xs -> (FoundChar ('\ETX'), xs)
    'E':'O':'T' : xs -> (FoundChar ('\EOT'), xs)
    'E':'N':'Q' : xs -> (FoundChar ('\ENQ'), xs)
    'A':'C':'K' : xs -> (FoundChar ('\ACK'), xs)
    'B':'E':'L' : xs -> (FoundChar ('\BEL'), xs)
    'B':'S' : xs     -> (FoundChar ('\BS'), xs)
    'H':'T' : xs     -> (FoundChar ('\HT'), xs)
    'L':'F' : xs     -> (FoundChar ('\LF'), xs)
    'V':'T' : xs     -> (FoundChar ('\VT'), xs)
    'F':'F' : xs     -> (FoundChar ('\FF'), xs)
    'C':'R' : xs     -> (FoundChar ('\CR'), xs)
    'S':'O' : xs     -> (FoundChar ('\SO'), xs)
    'S':'I' : xs     -> (FoundChar ('\SI'), xs)
    'D':'L':'E' : xs -> (FoundChar ('\DLE'), xs)
    'D':'C':'1' : xs -> (FoundChar ('\DC1'), xs)
    'D':'C':'2' : xs -> (FoundChar ('\DC2'), xs)
    'D':'C':'3' : xs -> (FoundChar ('\DC3'), xs)
    'D':'C':'4' : xs -> (FoundChar ('\DC4'), xs)
    'N':'A':'K' : xs -> (FoundChar ('\NAK'), xs)
    'S':'Y':'N' : xs -> (FoundChar ('\SYN'), xs)
    'E':'T':'B' : xs -> (FoundChar ('\ETB'), xs)
    'C':'A':'N' : xs -> (FoundChar ('\CAN'), xs)
    'E':'M' : xs     -> (FoundChar ('\EM'), xs)
    'S':'U':'B' : xs -> (FoundChar ('\SUB'), xs)
    'E':'S':'C' : xs -> (FoundChar ('\ESC'), xs)
    'F':'S' : xs     -> (FoundChar ('\FS'), xs)
    'G':'S' : xs     -> (FoundChar ('\GS'), xs)
    'R':'S' : xs     -> (FoundChar ('\RS'), xs)
    'U':'S' : xs     -> (FoundChar ('\US'), xs)
    'S':'P' : xs     -> (FoundChar ('\SP'), xs)
    'D':'E':'L' : xs -> (FoundChar ('\DEL'), xs)
    '^':'@' : xs     -> (FoundChar ('\^@'), xs)
    '^':'A' : xs     -> (FoundChar ('\^A'), xs)
    '^':'B' : xs     -> (FoundChar ('\^B'), xs)
    '^':'C' : xs     -> (FoundChar ('\^C'), xs)
    '^':'D' : xs     -> (FoundChar ('\^D'), xs)
    '^':'E' : xs     -> (FoundChar ('\^E'), xs)
    '^':'F' : xs     -> (FoundChar ('\^F'), xs)
    '^':'G' : xs     -> (FoundChar ('\^G'), xs)
    '^':'H' : xs     -> (FoundChar ('\^H'), xs)
    '^':'I' : xs     -> (FoundChar ('\^I'), xs)
    '^':'J' : xs     -> (FoundChar ('\^J'), xs)
    '^':'K' : xs     -> (FoundChar ('\^K'), xs)
    '^':'L' : xs     -> (FoundChar ('\^L'), xs)
    '^':'M' : xs     -> (FoundChar ('\^M'), xs)
    '^':'N' : xs     -> (FoundChar ('\^N'), xs)
    '^':'O' : xs     -> (FoundChar ('\^O'), xs)
    '^':'P' : xs     -> (FoundChar ('\^P'), xs)
    '^':'Q' : xs     -> (FoundChar ('\^Q'), xs)
    '^':'R' : xs     -> (FoundChar ('\^R'), xs)
    '^':'S' : xs     -> (FoundChar ('\^S'), xs)
    '^':'T' : xs     -> (FoundChar ('\^T'), xs)
    '^':'U' : xs     -> (FoundChar ('\^U'), xs)
    '^':'V' : xs     -> (FoundChar ('\^V'), xs)
    '^':'W' : xs     -> (FoundChar ('\^W'), xs)
    '^':'X' : xs     -> (FoundChar ('\^X'), xs)
    '^':'Y' : xs     -> (FoundChar ('\^Y'), xs)
    '^':'Z' : xs     -> (FoundChar ('\^Z'), xs)
    '^':'[' : xs     -> (FoundChar ('\^['), xs)
    '^':'\\' : xs    -> (FoundChar ('\^\'), xs)
    '^':']' : xs     -> (FoundChar ('\^]'), xs)
    '^':'^' : xs     -> (FoundChar ('\^^'), xs)
    '^':'_' : xs     -> (FoundChar ('\^_'), xs)
    x:xs             -> (UnknownEscape x, xs)
    ""               -> (EscapeUnterminated, "")
  x:xs -> (FoundChar x, xs)

  where readHex :: String -> Int
        readHex xs = case N.readHex xs of
          [(n, "")] -> n
          _         -> error "Data.String.Interpolate.Util.readHex: no parse"

        readOct :: String -> Int
        readOct xs = case N.readOct xs of
          [(n, "")] -> n
          _         -> error "Data.String.Interpolate.Util.readHex: no parse"