src/Data/Position/Interface.hs

{- This file is part of text-position.
 -
 - Written in 2015 by fr33domlover <fr33domlover@riseup.net>.
 -
 - ♡ Copying is an act of love. Please copy, reuse and share.
 -
 - To the extent possible under law, the author(s) have dedicated all copyright
 - and related and neighboring rights to this software to the public domain
 - worldwide. This software is distributed without any warranty.
 -
 - You should have received a copy of the CC0 Public Domain Dedication along
 - with this software. If not, see
 - <http://creativecommons.org/publicdomain/zero/1.0/>.
 -}

module Data.Position.Interface (
    -- * Special Positions
      zeroPosition
    , firstPosition
    -- * Special Advances
    , emptyAdvance
    , defaultAdvance
    -- * Creating Advances
    , psymAdvance
    , symAdvance
    , linecharAdvance
    , stringAdvance
    , newlineAdvance
    , commonAdvance
    , (<++>)
    -- * Applying Advances
    , tryAdvance
    , tryAdvanceC
    , advance
    , advanceC
    -- * Utilities Based on Advances
    , defaultAnnotate
    , enrichOnce
    , enrichOnceD
    , enrich
    , enrichD
    , bless
    , tokens
    , textInfo
    )
where

import Data.List (isPrefixOf, mapAccumL)
import Data.Maybe (fromJust, maybe)
import Data.Position.Types
import Text.Regex.Applicative

-------------------------------------------------------------------------------
-- Special Positions
-------------------------------------------------------------------------------

-- | The position before the first character in a file, to be used as an
-- initial value before reading actual characters.
zeroPosition :: Position
zeroPosition = Position 1 0 0

-- | The position of the first character in a file.
firstPosition :: Position
firstPosition = Position 1 1 1

-------------------------------------------------------------------------------
-- Special Advances
-------------------------------------------------------------------------------

-- | The zero advance. It doesn't match any input and doesn't consume any
-- characters. Applying it doesn't change the position.
emptyAdvance :: Advance s
emptyAdvance = empty

-- | The default advance when reading a character, e.g. a letter or a digit.
-- The new character would have column number higher by 1, and character index
-- higher by once (advances by 1 for each character read). The pattern accepts
-- any single character.
defaultAdvance :: Advance s
defaultAdvance = f <$ anySym
    where
    f (Position l c ch) = Position l (c + 1) (ch + 1)

-------------------------------------------------------------------------------
-- Creating Advances
-------------------------------------------------------------------------------

-- | Create an advance for a single character based on a predicate.
psymAdvance :: (s -> Bool) -> (Position -> Position) -> Advance s
psymAdvance p a = a <$ psym p

-- | Create an advance for the given character.
symAdvance :: Eq s => s -> (Position -> Position) -> Advance s
symAdvance c = psymAdvance (c ==)

-- | Create an advance for a line character with the specified width. This is
-- mainly useful for tabs and perhaps the various space characters in Unicode.
-- Example for tab:
--
-- > tabAdv = linecharAdvance '\t' 8
linecharAdvance :: Eq s
                => s         -- ^ The character
                -> Int       -- ^ How many columns the character takes
                -> Advance s
linecharAdvance c width = symAdvance c f
    where
    f (Position l c ch) = Position l (c + width) (ch + 1)

-- | Create an advance for the given character sequence.
stringAdvance :: Eq s => [s] -> (Position -> Position) -> Advance s
stringAdvance s a = a <$ string s

-- | Create an advance for a character or sequence of characters expressing a
-- newline, i.e. starting a new line. As the advance expresses the position
-- /after/ the character, applying the advance results with a position at
-- column 1.
newlineAdvance :: Eq s => [s] -> Advance s
newlineAdvance s = stringAdvance s f
    where
    f (Position l c ch) = Position (l + 1) 1 (ch + length s)

-- | Create a set of common advances supporting tabs and newlines. More
-- advances can easily be added by @<|>@ing them to the result. The result
-- doesn't include the default advance.
commonAdvance :: Int  -- ^ Tab width (usually 2, 4 or 8)
              -> Bool -- ^ Whether carriage return (CR) counts as a newline
              -> Bool -- ^ Whether linefeed (LF) counts as a newline
              -> Bool -- ^ Whether the sequence CR LF counts as a newline
              -> Bool -- ^ Whether formfeed (FF) counts as a newline
              -> Advance Char
commonAdvance tab cr lf crlf ff = foldr (<|>) tabAdv nlAdv
    where
    tabAdv = linecharAdvance '\t' tab
    nlAdv = [ adv | (adv, True) <- zipList ]
    zipList = zip (map newlineAdvance ["\r\n", "\r", "\n", "\f"])
                  [crlf, cr, lf, ff]

-- | Concatenate two advances into a single advance accepting their patterns
-- in order, and applying the advances on top of each other. For example,
-- concatenating an advance for @'a'@ and an advance for @'b'@ results with an
-- advance accepting @"ab"@ and moving the position 2 columns forward.
(<++>) :: Advance s -> Advance s -> Advance s
a <++> b = flip (.) <$> a <*> b
infixl 4 <++>

-------------------------------------------------------------------------------
-- Applying Advances
-------------------------------------------------------------------------------

-- | Given a list of remaining characters to read, the next position in the
-- file and a set of advance rules, try to consume characters once and
-- determine what is the next position after reading them. Example:
--
-- >>> tryAdvance defaultAdvance (Position 1 1 1) "abc"
-- (Position 1 2 2,"bc")
--
-- If there is no match, it returns the input position and the input list, i.e.
-- no characters will be consumed.
tryAdvance :: Advance s -> Position -> [s] -> (Position, [s])
tryAdvance a p l =
    case findFirstPrefix a l of
        Nothing          -> (p, l)
        Just (adv, rest) -> (adv p, rest)

-- | Like 'tryAdvance', but reads one character at most. In the general case
-- you'll want to use 'tryAdvance', because 'tryAdvanceC' breaks chains. For
-- example, while 'tryAdvance' can recognize @"\r\n"@ as a single newline,
-- 'tryAdvanceC' will consume only the @'\r'@, splitting the string into 2
-- newlines.
--
-- If there is no match, the input position is returned.
tryAdvanceC :: Advance s -> Position -> s -> Position
tryAdvanceC a p s = fst $ tryAdvance a p [s]

-- | Given a list of remaining characters to read, the next position in the
-- file and a set of advance rules, consume characters once and determine what
-- is the next position after reading them.
--
-- The 'defaultAdvance' is appended (using '<|>') to the given advance.
-- Therefore, if the given list isn't empty, at leat character will be
-- consumed. The intended use is to encode all the special cases (tab,
-- newlines, non-spacing marks, etc.) in the given advance, and let the
-- 'defaultAdvance' catch the rest.
advance :: Advance s -> Position -> [s] -> (Position, [s])
advance a = tryAdvance (a <|> defaultAdvance)

-- | Like 'advance', but reads exactly one character. Patterns which require
-- more than one character fail to match. Like 'tryAdvanceC', but has the
-- 'defaultAdvance' appended, which means is always consumes given a non-empty
-- list.
advanceC :: Advance s -> Position -> s -> Position
advanceC a p s = fst $ advance a p [s]

-------------------------------------------------------------------------------
-- Utilities based on Advances
-------------------------------------------------------------------------------

-- | Given the next position and a list matched there, annotate the symbols
-- with position information. For a single character, it is simply the given
-- position. For a sequence, this annotation assigns all the symbols the same
-- line and column, incrementing only the character index.
--
-- >>> defaultAnnotate (Position 1 1 1) "a"
-- [Positioned 'a' (Position 1 1 1)]
--
-- >>> defaultAnnotate (Position 1 1 1) "\r\n"
-- [Positioned '\r' (Position 1 1 1), Positioned '\n' (Position 1 1 2)]
--
-- The last example would give the same positions to any list of the same
-- length, e.g. @"ab"@ instead of @"\r\n"@.
defaultAnnotate :: Position -> [s] -> [Positioned s]
defaultAnnotate p []     = []
defaultAnnotate p (c:cs) = Positioned c p : defaultAnnotate (f p) cs
    where
    f (Position l c ch) = Position l c (ch + 1)

-- | Given an advance rule, the next available position and a symbol list,
-- consume symbols once. Return a list of them, annotated with position
-- information, as well as the next position and the rest of the input.
-- On empty input, return @[]@, the given position and the input list.
--
-- If more than one character is matched, the sequence is annotated with
-- consecutive character indices, but with the same line and column.
--
-- >>> enrichOnce (newlineAdvance "\r\n") (Position 1 1 1) "\r\nhello"
-- ( [ Positioned '\r' (Position 1 1 1)
--   , Positioned '\n' (Position 1 1 2)
--   ]
-- , Position 2 1 3
-- , "hello"
-- )
enrichOnce :: Advance s -> Position -> [s] -> ([Positioned s], Position, [s])
enrichOnce = enrichOnceD defaultAnnotate defaultAdvance

-- | Given an advance rule, the next available position and a symbol list, try
-- to consume symbols once. If consumed, return a list of them, annotated with
-- position information, as well as the next position and the rest of the
-- input. Otherwise, return @[]@, the given position and the input list.
--
-- If more than one character is matched, the sequence is annotated using the
-- function passed as the first parameter.
--
-- >>> let ann = defaultAnnotate; adv = empty
-- >>> enrichOnceD ann adv (newlineAdvance "\r\n") (Position 1 1 1) "\r\nhello"
-- ( [ Positioned '\r' (Position 1 1 1)
--   , Positioned '\n' (Position 1 1 2)
--   ]
-- , Position 2 1 3
-- , "hello"
-- )
enrichOnceD :: (Position -> [s] -> [Positioned s]) -- ^ annotation function
            -> Advance s                           -- ^ default advance
            -> Advance s                           -- ^ advance rule
            -> Position                            -- ^ initial position
            -> [s]                                 -- ^ input list
            -> ([Positioned s], Position, [s])
enrichOnceD ann def adv pos syms = f $ findFirstPrefix re syms
    where
    re = g <$> withMatched (adv <|> def)
    g (apply, l) = (apply pos, ann pos l)
    f Nothing               = ([], firstPosition, syms)
    f (Just ((p, s), rest)) = (s, p, rest)

-- | Given a list of symbols, annotate it with position based on advance rules.
-- Each symbol is annotated with its position in the text. In addition to the
-- annotated list, the next available position is returned (i.e. the position
-- of the next symbol, if another symbol were appended to the list).
--
-- >>> enrich defaultAdvance "abc"
-- ( [ Positioned 'a' (Position 1 1 1))
--   , Positioned 'b' (Position 1 2 2))
--   ]
-- , Position 1 3 3
-- )
--
-- It is implemented using the 'defaultAdvance' as a default, i.e. the entire
-- list is always consumed.
enrich :: Advance s -> [s] -> ([Positioned s], Position)
enrich adv = f . enrichD defaultAnnotate defaultAdvance adv
    where
    f (ps, p, _) = (ps, p)

-- | Like 'enrich', but takes an annotation function as the first parameter,
-- and a default advance as the second parameter. The rest of the parameters
-- are the same ones 'enrich' takes. It allows using custom defaults. To have
-- no default advance, pass 'empty'.
--
-- Since a match of the whole list isn't guaranteed, there is an additional
-- list in the return type, containing the rest of the input. If the entire
-- input is matched, that list will be @[]@. If no input is matched at all,
-- the annotated list is @[]@, the position is 'firstPosition' and the
-- additional list (rest of input) is the input list.
enrichD :: (Position -> [s] -> [Positioned s])
        -> Advance s
        -> Advance s
        -> [s]
        -> ([Positioned s], Position, [s])
enrichD ann def adv syms = f ([], firstPosition, syms)
    where
    g = enrichOnceD ann def adv
    f acc@(ps, p, s) =
        let (ps', p', s') = g p s
        in  if null ps' then acc else f (ps ++ ps', p', s')

-- | Given a regex, create an equivalent position-aware regex. The resulting
-- regex reads position-tagged symbols, and returns a position-tagged result.
bless :: RE s a -> PosRE s a
bless re = g <$> withMatched (comap f re)
    where
    f (Positioned c _) = c
    g (val, [])                 = Positioned val zeroPosition
    g (val, Positioned _ p : _) = Positioned val p

-- | Tokenize an input list and get list of tokens. If there was an error (no
-- regex match), get the text position at which it happened.
tokens :: Advance s -- ^ Advance rule for position tagging, e.g. made with
                    --   'commonAdvance'
       -> RE s a    -- ^ Regex which selects and returns a single token
       -> [s]       -- ^ Input list of symbols
       -> ( [Positioned a]
          , Maybe (Positioned s)
          )         -- ^ List of tokens matched. If the entire input was
                    --   matched, the second element is 'Nothing'. Otherwise,
                    --   it is the (position-tagged) symbol at which matching
                    --   failed.
tokens adv re syms =
    let re'   = many $ bless re
        syms' = fst $ enrich adv syms
    in  case findFirstPrefix re' syms' of
            Nothing          -> ([], Just $ head syms')
            Just (list, [])  -> (list, Nothing)
            Just (list, x:_) -> (list, Just x)

-- | Get some numbers describing the given text (list of symbols):
--
-- * The total number of lines
-- * The length (number of columns) of the last line
-- * The total number of characters
--
-- Note that this probably isn't the fastest implementation. It's possible to
-- compute directly by counting the lines and the characters. This function is
-- here anyway, as a demonstration of using this library.
--
-- >>> let adv = commonAdvance 4 True True True True
-- >>> textInfo adv "Hello world!\nHow are you?\nWonderful!"
-- (3,11,36)
textInfo :: Advance s -> [s] -> (Int, Int, Int)
textInfo adv syms = g $ f <$> many (adv <|> defaultAdvance)
    where
    f flist = h $ foldl (flip (.)) id flist $ firstPosition
    g re = fromJust $ match re syms
    h (Position l c ch) = (l, c - 1, ch - 1)