{-|
Module      : Z.Data.Parser.Numeric
Description : Textual numeric parsers.
Copyright   : (c) Dong Han, 2017-2019
License     : BSD
Maintainer  : winterland1989@gmail.com
Stability   : experimental
Portability : non-portable

Textual numeric parsers.

-}

module Z.Data.Parser.Numeric
  ( -- * Decimal
    uint, int
    -- * Hex
  , hex
    -- * Fractional
  , rational
  , float, double
  , scientific
  , scientifically
    -- * Stricter fractional(rfc8259)
  , rational'
  , float', double'
  , scientific'
  , scientifically'
    -- * Misc
  , hexLoop
  , decLoop
  , decLoopIntegerFast
  , isHexDigit
  , isDigit
  , floatToScientific
  , doubleToScientific
  ) where

import           Control.Applicative
import           Control.Monad
import           Data.Bits
import           Data.Int
import qualified Data.Scientific          as Sci
import           Data.Word
import           Foreign.Ptr              (IntPtr)
import qualified Z.Data.Builder.Numeric as B
import           Z.Data.Parser.Base     (Parser, (<?>))
import qualified Z.Data.Parser.Base     as P
import qualified Z.Data.Vector.Base     as V
import qualified Z.Data.Vector.Extra    as V

#define WORD64_MAX_DIGITS_LEN 18

#define PLUS     43
#define MINUS    45
#define DOT      46
#define LITTLE_E 101
#define BIG_E    69
#define C_0 48

-- | Parse and decode an unsigned hex number.  The hex digits
-- @\'a\'@ through @\'f\'@ may be upper or lower case.
--
-- This parser does not accept a leading @\"0x\"@ string, and consider
-- sign bit part of the binary hex nibbles, i.e.
-- 'parse hex "0xFF" == Right (-1 :: Int8)'
--
hex :: (Integral a, Bits a) => Parser a
{-# INLINE hex #-}
{-# SPECIALIZE INLINE hex :: Parser Int    #-}
{-# SPECIALIZE INLINE hex :: Parser Int64  #-}
{-# SPECIALIZE INLINE hex :: Parser Int32  #-}
{-# SPECIALIZE INLINE hex :: Parser Int16  #-}
{-# SPECIALIZE INLINE hex :: Parser Int8   #-}
{-# SPECIALIZE INLINE hex :: Parser Word   #-}
{-# SPECIALIZE INLINE hex :: Parser Word64 #-}
{-# SPECIALIZE INLINE hex :: Parser Word32 #-}
{-# SPECIALIZE INLINE hex :: Parser Word16 #-}
{-# SPECIALIZE INLINE hex :: Parser Word8  #-}
{-# SPECIALIZE INLINE hex :: Parser Integer #-}
{-# SPECIALIZE INLINE hex :: Parser IntPtr #-}
hex = "Z.Data.Parser.Numeric.hex" <?> hexLoop 0 <$> P.takeWhile1 isHexDigit

-- | decode hex digits sequence within an array.
hexLoop :: (Integral a, Bits a)
        => a    -- ^ accumulator, usually start from 0
        -> V.Bytes
        -> a
{-# INLINE hexLoop #-}
hexLoop = V.foldl' step
  where
    step a w = a `unsafeShiftL` 4 + fromIntegral (w2iHex w)
    w2iHex w
        | w <= 57   = w - 48
        | w <= 70   = w - 55
        | w <= 102  = w - 87

-- | A fast digit predicate.
isHexDigit :: Word8 -> Bool
{-# INLINE isHexDigit #-}
isHexDigit w = w - 48 <= 9 || w - 65 <= 5 || w - 97 <= 5

-- | Parse and decode an unsigned decimal number.
uint :: (Integral a) => Parser a
{-# INLINE uint #-}
{-# SPECIALIZE INLINE uint :: Parser Int    #-}
{-# SPECIALIZE INLINE uint :: Parser Int64  #-}
{-# SPECIALIZE INLINE uint :: Parser Int32  #-}
{-# SPECIALIZE INLINE uint :: Parser Int16  #-}
{-# SPECIALIZE INLINE uint :: Parser Int8   #-}
{-# SPECIALIZE INLINE uint :: Parser Word   #-}
{-# SPECIALIZE INLINE uint :: Parser Word64 #-}
{-# SPECIALIZE INLINE uint :: Parser Word32 #-}
{-# SPECIALIZE INLINE uint :: Parser Word16 #-}
{-# SPECIALIZE INLINE uint :: Parser Word8  #-}
{-# SPECIALIZE INLINE uint :: Parser Integer #-}
uint = "Z.Data.Parser.Numeric.uint" <?> decLoop 0 <$> P.takeWhile1 isDigit

-- | decode digits sequence within an array.
decLoop :: Integral a
        => a    -- ^ accumulator, usually start from 0
        -> V.Bytes
        -> a
{-# INLINE decLoop #-}
decLoop = V.foldl' step
  where step a w = a * 10 + fromIntegral (w - 48)

-- | decode digits sequence within an array.
--
-- A fast version to decode 'Integer' using machine word as much as possible.
decLoopIntegerFast :: V.Bytes -> Integer
{-# INLINE decLoopIntegerFast #-}
decLoopIntegerFast bs
    | V.length bs <= WORD64_MAX_DIGITS_LEN = fromIntegral (decLoop @Word64 0 bs)
    | otherwise                            = decLoop @Integer 0 bs

-- | A fast digit predicate.
isDigit :: Word8 -> Bool
isDigit w = w - 48 <= 9
{-# INLINE isDigit #-}

-- | Parse a decimal number with an optional leading @\'+\'@ or @\'-\'@ sign
-- character.
int :: (Integral a) => Parser a
{-# INLINE int #-}
{-# SPECIALIZE INLINE int :: Parser Int    #-}
{-# SPECIALIZE INLINE int :: Parser Int64  #-}
{-# SPECIALIZE INLINE int :: Parser Int32  #-}
{-# SPECIALIZE INLINE int :: Parser Int16  #-}
{-# SPECIALIZE INLINE int :: Parser Int8   #-}
{-# SPECIALIZE INLINE int :: Parser Word   #-}
{-# SPECIALIZE INLINE int :: Parser Word64 #-}
{-# SPECIALIZE INLINE int :: Parser Word32 #-}
{-# SPECIALIZE INLINE int :: Parser Word16 #-}
{-# SPECIALIZE INLINE int :: Parser Word8  #-}
{-# SPECIALIZE INLINE int :: Parser Integer #-}
int = "Z.Data.Parser.Numeric.int" <?> do
    w <- P.peek
    if w == MINUS
    then P.skipWord8 *> (negate <$> uint')
    else if w == PLUS then P.skipWord8 *> uint' else uint'
  where
    -- strip uint's message
    uint' = decLoop 0 <$> P.takeWhile1 isDigit

-- | Parse a rational number.
--
-- The syntax accepted by this parser is the same as for 'double'.
--
-- /Note/: this parser is not safe for use with inputs from untrusted
-- sources.  An input with a suitably large exponent such as
-- @"1e1000000000"@ will cause a huge 'Integer' to be allocated,
-- resulting in what is effectively a denial-of-service attack.
--
-- In most cases, it is better to use 'double' or 'scientific'
-- instead.
--
rational :: (Fractional a) => Parser a
{-# INLINE rational #-}
rational = "Z.Data.Parser.Numeric.rational" <?> scientificallyInternal realToFrac

-- | Parse a rational number and round to 'Double'.
--
-- This parser accepts an optional leading sign character, followed by
-- at least one decimal digit.  The syntax similar to that accepted by
-- the 'read' function, with the exception that a trailing @\'.\'@ or
-- @\'e\'@ /not/ followed by a number is not consumed.
--
-- Examples with behaviour identical to 'read':
--
-- >parse_ double "3"     == ("", Right 3.0)
-- >parse_ double "3.1"   == ("", Right 3.1)
-- >parse_ double "3e4"   == ("", Right 30000.0)
-- >parse_ double "3.1e4" == ("", Right 31000.0)
--
-- >parse_ double ".3"    == (".3", Left ParserError)
-- >parse_ double "e3"    == ("e3", Left ParserError)
--
-- Examples of differences from 'read':
--
-- >parse_ double "3.foo" == (".foo", Right 3.0)
-- >parse_ double "3e"    == ("e",    Right 3.0)
-- >parse_ double "-3e"   == ("e",    Right -3.0)
--
-- This function does not accept string representations of \"NaN\" or
-- \"Infinity\".
--
double :: Parser Double
{-# INLINE double #-}
double = "Z.Data.Parser.Numeric.double" <?> scientificallyInternal Sci.toRealFloat

-- | Parse a rational number and round to 'Float'.
--
-- Single precision version of 'double'.
float :: Parser Float
{-# INLINE float #-}
float = "Z.Data.Parser.Numeric.float" <?> scientificallyInternal Sci.toRealFloat

-- | Parse a scientific number.
--
-- The syntax accepted by this parser is the same as for 'double'.
--
scientific :: Parser Sci.Scientific
{-# INLINE scientific #-}
scientific = "Z.Data.Parser.Numeric.scientific" <?> scientificallyInternal id

-- | Parse a scientific number and convert to result using a user supply function.
--
-- The syntax accepted by this parser is the same as for 'double'.
scientifically :: (Sci.Scientific -> a) -> Parser a
{-# INLINE scientifically #-}
scientifically h = "Z.Data.Parser.Numeric.scientifically" <?> scientificallyInternal h

-- | Strip message version.
scientificallyInternal :: (Sci.Scientific -> a) -> Parser a
{-# INLINE scientificallyInternal #-}
scientificallyInternal h = do
    !sign <- P.peek
    when (sign == PLUS || sign == MINUS) (P.skipWord8)
    !intPart <- P.takeWhile1 isDigit
    -- backtrack here is neccessary to avoid eating extra dot or e
    -- attoparsec is doing it wrong here: https://github.com/bos/attoparsec/issues/112
    !sci <- (do
        -- during number parsing we want to use machine word as much as possible
        -- so as long as range permit, we use Word64 instead of final Integer
        !fracPart <- P.word8 DOT *> P.takeWhile1 isDigit
        let !ilen = V.length intPart
            !flen = V.length fracPart
            !base =
                if ilen + flen <= WORD64_MAX_DIGITS_LEN
                then fromIntegral (decLoop @Word64 (decLoop @Word64 0 intPart) fracPart)
                else
                    let i = decLoopIntegerFast intPart
                        f = decLoopIntegerFast fracPart
                    in i * 10 ^ flen + f
        parseE base flen) <|> (parseE (decLoopIntegerFast intPart) 0)

    pure $! if sign /= MINUS then h sci else h (negate sci)
  where
    {-# INLINE parseE #-}
    parseE c e =
        (do _ <- P.satisfy (\w -> w ==  LITTLE_E || w == BIG_E)
            Sci.scientific c . subtract e <$> int) <|> pure (Sci.scientific c (negate e))

--------------------------------------------------------------------------------

-- | Parse a rational number.
--
-- The syntax accepted by this parser is the same as for 'double''.
--
-- /Note/: this parser is not safe for use with inputs from untrusted
-- sources.  An input with a suitably large exponent such as
-- @"1e1000000000"@ will cause a huge 'Integer' to be allocated,
-- resulting in what is effectively a denial-of-service attack.
--
-- In most cases, it is better to use 'double'' or 'scientific''
-- instead.
--
rational' :: (Fractional a) => Parser a
{-# INLINE rational' #-}
rational' = "Z.Data.Parser.Numeric.rational'" <?> scientificallyInternal' realToFrac

-- | More strict number parsing(rfc8259).
--
-- 'scientific' support parse @2314.@ and @21321exyz@ without eating extra dot or @e@ via
-- backtrack, this is not allowed in some strict grammer such as JSON, so we make an
-- non-backtrack strict number parser separately using LL(1) lookahead. This parser also
-- agree with 'read' on extra dot or e handling:
--
-- >parse_ double "3.foo" == Left ParseError
-- >parse_ double "3e"    == Left ParseError
--
-- Leading zeros or @+@ sign is also not allowed:
--
-- >parse_ double "+3.14" == Left ParseError
-- >parse_ double "0014" == Left ParseError
--
-- If you have a similar grammer, you can use this parser to save considerable time.
--
-- @
--      number = [ minus ] int [ frac ] [ exp ]
--      decimal-point = %x2E       ; .
--      digit1-9 = %x31-39         ; 1-9
--      e = %x65 / %x45            ; e E
--      exp = e [ minus / plus ] 1*DIGIT
--      frac = decimal-point 1*DIGIT
-- @
--
-- This function does not accept string representations of \"NaN\" or
-- \"Infinity\".
-- reference: https://tools.ietf.org/html/rfc8259#section-6
double' :: Parser Double
{-# INLINE double' #-}
double' = "Z.Data.Parser.Numeric.double'" <?> scientificallyInternal' Sci.toRealFloat

-- | Parse a rational number and round to 'Float' using stricter grammer.
--
-- Single precision version of 'double''.
float' :: Parser Float
{-# INLINE float' #-}
float' = "Z.Data.Parser.Numeric.float'" <?> scientificallyInternal' Sci.toRealFloat

-- | Parse a scientific number.
--
-- The syntax accepted by this parser is the same as for 'double''.
scientific' :: Parser Sci.Scientific
{-# INLINE scientific' #-}
scientific' = "Z.Data.Parser.Numeric.scientific'" <?> scientificallyInternal' id

-- | Parse a scientific number and convert to result using a user supply function.
--
-- The syntax accepted by this parser is the same as for 'double''.
scientifically' :: (Sci.Scientific -> a) -> P.Parser a
{-# INLINE scientifically' #-}
scientifically' h = "Z.Data.Parser.Numeric.scientifically'" <?> scientificallyInternal' h

-- | Strip message version of scientifically'.
scientificallyInternal' :: (Sci.Scientific -> a) -> P.Parser a
{-# INLINE scientificallyInternal' #-}
scientificallyInternal' h = do
    !sign <- P.peek
    when (sign == MINUS) (P.skipWord8) -- no leading plus is allowed
    !intPart <- P.takeWhile1 isDigit
    when (V.length intPart > 1 && V.head intPart == C_0) (P.fail' "leading zeros are not allowed")
    mdot <- P.peekMaybe
    !sci <- case mdot of
        Just DOT -> do
            !fracPart <- P.skipWord8 *> P.takeWhile1 isDigit
            -- during number parsing we want to use machine word as much as possible
            -- so as long as range permit, we use Word64 instead of final Integer
            let !ilen = V.length intPart
                !flen = V.length fracPart
                !base =
                    if ilen + flen <= WORD64_MAX_DIGITS_LEN
                    then fromIntegral (decLoop @Word64 (decLoop @Word64 0 intPart) fracPart)
                    else
                        let i = decLoopIntegerFast intPart
                            f = decLoopIntegerFast fracPart
                        in i * 10 ^ flen + f
            parseE base flen
        _ -> parseE (decLoopIntegerFast intPart) 0
    pure $! if sign /= MINUS then h sci else h (negate sci)
  where
    {-# INLINE parseE #-}
    parseE !c !e = do
        me <- P.peekMaybe
        e' <- case me of
            Just ec | ec == LITTLE_E || ec == BIG_E -> P.skipWord8 *> int
            _ -> pure 0
        pure $! Sci.scientific c (e' - e)

--------------------------------------------------------------------------------

floatToScientific :: Float -> Sci.Scientific
{-# INLINE floatToScientific #-}
floatToScientific rf | rf < 0    = -(fromFloatingDigits (B.grisu3_sp (-rf)))
                     | rf == 0   = 0
                     | otherwise = fromFloatingDigits (B.grisu3_sp rf)

doubleToScientific :: Double -> Sci.Scientific
{-# INLINE doubleToScientific #-}
doubleToScientific rf | rf < 0    = -(fromFloatingDigits (B.grisu3 (-rf)))
                      | rf == 0   = 0
                      | otherwise = fromFloatingDigits (B.grisu3 rf)

fromFloatingDigits :: ([Int], Int) -> Sci.Scientific
{-# INLINE fromFloatingDigits #-}
fromFloatingDigits (digits, e) = go digits 0 0
  where
    -- There's no way a float or double has more digits a 'Int64' can't handle
    go :: [Int] -> Int64 -> Int -> Sci.Scientific
    go []     !c !n = Sci.scientific (fromIntegral c) (e - n)
    go (d:ds) !c !n = go ds (c * 10 + fromIntegral d) (n + 1)