Data/Attoparsec/ByteString/Char8.hs

{-# LANGUAGE BangPatterns, FlexibleInstances, TypeFamilies,
    TypeSynonymInstances, GADTs #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}

-- |
-- Module      :  Data.Attoparsec.ByteString.Char8
-- Copyright   :  Bryan O'Sullivan 2007-2011
-- License     :  BSD3
--
-- Maintainer  :  bos@serpentine.com
-- Stability   :  experimental
-- Portability :  unknown
--
-- Simple, efficient, character-oriented combinator parsing for
-- 'B.ByteString' strings, loosely based on the Parsec library.

module Data.Attoparsec.ByteString.Char8
    (
    -- * Character encodings
    -- $encodings

    -- * Parser types
      Parser
    , A.Result
    , A.IResult(..)
    , I.compareResults

    -- * Running parsers
    , A.parse
    , A.feed
    , A.parseOnly
    , A.parseTest
    , A.parseWith

    -- ** Result conversion
    , A.maybeResult
    , A.eitherResult

    -- * Combinators
    , (I.<?>)
    , I.try
    , module Data.Attoparsec.Combinator

    -- * Parsing individual characters
    , char
    , char8
    , anyChar
    , notChar
    , peekChar
    , satisfy

    -- ** Special character parsers
    , digit
    , letter_iso8859_15
    , letter_ascii
    , space

    -- ** Fast predicates
    , isDigit
    , isDigit_w8
    , isAlpha_iso8859_15
    , isAlpha_ascii
    , isSpace
    , isSpace_w8

    -- *** Character classes
    , inClass
    , notInClass

    -- * Efficient string handling
    , I.string
    , stringCI
    , skipSpace
    , skipWhile
    , I.take
    , scan
    , takeWhile
    , takeWhile1
    , takeTill

    -- ** String combinators
    -- $specalt
    , (.*>)
    , (<*.)

    -- ** Consume all remaining input
    , I.takeByteString
    , I.takeLazyByteString

    -- * Text parsing
    , I.endOfLine
    , isEndOfLine
    , isHorizontalSpace

    -- * Numeric parsers
    , decimal
    , hexadecimal
    , signed
    , double
    , Number(..)
    , number
    , rational

    -- * State observation and manipulation functions
    , I.endOfInput
    , I.atEnd
    ) where

import Control.Applicative ((*>), (<*), (<$>), (<|>))
import Data.Attoparsec.ByteString.FastSet (charClass, memberChar)
import Data.Attoparsec.ByteString.Internal (Parser, (<?>))
import Data.Attoparsec.Combinator
import Data.Attoparsec.Number (Number(..))
import Data.Bits (Bits, (.|.), shiftL)
import Data.ByteString.Internal (c2w, w2c)
import Data.Int (Int8, Int16, Int32, Int64)
import Data.Ratio ((%))
import Data.String (IsString(..))
import Data.Word (Word8, Word16, Word32, Word64, Word)
import Prelude hiding (takeWhile)
import qualified Data.Attoparsec.ByteString as A
import qualified Data.Attoparsec.ByteString.Internal as I
import qualified Data.Attoparsec.Internal as I
import qualified Data.ByteString as B8
import qualified Data.ByteString.Char8 as B

instance (a ~ B.ByteString) => IsString (Parser a) where
    fromString = I.string . B.pack

-- $encodings
--
-- This module is intended for parsing text that is
-- represented using an 8-bit character set, e.g. ASCII or
-- ISO-8859-15.  It /does not/ make any attempt to deal with character
-- encodings, multibyte characters, or wide characters.  In
-- particular, all attempts to use characters above code point U+00FF
-- will give wrong answers.
--
-- Code points below U+0100 are simply translated to and from their
-- numeric values, so e.g. the code point U+00A4 becomes the byte
-- @0xA4@ (which is the Euro symbol in ISO-8859-15, but the generic
-- currency sign in ISO-8859-1).  Haskell 'Char' values above U+00FF
-- are truncated, so e.g. U+1D6B7 is truncated to the byte @0xB7@.

-- ASCII-specific but fast, oh yes.
toLower :: Word8 -> Word8
toLower w | w >= 65 && w <= 90 = w + 32
          | otherwise          = w

-- | Satisfy a literal string, ignoring case.
stringCI :: B.ByteString -> Parser B.ByteString
stringCI = I.stringTransform (B8.map toLower)
{-# INLINE stringCI #-}

-- | Consume input as long as the predicate returns 'True', and return
-- the consumed input.
--
-- This parser requires the predicate to succeed on at least one byte
-- of input: it will fail if the predicate never returns 'True' or if
-- there is no input left.
takeWhile1 :: (Char -> Bool) -> Parser B.ByteString
takeWhile1 p = I.takeWhile1 (p . w2c)
{-# INLINE takeWhile1 #-}

-- | The parser @satisfy p@ succeeds for any byte for which the
-- predicate @p@ returns 'True'. Returns the byte that is actually
-- parsed.
--
-- >digit = satisfy isDigit
-- >    where isDigit c = c >= '0' && c <= '9'
satisfy :: (Char -> Bool) -> Parser Char
satisfy = I.satisfyWith w2c
{-# INLINE satisfy #-}

-- | Match a letter, in the ISO-8859-15 encoding.
letter_iso8859_15 :: Parser Char
letter_iso8859_15 = satisfy isAlpha_iso8859_15 <?> "letter_iso8859_15"
{-# INLINE letter_iso8859_15 #-}

-- | Match a letter, in the ASCII encoding.
letter_ascii :: Parser Char
letter_ascii = satisfy isAlpha_ascii <?> "letter_ascii"
{-# INLINE letter_ascii #-}

-- | A fast alphabetic predicate for the ISO-8859-15 encoding
--
-- /Note/: For all character encodings other than ISO-8859-15, and
-- almost all Unicode code points above U+00A3, this predicate gives
-- /wrong answers/.
isAlpha_iso8859_15 :: Char -> Bool
isAlpha_iso8859_15 c = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
                       (c >= '\166' && moby c)
  where moby = notInClass "\167\169\171-\179\182\183\185\187\191\215\247"
        {-# NOINLINE moby #-}
{-# INLINE isAlpha_iso8859_15 #-}

-- | A fast alphabetic predicate for the ASCII encoding
--
-- /Note/: For all character encodings other than ASCII, and
-- almost all Unicode code points above U+007F, this predicate gives
-- /wrong answers/.
isAlpha_ascii :: Char -> Bool
isAlpha_ascii c = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
{-# INLINE isAlpha_ascii #-}

-- | Parse a single digit.
digit :: Parser Char
digit = satisfy isDigit <?> "digit"
{-# INLINE digit #-}

-- | A fast digit predicate.
isDigit :: Char -> Bool
isDigit c = c >= '0' && c <= '9'
{-# INLINE isDigit #-}

-- | A fast digit predicate.
isDigit_w8 :: Word8 -> Bool
isDigit_w8 w = w >= 48 && w <= 57
{-# INLINE isDigit_w8 #-}

-- | Match any character.
anyChar :: Parser Char
anyChar = satisfy $ const True
{-# INLINE anyChar #-}

-- | Match any character. Returns 'Nothing' if end of input has been
-- reached. Does not consume any input.
--
-- /Note/: Because this parser does not fail, do not use it with
-- combinators such as 'many', because such parsers loop until a
-- failure occurs.  Careless use will thus result in an infinite loop.
peekChar :: Parser (Maybe Char)
peekChar = (fmap w2c) `fmap` I.peekWord8
{-# INLINE peekChar #-}

-- | Fast predicate for matching ASCII space characters.
--
-- /Note/: This predicate only gives correct answers for the ASCII
-- encoding.  For instance, it does not recognise U+00A0 (non-breaking
-- space) as a space character, even though it is a valid ISO-8859-15
-- byte. For a Unicode-aware and only slightly slower predicate,
-- use 'Data.Char.isSpace'
isSpace :: Char -> Bool
isSpace c = (c == ' ') || ('\t' <= c && c <= '\r')
{-# INLINE isSpace #-}

-- | Fast 'Word8' predicate for matching ASCII space characters.
isSpace_w8 :: Word8 -> Bool
isSpace_w8 w = (w == 32) || (9 <= w && w <= 13)
{-# INLINE isSpace_w8 #-}


-- | Parse a space character.
--
-- /Note/: This parser only gives correct answers for the ASCII
-- encoding.  For instance, it does not recognise U+00A0 (non-breaking
-- space) as a space character, even though it is a valid ISO-8859-15
-- byte.
space :: Parser Char
space = satisfy isSpace <?> "space"
{-# INLINE space #-}

-- | Match a specific character.
char :: Char -> Parser Char
char c = satisfy (== c) <?> [c]
{-# INLINE char #-}

-- | Match a specific character, but return its 'Word8' value.
char8 :: Char -> Parser Word8
char8 c = I.satisfy (== c2w c) <?> [c]
{-# INLINE char8 #-}

-- | Match any character except the given one.
notChar :: Char -> Parser Char
notChar c = satisfy (/= c) <?> "not " ++ [c]
{-# INLINE notChar #-}

-- | Match any character in a set.
--
-- >vowel = inClass "aeiou"
--
-- Range notation is supported.
--
-- >halfAlphabet = inClass "a-nA-N"
--
-- To add a literal \'-\' to a set, place it at the beginning or end
-- of the string.
inClass :: String -> Char -> Bool
inClass s = (`memberChar` mySet)
    where mySet = charClass s
{-# INLINE inClass #-}

-- | Match any character not in a set.
notInClass :: String -> Char -> Bool
notInClass s = not . inClass s
{-# INLINE notInClass #-}

-- | Consume input as long as the predicate returns 'True', and return
-- the consumed input.
--
-- This parser does not fail.  It will return an empty string if the
-- predicate returns 'False' on the first byte of input.
--
-- /Note/: Because this parser does not fail, do not use it with
-- combinators such as 'many', because such parsers loop until a
-- failure occurs.  Careless use will thus result in an infinite loop.
takeWhile :: (Char -> Bool) -> Parser B.ByteString
takeWhile p = I.takeWhile (p . w2c)
{-# INLINE takeWhile #-}

-- | A stateful scanner.  The predicate consumes and transforms a
-- state argument, and each transformed state is passed to successive
-- invocations of the predicate on each byte of the input until one
-- returns 'Nothing' or the input ends.
--
-- This parser does not fail.  It will return an empty string if the
-- predicate returns 'Nothing' on the first byte of input.
--
-- /Note/: Because this parser does not fail, do not use it with
-- combinators such as 'many', because such parsers loop until a
-- failure occurs.  Careless use will thus result in an infinite loop.
scan :: s -> (s -> Char -> Maybe s) -> Parser B.ByteString
scan s0 p = I.scan s0 (\s -> p s . w2c)
{-# INLINE scan #-}

-- | Consume input as long as the predicate returns 'False'
-- (i.e. until it returns 'True'), and return the consumed input.
--
-- This parser does not fail.  It will return an empty string if the
-- predicate returns 'True' on the first byte of input.
--
-- /Note/: Because this parser does not fail, do not use it with
-- combinators such as 'many', because such parsers loop until a
-- failure occurs.  Careless use will thus result in an infinite loop.
takeTill :: (Char -> Bool) -> Parser B.ByteString
takeTill p = I.takeTill (p . w2c)
{-# INLINE takeTill #-}

-- | Skip past input for as long as the predicate returns 'True'.
skipWhile :: (Char -> Bool) -> Parser ()
skipWhile p = I.skipWhile (p . w2c)
{-# INLINE skipWhile #-}

-- | Skip over white space.
skipSpace :: Parser ()
skipSpace = I.skipWhile isSpace_w8
{-# INLINE skipSpace #-}

-- $specalt
--
-- The '.*>' and '<*.' combinators are intended for use with the
-- @OverloadedStrings@ language extension.  They simplify the common
-- task of matching a statically known string, then immediately
-- parsing something else.
--
-- An example makes this easier to understand:
--
-- @{-\# LANGUAGE OverloadedStrings #-}
--
-- shoeSize = \"Shoe size: \" '.*>' 'decimal'
-- @
--
-- If we were to try to use '*>' above instead, the type checker would
-- not be able to tell which 'IsString' instance to use for the text
-- in quotes.  We would have to be explicit, using either a type
-- signature or the 'I.string' parser.

-- | Type-specialized version of '*>' for 'B.ByteString'.
(.*>) :: B.ByteString -> Parser a -> Parser a
s .*> f = I.string s *> f

-- | Type-specialized version of '<*' for 'B.ByteString'.
(<*.) :: Parser a -> B.ByteString -> Parser a
f <*. s = f <* I.string s

-- | A predicate that matches either a carriage return @\'\\r\'@ or
-- newline @\'\\n\'@ character.
isEndOfLine :: Word8 -> Bool
isEndOfLine w = w == 13 || w == 10
{-# INLINE isEndOfLine #-}

-- | A predicate that matches either a space @\' \'@ or horizontal tab
-- @\'\\t\'@ character.
isHorizontalSpace :: Word8 -> Bool
isHorizontalSpace w = w == 32 || w == 9
{-# INLINE isHorizontalSpace #-}

-- | Parse and decode an unsigned hexadecimal number.  The hex digits
-- @\'a\'@ through @\'f\'@ may be upper or lower case.
--
-- This parser does not accept a leading @\"0x\"@ string.
hexadecimal :: (Integral a, Bits a) => Parser a
hexadecimal = B8.foldl' step 0 `fmap` I.takeWhile1 isHexDigit
  where
    isHexDigit w = (w >= 48 && w <= 57) ||
                   (w >= 97 && w <= 102) ||
                   (w >= 65 && w <= 70)
    step a w | w >= 48 && w <= 57  = (a `shiftL` 4) .|. fromIntegral (w - 48)
             | w >= 97             = (a `shiftL` 4) .|. fromIntegral (w - 87)
             | otherwise           = (a `shiftL` 4) .|. fromIntegral (w - 55)
{-# SPECIALISE hexadecimal :: Parser Int #-}
{-# SPECIALISE hexadecimal :: Parser Int8 #-}
{-# SPECIALISE hexadecimal :: Parser Int16 #-}
{-# SPECIALISE hexadecimal :: Parser Int32 #-}
{-# SPECIALISE hexadecimal :: Parser Int64 #-}
{-# SPECIALISE hexadecimal :: Parser Integer #-}
{-# SPECIALISE hexadecimal :: Parser Word #-}
{-# SPECIALISE hexadecimal :: Parser Word8 #-}
{-# SPECIALISE hexadecimal :: Parser Word16 #-}
{-# SPECIALISE hexadecimal :: Parser Word32 #-}
{-# SPECIALISE hexadecimal :: Parser Word64 #-}

-- | Parse and decode an unsigned decimal number.
decimal :: Integral a => Parser a
decimal = B8.foldl' step 0 `fmap` I.takeWhile1 isDig
  where isDig w  = w >= 48 && w <= 57
        step a w = a * 10 + fromIntegral (w - 48)
{-# SPECIALISE decimal :: Parser Int #-}
{-# SPECIALISE decimal :: Parser Int8 #-}
{-# SPECIALISE decimal :: Parser Int16 #-}
{-# SPECIALISE decimal :: Parser Int32 #-}
{-# SPECIALISE decimal :: Parser Int64 #-}
{-# SPECIALISE decimal :: Parser Integer #-}
{-# SPECIALISE decimal :: Parser Word #-}
{-# SPECIALISE decimal :: Parser Word8 #-}
{-# SPECIALISE decimal :: Parser Word16 #-}
{-# SPECIALISE decimal :: Parser Word32 #-}
{-# SPECIALISE decimal :: Parser Word64 #-}

-- | Parse a number with an optional leading @\'+\'@ or @\'-\'@ sign
-- character.
signed :: Num a => Parser a -> Parser a
{-# SPECIALISE signed :: Parser Int -> Parser Int #-}
{-# SPECIALISE signed :: Parser Int8 -> Parser Int8 #-}
{-# SPECIALISE signed :: Parser Int16 -> Parser Int16 #-}
{-# SPECIALISE signed :: Parser Int32 -> Parser Int32 #-}
{-# SPECIALISE signed :: Parser Int64 -> Parser Int64 #-}
{-# SPECIALISE signed :: Parser Integer -> Parser Integer #-}
signed p = (negate <$> (char8 '-' *> p))
       <|> (char8 '+' *> p)
       <|> p

-- | Parse a rational number.
--
-- This parser accepts an optional leading sign character, followed by
-- at least one decimal digit.  The syntax similar to that accepted by
-- the 'read' function, with the exception that a trailing @\'.\'@ or
-- @\'e\'@ /not/ followed by a number is not consumed.
--
-- Examples with behaviour identical to 'read', if you feed an empty
-- continuation to the first result:
--
-- >rational "3"     == Done 3.0 ""
-- >rational "3.1"   == Done 3.1 ""
-- >rational "3e4"   == Done 30000.0 ""
-- >rational "3.1e4" == Done 31000.0, ""
--
-- Examples with behaviour identical to 'read':
--
-- >rational ".3"    == Fail "input does not start with a digit"
-- >rational "e3"    == Fail "input does not start with a digit"
--
-- Examples of differences from 'read':
--
-- >rational "3.foo" == Done 3.0 ".foo"
-- >rational "3e"    == Done 3.0 "e"
--
-- This function does not accept string representations of \"NaN\" or
-- \"Infinity\".
rational :: Fractional a => Parser a
{-# SPECIALIZE rational :: Parser Double #-}
{-# SPECIALIZE rational :: Parser Float #-}
{-# SPECIALIZE rational :: Parser Rational #-}
rational = floaty $ \real frac fracDenom -> fromRational $
                     real % 1 + frac % fracDenom

-- | Parse a rational number.
--
-- The syntax accepted by this parser is the same as for 'rational'.
--
-- /Note/: This function is almost ten times faster than 'rational',
-- but is slightly less accurate.
--
-- The 'Double' type supports about 16 decimal places of accuracy.
-- For 94.2% of numbers, this function and 'rational' give identical
-- results, but for the remaining 5.8%, this function loses precision
-- around the 15th decimal place.  For 0.001% of numbers, this
-- function will lose precision at the 13th or 14th decimal place.
--
-- This function does not accept string representations of \"NaN\" or
-- \"Infinity\".
double :: Parser Double
double = floaty asDouble

asDouble :: Integer -> Integer -> Integer -> Double
asDouble real frac fracDenom =
    fromIntegral real + fromIntegral frac / fromIntegral fracDenom
{-# INLINE asDouble #-}

-- | Parse a number, attempting to preserve both speed and precision.
--
-- The syntax accepted by this parser is the same as for 'rational'.
--
-- /Note/: This function is almost ten times faster than 'rational'.
-- On integral inputs, it gives perfectly accurate answers, and on
-- floating point inputs, it is slightly less accurate than
-- 'rational'.
--
-- This function does not accept string representations of \"NaN\" or
-- \"Infinity\".
number :: Parser Number
number = floaty $ \real frac fracDenom ->
         if frac == 0 && fracDenom == 0
         then I real
         else D (asDouble real frac fracDenom)
{-# INLINE number #-}

data T = T !Integer !Int

floaty :: Fractional a => (Integer -> Integer -> Integer -> a) -> Parser a
{-# INLINE floaty #-}
floaty f = do
  let minus = 45
      plus  = 43
  !positive <- ((== plus) <$> I.satisfy (\c -> c == minus || c == plus)) <|>
               return True
  real <- decimal
  let tryFraction = do
        let dot = 46
        _ <- I.satisfy (==dot)
        ds <- I.takeWhile isDigit_w8
        case I.parseOnly decimal ds of
                Right n -> return $ T n (B.length ds)
                _       -> fail "no digits after decimal"
  T fraction fracDigits <- tryFraction <|> return (T 0 0)
  let littleE = 101
      bigE    = 69
      e w = w == littleE || w == bigE
  power <- (I.satisfy e *> signed decimal) <|> return (0::Int)
  let n = if fracDigits == 0
          then if power == 0
               then fromIntegral real
               else fromIntegral real * (10 ^^ power)
          else if power == 0
               then f real fraction (10 ^ fracDigits)
               else f real fraction (10 ^ fracDigits) * (10 ^^ power)
  return $ if positive
           then n
           else -n