{-# LANGUAGE BangPatterns  #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE Safe          #-}

-- |
-- Copyright: © Oren Ben-Kiki 2007,
--            © Herbert Valerio Riedel 2015-2018
-- SPDX-License-Identifier: GPL-2.0-or-later
--
-- UTF decoding
--
-- This really should be factored out to the standard libraries. Since it isn't
-- there, we get to tailor it exactly to our needs. We use lazy byte strings as
-- input, which should give reasonable I\/O performance when reading large
-- files. The output is a normal 'Char' list which is easy to work with and
-- should be efficient enough as long as the 'Data.YAML.Token.Parser' does its job right.
--
module Data.YAML.Token.Encoding
  ( decode
  , Encoding(..)
  ) where

import qualified Data.ByteString.Lazy       as BL
import qualified Data.ByteString.Lazy.Char8 as BLC

import           Util

-- | Denotes the /Unicode Transformation Format/ (UTF) used for serializing the YAML document
data Encoding = UTF8    -- ^ UTF-8 encoding (or ASCII)
              | UTF16LE -- ^ UTF-16 little endian
              | UTF16BE -- ^ UTF-16 big endian
              | UTF32LE -- ^ UTF-32 little endian
              | UTF32BE -- ^ UTF-32 big endian
              deriving (Encoding -> Encoding -> Bool
(Encoding -> Encoding -> Bool)
-> (Encoding -> Encoding -> Bool) -> Eq Encoding
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Encoding -> Encoding -> Bool
$c/= :: Encoding -> Encoding -> Bool
== :: Encoding -> Encoding -> Bool
$c== :: Encoding -> Encoding -> Bool
Eq,(forall x. Encoding -> Rep Encoding x)
-> (forall x. Rep Encoding x -> Encoding) -> Generic Encoding
forall x. Rep Encoding x -> Encoding
forall x. Encoding -> Rep Encoding x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cto :: forall x. Rep Encoding x -> Encoding
$cfrom :: forall x. Encoding -> Rep Encoding x
Generic)

-- | @show encoding@ converts an 'Encoding' to the encoding name (with a "-")
-- as used by most programs.
instance Show Encoding where
    show :: Encoding -> String
show Encoding
UTF8    = String
"UTF-8"
    show Encoding
UTF16LE = String
"UTF-16LE"
    show Encoding
UTF16BE = String
"UTF-16BE"
    show Encoding
UTF32LE = String
"UTF-32LE"
    show Encoding
UTF32BE = String
"UTF-32BE"

-- | @since 0.2.0
instance NFData Encoding where rnf :: Encoding -> ()
rnf !Encoding
_ = ()

-- | @'decode' bytes@ automatically detects the 'Encoding' used and converts the
-- /bytes/ to Unicode characters, with byte offsets. Note the offset is for
-- past end of the character, not its beginning.
decode :: BLC.ByteString -> (Encoding, [(Int, Char)])
decode :: ByteString -> (Encoding, [(Int, Char)])
decode ByteString
text = (Encoding
encoding, Encoding -> ByteString -> [(Int, Char)]
undoEncoding Encoding
encoding ByteString
text)
  where
    encoding :: Encoding
encoding = [Word8] -> Encoding
detectEncoding ([Word8] -> Encoding) -> [Word8] -> Encoding
forall a b. (a -> b) -> a -> b
$ ByteString -> [Word8]
BL.unpack (ByteString -> [Word8]) -> ByteString -> [Word8]
forall a b. (a -> b) -> a -> b
$ Int64 -> ByteString -> ByteString
BL.take Int64
4 ByteString
text

-- | @'detectEncoding' text@ examines the first few chars (bytes) of the /text/
-- to deduce the Unicode encoding used according to the YAML spec.
detectEncoding :: [Word8] -> Encoding
detectEncoding :: [Word8] -> Encoding
detectEncoding [Word8]
text = case [Word8]
text of
    Word8
0x00 : Word8
0x00 : Word8
0xFE : Word8
0xFF : [Word8]
_ -> Encoding
UTF32BE
    Word8
0x00 : Word8
0x00 : Word8
0x00 : Word8
_    : [Word8]
_ -> Encoding
UTF32BE
    Word8
0xFF : Word8
0xFE : Word8
0x00 : Word8
0x00 : [Word8]
_ -> Encoding
UTF32LE
    Word8
_    : Word8
0x00 : Word8
0x00 : Word8
0x00 : [Word8]
_ -> Encoding
UTF32LE
    Word8
0xFE : Word8
0xFF : [Word8]
_               -> Encoding
UTF16BE
    Word8
0x00 : Word8
_    : [Word8]
_               -> Encoding
UTF16BE
    Word8
0xFF : Word8
0xFE : [Word8]
_               -> Encoding
UTF16LE
    Word8
_    : Word8
0x00 : [Word8]
_               -> Encoding
UTF16LE
    Word8
0xEF : Word8
0xBB : Word8
0xBF : [Word8]
_        -> Encoding
UTF8
    [Word8]
_                             -> Encoding
UTF8

-- | @undoEncoding encoding bytes@ converts a /bytes/ stream to Unicode
-- characters according to the /encoding/.
undoEncoding :: Encoding -> BLC.ByteString -> [(Int, Char)]
undoEncoding :: Encoding -> ByteString -> [(Int, Char)]
undoEncoding Encoding
encoding ByteString
bytes =
  case Encoding
encoding of
    Encoding
UTF8    -> ByteString -> Int -> [(Int, Char)]
undoUTF8 ByteString
bytes Int
0
    Encoding
UTF16LE -> [(Int, Char)] -> [(Int, Char)]
combinePairs ([(Int, Char)] -> [(Int, Char)]) -> [(Int, Char)] -> [(Int, Char)]
forall a b. (a -> b) -> a -> b
$ ByteString -> Int -> [(Int, Char)]
undoUTF16LE ByteString
bytes Int
0
    Encoding
UTF16BE -> [(Int, Char)] -> [(Int, Char)]
combinePairs ([(Int, Char)] -> [(Int, Char)]) -> [(Int, Char)] -> [(Int, Char)]
forall a b. (a -> b) -> a -> b
$ ByteString -> Int -> [(Int, Char)]
undoUTF16BE ByteString
bytes Int
0
    Encoding
UTF32LE -> [(Int, Char)] -> [(Int, Char)]
forall a. [(a, Char)] -> [(a, Char)]
validateScalars ([(Int, Char)] -> [(Int, Char)]) -> [(Int, Char)] -> [(Int, Char)]
forall a b. (a -> b) -> a -> b
$ ByteString -> Int -> [(Int, Char)]
undoUTF32LE ByteString
bytes Int
0
    Encoding
UTF32BE -> [(Int, Char)] -> [(Int, Char)]
forall a. [(a, Char)] -> [(a, Char)]
validateScalars ([(Int, Char)] -> [(Int, Char)]) -> [(Int, Char)] -> [(Int, Char)]
forall a b. (a -> b) -> a -> b
$ ByteString -> Int -> [(Int, Char)]
undoUTF32BE ByteString
bytes Int
0
  where
    validateScalars :: [(a, Char)] -> [(a, Char)]
validateScalars [] = []
    validateScalars (x :: (a, Char)
x@(a
_,Char
c):[(a, Char)]
rest)
      | Char
'\xD800' Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
c, Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\xDFFF' = String -> [(a, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-32 stream contains invalid surrogate code-point"
      | Bool
otherwise                    = (a, Char)
x (a, Char) -> [(a, Char)] -> [(a, Char)]
forall a. a -> [a] -> [a]
: [(a, Char)] -> [(a, Char)]
validateScalars [(a, Char)]
rest

-- ** UTF-32 decoding

-- | @hasFewerThan bytes n@ checks whether there are fewer than /n/ /bytes/
-- left to read.
hasFewerThan :: Int -> BLC.ByteString -> Bool
hasFewerThan :: Int -> ByteString -> Bool
hasFewerThan Int
n ByteString
bytes
  | Int
n Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
1 = ByteString -> Bool
BLC.null ByteString
bytes
  | Int
n  Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
1 = ByteString -> Bool
BLC.null ByteString
bytes Bool -> Bool -> Bool
|| Int -> ByteString -> Bool
hasFewerThan (Int
n Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1) (ByteString -> ByteString
BLC.tail ByteString
bytes)
  | Bool
otherwise = Bool
False

-- | @undoUTF32LE bytes offset@ decoded a UTF-32LE /bytes/ stream to Unicode
-- chars.
undoUTF32LE :: BLC.ByteString -> Int -> [(Int, Char)]
undoUTF32LE :: ByteString -> Int -> [(Int, Char)]
undoUTF32LE ByteString
bytes Int
offset
  | ByteString -> Bool
BLC.null ByteString
bytes = []
  | Int -> ByteString -> Bool
hasFewerThan Int
4 ByteString
bytes = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-32LE input contains invalid number of bytes"
  | Bool
otherwise = let first :: Char
first    = ByteString -> Char
BLC.head ByteString
bytes
                    bytes' :: ByteString
bytes'   = ByteString -> ByteString
BLC.tail ByteString
bytes
                    second :: Char
second   = ByteString -> Char
BLC.head ByteString
bytes'
                    bytes'' :: ByteString
bytes''  = ByteString -> ByteString
BLC.tail ByteString
bytes'
                    third :: Char
third    = ByteString -> Char
BLC.head ByteString
bytes''
                    bytes''' :: ByteString
bytes''' = ByteString -> ByteString
BLC.tail ByteString
bytes''
                    fourth :: Char
fourth   = ByteString -> Char
BLC.head ByteString
bytes'''
                    rest :: ByteString
rest     = ByteString -> ByteString
BLC.tail ByteString
bytes'''
                in (Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
4,
                    Int -> Char
chr (Int -> Char) -> Int -> Char
forall a b. (a -> b) -> a -> b
$        Char -> Int
ord Char
first
                        Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
256 Int -> Int -> Int
forall a. Num a => a -> a -> a
* (Char -> Int
ord Char
second
                        Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
256 Int -> Int -> Int
forall a. Num a => a -> a -> a
* (Char -> Int
ord Char
third
                        Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
256 Int -> Int -> Int
forall a. Num a => a -> a -> a
*  Char -> Int
ord Char
fourth)))(Int, Char) -> [(Int, Char)] -> [(Int, Char)]
forall a. a -> [a] -> [a]
:(ByteString -> Int -> [(Int, Char)]
undoUTF32LE ByteString
rest (Int -> [(Int, Char)]) -> Int -> [(Int, Char)]
forall a b. (a -> b) -> a -> b
$ Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
4)

-- | @undoUTF32BE bytes offset@ decoded a UTF-32BE /bytes/ stream to Unicode
-- chars.
undoUTF32BE :: BLC.ByteString -> Int -> [(Int, Char)]
undoUTF32BE :: ByteString -> Int -> [(Int, Char)]
undoUTF32BE ByteString
bytes Int
offset
  | ByteString -> Bool
BLC.null ByteString
bytes = []
  | Int -> ByteString -> Bool
hasFewerThan Int
4 ByteString
bytes = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-32BE input contains invalid number of bytes"
  | Bool
otherwise = let first :: Char
first    = ByteString -> Char
BLC.head ByteString
bytes
                    bytes' :: ByteString
bytes'   = ByteString -> ByteString
BLC.tail ByteString
bytes
                    second :: Char
second   = ByteString -> Char
BLC.head ByteString
bytes'
                    bytes'' :: ByteString
bytes''  = ByteString -> ByteString
BLC.tail ByteString
bytes'
                    third :: Char
third    = ByteString -> Char
BLC.head ByteString
bytes''
                    bytes''' :: ByteString
bytes''' = ByteString -> ByteString
BLC.tail ByteString
bytes''
                    fourth :: Char
fourth   = ByteString -> Char
BLC.head ByteString
bytes'''
                    rest :: ByteString
rest     = ByteString -> ByteString
BLC.tail ByteString
bytes'''
                in (Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
4,
                    Int -> Char
chr (Int -> Char) -> Int -> Char
forall a b. (a -> b) -> a -> b
$        Char -> Int
ord Char
fourth
                        Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
256 Int -> Int -> Int
forall a. Num a => a -> a -> a
* (Char -> Int
ord Char
third
                        Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
256 Int -> Int -> Int
forall a. Num a => a -> a -> a
* (Char -> Int
ord Char
second
                        Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
256 Int -> Int -> Int
forall a. Num a => a -> a -> a
*  Char -> Int
ord Char
first)))(Int, Char) -> [(Int, Char)] -> [(Int, Char)]
forall a. a -> [a] -> [a]
:(ByteString -> Int -> [(Int, Char)]
undoUTF32BE ByteString
rest (Int -> [(Int, Char)]) -> Int -> [(Int, Char)]
forall a b. (a -> b) -> a -> b
$ Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
4)

-- ** UTF-16 decoding

-- | @combinePairs chars@ converts each pair of UTF-16 surrogate characters to a
-- single Unicode character.
combinePairs :: [(Int, Char)] -> [(Int, Char)]
combinePairs :: [(Int, Char)] -> [(Int, Char)]
combinePairs []                          = []
combinePairs (head' :: (Int, Char)
head'@(Int
_, Char
head_char):[(Int, Char)]
tail')
  | Char
'\xD800' Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
head_char Bool -> Bool -> Bool
&& Char
head_char Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\xDBFF' = (Int, Char) -> [(Int, Char)] -> [(Int, Char)]
combineLead (Int, Char)
head' [(Int, Char)]
tail'
  | Char
'\xDC00' Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
head_char Bool -> Bool -> Bool
&& Char
head_char Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\xDFFF' = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-16 contains trail surrogate without lead surrogate"
  | Bool
otherwise                                      = (Int, Char)
head'(Int, Char) -> [(Int, Char)] -> [(Int, Char)]
forall a. a -> [a] -> [a]
:[(Int, Char)] -> [(Int, Char)]
combinePairs [(Int, Char)]
tail'

-- | @combineLead lead rest@ combines the /lead/ surrogate with the head of the
-- /rest/ of the input chars, assumed to be a /trail/ surrogate, and continues
-- combining surrogate pairs.
combineLead :: (Int, Char) -> [(Int, Char)] -> [(Int, Char)]
combineLead :: (Int, Char) -> [(Int, Char)] -> [(Int, Char)]
combineLead (Int, Char)
_lead []                                 = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-16 contains lead surrogate as final character"
combineLead (Int
_, Char
lead_char) ((Int
trail_offset, Char
trail_char):[(Int, Char)]
rest)
  | Char
'\xDC00' Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
trail_char Bool -> Bool -> Bool
&& Char
trail_char Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\xDFFF' = (Int
trail_offset, Char -> Char -> Char
combineSurrogates Char
lead_char Char
trail_char)(Int, Char) -> [(Int, Char)] -> [(Int, Char)]
forall a. a -> [a] -> [a]
:[(Int, Char)] -> [(Int, Char)]
combinePairs [(Int, Char)]
rest
  | Bool
otherwise                                        = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-16 contains lead surrogate without trail surrogate"

-- | @surrogateOffset@ is copied from the Unicode FAQs.
surrogateOffset :: Int
surrogateOffset :: Int
surrogateOffset = Int
0x10000 Int -> Int -> Int
forall a. Num a => a -> a -> a
- (Int
0xD800 Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
1024) Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
0xDC00

-- | @combineSurrogates lead trail@ combines two UTF-16 surrogates into a single
-- Unicode character.
combineSurrogates :: Char -> Char -> Char
combineSurrogates :: Char -> Char -> Char
combineSurrogates Char
lead Char
trail = Int -> Char
chr (Int -> Char) -> Int -> Char
forall a b. (a -> b) -> a -> b
$ Char -> Int
ord Char
lead Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
1024 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Char -> Int
ord Char
trail Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
surrogateOffset

-- | @undoUTF18LE bytes offset@ decoded a UTF-16LE /bytes/ stream to Unicode
-- chars.
undoUTF16LE :: BLC.ByteString -> Int -> [(Int, Char)]
undoUTF16LE :: ByteString -> Int -> [(Int, Char)]
undoUTF16LE ByteString
bytes Int
offset
  | ByteString -> Bool
BLC.null ByteString
bytes = []
  | Int -> ByteString -> Bool
hasFewerThan Int
2 ByteString
bytes = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-16LE input contains odd number of bytes"
  | Bool
otherwise = let low :: Char
low    = ByteString -> Char
BLC.head ByteString
bytes
                    bytes' :: ByteString
bytes' = ByteString -> ByteString
BLC.tail ByteString
bytes
                    high :: Char
high   = ByteString -> Char
BLC.head ByteString
bytes'
                    rest :: ByteString
rest   = ByteString -> ByteString
BLC.tail ByteString
bytes'
                in (Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
2, Int -> Char
chr (Int -> Char) -> Int -> Char
forall a b. (a -> b) -> a -> b
$ Char -> Int
ord Char
low Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Char -> Int
ord Char
high Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
256)(Int, Char) -> [(Int, Char)] -> [(Int, Char)]
forall a. a -> [a] -> [a]
:(ByteString -> Int -> [(Int, Char)]
undoUTF16LE ByteString
rest (Int -> [(Int, Char)]) -> Int -> [(Int, Char)]
forall a b. (a -> b) -> a -> b
$ Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
2)

-- | @undoUTF18BE bytes offset@ decoded a UTF-16BE /bytes/ stream to Unicode
-- chars.
undoUTF16BE :: BLC.ByteString -> Int -> [(Int, Char)]
undoUTF16BE :: ByteString -> Int -> [(Int, Char)]
undoUTF16BE ByteString
bytes Int
offset
  | ByteString -> Bool
BLC.null ByteString
bytes = []
  | Int -> ByteString -> Bool
hasFewerThan Int
2 ByteString
bytes = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-16BE input contains odd number of bytes"
  | Bool
otherwise = let high :: Char
high   = ByteString -> Char
BLC.head ByteString
bytes
                    bytes' :: ByteString
bytes' = ByteString -> ByteString
BLC.tail ByteString
bytes
                    low :: Char
low    = ByteString -> Char
BLC.head ByteString
bytes'
                    rest :: ByteString
rest   = ByteString -> ByteString
BLC.tail ByteString
bytes'
                in (Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
2, Int -> Char
chr (Int -> Char) -> Int -> Char
forall a b. (a -> b) -> a -> b
$ Char -> Int
ord Char
low Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Char -> Int
ord Char
high Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
256)(Int, Char) -> [(Int, Char)] -> [(Int, Char)]
forall a. a -> [a] -> [a]
:(ByteString -> Int -> [(Int, Char)]
undoUTF16BE ByteString
rest (Int -> [(Int, Char)]) -> Int -> [(Int, Char)]
forall a b. (a -> b) -> a -> b
$ Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
2)

-- ** UTF-8 decoding

-- | @undoUTF8 bytes offset@ decoded a UTF-8 /bytes/ stream to Unicode chars.
undoUTF8 :: BLC.ByteString -> Int -> [(Int, Char)]
undoUTF8 :: ByteString -> Int -> [(Int, Char)]
undoUTF8 ByteString
bytes = [Word8] -> Int -> [(Int, Char)]
undoUTF8' (ByteString -> [Word8]
BL.unpack ByteString
bytes)

w2c :: Word8 -> Char
w2c :: Word8 -> Char
w2c = Int -> Char
chr (Int -> Char) -> (Word8 -> Int) -> Word8 -> Char
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral

w2i :: Word8 -> Int
w2i :: Word8 -> Int
w2i = Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral

undoUTF8' :: [Word8] -> Int -> [(Int, Char)]
undoUTF8' :: [Word8] -> Int -> [(Int, Char)]
undoUTF8' [] Int
_ = []
undoUTF8' (Word8
first:[Word8]
rest) !Int
offset
  | Word8
first Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0x80  = (Int
offset', Char
c) (Int, Char) -> [(Int, Char)] -> [(Int, Char)]
forall a. a -> [a] -> [a]
: [Word8] -> Int -> [(Int, Char)]
undoUTF8' [Word8]
rest Int
offset'
  where
    !offset' :: Int
offset' = Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1
    !c :: Char
c       = Word8 -> Char
w2c Word8
first
undoUTF8' (Word8
first:[Word8]
rest) !Int
offset
  | Word8
first Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0xC0  = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-8 input contains invalid first byte"
  | Word8
first Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0xE0  = Word8 -> Int -> [Word8] -> [(Int, Char)]
decodeTwoUTF8   Word8
first Int
offset [Word8]
rest
  | Word8
first Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0xF0  = Word8 -> Int -> [Word8] -> [(Int, Char)]
decodeThreeUTF8 Word8
first Int
offset [Word8]
rest
  | Word8
first Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0xF8  = Word8 -> Int -> [Word8] -> [(Int, Char)]
decodeFourUTF8  Word8
first Int
offset [Word8]
rest
  | Bool
otherwise     = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-8 input contains invalid first byte"

-- | @decodeTwoUTF8 first offset bytes@ decodes a two-byte UTF-8 character,
-- where the /first/ byte is already available and the second is the head of
-- the /bytes/, and then continues to undo the UTF-8 encoding.
decodeTwoUTF8 :: Word8 -> Int -> [Word8] -> [(Int, Char)]
decodeTwoUTF8 :: Word8 -> Int -> [Word8] -> [(Int, Char)]
decodeTwoUTF8 Word8
first Int
offset (Word8
second:[Word8]
rest)
  | Word8
second Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0x80 Bool -> Bool -> Bool
|| Word8
0xBF Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
second = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-8 double byte char has invalid second byte"
  | Bool
otherwise = (Int
offset', Char
c) (Int, Char) -> [(Int, Char)] -> [(Int, Char)]
forall a. a -> [a] -> [a]
: [Word8] -> Int -> [(Int, Char)]
undoUTF8' [Word8]
rest Int
offset'
  where
    !offset' :: Int
offset' = Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
2
    !c :: Char
c       = Int -> Char
chr ((Word8 -> Int
w2i Word8
first Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
0xc0) Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
0x40  Int -> Int -> Int
forall a. Num a => a -> a -> a
+ (Word8 -> Int
w2i Word8
second Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
0x80))
decodeTwoUTF8 Word8
_ Int
_ [] = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-8 double byte char is missing second byte at eof"

-- | @decodeThreeUTF8 first offset bytes@ decodes a three-byte UTF-8 character,
-- where the /first/ byte is already available and the second and third are the
-- head of the /bytes/, and then continues to undo the UTF-8 encoding.
decodeThreeUTF8 :: Word8 -> Int -> [Word8] -> [(Int, Char)]
decodeThreeUTF8 :: Word8 -> Int -> [Word8] -> [(Int, Char)]
decodeThreeUTF8 Word8
first Int
offset (Word8
second:Word8
third:[Word8]
rest)
  | Word8
second Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0x80 Bool -> Bool -> Bool
|| Word8
0xBF Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
second = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-8 triple byte char has invalid second byte"
  | Word8
third Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<  Word8
0x80 Bool -> Bool -> Bool
|| Word8
0xBF Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
third  = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-8 triple byte char has invalid third byte"
  | Bool
otherwise = (Int
offset', Char
c)(Int, Char) -> [(Int, Char)] -> [(Int, Char)]
forall a. a -> [a] -> [a]
: [Word8] -> Int -> [(Int, Char)]
undoUTF8' [Word8]
rest Int
offset'
  where
    !offset' :: Int
offset' = Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
3
    !c :: Char
c       = Int -> Char
chr((Word8 -> Int
w2i Word8
first  Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
0xE0) Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
0x1000 Int -> Int -> Int
forall a. Num a => a -> a -> a
+
                   (Word8 -> Int
w2i Word8
second Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
0x80) Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
0x40 Int -> Int -> Int
forall a. Num a => a -> a -> a
+
                   (Word8 -> Int
w2i Word8
third  Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
0x80))
decodeThreeUTF8 Word8
_ Int
_ [Word8]
_ =String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-8 triple byte char is missing bytes at eof"

-- | @decodeFourUTF8 first offset bytes@ decodes a four-byte UTF-8 character,
-- where the /first/ byte is already available and the second, third and fourth
-- are the head of the /bytes/, and then continues to undo the UTF-8 encoding.
decodeFourUTF8 :: Word8 -> Int -> [Word8] -> [(Int, Char)]
decodeFourUTF8 :: Word8 -> Int -> [Word8] -> [(Int, Char)]
decodeFourUTF8 Word8
first Int
offset (Word8
second:Word8
third:Word8
fourth:[Word8]
rest)
  | Word8
second Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0x80 Bool -> Bool -> Bool
|| Word8
0xBF Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
second = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-8 quad byte char has invalid second byte"
  | Word8
third  Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0x80 Bool -> Bool -> Bool
|| Word8
0xBF Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
third  = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-8 quad byte char has invalid third byte"
  | Word8
third  Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0x80 Bool -> Bool -> Bool
|| Word8
0xBF Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
third  = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-8 quad byte char has invalid fourth byte"
  | Bool
otherwise                      = (Int
offset', Char
c) (Int, Char) -> [(Int, Char)] -> [(Int, Char)]
forall a. a -> [a] -> [a]
: [Word8] -> Int -> [(Int, Char)]
undoUTF8' [Word8]
rest Int
offset'
  where
    !offset' :: Int
offset' = Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
4
    !c :: Char
c       = Int -> Char
chr((Word8 -> Int
w2i Word8
first  Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
0xF0) Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
0x40000 Int -> Int -> Int
forall a. Num a => a -> a -> a
+
                   (Word8 -> Int
w2i Word8
second Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
0x80) Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
0x1000 Int -> Int -> Int
forall a. Num a => a -> a -> a
+
                   (Word8 -> Int
w2i Word8
third  Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
0x80) Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
0x40 Int -> Int -> Int
forall a. Num a => a -> a -> a
+
                   (Word8 -> Int
w2i Word8
fourth Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
0x80))

decodeFourUTF8 Word8
_ Int
_ [Word8]
_ = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-8 quad byte char is missing bytes at eof"