{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE Safe #-}
module Data.YAML.Token.Encoding
( decode
, Encoding(..)
) where
import qualified Data.ByteString.Lazy as BL
import qualified Data.ByteString.Lazy.Char8 as BLC
import Util
data Encoding = UTF8
| UTF16LE
| UTF16BE
| UTF32LE
| UTF32BE
deriving (Encoding -> Encoding -> Bool
(Encoding -> Encoding -> Bool)
-> (Encoding -> Encoding -> Bool) -> Eq Encoding
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Encoding -> Encoding -> Bool
$c/= :: Encoding -> Encoding -> Bool
== :: Encoding -> Encoding -> Bool
$c== :: Encoding -> Encoding -> Bool
Eq,(forall x. Encoding -> Rep Encoding x)
-> (forall x. Rep Encoding x -> Encoding) -> Generic Encoding
forall x. Rep Encoding x -> Encoding
forall x. Encoding -> Rep Encoding x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cto :: forall x. Rep Encoding x -> Encoding
$cfrom :: forall x. Encoding -> Rep Encoding x
Generic)
instance Show Encoding where
show :: Encoding -> String
show Encoding
UTF8 = String
"UTF-8"
show Encoding
UTF16LE = String
"UTF-16LE"
show Encoding
UTF16BE = String
"UTF-16BE"
show Encoding
UTF32LE = String
"UTF-32LE"
show Encoding
UTF32BE = String
"UTF-32BE"
instance NFData Encoding where rnf :: Encoding -> ()
rnf !Encoding
_ = ()
decode :: BLC.ByteString -> (Encoding, [(Int, Char)])
decode :: ByteString -> (Encoding, [(Int, Char)])
decode ByteString
text = (Encoding
encoding, Encoding -> ByteString -> [(Int, Char)]
undoEncoding Encoding
encoding ByteString
text)
where
encoding :: Encoding
encoding = [Word8] -> Encoding
detectEncoding ([Word8] -> Encoding) -> [Word8] -> Encoding
forall a b. (a -> b) -> a -> b
$ ByteString -> [Word8]
BL.unpack (ByteString -> [Word8]) -> ByteString -> [Word8]
forall a b. (a -> b) -> a -> b
$ Int64 -> ByteString -> ByteString
BL.take Int64
4 ByteString
text
detectEncoding :: [Word8] -> Encoding
detectEncoding :: [Word8] -> Encoding
detectEncoding [Word8]
text = case [Word8]
text of
Word8
0x00 : Word8
0x00 : Word8
0xFE : Word8
0xFF : [Word8]
_ -> Encoding
UTF32BE
Word8
0x00 : Word8
0x00 : Word8
0x00 : Word8
_ : [Word8]
_ -> Encoding
UTF32BE
Word8
0xFF : Word8
0xFE : Word8
0x00 : Word8
0x00 : [Word8]
_ -> Encoding
UTF32LE
Word8
_ : Word8
0x00 : Word8
0x00 : Word8
0x00 : [Word8]
_ -> Encoding
UTF32LE
Word8
0xFE : Word8
0xFF : [Word8]
_ -> Encoding
UTF16BE
Word8
0x00 : Word8
_ : [Word8]
_ -> Encoding
UTF16BE
Word8
0xFF : Word8
0xFE : [Word8]
_ -> Encoding
UTF16LE
Word8
_ : Word8
0x00 : [Word8]
_ -> Encoding
UTF16LE
Word8
0xEF : Word8
0xBB : Word8
0xBF : [Word8]
_ -> Encoding
UTF8
[Word8]
_ -> Encoding
UTF8
undoEncoding :: Encoding -> BLC.ByteString -> [(Int, Char)]
undoEncoding :: Encoding -> ByteString -> [(Int, Char)]
undoEncoding Encoding
encoding ByteString
bytes =
case Encoding
encoding of
Encoding
UTF8 -> ByteString -> Int -> [(Int, Char)]
undoUTF8 ByteString
bytes Int
0
Encoding
UTF16LE -> [(Int, Char)] -> [(Int, Char)]
combinePairs ([(Int, Char)] -> [(Int, Char)]) -> [(Int, Char)] -> [(Int, Char)]
forall a b. (a -> b) -> a -> b
$ ByteString -> Int -> [(Int, Char)]
undoUTF16LE ByteString
bytes Int
0
Encoding
UTF16BE -> [(Int, Char)] -> [(Int, Char)]
combinePairs ([(Int, Char)] -> [(Int, Char)]) -> [(Int, Char)] -> [(Int, Char)]
forall a b. (a -> b) -> a -> b
$ ByteString -> Int -> [(Int, Char)]
undoUTF16BE ByteString
bytes Int
0
Encoding
UTF32LE -> [(Int, Char)] -> [(Int, Char)]
forall a. [(a, Char)] -> [(a, Char)]
validateScalars ([(Int, Char)] -> [(Int, Char)]) -> [(Int, Char)] -> [(Int, Char)]
forall a b. (a -> b) -> a -> b
$ ByteString -> Int -> [(Int, Char)]
undoUTF32LE ByteString
bytes Int
0
Encoding
UTF32BE -> [(Int, Char)] -> [(Int, Char)]
forall a. [(a, Char)] -> [(a, Char)]
validateScalars ([(Int, Char)] -> [(Int, Char)]) -> [(Int, Char)] -> [(Int, Char)]
forall a b. (a -> b) -> a -> b
$ ByteString -> Int -> [(Int, Char)]
undoUTF32BE ByteString
bytes Int
0
where
validateScalars :: [(a, Char)] -> [(a, Char)]
validateScalars [] = []
validateScalars (x :: (a, Char)
x@(a
_,Char
c):[(a, Char)]
rest)
| Char
'\xD800' Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
c, Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\xDFFF' = String -> [(a, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-32 stream contains invalid surrogate code-point"
| Bool
otherwise = (a, Char)
x (a, Char) -> [(a, Char)] -> [(a, Char)]
forall a. a -> [a] -> [a]
: [(a, Char)] -> [(a, Char)]
validateScalars [(a, Char)]
rest
hasFewerThan :: Int -> BLC.ByteString -> Bool
hasFewerThan :: Int -> ByteString -> Bool
hasFewerThan Int
n ByteString
bytes
| Int
n Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
1 = ByteString -> Bool
BLC.null ByteString
bytes
| Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
1 = ByteString -> Bool
BLC.null ByteString
bytes Bool -> Bool -> Bool
|| Int -> ByteString -> Bool
hasFewerThan (Int
n Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1) (ByteString -> ByteString
BLC.tail ByteString
bytes)
| Bool
otherwise = Bool
False
undoUTF32LE :: BLC.ByteString -> Int -> [(Int, Char)]
undoUTF32LE :: ByteString -> Int -> [(Int, Char)]
undoUTF32LE ByteString
bytes Int
offset
| ByteString -> Bool
BLC.null ByteString
bytes = []
| Int -> ByteString -> Bool
hasFewerThan Int
4 ByteString
bytes = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-32LE input contains invalid number of bytes"
| Bool
otherwise = let first :: Char
first = ByteString -> Char
BLC.head ByteString
bytes
bytes' :: ByteString
bytes' = ByteString -> ByteString
BLC.tail ByteString
bytes
second :: Char
second = ByteString -> Char
BLC.head ByteString
bytes'
bytes'' :: ByteString
bytes'' = ByteString -> ByteString
BLC.tail ByteString
bytes'
third :: Char
third = ByteString -> Char
BLC.head ByteString
bytes''
bytes''' :: ByteString
bytes''' = ByteString -> ByteString
BLC.tail ByteString
bytes''
fourth :: Char
fourth = ByteString -> Char
BLC.head ByteString
bytes'''
rest :: ByteString
rest = ByteString -> ByteString
BLC.tail ByteString
bytes'''
in (Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
4,
Int -> Char
chr (Int -> Char) -> Int -> Char
forall a b. (a -> b) -> a -> b
$ Char -> Int
ord Char
first
Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
256 Int -> Int -> Int
forall a. Num a => a -> a -> a
* (Char -> Int
ord Char
second
Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
256 Int -> Int -> Int
forall a. Num a => a -> a -> a
* (Char -> Int
ord Char
third
Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
256 Int -> Int -> Int
forall a. Num a => a -> a -> a
* Char -> Int
ord Char
fourth)))(Int, Char) -> [(Int, Char)] -> [(Int, Char)]
forall a. a -> [a] -> [a]
:(ByteString -> Int -> [(Int, Char)]
undoUTF32LE ByteString
rest (Int -> [(Int, Char)]) -> Int -> [(Int, Char)]
forall a b. (a -> b) -> a -> b
$ Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
4)
undoUTF32BE :: BLC.ByteString -> Int -> [(Int, Char)]
undoUTF32BE :: ByteString -> Int -> [(Int, Char)]
undoUTF32BE ByteString
bytes Int
offset
| ByteString -> Bool
BLC.null ByteString
bytes = []
| Int -> ByteString -> Bool
hasFewerThan Int
4 ByteString
bytes = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-32BE input contains invalid number of bytes"
| Bool
otherwise = let first :: Char
first = ByteString -> Char
BLC.head ByteString
bytes
bytes' :: ByteString
bytes' = ByteString -> ByteString
BLC.tail ByteString
bytes
second :: Char
second = ByteString -> Char
BLC.head ByteString
bytes'
bytes'' :: ByteString
bytes'' = ByteString -> ByteString
BLC.tail ByteString
bytes'
third :: Char
third = ByteString -> Char
BLC.head ByteString
bytes''
bytes''' :: ByteString
bytes''' = ByteString -> ByteString
BLC.tail ByteString
bytes''
fourth :: Char
fourth = ByteString -> Char
BLC.head ByteString
bytes'''
rest :: ByteString
rest = ByteString -> ByteString
BLC.tail ByteString
bytes'''
in (Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
4,
Int -> Char
chr (Int -> Char) -> Int -> Char
forall a b. (a -> b) -> a -> b
$ Char -> Int
ord Char
fourth
Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
256 Int -> Int -> Int
forall a. Num a => a -> a -> a
* (Char -> Int
ord Char
third
Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
256 Int -> Int -> Int
forall a. Num a => a -> a -> a
* (Char -> Int
ord Char
second
Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
256 Int -> Int -> Int
forall a. Num a => a -> a -> a
* Char -> Int
ord Char
first)))(Int, Char) -> [(Int, Char)] -> [(Int, Char)]
forall a. a -> [a] -> [a]
:(ByteString -> Int -> [(Int, Char)]
undoUTF32BE ByteString
rest (Int -> [(Int, Char)]) -> Int -> [(Int, Char)]
forall a b. (a -> b) -> a -> b
$ Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
4)
combinePairs :: [(Int, Char)] -> [(Int, Char)]
combinePairs :: [(Int, Char)] -> [(Int, Char)]
combinePairs [] = []
combinePairs (head' :: (Int, Char)
head'@(Int
_, Char
head_char):[(Int, Char)]
tail')
| Char
'\xD800' Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
head_char Bool -> Bool -> Bool
&& Char
head_char Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\xDBFF' = (Int, Char) -> [(Int, Char)] -> [(Int, Char)]
combineLead (Int, Char)
head' [(Int, Char)]
tail'
| Char
'\xDC00' Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
head_char Bool -> Bool -> Bool
&& Char
head_char Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\xDFFF' = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-16 contains trail surrogate without lead surrogate"
| Bool
otherwise = (Int, Char)
head'(Int, Char) -> [(Int, Char)] -> [(Int, Char)]
forall a. a -> [a] -> [a]
:[(Int, Char)] -> [(Int, Char)]
combinePairs [(Int, Char)]
tail'
combineLead :: (Int, Char) -> [(Int, Char)] -> [(Int, Char)]
combineLead :: (Int, Char) -> [(Int, Char)] -> [(Int, Char)]
combineLead (Int, Char)
_lead [] = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-16 contains lead surrogate as final character"
combineLead (Int
_, Char
lead_char) ((Int
trail_offset, Char
trail_char):[(Int, Char)]
rest)
| Char
'\xDC00' Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
trail_char Bool -> Bool -> Bool
&& Char
trail_char Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\xDFFF' = (Int
trail_offset, Char -> Char -> Char
combineSurrogates Char
lead_char Char
trail_char)(Int, Char) -> [(Int, Char)] -> [(Int, Char)]
forall a. a -> [a] -> [a]
:[(Int, Char)] -> [(Int, Char)]
combinePairs [(Int, Char)]
rest
| Bool
otherwise = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-16 contains lead surrogate without trail surrogate"
surrogateOffset :: Int
surrogateOffset :: Int
surrogateOffset = Int
0x10000 Int -> Int -> Int
forall a. Num a => a -> a -> a
- (Int
0xD800 Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
1024) Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
0xDC00
combineSurrogates :: Char -> Char -> Char
combineSurrogates :: Char -> Char -> Char
combineSurrogates Char
lead Char
trail = Int -> Char
chr (Int -> Char) -> Int -> Char
forall a b. (a -> b) -> a -> b
$ Char -> Int
ord Char
lead Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
1024 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Char -> Int
ord Char
trail Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
surrogateOffset
undoUTF16LE :: BLC.ByteString -> Int -> [(Int, Char)]
undoUTF16LE :: ByteString -> Int -> [(Int, Char)]
undoUTF16LE ByteString
bytes Int
offset
| ByteString -> Bool
BLC.null ByteString
bytes = []
| Int -> ByteString -> Bool
hasFewerThan Int
2 ByteString
bytes = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-16LE input contains odd number of bytes"
| Bool
otherwise = let low :: Char
low = ByteString -> Char
BLC.head ByteString
bytes
bytes' :: ByteString
bytes' = ByteString -> ByteString
BLC.tail ByteString
bytes
high :: Char
high = ByteString -> Char
BLC.head ByteString
bytes'
rest :: ByteString
rest = ByteString -> ByteString
BLC.tail ByteString
bytes'
in (Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
2, Int -> Char
chr (Int -> Char) -> Int -> Char
forall a b. (a -> b) -> a -> b
$ Char -> Int
ord Char
low Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Char -> Int
ord Char
high Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
256)(Int, Char) -> [(Int, Char)] -> [(Int, Char)]
forall a. a -> [a] -> [a]
:(ByteString -> Int -> [(Int, Char)]
undoUTF16LE ByteString
rest (Int -> [(Int, Char)]) -> Int -> [(Int, Char)]
forall a b. (a -> b) -> a -> b
$ Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
2)
undoUTF16BE :: BLC.ByteString -> Int -> [(Int, Char)]
undoUTF16BE :: ByteString -> Int -> [(Int, Char)]
undoUTF16BE ByteString
bytes Int
offset
| ByteString -> Bool
BLC.null ByteString
bytes = []
| Int -> ByteString -> Bool
hasFewerThan Int
2 ByteString
bytes = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-16BE input contains odd number of bytes"
| Bool
otherwise = let high :: Char
high = ByteString -> Char
BLC.head ByteString
bytes
bytes' :: ByteString
bytes' = ByteString -> ByteString
BLC.tail ByteString
bytes
low :: Char
low = ByteString -> Char
BLC.head ByteString
bytes'
rest :: ByteString
rest = ByteString -> ByteString
BLC.tail ByteString
bytes'
in (Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
2, Int -> Char
chr (Int -> Char) -> Int -> Char
forall a b. (a -> b) -> a -> b
$ Char -> Int
ord Char
low Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Char -> Int
ord Char
high Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
256)(Int, Char) -> [(Int, Char)] -> [(Int, Char)]
forall a. a -> [a] -> [a]
:(ByteString -> Int -> [(Int, Char)]
undoUTF16BE ByteString
rest (Int -> [(Int, Char)]) -> Int -> [(Int, Char)]
forall a b. (a -> b) -> a -> b
$ Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
2)
undoUTF8 :: BLC.ByteString -> Int -> [(Int, Char)]
undoUTF8 :: ByteString -> Int -> [(Int, Char)]
undoUTF8 ByteString
bytes = [Word8] -> Int -> [(Int, Char)]
undoUTF8' (ByteString -> [Word8]
BL.unpack ByteString
bytes)
w2c :: Word8 -> Char
w2c :: Word8 -> Char
w2c = Int -> Char
chr (Int -> Char) -> (Word8 -> Int) -> Word8 -> Char
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral
w2i :: Word8 -> Int
w2i :: Word8 -> Int
w2i = Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral
undoUTF8' :: [Word8] -> Int -> [(Int, Char)]
undoUTF8' :: [Word8] -> Int -> [(Int, Char)]
undoUTF8' [] Int
_ = []
undoUTF8' (Word8
first:[Word8]
rest) !Int
offset
| Word8
first Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0x80 = (Int
offset', Char
c) (Int, Char) -> [(Int, Char)] -> [(Int, Char)]
forall a. a -> [a] -> [a]
: [Word8] -> Int -> [(Int, Char)]
undoUTF8' [Word8]
rest Int
offset'
where
!offset' :: Int
offset' = Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1
!c :: Char
c = Word8 -> Char
w2c Word8
first
undoUTF8' (Word8
first:[Word8]
rest) !Int
offset
| Word8
first Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0xC0 = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-8 input contains invalid first byte"
| Word8
first Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0xE0 = Word8 -> Int -> [Word8] -> [(Int, Char)]
decodeTwoUTF8 Word8
first Int
offset [Word8]
rest
| Word8
first Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0xF0 = Word8 -> Int -> [Word8] -> [(Int, Char)]
decodeThreeUTF8 Word8
first Int
offset [Word8]
rest
| Word8
first Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0xF8 = Word8 -> Int -> [Word8] -> [(Int, Char)]
decodeFourUTF8 Word8
first Int
offset [Word8]
rest
| Bool
otherwise = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-8 input contains invalid first byte"
decodeTwoUTF8 :: Word8 -> Int -> [Word8] -> [(Int, Char)]
decodeTwoUTF8 :: Word8 -> Int -> [Word8] -> [(Int, Char)]
decodeTwoUTF8 Word8
first Int
offset (Word8
second:[Word8]
rest)
| Word8
second Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0x80 Bool -> Bool -> Bool
|| Word8
0xBF Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
second = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-8 double byte char has invalid second byte"
| Bool
otherwise = (Int
offset', Char
c) (Int, Char) -> [(Int, Char)] -> [(Int, Char)]
forall a. a -> [a] -> [a]
: [Word8] -> Int -> [(Int, Char)]
undoUTF8' [Word8]
rest Int
offset'
where
!offset' :: Int
offset' = Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
2
!c :: Char
c = Int -> Char
chr ((Word8 -> Int
w2i Word8
first Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
0xc0) Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
0x40 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ (Word8 -> Int
w2i Word8
second Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
0x80))
decodeTwoUTF8 Word8
_ Int
_ [] = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-8 double byte char is missing second byte at eof"
decodeThreeUTF8 :: Word8 -> Int -> [Word8] -> [(Int, Char)]
decodeThreeUTF8 :: Word8 -> Int -> [Word8] -> [(Int, Char)]
decodeThreeUTF8 Word8
first Int
offset (Word8
second:Word8
third:[Word8]
rest)
| Word8
second Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0x80 Bool -> Bool -> Bool
|| Word8
0xBF Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
second = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-8 triple byte char has invalid second byte"
| Word8
third Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0x80 Bool -> Bool -> Bool
|| Word8
0xBF Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
third = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-8 triple byte char has invalid third byte"
| Bool
otherwise = (Int
offset', Char
c)(Int, Char) -> [(Int, Char)] -> [(Int, Char)]
forall a. a -> [a] -> [a]
: [Word8] -> Int -> [(Int, Char)]
undoUTF8' [Word8]
rest Int
offset'
where
!offset' :: Int
offset' = Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
3
!c :: Char
c = Int -> Char
chr((Word8 -> Int
w2i Word8
first Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
0xE0) Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
0x1000 Int -> Int -> Int
forall a. Num a => a -> a -> a
+
(Word8 -> Int
w2i Word8
second Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
0x80) Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
0x40 Int -> Int -> Int
forall a. Num a => a -> a -> a
+
(Word8 -> Int
w2i Word8
third Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
0x80))
decodeThreeUTF8 Word8
_ Int
_ [Word8]
_ =String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-8 triple byte char is missing bytes at eof"
decodeFourUTF8 :: Word8 -> Int -> [Word8] -> [(Int, Char)]
decodeFourUTF8 :: Word8 -> Int -> [Word8] -> [(Int, Char)]
decodeFourUTF8 Word8
first Int
offset (Word8
second:Word8
third:Word8
fourth:[Word8]
rest)
| Word8
second Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0x80 Bool -> Bool -> Bool
|| Word8
0xBF Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
second = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-8 quad byte char has invalid second byte"
| Word8
third Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0x80 Bool -> Bool -> Bool
|| Word8
0xBF Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
third = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-8 quad byte char has invalid third byte"
| Word8
third Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
0x80 Bool -> Bool -> Bool
|| Word8
0xBF Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
< Word8
third = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-8 quad byte char has invalid fourth byte"
| Bool
otherwise = (Int
offset', Char
c) (Int, Char) -> [(Int, Char)] -> [(Int, Char)]
forall a. a -> [a] -> [a]
: [Word8] -> Int -> [(Int, Char)]
undoUTF8' [Word8]
rest Int
offset'
where
!offset' :: Int
offset' = Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
4
!c :: Char
c = Int -> Char
chr((Word8 -> Int
w2i Word8
first Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
0xF0) Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
0x40000 Int -> Int -> Int
forall a. Num a => a -> a -> a
+
(Word8 -> Int
w2i Word8
second Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
0x80) Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
0x1000 Int -> Int -> Int
forall a. Num a => a -> a -> a
+
(Word8 -> Int
w2i Word8
third Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
0x80) Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
0x40 Int -> Int -> Int
forall a. Num a => a -> a -> a
+
(Word8 -> Int
w2i Word8
fourth Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
0x80))
decodeFourUTF8 Word8
_ Int
_ [Word8]
_ = String -> [(Int, Char)]
forall a. HasCallStack => String -> a
error String
"UTF-8 quad byte char is missing bytes at eof"