{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE CPP #-}
module Basement.Alg.UTF8
( nextAscii
, nextAsciiDigit
, expectAscii
, next
, nextSkip
, nextWith
, prev
, prevSkip
, writeASCII
, writeUTF8
, toList
, all
, any
, foldr
, length
, reverse
) where
import GHC.Types
import GHC.Word
import GHC.Prim
import Data.Bits
import Data.Proxy
import Basement.Alg.Class
import Basement.Compat.Base hiding (toList)
import Basement.Compat.Primitive
import Basement.Monad
import Basement.Numerical.Additive
import Basement.Numerical.Subtractive
import Basement.Types.OffsetSize
import Basement.Types.Char7 (Char7(..))
import Basement.PrimType
import Basement.UTF8.Helper
import Basement.UTF8.Table
import Basement.UTF8.Types
nextAscii :: Indexable container Word8 => container -> Offset Word8 -> StepASCII
nextAscii ba n = StepASCII w
where
!w = index ba n
{-# INLINE nextAscii #-}
nextAsciiDigit :: Indexable container Word8 => container -> Offset Word8 -> StepDigit
nextAsciiDigit ba n = StepDigit (index ba n - 0x30)
{-# INLINE nextAsciiDigit #-}
expectAscii :: Indexable container Word8 => container -> Offset Word8 -> Word8 -> Bool
expectAscii ba n v = index ba n == v
{-# INLINE expectAscii #-}
next :: Indexable container Word8 => container -> Offset8 -> Step
next ba n =
case getNbBytes h of
0 -> Step (toChar1 h) (n + Offset 1)
1 -> Step (toChar2 h (index ba (n + Offset 1))) (n + Offset 2)
2 -> Step (toChar3 h (index ba (n + Offset 1))
(index ba (n + Offset 2))) (n + Offset 3)
3 -> Step (toChar4 h (index ba (n + Offset 1))
(index ba (n + Offset 2))
(index ba (n + Offset 3))) (n + Offset 4)
r -> error ("next: internal error: invalid input: offset=" <> show n <> " table=" <> show r <> " h=" <> show (stepAsciiRawValue h))
where
!h = nextAscii ba n
{-# INLINE next #-}
nextSkip :: Indexable container Word8 => container -> Offset Word8 -> Offset Word8
nextSkip ba n = n + 1 + Offset (getNbBytes (nextAscii ba n))
{-# INLINE nextSkip #-}
nextWith :: Indexable container Word8
=> StepASCII
-> container
-> Offset8
-> Step
nextWith h ba n =
case getNbBytes h of
1 -> Step (toChar2 h (index ba n)) (n + Offset 1)
2 -> Step (toChar3 h (index ba n) (index ba (n + Offset 1))) (n + Offset 2)
3 -> Step (toChar4 h (index ba n)
(index ba (n + Offset 1))
(index ba (n + Offset 2))) (n + Offset 3)
r -> error ("nextWith: internal error: invalid input: offset=" <> show n <> " table=" <> show r <> " h=" <> show (stepAsciiRawValue h))
{-# INLINE nextWith #-}
prev :: Indexable container Word8 => container -> Offset Word8 -> StepBack
prev ba offset =
case index ba prevOfs1 of
(W8# v1) | isContinuation# v1 -> atLeast2 (maskContinuation# v1)
| otherwise -> StepBack (toChar# v1) prevOfs1
where
sz1 = CountOf 1
!prevOfs1 = offset `offsetMinusE` sz1
prevOfs2 = prevOfs1 `offsetMinusE` sz1
prevOfs3 = prevOfs2 `offsetMinusE` sz1
prevOfs4 = prevOfs3 `offsetMinusE` sz1
atLeast2 !v =
case index ba prevOfs2 of
(W8# v2) | isContinuation# v2 -> atLeast3 (or# (uncheckedShiftL# (maskContinuation# v2) 6#) v)
| otherwise -> StepBack (toChar# (or# (uncheckedShiftL# (maskHeader2# v2) 6#) v)) prevOfs2
atLeast3 !v =
case index ba prevOfs3 of
(W8# v3) | isContinuation# v3 -> atLeast4 (or# (uncheckedShiftL# (maskContinuation# v3) 12#) v)
| otherwise -> StepBack (toChar# (or# (uncheckedShiftL# (maskHeader3# v3) 12#) v)) prevOfs3
atLeast4 !v =
case index ba prevOfs4 of
(W8# v4) -> StepBack (toChar# (or# (uncheckedShiftL# (maskHeader4# v4) 18#) v)) prevOfs4
prevSkip :: Indexable container Word8 => container -> Offset Word8 -> Offset Word8
prevSkip ba offset = loop (offset `offsetMinusE` sz1)
where
sz1 = CountOf 1
loop o
| isContinuation (index ba o) = loop (o `offsetMinusE` sz1)
| otherwise = o
writeASCII :: (PrimMonad prim, RandomAccess container prim Word8)
=> container -> Offset8 -> Char7 -> prim ()
writeASCII mba !i (Char7 c) = write mba i c
{-# INLINE writeASCII #-}
writeUTF8 :: (PrimMonad prim, RandomAccess container prim Word8)
=> container -> Offset8 -> Char -> prim Offset8
writeUTF8 mba !i !c
| bool# (ltWord# x 0x80## ) = encode1
| bool# (ltWord# x 0x800## ) = encode2
| bool# (ltWord# x 0x10000##) = encode3
| otherwise = encode4
where
!(I# xi) = fromEnum c
!x = int2Word# xi
encode1 = write mba i (W8# x) >> pure (i + Offset 1)
encode2 = do
let x1 = or# (uncheckedShiftRL# x 6#) 0xc0##
x2 = toContinuation x
write mba i (W8# x1)
write mba (i+1) (W8# x2)
pure (i + Offset 2)
encode3 = do
let x1 = or# (uncheckedShiftRL# x 12#) 0xe0##
x2 = toContinuation (uncheckedShiftRL# x 6#)
x3 = toContinuation x
write mba i (W8# x1)
write mba (i+Offset 1) (W8# x2)
write mba (i+Offset 2) (W8# x3)
pure (i + Offset 3)
encode4 = do
let x1 = or# (uncheckedShiftRL# x 18#) 0xf0##
x2 = toContinuation (uncheckedShiftRL# x 12#)
x3 = toContinuation (uncheckedShiftRL# x 6#)
x4 = toContinuation x
write mba i (W8# x1)
write mba (i+Offset 1) (W8# x2)
write mba (i+Offset 2) (W8# x3)
write mba (i+Offset 3) (W8# x4)
pure (i + Offset 4)
toContinuation :: Word# -> Word#
toContinuation w = or# (and# w 0x3f##) 0x80##
{-# INLINE writeUTF8 #-}
toList :: Indexable container Word8 => container -> Offset Word8 -> Offset Word8 -> [Char]
toList ba !start !end = loop start
where
loop !idx
| idx == end = []
| otherwise = c : loop idx'
where (Step c idx') = next ba idx
all :: Indexable container Word8
=> (Char -> Bool) -> container -> Offset Word8 -> Offset Word8 -> Bool
all predicate ba start end = loop start
where
loop !idx
| idx == end = True
| predicate c = loop idx'
| otherwise = False
where (Step c idx') = next ba idx
{-# INLINE all #-}
any :: Indexable container Word8
=> (Char -> Bool) -> container -> Offset Word8 -> Offset Word8 -> Bool
any predicate ba start end = loop start
where
loop !idx
| idx == end = False
| predicate c = True
| otherwise = loop idx'
where (Step c idx') = next ba idx
{-# INLINE any #-}
foldr :: Indexable container Word8
=> container -> Offset Word8 -> Offset Word8 -> (Char -> a -> a) -> a -> a
foldr dat start end f acc = loop start
where
loop !i
| i == end = acc
| otherwise =
let (Step c i') = next dat i
in c `f` loop i'
{-# INLINE foldr #-}
length :: (Indexable container Word8, Indexable container Word64)
=> container -> Offset Word8 -> Offset Word8 -> CountOf Char
length dat start end
| start == end = 0
| otherwise = processStart 0 start
where
end64 :: Offset Word64
end64 = offsetInElements end
prx64 :: Proxy Word64
prx64 = Proxy
mask64_80 :: Word64
mask64_80 = 0x8080808080808080
processStart :: CountOf Char -> Offset Word8 -> CountOf Char
processStart !c !i
| i == end = c
| offsetIsAligned prx64 i = processAligned c (offsetInElements i)
| otherwise =
let h = index dat i
cont = (h .&. 0xc0) == 0x80
c' = if cont then c else c+1
in processStart c' (i+1)
processAligned :: CountOf Char -> Offset Word64 -> CountOf Char
processAligned !c !i
| i >= end64 = processEnd c (offsetInBytes i)
| otherwise =
let !h = index dat i
!h80 = h .&. mask64_80
in if h80 == 0
then processAligned (c+8) (i+1)
else let !nbAscii = if h80 == mask64_80 then 0 else CountOf (8 - popCount h80)
!nbHigh = CountOf $ popCount (h .&. (h80 `unsafeShiftR` 1))
in processAligned (c + nbAscii + nbHigh) (i+1)
processEnd !c !i
| i == end = c
| otherwise =
let h = index dat i
cont = (h .&. 0xc0) == 0x80
c' = if cont then c else c+1
in processStart c' (i+1)
{-# INLINE length #-}
reverse :: (PrimMonad prim, Indexable container Word8)
=> MutableByteArray# (PrimState prim)
-> Offset Word8
-> container
-> Offset Word8
-> Offset Word8
-> prim ()
reverse dst dstOfs src start end
| start == end = pure ()
| otherwise = loop (dstOfs `offsetPlusE` (offsetAsSize (end `offsetSub` start)) `offsetSub` 1) start
where
loop !d !s
| s == end = pure ()
| headerIsAscii h = primMbaWrite dst d (stepAsciiRawValue h) >> loop (d `offsetSub` 1) (s + 1)
| otherwise = do
case getNbBytes h of
1 -> do
primMbaWrite dst (d `offsetSub` 1) (stepAsciiRawValue h)
primMbaWrite dst d (index src (s + 1))
loop (d `offsetSub` 2) (s + 2)
2 -> do
primMbaWrite dst (d `offsetSub` 2) (stepAsciiRawValue h)
primMbaWrite dst (d `offsetSub` 1) (index src (s + 1))
primMbaWrite dst d (index src (s + 2))
loop (d `offsetSub` 3) (s + 3)
3 -> do
primMbaWrite dst (d `offsetSub` 3) (stepAsciiRawValue h)
primMbaWrite dst (d `offsetSub` 2) (index src (s + 1))
primMbaWrite dst (d `offsetSub` 1) (index src (s + 2))
primMbaWrite dst d (index src (s + 3))
loop (d `offsetSub` 4) (s + 4)
_ -> error "impossible"
where h = nextAscii src s
{-# INLINE reverse #-}