{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE OverloadedStrings #-}

-- | We define our own normalization function instead of depending on
-- unicode-transforms, because we need a lazy (streaming) normalization
-- function for maximum efficiency.  No point normalizing two whole 'Text's
-- if we can see from the first few characters how they should be ordered.
-- See <https://unicode.org/reports/tr15/> for a description of the algorithm
-- implemented here.
module Text.Collate.Normalize
  ( toNFD
  )
where
import qualified Data.IntMap as M
import Text.Collate.UnicodeData (genCanonicalDecompositionMap)
import Text.Collate.CanonicalCombiningClass (canonicalCombiningClass)
import Data.List (sortOn)

canonicalDecompositionMap :: M.IntMap [Int]
canonicalDecompositionMap :: IntMap [Int]
canonicalDecompositionMap = $(genCanonicalDecompositionMap)

-- | Lazily normalize a list of code points to its canonical decomposition (NFD).
toNFD :: [Int] -> [Int]
toNFD :: [Int] -> [Int]
toNFD = [Int] -> [Int]
rearrangeCombiningMarks forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Int] -> [Int]
recursivelyDecompose

rearrangeCombiningMarks :: [Int] -> [Int]
rearrangeCombiningMarks :: [Int] -> [Int]
rearrangeCombiningMarks = [Int] -> [Int]
go
 where
  go :: [Int] -> [Int]
go [] = []
  go (Int
c:[Int]
cs) =
    if Int -> Int
canonicalCombiningClass Int
c forall a. Eq a => a -> a -> Bool
== Int
0
       then Int
c forall a. a -> [a] -> [a]
: [Int] -> [Int]
reorderMarks [Int]
cs
       else [Int] -> [Int]
reorderMarks (Int
cforall a. a -> [a] -> [a]
:[Int]
cs)
  reorderMarks :: [Int] -> [Int]
reorderMarks [Int]
zs =
    case forall a. (a -> Bool) -> [a] -> ([a], [a])
break (\Int
z -> Int -> Int
canonicalCombiningClass Int
z forall a. Eq a => a -> a -> Bool
== Int
0) [Int]
zs of
      ([], [Int]
ys)  -> [Int] -> [Int]
go [Int]
ys
      ([Int
x], [Int]
ys) -> Int
x forall a. a -> [a] -> [a]
: [Int] -> [Int]
go [Int]
ys
      ([Int
x1,Int
x2], [Int]
ys)
        | Int -> Int
canonicalCombiningClass Int
x1 forall a. Ord a => a -> a -> Bool
<= Int -> Int
canonicalCombiningClass Int
x2
                    -> Int
x1 forall a. a -> [a] -> [a]
: Int
x2 forall a. a -> [a] -> [a]
: [Int] -> [Int]
go [Int]
ys
        | Bool
otherwise -> Int
x2 forall a. a -> [a] -> [a]
: Int
x1 forall a. a -> [a] -> [a]
: [Int] -> [Int]
go [Int]
ys
      ([Int]
xs, [Int]
ys)  -> forall b a. Ord b => (a -> b) -> [a] -> [a]
sortOn Int -> Int
canonicalCombiningClass [Int]
xs forall a. [a] -> [a] -> [a]
++ [Int] -> [Int]
go [Int]
ys

recursivelyDecompose :: [Int] -> [Int]
recursivelyDecompose :: [Int] -> [Int]
recursivelyDecompose = forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr Int -> [Int] -> [Int]
go forall a. Monoid a => a
mempty
  where go :: Int -> [Int] -> [Int]
go Int
c
          | Int
c forall a. Ord a => a -> a -> Bool
< Int
0xc0 = (Int
c forall a. a -> [a] -> [a]
:)
          | Int -> Bool
isHangulSyllable Int
c = Int -> [Int] -> [Int]
decomposeHangulSyllable Int
c
          | Bool
otherwise =
              case forall a. Int -> IntMap a -> Maybe a
M.lookup Int
c IntMap [Int]
canonicalDecompositionMap of
                Maybe [Int]
Nothing -> (Int
c forall a. a -> [a] -> [a]
:)
                Just [Int]
ds -> (\[Int]
xs -> forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr Int -> [Int] -> [Int]
go [Int]
xs [Int]
ds)

-- | Hangul syllable range is AC00 - D7A3.
isHangulSyllable :: Int -> Bool
isHangulSyllable :: Int -> Bool
isHangulSyllable Int
cp = Int
cp forall a. Ord a => a -> a -> Bool
>= Int
0xAC00 Bool -> Bool -> Bool
&& Int
cp forall a. Ord a => a -> a -> Bool
<= Int
0xD7A3

-- Hangul decomposition is algorithmic; see "Hangul Syllable Decomposition" in
-- the Unicode spec, which gives this algorithm:
--
-- SBase = AC0016
-- LBase = 110016
-- VBase = 116116
-- TBase = 11A716
-- LCount = 19
-- VCount = 21
-- TCount = 28
-- NCount = 588 (VCount * TCount) SCount = 11172 (LCount * NCount)
-- SIndex = s - SBase
-- LIndex = SIndex div NCount
-- VIndex = (SIndex mod NCount) div TCount TIndex = SIndex mod TCount
-- LPart = LBase + LIndex
-- VPart = VBase + VIndex
-- TPart = TBase + TIndex if TIndex > 0
-- If TIndex = 0, then there is no trailing consonant, so map the precomposed
-- Hangul syllable s to its full decomposition d = <LPart, VPart>. Otherwise,
-- there is a trailing consonant, so map s to its full decomposition d = <LPart,
-- VPart, TPart>.

decomposeHangulSyllable :: Int -> ([Int] -> [Int])
decomposeHangulSyllable :: Int -> [Int] -> [Int]
decomposeHangulSyllable !Int
c =
  if Int
sindex forall a. Ord a => a -> a -> Bool
< Int
0 Bool -> Bool -> Bool
|| Int
sindex forall a. Ord a => a -> a -> Bool
>= Int
scount
     then (Int
cforall a. a -> [a] -> [a]
:)
     else
       let l :: Int
l = Int
lbase forall a. Num a => a -> a -> a
+ (Int
sindex forall a. Integral a => a -> a -> a
`div` Int
ncount)
           v :: Int
v = Int
vbase forall a. Num a => a -> a -> a
+ ((Int
sindex forall a. Integral a => a -> a -> a
`mod` Int
ncount) forall a. Integral a => a -> a -> a
`div` Int
tcount)
           t :: Int
t = Int
tbase forall a. Num a => a -> a -> a
+ (Int
sindex forall a. Integral a => a -> a -> a
`mod` Int
tcount)
        in if Int
t forall a. Eq a => a -> a -> Bool
/= Int
tbase
              then (Int
lforall a. a -> [a] -> [a]
:) forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Int
vforall a. a -> [a] -> [a]
:) forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Int
tforall a. a -> [a] -> [a]
:)
              else (Int
lforall a. a -> [a] -> [a]
:) forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Int
vforall a. a -> [a] -> [a]
:)
 where
  !sindex :: Int
sindex = Int
c forall a. Num a => a -> a -> a
- Int
sbase
  !sbase :: Int
sbase = Int
0xAC00
  !lbase :: Int
lbase = Int
0x1100
  !vbase :: Int
vbase = Int
0x1161
  !tbase :: Int
tbase = Int
0x11A7
  !tcount :: Int
tcount = Int
28
  !ncount :: Int
ncount = Int
588 -- vcount * tcount
  !scount :: Int
scount = Int
11172 -- lcount * ncount
  -- !lcount = 19
  -- !vcount = 21