{-# LANGUAGE TupleSections #-}

-- |
-- Module    : Data.Kanji
-- Copyright : (c) Colin Woodbury, 2015 - 2020
-- License   : GPL3
-- Maintainer: Colin Woodbury <colin@fosskers.ca>
--
-- A library for analysing the density of Kanji in given texts,
-- according to their "Level" classification, as defined by the
-- Japan Kanji Aptitude Testing Foundation (日本漢字能力検定協会).

module Data.Kanji
  (
    -- * Kanji
    Kanji
  , kanji, _kanji
  , allKanji
  , isKanji, isHiragana, isKatakana
    -- * Character Categories
  , CharCat(..)
  , category
    -- * Levels
  , Level(..)
  , level
    -- * Analysis
  , percentSpread
  , levelDist
  , uniques
    -- ** Densities
  , densities
  , elementaryDen
  , middleDen
  , highDen
  ) where

import           Control.Arrow hiding (second)
import           Data.Foldable (fold)
import           Data.Kanji.Levels
import           Data.Kanji.Types
import           Data.List (group, sort)
import qualified Data.Map.Strict as M
import qualified Data.Set as S
import qualified Data.Text as T

---

-- | All Japanese `Kanji`, grouped by their Level (級).
allKanji :: M.Map Level (S.Set Kanji)
allKanji :: Map Level (Set Kanji)
allKanji =  [(Level, Set Kanji)] -> Map Level (Set Kanji)
forall k a. Ord k => [(k, a)] -> Map k a
M.fromList ([(Level, Set Kanji)] -> Map Level (Set Kanji))
-> ([Set Kanji] -> [(Level, Set Kanji)])
-> [Set Kanji]
-> Map Level (Set Kanji)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Level] -> [Set Kanji] -> [(Level, Set Kanji)]
forall a b. [a] -> [b] -> [(a, b)]
zip [ Level
Ten .. ] ([Set Kanji] -> Map Level (Set Kanji))
-> [Set Kanji] -> Map Level (Set Kanji)
forall a b. (a -> b) -> a -> b
$ (Set Char -> Set Kanji) -> [Set Char] -> [Set Kanji]
forall a b. (a -> b) -> [a] -> [b]
map ((Char -> Kanji) -> Set Char -> Set Kanji
forall b a. Ord b => (a -> b) -> Set a -> Set b
S.map Char -> Kanji
Kanji) [Set Char]
ks
  where ks :: [Set Char]
ks = [ Set Char
tenth, Set Char
ninth, Set Char
eighth, Set Char
seventh, Set Char
sixth
             , Set Char
fifth, Set Char
fourth, Set Char
third, Set Char
preSecond, Set Char
second ]

-- | All Japanese `Kanji` with their `Level`.
allKanji' :: M.Map Kanji Level
allKanji' :: Map Kanji Level
allKanji' = [(Kanji, Level)] -> Map Kanji Level
forall k a. Ord k => [(k, a)] -> Map k a
M.fromList ([(Kanji, Level)] -> Map Kanji Level)
-> (Map Level (Set (Kanji, Level)) -> [(Kanji, Level)])
-> Map Level (Set (Kanji, Level))
-> Map Kanji Level
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Set (Kanji, Level) -> [(Kanji, Level)]
forall a. Set a -> [a]
S.toList (Set (Kanji, Level) -> [(Kanji, Level)])
-> (Map Level (Set (Kanji, Level)) -> Set (Kanji, Level))
-> Map Level (Set (Kanji, Level))
-> [(Kanji, Level)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Map Level (Set (Kanji, Level)) -> Set (Kanji, Level)
forall (t :: * -> *) m. (Foldable t, Monoid m) => t m -> m
fold (Map Level (Set (Kanji, Level)) -> Map Kanji Level)
-> Map Level (Set (Kanji, Level)) -> Map Kanji Level
forall a b. (a -> b) -> a -> b
$ (Level -> Set Kanji -> Set (Kanji, Level))
-> Map Level (Set Kanji) -> Map Level (Set (Kanji, Level))
forall k a b. (k -> a -> b) -> Map k a -> Map k b
M.mapWithKey (\Level
k Set Kanji
v -> (Kanji -> (Kanji, Level)) -> Set Kanji -> Set (Kanji, Level)
forall b a. Ord b => (a -> b) -> Set a -> Set b
S.map (,Level
k) Set Kanji
v) Map Level (Set Kanji)
allKanji

-- | What `Level` does a Kanji belong to? `Unknown` for Kanji above level `Two`.
level :: Kanji -> Level
level :: Kanji -> Level
level Kanji
k = Level -> (Level -> Level) -> Maybe Level -> Level
forall b a. b -> (a -> b) -> Maybe a -> b
maybe Level
Unknown Level -> Level
forall a. a -> a
id (Maybe Level -> Level) -> Maybe Level -> Level
forall a b. (a -> b) -> a -> b
$ Kanji -> Map Kanji Level -> Maybe Level
forall k a. Ord k => k -> Map k a -> Maybe a
M.lookup Kanji
k Map Kanji Level
allKanji'
{-# INLINE level #-}

-- | Percentage of appearance of each `CharCat` in the source text.
-- The percentages will sum to 1.0.
densities :: T.Text -> M.Map CharCat Float
densities :: Text -> Map CharCat Float
densities Text
t = [(CharCat, Float)] -> Map CharCat Float
forall k a. Ord k => [(k, a)] -> Map k a
M.fromList ([(CharCat, Float)] -> Map CharCat Float)
-> ([Char] -> [(CharCat, Float)]) -> [Char] -> Map CharCat Float
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ([CharCat] -> (CharCat, Float))
-> [[CharCat]] -> [(CharCat, Float)]
forall a b. (a -> b) -> [a] -> [b]
map ([CharCat] -> CharCat
forall a. [a] -> a
head ([CharCat] -> CharCat)
-> ([CharCat] -> Float) -> [CharCat] -> (CharCat, Float)
forall (a :: * -> * -> *) b c c'.
Arrow a =>
a b c -> a b c' -> a b (c, c')
&&& [CharCat] -> Float
forall a (t :: * -> *) a. (Fractional a, Foldable t) => t a -> a
f) ([[CharCat]] -> [(CharCat, Float)])
-> ([Char] -> [[CharCat]]) -> [Char] -> [(CharCat, Float)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [CharCat] -> [[CharCat]]
forall a. Eq a => [a] -> [[a]]
group ([CharCat] -> [[CharCat]])
-> ([Char] -> [CharCat]) -> [Char] -> [[CharCat]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [CharCat] -> [CharCat]
forall a. Ord a => [a] -> [a]
sort ([CharCat] -> [CharCat])
-> ([Char] -> [CharCat]) -> [Char] -> [CharCat]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> CharCat) -> [Char] -> [CharCat]
forall a b. (a -> b) -> [a] -> [b]
map Char -> CharCat
category ([Char] -> Map CharCat Float) -> [Char] -> Map CharCat Float
forall a b. (a -> b) -> a -> b
$ Text -> [Char]
T.unpack Text
t
  where f :: t a -> a
f t a
xs = Int -> a
forall a b. (Integral a, Num b) => a -> b
fromIntegral (t a -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length t a
xs) a -> a -> a
forall a. Fractional a => a -> a -> a
/ Int -> a
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Text -> Int
T.length Text
t)

-- | How much of the Kanji found are learnt in elementary school in Japan?
--
-- > elementaryDen . levelDist :: [Kanji] -> Float
elementaryDen :: M.Map Level Float -> Float
elementaryDen :: Map Level Float -> Float
elementaryDen Map Level Float
m = (Float -> Float -> Float) -> Float -> Map Level Float -> Float
forall a b k. (a -> b -> a) -> a -> Map k b -> a
M.foldl' Float -> Float -> Float
forall a. Num a => a -> a -> a
(+) Float
0 (Map Level Float -> Float)
-> (Set Level -> Map Level Float) -> Set Level -> Float
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Map Level Float -> Set Level -> Map Level Float
forall k a. Ord k => Map k a -> Set k -> Map k a
M.restrictKeys Map Level Float
m (Set Level -> Float) -> Set Level -> Float
forall a b. (a -> b) -> a -> b
$ [Level] -> Set Level
forall a. Ord a => [a] -> Set a
S.fromList [ Level
Five, Level
Six .. ]

-- | How much of the Kanji found are learnt by the end of middle school?
--
-- > middleDen . levelDist :: [Kanji] -> Float
middleDen :: M.Map Level Float -> Float
middleDen :: Map Level Float -> Float
middleDen Map Level Float
m = (Float -> Float -> Float) -> Float -> Map Level Float -> Float
forall a b k. (a -> b -> a) -> a -> Map k b -> a
M.foldl' Float -> Float -> Float
forall a. Num a => a -> a -> a
(+) Float
0 (Map Level Float -> Float)
-> (Set Level -> Map Level Float) -> Set Level -> Float
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Map Level Float -> Set Level -> Map Level Float
forall k a. Ord k => Map k a -> Set k -> Map k a
M.restrictKeys Map Level Float
m (Set Level -> Float) -> Set Level -> Float
forall a b. (a -> b) -> a -> b
$ [Level] -> Set Level
forall a. Ord a => [a] -> Set a
S.fromList [ Level
Three, Level
Four .. ]

-- | How much of the Kanji found are learnt by the end of high school?
--
-- > highDen . levelDist :: [Kanji] -> Float
highDen :: M.Map Level Float -> Float
highDen :: Map Level Float -> Float
highDen Map Level Float
m = (Float -> Float -> Float) -> Float -> Map Level Float -> Float
forall a b k. (a -> b -> a) -> a -> Map k b -> a
M.foldl' Float -> Float -> Float
forall a. Num a => a -> a -> a
(+) Float
0 (Map Level Float -> Float)
-> (Set Level -> Map Level Float) -> Set Level -> Float
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Map Level Float -> Set Level -> Map Level Float
forall k a. Ord k => Map k a -> Set k -> Map k a
M.restrictKeys Map Level Float
m (Set Level -> Float) -> Set Level -> Float
forall a b. (a -> b) -> a -> b
$ [Level] -> Set Level
forall a. Ord a => [a] -> Set a
S.fromList [ Level
Two, Level
PreTwo .. ]

-- | How much of each `Level` is represented by a group of Kanji?
-- The distribution values will sum to 1.
levelDist :: [Kanji] -> M.Map Level Float
levelDist :: [Kanji] -> Map Level Float
levelDist [Kanji]
ks = [(Level, Float)] -> Map Level Float
forall k a. Ord k => [(k, a)] -> Map k a
M.fromList ([(Level, Float)] -> Map Level Float)
-> ([Level] -> [(Level, Float)]) -> [Level] -> Map Level Float
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ([Level] -> (Level, Float)) -> [[Level]] -> [(Level, Float)]
forall a b. (a -> b) -> [a] -> [b]
map [Level] -> (Level, Float)
forall a. [a] -> (a, Float)
percentPair ([[Level]] -> [(Level, Float)])
-> ([Level] -> [[Level]]) -> [Level] -> [(Level, Float)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Level] -> [[Level]]
forall a. Eq a => [a] -> [[a]]
group ([Level] -> [[Level]])
-> ([Level] -> [Level]) -> [Level] -> [[Level]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Level] -> [Level]
forall a. Ord a => [a] -> [a]
sort ([Level] -> Map Level Float) -> [Level] -> Map Level Float
forall a b. (a -> b) -> a -> b
$ (Kanji -> Level) -> [Kanji] -> [Level]
forall a b. (a -> b) -> [a] -> [b]
map Kanji -> Level
level [Kanji]
ks
  where percentPair :: [a] -> (a, Float)
percentPair [a]
qns = ([a] -> a
forall a. [a] -> a
head [a]
qns, Int -> Float
forall a b. (Integral a, Num b) => a -> b
fromIntegral ([a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [a]
qns) Float -> Float -> Float
forall a. Fractional a => a -> a -> a
/ Float
totalKs)
        totalKs :: Float
totalKs = Int -> Float
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Float) -> Int -> Float
forall a b. (a -> b) -> a -> b
$ [Kanji] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Kanji]
ks

-- | The distribution of each `Kanji` in a set of them.
-- The distribution values must sum to 1.
percentSpread :: [Kanji] -> M.Map Kanji Float
percentSpread :: [Kanji] -> Map Kanji Float
percentSpread [Kanji]
ks = Int -> Float
forall a. Integral a => a -> Float
getPercent (Int -> Float) -> Map Kanji Int -> Map Kanji Float
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Map Kanji Int
kQuants
    where getPercent :: a -> Float
getPercent a
q = a -> Float
forall a b. (Integral a, Num b) => a -> b
fromIntegral a
q Float -> Float -> Float
forall a. Fractional a => a -> a -> a
/ Float
totalKanji
          kQuants :: Map Kanji Int
kQuants      = [Kanji] -> Map Kanji Int
kanjiQuantities [Kanji]
ks
          totalKanji :: Float
totalKanji   = Int -> Float
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Float) -> Int -> Float
forall a b. (a -> b) -> a -> b
$ (Int -> Int -> Int) -> Int -> Map Kanji Int -> Int
forall a b k. (a -> b -> a) -> a -> Map k b -> a
M.foldl' Int -> Int -> Int
forall a. Num a => a -> a -> a
(+) Int
0 Map Kanji Int
kQuants

-- | Determines how many times each `Kanji` appears in given set of them.
kanjiQuantities :: [Kanji] -> M.Map Kanji Int
kanjiQuantities :: [Kanji] -> Map Kanji Int
kanjiQuantities = [(Kanji, Int)] -> Map Kanji Int
forall k a. Ord k => [(k, a)] -> Map k a
M.fromList ([(Kanji, Int)] -> Map Kanji Int)
-> ([Kanji] -> [(Kanji, Int)]) -> [Kanji] -> Map Kanji Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ([Kanji] -> (Kanji, Int)) -> [[Kanji]] -> [(Kanji, Int)]
forall a b. (a -> b) -> [a] -> [b]
map ([Kanji] -> Kanji
forall a. [a] -> a
head ([Kanji] -> Kanji) -> ([Kanji] -> Int) -> [Kanji] -> (Kanji, Int)
forall (a :: * -> * -> *) b c c'.
Arrow a =>
a b c -> a b c' -> a b (c, c')
&&& [Kanji] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length) ([[Kanji]] -> [(Kanji, Int)])
-> ([Kanji] -> [[Kanji]]) -> [Kanji] -> [(Kanji, Int)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Kanji] -> [[Kanji]]
forall a. Eq a => [a] -> [[a]]
group ([Kanji] -> [[Kanji]])
-> ([Kanji] -> [Kanji]) -> [Kanji] -> [[Kanji]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Kanji] -> [Kanji]
forall a. Ord a => [a] -> [a]
sort

-- | Which Kanji appeared from each Level in the text?
uniques :: [Kanji] -> M.Map Level (S.Set Kanji)
uniques :: [Kanji] -> Map Level (Set Kanji)
uniques = (Map Level (Set Kanji) -> Kanji -> Map Level (Set Kanji))
-> Map Level (Set Kanji) -> Set Kanji -> Map Level (Set Kanji)
forall a b. (a -> b -> a) -> a -> Set b -> a
S.foldl' Map Level (Set Kanji) -> Kanji -> Map Level (Set Kanji)
h Map Level (Set Kanji)
forall k a. Map k a
M.empty (Set Kanji -> Map Level (Set Kanji))
-> ([Kanji] -> Set Kanji) -> [Kanji] -> Map Level (Set Kanji)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Kanji] -> Set Kanji
forall a. Ord a => [a] -> Set a
S.fromList
  where h :: Map Level (Set Kanji) -> Kanji -> Map Level (Set Kanji)
h Map Level (Set Kanji)
a Kanji
k = (\Level
l -> (Set Kanji -> Set Kanji -> Set Kanji)
-> Level
-> Set Kanji
-> Map Level (Set Kanji)
-> Map Level (Set Kanji)
forall k a. Ord k => (a -> a -> a) -> k -> a -> Map k a -> Map k a
M.insertWith Set Kanji -> Set Kanji -> Set Kanji
forall a. Semigroup a => a -> a -> a
(<>) Level
l (Kanji -> Set Kanji
forall a. a -> Set a
S.singleton Kanji
k) Map Level (Set Kanji)
a) (Level -> Map Level (Set Kanji)) -> Level -> Map Level (Set Kanji)
forall a b. (a -> b) -> a -> b
$ Kanji -> Level
level Kanji
k