module NLP.Hext.NaiveBayes (makeMaterial,
                            runBayes,
                            Classified(..),
                            Material,
                            Labeled(..),
                            FList,
                            -- * Example: Simple Usage
                            -- $simpleExample
                            ) where

import qualified Data.Map.Lazy as M
import Data.Maybe
import Data.Char
import Data.Function
import Data.List
import qualified Data.Text.Lazy as T

-- | A frequency list of words
type FList = M.Map T.Text Int -- TODO make hash

-- | A frequency list of words that has been assigned a class
data Labeled a = Labeled { flist :: FList, _lClass :: a}

-- | A list of labeled data
type Material a = [Labeled a]

-- | A class together with its probability of occurring
data Classified a = Classified { _cClass :: a, _probability :: Double } deriving (Eq)

instance (Eq a) => Ord (Classified a) where compare = compare `on` _probability

-- | Creates learning material for the program combining
-- samples and their corresponding classes into
-- a 'Labeled' datatype.
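--
-- For example, reusing the @Class@ type from the usage example at the
-- bottom of this module:
--
-- > makeMaterial [("I loved the movie", Positive), ("poor acting", Negative)]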
makeMaterial :: [(String, a)] -- ^ a list of text samples and their corresponding classes
                -> Material a
makeMaterial = map (\(sample, c) -> Labeled (vectorize sample) c)

-- | Runs a sample string through the Naive Bayes algorithm using
-- training material made by 'makeMaterial'
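--
-- For example, with @material@ built by 'makeMaterial' as in the usage
-- example at the bottom of this module:
--
-- > runBayes material "I loved the great acting"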
runBayes :: (Eq a) => Material a  -- ^ learning material made with 'makeMaterial'
            -> String -- ^ the sample string to be classified
            -> a -- ^ the most probable class for the sample
runBayes trainingMaterial sample =
    argmax $ classify trainingMaterial (T.words . removePunctuation $ T.pack sample)

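-- scores each class c against the sample words with the unnormalized
-- Naive Bayes rule:
--
--   score c = pClass c * product [pWordGivenClass w (n_c + |V|) fl_c | w <- sample]
--
-- where fl_c is the union of the frequency lists labeled c, n_c is its
-- total word count, and |V| is the number of unique words across all of
-- the material; 'runBayes' then picks the class with the highest score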
classify :: (Eq a) => Material a -> [T.Text] -> [Classified a]
classify mat = f where
    classes = nub [c | (Labeled _ c) <- mat]
    lengthVocab = totalUniqueWords . unions $ vecs mat
    prob c s =
        let caseC = unions . vecs $ filter (\(Labeled _ cl) -> c == cl) mat
            n = totalWords caseC
            denom = n + lengthVocab
        in foldl' (\acc word -> pWordGivenClass word denom caseC * acc) (pClass c mat) s
    f s = [Classified c $ prob c s | c <- classes]

-- the probability that a class occurs,
-- given a set of learning material
pClass :: (Eq a) => a -> Material a -> Double
pClass _ [] = error "no material given"
pClass cl docs =
    let count = length $ filter (\(Labeled _ clas) -> clas == cl) docs
    in fromIntegral count / fromIntegral (length docs)

-- the probability that a word occurs given the class,
-- estimated with add-one (Laplace) smoothing
pWordGivenClass :: T.Text -> Int -> FList -> Double
pWordGivenClass w denom currentCase =
    fromIntegral (nk + 1) / fromIntegral denom where
        nk = totalOfWord w currentCase

-- returns the class with the highest probability
argmax :: (Eq a) => [Classified a] -> a
argmax = _cClass . maximum

removePunctuation :: T.Text -> T.Text
removePunctuation = T.filter (not . isPunctuation)

-- takes a sample string and builds a frequency list of its words
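-- e.g. vectorize "good movie. good acting" maps "good" to 2
-- and "acting" and "movie" to 1 each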
vectorize :: String -> FList
vectorize = 
    M.fromListWith (+) . flip zip (repeat 1) . T.words . removePunctuation . T.pack

-- a list of frequency lists, derived from a set of material
vecs :: Material a -> [FList]
vecs = map flist

-- the union of multiple frequency lists;
-- adds the occurrences of each word together
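-- e.g. the union of {"movie": 1} and {"movie": 2, "good": 1}
-- is {"movie": 3, "good": 1}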
unions :: [FList] -> FList
unions = M.unionsWith (+)

totalUniqueWords :: FList -> Int
totalUniqueWords = M.size

totalWords :: FList -> Int
totalWords = M.foldl' (+) 0 

totalOfWord :: T.Text -> FList -> Int
totalOfWord word doc = M.findWithDefault 0 word doc

{- $simpleExample

In this example, a list of sample reviews and a list of their corresponding
classes are zipped into an association list, which is passed to the
'makeMaterial' function. The resulting material is then passed to the
'runBayes' function along with a new review, which is classified based on
the training material that was given.

> data Class = Positive | Negative deriving (Eq, Show)

> doc1 = "I loved the movie"
> doc2 = "I hated the movie"
> doc3 = "a great movie. good movie"
> doc4 = "poor acting"
> doc5 = "great acting. a good movie"

> docs = [doc1, doc2, doc3, doc4, doc5]
> correspondingClasses = [Positive, Negative, Positive, Negative, Positive]
> classifiedDocs = zip docs correspondingClasses

> main :: IO ()
> main = do
>     let material = makeMaterial classifiedDocs
>     let review = "I loved the great acting"
>     let result = runBayes material review
>     
>     putStrLn $ "The review '" ++ review ++ "' is " ++ show result
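
With the material above, the classifier should label the review as
@Positive@, so the program prints:

> The review 'I loved the great acting' is Positive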
-}