module NaiveBayes (makeMaterial,
runBayes,
Classified(..),
Material(..),
Labeled(..),
FList,
)where
import qualified Data.Map.Lazy as M
import Data.Maybe
import Data.Char
import Data.Function
import Data.List
import qualified Data.Text.Lazy as T
type FList = M.Map T.Text Int
data Labeled a = Labeled { flist :: FList, _lClass :: a}
type Material a = [Labeled a]
data Classified a = Classified { _cClass :: a, _probability :: Double } deriving (Eq)
instance (Eq a) => Ord (Classified a) where compare = compare `on` _probability
makeMaterial :: [(String,a)]
-> Material a
makeMaterial ((s,c):rest) = (Labeled (vectorize s) c): makeMaterial rest
makeMaterial [] = []
runBayes :: (Eq a) => Material a
-> String
-> a
runBayes trainingMaterial sample =
argmax $ classify trainingMaterial (T.words $ T.pack sample)
classify :: (Eq a) => Material a -> [T.Text] -> [Classified a]
classify mat = f where
classes = nub [c | (Labeled f c) <- mat]
lengthVocab = totalUniqueWords . unions $ vecs mat
prob c s =
let caseC = unions . vecs $ filter (\(Labeled fl cl) -> c == cl) mat
n = totalWords caseC
denom = n + lengthVocab
in foldl' (\acc word -> (pWordGivenClass word denom caseC) * acc) (pClass c mat) s
f s = [Classified c $ prob c s | c <- classes]
pClass :: (Eq a) => a -> Material a -> Double
pClass cl [] = error "no material given"
pClass cl docs =
let count = length $ filter (\(Labeled fl clas) -> clas == cl) docs
in (fromIntegral count) / (fromIntegral $ length docs)
pWordGivenClass :: T.Text -> Int -> FList -> Double
pWordGivenClass w denom currentCase =
(fromIntegral (nk + 1)) / (fromIntegral denom) where
nk = totalOfWord w currentCase
argmax :: (Eq a) => [Classified a] -> a
argmax = _cClass . maximum
removePunctuation :: T.Text -> T.Text
removePunctuation = T.filter (not . isPunctuation)
vectorize :: String -> FList
vectorize =
M.fromListWith (+) . flip zip (repeat 1) . T.words . removePunctuation . T.pack
vecs :: Material a -> [FList]
vecs = map flist
unions :: [FList] -> FList
unions = M.unionsWith (+)
totalUniqueWords :: FList -> Int
totalUniqueWords = M.size
totalWords :: FList -> Int
totalWords = M.foldl' (+) 0
totalOfWord :: T.Text -> FList -> Int
totalOfWord word doc = M.findWithDefault 0 word doc