-- | A small Naive Bayes text classifier. Word likelihoods are estimated
-- from per-class word counts with add-one smoothing over the vocabulary.
module NLP.Hext.NaiveBayes
  ( FrequencyList
  , Labeled(..)
  , Classified(..)
  , BayesModel(..)
  , emptyModel
  , teach
  , runBayes
    -- * Example: Simple Usage

    -- $simpleExample
  ) where

import qualified Data.HashMap.Strict as H
import qualified Data.Set as S
import Data.Maybe
import Data.Char
import Data.Function
import Data.List
import qualified Data.Text.Lazy as T

-- | A hash map from each word to the number of times it occurs
type FrequencyList = H.HashMap T.Text Int

-- | A frequency list of words that has been assigned a class
data Labeled a = Labeled
  { hash  :: FrequencyList -- ^ a frequency list
  , label :: a             -- ^ the class label for a piece of text
  }

-- | A class which has a specific probability of occurring
data Classified a = Classified
  { _class      :: a      -- ^ the class
  , probability :: Double -- ^ the score computed for that class
  } deriving (Eq)

-- | A model representing the knowledge that has been given
data BayesModel a = BayesModel
  { classes  :: S.Set a       -- ^ a set of user-defined classes
  , vocab    :: FrequencyList -- ^ the frequency list of all vocabulary
  , material :: [Labeled a]   -- ^ a list of all of the classified text
  }

-- Shows the known classes followed by the vocabulary; the training
-- material itself is not shown.
instance (Show a) => Show (BayesModel a) where
  show model = show (classes model) ++ " " ++ show (vocab model)

-- Orders by 'probability' only. Two values with different classes but the
-- same probability therefore compare as EQ even though they are not (==),
-- so a 'S.Set' of 'Classified' keeps just one of them.
instance (Eq a) => Ord (Classified a) where
  compare = compare `on` probability

instance (Show a) => Show (Classified a) where
  show c = show (_class c, probability c)

-- | An empty model to begin teaching
emptyModel :: BayesModel a
emptyModel = BayesModel S.empty H.empty []

-- | Teaches the model one labeled sample.
--
-- The sample is stripped of punctuation and split on whitespace; its word
-- counts are added to the model's vocabulary and the labeled sample is
-- prepended to the training material.
teach :: (Ord a)
      => T.Text       -- ^ the sample
      -> a            -- ^ sample's class
      -> BayesModel a -- ^ the current model
      -> BayesModel a -- ^ the new model
teach source c model =
  BayesModel
    { classes  = S.insert c (classes model)
      -- Sum the counts: a plain left-biased union would overwrite the
      -- accumulated count of every word that is already known.
    , vocab    = H.unionWith (+) fl (vocab model)
    , material = Labeled fl c : material model
    }
  where
    fl = vectorize source

-- | Runs a sample string through the Naive Bayes algorithm using
-- a model containing all knowledge from previous learning.
--
-- The sample is tokenized exactly like the training text (punctuation
-- removed, split on whitespace) so that its words can match the
-- vocabulary. Calls 'error' if the model has not been taught any class,
-- because there is then no class to return.
runBayes :: (Ord a, Eq a)
         => BayesModel a -- ^ a model that has been taught using 'teach'
         -> String       -- ^ the sample string to be classified
         -> a            -- ^ the class with the highest score for the sample
runBayes model sample = argmax $ classify model (tokenize $ T.pack sample)

-- Scores every class of the model against the given words.
--
-- For each class the prior ('pClass') is multiplied by the smoothed
-- likelihood of every word ('pWordGivenClass').
-- NOTE(review): the likelihoods are multiplied directly as Doubles, so a
-- very long sample may underflow to 0 for every class.
classify :: (Ord a, Eq a) => BayesModel a -> [T.Text] -> S.Set (Classified a)
classify model wrds = S.map score (classes model)
  where
    mat = material model
    lengthVocab = H.size (vocab model)
    score c = Classified c (prob c)
    prob c =
      let -- merged word counts of all material labeled with class c
          caseC = unions . vecs $ filter ((== c) . label) mat
          -- add-one smoothing: class word total plus vocabulary size
          denom = totalWords caseC + lengthVocab
      in foldl' (\acc word -> pWordGivenClass word denom caseC * acc)
                (pClass c mat)
                wrds

-- the probability that a class occurs,
-- given a set of learning material
pClass :: (Eq a) => a -> [Labeled a] -> Double
pClass _ [] = 0
pClass cl docs = fromIntegral matching / fromIntegral (length docs)
  where
    matching = length (filter ((== cl) . label) docs)

-- the probability the word occurs given the class
pWordGivenClass :: T.Text -> Int -> FrequencyList -> Double
pWordGivenClass w denom currentCase = fromIntegral (nk + 1) / fromIntegral denom
  where
    nk = totalOfWord w currentCase

-- returns the class which has the highest probability associated with it
argmax :: (Eq a) => S.Set (Classified a) -> a
argmax scored
  | S.null scored =
      error "NLP.Hext.NaiveBayes.runBayes: the model has not been taught any classes"
  | otherwise = _class (S.findMax scored)

removePunctuation :: T.Text -> T.Text
removePunctuation = T.filter (not . isPunctuation)

-- strips punctuation and splits on whitespace; shared by training and
-- classification so both see the same tokens
tokenize :: T.Text -> [T.Text]
tokenize = T.words . removePunctuation

-- takes a piece of text and makes a frequency list of its words
vectorize :: T.Text -> FrequencyList
vectorize = H.fromListWith (+) . flip zip (repeat 1) . tokenize

-- a list of frequency lists, derived from a set of material
vecs :: [Labeled a] -> [FrequencyList]
vecs = map hash

-- the union of multiple frequency lists
-- adds occurrences of each word together
unions :: [FrequencyList] -> FrequencyList
unions = foldl' (H.unionWith (+)) H.empty

-- the total number of word occurrences in a frequency list
totalWords :: FrequencyList -> Int
totalWords = H.foldl' (+) 0

-- the number of occurrences of a word, 0 when the word is absent
totalOfWord :: T.Text -> FrequencyList -> Int
totalOfWord word doc = H.lookupDefault 0 word doc

{- $simpleExample

In this example a list of sample reviews and their corresponding classes
are zipped into an association list. Each pair is fed to 'teach', starting
from 'emptyModel', to build up a model. The resulting model is then passed
into the 'runBayes' function, along with a new review. This will classify
the new review based on the training material that has been given.

> import Data.List (foldl')
> import qualified Data.Text.Lazy as T
> import NLP.Hext.NaiveBayes
>
> data Class = Positive | Negative deriving (Eq, Ord, Show)
>
> doc1 = "I loved the movie"
> doc2 = "I hated the movie"
> doc3 = "a great movie. good movie"
> doc4 = "poor acting"
> doc5 = "great acting. a good movie"
>
> docs = [doc1, doc2, doc3, doc4, doc5]
> correspondingClasses = [Positive, Negative, Positive, Negative, Positive]
> classifiedDocs = zip docs correspondingClasses
>
> main :: IO ()
> main = do
>   -- teachMultiple folds the labeled samples into a BayesModel Class
>   let teachMultiple = foldl' (\m (sample, cl) -> teach (T.pack sample) cl m) emptyModel
>
>   let review = "I hated the poor acting"
>   let result = runBayes (teachMultiple classifiedDocs) review
>
>   putStrLn $ "The review '" ++ review ++ "' is " ++ show result -- Negative

-}