{-# LANGUAGE DataKinds #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE PolyKinds #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE Trustworthy #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE Rank2Types #-}
{-# LANGUAGE AllowAmbiguousTypes #-}
{-# LANGUAGE OverloadedLabels #-}
{-# LANGUAGE QuantifiedConstraints #-}
{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
{-# HLINT ignore "Use camelCase" #-}

{- | A [Latent Dirichlet Allocation (LDA)](https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation)
     model (or topic model) for learning the distribution over words and topics
     in a text document.
-}

module LDA where

import Model ( Model, dirichlet, discrete', categorical )
import Sampler ( Sampler )
import Control.Monad ( replicateM )
import Data.Kind (Constraint)
import Env ( Observables, Observable(get), Assign((:=)), nil, (<:>) )
import Inference.SIM as SIM ( simulate )
import Inference.MH as MH ( mh )

{- | An LDA environment.

     Assuming 1 document with 2 topics and a vocabulary of 4 words, the
     parameters of the model environment would have the following shape:

     θ would be [[prob_topic_1, prob_topic_2]   -- probabilities of topics in document 1
                ]

     φ would be [[prob_word_1, prob_word_2, prob_word_3, prob_word_4]  -- probabilities of words in topic 1
                ,[prob_word_1, prob_word_2, prob_word_3, prob_word_4]  -- probabilities of words in topic 2
                ]
-}
type TopicEnv =
  '[ "θ" ':= [Double],  -- ^ probabilities of each topic in a document
     "φ" ':= [Double],  -- ^ probabilities of each word in a topic
     "w" ':= String     -- ^ word
   ]

-- | Prior distribution for topics in a document
docTopicPrior :: Observable env "θ" [Double]
  -- | number of topics
  => Int
  -- | probability of each topic
  -> Model env ts [Double]
docTopicPrior n_topics = dirichlet (replicate n_topics 1) #θ

-- | Prior distribution for words in a topic
topicWordPrior :: Observable env "φ" [Double]
  -- | vocabulary
  => [String]
  -- | probability of each word
  -> Model env ts [Double]
topicWordPrior vocab = dirichlet (replicate (length vocab) 1) #φ

-- | A distribution generating words according to their probabilities
wordDist :: Observable env "w" String
  -- | vocabulary
  => [String]
  -- | probability of each word
  -> [Double]
  -- | generated word
  -> Model env ts String
wordDist vocab ps = categorical (zip vocab ps) #w

-- | A topic model: generates a distribution over topics for a document and a
--   distribution over words for each topic, then generates the document's words
topicModel :: ( Observables env '["φ", "θ"] [Double]
              , Observable env "w" String )
  -- | vocabulary
  => [String]
  -- | number of topics
  -> Int
  -- | number of words
  -> Int
  -- | generated words
  -> Model env ts [String]
topicModel vocab n_topics n_words = do
  -- Generate a distribution over words for each topic
  topic_word_ps <- replicateM n_topics $ topicWordPrior vocab
  -- Generate a distribution over topics for the document
  doc_topic_ps  <- docTopicPrior n_topics
  -- For each word, draw a topic index z, then draw a word from topic z's distribution
  replicateM n_words (do z <- discrete' doc_topic_ps
                         let word_ps = topic_word_ps !! z
                         wordDist vocab word_ps)
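{- A minimal sketch of the per-word generative step inside 'topicModel',
   pulled out as a standalone helper for clarity. This function is
   illustrative only and not part of the original example: given the
   document's topic distribution and each topic's word distribution, it
   draws a topic index z, then draws a word from topic z's distribution. -}
singleWordOf :: Observable env "w" String
  -- | vocabulary
  => [String]
  -- | word distribution for each topic
  -> [[Double]]
  -- | topic distribution for the document
  -> [Double]
  -- | generated word
  -> Model env ts String
singleWordOf vocab topic_word_ps doc_topic_ps = do
  -- Draw a topic index for this word
  z <- discrete' doc_topic_ps
  -- Draw the word from that topic's distribution over the vocabulary
  wordDist vocab (topic_word_ps !! z)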
-- | A topic model over multiple documents
topicModels :: ( Observables env '["φ", "θ"] [Double]
               , Observable env "w" String )
  -- | vocabulary
  => [String]
  -- | number of topics
  -> Int
  -- | number of words for each document
  -> [Int]
  -- | generated words for each document
  -> Model env ts [[String]]
topicModels vocab n_topics doc_words =
  mapM (topicModel vocab n_topics) doc_words

-- | An example vocabulary
vocab :: [String]
vocab = ["DNA", "evolution", "parsing", "phonology"]

-- | Simulating from the topic model
simLDA :: Sampler [String]
simLDA = do
  -- Specify model inputs
  let n_words  = 100
      n_topics = 2
      -- Specify model environment
      env_in = #θ := [[0.5, 0.5]] <:>
               #φ := [[0.12491280814569208, 1.9941599739151505e-2, 0.5385152817942926, 0.3166303103208638],
                      [1.72605174564027e-2, 2.9475900240868515e-2, 9.906011619752661e-2, 0.8542034661052021]] <:>
               #w := [] <:> nil
  -- Simulate from the topic model
  (words, env_out) <- SIM.simulate (topicModel vocab n_topics) env_in n_words
  return words

-- | An example document of words: the same 20-word sequence repeated 5 times
topic_data :: [String]
topic_data = concat $ replicate 5
  [ "DNA", "evolution", "DNA", "evolution", "DNA", "evolution", "DNA", "evolution", "DNA", "evolution"
  , "parsing", "phonology", "DNA", "evolution", "DNA", "parsing", "evolution", "phonology", "evolution", "DNA" ]

-- | MH inference on the topic model
mhLDA :: Sampler ([[Double]], [[Double]])
mhLDA = do
  -- Specify model inputs
  let n_words  = 100
      n_topics = 2
      -- Specify model environment
      env_mh_in = #θ := [] <:> #φ := [] <:> #w := topic_data <:> nil
  -- Run MH for 500 iterations, learning φ and θ from the observed words
  env_mh_outs <- MH.mh 500 (topicModel vocab n_topics) (n_words, env_mh_in) ["φ", "θ"]
  -- Draw the most recently sampled parameters from the MH trace
  let env_pred = head env_mh_outs
      θs = get #θ env_pred
      φs = get #φ env_pred
  return (θs, φs)
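{- A sketch of how the two examples above could be run. This assumes the
   'Sampler' module also exports a runner such as 'sampleIO :: Sampler a -> IO a'
   (as in ProbFX); if the runner is named differently, substitute accordingly:

> import Sampler ( Sampler, sampleIO )
>
> main :: IO ()
> main = do
>   ws <- sampleIO simLDA        -- 100 words simulated from the fixed θ and φ above
>   print ws
>   (θs, φs) <- sampleIO mhLDA   -- parameters inferred from 'topic_data'
>   print θs
>   print φs
-}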