src/Algorithms/MDP/ValueIteration.hs

-- | This module provides several flavors of the value iteration
-- algorithm for solving MDPs.
module Algorithms.MDP.ValueIteration
       ( -- * Value iteration algorithms
         valueIteration
       , relativeValueIteration
       , undiscountedRelativeValueIteration
         -- * Helper functions for value iteration
       , valueIterate
       , relativeValueIterate
       , undiscountedRVI
       ) where

import qualified Data.Vector as V

import Algorithms.MDP

-- | Compute the inner product between two vectors.
inner :: (Num t) => V.Vector t -> V.Vector t -> t
inner u v = V.sum (V.zipWith (*) u v)

-- | Compute an infinite sequence of estimates of cost functions
-- converging to the true cost function.
--
-- This method should only be used on discounted MDPs (e.g. an MDP
-- with a discount factor less than one).
valueIteration ::
  (Ord t, Num t) => 
  MDP a b t      -- ^ The MDP to solve
  -> [CF a b t]  -- ^ An converging sequence of cost functions
valueIteration mdp =
  let
    states = _states mdp
    actions = _actions mdp

    zero = V.map (\s -> (s, V.head actions, 0)) states
  in
    iterate (valueIterate mdp) zero

-- | Computes the next estimate of the cost function.
valueIterate :: (Ord t, Num t) => 
                MDP a b t -- ^ The MDP to solve
             -> CF a b t  -- ^ The current cost function estimate
             -> CF a b t  -- ^ The next cost function estimate
valueIterate mdp cf = V.imap (choiceFor mdp cf) (_states mdp)

-- | Finds the action that minimizes the one-step payoff using the
-- given cost function.
choiceFor :: (Ord t, Num t) =>
             MDP a b t -- ^ The MDP we are solving
          -> CF a b t  -- ^ The current cost function
          -> Int       -- ^ The state for which we choose an action
          -> a         -- ^ The state for which we choose an action
          -> (a, b, t) -- ^ The choice of action and associated cost
choiceFor mdp cf sIndex s =
  let

    actions = V.fromList [(_actions mdp) V.! ac' | ac' <- V.toList ((_actionSet mdp) V.! sIndex)]
    
    cmp (_, x) (_, y) = compare x y
    costs = V.map (costForAction mdp cf sIndex) (_actionSet mdp V.! sIndex)
    pairs = V.zip actions costs
    (ac, c) = V.minimumBy cmp pairs
  in
    (s, ac, c)

-- | Computes the cost implied by choosing an action in the given
-- state.
costForAction :: (Num t) => 
                 MDP a b t -- ^ The MDP we are solving.
              -> CF a b t  -- ^ The current cost function.
              -> Int       -- ^ The index of the state.
              -> Int       -- ^ The index of the action.
              -> t         -- ^ The estimated cost.
costForAction mdp cf sIndex ac =
  let
    alpha = _discount mdp
    fixedCost = (_costs mdp) V.! ac V.! sIndex
    transCost = inner (_trans mdp V.! ac V.! sIndex) (V.map (\(_, _, c) -> c) cf)
  in
    fixedCost + alpha * transCost

-- | An implementation of value iteration that computes monotonic
-- error bounds.
--
-- The error bounds provided at each iteration are additive in each
-- state. That is, given a cost estimate 'c' for a given state and
-- lower and upper bounds 'lb' and 'ub', the true cost is guaranteed
-- to be in the interval [c + lb, c + ub].
relativeValueIteration ::
  (Read t, Ord t, Fractional t) => 
  MDP a b t           -- ^ The MDP to solve
  -> [CFBounds a b t] -- ^ A converging sequence of cost functions.
relativeValueIteration mdp =
  let
    states = _states mdp
    actions = _actions mdp

    zero = V.map (\s -> (s, V.head actions, 0)) states

    cf = CFBounds zero (read "-Infinity") (read "Infinity")
  in
    iterate (relativeValueIterate mdp) cf

-- | Computes the next estimate of the cost function and associated
-- error bounds.
relativeValueIterate ::
  (Ord t, Fractional t) => 
  MDP a b t 
  -> CFBounds a b t 
  -> CFBounds a b t
relativeValueIterate mdp (CFBounds cf _ _) =
  let
    alpha = _discount mdp
    cf' = valueIterate mdp cf
    (lb, ub) = (V.minimum diffs, V.maximum diffs)
      where
        diffs = V.zipWith (\(_, _, a) (_, _, b) -> a - b) cf' cf
    scale = alpha / (1 - alpha)
  in 
    CFBounds
    { _CF = cf'
    , _lb = scale * lb
    , _ub = scale * ub
    }

-- | Relative value iteration for undiscounted MDPs.
undiscountedRelativeValueIteration ::
  (Ord t, Fractional t, Read t) =>
  MDP a b t           -- ^ The MDP to solve
  -> [CFBounds a b t] -- ^ A converging sequence of cost functions
undiscountedRelativeValueIteration mdp =
  let
    states = _states mdp
    actions = _actions mdp

    trans  = _trans mdp
    update s v = V.imap (\i z -> tau * z + if i == s then (1 - tau) else 0) v

    trans' = V.map (\vv -> V.imap (\s v -> update s v) vv) trans

    tau = 0.5
    mdp' = mdp {_trans = trans'}
    zeroV = V.map (\s -> (s, V.head actions, 0)) states
    zero = CFBounds zeroV (read "-Infinity") (read "Infinity")
    distinguished = 0
  in
    iterate (undiscountedRVI mdp' distinguished) zero

-- | Performs a single iterate of relative value iteration for the
-- undiscounted problem.
undiscountedRVI :: (Ord t, Fractional t) =>
                   MDP a b t
                -> Int
                -> CFBounds a b t
                -> CFBounds a b t
undiscountedRVI mdp distinguished (CFBounds h _ _) =
  let
    th = valueIterate mdp h
    (_, _, distinguishedCost) = th V.! distinguished

    th' = V.map (\(s, ac, z) -> (s, ac, z - distinguishedCost)) th

    (lb, ub) = (V.minimum diffs, V.maximum diffs)
      where
        diffs = V.zipWith (\(_, _, a) (_, _, b) -> a - b) th h

  in
    CFBounds th' lb ub