{-# LANGUAGE BangPatterns, GeneralizedNewtypeDeriving #-}
module Data.SearchEngine.DocTermIds (
    DocTermIds,
    TermId,
    fieldLength,
    fieldTermCount,
    fieldElems,
    create,
    denseTable,
    vecIndexIx,
    vecCreateIx,
  ) where

import Data.SearchEngine.TermBag (TermBag, TermId)
import qualified Data.SearchEngine.TermBag as TermBag

import Data.Vector (Vector, (!))
import qualified Data.Vector as Vec
import qualified Data.Vector.Unboxed as UVec
import Data.Ix (Ix)
import qualified Data.Ix as Ix


-- | The 'TermId's for the 'Term's that occur in a document. Documents may have
-- multiple fields and the 'DocTerms' type holds them separately for each field.
--
newtype DocTermIds field = DocTermIds (Vector TermBag)
  deriving (Int -> DocTermIds field -> ShowS
[DocTermIds field] -> ShowS
DocTermIds field -> String
(Int -> DocTermIds field -> ShowS)
-> (DocTermIds field -> String)
-> ([DocTermIds field] -> ShowS)
-> Show (DocTermIds field)
forall field. Int -> DocTermIds field -> ShowS
forall field. [DocTermIds field] -> ShowS
forall field. DocTermIds field -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: forall field. Int -> DocTermIds field -> ShowS
showsPrec :: Int -> DocTermIds field -> ShowS
$cshow :: forall field. DocTermIds field -> String
show :: DocTermIds field -> String
$cshowList :: forall field. [DocTermIds field] -> ShowS
showList :: [DocTermIds field] -> ShowS
Show)

getField :: (Ix field, Bounded field) => DocTermIds field -> field -> TermBag
getField :: forall field.
(Ix field, Bounded field) =>
DocTermIds field -> field -> TermBag
getField (DocTermIds Vector TermBag
fieldVec) = Vector TermBag -> field -> TermBag
forall ix a. (Ix ix, Bounded ix) => Vector a -> ix -> a
vecIndexIx Vector TermBag
fieldVec

create :: (Ix field, Bounded field) =>
          (field -> [TermId]) -> DocTermIds field
create :: forall field.
(Ix field, Bounded field) =>
(field -> [TermId]) -> DocTermIds field
create field -> [TermId]
docTermIds =
    Vector TermBag -> DocTermIds field
forall field. Vector TermBag -> DocTermIds field
DocTermIds ((field -> TermBag) -> Vector TermBag
forall ix a. (Ix ix, Bounded ix) => (ix -> a) -> Vector a
vecCreateIx ([TermId] -> TermBag
TermBag.fromList ([TermId] -> TermBag) -> (field -> [TermId]) -> field -> TermBag
forall b c a. (b -> c) -> (a -> b) -> a -> c
. field -> [TermId]
docTermIds))

-- | The number of terms in a field within the document.
fieldLength :: (Ix field, Bounded field) => DocTermIds field -> field -> Int
fieldLength :: forall field.
(Ix field, Bounded field) =>
DocTermIds field -> field -> Int
fieldLength DocTermIds field
docterms field
field =
    TermBag -> Int
TermBag.size (DocTermIds field -> field -> TermBag
forall field.
(Ix field, Bounded field) =>
DocTermIds field -> field -> TermBag
getField DocTermIds field
docterms field
field)

-- | /O(log n)/ The frequency of a particular term in a field within the document.
--
fieldTermCount :: (Ix field, Bounded field) =>
                  DocTermIds field -> field -> TermId -> Int
fieldTermCount :: forall field.
(Ix field, Bounded field) =>
DocTermIds field -> field -> TermId -> Int
fieldTermCount DocTermIds field
docterms field
field TermId
termid =
    TermCount -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (TermBag -> TermId -> TermCount
TermBag.termCount (DocTermIds field -> field -> TermBag
forall field.
(Ix field, Bounded field) =>
DocTermIds field -> field -> TermBag
getField DocTermIds field
docterms field
field) TermId
termid)

fieldElems :: (Ix field, Bounded field) => DocTermIds field -> field -> [TermId]
fieldElems :: forall field.
(Ix field, Bounded field) =>
DocTermIds field -> field -> [TermId]
fieldElems DocTermIds field
docterms field
field =
    TermBag -> [TermId]
TermBag.elems (DocTermIds field -> field -> TermBag
forall field.
(Ix field, Bounded field) =>
DocTermIds field -> field -> TermBag
getField DocTermIds field
docterms field
field)

-- | The 'DocTermIds' is really a sparse 2d array, and doing lookups with
-- 'fieldTermCount' has a O(log n) cost. This function converts to a dense
-- tabular representation which then enables linear scans.
--
denseTable :: (Ix field, Bounded field) => DocTermIds field ->
              (Int, Int -> TermId, Int -> field -> Int)
denseTable :: forall field.
(Ix field, Bounded field) =>
DocTermIds field -> (Int, Int -> TermId, Int -> field -> Int)
denseTable (DocTermIds Vector TermBag
fieldVec) =
    let (!Vector TermId
termids, !Vector TermCount
termcounts) = [TermBag] -> (Vector TermId, Vector TermCount)
TermBag.denseTable (Vector TermBag -> [TermBag]
forall a. Vector a -> [a]
Vec.toList Vector TermBag
fieldVec)
        !numTerms :: Int
numTerms = Vector TermId -> Int
forall a. Unbox a => Vector a -> Int
UVec.length Vector TermId
termids
     in ( Int
numTerms
        , \Int
i    -> Vector TermId
termids Vector TermId -> Int -> TermId
forall a. Unbox a => Vector a -> Int -> a
UVec.! Int
i
        , \Int
i field
ix -> let j :: Int
j = (field, field) -> field -> Int
forall a. Ix a => (a, a) -> a -> Int
Ix.index (field
forall a. Bounded a => a
minBound, field
forall a. Bounded a => a
maxBound) field
ix
                    in TermCount -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Vector TermCount
termcounts Vector TermCount -> Int -> TermCount
forall a. Unbox a => Vector a -> Int -> a
UVec.! (Int
j Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
numTerms Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
i))
        )

---------------------------------
-- Vector indexed by Ix Bounded
--

vecIndexIx  :: (Ix ix, Bounded ix) => Vector a -> ix -> a
vecIndexIx :: forall ix a. (Ix ix, Bounded ix) => Vector a -> ix -> a
vecIndexIx Vector a
vec ix
ix = Vector a
vec Vector a -> Int -> a
forall a. Vector a -> Int -> a
! (ix, ix) -> ix -> Int
forall a. Ix a => (a, a) -> a -> Int
Ix.index (ix
forall a. Bounded a => a
minBound, ix
forall a. Bounded a => a
maxBound) ix
ix

vecCreateIx :: (Ix ix, Bounded ix) => (ix -> a) -> Vector a
vecCreateIx :: forall ix a. (Ix ix, Bounded ix) => (ix -> a) -> Vector a
vecCreateIx ix -> a
f = Int -> [a] -> Vector a
forall a. Int -> [a] -> Vector a
Vec.fromListN ((ix, ix) -> Int
forall a. Ix a => (a, a) -> Int
Ix.rangeSize (ix, ix)
bounds)
                  [ a
y | ix
ix <- (ix, ix) -> [ix]
forall a. Ix a => (a, a) -> [a]
Ix.range (ix, ix)
bounds, let !y :: a
y = ix -> a
f ix
ix ]
  where
    bounds :: (ix, ix)
bounds = (ix
forall a. Bounded a => a
minBound, ix
forall a. Bounded a => a
maxBound)