Safe Haskell | None |
---|---|
Language | Haskell2010 |
Random projection trees for approximate nearest neighbor search in high-dimensional vector spaces
Synopsis
- forest :: (MonadThrow m, Inner SVector v) => Word64 -> Int -> Int -> Int -> Int -> Double -> Int -> ConduitT () (v Double) m () -> m (RPForest Double (Vector (v Double)))
- knn :: (Ord p, Inner SVector v, Unbox d, Real d) => (v2 -> v d -> p) -> Int -> RPForest d (Vector v2) -> v d -> Vector (p, v2)
- recall :: (Inner u v, Inner SVector v, Unbox a, Ord a, Ord (u a), Floating a) => RPForest a (Vector (u a)) -> Int -> v a -> Double
- levels :: RPTree d a -> Int
- points :: Monoid m => RPTree d m -> m
- leaves :: RPTree d a -> [a]
- candidates :: (Inner SVector v, Unbox d, Ord d, Num d, Semigroup xs) => RPTree d xs -> v d -> xs
- data RPTree d a
- type RPForest d a = IntMap (RPTree d a)
- data SVector a
- fromListSv :: Unbox a => Int -> [(Int, a)] -> SVector a
- data DVector a
- fromListDv :: Unbox a => [a] -> DVector a
- class (Scale u, Scale v) => Inner u v where
- class Scale v where
- dataSource :: Monad m => Int -> GenT m a -> ConduitT i a (GenT m) ()
- sparse :: (Monad m, Unbox a) => Double -> Int -> GenT m a -> GenT m (SVector a)
- dense :: (Monad m, Vector Vector a) => Int -> GenT m a -> GenT m (DVector a)
- draw :: (Show a, Boxed a, PrintfArg v) => RPTree v a -> IO ()
- writeCsv :: (Show a, Show b, Unbox a) => FilePath -> [(DVector a, b)] -> IO ()
Construction
:: (MonadThrow m, Inner SVector v) | |
=> Word64 | random seed |
-> Int | max tree depth |
-> Int | min leaf size |
-> Int | number of trees |
-> Int | data chunk size |
-> Double | nonzero density of projection vectors |
-> Int | dimension of projection vectors |
-> ConduitT () (v Double) m () | data source |
-> m (RPForest Double (Vector (v Double))) |
Populate a forest from a data stream
Assumptions on the data source:
- non-empty : contains at least one value
- stationary : each chunk is representative of the whole dataset
- bounded : we wait until the end of the stream to produce a result
Throws EmptyResult
if the conduit is empty
Query
:: (Ord p, Inner SVector v, Unbox d, Real d) | |
=> (v2 -> v d -> p) | distance function |
-> Int | k neighbors |
-> RPForest d (Vector v2) | random projection forest |
-> v d | query point |
-> Vector (p, v2) | ordered in increasing distance order |
k nearest neighbors
Validation
:: (Inner u v, Inner SVector v, Unbox a, Ord a, Ord (u a), Floating a) | |
=> RPForest a (Vector (u a)) | |
-> Int | k : number of nearest neighbors to consider |
-> v a | query point |
-> Double |
average recall-at-k, computed over a set of trees
Access
Retrieve points nearest to the query
in case of a narrow margin, collect both branches of the tree
Types
RPTree
Random projection trees
The first type parameter corresponds to a floating point scalar value, the second is the type of the data collected at the leaves of the tree (e.g. lists of vectors)
We keep them separate to leverage the Functor instance for postprocessing and visualization
One projection vector per tree level (as suggested in https://www.cs.helsinki.fi/u/ttonteri/pub/bigdata2016.pdf )
Instances
Functor (RPTree d) Source # | |
Foldable (RPTree d) Source # | |
Defined in Data.RPTree.Internal fold :: Monoid m => RPTree d m -> m # foldMap :: Monoid m => (a -> m) -> RPTree d a -> m # foldMap' :: Monoid m => (a -> m) -> RPTree d a -> m # foldr :: (a -> b -> b) -> b -> RPTree d a -> b # foldr' :: (a -> b -> b) -> b -> RPTree d a -> b # foldl :: (b -> a -> b) -> b -> RPTree d a -> b # foldl' :: (b -> a -> b) -> b -> RPTree d a -> b # foldr1 :: (a -> a -> a) -> RPTree d a -> a # foldl1 :: (a -> a -> a) -> RPTree d a -> a # elem :: Eq a => a -> RPTree d a -> Bool # maximum :: Ord a => RPTree d a -> a # minimum :: Ord a => RPTree d a -> a # | |
Traversable (RPTree d) Source # | |
(Unbox d, Eq d, Eq a) => Eq (RPTree d a) Source # | |
(Unbox d, Show d, Show a) => Show (RPTree d a) Source # | |
Generic (RPTree d a) Source # | |
(NFData a, NFData d) => NFData (RPTree d a) Source # | |
Defined in Data.RPTree.Internal | |
(Serialise d, Serialise a, Unbox d) => Serialise (RPTree d a) Source # | |
type Rep (RPTree d a) Source # | |
Defined in Data.RPTree.Internal |
Sparse vectors with unboxed components
Instances
Scale SVector Source # | |
Inner SVector Vector Source # | |
Defined in Data.RPTree.Internal | |
Inner SVector DVector Source # | |
Defined in Data.RPTree.Internal | |
Inner SVector SVector Source # | |
Defined in Data.RPTree.Internal | |
(Unbox a, Eq a) => Eq (SVector a) Source # | |
(Unbox a, Ord a) => Ord (SVector a) Source # | |
Defined in Data.RPTree.Internal | |
(Unbox a, Show a) => Show (SVector a) Source # | |
Generic (SVector a) Source # | |
NFData (SVector a) Source # | |
Defined in Data.RPTree.Internal | |
(Unbox a, Serialise a) => Serialise (SVector a) Source # | |
type Rep (SVector a) Source # | |
Defined in Data.RPTree.Internal type Rep (SVector a) = D1 ('MetaData "SVector" "Data.RPTree.Internal" "rp-tree-0.1.0.0-CFkPWzCADA43VltLgcxOCH" 'False) (C1 ('MetaCons "SV" 'PrefixI 'True) (S1 ('MetaSel ('Just "svDim") 'NoSourceUnpackedness 'SourceStrict 'DecidedStrict) (Rec0 Int) :*: S1 ('MetaSel ('Just "svVec") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 (Vector (Int, a))))) |
Dense vectors with unboxed components
Instances
Scale DVector Source # | |
Inner DVector DVector Source # | |
Defined in Data.RPTree.Internal | |
Inner SVector DVector Source # | |
Defined in Data.RPTree.Internal | |
(Unbox a, Eq a) => Eq (DVector a) Source # | |
(Unbox a, Ord a) => Ord (DVector a) Source # | |
Defined in Data.RPTree.Internal | |
(Unbox a, Show a) => Show (DVector a) Source # | |
Generic (DVector a) Source # | |
(Unbox a, Serialise a) => Serialise (DVector a) Source # | |
type Rep (DVector a) Source # | |
Defined in Data.RPTree.Internal |
fromListDv :: Unbox a => [a] -> DVector a Source #
inner product
class (Scale u, Scale v) => Inner u v where Source #
Inner product spaces
This typeclass is provided as a convenience for library users to interface their own vector types.
inner :: (Unbox a, Num a) => u a -> v a -> a Source #
metricL2 :: (Unbox a, Floating a) => u a -> v a -> a Source #
Conduit
:: Monad m | |
=> Int | number of vectors to generate |
-> GenT m a | random generator for the vector components |
-> ConduitT i a (GenT m) () |
Source of random data points
Random generation
vector
:: (Monad m, Unbox a) | |
=> Double | nonzero density |
-> Int | vector dimension |
-> GenT m a | random generator of vector components |
-> GenT m (SVector a) |
Generate a sparse random vector with a given nonzero density and components sampled from the supplied random generator
:: (Monad m, Vector Vector a) | |
=> Int | vector dimension |
-> GenT m a | random generator of vector components |
-> GenT m (DVector a) |
Generate a dense random vector with components sampled from the supplied random generator
Rendering
draw :: (Show a, Boxed a, PrintfArg v) => RPTree v a -> IO () Source #
Render a tree to stdout
Useful for debugging
This should be called only for small trees, otherwise the printed result quickly overflows the screen and becomes hard to read.
NB : prints distance information rounded to two decimal digits