Copyright | (c) 2019 Composewell Technologies |
---|---|
License | BSD3 |
Maintainer | streamly@composewell.com |
Stability | experimental |
Portability | GHC |
Safe Haskell | Safe-Inferred |
Language | Haskell2010 |
Synopsis
- toSet :: (Monad m, Ord a) => Fold m a (Set a)
- toIntSet :: Monad m => Fold m Int IntSet
- countDistinct :: (Monad m, Ord a) => Fold m a Int
- countDistinctInt :: Monad m => Fold m Int Int
- nub :: (Monad m, Ord a) => Fold m a (Maybe a)
- nubInt :: Monad m => Fold m Int (Maybe Int)
- frequency :: (Monad m, Ord a) => Fold m a (Map a Int)
- demuxKvToContainer :: (Monad m, IsMap f, Traversable f) => (Key f -> m (Fold m a b)) -> Fold m (Key f, a) (f b)
- demuxKvToMap :: (Monad m, Ord k) => (k -> m (Fold m a b)) -> Fold m (k, a) (Map k b)
- demuxToContainer :: (Monad m, IsMap f, Traversable f) => (a -> Key f) -> (a -> m (Fold m a b)) -> Fold m a (f b)
- demuxToContainerIO :: (MonadIO m, IsMap f, Traversable f) => (a -> Key f) -> (a -> m (Fold m a b)) -> Fold m a (f b)
- demuxToMap :: (Monad m, Ord k) => (a -> k) -> (a -> m (Fold m a b)) -> Fold m a (Map k b)
- demuxToMapIO :: (MonadIO m, Ord k) => (a -> k) -> (a -> m (Fold m a b)) -> Fold m a (Map k b)
- demuxGeneric :: (Monad m, IsMap f, Traversable f) => (a -> Key f) -> (a -> m (Fold m a b)) -> Fold m a (m (f b), Maybe (Key f, b))
- demux :: (Monad m, Ord k) => (a -> k) -> (a -> m (Fold m a b)) -> Fold m a (m (Map k b), Maybe (k, b))
- demuxGenericIO :: (MonadIO m, IsMap f, Traversable f) => (a -> Key f) -> (a -> m (Fold m a b)) -> Fold m a (m (f b), Maybe (Key f, b))
- demuxIO :: (MonadIO m, Ord k) => (a -> k) -> (a -> m (Fold m a b)) -> Fold m a (m (Map k b), Maybe (k, b))
- kvToMap :: (Monad m, Ord k) => Fold m a b -> Fold m (k, a) (Map k b)
- toContainer :: (Monad m, IsMap f, Traversable f, Ord (Key f)) => (a -> Key f) -> Fold m a b -> Fold m a (f b)
- toContainerIO :: (MonadIO m, IsMap f, Traversable f, Ord (Key f)) => (a -> Key f) -> Fold m a b -> Fold m a (f b)
- toMap :: (Monad m, Ord k) => (a -> k) -> Fold m a b -> Fold m a (Map k b)
- toMapIO :: (MonadIO m, Ord k) => (a -> k) -> Fold m a b -> Fold m a (Map k b)
- classifyGeneric :: (Monad m, IsMap f, Traversable f, Ord (Key f)) => (a -> Key f) -> Fold m a b -> Fold m a (m (f b), Maybe (Key f, b))
- classify :: (Monad m, Ord k) => (a -> k) -> Fold m a b -> Fold m a (m (Map k b), Maybe (k, b))
- classifyGenericIO :: (MonadIO m, IsMap f, Traversable f, Ord (Key f)) => (a -> Key f) -> Fold m a b -> Fold m a (m (f b), Maybe (Key f, b))
- classifyIO :: (MonadIO m, Ord k) => (a -> k) -> Fold m a b -> Fold m a (m (Map k b), Maybe (k, b))
Imports
>>>
:m
>>>
:set -XFlexibleContexts
>>>
import qualified Data.Map as Map
>>>
import qualified Data.Set as Set
>>>
import qualified Data.IntSet as IntSet
>>>
import qualified Streamly.Data.Fold as Fold
>>>
import qualified Streamly.Data.Stream as Stream
>>>
import qualified Streamly.Internal.Data.Fold.Container as Fold
Set operations
toSet :: (Monad m, Ord a) => Fold m a (Set a) Source #
Fold the input to a set.
Definition:
>>>
toSet = Fold.foldl' (flip Set.insert) Set.empty
toIntSet :: Monad m => Fold m Int IntSet Source #
Fold the input to an int set. For integer inputs this performs better than
toSet
.
Definition:
>>>
toIntSet = Fold.foldl' (flip IntSet.insert) IntSet.empty
countDistinct :: (Monad m, Ord a) => Fold m a Int Source #
Count non-duplicate elements in the stream.
Definition:
>>>
countDistinct = fmap Set.size Fold.toSet
>>>
countDistinct = Fold.postscan Fold.nub $ Fold.catMaybes $ Fold.length
The memory used is proportional to the number of distinct elements in the stream, to guard against using too much memory use it as a scan and terminate if the count reaches more than a threshold.
Space: \(\mathcal{O}(n)\)
Pre-release
countDistinctInt :: Monad m => Fold m Int Int Source #
Like countDistinct
but specialized to a stream of Int
, for better
performance.
Definition:
>>>
countDistinctInt = fmap IntSet.size Fold.toIntSet
>>>
countDistinctInt = Fold.postscan Fold.nubInt $ Fold.catMaybes $ Fold.length
Pre-release
Map operations
frequency :: (Monad m, Ord a) => Fold m a (Map a Int) Source #
Determine the frequency of each element in the stream.
You can just collect the keys of the resulting map to get the unique elements in the stream.
Definition:
>>>
frequency = Fold.toMap id Fold.length
Demultiplexing
Direct values in the input stream to different folds using an n-ary
fold selector. demux
is a generalization of classify
(and
partition
) where each key of the classifier can use a different fold.
demuxKvToContainer :: (Monad m, IsMap f, Traversable f) => (Key f -> m (Fold m a b)) -> Fold m (Key f, a) (f b) Source #
demuxKvToMap :: (Monad m, Ord k) => (k -> m (Fold m a b)) -> Fold m (k, a) (Map k b) Source #
Fold a stream of key value pairs using a function that maps keys to folds.
Definition:
>>>
demuxKvToMap f = Fold.demuxToContainer fst (Fold.lmap snd . f)
Example:
>>>
import Data.Map (Map)
>>>
:{
let f "SUM" = return Fold.sum f _ = return Fold.product input = Stream.fromList [("SUM",1),("PRODUCT",2),("SUM",3),("PRODUCT",4)] in Stream.fold (Fold.demuxKvToMap f) input :: IO (Map String Int) :} fromList [("PRODUCT",8),("SUM",4)]
Pre-release
demuxToContainer :: (Monad m, IsMap f, Traversable f) => (a -> Key f) -> (a -> m (Fold m a b)) -> Fold m a (f b) Source #
demuxToContainerIO :: (MonadIO m, IsMap f, Traversable f) => (a -> Key f) -> (a -> m (Fold m a b)) -> Fold m a (f b) Source #
demuxToMap :: (Monad m, Ord k) => (a -> k) -> (a -> m (Fold m a b)) -> Fold m a (Map k b) Source #
This collects all the results of demux
in a Map.
demuxToMapIO :: (MonadIO m, Ord k) => (a -> k) -> (a -> m (Fold m a b)) -> Fold m a (Map k b) Source #
Same as demuxToMap
but uses demuxIO
for better performance.
demuxGeneric :: (Monad m, IsMap f, Traversable f) => (a -> Key f) -> (a -> m (Fold m a b)) -> Fold m a (m (f b), Maybe (Key f, b)) Source #
demux :: (Monad m, Ord k) => (a -> k) -> (a -> m (Fold m a b)) -> Fold m a (m (Map k b), Maybe (k, b)) Source #
In a key value stream, fold values corresponding to each key with a key specific fold. The fold returns the fold result as the second component of the output tuple whenever a fold terminates. The first component of the tuple is a Map of in-progress folds. If a fold terminates, another instance of the fold is started upon receiving an input with that key.
This can be used to scan a stream and collect the results from the scan output.
Pre-release
demuxGenericIO :: (MonadIO m, IsMap f, Traversable f) => (a -> Key f) -> (a -> m (Fold m a b)) -> Fold m a (m (f b), Maybe (Key f, b)) Source #
demuxIO :: (MonadIO m, Ord k) => (a -> k) -> (a -> m (Fold m a b)) -> Fold m a (m (Map k b), Maybe (k, b)) Source #
This is specialized version of demux
that uses mutable IO cells as
fold accumulators for better performance.
Classifying
In an input stream of key value pairs fold values for different keys
in individual output buckets using the given fold. classify
is a
special case of demux
where all the branches of the demultiplexer use
the same fold.
Different types of maps can be used with these combinators via the IsMap type class. Hashmap performs better when there are more collisions, trie Map performs better otherwise. Trie has an advantage of sorting the keys at the same time. For example if we want to store a dictionary of words and their meanings then trie Map would be better if we also want to display them in sorted order.
kvToMap :: (Monad m, Ord k) => Fold m a b -> Fold m (k, a) (Map k b) Source #
Given an input stream of key value pairs and a fold for values, fold all the values belonging to each key. Useful for map/reduce, bucketizing the input in different bins or for generating histograms.
Definition:
>>>
kvToMap = Fold.toMap fst . Fold.lmap snd
Example:
>>>
:{
let input = Stream.fromList [("ONE",1),("ONE",1.1),("TWO",2), ("TWO",2.2)] in Stream.fold (Fold.kvToMap Fold.toList) input :} fromList [("ONE",[1.0,1.1]),("TWO",[2.0,2.2])]
Pre-release
toContainer :: (Monad m, IsMap f, Traversable f, Ord (Key f)) => (a -> Key f) -> Fold m a b -> Fold m a (f b) Source #
toContainerIO :: (MonadIO m, IsMap f, Traversable f, Ord (Key f)) => (a -> Key f) -> Fold m a b -> Fold m a (f b) Source #
toMap :: (Monad m, Ord k) => (a -> k) -> Fold m a b -> Fold m a (Map k b) Source #
Split the input stream based on a key field and fold each split using the given fold. Useful for map/reduce, bucketizing the input in different bins or for generating histograms.
Example:
>>>
import Data.Map.Strict (Map)
>>>
:{
let input = Stream.fromList [("ONE",1),("ONE",1.1),("TWO",2), ("TWO",2.2)] classify = Fold.toMap fst (Fold.lmap snd Fold.toList) in Stream.fold classify input :: IO (Map String [Double]) :} fromList [("ONE",[1.0,1.1]),("TWO",[2.0,2.2])]
Once the classifier fold terminates for a particular key any further inputs in that bucket are ignored.
Space used is proportional to the number of keys seen till now and monotonically increases because it stores whether a key has been seen or not.
See demuxToMap
for a more powerful version where you can use a different
fold for each key. A simpler version of toMap
retaining only the last
value for a key can be written as:
>>>
toMap = Fold.foldl' (\kv (k, v) -> Map.insert k v kv) Map.empty
Stops: never
Pre-release
toMapIO :: (MonadIO m, Ord k) => (a -> k) -> Fold m a b -> Fold m a (Map k b) Source #
Same as toMap
but maybe faster because it uses mutable cells as
fold accumulators in the Map.
classifyGeneric :: (Monad m, IsMap f, Traversable f, Ord (Key f)) => (a -> Key f) -> Fold m a b -> Fold m a (m (f b), Maybe (Key f, b)) Source #
classify :: (Monad m, Ord k) => (a -> k) -> Fold m a b -> Fold m a (m (Map k b), Maybe (k, b)) Source #
Folds the values for each key using the supplied fold. When scanning, as soon as the fold is complete, its result is available in the second component of the tuple. The first component of the tuple is a snapshot of the in-progress folds.
Once the fold for a key is done, any future values of the key are ignored.
Definition:
>>>
classify f fld = Fold.demux f (const fld)
classifyGenericIO :: (MonadIO m, IsMap f, Traversable f, Ord (Key f)) => (a -> Key f) -> Fold m a b -> Fold m a (m (f b), Maybe (Key f, b)) Source #
classifyIO :: (MonadIO m, Ord k) => (a -> k) -> Fold m a b -> Fold m a (m (Map k b), Maybe (k, b)) Source #
Same as classify except that it uses mutable IORef cells in the Map providing better performance. Be aware that if this is used as a scan, the values in the intermediate Maps would be mutable.
Definitions:
>>>
classifyIO f fld = Fold.demuxIO f (const fld)