{-# LANGUAGE CPP #-}
{-# LANGUAGE BangPatterns #-}
#if !defined(TESTING) && defined(__GLASGOW_HASKELL__)
{-# LANGUAGE Trustworthy #-}
#endif
{-# OPTIONS_HADDOCK not-home #-}
-----------------------------------------------------------------------------
-- |
-- Copyright : (c) Daan Leijen 2002
-- License : BSD-style
--
-- = WARNING
--
-- This module is considered __internal__.
--
-- The Package Versioning Policy __does not apply__.
--
-- This contents of this module may change __in any way whatsoever__
-- and __without any warning__ between minor versions of this package.
--
-- Authors importing this module are expected to track development
-- closely.
--
-- = Description
--
-- An efficient implementation of sets.
--
-- These modules are intended to be imported qualified, to avoid name
-- clashes with Prelude functions, e.g.
--
-- > import Data.Set (Set)
-- > import qualified Data.Set as Set
--
-- The implementation of 'Set' is based on /size balanced/ binary trees (or
-- trees of /bounded balance/) as described by:
--
-- * Stephen Adams, \"/Efficient sets: a balancing act/\",
-- Journal of Functional Programming 3(4):553-562, October 1993,
-- .
-- * J. Nievergelt and E.M. Reingold,
-- \"/Binary search trees of bounded balance/\",
-- SIAM journal of computing 2(1), March 1973.
--
-- Bounds for 'union', 'intersection', and 'difference' are as given
-- by
--
-- * Guy Blelloch, Daniel Ferizovic, and Yihan Sun,
-- \"/Just Join for Parallel Ordered Sets/\",
-- .
--
-- Note that the implementation is /left-biased/ -- the elements of a
-- first argument are always preferred to the second, for example in
-- 'union' or 'insert'. Of course, left-biasing can only be observed
-- when equality is an equivalence relation instead of structural
-- equality.
--
-- /Warning/: The size of the set must not exceed @maxBound::Int@. Violation of
-- this condition is not detected and if the size limit is exceeded, the
-- behavior of the set is completely undefined.
--
-- @since 0.5.9
-----------------------------------------------------------------------------
-- [Note: Using INLINABLE]
-- ~~~~~~~~~~~~~~~~~~~~~~~
-- It is crucial to the performance that the functions specialize on the Ord
-- type when possible. GHC 7.0 and higher does this by itself when it sees th
-- unfolding of a function -- that is why all public functions are marked
-- INLINABLE (that exposes the unfolding).
-- [Note: Using INLINE]
-- ~~~~~~~~~~~~~~~~~~~~
-- For other compilers and GHC pre 7.0, we mark some of the functions INLINE.
-- We mark the functions that just navigate down the tree (lookup, insert,
-- delete and similar). That navigation code gets inlined and thus specialized
-- when possible. There is a price to pay -- code growth. The code INLINED is
-- therefore only the tree navigation, all the real work (rebalancing) is not
-- INLINED by using a NOINLINE.
--
-- All methods marked INLINE have to be nonrecursive -- a 'go' function doing
-- the real work is provided.
-- [Note: Type of local 'go' function]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-- If the local 'go' function uses an Ord class, it sometimes heap-allocates
-- the Ord dictionary when the 'go' function does not have explicit type.
-- In that case we give 'go' explicit type. But this slightly decrease
-- performance, as the resulting 'go' function can float out to top level.
-- [Note: Local 'go' functions and capturing]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-- As opposed to IntSet, when 'go' function captures an argument, increased
-- heap-allocation can occur: sometimes in a polymorphic function, the 'go'
-- floats out of its enclosing function and then it heap-allocates the
-- dictionary and the argument. Maybe it floats out too late and strictness
-- analyzer cannot see that these could be passed on stack.
-- [Note: Order of constructors]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-- The order of constructors of Set matters when considering performance.
-- Currently in GHC 7.0, when type has 2 constructors, a forward conditional
-- jump is made when successfully matching second constructor. Successful match
-- of first constructor results in the forward jump not taken.
-- On GHC 7.0, reordering constructors from Tip | Bin to Bin | Tip
-- improves the benchmark by up to 10% on x86.
module Data.Set.Private (
-- * Set type
Set(..) -- instance Eq,Ord,Show,Read,Data,Typeable
, Size
, insertBy'
, empty
) where
import Prelude hiding (filter,foldl,foldr,null,map,take,drop,splitAt)
import Control.Monad (join)
#if __GLASGOW_HASKELL__
import GHC.Exts ( lazy )
#endif
{--------------------------------------------------------------------
Sets are size balanced trees
--------------------------------------------------------------------}
-- | A set of values @a@.
-- See Note: Order of constructors
data Set a = Bin {-# UNPACK #-} !Size !a !(Set a) !(Set a) | Tip
type Size = Int
{--------------------------------------------------------------------
Query
--------------------------------------------------------------------}
-- | /O(1)/. The number of elements in the set.
size :: Set a -> Int
size Tip = 0
size (Bin sz _ _ _) = sz
{--------------------------------------------------------------------
Construction
--------------------------------------------------------------------}
-- | /O(1)/. The empty set.
empty :: Set a
empty = Tip
-- | /O(1)/. Create a singleton set.
singleton :: a -> Set a
singleton x = Bin 1 x Tip Tip
{--------------------------------------------------------------------
Insertion, Deletion
--------------------------------------------------------------------}
-- | /O(log n)/. Insert an element in a set.
-- If the set already contains an element equal to the given value,
-- it is replaced with the new value.
-- See Note: Type of local 'go' function
-- See Note: Avoiding worker/wrapper (in Data.Map.Internal)
insertBy' :: (a -> a -> Ordering) -> a -> Set a -> Maybe (Set a)
insertBy' compare = join go
where
go orig !_ Tip = Just $! singleton (lazy orig)
go orig !x (Bin _ y l r) = case compare x y of
LT -> (\ !l' -> balanceL y l' r) <$!> go orig x l
GT -> (\ !r' -> balanceR y l r') <$!> go orig x r
EQ -> Nothing
#if __GLASGOW_HASKELL__
{-# INLINABLE insertBy' #-}
#else
{-# INLINE insertBy' #-}
#endif
infixl 4 <$!>
(<$!>) :: (a -> b) -> Maybe a -> Maybe b
(<$!>) f = \ case
Nothing -> Nothing
Just a -> Just $! f a
#ifndef __GLASGOW_HASKELL__
lazy :: a -> a
lazy a = a
#endif
{--------------------------------------------------------------------
[balance x l r] balances two trees with value x.
The sizes of the trees should balance after decreasing the
size of one of them. (a rotation).
[delta] is the maximal relative difference between the sizes of
two trees, it corresponds with the [w] in Adams' paper.
[ratio] is the ratio between an outer and inner sibling of the
heavier subtree in an unbalanced setting. It determines
whether a double or single rotation should be performed
to restore balance. It is correspondes with the inverse
of $\alpha$ in Adam's article.
Note that according to the Adam's paper:
- [delta] should be larger than 4.646 with a [ratio] of 2.
- [delta] should be larger than 3.745 with a [ratio] of 1.534.
But the Adam's paper is errorneous:
- it can be proved that for delta=2 and delta>=5 there does
not exist any ratio that would work
- delta=4.5 and ratio=2 does not work
That leaves two reasonable variants, delta=3 and delta=4,
both with ratio=2.
- A lower [delta] leads to a more 'perfectly' balanced tree.
- A higher [delta] performs less rebalancing.
In the benchmarks, delta=3 is faster on insert operations,
and delta=4 has slightly better deletes. As the insert speedup
is larger, we currently use delta=3.
--------------------------------------------------------------------}
delta,ratio :: Int
delta = 3
ratio = 2
-- The balance function is equivalent to the following:
--
-- balance :: a -> Set a -> Set a -> Set a
-- balance x l r
-- | sizeL + sizeR <= 1 = Bin sizeX x l r
-- | sizeR > delta*sizeL = rotateL x l r
-- | sizeL > delta*sizeR = rotateR x l r
-- | otherwise = Bin sizeX x l r
-- where
-- sizeL = size l
-- sizeR = size r
-- sizeX = sizeL + sizeR + 1
--
-- rotateL :: a -> Set a -> Set a -> Set a
-- rotateL x l r@(Bin _ _ ly ry) | size ly < ratio*size ry = singleL x l r
-- | otherwise = doubleL x l r
-- rotateR :: a -> Set a -> Set a -> Set a
-- rotateR x l@(Bin _ _ ly ry) r | size ry < ratio*size ly = singleR x l r
-- | otherwise = doubleR x l r
--
-- singleL, singleR :: a -> Set a -> Set a -> Set a
-- singleL x1 t1 (Bin _ x2 t2 t3) = bin x2 (bin x1 t1 t2) t3
-- singleR x1 (Bin _ x2 t1 t2) t3 = bin x2 t1 (bin x1 t2 t3)
--
-- doubleL, doubleR :: a -> Set a -> Set a -> Set a
-- doubleL x1 t1 (Bin _ x2 (Bin _ x3 t2 t3) t4) = bin x3 (bin x1 t1 t2) (bin x2 t3 t4)
-- doubleR x1 (Bin _ x2 t1 (Bin _ x3 t2 t3)) t4 = bin x3 (bin x2 t1 t2) (bin x1 t3 t4)
--
-- It is only written in such a way that every node is pattern-matched only once.
--
-- Only balanceL and balanceR are needed at the moment, so balance is not here anymore.
-- In case it is needed, it can be found in Data.Map.
-- Functions balanceL and balanceR are specialised versions of balance.
-- balanceL only checks whether the left subtree is too big,
-- balanceR only checks whether the right subtree is too big.
-- balanceL is called when left subtree might have been inserted to or when
-- right subtree might have been deleted from.
balanceL :: a -> Set a -> Set a -> Set a
balanceL x l r = case r of
Tip -> case l of
Tip -> Bin 1 x Tip Tip
(Bin _ _ Tip Tip) -> Bin 2 x l Tip
(Bin _ lx Tip (Bin _ lrx _ _)) -> Bin 3 lrx (Bin 1 lx Tip Tip) (Bin 1 x Tip Tip)
(Bin _ lx ll@(Bin _ _ _ _) Tip) -> Bin 3 lx ll (Bin 1 x Tip Tip)
(Bin ls lx ll@(Bin lls _ _ _) lr@(Bin lrs lrx lrl lrr))
| lrs < ratio*lls -> Bin (1+ls) lx ll (Bin (1+lrs) x lr Tip)
| otherwise -> Bin (1+ls) lrx (Bin (1+lls+size lrl) lx ll lrl) (Bin (1+size lrr) x lrr Tip)
(Bin rs _ _ _) -> case l of
Tip -> Bin (1+rs) x Tip r
(Bin ls lx ll lr)
| ls > delta*rs -> case (ll, lr) of
(Bin lls _ _ _, Bin lrs lrx lrl lrr)
| lrs < ratio*lls -> Bin (1+ls+rs) lx ll (Bin (1+rs+lrs) x lr r)
| otherwise -> Bin (1+ls+rs) lrx (Bin (1+lls+size lrl) lx ll lrl) (Bin (1+rs+size lrr) x lrr r)
(_, _) -> error "Failure in Data.Map.balanceL"
| otherwise -> Bin (1+ls+rs) x l r
{-# NOINLINE balanceL #-}
-- balanceR is called when right subtree might have been inserted to or when
-- left subtree might have been deleted from.
balanceR :: a -> Set a -> Set a -> Set a
balanceR x l r = case l of
Tip -> case r of
Tip -> Bin 1 x Tip Tip
(Bin _ _ Tip Tip) -> Bin 2 x Tip r
(Bin _ rx Tip rr@(Bin _ _ _ _)) -> Bin 3 rx (Bin 1 x Tip Tip) rr
(Bin _ rx (Bin _ rlx _ _) Tip) -> Bin 3 rlx (Bin 1 x Tip Tip) (Bin 1 rx Tip Tip)
(Bin rs rx rl@(Bin rls rlx rll rlr) rr@(Bin rrs _ _ _))
| rls < ratio*rrs -> Bin (1+rs) rx (Bin (1+rls) x Tip rl) rr
| otherwise -> Bin (1+rs) rlx (Bin (1+size rll) x Tip rll) (Bin (1+rrs+size rlr) rx rlr rr)
(Bin ls _ _ _) -> case r of
Tip -> Bin (1+ls) x l Tip
(Bin rs rx rl rr)
| rs > delta*ls -> case (rl, rr) of
(Bin rls rlx rll rlr, Bin rrs _ _ _)
| rls < ratio*rrs -> Bin (1+ls+rs) rx (Bin (1+ls+rls) x l rl) rr
| otherwise -> Bin (1+ls+rs) rlx (Bin (1+ls+size rll) x l rll) (Bin (1+rrs+size rlr) rx rlr rr)
(_, _) -> error "Failure in Data.Map.balanceR"
| otherwise -> Bin (1+ls+rs) x l r
{-# NOINLINE balanceR #-}