From 07771e0e1cc67858ee944f68d05c8fe8220f3531 Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Fri, 8 Aug 2025 13:39:50 +1000 Subject: [PATCH 01/20] make benchmarks work --- benchmark/{bench.hs => Main.hs} | 2 ++ statistics.cabal | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) rename benchmark/{bench.hs => Main.hs} (99%) diff --git a/benchmark/bench.hs b/benchmark/Main.hs similarity index 99% rename from benchmark/bench.hs rename to benchmark/Main.hs index 4c5b793..c305617 100644 --- a/benchmark/bench.hs +++ b/benchmark/Main.hs @@ -1,3 +1,5 @@ +module Main where + import Data.Complex import Statistics.Sample import Statistics.Transform diff --git a/statistics.cabal b/statistics.cabal index a466dd1..447f3d5 100644 --- a/statistics.cabal +++ b/statistics.cabal @@ -223,7 +223,7 @@ benchmark statistics-bench import: bench-stanza type: exitcode-stdio-1.0 hs-source-dirs: benchmark bench-time - main-is: bench.hs + main-is: Main.hs Other-modules: Bench build-depends: tasty-bench >= 0.3 From a72dd129a628cc427490fb84018071151fed528d Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Fri, 8 Aug 2025 12:44:10 +1000 Subject: [PATCH 02/20] Initial Fold conversions --- Statistics/Sample/Fold.hs | 512 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 512 insertions(+) create mode 100644 Statistics/Sample/Fold.hs diff --git a/Statistics/Sample/Fold.hs b/Statistics/Sample/Fold.hs new file mode 100644 index 0000000..c8f85a1 --- /dev/null +++ b/Statistics/Sample/Fold.hs @@ -0,0 +1,512 @@ +{-# LANGUAGE FlexibleContexts #-} +-- | +-- Module : Statistics.Sample +-- Copyright : (c) 2008 Don Stewart, 2009 Bryan O'Sullivan +-- License : BSD3 +-- +-- Maintainer : bos@serpentine.com +-- Stability : experimental +-- Portability : portable +-- +-- Commonly used sample statistics, also known as descriptive +-- statistics. 
+ +module Statistics.Sample.Fold + ( + -- * Types + Sample + , WeightedSample + -- * Descriptive functions + , range + + -- * Statistics of location + , expectation + , mean + , welfordMean + , meanWeighted + , harmonicMean + , geometricMean + + -- * Statistics of dispersion + -- $variance + + -- ** Functions over central moments + , centralMoment + , centralMoments + , skewness + , kurtosis + + -- ** Two-pass functions (numerically robust) + -- $robust + , variance + , varianceUnbiased + , meanVariance + , meanVarianceUnb + , stdDev + , varianceWeighted + , stdErrMean + + -- ** Single-pass functions (faster, less safe) + -- $cancellation + , fastVariance + , fastVarianceUnbiased + , fastStdDev + + -- * Joint distributions + , covariance + , correlation + , covariance2 + , correlation2 + , pair + -- * References + -- $references + ) where + +import Statistics.Function (square) +import Statistics.Sample.Internal (robustSumVar, sum) +import Statistics.Types.Internal (Sample,WeightedSample) +import qualified Data.Vector as V +import qualified Data.Vector.Generic as G +import qualified Data.Vector.Unboxed as U +import Numeric.Sum (kbn, Summation(zero,add), KBNSum) + +import qualified Control.Foldl as F + +-- Operator ^ will be overridden +import Prelude hiding ((^), sum) + +-- | /O(n)/ Range. The difference between the largest and smallest +-- elements of a sample. +range :: F.Fold Double Double +range = maximum' - minimum' +{-# INLINE range #-} + +maximum' :: F.Fold Double Double +maximum' = F.Fold max (-1/0 :: Double) id where +minimum' :: F.Fold Double Double +minimum' = F.Fold min ( 1/0 :: Double) id where + +kbnSum' :: F.Fold Double KBNSum +kbnSum' = F.Fold add zero id + +kbnSum :: F.Fold Double Double +kbnSum = fmap kbn kbnSum' + + +-- | Compute expectation of function over for sample. +expectation :: (a -> Double) -> F.Fold a Double +expectation f = F.premap f kbnSum / F.genericLength +{-# INLINE expectation #-} + +-- | Arithmetic mean. 
This uses Kahan-Babuška-Neumaier +-- summation, so is more accurate than 'welfordMean' unless the input +-- values are very large. This function is not subject to stream +-- fusion. +mean :: F.Fold Double Double +mean = kbnSum / F.genericLength + +-- | Arithmetic mean. This uses Welford's algorithm to provide +-- numerical stability, using a single pass over the sample data. +-- +-- Compared to 'mean', this loses a surprising amount of precision +-- unless the inputs are very large. +welfordMean :: F.Fold Double Double +welfordMean = F.Fold go (T 0 0) fini + where + fini (T a _) = a + go (T m n) x = T m' n' + where m' = m + (x - m) / fromIntegral n' + n' = n + 1 + + +-- | Arithmetic mean for weighted sample. It uses a single-pass +-- algorithm analogous to the one used by 'welfordMean'. +meanWeighted :: F.Fold (Double,Double) Double +meanWeighted = F.Fold go (V 0 0) fini + where + fini (V a _) = a + go (V m w) (x,xw) = V m' w' + where m' | w' == 0 = 0 + | otherwise = m + xw * (x - m) / w' + w' = w + xw +{-# INLINE meanWeighted #-} + +-- | Harmonic mean. This algorithm performs a single pass over +-- the sample. +harmonicMean :: F.Fold Double Double +harmonicMean = F.Fold go (T 0 0) fini + where + fini (T b a) = fromIntegral a / b + go (T x y) n = T (x + (1/n)) (y+1) +{-# INLINE harmonicMean #-} + +-- | Geometric mean of a sample containing no negative values. +geometricMean :: F.Fold Double Double +geometricMean = fmap exp (expectation log) +{-# INLINE geometricMean #-} + +-- | Compute the /k/th central moment of a sample. The central moment +-- is also known as the moment about the mean. +-- +-- This function performs two passes over the sample, so is not subject +-- to stream fusion. +-- +-- For samples containing many values very close to the mean, this +-- function is subject to inaccuracy due to catastrophic cancellation. 
+centralMoment :: Int + -> Double -- ^ mean + -> F.Fold Double Double +centralMoment a m + | a < 0 = error "Statistics.Sample.centralMoment: negative input" + | a == 0 = pure 1 + | a == 1 = pure 0 + | otherwise = expectation go + where + go x = (x-m) ^ a + -- m = mean xs +-- + +-- | Compute the /k/th and /j/th central moments of a sample. +-- +-- This function performs two passes over the sample, so is not subject +-- to stream fusion. +-- +-- For samples containing many values very close to the mean, this +-- function is subject to inaccuracy due to catastrophic cancellation. +centralMoments :: (G.Vector v Double) => Int -> Int -> v Double -> (Double, Double) +centralMoments a b xs + | a < 2 || b < 2 = (centralMoment a xs , centralMoment b xs) + | otherwise = fini . G.foldl' go (V 0 0) $ xs + where go (V i j) x = V (i + d^a) (j + d^b) + where d = x - m + fini (V i j) = (i / n , j / n) + m = mean xs + n = fromIntegral (G.length xs) +{-# SPECIALIZE + centralMoments :: Int -> Int -> U.Vector Double -> (Double, Double) #-} +{-# SPECIALIZE + centralMoments :: Int -> Int -> V.Vector Double -> (Double, Double) #-} + +-- | Compute the skewness of a sample. This is a measure of the +-- asymmetry of its distribution. +-- +-- A sample with negative skew is said to be /left-skewed/. Most of +-- its mass is on the right of the distribution, with the tail on the +-- left. +-- +-- > skewness $ U.to [1,100,101,102,103] +-- > ==> -1.497681449918257 +-- +-- A sample with positive skew is said to be /right-skewed/. +-- +-- > skewness $ U.to [1,2,3,4,100] +-- > ==> 1.4975367033335198 +-- +-- A sample's skewness is not defined if its 'variance' is zero. +-- +-- This function performs two passes over the sample, so is not subject +-- to stream fusion. +-- +-- For samples containing many values very close to the mean, this +-- function is subject to inaccuracy due to catastrophic cancellation. 
+skewness :: (G.Vector v Double) => v Double -> Double +skewness xs = c3 * c2 ** (-1.5) + where (c3 , c2) = centralMoments 3 2 xs +{-# SPECIALIZE skewness :: U.Vector Double -> Double #-} +{-# SPECIALIZE skewness :: V.Vector Double -> Double #-} + +-- | Compute the excess kurtosis of a sample. This is a measure of +-- the \"peakedness\" of its distribution. A high kurtosis indicates +-- that more of the sample's variance is due to infrequent severe +-- deviations, rather than more frequent modest deviations. +-- +-- A sample's excess kurtosis is not defined if its 'variance' is +-- zero. +-- +-- This function performs two passes over the sample, so is not subject +-- to stream fusion. +-- +-- For samples containing many values very close to the mean, this +-- function is subject to inaccuracy due to catastrophic cancellation. +kurtosis :: (G.Vector v Double) => v Double -> Double +kurtosis xs = c4 / (c2 * c2) - 3 + where (c4 , c2) = centralMoments 4 2 xs +{-# SPECIALIZE kurtosis :: U.Vector Double -> Double #-} +{-# SPECIALIZE kurtosis :: V.Vector Double -> Double #-} + +-- $variance +-- +-- The variance — and hence the standard deviation — of a +-- sample of fewer than two elements are both defined to be zero. + +-- $robust +-- +-- These functions use the compensated summation algorithm of Chan et +-- al. for numerical robustness, but require two passes over the +-- sample data as a result. +-- +-- Because of the need for two passes, these functions are /not/ +-- subject to stream fusion. + +data V = V {-# UNPACK #-} !Double {-# UNPACK #-} !Double + +-- | Maximum likelihood estimate of a sample's variance. Also known +-- as the population variance, where the denominator is /n/. 
+variance :: (G.Vector v Double) => v Double -> Double +variance samp + | n > 1 = robustSumVar (mean samp) samp / fromIntegral n + | otherwise = 0 + where + n = G.length samp +{-# SPECIALIZE variance :: U.Vector Double -> Double #-} +{-# SPECIALIZE variance :: V.Vector Double -> Double #-} + + +-- | Unbiased estimate of a sample's variance. Also known as the +-- sample variance, where the denominator is /n/-1. +varianceUnbiased :: (G.Vector v Double) => v Double -> Double +varianceUnbiased samp + | n > 1 = robustSumVar (mean samp) samp / fromIntegral (n-1) + | otherwise = 0 + where + n = G.length samp +{-# SPECIALIZE varianceUnbiased :: U.Vector Double -> Double #-} +{-# SPECIALIZE varianceUnbiased :: V.Vector Double -> Double #-} + +-- | Calculate mean and maximum likelihood estimate of variance. This +-- function should be used if both mean and variance are required +-- since it will calculate mean only once. +meanVariance :: (G.Vector v Double) => v Double -> (Double,Double) +meanVariance samp + | n > 1 = (m, robustSumVar m samp / fromIntegral n) + | otherwise = (m, 0) + where + n = G.length samp + m = mean samp +{-# SPECIALIZE meanVariance :: U.Vector Double -> (Double,Double) #-} +{-# SPECIALIZE meanVariance :: V.Vector Double -> (Double,Double) #-} + +-- | Calculate mean and unbiased estimate of variance. This +-- function should be used if both mean and variance are required +-- since it will calculate mean only once. +meanVarianceUnb :: (G.Vector v Double) => v Double -> (Double,Double) +meanVarianceUnb samp + | n > 1 = (m, robustSumVar m samp / fromIntegral (n-1)) + | otherwise = (m, 0) + where + n = G.length samp + m = mean samp +{-# SPECIALIZE meanVarianceUnb :: U.Vector Double -> (Double,Double) #-} +{-# SPECIALIZE meanVarianceUnb :: V.Vector Double -> (Double,Double) #-} + +-- | Standard deviation. This is simply the square root of the +-- unbiased estimate of the variance. +stdDev :: (G.Vector v Double) => v Double -> Double +stdDev = sqrt . 
varianceUnbiased +{-# SPECIALIZE stdDev :: U.Vector Double -> Double #-} +{-# SPECIALIZE stdDev :: V.Vector Double -> Double #-} + +-- | Standard error of the mean. This is the standard deviation +-- divided by the square root of the sample size. +stdErrMean :: (G.Vector v Double) => v Double -> Double +stdErrMean samp = stdDev samp / (sqrt . fromIntegral . G.length) samp +{-# SPECIALIZE stdErrMean :: U.Vector Double -> Double #-} +{-# SPECIALIZE stdErrMean :: V.Vector Double -> Double #-} + +robustSumVarWeighted :: (G.Vector v (Double,Double)) => v (Double,Double) -> V +robustSumVarWeighted samp = G.foldl' go (V 0 0) samp + where + go (V s w) (x,xw) = V (s + xw*d*d) (w + xw) + where d = x - m + m = meanWeighted samp +{-# INLINE robustSumVarWeighted #-} + +-- | Weighted variance. This is biased estimation. +varianceWeighted :: (G.Vector v (Double,Double)) => v (Double,Double) -> Double +varianceWeighted samp + | G.length samp > 1 = fini $ robustSumVarWeighted samp + | otherwise = 0 + where + fini (V s w) = s / w +{-# SPECIALIZE varianceWeighted :: U.Vector (Double,Double) -> Double #-} +{-# SPECIALIZE varianceWeighted :: V.Vector (Double,Double) -> Double #-} + +-- $cancellation +-- +-- The functions prefixed with the name @fast@ below perform a single +-- pass over the sample data using Knuth's algorithm. They usually +-- work well, but see below for caveats. These functions are subject +-- to array fusion. +-- +-- /Note/: in cases where most sample data is close to the sample's +-- mean, Knuth's algorithm gives inaccurate results due to +-- catastrophic cancellation. + +fastVar :: (G.Vector v Double) => v Double -> T1 +fastVar = G.foldl' go (T1 0 0 0) + where + go (T1 n m s) x = T1 n' m' s' + where n' = n + 1 + m' = m + d / fromIntegral n' + s' = s + d * (x - m') + d = x - m + +-- | Maximum likelihood estimate of a sample's variance. +fastVariance :: (G.Vector v Double) => v Double -> Double +fastVariance = fini . 
fastVar + where fini (T1 n _m s) + | n > 1 = s / fromIntegral n + | otherwise = 0 +{-# INLINE fastVariance #-} + +-- | Unbiased estimate of a sample's variance. +fastVarianceUnbiased :: (G.Vector v Double) => v Double -> Double +fastVarianceUnbiased = fini . fastVar + where fini (T1 n _m s) + | n > 1 = s / fromIntegral (n - 1) + | otherwise = 0 +{-# INLINE fastVarianceUnbiased #-} + +-- | Standard deviation. This is simply the square root of the +-- maximum likelihood estimate of the variance. +fastStdDev :: (G.Vector v Double) => v Double -> Double +fastStdDev = sqrt . fastVariance +{-# INLINE fastStdDev #-} + +-- | Covariance of sample of pairs. For empty sample it's set to +-- zero +covariance :: (G.Vector v (Double,Double)) + => v (Double,Double) + -> Double +covariance xy + | n == 0 = 0 + | otherwise = expectation (\(x,y) -> (x - muX)*(y - muY)) xy + where + n = G.length xy + muX = expectation fst xy + muY = expectation snd xy +{-# SPECIALIZE covariance :: U.Vector (Double,Double) -> Double #-} +{-# SPECIALIZE covariance :: V.Vector (Double,Double) -> Double #-} + +-- | Correlation coefficient for sample of pairs. Also known as +-- Pearson's correlation. For empty sample it's set to zero. +correlation :: (G.Vector v (Double,Double)) + => v (Double,Double) + -> Double +correlation xy + | n == 0 = 0 + | otherwise = cov / sqrt (varX * varY) + where + n = G.length xy + muX = expectation (\(x,_) -> x) xy + muY = expectation (\(_,y) -> y) xy + varX = expectation (\(x,_) -> square (x - muX)) xy + varY = expectation (\(_,y) -> square (y - muY)) xy + cov = expectation (\(x,y) -> (x - muX)*(y - muY)) xy +{-# SPECIALIZE correlation :: U.Vector (Double,Double) -> Double #-} +{-# SPECIALIZE correlation :: V.Vector (Double,Double) -> Double #-} + + +-- | Covariance of two samples. Both vectors must be of the same +-- length. 
If both are empty it's set to zero +covariance2 :: (G.Vector v Double) + => v Double + -> v Double + -> Double +covariance2 xs ys + | nx /= ny = error $ "Statistics.Sample.covariance2: both samples must have same length" + | nx == 0 = 0 + | otherwise = sum (G.zipWith (\x y -> (x - muX)*(y - muY)) xs ys) + / fromIntegral nx + where + nx = G.length xs + ny = G.length ys + muX = mean xs + muY = mean ys +{-# SPECIALIZE covariance2 :: U.Vector Double -> U.Vector Double -> Double #-} +{-# SPECIALIZE covariance2 :: V.Vector Double -> V.Vector Double -> Double #-} + +-- | Correlation coefficient for two samples. Both vector must have +-- same length Also known as Pearson's correlation. For empty sample +-- it's set to zero. +correlation2 :: (G.Vector v Double) + => v Double + -> v Double + -> Double +correlation2 xs ys + | nx /= ny = error $ "Statistics.Sample.correlation2: both samples must have same length" + | nx == 0 = 0 + | otherwise = cov / sqrt (varX * varY) + where + nx = G.length xs + ny = G.length ys + (muX,varX) = meanVariance xs + (muY,varY) = meanVariance ys + cov = sum (G.zipWith (\x y -> (x - muX)*(y - muY)) xs ys) + / fromIntegral nx +{-# SPECIALIZE correlation2 :: U.Vector Double -> U.Vector Double -> Double #-} +{-# SPECIALIZE correlation2 :: V.Vector Double -> V.Vector Double -> Double #-} + + +-- | Pair two samples. It's like 'G.zip' but requires that both +-- samples have equal size. +pair :: (G.Vector v a, G.Vector v b, G.Vector v (a,b)) => v a -> v b -> v (a,b) +pair va vb + | G.length va == G.length vb = G.zip va vb + | otherwise = error "Statistics.Sample.pair: vector must have same length" +{-# INLINE pair #-} + +------------------------------------------------------------------------ +-- Helper code. Monomorphic unpacked accumulators. + +-- (^) operator from Prelude is just slow. +(^) :: Double -> Int -> Double +x ^ 1 = x +x ^ n = x * (x ^ (n-1)) +{-# INLINE (^) #-} + +-- don't support polymorphism, as we can't get unboxed returns if we use it. 
+data T = T {-# UNPACK #-}!Double {-# UNPACK #-}!Int + +data T1 = T1 {-# UNPACK #-}!Int {-# UNPACK #-}!Double {-# UNPACK #-}!Double + +{- + +Consider this core: + +with data T a = T !a !Int + +$wfold :: Double# + -> Int# + -> Int# + -> (# Double, Int# #) + +and without, + +$wfold :: Double# + -> Int# + -> Int# + -> (# Double#, Int# #) + +yielding to boxed returns and heap checks. + +-} + +-- $references +-- +-- * Chan, T. F.; Golub, G.H.; LeVeque, R.J. (1979) Updating formulae +-- and a pairwise algorithm for computing sample +-- variances. Technical Report STAN-CS-79-773, Department of +-- Computer Science, Stanford +-- University. +-- +-- * Knuth, D.E. (1998) The art of computer programming, volume 2: +-- seminumerical algorithms, 3rd ed., p. 232. +-- +-- * Welford, B.P. (1962) Note on a method for calculating corrected +-- sums of squares and products. /Technometrics/ +-- 4(3):419–420. +-- +-- * West, D.H.D. (1979) Updating mean and variance estimates: an +-- improved method. /Communications of the ACM/ +-- 22(9):532–535. 
From 490d1d8a994a6f61385d145d7e6d3b7540961d4b Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Fri, 8 Aug 2025 12:44:26 +1000 Subject: [PATCH 03/20] import foldl package --- statistics.cabal | 2 ++ 1 file changed, 2 insertions(+) diff --git a/statistics.cabal b/statistics.cabal index 447f3d5..2444791 100644 --- a/statistics.cabal +++ b/statistics.cabal @@ -101,6 +101,7 @@ library Statistics.Resampling Statistics.Resampling.Bootstrap Statistics.Sample + Statistics.Sample.Fold Statistics.Sample.Internal Statistics.Sample.Histogram Statistics.Sample.KernelDensity @@ -142,6 +143,7 @@ library , vector-th-unbox , vector-binary-instances >= 0.2.1 , data-default-class >= 0.1.2 + , foldl -- Older GHC if impl(ghc < 7.6) From 450e87722ab204e0c0a3b30ff78445a1dd7dd341 Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Fri, 8 Aug 2025 15:15:33 +1000 Subject: [PATCH 04/20] WIP --- Statistics/Sample.hs | 90 +++++--------- Statistics/Sample/Fold.hs | 238 +++++++++++++++++++------------------- 2 files changed, 148 insertions(+), 180 deletions(-) diff --git a/Statistics/Sample.hs b/Statistics/Sample.hs index f47b9ea..bd04ef4 100644 --- a/Statistics/Sample.hs +++ b/Statistics/Sample.hs @@ -62,29 +62,37 @@ module Statistics.Sample -- $references ) where -import Statistics.Function (minMax,square) +import Statistics.Function (square) import Statistics.Sample.Internal (robustSumVar, sum) import Statistics.Types.Internal (Sample,WeightedSample) import qualified Data.Vector as V import qualified Data.Vector.Generic as G import qualified Data.Vector.Unboxed as U -import Numeric.Sum (kbn, Summation(zero,add)) +-- import Numeric.Sum (kbn, Summation(zero,add)) -- Operator ^ will be overridden import Prelude hiding ((^), sum) +import qualified Control.Foldl as F +import qualified Statistics.Sample.Fold as FF + +ffold :: G.Vector v a => F.Fold a b -> v a -> b +ffold f v = F.purely_ (\s i -> G.foldl s i v) f +{-# INLINE ffold #-} +{-# SPECIALIZE ffold :: F.Fold Double b -> U.Vector Double -> b #-} 
+{-# SPECIALIZE ffold :: F.Fold Double b -> V.Vector Double -> b #-} + + -- | /O(n)/ Range. The difference between the largest and smallest -- elements of a sample. range :: (G.Vector v Double) => v Double -> Double -range s = hi - lo - where (lo , hi) = minMax s +range = ffold FF.range {-# INLINE range #-} -- | /O(n)/ Compute expectation of function over for sample. This is -- simply @mean . map f@ but won't create intermediate vector. expectation :: (G.Vector v a) => (a -> Double) -> v a -> Double -expectation f xs = kbn (G.foldl' (\s -> add s . f) zero xs) - / fromIntegral (G.length xs) +expectation f xs = ffold (FF.expectation f) xs {-# INLINE expectation #-} -- | /O(n)/ Arithmetic mean. This uses Kahan-Babuška-Neumaier @@ -92,7 +100,7 @@ expectation f xs = kbn (G.foldl' (\s -> add s . f) zero xs) -- values are very large. This function is not subject to stream -- fusion. mean :: (G.Vector v Double) => v Double -> Double -mean xs = sum xs / fromIntegral (G.length xs) +mean = ffold FF.mean {-# SPECIALIZE mean :: U.Vector Double -> Double #-} {-# SPECIALIZE mean :: V.Vector Double -> Double #-} @@ -102,39 +110,25 @@ mean xs = sum xs / fromIntegral (G.length xs) -- Compared to 'mean', this loses a surprising amount of precision -- unless the inputs are very large. welfordMean :: (G.Vector v Double) => v Double -> Double -welfordMean = fini . G.foldl' go (T 0 0) - where - fini (T a _) = a - go (T m n) x = T m' n' - where m' = m + (x - m) / fromIntegral n' - n' = n + 1 +welfordMean = ffold FF.welfordMean {-# SPECIALIZE welfordMean :: U.Vector Double -> Double #-} {-# SPECIALIZE welfordMean :: V.Vector Double -> Double #-} -- | /O(n)/ Arithmetic mean for weighted sample. It uses a single-pass -- algorithm analogous to the one used by 'welfordMean'. meanWeighted :: (G.Vector v (Double,Double)) => v (Double,Double) -> Double -meanWeighted = fini . 
G.foldl' go (V 0 0) - where - fini (V a _) = a - go (V m w) (x,xw) = V m' w' - where m' | w' == 0 = 0 - | otherwise = m + xw * (x - m) / w' - w' = w + xw +meanWeighted = ffold FF.meanWeighted {-# INLINE meanWeighted #-} -- | /O(n)/ Harmonic mean. This algorithm performs a single pass over -- the sample. harmonicMean :: (G.Vector v Double) => v Double -> Double -harmonicMean = fini . G.foldl' go (T 0 0) - where - fini (T b a) = fromIntegral a / b - go (T x y) n = T (x + (1/n)) (y+1) +harmonicMean = ffold FF.harmonicMean {-# INLINE harmonicMean #-} -- | /O(n)/ Geometric mean of a sample containing no negative values. geometricMean :: (G.Vector v Double) => v Double -> Double -geometricMean = exp . expectation log +geometricMean = ffold FF.geometricMean {-# INLINE geometricMean #-} -- | Compute the /k/th central moment of a sample. The central moment @@ -146,14 +140,9 @@ geometricMean = exp . expectation log -- For samples containing many values very close to the mean, this -- function is subject to inaccuracy due to catastrophic cancellation. centralMoment :: (G.Vector v Double) => Int -> v Double -> Double -centralMoment a xs - | a < 0 = error "Statistics.Sample.centralMoment: negative input" - | a == 0 = 1 - | a == 1 = 0 - | otherwise = expectation go xs +centralMoment a xs = ffold (FF.centralMoment a m) xs where - go x = (x-m) ^ a - m = mean xs + m = mean xs {-# SPECIALIZE centralMoment :: Int -> U.Vector Double -> Double #-} {-# SPECIALIZE centralMoment :: Int -> V.Vector Double -> Double #-} @@ -165,18 +154,9 @@ centralMoment a xs -- For samples containing many values very close to the mean, this -- function is subject to inaccuracy due to catastrophic cancellation. centralMoments :: (G.Vector v Double) => Int -> Int -> v Double -> (Double, Double) -centralMoments a b xs - | a < 2 || b < 2 = (centralMoment a xs , centralMoment b xs) - | otherwise = fini . 
G.foldl' go (V 0 0) $ xs - where go (V i j) x = V (i + d^a) (j + d^b) - where d = x - m - fini (V i j) = (i / n , j / n) - m = mean xs - n = fromIntegral (G.length xs) -{-# SPECIALIZE - centralMoments :: Int -> Int -> U.Vector Double -> (Double, Double) #-} -{-# SPECIALIZE - centralMoments :: Int -> Int -> V.Vector Double -> (Double, Double) #-} +centralMoments a b xs = ffold (FF.centralMoments a b m) xs + where m = mean xs + -- | Compute the skewness of a sample. This is a measure of the -- asymmetry of its distribution. @@ -201,8 +181,8 @@ centralMoments a b xs -- For samples containing many values very close to the mean, this -- function is subject to inaccuracy due to catastrophic cancellation. skewness :: (G.Vector v Double) => v Double -> Double -skewness xs = c3 * c2 ** (-1.5) - where (c3 , c2) = centralMoments 3 2 xs +skewness xs = ffold (FF.skewness m) xs + where m = mean xs {-# SPECIALIZE skewness :: U.Vector Double -> Double #-} {-# SPECIALIZE skewness :: V.Vector Double -> Double #-} @@ -220,8 +200,8 @@ skewness xs = c3 * c2 ** (-1.5) -- For samples containing many values very close to the mean, this -- function is subject to inaccuracy due to catastrophic cancellation. kurtosis :: (G.Vector v Double) => v Double -> Double -kurtosis xs = c4 / (c2 * c2) - 3 - where (c4 , c2) = centralMoments 4 2 xs +kurtosis xs = ffold (FF.kurtosis m) xs + where m = mean xs {-# SPECIALIZE kurtosis :: U.Vector Double -> Double #-} {-# SPECIALIZE kurtosis :: V.Vector Double -> Double #-} @@ -244,11 +224,8 @@ data V = V {-# UNPACK #-} !Double {-# UNPACK #-} !Double -- | Maximum likelihood estimate of a sample's variance. Also known -- as the population variance, where the denominator is /n/. 
variance :: (G.Vector v Double) => v Double -> Double -variance samp - | n > 1 = robustSumVar (mean samp) samp / fromIntegral n - | otherwise = 0 - where - n = G.length samp +variance samp = ffold (FF.variance m) samp + where m = mean samp {-# SPECIALIZE variance :: U.Vector Double -> Double #-} {-# SPECIALIZE variance :: V.Vector Double -> Double #-} @@ -256,11 +233,8 @@ variance samp -- | Unbiased estimate of a sample's variance. Also known as the -- sample variance, where the denominator is /n/-1. varianceUnbiased :: (G.Vector v Double) => v Double -> Double -varianceUnbiased samp - | n > 1 = robustSumVar (mean samp) samp / fromIntegral (n-1) - | otherwise = 0 - where - n = G.length samp +varianceUnbiased samp = ffold (FF.varianceUnbiased m) samp + where m = mean samp {-# SPECIALIZE varianceUnbiased :: U.Vector Double -> Double #-} {-# SPECIALIZE varianceUnbiased :: V.Vector Double -> Double #-} diff --git a/Statistics/Sample/Fold.hs b/Statistics/Sample/Fold.hs index c8f85a1..d036289 100644 --- a/Statistics/Sample/Fold.hs +++ b/Statistics/Sample/Fold.hs @@ -11,63 +11,60 @@ -- Commonly used sample statistics, also known as descriptive -- statistics. 
-module Statistics.Sample.Fold - ( - -- * Types - Sample - , WeightedSample - -- * Descriptive functions - , range - - -- * Statistics of location - , expectation - , mean - , welfordMean - , meanWeighted - , harmonicMean - , geometricMean - - -- * Statistics of dispersion - -- $variance - - -- ** Functions over central moments - , centralMoment - , centralMoments - , skewness - , kurtosis - - -- ** Two-pass functions (numerically robust) - -- $robust - , variance - , varianceUnbiased - , meanVariance - , meanVarianceUnb - , stdDev - , varianceWeighted - , stdErrMean - - -- ** Single-pass functions (faster, less safe) - -- $cancellation - , fastVariance - , fastVarianceUnbiased - , fastStdDev - - -- * Joint distributions - , covariance - , correlation - , covariance2 - , correlation2 - , pair - -- * References - -- $references - ) where +module Statistics.Sample.Fold where + -- ( + -- -- * Types + -- Sample + -- , WeightedSample + -- -- * Descriptive functions + -- , range + + -- -- * Statistics of location + -- , expectation + -- , mean + -- , welfordMean + -- , meanWeighted + -- , harmonicMean + -- , geometricMean + + -- -- * Statistics of dispersion + -- -- $variance + + -- -- ** Functions over central moments + -- , centralMoment + -- , centralMoments + -- , skewness + -- , kurtosis + + -- -- ** Two-pass functions (numerically robust) + -- -- $robust + -- , variance + -- , varianceUnbiased + -- -- , meanVariance + -- -- , meanVarianceUnb + -- , stdDev + -- , varianceWeighted + -- , stdErrMean + + -- -- ** Single-pass functions (faster, less safe) + -- -- $cancellation + -- , fastVariance + -- , fastVarianceUnbiased + -- , fastStdDev + + -- -- * Joint distributions + -- , covariance + -- , correlation + -- , covariance2 + -- , correlation2 + -- , pair + -- -- * References + -- -- $references + -- ) where import Statistics.Function (square) -import Statistics.Sample.Internal (robustSumVar, sum) -import Statistics.Types.Internal (Sample,WeightedSample) -import 
qualified Data.Vector as V -import qualified Data.Vector.Generic as G -import qualified Data.Vector.Unboxed as U +-- import Statistics.Sample.Internal (robustSumVar, sum) +-- import Statistics.Types.Internal (Sample,WeightedSample) import Numeric.Sum (kbn, Summation(zero,add), KBNSum) import qualified Control.Foldl as F @@ -75,6 +72,17 @@ import qualified Control.Foldl as F -- Operator ^ will be overridden import Prelude hiding ((^), sum) + +data V = V {-# UNPACK #-} !Double {-# UNPACK #-} !Double +data T = T {-# UNPACK #-}!Double {-# UNPACK #-}!Int +data T1 = T1 {-# UNPACK #-}!Int {-# UNPACK #-}!Double {-# UNPACK #-}!Double + +-- (^) operator from Prelude is just slow. +(^) :: Double -> Int -> Double +x ^ 1 = x +x ^ n = x * (x ^ (n-1)) +{-# INLINE (^) #-} + -- | /O(n)/ Range. The difference between the largest and smallest -- elements of a sample. range :: F.Fold Double Double @@ -83,14 +91,19 @@ range = maximum' - minimum' maximum' :: F.Fold Double Double maximum' = F.Fold max (-1/0 :: Double) id where +{-# INLINE minimum' #-} + minimum' :: F.Fold Double Double minimum' = F.Fold min ( 1/0 :: Double) id where +{-# INLINE maximum' #-} kbnSum' :: F.Fold Double KBNSum kbnSum' = F.Fold add zero id +{-# INLINE kbnSum' #-} kbnSum :: F.Fold Double Double kbnSum = fmap kbn kbnSum' +{-# INLINE kbnSum #-} -- | Compute expectation of function over for sample. @@ -145,6 +158,7 @@ geometricMean :: F.Fold Double Double geometricMean = fmap exp (expectation log) {-# INLINE geometricMean #-} + -- | Compute the /k/th central moment of a sample. The central moment -- is also known as the moment about the mean. -- @@ -173,20 +187,14 @@ centralMoment a m -- -- For samples containing many values very close to the mean, this -- function is subject to inaccuracy due to catastrophic cancellation. -centralMoments :: (G.Vector v Double) => Int -> Int -> v Double -> (Double, Double) -centralMoments a b xs - | a < 2 || b < 2 = (centralMoment a xs , centralMoment b xs) - | otherwise = fini . 
G.foldl' go (V 0 0) $ xs - where go (V i j) x = V (i + d^a) (j + d^b) +centralMoments :: Int -> Int -> Double -> F.Fold Double (Double, Double) +centralMoments a b m + | a < 2 || b < 2 = liftA2 (,) (centralMoment a m) (centralMoment b m) + | otherwise = F.Fold go (T1 0 0 0) fini + where go (T1 n i j) x = T1 (n+1) (i + d^a) (j + d^b) where d = x - m - fini (V i j) = (i / n , j / n) - m = mean xs - n = fromIntegral (G.length xs) -{-# SPECIALIZE - centralMoments :: Int -> Int -> U.Vector Double -> (Double, Double) #-} -{-# SPECIALIZE - centralMoments :: Int -> Int -> V.Vector Double -> (Double, Double) #-} - + fini (T1 n i j) = (i / n' , j / n') + where n' = fromIntegral n -- | Compute the skewness of a sample. This is a measure of the -- asymmetry of its distribution. -- @@ -209,11 +217,9 @@ centralMoments a b xs -- -- For samples containing many values very close to the mean, this -- function is subject to inaccuracy due to catastrophic cancellation. -skewness :: (G.Vector v Double) => v Double -> Double -skewness xs = c3 * c2 ** (-1.5) - where (c3 , c2) = centralMoments 3 2 xs -{-# SPECIALIZE skewness :: U.Vector Double -> Double #-} -{-# SPECIALIZE skewness :: V.Vector Double -> Double #-} +skewness :: Double -> F.Fold Double Double +skewness m = (\(c3, c2) -> c3 * c2 ** (-1.5)) <$> centralMoments 3 2 m + -- | Compute the excess kurtosis of a sample. This is a measure of -- the \"peakedness\" of its distribution. A high kurtosis indicates @@ -228,11 +234,8 @@ skewness xs = c3 * c2 ** (-1.5) -- -- For samples containing many values very close to the mean, this -- function is subject to inaccuracy due to catastrophic cancellation. 
-kurtosis :: (G.Vector v Double) => v Double -> Double -kurtosis xs = c4 / (c2 * c2) - 3 - where (c4 , c2) = centralMoments 4 2 xs -{-# SPECIALIZE kurtosis :: U.Vector Double -> Double #-} -{-# SPECIALIZE kurtosis :: V.Vector Double -> Double #-} +kurtosis :: Double -> F.Fold Double Double +kurtosis m = (\(c4, c2) -> c4 / (c2 * c2) - 3) <$> centralMoments 4 2 m -- $variance -- @@ -248,70 +251,60 @@ kurtosis xs = c4 / (c2 * c2) - 3 -- Because of the need for two passes, these functions are /not/ -- subject to stream fusion. -data V = V {-# UNPACK #-} !Double {-# UNPACK #-} !Double - -- | Maximum likelihood estimate of a sample's variance. Also known -- as the population variance, where the denominator is /n/. -variance :: (G.Vector v Double) => v Double -> Double -variance samp - | n > 1 = robustSumVar (mean samp) samp / fromIntegral n - | otherwise = 0 - where - n = G.length samp -{-# SPECIALIZE variance :: U.Vector Double -> Double #-} -{-# SPECIALIZE variance :: V.Vector Double -> Double #-} +variance :: Double -> F.Fold Double Double +variance m = + liftA2 (\s n -> if n > 1 then s / n else 0) + (robustSumVar m) F.genericLength + +robustSumVar :: Double -> F.Fold Double Double +robustSumVar m = F.premap (square . subtract m) kbnSum + -- | Unbiased estimate of a sample's variance. Also known as the -- sample variance, where the denominator is /n/-1. -varianceUnbiased :: (G.Vector v Double) => v Double -> Double -varianceUnbiased samp - | n > 1 = robustSumVar (mean samp) samp / fromIntegral (n-1) - | otherwise = 0 - where - n = G.length samp -{-# SPECIALIZE varianceUnbiased :: U.Vector Double -> Double #-} -{-# SPECIALIZE varianceUnbiased :: V.Vector Double -> Double #-} +varianceUnbiased :: Double -> F.Fold Double Double +varianceUnbiased m = + liftA2 (\s n -> if n > 1 then s / (n-1) else 0) + (robustSumVar m) F.genericLength +{- -- | Calculate mean and maximum likelihood estimate of variance. 
This -- function should be used if both mean and variance are required -- since it will calculate mean only once. -meanVariance :: (G.Vector v Double) => v Double -> (Double,Double) -meanVariance samp - | n > 1 = (m, robustSumVar m samp / fromIntegral n) - | otherwise = (m, 0) - where - n = G.length samp - m = mean samp -{-# SPECIALIZE meanVariance :: U.Vector Double -> (Double,Double) #-} -{-# SPECIALIZE meanVariance :: V.Vector Double -> (Double,Double) #-} +-- meanVariance :: Double -> F.Fold Double (Double,Double) +-- meanVariance m +-- | n > 1 = (m, robustSumVar m samp / fromIntegral n) +-- | otherwise = (m, 0) +-- where +-- n = G.length samp +-- {-# SPECIALIZE meanVariance :: U.Vector Double -> (Double,Double) #-} +-- {-# SPECIALIZE meanVariance :: V.Vector Double -> (Double,Double) #-} -- | Calculate mean and unbiased estimate of variance. This -- function should be used if both mean and variance are required -- since it will calculate mean only once. -meanVarianceUnb :: (G.Vector v Double) => v Double -> (Double,Double) -meanVarianceUnb samp - | n > 1 = (m, robustSumVar m samp / fromIntegral (n-1)) - | otherwise = (m, 0) - where - n = G.length samp - m = mean samp -{-# SPECIALIZE meanVarianceUnb :: U.Vector Double -> (Double,Double) #-} -{-# SPECIALIZE meanVarianceUnb :: V.Vector Double -> (Double,Double) #-} +-- meanVarianceUnb :: (G.Vector v Double) => v Double -> (Double,Double) +-- meanVarianceUnb samp +-- | n > 1 = (m, robustSumVar m samp / fromIntegral (n-1)) +-- | otherwise = (m, 0) +-- where +-- n = G.length samp +-- m = mean samp +-- {-# SPECIALIZE meanVarianceUnb :: U.Vector Double -> (Double,Double) #-} +-- {-# SPECIALIZE meanVarianceUnb :: V.Vector Double -> (Double,Double) #-} -- | Standard deviation. This is simply the square root of the -- unbiased estimate of the variance. -stdDev :: (G.Vector v Double) => v Double -> Double -stdDev = sqrt . 
varianceUnbiased -{-# SPECIALIZE stdDev :: U.Vector Double -> Double #-} -{-# SPECIALIZE stdDev :: V.Vector Double -> Double #-} +stdDev :: Double -> F.Fold Double Double +stdDev m = fmap sqrt $ varianceUnbiased m -- | Standard error of the mean. This is the standard deviation -- divided by the square root of the sample size. -stdErrMean :: (G.Vector v Double) => v Double -> Double -stdErrMean samp = stdDev samp / (sqrt . fromIntegral . G.length) samp -{-# SPECIALIZE stdErrMean :: U.Vector Double -> Double #-} -{-# SPECIALIZE stdErrMean :: V.Vector Double -> Double #-} +stdErrMean :: Double -> F.Fold Double Double +stdErrMean m = stdDev m / fmap sqrt F.genericLength robustSumVarWeighted :: (G.Vector v (Double,Double)) => v (Double,Double) -> V robustSumVarWeighted samp = G.foldl' go (V 0 0) samp @@ -510,3 +503,4 @@ yielding to boxed returns and heap checks. -- * West, D.H.D. (1979) Updating mean and variance estimates: an -- improved method. /Communications of the ACM/ -- 22(9):532–535. +-} \ No newline at end of file From e08a275905e917fc06ca93a91cab99de59dfda63 Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Fri, 8 Aug 2025 15:33:01 +1000 Subject: [PATCH 05/20] WIP --- Statistics/Sample/Fold.hs | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/Statistics/Sample/Fold.hs b/Statistics/Sample/Fold.hs index d036289..30e32ee 100644 --- a/Statistics/Sample/Fold.hs +++ b/Statistics/Sample/Fold.hs @@ -117,6 +117,8 @@ expectation f = F.premap f kbnSum / F.genericLength -- fusion. mean :: F.Fold Double Double mean = kbnSum / F.genericLength +{-# INLINE mean #-} + -- | Arithmetic mean. This uses Welford's algorithm to provide -- numerical stability, using a single pass over the sample data. @@ -178,7 +180,8 @@ centralMoment a m where go x = (x-m) ^ a -- m = mean xs --- +{-# INLINE centralMoment #-} + -- | Compute the /k/th and /j/th central moments of a sample. 
-- @@ -195,6 +198,9 @@ centralMoments a b m where d = x - m fini (T1 n i j) = (i / n' , j / n') where n' = fromIntegral n +{-# INLINE centralMoments #-} + + -- | Compute the skewness of a sample. This is a measure of the -- asymmetry of its distribution. -- @@ -219,6 +225,7 @@ centralMoments a b m -- function is subject to inaccuracy due to catastrophic cancellation. skewness :: Double -> F.Fold Double Double skewness m = (\(c3, c2) -> c3 * c2 ** (-1.5)) <$> centralMoments 3 2 m +{-# INLINE skewness #-} -- | Compute the excess kurtosis of a sample. This is a measure of @@ -236,6 +243,8 @@ skewness m = (\(c3, c2) -> c3 * c2 ** (-1.5)) <$> centralMoments 3 2 m -- function is subject to inaccuracy due to catastrophic cancellation. kurtosis :: Double -> F.Fold Double Double kurtosis m = (\(c4, c2) -> c4 / (c2 * c2) - 3) <$> centralMoments 4 2 m +{-# INLINE kurtosis #-} + -- $variance -- @@ -257,11 +266,15 @@ variance :: Double -> F.Fold Double Double variance m = liftA2 (\s n -> if n > 1 then s / n else 0) (robustSumVar m) F.genericLength +{-# INLINE variance #-} + robustSumVar :: Double -> F.Fold Double Double robustSumVar m = F.premap (square . subtract m) kbnSum +{-# INLINE robustSumVar #-} + -- | Unbiased estimate of a sample's variance. Also known as the -- sample variance, where the denominator is /n/-1. @@ -269,6 +282,8 @@ varianceUnbiased :: Double -> F.Fold Double Double varianceUnbiased m = liftA2 (\s n -> if n > 1 then s / (n-1) else 0) (robustSumVar m) F.genericLength +{-# INLINE varianceUnbiased #-} + {- -- | Calculate mean and maximum likelihood estimate of variance. 
This From 96658679e6bb805054cb06fdb763d8d1bd502795 Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Fri, 8 Aug 2025 15:45:18 +1000 Subject: [PATCH 06/20] F.genericLength -> doubleLength --- Statistics/Sample/Fold.hs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/Statistics/Sample/Fold.hs b/Statistics/Sample/Fold.hs index 30e32ee..d8765f5 100644 --- a/Statistics/Sample/Fold.hs +++ b/Statistics/Sample/Fold.hs @@ -108,7 +108,7 @@ kbnSum = fmap kbn kbnSum' -- | Compute expectation of function over for sample. expectation :: (a -> Double) -> F.Fold a Double -expectation f = F.premap f kbnSum / F.genericLength +expectation f = F.premap f kbnSum / doubleLength {-# INLINE expectation #-} -- | Arithmetic mean. This uses Kahan-Babuška-Neumaier @@ -116,9 +116,12 @@ expectation f = F.premap f kbnSum / F.genericLength -- values are very large. This function is not subject to stream -- fusion. mean :: F.Fold Double Double -mean = kbnSum / F.genericLength +mean = kbnSum / doubleLength {-# INLINE mean #-} +doubleLength :: F.Fold a Double +doubleLength = F.Fold (\n _ -> n+1) (0::Int) fromIntegral + -- | Arithmetic mean. This uses Welford's algorithm to provide -- numerical stability, using a single pass over the sample data. @@ -265,7 +268,7 @@ kurtosis m = (\(c4, c2) -> c4 / (c2 * c2) - 3) <$> centralMoments 4 2 m variance :: Double -> F.Fold Double Double variance m = liftA2 (\s n -> if n > 1 then s / n else 0) - (robustSumVar m) F.genericLength + (robustSumVar m) doubleLength {-# INLINE variance #-} @@ -281,7 +284,7 @@ robustSumVar m = F.premap (square . subtract m) kbnSum varianceUnbiased :: Double -> F.Fold Double Double varianceUnbiased m = liftA2 (\s n -> if n > 1 then s / (n-1) else 0) - (robustSumVar m) F.genericLength + (robustSumVar m) doubleLength {-# INLINE varianceUnbiased #-} @@ -319,7 +322,7 @@ stdDev m = fmap sqrt $ varianceUnbiased m -- | Standard error of the mean. 
This is the standard deviation -- divided by the square root of the sample size. stdErrMean :: Double -> F.Fold Double Double -stdErrMean m = stdDev m / fmap sqrt F.genericLength +stdErrMean m = stdDev m / fmap sqrt doubleLength robustSumVarWeighted :: (G.Vector v (Double,Double)) => v (Double,Double) -> V robustSumVarWeighted samp = G.foldl' go (V 0 0) samp From c75af106e367eead3ea6a2230af9a3638a3c6691 Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Fri, 8 Aug 2025 22:44:38 +1000 Subject: [PATCH 07/20] Convert all Statistic.Sample functions to use folds --- Statistics/Sample.hs | 129 +++++++-------- Statistics/Sample/Fold.hs | 320 ++++++++++++++++---------------------- 2 files changed, 187 insertions(+), 262 deletions(-) diff --git a/Statistics/Sample.hs b/Statistics/Sample.hs index bd04ef4..490e2c5 100644 --- a/Statistics/Sample.hs +++ b/Statistics/Sample.hs @@ -62,13 +62,12 @@ module Statistics.Sample -- $references ) where -import Statistics.Function (square) -import Statistics.Sample.Internal (robustSumVar, sum) + + import Statistics.Types.Internal (Sample,WeightedSample) import qualified Data.Vector as V import qualified Data.Vector.Generic as G import qualified Data.Vector.Unboxed as U --- import Numeric.Sum (kbn, Summation(zero,add)) -- Operator ^ will be overridden import Prelude hiding ((^), sum) @@ -82,6 +81,24 @@ ffold f v = F.purely_ (\s i -> G.foldl s i v) f {-# SPECIALIZE ffold :: F.Fold Double b -> U.Vector Double -> b #-} {-# SPECIALIZE ffold :: F.Fold Double b -> V.Vector Double -> b #-} +data P a = P !a !Int + +pfold :: (G.Vector v a, G.Vector v b) => String -> F.Fold (a,b) c -> v a -> v b -> c +pfold err fld va vb + | la /= lb = error err + | otherwise = ffold foldPair va + where la = G.length va + lb = G.length vb + -- Pattern match here so we don't end up forcing arguments + -- to the fold before we've checked the length + foldPair = case fld of + (F.Fold step ini end) -> F.Fold step' (P ini 0) end' where + end' (P a _) = end a + step' (P x n) 
a = P (step x (a, G.unsafeIndex vb n)) (n+1) +{-# INLINE pfold #-} +{-# SPECIALIZE pfold :: String -> F.Fold (Double, Double) b -> U.Vector Double -> U.Vector Double -> b #-} +{-# SPECIALIZE pfold :: String -> F.Fold (Double, Double) b -> V.Vector Double -> V.Vector Double -> b #-} + -- | /O(n)/ Range. The difference between the largest and smallest -- elements of a sample. @@ -219,8 +236,6 @@ kurtosis xs = ffold (FF.kurtosis m) xs -- Because of the need for two passes, these functions are /not/ -- subject to stream fusion. -data V = V {-# UNPACK #-} !Double {-# UNPACK #-} !Double - -- | Maximum likelihood estimate of a sample's variance. Also known -- as the population variance, where the denominator is /n/. variance :: (G.Vector v Double) => v Double -> Double @@ -243,7 +258,7 @@ varianceUnbiased samp = ffold (FF.varianceUnbiased m) samp -- since it will calculate mean only once. meanVariance :: (G.Vector v Double) => v Double -> (Double,Double) meanVariance samp - | n > 1 = (m, robustSumVar m samp / fromIntegral n) + | n > 1 = (m, ffold (FF.robustSumVar m) samp / fromIntegral n) | otherwise = (m, 0) where n = G.length samp @@ -256,7 +271,7 @@ meanVariance samp -- since it will calculate mean only once. meanVarianceUnb :: (G.Vector v Double) => v Double -> (Double,Double) meanVarianceUnb samp - | n > 1 = (m, robustSumVar m samp / fromIntegral (n-1)) + | n > 1 = (m, ffold (FF.robustSumVar m) samp / fromIntegral (n-1)) | otherwise = (m, 0) where n = G.length samp @@ -267,23 +282,23 @@ meanVarianceUnb samp -- | Standard deviation. This is simply the square root of the -- unbiased estimate of the variance. stdDev :: (G.Vector v Double) => v Double -> Double -stdDev = sqrt . varianceUnbiased +stdDev samp = ffold (FF.stdDev m) samp + where m = mean samp {-# SPECIALIZE stdDev :: U.Vector Double -> Double #-} {-# SPECIALIZE stdDev :: V.Vector Double -> Double #-} -- | Standard error of the mean. 
This is the standard deviation -- divided by the square root of the sample size. stdErrMean :: (G.Vector v Double) => v Double -> Double -stdErrMean samp = stdDev samp / (sqrt . fromIntegral . G.length) samp +stdErrMean samp = ffold (FF.stdErrMean m) samp + where m = mean samp {-# SPECIALIZE stdErrMean :: U.Vector Double -> Double #-} {-# SPECIALIZE stdErrMean :: V.Vector Double -> Double #-} -robustSumVarWeighted :: (G.Vector v (Double,Double)) => v (Double,Double) -> V -robustSumVarWeighted samp = G.foldl' go (V 0 0) samp - where - go (V s w) (x,xw) = V (s + xw*d*d) (w + xw) - where d = x - m - m = meanWeighted samp +robustSumVarWeighted :: (G.Vector v (Double,Double)) => v (Double,Double) -> FF.V +robustSumVarWeighted samp = ffold (fmap fini $ FF.robustSumVarWeighted m) samp + where m = meanWeighted samp + fini (FF.V a b) = FF.V a b {-# INLINE robustSumVarWeighted #-} -- | Weighted variance. This is biased estimation. @@ -292,7 +307,7 @@ varianceWeighted samp | G.length samp > 1 = fini $ robustSumVarWeighted samp | otherwise = 0 where - fini (V s w) = s / w + fini (FF.V s w) = s / w {-# SPECIALIZE varianceWeighted :: U.Vector (Double,Double) -> Double #-} {-# SPECIALIZE varianceWeighted :: V.Vector (Double,Double) -> Double #-} @@ -307,35 +322,23 @@ varianceWeighted samp -- mean, Knuth's algorithm gives inaccurate results due to -- catastrophic cancellation. -fastVar :: (G.Vector v Double) => v Double -> T1 -fastVar = G.foldl' go (T1 0 0 0) - where - go (T1 n m s) x = T1 n' m' s' - where n' = n + 1 - m' = m + d / fromIntegral n' - s' = s + d * (x - m') - d = x - m +-- fastVar :: (G.Vector v Double) => v Double -> FF.T1 +-- fastVar = ffold FF.fastVar -- | Maximum likelihood estimate of a sample's variance. fastVariance :: (G.Vector v Double) => v Double -> Double -fastVariance = fini . 
fastVar - where fini (T1 n _m s) - | n > 1 = s / fromIntegral n - | otherwise = 0 +fastVariance = ffold FF.fastVariance {-# INLINE fastVariance #-} -- | Unbiased estimate of a sample's variance. fastVarianceUnbiased :: (G.Vector v Double) => v Double -> Double -fastVarianceUnbiased = fini . fastVar - where fini (T1 n _m s) - | n > 1 = s / fromIntegral (n - 1) - | otherwise = 0 +fastVarianceUnbiased = ffold FF.fastVarianceUnbiased {-# INLINE fastVarianceUnbiased #-} -- | Standard deviation. This is simply the square root of the -- maximum likelihood estimate of the variance. fastStdDev :: (G.Vector v Double) => v Double -> Double -fastStdDev = sqrt . fastVariance +fastStdDev = ffold FF.fastStdDev {-# INLINE fastStdDev #-} -- | Covariance of sample of pairs. For empty sample it's set to @@ -343,13 +346,9 @@ fastStdDev = sqrt . fastVariance covariance :: (G.Vector v (Double,Double)) => v (Double,Double) -> Double -covariance xy - | n == 0 = 0 - | otherwise = expectation (\(x,y) -> (x - muX)*(y - muY)) xy +covariance xy = ffold (FF.covariance (muX, muY)) xy where - n = G.length xy - muX = expectation fst xy - muY = expectation snd xy + FF.V muX muY = ffold (FF.biExpectation fst snd) xy {-# SPECIALIZE covariance :: U.Vector (Double,Double) -> Double #-} {-# SPECIALIZE covariance :: V.Vector (Double,Double) -> Double #-} @@ -358,16 +357,10 @@ covariance xy correlation :: (G.Vector v (Double,Double)) => v (Double,Double) -> Double -correlation xy - | n == 0 = 0 - | otherwise = cov / sqrt (varX * varY) +correlation xy = ffold (FF.correlation (muX, muY)) xy where - n = G.length xy - muX = expectation (\(x,_) -> x) xy - muY = expectation (\(_,y) -> y) xy - varX = expectation (\(x,_) -> square (x - muX)) xy - varY = expectation (\(_,y) -> square (y - muY)) xy - cov = expectation (\(x,y) -> (x - muX)*(y - muY)) xy + FF.V muX muY = ffold (FF.biExpectation fst snd) xy + {-# SPECIALIZE correlation :: U.Vector (Double,Double) -> Double #-} {-# SPECIALIZE correlation :: V.Vector 
(Double,Double) -> Double #-} @@ -378,14 +371,11 @@ covariance2 :: (G.Vector v Double) => v Double -> v Double -> Double -covariance2 xs ys - | nx /= ny = error $ "Statistics.Sample.covariance2: both samples must have same length" - | nx == 0 = 0 - | otherwise = sum (G.zipWith (\x y -> (x - muX)*(y - muY)) xs ys) - / fromIntegral nx +covariance2 xs ys = + pfold "Statistics.Sample.covariance2: both samples must have same length" + (FF.covariance (muX, muY)) + xs ys where - nx = G.length xs - ny = G.length ys muX = mean xs muY = mean ys {-# SPECIALIZE covariance2 :: U.Vector Double -> U.Vector Double -> Double #-} @@ -398,17 +388,17 @@ correlation2 :: (G.Vector v Double) => v Double -> v Double -> Double -correlation2 xs ys - | nx /= ny = error $ "Statistics.Sample.correlation2: both samples must have same length" - | nx == 0 = 0 - | otherwise = cov / sqrt (varX * varY) +correlation2 xs ys = + pfold "Statistics.Sample.correlation2: both samples must have same length" + (FF.correlation (muX, muY)) + xs ys + -- | nx /= ny = error $ "Statistics.Sample.correlation2: both samples must have same length" + -- | nx == 0 = 0 + -- | otherwise = cov / sqrt (varX * varY) where - nx = G.length xs - ny = G.length ys - (muX,varX) = meanVariance xs - (muY,varY) = meanVariance ys - cov = sum (G.zipWith (\x y -> (x - muX)*(y - muY)) xs ys) - / fromIntegral nx + muX = mean xs + muY = mean ys + {-# SPECIALIZE correlation2 :: U.Vector Double -> U.Vector Double -> Double #-} {-# SPECIALIZE correlation2 :: V.Vector Double -> V.Vector Double -> Double #-} @@ -424,16 +414,7 @@ pair va vb ------------------------------------------------------------------------ -- Helper code. Monomorphic unpacked accumulators. --- (^) operator from Prelude is just slow. -(^) :: Double -> Int -> Double -x ^ 1 = x -x ^ n = x * (x ^ (n-1)) -{-# INLINE (^) #-} - -- don't support polymorphism, as we can't get unboxed returns if we use it. 
-data T = T {-# UNPACK #-}!Double {-# UNPACK #-}!Int - -data T1 = T1 {-# UNPACK #-}!Int {-# UNPACK #-}!Double {-# UNPACK #-}!Double {- diff --git a/Statistics/Sample/Fold.hs b/Statistics/Sample/Fold.hs index d8765f5..b07064b 100644 --- a/Statistics/Sample/Fold.hs +++ b/Statistics/Sample/Fold.hs @@ -1,4 +1,5 @@ {-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE BangPatterns #-} -- | -- Module : Statistics.Sample -- Copyright : (c) 2008 Don Stewart, 2009 Bryan O'Sullivan @@ -11,60 +12,63 @@ -- Commonly used sample statistics, also known as descriptive -- statistics. -module Statistics.Sample.Fold where - -- ( - -- -- * Types - -- Sample - -- , WeightedSample - -- -- * Descriptive functions - -- , range - - -- -- * Statistics of location - -- , expectation - -- , mean - -- , welfordMean - -- , meanWeighted - -- , harmonicMean - -- , geometricMean - - -- -- * Statistics of dispersion - -- -- $variance - - -- -- ** Functions over central moments - -- , centralMoment - -- , centralMoments - -- , skewness - -- , kurtosis - - -- -- ** Two-pass functions (numerically robust) - -- -- $robust - -- , variance - -- , varianceUnbiased - -- -- , meanVariance - -- -- , meanVarianceUnb - -- , stdDev - -- , varianceWeighted - -- , stdErrMean - - -- -- ** Single-pass functions (faster, less safe) - -- -- $cancellation - -- , fastVariance - -- , fastVarianceUnbiased - -- , fastStdDev - - -- -- * Joint distributions - -- , covariance - -- , correlation - -- , covariance2 - -- , correlation2 - -- , pair - -- -- * References - -- -- $references - -- ) where +module Statistics.Sample.Fold + ( + -- * Descriptive functions + range + , minimum' + , maximum' + + -- * Statistics of location + , expectation + , mean + , welfordMean + , meanWeighted + , harmonicMean + , geometricMean + + -- * Statistics of dispersion + -- $variance + + -- ** Functions over central moments + , centralMoment + , centralMoments + , skewness + , kurtosis + + -- ** Two-pass functions (numerically robust) + -- $robust + 
, variance + , varianceUnbiased + -- , meanVariance + -- , meanVarianceUnb + , stdDev + , varianceWeighted + , stdErrMean + , robustSumVar + , robustSumVarWeighted + + -- ** Single-pass functions (faster, less safe) + -- $cancellation + , fastVariance + , fastVarianceUnbiased + , fastStdDev + + -- * Joint distributions + , covariance + , correlation + -- * Strict types and helpers + , V(..) + , biExpectation + , kbnSum + , kbnSum' + -- * References + -- $references + + + ) where import Statistics.Function (square) --- import Statistics.Sample.Internal (robustSumVar, sum) --- import Statistics.Types.Internal (Sample,WeightedSample) import Numeric.Sum (kbn, Summation(zero,add), KBNSum) import qualified Control.Foldl as F @@ -73,16 +77,47 @@ import qualified Control.Foldl as F import Prelude hiding ((^), sum) -data V = V {-# UNPACK #-} !Double {-# UNPACK #-} !Double -data T = T {-# UNPACK #-}!Double {-# UNPACK #-}!Int -data T1 = T1 {-# UNPACK #-}!Int {-# UNPACK #-}!Double {-# UNPACK #-}!Double +------------------------------------------------------------------------ +-- Helper code. Monomorphic unpacked accumulators. -- (^) operator from Prelude is just slow. (^) :: Double -> Int -> Double x ^ 1 = x -x ^ n = x * (x ^ (n-1)) +-- x ^ n = x * (x ^ (n-1)) +x0 ^ n0 = go (n0-1) x0 where + go 0 !acc = acc + go n acc = go (n-1) (acc*x0) {-# INLINE (^) #-} + +data V = V {-# UNPACK #-}!Double {-# UNPACK #-}!Double +data T = T {-# UNPACK #-}!Double {-# UNPACK #-}!Int +data T1 = T1 {-# UNPACK #-}!Int {-# UNPACK #-}!Double {-# UNPACK #-}!Double + +-- don't support polymorphism, as we can't get unboxed returns if we use it. +{- + +Consider this core: + +with data T a = T !a !Int + +$wfold :: Double# + -> Int# + -> Int# + -> (# Double, Int# #) + +and without, + +$wfold :: Double# + -> Int# + -> Int# + -> (# Double#, Int# #) + +yielding to boxed returns and heap checks. + +-} + + -- | /O(n)/ Range. The difference between the largest and smallest -- elements of a sample. 
range :: F.Fold Double Double @@ -272,8 +307,6 @@ variance m = {-# INLINE variance #-} - - robustSumVar :: Double -> F.Fold Double Double robustSumVar m = F.premap (square . subtract m) kbnSum {-# INLINE robustSumVar #-} @@ -313,34 +346,32 @@ varianceUnbiased m = -- m = mean samp -- {-# SPECIALIZE meanVarianceUnb :: U.Vector Double -> (Double,Double) #-} -- {-# SPECIALIZE meanVarianceUnb :: V.Vector Double -> (Double,Double) #-} +-} -- | Standard deviation. This is simply the square root of the -- unbiased estimate of the variance. stdDev :: Double -> F.Fold Double Double stdDev m = fmap sqrt $ varianceUnbiased m +{-# INLINE stdDev #-} -- | Standard error of the mean. This is the standard deviation -- divided by the square root of the sample size. stdErrMean :: Double -> F.Fold Double Double stdErrMean m = stdDev m / fmap sqrt doubleLength +{-# INLINE stdErrMean #-} -robustSumVarWeighted :: (G.Vector v (Double,Double)) => v (Double,Double) -> V -robustSumVarWeighted samp = G.foldl' go (V 0 0) samp +robustSumVarWeighted :: Double -> F.Fold (Double,Double) V +robustSumVarWeighted wm = F.Fold go (V 0 0) id where go (V s w) (x,xw) = V (s + xw*d*d) (w + xw) - where d = x - m - m = meanWeighted samp + where d = x - wm {-# INLINE robustSumVarWeighted #-} -- | Weighted variance. This is biased estimation. -varianceWeighted :: (G.Vector v (Double,Double)) => v (Double,Double) -> Double -varianceWeighted samp - | G.length samp > 1 = fini $ robustSumVarWeighted samp - | otherwise = 0 +varianceWeighted :: Double -> F.Fold (Double,Double) Double +varianceWeighted wm = fini <$> robustSumVarWeighted wm where fini (V s w) = s / w -{-# SPECIALIZE varianceWeighted :: U.Vector (Double,Double) -> Double #-} -{-# SPECIALIZE varianceWeighted :: V.Vector (Double,Double) -> Double #-} -- $cancellation -- @@ -353,8 +384,8 @@ varianceWeighted samp -- mean, Knuth's algorithm gives inaccurate results due to -- catastrophic cancellation. 
-fastVar :: (G.Vector v Double) => v Double -> T1 -fastVar = G.foldl' go (T1 0 0 0) +fastVar :: F.Fold Double T1 +fastVar = F.Fold go (T1 0 0 0) id where go (T1 n m s) x = T1 n' m' s' where n' = n + 1 @@ -363,16 +394,16 @@ fastVar = G.foldl' go (T1 0 0 0) d = x - m -- | Maximum likelihood estimate of a sample's variance. -fastVariance :: (G.Vector v Double) => v Double -> Double -fastVariance = fini . fastVar +fastVariance :: F.Fold Double Double +fastVariance = fmap fini fastVar where fini (T1 n _m s) | n > 1 = s / fromIntegral n | otherwise = 0 {-# INLINE fastVariance #-} -- | Unbiased estimate of a sample's variance. -fastVarianceUnbiased :: (G.Vector v Double) => v Double -> Double -fastVarianceUnbiased = fini . fastVar +fastVarianceUnbiased :: F.Fold Double Double +fastVarianceUnbiased = fmap fini fastVar where fini (T1 n _m s) | n > 1 = s / fromIntegral (n - 1) | otherwise = 0 @@ -380,128 +411,42 @@ fastVarianceUnbiased = fini . fastVar -- | Standard deviation. This is simply the square root of the -- maximum likelihood estimate of the variance. -fastStdDev :: (G.Vector v Double) => v Double -> Double -fastStdDev = sqrt . fastVariance +fastStdDev :: F.Fold Double Double +fastStdDev = fmap sqrt fastVariance {-# INLINE fastStdDev #-} +-- Avoids computing the length twice +biExpectation :: (a -> Double) -> (a -> Double) -> F.Fold a V +biExpectation f s = fini <$> F.length <*> F.premap f kbnSum <*> F.premap s kbnSum + where fini n a b = let n' = fromIntegral n in V (a/n') (b/n') +{-# INLINE biExpectation #-} + -- | Covariance of sample of pairs. 
For empty sample it's set to -- zero -covariance :: (G.Vector v (Double,Double)) - => v (Double,Double) - -> Double -covariance xy - | n == 0 = 0 - | otherwise = expectation (\(x,y) -> (x - muX)*(y - muY)) xy - where - n = G.length xy - muX = expectation fst xy - muY = expectation snd xy -{-# SPECIALIZE covariance :: U.Vector (Double,Double) -> Double #-} -{-# SPECIALIZE covariance :: V.Vector (Double,Double) -> Double #-} +covariance ::(Double, Double) -> F.Fold (Double,Double) Double +covariance (muX, muY) = + liftA2 (\n x -> if n > 0 then x else 0) + F.length + (expectation (\(x,y) -> (x - muX)*(y - muY))) +{-# INLINE covariance #-} + -- | Correlation coefficient for sample of pairs. Also known as -- Pearson's correlation. For empty sample it's set to zero. -correlation :: (G.Vector v (Double,Double)) - => v (Double,Double) - -> Double -correlation xy - | n == 0 = 0 - | otherwise = cov / sqrt (varX * varY) +-- The means `muX` and `muY` must be known. +correlation :: (Double, Double) -> F.Fold (Double,Double) Double +correlation (muX, muY) = + fini <$> F.length <*> covF <*> varsF where - n = G.length xy - muX = expectation (\(x,_) -> x) xy - muY = expectation (\(_,y) -> y) xy - varX = expectation (\(x,_) -> square (x - muX)) xy - varY = expectation (\(_,y) -> square (y - muY)) xy - cov = expectation (\(x,y) -> (x - muX)*(y - muY)) xy -{-# SPECIALIZE correlation :: U.Vector (Double,Double) -> Double #-} -{-# SPECIALIZE correlation :: V.Vector (Double,Double) -> Double #-} - - --- | Covariance of two samples. Both vectors must be of the same --- length. 
If both are empty it's set to zero -covariance2 :: (G.Vector v Double) - => v Double - -> v Double - -> Double -covariance2 xs ys - | nx /= ny = error $ "Statistics.Sample.covariance2: both samples must have same length" - | nx == 0 = 0 - | otherwise = sum (G.zipWith (\x y -> (x - muX)*(y - muY)) xs ys) - / fromIntegral nx - where - nx = G.length xs - ny = G.length ys - muX = mean xs - muY = mean ys -{-# SPECIALIZE covariance2 :: U.Vector Double -> U.Vector Double -> Double #-} -{-# SPECIALIZE covariance2 :: V.Vector Double -> V.Vector Double -> Double #-} - --- | Correlation coefficient for two samples. Both vector must have --- same length Also known as Pearson's correlation. For empty sample --- it's set to zero. -correlation2 :: (G.Vector v Double) - => v Double - -> v Double - -> Double -correlation2 xs ys - | nx /= ny = error $ "Statistics.Sample.correlation2: both samples must have same length" - | nx == 0 = 0 - | otherwise = cov / sqrt (varX * varY) - where - nx = G.length xs - ny = G.length ys - (muX,varX) = meanVariance xs - (muY,varY) = meanVariance ys - cov = sum (G.zipWith (\x y -> (x - muX)*(y - muY)) xs ys) - / fromIntegral nx -{-# SPECIALIZE correlation2 :: U.Vector Double -> U.Vector Double -> Double #-} -{-# SPECIALIZE correlation2 :: V.Vector Double -> V.Vector Double -> Double #-} - - --- | Pair two samples. It's like 'G.zip' but requires that both --- samples have equal size. -pair :: (G.Vector v a, G.Vector v b, G.Vector v (a,b)) => v a -> v b -> v (a,b) -pair va vb - | G.length va == G.length vb = G.zip va vb - | otherwise = error "Statistics.Sample.pair: vector must have same length" -{-# INLINE pair #-} - ------------------------------------------------------------------------- --- Helper code. Monomorphic unpacked accumulators. - --- (^) operator from Prelude is just slow. -(^) :: Double -> Int -> Double -x ^ 1 = x -x ^ n = x * (x ^ (n-1)) -{-# INLINE (^) #-} - --- don't support polymorphism, as we can't get unboxed returns if we use it. 
-data T = T {-# UNPACK #-}!Double {-# UNPACK #-}!Int - -data T1 = T1 {-# UNPACK #-}!Int {-# UNPACK #-}!Double {-# UNPACK #-}!Double - -{- - -Consider this core: + fini n cov (V varX varY) + | n == 0 = 0 + | otherwise = cov / sqrt (varX * varY) + covF = expectation (\(x,y) -> (x - muX)*(y - muY)) + varsF = biExpectation (\(x,_) -> square (x - muX)) + (\(_,y) -> square (y - muY)) +{-# INLINE correlation #-} -with data T a = T !a !Int - -$wfold :: Double# - -> Int# - -> Int# - -> (# Double, Int# #) - -and without, - -$wfold :: Double# - -> Int# - -> Int# - -> (# Double#, Int# #) - -yielding to boxed returns and heap checks. - --} +-------- -- $references -- @@ -521,4 +466,3 @@ yielding to boxed returns and heap checks. -- * West, D.H.D. (1979) Updating mean and variance estimates: an -- improved method. /Communications of the ACM/ -- 22(9):532–535. --} \ No newline at end of file From d37111850a6d3335947c5096a4e176ae9d32bd40 Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Fri, 8 Aug 2025 23:31:18 +1000 Subject: [PATCH 08/20] Missed INLINE --- Statistics/Sample/Fold.hs | 1 + 1 file changed, 1 insertion(+) diff --git a/Statistics/Sample/Fold.hs b/Statistics/Sample/Fold.hs index b07064b..11f2ecf 100644 --- a/Statistics/Sample/Fold.hs +++ b/Statistics/Sample/Fold.hs @@ -392,6 +392,7 @@ fastVar = F.Fold go (T1 0 0 0) id m' = m + d / fromIntegral n' s' = s + d * (x - m') d = x - m +{-# INLINE fastVar #-} -- | Maximum likelihood estimate of a sample's variance. 
fastVariance :: F.Fold Double Double From f646698cc3199fa96b896e84b87afd248f6ec5f9 Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Sat, 9 Aug 2025 16:10:11 +1000 Subject: [PATCH 09/20] Update haddock for Fold --- Statistics/Sample/Fold.hs | 235 +++++++++++++++++++++++++------------- 1 file changed, 153 insertions(+), 82 deletions(-) diff --git a/Statistics/Sample/Fold.hs b/Statistics/Sample/Fold.hs index 11f2ecf..5c5f53e 100644 --- a/Statistics/Sample/Fold.hs +++ b/Statistics/Sample/Fold.hs @@ -1,8 +1,8 @@ {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE BangPatterns #-} -- | --- Module : Statistics.Sample --- Copyright : (c) 2008 Don Stewart, 2009 Bryan O'Sullivan +-- Module : Statistics.Sample.Fold +-- Copyright : (c) 2008 Don Stewart, 2009 Bryan O'Sullivan, 2025 Alex Mason -- License : BSD3 -- -- Maintainer : bos@serpentine.com @@ -10,14 +10,70 @@ -- Portability : portable -- -- Commonly used sample statistics, also known as descriptive --- statistics. +-- statistics. They are provided as 'Control.Foldl.Fold''s +-- allowing them to be applied to any 'Foldable' type containing +-- 'Double's. The functions in "Statistics.Sample" are all +-- implemented in terms of these folds. +-- +-- The benefit of poviding these as 'Fold's is that they can be combined +-- compute multiple statistics with a single pass: +-- +-- >>> -- Compute the mean divided by the variance +-- >>> F.fold (liftA2 (/) FF.mean FF.fastVariance) [1,2,3,4,5,6,7,8] +-- 0.8571428571428571 +-- +-- 'Fold's can be modified in several ways; use 'F.premap' to extract +-- values from the input before being passed into the fold: +-- +-- >>> :t F.premap fst mean +-- F.Fold (Double, a) Double +-- +-- +-- They can be have their result mapped using `fmap`: +-- +-- >>> -- single pass standard deviation +-- >>> :t sqrt <$> fastVariance +-- F.Fold Double Double +-- +-- (They also have a 'Profunctor' instance, so also have 'lmap', 'rmap', 'dimap' +-- to contravariantly map the inputs and covariantly map output). 
+-- +-- And they can be combined 'Applicative'ly to produce new single pass +-- combined statistics (or even non-statistical 'Fold's). +-- +-- >>> differenceOfMeans = liftA2 (-) (F.premap fst mean) (F.premap snd mean) +-- >>> :t differenceOfMeans +-- F.Fold (Double, Double) Double +-- >>> F.fold differenceOfMeans [(10,1), (20,2), (30,3), (40,4)] +-- 22.5 +-- +-- 'Fold's also have 'Num', 'Floating', 'Fractional', `Semigroup` and 'Monoid' +-- instances, which allow them to be combined succinctly: +-- +-- >>> myMean = sum / F.genericLength +-- >>> F.fold myMean [1..10.0] +-- 5.5 +-- +-- >>> myStdDev = sqrt fastVariance +-- >>> F.fold myStdDev [1..10] +-- 2.8722813232690143 +-- >>> differenceOfMeans = F.premap fst mean - F.premap snd mean +-- +-- +-- === Performance +-- Inlining can make a very large difference in performance when defining 'Fold's, +-- which is why all definitions in this module are marked @INLINE@. If you define +-- 'Fold's which you export in your own modules and libraries, you should mark +-- them as @INLINE@ unless you have a good reason not to. Doing so allows their +-- definitions to be inlined into the definitions of folding functions, allowing +-- for significantly greater unboxing to occur. module Statistics.Sample.Fold ( -- * Descriptive functions - range - , minimum' + minimum' , maximum' + , range -- * Statistics of location , expectation @@ -62,10 +118,11 @@ module Statistics.Sample.Fold , biExpectation , kbnSum , kbnSum' + , kbn + , summation + , summationVia -- * References -- $references - - ) where import Statistics.Function (square) @@ -118,28 +175,68 @@ yielding to boxed returns and heap checks. -} --- | /O(n)/ Range. The difference between the largest and smallest +-- | Range. The difference between the largest and smallest -- elements of a sample. range :: F.Fold Double Double range = maximum' - minimum' {-# INLINE range #-} +-- | Maximum specialised to Double, returns -∞ without input. 
maximum' :: F.Fold Double Double
 maximum' = F.Fold max (-1/0 :: Double) id where
 {-# INLINE minimum' #-}

+-- | Minimum specialised to Double, returns ∞ without input.
 minimum' :: F.Fold Double Double
 minimum' = F.Fold min ( 1/0 :: Double) id where
 {-# INLINE maximum' #-}

+-- | Sum Doubles using Kahan-Babuška-Neumaier summation for increased accuracy. If you need to compute partial sums for combination later, use @kbnSum'@.
+kbnSum :: F.Fold Double Double
+kbnSum = summationVia kbn
+{-# INLINE kbnSum #-}
+
+-- | Sum Doubles using Kahan-Babuška-Neumaier summation
+-- for increased accuracy. The @KBNSum@ type contains
+-- the internal state of the sum, and as it is a monoid,
+-- can be used to combine different sums accurately.
+--
+-- === __Examples:__
+-- >>> xs = [1,2,pi,1e-9]
+-- >>> ys = [4,5,2*pi,1e-11]
+-- >>> F.fold kbnSum' xs <> F.fold kbnSum' ys
+-- KBNSum 21.424777961779377 2.580967484452971e-15
+--
+-- >>> kbn it
+-- 21.42477796177938
+-- >>> F.fold kbnSum' (xs ++ ys)
+-- KBNSum 21.42477796177938 (-9.7174619434753e-16)
+--
+-- >>> kbn it
+-- 21.42477796177938
 kbnSum' :: F.Fold Double KBNSum
-kbnSum' = F.Fold add zero id
+kbnSum' = summation
 {-# INLINE kbnSum' #-}

-kbnSum :: F.Fold Double Double
-kbnSum = fmap kbn kbnSum'
-{-# INLINE kbnSum #-}
+-- | Sum doubles using any instance of the 'Summation' class.
+summation :: Summation a => F.Fold Double a
+summation = F.Fold add zero id
+{-# INLINE summation #-}

+-- | Sum doubles by using the finalisation function for types in the
+-- 'Summation' class. 'kbnSum' is equivalent to @summationVia kbn@.
+--
+-- >>> F.fold (summationVia kbn) [1,2,pi,1e-9,1e-17]
+-- 6.141592654589793
+-- >>> F.fold (summationVia Numeric.Sum.kb2) [1,2,pi,1e-9,9e-15]
+-- 6.141592654589802
+summationVia :: Summation s => (s -> Double) -> F.Fold Double Double
+summationVia f = f <$> summation
+{-# INLINE summationVia #-}
+
+-- Counts the number of elements using Word for the count.
+doubleLength :: F.Fold a Double +doubleLength = F.Fold (\n _ -> n+1) (0::Word) fromIntegral -- | Compute expectation of function over for sample. expectation :: (a -> Double) -> F.Fold a Double @@ -148,15 +245,11 @@ expectation f = F.premap f kbnSum / doubleLength -- | Arithmetic mean. This uses Kahan-Babuška-Neumaier -- summation, so is more accurate than 'welfordMean' unless the input --- values are very large. This function is not subject to stream --- fusion. +-- values are very large. mean :: F.Fold Double Double mean = kbnSum / doubleLength {-# INLINE mean #-} -doubleLength :: F.Fold a Double -doubleLength = F.Fold (\n _ -> n+1) (0::Int) fromIntegral - -- | Arithmetic mean. This uses Welford's algorithm to provide -- numerical stability, using a single pass over the sample data. @@ -184,8 +277,7 @@ meanWeighted = F.Fold go (V 0 0) fini w' = w + xw {-# INLINE meanWeighted #-} --- | Harmonic mean. This algorithm performs a single pass over --- the sample. +-- | Harmonic mean. harmonicMean :: F.Fold Double Double harmonicMean = F.Fold go (T 0 0) fini where @@ -195,20 +287,20 @@ harmonicMean = F.Fold go (T 0 0) fini -- | Geometric mean of a sample containing no negative values. geometricMean :: F.Fold Double Double -geometricMean = fmap exp (expectation log) +geometricMean = exp (expectation log) {-# INLINE geometricMean #-} -- | Compute the /k/th central moment of a sample. The central moment -- is also known as the moment about the mean. -- --- This function performs two passes over the sample, so is not subject --- to stream fusion. +-- The mean must be provided which may require two passes over +-- the input data. -- -- For samples containing many values very close to the mean, this -- function is subject to inaccuracy due to catastrophic cancellation. 
-centralMoment :: Int - -> Double -- ^ mean +centralMoment :: Int -- ^ /k/ + -> Double -- ^ /mean/ -> F.Fold Double Double centralMoment a m | a < 0 = error "Statistics.Sample.centralMoment: negative input" @@ -217,18 +309,19 @@ centralMoment a m | otherwise = expectation go where go x = (x-m) ^ a - -- m = mean xs {-# INLINE centralMoment #-} -- | Compute the /k/th and /j/th central moments of a sample. -- --- This function performs two passes over the sample, so is not subject --- to stream fusion. +-- The mean must already be known. -- -- For samples containing many values very close to the mean, this -- function is subject to inaccuracy due to catastrophic cancellation. -centralMoments :: Int -> Int -> Double -> F.Fold Double (Double, Double) +centralMoments :: Int -- ^ /i/ + -> Int -- ^ /j/ + -> Double -- ^ /mean/ + -> F.Fold Double (Double, Double) -- ^ /i/th & /j/th central moments centralMoments a b m | a < 2 || b < 2 = liftA2 (,) (centralMoment a m) (centralMoment b m) | otherwise = F.Fold go (T1 0 0 0) fini @@ -240,25 +333,28 @@ centralMoments a b m -- | Compute the skewness of a sample. This is a measure of the --- asymmetry of its distribution. +-- asymmetry of its distribution. The mean must already be known. -- -- A sample with negative skew is said to be /left-skewed/. Most of -- its mass is on the right of the distribution, with the tail on the -- left. -- --- > skewness $ U.to [1,100,101,102,103] +-- >>> xs = [1,100,101,102,103] +-- >>> m = F.fold mean xs +-- >>> F.fold (skewness m) xs +-- -1.4976814499182607 +-- -- > ==> -1.497681449918257 -- -- A sample with positive skew is said to be /right-skewed/. -- --- > skewness $ U.to [1,2,3,4,100] --- > ==> 1.4975367033335198 +-- >>> xs = [1,2,3,4,100] +-- >>> m = F.fold mean xs +-- >>> F.fold (skewness m) xs +-- 1.4975367033335198 -- -- A sample's skewness is not defined if its 'variance' is zero. -- --- This function performs two passes over the sample, so is not subject --- to stream fusion. 
--- -- For samples containing many values very close to the mean, this -- function is subject to inaccuracy due to catastrophic cancellation. skewness :: Double -> F.Fold Double Double @@ -269,14 +365,12 @@ skewness m = (\(c3, c2) -> c3 * c2 ** (-1.5)) <$> centralMoments 3 2 m -- | Compute the excess kurtosis of a sample. This is a measure of -- the \"peakedness\" of its distribution. A high kurtosis indicates -- that more of the sample's variance is due to infrequent severe --- deviations, rather than more frequent modest deviations. +-- deviations, rather than more frequent modest deviations. The mean +-- must already be known. -- -- A sample's excess kurtosis is not defined if its 'variance' is -- zero. -- --- This function performs two passes over the sample, so is not subject --- to stream fusion. --- -- For samples containing many values very close to the mean, this -- function is subject to inaccuracy due to catastrophic cancellation. kurtosis :: Double -> F.Fold Double Double @@ -295,8 +389,8 @@ kurtosis m = (\(c4, c2) -> c4 / (c2 * c2) - 3) <$> centralMoments 4 2 m -- al. for numerical robustness, but require two passes over the -- sample data as a result. -- --- Because of the need for two passes, these functions are /not/ --- subject to stream fusion. +-- The mean must already be known, and may require two passes over +-- the data. -- | Maximum likelihood estimate of a sample's variance. Also known -- as the population variance, where the denominator is /n/. @@ -307,11 +401,6 @@ variance m = {-# INLINE variance #-} -robustSumVar :: Double -> F.Fold Double Double -robustSumVar m = F.premap (square . subtract m) kbnSum -{-# INLINE robustSumVar #-} - - -- | Unbiased estimate of a sample's variance. Also known as the -- sample variance, where the denominator is /n/-1. 
varianceUnbiased :: Double -> F.Fold Double Double @@ -321,43 +410,25 @@ varianceUnbiased m = {-# INLINE varianceUnbiased #-} -{- --- | Calculate mean and maximum likelihood estimate of variance. This --- function should be used if both mean and variance are required --- since it will calculate mean only once. --- meanVariance :: Double -> F.Fold Double (Double,Double) --- meanVariance m --- | n > 1 = (m, robustSumVar m samp / fromIntegral n) --- | otherwise = (m, 0) --- where --- n = G.length samp --- {-# SPECIALIZE meanVariance :: U.Vector Double -> (Double,Double) #-} --- {-# SPECIALIZE meanVariance :: V.Vector Double -> (Double,Double) #-} - --- | Calculate mean and unbiased estimate of variance. This --- function should be used if both mean and variance are required --- since it will calculate mean only once. --- meanVarianceUnb :: (G.Vector v Double) => v Double -> (Double,Double) --- meanVarianceUnb samp --- | n > 1 = (m, robustSumVar m samp / fromIntegral (n-1)) --- | otherwise = (m, 0) --- where --- n = G.length samp --- m = mean samp --- {-# SPECIALIZE meanVarianceUnb :: U.Vector Double -> (Double,Double) #-} --- {-# SPECIALIZE meanVarianceUnb :: V.Vector Double -> (Double,Double) #-} --} +-- | The square of differences from the mean: +-- +-- \(robustSumVar(xs) = \sum_{i=0}^n{(x_i-m)^2}\) +-- +--'variance' is the result of this divided by the number of inputs. +robustSumVar :: Double -> F.Fold Double Double +robustSumVar m = F.premap (square . subtract m) kbnSum +{-# INLINE robustSumVar #-} -- | Standard deviation. This is simply the square root of the -- unbiased estimate of the variance. stdDev :: Double -> F.Fold Double Double -stdDev m = fmap sqrt $ varianceUnbiased m +stdDev m = sqrt (varianceUnbiased m) {-# INLINE stdDev #-} -- | Standard error of the mean. This is the standard deviation -- divided by the square root of the sample size. 
stdErrMean :: Double -> F.Fold Double Double -stdErrMean m = stdDev m / fmap sqrt doubleLength +stdErrMean m = stdDev m / sqrt doubleLength {-# INLINE stdErrMean #-} robustSumVarWeighted :: Double -> F.Fold (Double,Double) V @@ -372,13 +443,13 @@ varianceWeighted :: Double -> F.Fold (Double,Double) Double varianceWeighted wm = fini <$> robustSumVarWeighted wm where fini (V s w) = s / w +{-# INLINE varianceWeighted #-} -- $cancellation -- -- The functions prefixed with the name @fast@ below perform a single -- pass over the sample data using Knuth's algorithm. They usually --- work well, but see below for caveats. These functions are subject --- to array fusion. +-- work well, but see below for caveats. -- -- /Note/: in cases where most sample data is close to the sample's -- mean, Knuth's algorithm gives inaccurate results due to @@ -413,17 +484,17 @@ fastVarianceUnbiased = fmap fini fastVar -- | Standard deviation. This is simply the square root of the -- maximum likelihood estimate of the variance. fastStdDev :: F.Fold Double Double -fastStdDev = fmap sqrt fastVariance +fastStdDev = sqrt fastVariance {-# INLINE fastStdDev #-} --- Avoids computing the length twice +-- | Compute two expectations at once, to avoid computing the length twice. biExpectation :: (a -> Double) -> (a -> Double) -> F.Fold a V -biExpectation f s = fini <$> F.length <*> F.premap f kbnSum <*> F.premap s kbnSum - where fini n a b = let n' = fromIntegral n in V (a/n') (b/n') +biExpectation f s = fini <$> doubleLength <*> F.premap f kbnSum <*> F.premap s kbnSum + where fini n a b = V (a/n) (b/n) {-# INLINE biExpectation #-} -- | Covariance of sample of pairs. For empty sample it's set to --- zero +-- zero. The means `muX` and `muY` must already be known. covariance ::(Double, Double) -> F.Fold (Double,Double) Double covariance (muX, muY) = liftA2 (\n x -> if n > 0 then x else 0) @@ -434,7 +505,7 @@ covariance (muX, muY) = -- | Correlation coefficient for sample of pairs. 
Also known as -- Pearson's correlation. For empty sample it's set to zero. --- The means `muX` and `muY` must be known. +-- The means `muX` and `muY` must already be known. correlation :: (Double, Double) -> F.Fold (Double,Double) Double correlation (muX, muY) = fini <$> F.length <*> covF <*> varsF From 139fc65903cd478e4ed5478eb190c44cad44fbe1 Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Sat, 9 Aug 2025 16:11:04 +1000 Subject: [PATCH 10/20] Unroll exponentation uo to 5 for inlining. --- Statistics/Sample/Fold.hs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Statistics/Sample/Fold.hs b/Statistics/Sample/Fold.hs index 5c5f53e..de789f5 100644 --- a/Statistics/Sample/Fold.hs +++ b/Statistics/Sample/Fold.hs @@ -139,7 +139,11 @@ import Prelude hiding ((^), sum) -- (^) operator from Prelude is just slow. (^) :: Double -> Int -> Double -x ^ 1 = x +!x ^ 1 = x +x ^ 2 = x * x +x ^ 3 = x * x * x +x ^ 4 = (x * x) * (x * x) +x ^ 5 = (x * x) * x * (x * x) -- x ^ n = x * (x ^ (n-1)) x0 ^ n0 = go (n0-1) x0 where go 0 !acc = acc From 6aee3afcb768e2b601ddf511783c3502933433e3 Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Sat, 9 Aug 2025 16:31:55 +1000 Subject: [PATCH 11/20] Update Sample docs, export Fold folds --- Statistics/Sample.hs | 129 +++++++++++++++++++------------------------ 1 file changed, 58 insertions(+), 71 deletions(-) diff --git a/Statistics/Sample.hs b/Statistics/Sample.hs index 490e2c5..1dd0829 100644 --- a/Statistics/Sample.hs +++ b/Statistics/Sample.hs @@ -1,7 +1,7 @@ {-# LANGUAGE FlexibleContexts #-} -- | -- Module : Statistics.Sample --- Copyright : (c) 2008 Don Stewart, 2009 Bryan O'Sullivan +-- Copyright : (c) 2008 Don Stewart, 2009 Bryan O'Sullivan, 2025 Alex Mason -- License : BSD3 -- -- Maintainer : bos@serpentine.com @@ -10,6 +10,9 @@ -- -- Commonly used sample statistics, also known as descriptive -- statistics. 
+-- +-- To apply these to structures other than `Vector`s, see "Statistics.Sample.Fold" +-- which contains the definitions of all algorithms in this module. module Statistics.Sample ( @@ -57,7 +60,11 @@ module Statistics.Sample , correlation , covariance2 , correlation2 + + -- * Helpers , pair + , foldf + , pairFold -- * References -- $references ) where @@ -75,18 +82,29 @@ import Prelude hiding ((^), sum) import qualified Control.Foldl as F import qualified Statistics.Sample.Fold as FF -ffold :: G.Vector v a => F.Fold a b -> v a -> b -ffold f v = F.purely_ (\s i -> G.foldl s i v) f -{-# INLINE ffold #-} -{-# SPECIALIZE ffold :: F.Fold Double b -> U.Vector Double -> b #-} -{-# SPECIALIZE ffold :: F.Fold Double b -> V.Vector Double -> b #-} +-- | Apply a 'Fold' to a 'Vector', with specialisations +-- for [Unboxed]("Data.Vector.Unboxed") +-- and [Boxed]("Data.Vector") vectors. +foldf :: G.Vector v a => F.Fold a b -> v a -> b +foldf f v = F.purely_ (\s i -> G.foldl s i v) f +{-# INLINE foldf #-} +{-# SPECIALIZE foldf :: F.Fold Double b -> U.Vector Double -> b #-} +{-# SPECIALIZE foldf :: F.Fold Double b -> V.Vector Double -> b #-} data P a = P !a !Int -pfold :: (G.Vector v a, G.Vector v b) => String -> F.Fold (a,b) c -> v a -> v b -> c -pfold err fld va vb +-- | Apply a 'Fold' to a pair of 'Vector's, the vectors must be of the same +-- length and will 'error' if they are not. Because the lengths must be known, +-- this function is not subject to fusion. + +-- TODO: Can this be made fuse by using Stream? Instead of checking +-- length at the end, error if the end of one vector is reached before +-- the other. This is the optimistic alternative to the pessimistic +-- initial length check. 
+pairFold :: (G.Vector v a, G.Vector v b) => String -> F.Fold (a,b) c -> v a -> v b -> c +pairFold err fld va vb | la /= lb = error err - | otherwise = ffold foldPair va + | otherwise = foldf foldPair va where la = G.length va lb = G.length vb -- Pattern match here so we don't end up forcing arguments @@ -95,21 +113,21 @@ pfold err fld va vb (F.Fold step ini end) -> F.Fold step' (P ini 0) end' where end' (P a _) = end a step' (P x n) a = P (step x (a, G.unsafeIndex vb n)) (n+1) -{-# INLINE pfold #-} -{-# SPECIALIZE pfold :: String -> F.Fold (Double, Double) b -> U.Vector Double -> U.Vector Double -> b #-} -{-# SPECIALIZE pfold :: String -> F.Fold (Double, Double) b -> V.Vector Double -> V.Vector Double -> b #-} +{-# INLINE pairFold #-} +{-# SPECIALIZE pairFold :: String -> F.Fold (Double, Double) b -> U.Vector Double -> U.Vector Double -> b #-} +{-# SPECIALIZE pairFold :: String -> F.Fold (Double, Double) b -> V.Vector Double -> V.Vector Double -> b #-} -- | /O(n)/ Range. The difference between the largest and smallest -- elements of a sample. range :: (G.Vector v Double) => v Double -> Double -range = ffold FF.range +range = foldf FF.range {-# INLINE range #-} -- | /O(n)/ Compute expectation of function over for sample. This is -- simply @mean . map f@ but won't create intermediate vector. expectation :: (G.Vector v a) => (a -> Double) -> v a -> Double -expectation f xs = ffold (FF.expectation f) xs +expectation f xs = foldf (FF.expectation f) xs {-# INLINE expectation #-} -- | /O(n)/ Arithmetic mean. This uses Kahan-Babuška-Neumaier @@ -117,7 +135,7 @@ expectation f xs = ffold (FF.expectation f) xs -- values are very large. This function is not subject to stream -- fusion. 
mean :: (G.Vector v Double) => v Double -> Double -mean = ffold FF.mean +mean = foldf FF.mean {-# SPECIALIZE mean :: U.Vector Double -> Double #-} {-# SPECIALIZE mean :: V.Vector Double -> Double #-} @@ -127,25 +145,25 @@ mean = ffold FF.mean -- Compared to 'mean', this loses a surprising amount of precision -- unless the inputs are very large. welfordMean :: (G.Vector v Double) => v Double -> Double -welfordMean = ffold FF.welfordMean +welfordMean = foldf FF.welfordMean {-# SPECIALIZE welfordMean :: U.Vector Double -> Double #-} {-# SPECIALIZE welfordMean :: V.Vector Double -> Double #-} -- | /O(n)/ Arithmetic mean for weighted sample. It uses a single-pass -- algorithm analogous to the one used by 'welfordMean'. meanWeighted :: (G.Vector v (Double,Double)) => v (Double,Double) -> Double -meanWeighted = ffold FF.meanWeighted +meanWeighted = foldf FF.meanWeighted {-# INLINE meanWeighted #-} -- | /O(n)/ Harmonic mean. This algorithm performs a single pass over -- the sample. harmonicMean :: (G.Vector v Double) => v Double -> Double -harmonicMean = ffold FF.harmonicMean +harmonicMean = foldf FF.harmonicMean {-# INLINE harmonicMean #-} -- | /O(n)/ Geometric mean of a sample containing no negative values. geometricMean :: (G.Vector v Double) => v Double -> Double -geometricMean = ffold FF.geometricMean +geometricMean = foldf FF.geometricMean {-# INLINE geometricMean #-} -- | Compute the /k/th central moment of a sample. The central moment @@ -157,7 +175,7 @@ geometricMean = ffold FF.geometricMean -- For samples containing many values very close to the mean, this -- function is subject to inaccuracy due to catastrophic cancellation. 
centralMoment :: (G.Vector v Double) => Int -> v Double -> Double -centralMoment a xs = ffold (FF.centralMoment a m) xs +centralMoment a xs = foldf (FF.centralMoment a m) xs where m = mean xs {-# SPECIALIZE centralMoment :: Int -> U.Vector Double -> Double #-} @@ -171,7 +189,7 @@ centralMoment a xs = ffold (FF.centralMoment a m) xs -- For samples containing many values very close to the mean, this -- function is subject to inaccuracy due to catastrophic cancellation. centralMoments :: (G.Vector v Double) => Int -> Int -> v Double -> (Double, Double) -centralMoments a b xs = ffold (FF.centralMoments a b m) xs +centralMoments a b xs = foldf (FF.centralMoments a b m) xs where m = mean xs @@ -198,7 +216,7 @@ centralMoments a b xs = ffold (FF.centralMoments a b m) xs -- For samples containing many values very close to the mean, this -- function is subject to inaccuracy due to catastrophic cancellation. skewness :: (G.Vector v Double) => v Double -> Double -skewness xs = ffold (FF.skewness m) xs +skewness xs = foldf (FF.skewness m) xs where m = mean xs {-# SPECIALIZE skewness :: U.Vector Double -> Double #-} {-# SPECIALIZE skewness :: V.Vector Double -> Double #-} @@ -217,7 +235,7 @@ skewness xs = ffold (FF.skewness m) xs -- For samples containing many values very close to the mean, this -- function is subject to inaccuracy due to catastrophic cancellation. kurtosis :: (G.Vector v Double) => v Double -> Double -kurtosis xs = ffold (FF.kurtosis m) xs +kurtosis xs = foldf (FF.kurtosis m) xs where m = mean xs {-# SPECIALIZE kurtosis :: U.Vector Double -> Double #-} {-# SPECIALIZE kurtosis :: V.Vector Double -> Double #-} @@ -239,7 +257,7 @@ kurtosis xs = ffold (FF.kurtosis m) xs -- | Maximum likelihood estimate of a sample's variance. Also known -- as the population variance, where the denominator is /n/. 
variance :: (G.Vector v Double) => v Double -> Double -variance samp = ffold (FF.variance m) samp +variance samp = foldf (FF.variance m) samp where m = mean samp {-# SPECIALIZE variance :: U.Vector Double -> Double #-} {-# SPECIALIZE variance :: V.Vector Double -> Double #-} @@ -248,7 +266,7 @@ variance samp = ffold (FF.variance m) samp -- | Unbiased estimate of a sample's variance. Also known as the -- sample variance, where the denominator is /n/-1. varianceUnbiased :: (G.Vector v Double) => v Double -> Double -varianceUnbiased samp = ffold (FF.varianceUnbiased m) samp +varianceUnbiased samp = foldf (FF.varianceUnbiased m) samp where m = mean samp {-# SPECIALIZE varianceUnbiased :: U.Vector Double -> Double #-} {-# SPECIALIZE varianceUnbiased :: V.Vector Double -> Double #-} @@ -258,7 +276,7 @@ varianceUnbiased samp = ffold (FF.varianceUnbiased m) samp -- since it will calculate mean only once. meanVariance :: (G.Vector v Double) => v Double -> (Double,Double) meanVariance samp - | n > 1 = (m, ffold (FF.robustSumVar m) samp / fromIntegral n) + | n > 1 = (m, foldf (FF.robustSumVar m) samp / fromIntegral n) | otherwise = (m, 0) where n = G.length samp @@ -271,7 +289,7 @@ meanVariance samp -- since it will calculate mean only once. meanVarianceUnb :: (G.Vector v Double) => v Double -> (Double,Double) meanVarianceUnb samp - | n > 1 = (m, ffold (FF.robustSumVar m) samp / fromIntegral (n-1)) + | n > 1 = (m, foldf (FF.robustSumVar m) samp / fromIntegral (n-1)) | otherwise = (m, 0) where n = G.length samp @@ -282,7 +300,7 @@ meanVarianceUnb samp -- | Standard deviation. This is simply the square root of the -- unbiased estimate of the variance. 
stdDev :: (G.Vector v Double) => v Double -> Double -stdDev samp = ffold (FF.stdDev m) samp +stdDev samp = foldf (FF.stdDev m) samp where m = mean samp {-# SPECIALIZE stdDev :: U.Vector Double -> Double #-} {-# SPECIALIZE stdDev :: V.Vector Double -> Double #-} @@ -290,13 +308,13 @@ stdDev samp = ffold (FF.stdDev m) samp -- | Standard error of the mean. This is the standard deviation -- divided by the square root of the sample size. stdErrMean :: (G.Vector v Double) => v Double -> Double -stdErrMean samp = ffold (FF.stdErrMean m) samp +stdErrMean samp = foldf (FF.stdErrMean m) samp where m = mean samp {-# SPECIALIZE stdErrMean :: U.Vector Double -> Double #-} {-# SPECIALIZE stdErrMean :: V.Vector Double -> Double #-} robustSumVarWeighted :: (G.Vector v (Double,Double)) => v (Double,Double) -> FF.V -robustSumVarWeighted samp = ffold (fmap fini $ FF.robustSumVarWeighted m) samp +robustSumVarWeighted samp = foldf (fmap fini $ FF.robustSumVarWeighted m) samp where m = meanWeighted samp fini (FF.V a b) = FF.V a b {-# INLINE robustSumVarWeighted #-} @@ -322,23 +340,21 @@ varianceWeighted samp -- mean, Knuth's algorithm gives inaccurate results due to -- catastrophic cancellation. --- fastVar :: (G.Vector v Double) => v Double -> FF.T1 --- fastVar = ffold FF.fastVar -- | Maximum likelihood estimate of a sample's variance. fastVariance :: (G.Vector v Double) => v Double -> Double -fastVariance = ffold FF.fastVariance +fastVariance = foldf FF.fastVariance {-# INLINE fastVariance #-} -- | Unbiased estimate of a sample's variance. fastVarianceUnbiased :: (G.Vector v Double) => v Double -> Double -fastVarianceUnbiased = ffold FF.fastVarianceUnbiased +fastVarianceUnbiased = foldf FF.fastVarianceUnbiased {-# INLINE fastVarianceUnbiased #-} --- | Standard deviation. This is simply the square root of the +-- | Standard deviation. This is simply the square root of the -- maximum likelihood estimate of the variance. 
fastStdDev :: (G.Vector v Double) => v Double -> Double -fastStdDev = ffold FF.fastStdDev +fastStdDev = foldf FF.fastStdDev {-# INLINE fastStdDev #-} -- | Covariance of sample of pairs. For empty sample it's set to @@ -346,9 +362,9 @@ fastStdDev = ffold FF.fastStdDev covariance :: (G.Vector v (Double,Double)) => v (Double,Double) -> Double -covariance xy = ffold (FF.covariance (muX, muY)) xy +covariance xy = foldf (FF.covariance (muX, muY)) xy where - FF.V muX muY = ffold (FF.biExpectation fst snd) xy + FF.V muX muY = foldf (FF.biExpectation fst snd) xy {-# SPECIALIZE covariance :: U.Vector (Double,Double) -> Double #-} {-# SPECIALIZE covariance :: V.Vector (Double,Double) -> Double #-} @@ -357,9 +373,9 @@ covariance xy = ffold (FF.covariance (muX, muY)) xy correlation :: (G.Vector v (Double,Double)) => v (Double,Double) -> Double -correlation xy = ffold (FF.correlation (muX, muY)) xy +correlation xy = foldf (FF.correlation (muX, muY)) xy where - FF.V muX muY = ffold (FF.biExpectation fst snd) xy + FF.V muX muY = foldf (FF.biExpectation fst snd) xy {-# SPECIALIZE correlation :: U.Vector (Double,Double) -> Double #-} {-# SPECIALIZE correlation :: V.Vector (Double,Double) -> Double #-} @@ -372,7 +388,7 @@ covariance2 :: (G.Vector v Double) -> v Double -> Double covariance2 xs ys = - pfold "Statistics.Sample.covariance2: both samples must have same length" + pairFold "Statistics.Sample.covariance2: both samples must have same length" (FF.covariance (muX, muY)) xs ys where @@ -389,12 +405,9 @@ correlation2 :: (G.Vector v Double) -> v Double -> Double correlation2 xs ys = - pfold "Statistics.Sample.correlation2: both samples must have same length" + pairFold "Statistics.Sample.correlation2: both samples must have same length" (FF.correlation (muX, muY)) xs ys - -- | nx /= ny = error $ "Statistics.Sample.correlation2: both samples must have same length" - -- | nx == 0 = 0 - -- | otherwise = cov / sqrt (varX * varY) where muX = mean xs muY = mean ys @@ -411,32 +424,6 @@ 
pair va vb | otherwise = error "Statistics.Sample.pair: vector must have same length" {-# INLINE pair #-} ------------------------------------------------------------------------- --- Helper code. Monomorphic unpacked accumulators. - --- don't support polymorphism, as we can't get unboxed returns if we use it. - -{- - -Consider this core: - -with data T a = T !a !Int - -$wfold :: Double# - -> Int# - -> Int# - -> (# Double, Int# #) - -and without, - -$wfold :: Double# - -> Int# - -> Int# - -> (# Double#, Int# #) - -yielding to boxed returns and heap checks. - --} -- $references -- From 8a2e350f489d4f46435a7ec2aeff59a1d5c2047e Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Sat, 9 Aug 2025 16:32:14 +1000 Subject: [PATCH 12/20] Export F.fold --- Statistics/Sample/Fold.hs | 1 + 1 file changed, 1 insertion(+) diff --git a/Statistics/Sample/Fold.hs b/Statistics/Sample/Fold.hs index de789f5..3fa744b 100644 --- a/Statistics/Sample/Fold.hs +++ b/Statistics/Sample/Fold.hs @@ -114,6 +114,7 @@ module Statistics.Sample.Fold , covariance , correlation -- * Strict types and helpers + , F.fold , V(..) 
, biExpectation , kbnSum From dc8abb0b603e15327d20c7268cf63726ad78acea Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Sat, 9 Aug 2025 17:25:21 +1000 Subject: [PATCH 13/20] Don't export V --- Statistics/Sample.hs | 11 +++++------ Statistics/Sample/Fold.hs | 13 ++++++------- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/Statistics/Sample.hs b/Statistics/Sample.hs index 1dd0829..e117256 100644 --- a/Statistics/Sample.hs +++ b/Statistics/Sample.hs @@ -313,10 +313,9 @@ stdErrMean samp = foldf (FF.stdErrMean m) samp {-# SPECIALIZE stdErrMean :: U.Vector Double -> Double #-} {-# SPECIALIZE stdErrMean :: V.Vector Double -> Double #-} -robustSumVarWeighted :: (G.Vector v (Double,Double)) => v (Double,Double) -> FF.V -robustSumVarWeighted samp = foldf (fmap fini $ FF.robustSumVarWeighted m) samp +robustSumVarWeighted :: (G.Vector v (Double,Double)) => v (Double,Double) -> (Double, Double) +robustSumVarWeighted samp = foldf (FF.robustSumVarWeighted m) samp where m = meanWeighted samp - fini (FF.V a b) = FF.V a b {-# INLINE robustSumVarWeighted #-} -- | Weighted variance. This is biased estimation. 
@@ -325,7 +324,7 @@ varianceWeighted samp | G.length samp > 1 = fini $ robustSumVarWeighted samp | otherwise = 0 where - fini (FF.V s w) = s / w + fini (s, w) = s / w {-# SPECIALIZE varianceWeighted :: U.Vector (Double,Double) -> Double #-} {-# SPECIALIZE varianceWeighted :: V.Vector (Double,Double) -> Double #-} @@ -364,7 +363,7 @@ covariance :: (G.Vector v (Double,Double)) -> Double covariance xy = foldf (FF.covariance (muX, muY)) xy where - FF.V muX muY = foldf (FF.biExpectation fst snd) xy + (muX, muY) = foldf (FF.biExpectation fst snd) xy {-# SPECIALIZE covariance :: U.Vector (Double,Double) -> Double #-} {-# SPECIALIZE covariance :: V.Vector (Double,Double) -> Double #-} @@ -375,7 +374,7 @@ correlation :: (G.Vector v (Double,Double)) -> Double correlation xy = foldf (FF.correlation (muX, muY)) xy where - FF.V muX muY = foldf (FF.biExpectation fst snd) xy + (muX, muY) = foldf (FF.biExpectation fst snd) xy {-# SPECIALIZE correlation :: U.Vector (Double,Double) -> Double #-} {-# SPECIALIZE correlation :: V.Vector (Double,Double) -> Double #-} diff --git a/Statistics/Sample/Fold.hs b/Statistics/Sample/Fold.hs index 3fa744b..a35f858 100644 --- a/Statistics/Sample/Fold.hs +++ b/Statistics/Sample/Fold.hs @@ -115,7 +115,6 @@ module Statistics.Sample.Fold , correlation -- * Strict types and helpers , F.fold - , V(..) 
, biExpectation , kbnSum , kbnSum' @@ -436,8 +435,8 @@ stdErrMean :: Double -> F.Fold Double Double stdErrMean m = stdDev m / sqrt doubleLength {-# INLINE stdErrMean #-} -robustSumVarWeighted :: Double -> F.Fold (Double,Double) V -robustSumVarWeighted wm = F.Fold go (V 0 0) id +robustSumVarWeighted :: Double -> F.Fold (Double,Double) (Double, Double) +robustSumVarWeighted wm = F.Fold go (V 0 0) (\(V a b) -> (a,b)) where go (V s w) (x,xw) = V (s + xw*d*d) (w + xw) where d = x - wm @@ -447,7 +446,7 @@ robustSumVarWeighted wm = F.Fold go (V 0 0) id varianceWeighted :: Double -> F.Fold (Double,Double) Double varianceWeighted wm = fini <$> robustSumVarWeighted wm where - fini (V s w) = s / w + fini (s, w) = s / w {-# INLINE varianceWeighted #-} -- $cancellation @@ -493,9 +492,9 @@ fastStdDev = sqrt fastVariance {-# INLINE fastStdDev #-} -- | Compute two expectations at once, to avoid computing the length twice. -biExpectation :: (a -> Double) -> (a -> Double) -> F.Fold a V +biExpectation :: (a -> Double) -> (a -> Double) -> F.Fold a (Double, Double) biExpectation f s = fini <$> doubleLength <*> F.premap f kbnSum <*> F.premap s kbnSum - where fini n a b = V (a/n) (b/n) + where fini n a b = (a/n, b/n) {-# INLINE biExpectation #-} -- | Covariance of sample of pairs. 
For empty sample it's set to @@ -515,7 +514,7 @@ correlation :: (Double, Double) -> F.Fold (Double,Double) Double correlation (muX, muY) = fini <$> F.length <*> covF <*> varsF where - fini n cov (V varX varY) + fini n cov (varX, varY) | n == 0 = 0 | otherwise = cov / sqrt (varX * varY) covF = expectation (\(x,y) -> (x - muX)*(y - muY)) From ca2a7c340bfe873f39962106e2a4ce7de36ce316 Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Sat, 9 Aug 2025 17:25:49 +1000 Subject: [PATCH 14/20] Add LMVSK types and folds --- Statistics/Sample/Fold.hs | 152 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) diff --git a/Statistics/Sample/Fold.hs b/Statistics/Sample/Fold.hs index a35f858..d50c72b 100644 --- a/Statistics/Sample/Fold.hs +++ b/Statistics/Sample/Fold.hs @@ -113,6 +113,17 @@ module Statistics.Sample.Fold -- * Joint distributions , covariance , correlation + + -- ** Single pass Length, Mean, Variance, Skewness and Kurtosis + -- $lmvsk + , LMVSK(..) + , LMVSKState + , fastLMVSK + , fastLMVSKu + , foldLMVSKState + , getLMVSK + , getLMVSKu + -- * Strict types and helpers , F.fold , biExpectation @@ -522,6 +533,144 @@ correlation (muX, muY) = (\(_,y) -> square (y - muY)) {-# INLINE correlation #-} +-- $lmvsk +-- +-- 'LMVSK' represents the Length, Mean, Variance, Skewness +-- and Kurtosis of a sample. If you need to compute several +-- of these statistics at once, using these folds may be more +-- efficient than combining the 'Fold's above Applicatively. +-- +-- The algorithm used is similar to the found in 'fastVariance' etc. +-- but common subexpressions are shared. + +data LMVSK = LMVSK + { lmvskLength :: {-# UNPACK #-}!Int + , lmvskMean :: {-# UNPACK #-}!Double + , lmvskVariance :: {-# UNPACK #-}!Double + , lmvskSkewness :: {-# UNPACK #-}!Double + , lmvskKurtosis :: {-# UNPACK #-}!Double + } deriving (Show, Eq) + +-- | Intermediate state for computing 'LMVSK'. 
This type's 'Semigroup' +-- instance allows it to be computed in parallel and combined +newtype LMVSKState = LMVSKState LMVSK + +instance Monoid LMVSKState where + {-# INLINE mempty #-} + mempty = LMVSKState lmvskZero + {-# INLINE mappend #-} + mappend = (<>) + +instance Semigroup LMVSKState where + {-# INLINE (<>) #-} + (LMVSKState (LMVSK an am1 am2 am3 am4)) <> (LMVSKState (LMVSK bn bm1 bm2 bm3 bm4)) + = LMVSKState (LMVSK n m1 m2 m3 m4) where + fi :: Int -> Double + fi = fromIntegral + -- combined.n = a.n + b.n; + n = an+bn + n2 = n*n + nd = fi n + nda = fi an + ndb = fi bn + -- delta = b.M1 - a.M1; + delta = bm1 - am1 + -- delta2 = delta*delta; + delta2 = delta*delta + -- delta3 = delta*delta2; + delta3 = delta*delta2 + -- delta4 = delta2*delta2; + delta4 = delta2*delta2 + -- combined.M1 = (a.n*a.M1 + b.n*b.M1) / combined.n; + m1 = (nda*am1 + ndb*bm1 ) / nd + -- combined.M2 = a.M2 + b.M2 + delta2*a.n*b.n / combined.n; + m2 = am2 + bm2 + delta2*nda*ndb / nd + -- combined.M3 = a.M3 + b.M3 + delta3*a.n*b.n* (a.n - b.n)/(combined.n*combined.n); + m3 = am3 + bm3 + delta3*nda*ndb* fi( an - bn )/ fi n2 + -- combined.M3 += 3.0*delta * (a.n*b.M2 - b.n*a.M2) / combined.n; + + 3.0*delta * (nda*bm2 - ndb*am2 ) / nd + -- + -- combined.M4 = a.M4 + b.M4 + delta4*a.n*b.n * (a.n*a.n - a.n*b.n + b.n*b.n) /(combined.n*combined.n*combined.n); + m4 = am4 + bm4 + delta4*nda*ndb *fi(an*an - an*bn + bn*bn ) / fi (n*n*n) + -- combined.M4 += 6.0*delta2 * (a.n*a.n*b.M2 + b.n*b.n*a.M2)/(combined.n*combined.n) + + + 6.0*delta2 * (nda*nda*bm2 + ndb*ndb*am2) / fi n2 + -- 4.0*delta*(a.n*b.M3 - b.n*a.M3) / combined.n; + + 4.0*delta*(nda*bm3 - ndb*am3) / nd + +-- | Efficiently compute the +-- __length, mean, variance, skewness and kurtosis__ with a single pass. +{-# INLINE fastLMVSK #-} +fastLMVSK :: F.Fold Double LMVSK +fastLMVSK = getLMVSK <$> foldLMVSKState + +-- | Efficiently compute the +-- __length, mean, unbiased variance, skewness and kurtosis__ with a single pass. 
+{-# INLINE fastLMVSKu #-}
+fastLMVSKu :: F.Fold Double LMVSK
+fastLMVSKu = getLMVSKu <$> foldLMVSKState
+
+-- | Initial 'LMVSKState'
+{-# INLINE lmvskZero #-}
+lmvskZero :: LMVSK
+lmvskZero = LMVSK 0 0 0 0 0
+
+-- | Performs the heavy lifting of fastLMVSK. This is exposed
+-- because the internal `LMVSKState` is monoidal, allowing you
+-- to run these statistics in parallel over datasets which are
+-- split and then combine the results.
+{-# INLINE foldLMVSKState #-}
+foldLMVSKState :: F.Fold Double LMVSKState
+foldLMVSKState = F.Fold stepLMVSKState (LMVSKState lmvskZero) id
+
+{-# INLINE stepLMVSKState #-}
+stepLMVSKState :: LMVSKState -> Double -> LMVSKState
+stepLMVSKState (LMVSKState (LMVSK n1 m1 m2 m3 m4)) x = LMVSKState $ LMVSK n m1' m2' m3' m4' where
+  fi :: Int -> Double
+  fi = fromIntegral
+  -- long long n1 = n;
+  -- n++;
+  n = n1+1
+  -- delta = x - M1;
+  delta = x - m1
+  -- delta_n = delta / n;
+  delta_n = delta / fi n
+  -- delta_n2 = delta_n * delta_n;
+  delta_n2 = delta_n * delta_n
+  -- term1 = delta * delta_n * n1;
+  term1 = delta * delta_n * fi n1
+  -- M1 += delta_n;
+  m1' = m1 + delta_n
+  -- M4 += term1 * delta_n2 * (n*n - 3*n + 3) + 6 * delta_n2 * M2 - 4 * delta_n * M3;
+  m4' = m4 + term1 * delta_n2 * fi (n*n - 3*n + 3) + 6 * delta_n2 * m2 - 4 * delta_n * m3
+  -- M3 += term1 * delta_n * (n - 2) - 3 * delta_n * M2;
+  m3' = m3 + term1 * delta_n * fi (n - 2) - 3 * delta_n * m2
+  -- M2 += term1;
+  m2' = m2 + term1
+
+-- | Returns the stats which have been computed in a LMVSKState with /population/ variance.
+getLMVSK :: LMVSKState -> LMVSK
+getLMVSK (LMVSKState (LMVSK n m1 m2 m3 m4)) = LMVSK n m1 m2' m3' m4' where
+  nd = fromIntegral n
+  -- M2/n (population variance; the unbiased M2/(n-1.0) form lives in 'getLMVSKu')
+  m2' = m2 / nd
+  -- sqrt(double(n)) * M3/ pow(M2, 1.5)
+  m3' = sqrt nd * m3 / (m2 ** 1.5)
+  -- double(n)*M4 / (M2*M2) - 3.0
+  m4' = nd*m4 / (m2*m2) - 3.0
+
+-- | Returns the stats which have been computed in a LMVSKState,
+-- with the /sample/ variance.
+getLMVSKu :: LMVSKState -> LMVSK +getLMVSKu (LMVSKState (LMVSK n m1 m2 m3 m4)) = LMVSK n m1 m2' m3' m4' where + nd = fromIntegral n + -- M2/(n-1.0) + m2' = m2 / (nd-1) + -- sqrt(double(n)) * M3/ pow(M2, 1.5) + m3' = sqrt nd * m3 / (m2 ** 1.5) + -- double(n)*M4 / (M2*M2) - 3.0 + m4' = nd*m4 / (m2*m2) - 3.0 + + -------- -- $references @@ -542,3 +691,6 @@ correlation (muX, muY) = -- * West, D.H.D. (1979) Updating mean and variance estimates: an -- improved method. /Communications of the ACM/ -- 22(9):532–535. +-- +-- * John D. Cook. Computing skewness and kurtosis in one pass +-- \ No newline at end of file From fe938167eff18ae5d6dbac5cb44634f1b4e9719d Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Sat, 9 Aug 2025 17:39:23 +1000 Subject: [PATCH 15/20] Add benchmark for LMVSK comparing to separate calculation --- Statistics/Sample/Fold.hs | 6 ++++++ benchmark/Main.hs | 13 +++++++++++++ statistics.cabal | 1 + 3 files changed, 20 insertions(+) diff --git a/Statistics/Sample/Fold.hs b/Statistics/Sample/Fold.hs index d50c72b..aa40e5a 100644 --- a/Statistics/Sample/Fold.hs +++ b/Statistics/Sample/Fold.hs @@ -1,5 +1,6 @@ {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE BangPatterns #-} +{-# LANGUAGE DerivingVia #-} -- | -- Module : Statistics.Sample.Fold -- Copyright : (c) 2008 Don Stewart, 2009 Bryan O'Sullivan, 2025 Alex Mason @@ -143,6 +144,7 @@ import qualified Control.Foldl as F -- Operator ^ will be overridden import Prelude hiding ((^), sum) +import Control.DeepSeq (NFData(..)) ------------------------------------------------------------------------ @@ -551,9 +553,13 @@ data LMVSK = LMVSK , lmvskKurtosis :: {-# UNPACK #-}!Double } deriving (Show, Eq) +instance NFData LMVSK where + rnf !_ = () + -- | Intermediate state for computing 'LMVSK'. 
This type's 'Semigroup' -- instance allows it to be computed in parallel and combined newtype LMVSKState = LMVSKState LMVSK + deriving NFData via LMVSK instance Monoid LMVSKState where {-# INLINE mempty #-} diff --git a/benchmark/Main.hs b/benchmark/Main.hs index c305617..f472016 100644 --- a/benchmark/Main.hs +++ b/benchmark/Main.hs @@ -2,6 +2,8 @@ module Main where import Data.Complex import Statistics.Sample +import qualified Statistics.Sample.Fold as F +import qualified Control.Foldl as CF import Statistics.Transform import Statistics.Correlation import System.Random.MWC @@ -24,6 +26,15 @@ sampleW = VU.zip sample (VU.reverse sample) sampleC :: VU.Vector (Complex Double) sampleC = VU.zipWith (:+) sample (VU.reverse sample) +separateLMVSK :: CF.Fold Double F.LMVSK +separateLMVSK = + F.LMVSK + <$> CF.length + <*> F.mean + <*> F.fastVariance + <*> F.centralMoment 2 0 + <*> F.centralMoment 3 0 -- fudge the mean because we don't have a fast skewness or kurtosis + -- Simple benchmark for functions from Statistics.Sample main :: IO () @@ -55,6 +66,8 @@ main = , bench "C.M. 3" $ nf (\x -> centralMoment 3 x) sample , bench "C.M. 4" $ nf (\x -> centralMoment 4 x) sample , bench "C.M. 
5" $ nf (\x -> centralMoment 5 x) sample + , bench "Applicative LMVSK" $ nf (\x -> foldf separateLMVSK x) sample + , bench "fastLMVSK" $ nf (\x -> foldf F.fastLMVSK x) sample ] , bgroup "FFT" [ bgroup "fft" diff --git a/statistics.cabal b/statistics.cabal index 2444791..ac61876 100644 --- a/statistics.cabal +++ b/statistics.cabal @@ -228,6 +228,7 @@ benchmark statistics-bench main-is: Main.hs Other-modules: Bench build-depends: tasty-bench >= 0.3 + , foldl benchmark statistics-bench-papi import: bench-stanza From 3d2d2454d835ca830cdd577b7f68c90b86c9d54f Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Sat, 9 Aug 2025 22:05:35 +1000 Subject: [PATCH 16/20] Use Main.hs for papi bench --- statistics.cabal | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/statistics.cabal b/statistics.cabal index ac61876..c02cc32 100644 --- a/statistics.cabal +++ b/statistics.cabal @@ -236,6 +236,6 @@ benchmark statistics-bench-papi if impl(ghcjs) || !flag(BenchPAPI) buildable: False hs-source-dirs: benchmark bench-papi - main-is: bench.hs + main-is: Main.hs Other-modules: Bench build-depends: tasty-papi >= 0.1.2 From 75f8f6bc34cb6ecec8d7e03a9e788d7081429bde Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Sat, 9 Aug 2025 23:10:00 +1000 Subject: [PATCH 17/20] Remove unused extensions --- Statistics/Sample/Fold.hs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/Statistics/Sample/Fold.hs b/Statistics/Sample/Fold.hs index aa40e5a..c34894c 100644 --- a/Statistics/Sample/Fold.hs +++ b/Statistics/Sample/Fold.hs @@ -1,6 +1,4 @@ -{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE BangPatterns #-} -{-# LANGUAGE DerivingVia #-} -- | -- Module : Statistics.Sample.Fold -- Copyright : (c) 2008 Don Stewart, 2009 Bryan O'Sullivan, 2025 Alex Mason @@ -553,13 +551,13 @@ data LMVSK = LMVSK , lmvskKurtosis :: {-# UNPACK #-}!Double } deriving (Show, Eq) -instance NFData LMVSK where - rnf !_ = () -- | Intermediate state for computing 'LMVSK'. 
This type's 'Semigroup' -- instance allows it to be computed in parallel and combined newtype LMVSKState = LMVSKState LMVSK - deriving NFData via LMVSK + +instance NFData LMVSK where rnf !_ = () +instance NFData LMVSKState where rnf !_ = () instance Monoid LMVSKState where {-# INLINE mempty #-} From 04c6a4386455d40f67301dc213cd75c7de8eb7f5 Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Sat, 9 Aug 2025 23:22:56 +1000 Subject: [PATCH 18/20] Remove bang pattern in (^) --- Statistics/Sample/Fold.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Statistics/Sample/Fold.hs b/Statistics/Sample/Fold.hs index c34894c..735c6ab 100644 --- a/Statistics/Sample/Fold.hs +++ b/Statistics/Sample/Fold.hs @@ -150,7 +150,7 @@ import Control.DeepSeq (NFData(..)) -- (^) operator from Prelude is just slow. (^) :: Double -> Int -> Double -!x ^ 1 = x +x ^ 1 = x x ^ 2 = x * x x ^ 3 = x * x * x x ^ 4 = (x * x) * (x * x) From 5b7ca9ef085c7c4191aa0ac39f1b62b8aed6ae76 Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Sat, 9 Aug 2025 23:25:07 +1000 Subject: [PATCH 19/20] Fix doctests --- Statistics/Sample/Fold.hs | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/Statistics/Sample/Fold.hs b/Statistics/Sample/Fold.hs index 735c6ab..622c428 100644 --- a/Statistics/Sample/Fold.hs +++ b/Statistics/Sample/Fold.hs @@ -219,15 +219,17 @@ kbnSum = summationVia kbn -- === __Examples:__ -- >>> xs = [1,2,pi,1e-9] -- >>> ys = [4,5,2*pi,1e-11] --- >>> F.fold kbnSum' xs <> F.fold kbnSum' ys +-- >>> r1 = F.fold kbnSum' xs <> F.fold kbnSum' ys +-- >>> r1 -- KBNSum 21.424777961779377 2.580967484452971e-15 -- --- >>> kbn it +-- >>> kbn r1 -- 21.42477796177938 --- >>> F.fold kbnSum' (xs ++ ys) +-- >>> r2 = F.fold kbnSum' (xs ++ ys) +-- >>> r2 -- KBNSum 21.42477796177938 (-9.7174619434753e-16) -- --- >>> kbn it +-- >>> kbn r2 -- 21.42477796177938 kbnSum' :: F.Fold Double KBNSum kbnSum' = summation @@ -697,4 +699,4 @@ getLMVSKu (LMVSKState (LMVSK n m1 m2 m3 m4)) = LMVSK n 
m1 m2' m3' m4' where -- 22(9):532–535. -- -- * John D. Cook. Computing skewness and kurtosis in one pass --- \ No newline at end of file +-- From 8e3b0af6b733852835bf72705b7772a90bff94ca Mon Sep 17 00:00:00 2001 From: Alex Mason Date: Sat, 9 Aug 2025 23:30:41 +1000 Subject: [PATCH 20/20] Add Applicative imports for old GHCs --- Statistics/Sample/Fold.hs | 1 + 1 file changed, 1 insertion(+) diff --git a/Statistics/Sample/Fold.hs b/Statistics/Sample/Fold.hs index 622c428..b85424b 100644 --- a/Statistics/Sample/Fold.hs +++ b/Statistics/Sample/Fold.hs @@ -142,6 +142,7 @@ import qualified Control.Foldl as F -- Operator ^ will be overridden import Prelude hiding ((^), sum) +import Control.Applicative (liftA2, (<$>), (<*>)) import Control.DeepSeq (NFData(..))