{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE FlexibleContexts #-}

module Statistics.LinearRegression (
    -- * Simple linear regression functions
    linearRegression,
    linearRegressionRSqr,
    linearRegressionTLS,
    -- * Related functions
    correl,
    covar,
    -- * Estimated errors and distribution parameters
    linearRegressionMSE,
    linearRegressionDistributions,
    -- * Robust linear regression
    robustFit,
    nonRandomRobustFit,
    robustFitRSqr,
    -- ** Related types
    EstimationParameters(..),
    ErrorFunction,
    Estimator,
    EstimatedRelation,
    -- ** Provided values
    defaultEstimationParameters,
    linearRegressionError,
    linearRegressionTLSError,
    -- ** Helper functions
    converge,
    -- * References
    -- $references
    ) where

import qualified Data.Vector.Generic as G
import qualified Data.Vector.Unboxed as U
import Data.Vector.Generic (Vector, (!))
import Safe (at)
import System.Random
import System.Random.Shuffle (shuffleM)
import Control.Monad.Random.Class
import Control.Monad.Random (evalRand)
import Control.Monad (liftM)
import Data.Function (on)
import Data.List (minimumBy, sortBy)
import Data.Maybe (fromMaybe)
import qualified Statistics.Sample as S
import qualified Statistics.Distribution as D
import qualified Statistics.Distribution.Transform as T
import qualified Statistics.Distribution.StudentT as ST

--- * Simple linear regression

-- | Covariance of two samples
covar :: Vector v Double => v Double -> v Double -> Double
covar :: v Double -> v Double -> Double
covar v Double
xs v Double
ys = Double -> Double -> Double -> v Double -> v Double -> Double
forall (v :: * -> *).
Vector v Double =>
Double -> Double -> Double -> v Double -> v Double -> Double
covar' Double
m1 Double
m2 Double
n v Double
xs v Double
ys
    where
          !n :: Double
n = Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Double) -> Int -> Double
forall a b. (a -> b) -> a -> b
$ v Double -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
G.length v Double
xs
          !m1 :: Double
m1 = v Double -> Double
forall (v :: * -> *). Vector v Double => v Double -> Double
S.mean v Double
xs
          !m2 :: Double
m2 = v Double -> Double
forall (v :: * -> *). Vector v Double => v Double -> Double
S.mean v Double
ys
{-# INLINE covar #-}

-- internal function that avoids duplicate calculation of means and lengths where possible
-- Note: trying to make the calculation even more efficient by subtracting m1*m1*n instead of individual subtractions increased errors, probably due to rounding issues.
covar' :: Vector v Double => Double -> Double -> Double -> v Double -> v Double -> Double
covar' :: Double -> Double -> Double -> v Double -> v Double -> Double
covar' Double
m1 Double
m2 Double
n v Double
xs v Double
ys = v Double -> Double
forall (v :: * -> *) a. (Vector v a, Num a) => v a -> a
G.sum ((Double -> Double -> Double) -> v Double -> v Double -> v Double
forall (v :: * -> *) a b c.
(Vector v a, Vector v b, Vector v c) =>
(a -> b -> c) -> v a -> v b -> v c
G.zipWith Double -> Double -> Double
forall a. Num a => a -> a -> a
(*) ((Double -> Double) -> v Double -> v Double
forall (v :: * -> *) a b.
(Vector v a, Vector v b) =>
(a -> b) -> v a -> v b
G.map (Double -> Double -> Double
forall a. Num a => a -> a -> a
subtract Double
m1) v Double
xs) ((Double -> Double) -> v Double -> v Double
forall (v :: * -> *) a b.
(Vector v a, Vector v b) =>
(a -> b) -> v a -> v b
G.map (Double -> Double -> Double
forall a. Num a => a -> a -> a
subtract Double
m2) v Double
ys)) Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ (Double
nDouble -> Double -> Double
forall a. Num a => a -> a -> a
-Double
1)
{-# INLINE covar' #-}

-- | Pearson's product-moment correlation coefficient
correl :: Vector v Double => v Double -> v Double -> Double
correl :: v Double -> v Double -> Double
correl v Double
xs v Double
ys = let !c :: Double
c = v Double -> v Double -> Double
forall (v :: * -> *).
Vector v Double =>
v Double -> v Double -> Double
covar v Double
xs v Double
ys
                   !sx :: Double
sx = v Double -> Double
forall (v :: * -> *). Vector v Double => v Double -> Double
S.stdDev v Double
xs
                   !sy :: Double
sy = v Double -> Double
forall (v :: * -> *). Vector v Double => v Double -> Double
S.stdDev v Double
ys
               in Double
c Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ (Double
sx Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
sy)
{-# INLINE correl #-}

-- | Simple linear regression between 2 samples.
--   Takes two vectors Y={yi} and X={xi} and returns
--   (alpha, beta, r*r) such that Y = alpha + beta*X
--   and where r is the Pearson product-moment correlation
--   coefficient
linearRegressionRSqr :: Vector v Double => v Double -> v Double -> (Double, Double, Double)
linearRegressionRSqr :: v Double -> v Double -> (Double, Double, Double)
linearRegressionRSqr v Double
xs v Double
ys = (Double
alpha, Double
beta, Double
r2)
    where 
          !c :: Double
c                   = Double -> Double -> Double -> v Double -> v Double -> Double
forall (v :: * -> *).
Vector v Double =>
Double -> Double -> Double -> v Double -> v Double -> Double
covar' Double
m1 Double
m2 Double
n v Double
xs v Double
ys
          !r2 :: Double
r2                  = Double
cDouble -> Double -> Double
forall a. Num a => a -> a -> a
*Double
c Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ (Double
v1Double -> Double -> Double
forall a. Num a => a -> a -> a
*Double
v2)
          !(Double
m1,Double
v1)             = v Double -> (Double, Double)
forall (v :: * -> *).
Vector v Double =>
v Double -> (Double, Double)
S.meanVarianceUnb v Double
xs 
          !(Double
m2,Double
v2)             = v Double -> (Double, Double)
forall (v :: * -> *).
Vector v Double =>
v Double -> (Double, Double)
S.meanVarianceUnb v Double
ys
          !n :: Double
n                   = Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Double) -> Int -> Double
forall a b. (a -> b) -> a -> b
$ v Double -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
G.length v Double
xs
          !beta :: Double
beta                = Double
c Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double
v1
          !alpha :: Double
alpha               = Double
m2 Double -> Double -> Double
forall a. Num a => a -> a -> a
- Double
beta Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
m1
{-# INLINE linearRegressionRSqr #-}
          
-- | Simple linear regression between 2 samples.
--   Takes two vectors Y={yi} and X={xi} and returns
--   (alpha, beta) such that Y = alpha + beta*X          
linearRegression :: Vector v Double => v Double -> v Double -> (Double, Double)
linearRegression :: v Double -> v Double -> (Double, Double)
linearRegression v Double
xs v Double
ys = (Double
alpha, Double
beta)
    where 
        (Double
alpha, Double
beta, Double
_) = v Double -> v Double -> (Double, Double, Double)
forall (v :: * -> *).
Vector v Double =>
v Double -> v Double -> (Double, Double, Double)
linearRegressionRSqr v Double
xs v Double
ys
{-# INLINE linearRegression #-}

-- | The error (or residual) mean square of a sample w.r.t. an estimated regression line.
--   This serves as an estimate for the variance of the sampled data.
--   Accepts the regression parameters (alpha,beta) and the sample vectors X and Y.
linearRegressionMSE :: (Vector v Double, Vector v (Double, Double)) => (Double,Double) -> v Double -> v Double -> Double
linearRegressionMSE :: (Double, Double) -> v Double -> v Double -> Double
linearRegressionMSE (Double, Double)
ab v Double
xs v Double
ys = (v Double -> Double
forall (v :: * -> *) a. (Vector v a, Num a) => v a -> a
G.sum (v Double -> Double)
-> (v Double -> v Double) -> v Double -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ((Double, Double) -> Double) -> v (Double, Double) -> v Double
forall (v :: * -> *) a b.
(Vector v a, Vector v b) =>
(a -> b) -> v a -> v b
G.map (ErrorFunction
linearRegressionError (Double, Double)
ab) (v (Double, Double) -> v Double)
-> (v Double -> v (Double, Double)) -> v Double -> v Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. v Double -> v Double -> v (Double, Double)
forall (v :: * -> *) a b.
(Vector v a, Vector v b, Vector v (a, b)) =>
v a -> v b -> v (a, b)
G.zip v Double
xs (v Double -> Double) -> v Double -> Double
forall a b. (a -> b) -> a -> b
$ v Double
ys)Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/(Double
nDouble -> Double -> Double
forall a. Num a => a -> a -> a
-Double
2)
    where
        !n :: Double
n = Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Double) -> Int -> Double
forall a b. (a -> b) -> a -> b
$ v Double -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
G.length v Double
xs

-- | The estimated distributions of the regression parameters (alpha and beta) assuming normal, identical distributions of Y, the sampled data.
-- These can serve to get confidence intervals for the regression parameters.
-- Accepts the regression parameters (alpha,beta) and the sample vectors X and Y.
-- The distributions are StudnetT distributions centered at the estimated (alpha,beta) respectively, with parameter numbers n-2 (where n is the initial sample size) and with standard deviations that are extracted from the sampled data based on its MSE. See chapter 2 of reference [3] for details.
linearRegressionDistributions :: (Vector v Double, Vector v (Double, Double)) => (Double,Double) -> v Double -> v Double -> (T.LinearTransform ST.StudentT,T.LinearTransform ST.StudentT)
linearRegressionDistributions :: (Double, Double)
-> v Double
-> v Double
-> (LinearTransform StudentT, LinearTransform StudentT)
linearRegressionDistributions (Double
alpha,Double
beta) v Double
xs v Double
ys = (Double -> Double -> Double -> LinearTransform StudentT
ST.studentTUnstandardized (Double
nDouble -> Double -> Double
forall a. Num a => a -> a -> a
-Double
2) Double
alpha Double
va,Double -> Double -> Double -> LinearTransform StudentT
ST.studentTUnstandardized (Double
nDouble -> Double -> Double
forall a. Num a => a -> a -> a
-Double
2) Double
beta Double
vb)
    where
        !n :: Double
n = Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Double) -> Int -> Double
forall a b. (a -> b) -> a -> b
$ v Double -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
G.length v Double
xs
        !mse :: Double
mse = (Double, Double) -> v Double -> v Double -> Double
forall (v :: * -> *).
(Vector v Double, Vector v (Double, Double)) =>
(Double, Double) -> v Double -> v Double -> Double
linearRegressionMSE (Double
alpha,Double
beta) v Double
xs v Double
ys
        !vb :: Double
vb = Double
mseDouble -> Double -> Double
forall a. Fractional a => a -> a -> a
/(Double
xv)
        !mx :: Double
mx = v Double -> Double
forall (v :: * -> *). Vector v Double => v Double -> Double
S.mean v Double
xs
        !va :: Double
va = Double
mseDouble -> Double -> Double
forall a. Num a => a -> a -> a
*(Double
1Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/Double
nDouble -> Double -> Double
forall a. Num a => a -> a -> a
+Double
mxDouble -> Integer -> Double
forall a b. (Num a, Integral b) => a -> b -> a
^Integer
2Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/Double
xv)
        !xv :: Double
xv = v Double -> Double
forall (v :: * -> *) a. (Vector v a, Num a) => v a -> a
G.sum (v Double -> Double)
-> (v Double -> v Double) -> v Double -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Double -> Double) -> v Double -> v Double
forall (v :: * -> *) a b.
(Vector v a, Vector v b) =>
(a -> b) -> v a -> v b
G.map (\Double
x -> (Double
xDouble -> Double -> Double
forall a. Num a => a -> a -> a
-Double
mx)Double -> Integer -> Double
forall a b. (Num a, Integral b) => a -> b -> a
^Integer
2) (v Double -> Double) -> v Double -> Double
forall a b. (a -> b) -> a -> b
$ v Double
xs

-- | Total Least Squares (TLS) linear regression.
-- Assumes x-axis values (and not just y-axis values) are random variables and that both variables have similar distributions.
-- interface is the same as 'linearRegression'.
linearRegressionTLS :: Vector v Double => v Double -> v Double -> (Double,Double)
linearRegressionTLS :: v Double -> v Double -> (Double, Double)
linearRegressionTLS v Double
xs v Double
ys = (Double
alpha, Double
beta)
    where
          !c :: Double
c                   = Double -> Double -> Double -> v Double -> v Double -> Double
forall (v :: * -> *).
Vector v Double =>
Double -> Double -> Double -> v Double -> v Double -> Double
covar' Double
m1 Double
m2 Double
n v Double
xs v Double
ys
          !b :: Double
b                   = (Double
v1 Double -> Double -> Double
forall a. Num a => a -> a -> a
- Double
v2) Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double
c
          !(Double
m1,Double
v1)             = v Double -> (Double, Double)
forall (v :: * -> *).
Vector v Double =>
v Double -> (Double, Double)
S.meanVarianceUnb v Double
xs 
          !(Double
m2,Double
v2)             = v Double -> (Double, Double)
forall (v :: * -> *).
Vector v Double =>
v Double -> (Double, Double)
S.meanVarianceUnb v Double
ys
          !n :: Double
n                   = Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Double) -> Int -> Double
forall a b. (a -> b) -> a -> b
$ v Double -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
G.length v Double
xs
          !betas :: [Double]
betas               = [(-Double
b Double -> Double -> Double
forall a. Num a => a -> a -> a
- Double -> Double
forall a. Floating a => a -> a
sqrt(Double
bDouble -> Integer -> Double
forall a b. (Num a, Integral b) => a -> b -> a
^Integer
2Double -> Double -> Double
forall a. Num a => a -> a -> a
+Double
4))Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/Double
2,(-Double
b Double -> Double -> Double
forall a. Num a => a -> a -> a
+ Double -> Double
forall a. Floating a => a -> a
sqrt(Double
bDouble -> Integer -> Double
forall a b. (Num a, Integral b) => a -> b -> a
^Integer
2Double -> Double -> Double
forall a. Num a => a -> a -> a
+Double
4)) Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/Double
2]
          !beta :: Double
beta                = if Double
c Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
> Double
0 then [Double] -> Double
forall (t :: * -> *) a. (Foldable t, Ord a) => t a -> a
maximum [Double]
betas else [Double] -> Double
forall (t :: * -> *) a. (Foldable t, Ord a) => t a -> a
minimum [Double]
betas
          !alpha :: Double
alpha               = Double
m2 Double -> Double -> Double
forall a. Num a => a -> a -> a
- Double
beta Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
m1
{-# INLINE linearRegressionTLS #-}

-- | An estimated linear relation between 2 samples is (alpha,beta) such that Y = alpha + beta*X.
type EstimatedRelation = (Double,Double)

-- | An 'Estimator' is a function that generates an estimated linear regression based on 2 samples. This module provides two estimator functions:
-- 'linearRegression' and 'linearRegressionTLS'
type Estimator = (S.Sample -> S.Sample -> EstimatedRelation)

-- | An 'ErrorFunction' is a function that computes the error of a given point from an estimate. This module provides two error functions correspoinding to the two 'Estimator' functions it defines:
-- 
-- * Vertical distance squared via 'linearRegressionError' that should be used with 'linearRegression'
-- 
-- * Total distance squared vie 'linearRegressionTLSError' that should be used with 'linearRegressionTLS'
type ErrorFunction = (EstimatedRelation -> (Double,Double) -> Double)

-- | The robust fit algorithm used has various parameters that can be specified using the 'EstimationParameters' record.
data EstimationParameters = EstimationParameters {
    -- | Maximal fraction of outliers expected in the sample (default 0.25)
    EstimationParameters -> Double
outlierFraction     :: !Double,
    -- | Number of concentration steps to take for initial evaluation of a solution (default 3)
    EstimationParameters -> Int
shortIterationSteps :: !Int,
    -- | Maximal number of sampled subsets (pairs of points) to use as starting points (default 500)
    EstimationParameters -> Int
maxSubsetsNum       :: !Int,
    -- | If the initial sample is large, and thus gets subdivided, this is the number of candidate-estimations to take from each subgroup, on which complete convergence will be executed (default 10)
    EstimationParameters -> Int
groupSubsets        :: !Int,
    -- | Maximal size of sample that can be analyzed without any sub-division (default 600)
    EstimationParameters -> Int
mediumSetSize       :: !Int,
    -- | Maximal size of sample that does not require two-step sub-division (see reference article) (default 1500)
    EstimationParameters -> Int
largeSetSize        :: !Int,
    -- | Estimator function to use (default linearRegression)
    EstimationParameters -> Estimator
estimator           :: Estimator,
    -- | ErrorFunction to use (default linearRegressionError)
    EstimationParameters -> ErrorFunction
errorFunction       :: ErrorFunction
    }

-- | Default set of parameters to use (see reference for details).
defaultEstimationParameters :: EstimationParameters
defaultEstimationParameters = EstimationParameters :: Double
-> Int
-> Int
-> Int
-> Int
-> Int
-> Estimator
-> ErrorFunction
-> EstimationParameters
EstimationParameters {
    outlierFraction :: Double
outlierFraction = Double
0.25,
    shortIterationSteps :: Int
shortIterationSteps = Int
3,
    maxSubsetsNum :: Int
maxSubsetsNum = Int
500,
    groupSubsets :: Int
groupSubsets = Int
10,
    mediumSetSize :: Int
mediumSetSize = Int
600,
    largeSetSize :: Int
largeSetSize = Int
1500,
    estimator :: Estimator
estimator = Estimator
forall (v :: * -> *).
Vector v Double =>
v Double -> v Double -> (Double, Double)
linearRegression,
    errorFunction :: ErrorFunction
errorFunction = ErrorFunction
linearRegressionError
}

-- | linearRegression error function is the square of the /vertical/ distance of a point from the line.
linearRegressionError :: ErrorFunction
linearRegressionError :: ErrorFunction
linearRegressionError (Double
alpha,Double
beta) (Double
x,Double
y) = (Double
yDouble -> Double -> Double
forall a. Num a => a -> a -> a
-(Double
betaDouble -> Double -> Double
forall a. Num a => a -> a -> a
*Double
xDouble -> Double -> Double
forall a. Num a => a -> a -> a
+Double
alpha))Double -> Integer -> Double
forall a b. (Num a, Integral b) => a -> b -> a
^Integer
2

-- | linearRegressionTLS error function is the square of the /total/ distance of a point from the line.
linearRegressionTLSError :: ErrorFunction
linearRegressionTLSError :: ErrorFunction
linearRegressionTLSError (Double
alpha,Double
beta) (Double
x,Double
y) = Double
eyDouble -> Double -> Double
forall a. Fractional a => a -> a -> a
/(Double
1Double -> Double -> Double
forall a. Num a => a -> a -> a
+Double
betaDouble -> Integer -> Double
forall a b. (Num a, Integral b) => a -> b -> a
^Integer
2)
    where
        ey :: Double
ey = ErrorFunction
linearRegressionError (Double
alpha,Double
beta) (Double
x,Double
y)

-- | Helper function to calculate the minimal expected size of uncontaminated data based on the maximal fraction of outliers.
setSize :: Vector v Double => EstimationParameters -> v Double -> Int
setSize :: EstimationParameters -> v Double -> Int
setSize EstimationParameters
ep v Double
xs = Int -> Int -> Int
forall a. Ord a => a -> a -> a
max (Int
n Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div` Int
2 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) (Int -> Int) -> (Double -> Int) -> Double -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Double -> Int
forall a b. (RealFrac a, Integral b) => a -> b
round (Double -> Int) -> Double -> Int
forall a b. (a -> b) -> a -> b
$ (Double
1Double -> Double -> Double
forall a. Num a => a -> a -> a
-EstimationParameters -> Double
outlierFraction EstimationParameters
ep) Double -> Double -> Double
forall a. Num a => a -> a -> a
* (Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n)
    where
        n :: Int
n = v Double -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
G.length v Double
xs

-- | Helper function that, given an initial estimated relation and the error of the perivous estimation, performs a "concentration" step, generating a new estimate based on a fraction of points laying closest to the previous estimate and estimates the error of the previous estimate based on the same fraction.
-- The result is an estimate that is at least as good as the previous one.
-- The reason the error is calculated for the previous parameters is calculation optimization.
concentrationStep :: Vector v Double => EstimationParameters -> v Double -> v Double -> (EstimatedRelation, Double) -> (EstimatedRelation, Double)
concentrationStep :: EstimationParameters
-> v Double
-> v Double
-> ((Double, Double), Double)
-> ((Double, Double), Double)
concentrationStep EstimationParameters
ep v Double
xs v Double
ys ((Double, Double)
prev, Double
prev_err) = ((Double, Double)
new_estimate, Double
new_err)
    where
        set_size :: Int
set_size = EstimationParameters -> v Double -> Int
forall (v :: * -> *).
Vector v Double =>
EstimationParameters -> v Double -> Int
setSize EstimationParameters
ep v Double
xs
        xyerrors :: [((Double, Double), Double)]
xyerrors = ((Double, Double) -> ((Double, Double), Double))
-> [(Double, Double)] -> [((Double, Double), Double)]
forall a b. (a -> b) -> [a] -> [b]
map (\(Double, Double)
p -> ((Double, Double)
p,EstimationParameters -> ErrorFunction
errorFunction EstimationParameters
ep (Double, Double)
prev (Double, Double)
p)) ([(Double, Double)] -> [((Double, Double), Double)])
-> [(Double, Double)] -> [((Double, Double), Double)]
forall a b. (a -> b) -> a -> b
$ [Double] -> [Double] -> [(Double, Double)]
forall a b. [a] -> [b] -> [(a, b)]
zip (v Double -> [Double]
forall (v :: * -> *) a. Vector v a => v a -> [a]
G.toList v Double
xs) (v Double -> [Double]
forall (v :: * -> *) a. Vector v a => v a -> [a]
G.toList v Double
ys)
        ([(Double, Double)]
xys,[Double]
errors) = [((Double, Double), Double)] -> ([(Double, Double)], [Double])
forall a b. [(a, b)] -> ([a], [b])
unzip ([((Double, Double), Double)] -> ([(Double, Double)], [Double]))
-> ([((Double, Double), Double)] -> [((Double, Double), Double)])
-> [((Double, Double), Double)]
-> ([(Double, Double)], [Double])
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> [((Double, Double), Double)] -> [((Double, Double), Double)]
forall a. Int -> [a] -> [a]
take Int
set_size ([((Double, Double), Double)] -> [((Double, Double), Double)])
-> ([((Double, Double), Double)] -> [((Double, Double), Double)])
-> [((Double, Double), Double)]
-> [((Double, Double), Double)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (((Double, Double), Double)
 -> ((Double, Double), Double) -> Ordering)
-> [((Double, Double), Double)] -> [((Double, Double), Double)]
forall a. (a -> a -> Ordering) -> [a] -> [a]
sortBy (Double -> Double -> Ordering
forall a. Ord a => a -> a -> Ordering
compare (Double -> Double -> Ordering)
-> (((Double, Double), Double) -> Double)
-> ((Double, Double), Double)
-> ((Double, Double), Double)
-> Ordering
forall b c a. (b -> b -> c) -> (a -> b) -> a -> a -> c
`on` ((Double, Double), Double) -> Double
forall a b. (a, b) -> b
snd) ([((Double, Double), Double)] -> ([(Double, Double)], [Double]))
-> [((Double, Double), Double)] -> ([(Double, Double)], [Double])
forall a b. (a -> b) -> a -> b
$ [((Double, Double), Double)]
xyerrors
        ([Double]
good_xs,[Double]
good_ys) = [(Double, Double)] -> ([Double], [Double])
forall a b. [(a, b)] -> ([a], [b])
unzip [(Double, Double)]
xys
        new_estimate :: (Double, Double)
new_estimate = EstimationParameters -> Estimator
estimator EstimationParameters
ep ([Double] -> Vector Double
forall (v :: * -> *) a. Vector v a => [a] -> v a
G.fromList [Double]
good_xs) ([Double] -> Vector Double
forall (v :: * -> *) a. Vector v a => [a] -> v a
G.fromList [Double]
good_ys)
        new_err :: Double
new_err = [Double] -> Double
forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
sum [Double]
errors

-- | Infinite set of consecutive concentration steps.
concentration :: Vector v Double => EstimationParameters -> v Double -> v Double -> EstimatedRelation -> [(EstimatedRelation, Double)]
concentration :: EstimationParameters
-> v Double
-> v Double
-> (Double, Double)
-> [((Double, Double), Double)]
concentration EstimationParameters
ep v Double
xs v Double
ys (Double, Double)
params = [((Double, Double), Double)] -> [((Double, Double), Double)]
forall a. [a] -> [a]
tail ([((Double, Double), Double)] -> [((Double, Double), Double)])
-> [((Double, Double), Double)] -> [((Double, Double), Double)]
forall a b. (a -> b) -> a -> b
$ (((Double, Double), Double) -> ((Double, Double), Double))
-> ((Double, Double), Double) -> [((Double, Double), Double)]
forall a. (a -> a) -> a -> [a]
iterate (EstimationParameters
-> v Double
-> v Double
-> ((Double, Double), Double)
-> ((Double, Double), Double)
forall (v :: * -> *).
Vector v Double =>
EstimationParameters
-> v Double
-> v Double
-> ((Double, Double), Double)
-> ((Double, Double), Double)
concentrationStep EstimationParameters
ep v Double
xs v Double
ys) ((Double, Double)
params,-Double
1)

-- | Calculate the optimal (local minimum) estimate based on an initial estimate.
-- The local minimum may not be the global (a.k.a. best) estimate but starting from enough different initial estimates should yield the global optimum eventually.
converge :: Vector v Double => EstimationParameters -> v Double -> v Double -> EstimatedRelation -> EstimatedRelation
converge :: EstimationParameters
-> v Double -> v Double -> (Double, Double) -> (Double, Double)
converge EstimationParameters
ep v Double
xs v Double
ys = ((Double, Double), Double) -> (Double, Double)
forall a b. (a, b) -> a
fst (((Double, Double), Double) -> (Double, Double))
-> ((Double, Double) -> ((Double, Double), Double))
-> (Double, Double)
-> (Double, Double)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [((Double, Double), Double)] -> ((Double, Double), Double)
forall a b. Ord a => [(b, a)] -> (b, a)
findConvergencePoint ([((Double, Double), Double)] -> ((Double, Double), Double))
-> ((Double, Double) -> [((Double, Double), Double)])
-> (Double, Double)
-> ((Double, Double), Double)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. EstimationParameters
-> v Double
-> v Double
-> (Double, Double)
-> [((Double, Double), Double)]
forall (v :: * -> *).
Vector v Double =>
EstimationParameters
-> v Double
-> v Double
-> (Double, Double)
-> [((Double, Double), Double)]
concentration EstimationParameters
ep v Double
xs v Double
ys

-- | The convergence point is defined as the point the error estimate of which is equal to the next estimate's error.
findConvergencePoint :: Ord a => [(b,a)] -> (b,a)
findConvergencePoint :: [(b, a)] -> (b, a)
findConvergencePoint ((b, a)
x:(b, a)
y:[(b, a)]
ys)
    | (b, a) -> a
forall a b. (a, b) -> b
snd (b, a)
x a -> a -> Bool
forall a. Ord a => a -> a -> Bool
<= (b, a) -> a
forall a b. (a, b) -> b
snd (b, a)
y = (b, a)
x -- rounding issues my cause an actual increase in error resulting in an infinite loop so the actual stop condition is when the errors stop decreasing
    | Bool
otherwise = [(b, a)] -> (b, a)
forall a b. Ord a => [(b, a)] -> (b, a)
findConvergencePoint ((b, a)
y(b, a) -> [(b, a)] -> [(b, a)]
forall a. a -> [a] -> [a]
:[(b, a)]
ys)
findConvergencePoint [(b, a)]
xs = [Char] -> (b, a)
forall a. HasCallStack => [Char] -> a
error [Char]
"Too short a list for conversion (size < 2)"

-- | Many times there is no need for full concentration as bad initial estimates can be discovered after only a few concentration steps.
concentrateNSteps :: Vector v Double => EstimationParameters -> v Double -> v Double -> EstimatedRelation -> (EstimatedRelation,Double)
concentrateNSteps :: EstimationParameters
-> v Double
-> v Double
-> (Double, Double)
-> ((Double, Double), Double)
concentrateNSteps EstimationParameters
ep v Double
xs v Double
ys (Double, Double)
params = EstimationParameters
-> v Double
-> v Double
-> (Double, Double)
-> [((Double, Double), Double)]
forall (v :: * -> *).
Vector v Double =>
EstimationParameters
-> v Double
-> v Double
-> (Double, Double)
-> [((Double, Double), Double)]
concentration EstimationParameters
ep v Double
xs v Double
ys (Double, Double)
params [((Double, Double), Double)] -> Int -> ((Double, Double), Double)
forall a. [a] -> Int -> a
!! EstimationParameters -> Int
shortIterationSteps EstimationParameters
ep

-- | Finding a robust fit linear estimate between two samples. The procedure requires randomization and is based on the procedure described in the reference.
robustFit :: (MonadRandom m, Vector v Double) => EstimationParameters -> v Double -> v Double -> m EstimatedRelation
robustFit :: EstimationParameters -> v Double -> v Double -> m (Double, Double)
robustFit EstimationParameters
ep v Double
xs v Double
ys = do
    let n :: Int
n = v Double -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
G.length v Double
xs
-- For optimal performance the exact procedure executed depends on the set size.
    if Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
2
        then
            [Char] -> m (Double, Double)
forall a. HasCallStack => [Char] -> a
error [Char]
"cannot fit an input of size < 2"
        else if Int
n Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
2
            then (Double, Double) -> m (Double, Double)
forall (m :: * -> *) a. Monad m => a -> m a
return ((Double, Double) -> m (Double, Double))
-> (Double, Double) -> m (Double, Double)
forall a b. (a -> b) -> a -> b
$ ((Double, Double), (Double, Double)) -> (Double, Double)
lineParams ((v Double -> Double
forall (v :: * -> *) a. Vector v a => v a -> a
G.head v Double
xs,v Double -> Double
forall (v :: * -> *) a. Vector v a => v a -> a
G.head v Double
ys),(v Double -> Double
forall (v :: * -> *) a. Vector v a => v a -> a
G.last v Double
xs,v Double -> Double
forall (v :: * -> *) a. Vector v a => v a -> a
G.last v Double
ys))
            else 
                ([(Double, Double)] -> (Double, Double))
-> m [(Double, Double)] -> m (Double, Double)
forall (m :: * -> *) a1 r. Monad m => (a1 -> r) -> m a1 -> m r
liftM (EstimationParameters
-> v Double -> v Double -> [(Double, Double)] -> (Double, Double)
forall (v :: * -> *).
Vector v Double =>
EstimationParameters
-> v Double -> v Double -> [(Double, Double)] -> (Double, Double)
candidatesToWinner EstimationParameters
ep v Double
xs v Double
ys) (m [(Double, Double)] -> m (Double, Double))
-> m [(Double, Double)] -> m (Double, Double)
forall a b. (a -> b) -> a -> b
$ if Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< EstimationParameters -> Int
mediumSetSize EstimationParameters
ep
                    then
                         EstimationParameters
-> Maybe Int -> v Double -> v Double -> m [(Double, Double)]
forall (m :: * -> *) (v :: * -> *).
(MonadRandom m, Vector v Double) =>
EstimationParameters
-> Maybe Int -> v Double -> v Double -> m [(Double, Double)]
singleGroupFitCandidates EstimationParameters
ep Maybe Int
forall a. Maybe a
Nothing v Double
xs v Double
ys
                    else if Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< EstimationParameters -> Int
largeSetSize EstimationParameters
ep
                        then EstimationParameters
-> v Double -> v Double -> m [(Double, Double)]
forall (m :: * -> *) (v :: * -> *).
(MonadRandom m, Vector v Double) =>
EstimationParameters
-> v Double -> v Double -> m [(Double, Double)]
largeGroupFitCandidates EstimationParameters
ep v Double
xs v Double
ys
                        else do
                            ([Double]
nxs,[Double]
nys) <- ([(Double, Double)] -> ([Double], [Double]))
-> m [(Double, Double)] -> m ([Double], [Double])
forall (m :: * -> *) a1 r. Monad m => (a1 -> r) -> m a1 -> m r
liftM [(Double, Double)] -> ([Double], [Double])
forall a b. [(a, b)] -> ([a], [b])
unzip (m [(Double, Double)] -> m ([Double], [Double]))
-> m [(Double, Double)] -> m ([Double], [Double])
forall a b. (a -> b) -> a -> b
$ [(Double, Double)] -> Int -> m [(Double, Double)]
forall (m :: * -> *) a. MonadRandom m => [a] -> Int -> m [a]
randomSubset ([Double] -> [Double] -> [(Double, Double)]
forall a b. [a] -> [b] -> [(a, b)]
zip (v Double -> [Double]
forall (v :: * -> *) a. Vector v a => v a -> [a]
G.toList v Double
xs) (v Double -> [Double]
forall (v :: * -> *) a. Vector v a => v a -> [a]
G.toList v Double
ys)) (EstimationParameters -> Int
largeSetSize EstimationParameters
ep)
                            EstimationParameters
-> Vector Double -> Vector Double -> m [(Double, Double)]
forall (m :: * -> *) (v :: * -> *).
(MonadRandom m, Vector v Double) =>
EstimationParameters
-> v Double -> v Double -> m [(Double, Double)]
largeGroupFitCandidates EstimationParameters
ep ([Double] -> Vector Double
forall a. Unbox a => [a] -> Vector a
U.fromList [Double]
nxs) ([Double] -> Vector Double
forall (v :: * -> *) a. Vector v a => [a] -> v a
G.fromList [Double]
nys)

-- | Robust fit yielding also the R-square value of the \"clean\" dataset.
robustFitRSqr :: (MonadRandom m, Vector v Double, Vector v (Double, Double)) => EstimationParameters -> v Double -> v Double -> m (EstimatedRelation,Double)
robustFitRSqr :: EstimationParameters
-> v Double -> v Double -> m ((Double, Double), Double)
robustFitRSqr EstimationParameters
ep v Double
xs v Double
ys = do
    (Double, Double)
er <- EstimationParameters -> v Double -> v Double -> m (Double, Double)
forall (m :: * -> *) (v :: * -> *).
(MonadRandom m, Vector v Double) =>
EstimationParameters -> v Double -> v Double -> m (Double, Double)
robustFit EstimationParameters
ep v Double
xs v Double
ys
    let (Vector Double
good_xs,Vector Double
good_ys) = Vector (Double, Double) -> (Vector Double, Vector Double)
forall a b.
(Unbox a, Unbox b) =>
Vector (a, b) -> (Vector a, Vector b)
U.unzip (Vector (Double, Double) -> (Vector Double, Vector Double))
-> (v (Double, Double) -> Vector (Double, Double))
-> v (Double, Double)
-> (Vector Double, Vector Double)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [(Double, Double)] -> Vector (Double, Double)
forall (v :: * -> *) a. Vector v a => [a] -> v a
G.fromList ([(Double, Double)] -> Vector (Double, Double))
-> (v (Double, Double) -> [(Double, Double)])
-> v (Double, Double)
-> Vector (Double, Double)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> [(Double, Double)] -> [(Double, Double)]
forall a. Int -> [a] -> [a]
take (EstimationParameters -> v Double -> Int
forall (v :: * -> *).
Vector v Double =>
EstimationParameters -> v Double -> Int
setSize EstimationParameters
ep v Double
xs) ([(Double, Double)] -> [(Double, Double)])
-> (v (Double, Double) -> [(Double, Double)])
-> v (Double, Double)
-> [(Double, Double)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ((Double, Double) -> (Double, Double) -> Ordering)
-> [(Double, Double)] -> [(Double, Double)]
forall a. (a -> a -> Ordering) -> [a] -> [a]
sortBy (Double -> Double -> Ordering
forall a. Ord a => a -> a -> Ordering
compare (Double -> Double -> Ordering)
-> ((Double, Double) -> Double)
-> (Double, Double)
-> (Double, Double)
-> Ordering
forall b c a. (b -> b -> c) -> (a -> b) -> a -> a -> c
`on` EstimationParameters -> ErrorFunction
errorFunction EstimationParameters
ep (Double, Double)
er) ([(Double, Double)] -> [(Double, Double)])
-> (v (Double, Double) -> [(Double, Double)])
-> v (Double, Double)
-> [(Double, Double)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. v (Double, Double) -> [(Double, Double)]
forall (v :: * -> *) a. Vector v a => v a -> [a]
G.toList (v (Double, Double) -> (Vector Double, Vector Double))
-> v (Double, Double) -> (Vector Double, Vector Double)
forall a b. (a -> b) -> a -> b
$ v Double -> v Double -> v (Double, Double)
forall (v :: * -> *) a b.
(Vector v a, Vector v b, Vector v (a, b)) =>
v a -> v b -> v (a, b)
G.zip v Double
xs v Double
ys
    ((Double, Double), Double) -> m ((Double, Double), Double)
forall (m :: * -> *) a. Monad m => a -> m a
return ((Double, Double)
er,Vector Double -> Vector Double -> Double
forall (v :: * -> *).
Vector v Double =>
v Double -> v Double -> Double
correl Vector Double
good_xs Vector Double
good_ys Double -> Integer -> Double
forall a b. (Num a, Integral b) => a -> b -> a
^ Integer
2)

-- | A wrapper that executes 'robustFit' using a default random generator (meaning it is only pseudo-random)
nonRandomRobustFit :: Vector v Double => EstimationParameters -> v Double -> v Double -> EstimatedRelation
nonRandomRobustFit :: EstimationParameters -> v Double -> v Double -> (Double, Double)
nonRandomRobustFit EstimationParameters
ep v Double
xs v Double
ys = Rand StdGen (Double, Double) -> StdGen -> (Double, Double)
forall g a. Rand g a -> g -> a
evalRand (EstimationParameters
-> v Double -> v Double -> Rand StdGen (Double, Double)
forall (m :: * -> *) (v :: * -> *).
(MonadRandom m, Vector v Double) =>
EstimationParameters -> v Double -> v Double -> m (Double, Double)
robustFit EstimationParameters
ep v Double
xs v Double
ys) (Int -> StdGen
mkStdGen Int
1)

-- | Given a set of initial estimates converge them all and find the optimal one.
candidatesToWinner :: Vector v Double => EstimationParameters -> v Double -> v Double -> [EstimatedRelation] -> EstimatedRelation
candidatesToWinner :: EstimationParameters
-> v Double -> v Double -> [(Double, Double)] -> (Double, Double)
candidatesToWinner EstimationParameters
ep v Double
xs v Double
ys = ((Double, Double), Double) -> (Double, Double)
forall a b. (a, b) -> a
fst (((Double, Double), Double) -> (Double, Double))
-> ([(Double, Double)] -> ((Double, Double), Double))
-> [(Double, Double)]
-> (Double, Double)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (((Double, Double), Double)
 -> ((Double, Double), Double) -> Ordering)
-> [((Double, Double), Double)] -> ((Double, Double), Double)
forall (t :: * -> *) a.
Foldable t =>
(a -> a -> Ordering) -> t a -> a
minimumBy (Double -> Double -> Ordering
forall a. Ord a => a -> a -> Ordering
compare (Double -> Double -> Ordering)
-> (((Double, Double), Double) -> Double)
-> ((Double, Double), Double)
-> ((Double, Double), Double)
-> Ordering
forall b c a. (b -> b -> c) -> (a -> b) -> a -> a -> c
`on` ((Double, Double), Double) -> Double
forall a b. (a, b) -> b
snd) ([((Double, Double), Double)] -> ((Double, Double), Double))
-> ([(Double, Double)] -> [((Double, Double), Double)])
-> [(Double, Double)]
-> ((Double, Double), Double)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ((Double, Double) -> ((Double, Double), Double))
-> [(Double, Double)] -> [((Double, Double), Double)]
forall a b. (a -> b) -> [a] -> [b]
map ([((Double, Double), Double)] -> ((Double, Double), Double)
forall a b. Ord a => [(b, a)] -> (b, a)
findConvergencePoint ([((Double, Double), Double)] -> ((Double, Double), Double))
-> ((Double, Double) -> [((Double, Double), Double)])
-> (Double, Double)
-> ((Double, Double), Double)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. EstimationParameters
-> v Double
-> v Double
-> (Double, Double)
-> [((Double, Double), Double)]
forall (v :: * -> *).
Vector v Double =>
EstimationParameters
-> v Double
-> v Double
-> (Double, Double)
-> [((Double, Double), Double)]
concentration EstimationParameters
ep v Double
xs v Double
ys)

-- | for a large initial sample - subdivide it, then get candidates from each subgroup. Perform full convergence on all the candidates and return the best ones.
largeGroupFitCandidates :: (MonadRandom m, Vector v Double) => EstimationParameters -> v Double -> v Double -> m [EstimatedRelation]
largeGroupFitCandidates :: EstimationParameters
-> v Double -> v Double -> m [(Double, Double)]
largeGroupFitCandidates EstimationParameters
ep v Double
xs v Double
ys = do
    let n :: Int
n = v Double -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
G.length v Double
xs
    let sub_groups_num :: Int
sub_groups_num = Int
n Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div` (EstimationParameters -> Int
mediumSetSize EstimationParameters
ep Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div` Int
2)
    let sub_groups_size :: Int
sub_groups_size = Int
n Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div` Int
sub_groups_num
    [(Double, Double)]
shuffled <- [(Double, Double)] -> m [(Double, Double)]
forall (m :: * -> *) a. MonadRandom m => [a] -> m [a]
shuffleM ([(Double, Double)] -> m [(Double, Double)])
-> [(Double, Double)] -> m [(Double, Double)]
forall a b. (a -> b) -> a -> b
$ [Double] -> [Double] -> [(Double, Double)]
forall a b. [a] -> [b] -> [(a, b)]
zip (v Double -> [Double]
forall (v :: * -> *) a. Vector v a => v a -> [a]
G.toList v Double
xs) (v Double -> [Double]
forall (v :: * -> *) a. Vector v a => v a -> [a]
G.toList v Double
ys)
    let sub_groups :: [(Vector Double, Vector Double)]
sub_groups = ([(Double, Double)] -> (Vector Double, Vector Double))
-> [[(Double, Double)]] -> [(Vector Double, Vector Double)]
forall a b. (a -> b) -> [a] -> [b]
map (Vector (Double, Double) -> (Vector Double, Vector Double)
forall (v :: * -> *) a b.
(Vector v a, Vector v b, Vector v (a, b)) =>
v (a, b) -> (v a, v b)
G.unzip (Vector (Double, Double) -> (Vector Double, Vector Double))
-> ([(Double, Double)] -> Vector (Double, Double))
-> [(Double, Double)]
-> (Vector Double, Vector Double)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [(Double, Double)] -> Vector (Double, Double)
forall a. Unbox a => [a] -> Vector a
U.fromList) ([[(Double, Double)]] -> [(Vector Double, Vector Double)])
-> [[(Double, Double)]] -> [(Vector Double, Vector Double)]
forall a b. (a -> b) -> a -> b
$ Int -> [(Double, Double)] -> [[(Double, Double)]]
forall a. Int -> [a] -> [[a]]
splitTo Int
sub_groups_size [(Double, Double)]
shuffled
    let sub_groups_candidates :: Int
sub_groups_candidates = EstimationParameters -> Int
maxSubsetsNum EstimationParameters
ep Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div` Int
sub_groups_num
    [[(Double, Double)]]
candidates_list <- ((Vector Double, Vector Double) -> m [(Double, Double)])
-> [(Vector Double, Vector Double)] -> m [[(Double, Double)]]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM ((Vector Double -> Vector Double -> m [(Double, Double)])
-> (Vector Double, Vector Double) -> m [(Double, Double)]
forall a b c. (a -> b -> c) -> (a, b) -> c
applyTo ((Vector Double -> Vector Double -> m [(Double, Double)])
 -> (Vector Double, Vector Double) -> m [(Double, Double)])
-> (Vector Double -> Vector Double -> m [(Double, Double)])
-> (Vector Double, Vector Double)
-> m [(Double, Double)]
forall a b. (a -> b) -> a -> b
$ EstimationParameters
-> Maybe Int
-> Vector Double
-> Vector Double
-> m [(Double, Double)]
forall (m :: * -> *) (v :: * -> *).
(MonadRandom m, Vector v Double) =>
EstimationParameters
-> Maybe Int -> v Double -> v Double -> m [(Double, Double)]
singleGroupFitCandidates EstimationParameters
ep (Int -> Maybe Int
forall a. a -> Maybe a
Just Int
sub_groups_candidates)) [(Vector Double, Vector Double)]
sub_groups
    let candidates :: [(Double, Double)]
candidates = [[(Double, Double)]] -> [(Double, Double)]
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat [[(Double, Double)]]
candidates_list
    [(Double, Double)] -> m [(Double, Double)]
forall (m :: * -> *) a. Monad m => a -> m a
return ([(Double, Double)] -> m [(Double, Double)])
-> ([(Double, Double)] -> [(Double, Double)])
-> [(Double, Double)]
-> m [(Double, Double)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (((Double, Double), Double) -> (Double, Double))
-> [((Double, Double), Double)] -> [(Double, Double)]
forall a b. (a -> b) -> [a] -> [b]
map ((Double, Double), Double) -> (Double, Double)
forall a b. (a, b) -> a
fst ([((Double, Double), Double)] -> [(Double, Double)])
-> ([(Double, Double)] -> [((Double, Double), Double)])
-> [(Double, Double)]
-> [(Double, Double)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> [((Double, Double), Double)] -> [((Double, Double), Double)]
forall a. Int -> [a] -> [a]
take (EstimationParameters -> Int
groupSubsets EstimationParameters
ep) ([((Double, Double), Double)] -> [((Double, Double), Double)])
-> ([(Double, Double)] -> [((Double, Double), Double)])
-> [(Double, Double)]
-> [((Double, Double), Double)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (((Double, Double), Double)
 -> ((Double, Double), Double) -> Ordering)
-> [((Double, Double), Double)] -> [((Double, Double), Double)]
forall a. (a -> a -> Ordering) -> [a] -> [a]
sortBy (Double -> Double -> Ordering
forall a. Ord a => a -> a -> Ordering
compare (Double -> Double -> Ordering)
-> (((Double, Double), Double) -> Double)
-> ((Double, Double), Double)
-> ((Double, Double), Double)
-> Ordering
forall b c a. (b -> b -> c) -> (a -> b) -> a -> a -> c
`on` ((Double, Double), Double) -> Double
forall a b. (a, b) -> b
snd) ([((Double, Double), Double)] -> [((Double, Double), Double)])
-> ([(Double, Double)] -> [((Double, Double), Double)])
-> [(Double, Double)]
-> [((Double, Double), Double)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ((Double, Double) -> ((Double, Double), Double))
-> [(Double, Double)] -> [((Double, Double), Double)]
forall a b. (a -> b) -> [a] -> [b]
map ([((Double, Double), Double)] -> ((Double, Double), Double)
forall a b. Ord a => [(b, a)] -> (b, a)
findConvergencePoint ([((Double, Double), Double)] -> ((Double, Double), Double))
-> ((Double, Double) -> [((Double, Double), Double)])
-> (Double, Double)
-> ((Double, Double), Double)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. EstimationParameters
-> v Double
-> v Double
-> (Double, Double)
-> [((Double, Double), Double)]
forall (v :: * -> *).
Vector v Double =>
EstimationParameters
-> v Double
-> v Double
-> (Double, Double)
-> [((Double, Double), Double)]
concentration EstimationParameters
ep v Double
xs v Double
ys) ([(Double, Double)] -> m [(Double, Double)])
-> [(Double, Double)] -> m [(Double, Double)]
forall a b. (a -> b) -> a -> b
$ [(Double, Double)]
candidates

-- | For a single group (a group that will not be subdivided) pick an initial set of pairs of points, run a few steps on each, then return the most promising candidates.
singleGroupFitCandidates :: (MonadRandom m, Vector v Double) => EstimationParameters -> Maybe Int -> v Double -> v Double -> m [EstimatedRelation]
singleGroupFitCandidates :: EstimationParameters
-> Maybe Int -> v Double -> v Double -> m [(Double, Double)]
singleGroupFitCandidates EstimationParameters
ep Maybe Int
m_subsets v Double
xs v Double
ys = do
    let all_pairs :: [((Double, Double), (Double, Double))]
all_pairs = [(Double, Double)] -> [((Double, Double), (Double, Double))]
forall a. [a] -> [(a, a)]
allPairs ([(Double, Double)] -> [((Double, Double), (Double, Double))])
-> [(Double, Double)] -> [((Double, Double), (Double, Double))]
forall a b. (a -> b) -> a -> b
$ [Double] -> [Double] -> [(Double, Double)]
forall a b. [a] -> [b] -> [(a, b)]
zip (v Double -> [Double]
forall (v :: * -> *) a. Vector v a => v a -> [a]
G.toList v Double
xs) (v Double -> [Double]
forall (v :: * -> *) a. Vector v a => v a -> [a]
G.toList v Double
ys)
    let return_size :: Int
return_size = Int -> Maybe Int -> Int
forall a. a -> Maybe a -> a
fromMaybe (EstimationParameters -> Int
maxSubsetsNum EstimationParameters
ep) Maybe Int
m_subsets
    [((Double, Double), (Double, Double))]
initial_sets <- if Int
return_size Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> [((Double, Double), (Double, Double))] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [((Double, Double), (Double, Double))]
all_pairs
        then [((Double, Double), (Double, Double))]
-> m [((Double, Double), (Double, Double))]
forall (m :: * -> *) a. Monad m => a -> m a
return [((Double, Double), (Double, Double))]
all_pairs
        else [((Double, Double), (Double, Double))]
-> Int -> m [((Double, Double), (Double, Double))]
forall (m :: * -> *) a. MonadRandom m => [a] -> Int -> m [a]
randomSubset [((Double, Double), (Double, Double))]
all_pairs Int
return_size 
    [(Double, Double)] -> m [(Double, Double)]
forall (m :: * -> *) a. Monad m => a -> m a
return ([(Double, Double)] -> m [(Double, Double)])
-> ([((Double, Double), (Double, Double))] -> [(Double, Double)])
-> [((Double, Double), (Double, Double))]
-> m [(Double, Double)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (((Double, Double), Double) -> (Double, Double))
-> [((Double, Double), Double)] -> [(Double, Double)]
forall a b. (a -> b) -> [a] -> [b]
map ((Double, Double), Double) -> (Double, Double)
forall a b. (a, b) -> a
fst ([((Double, Double), Double)] -> [(Double, Double)])
-> ([((Double, Double), (Double, Double))]
    -> [((Double, Double), Double)])
-> [((Double, Double), (Double, Double))]
-> [(Double, Double)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> [((Double, Double), Double)] -> [((Double, Double), Double)]
forall a. Int -> [a] -> [a]
take (EstimationParameters -> Int
groupSubsets EstimationParameters
ep) ([((Double, Double), Double)] -> [((Double, Double), Double)])
-> ([((Double, Double), (Double, Double))]
    -> [((Double, Double), Double)])
-> [((Double, Double), (Double, Double))]
-> [((Double, Double), Double)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (((Double, Double), Double)
 -> ((Double, Double), Double) -> Ordering)
-> [((Double, Double), Double)] -> [((Double, Double), Double)]
forall a. (a -> a -> Ordering) -> [a] -> [a]
sortBy (Double -> Double -> Ordering
forall a. Ord a => a -> a -> Ordering
compare (Double -> Double -> Ordering)
-> (((Double, Double), Double) -> Double)
-> ((Double, Double), Double)
-> ((Double, Double), Double)
-> Ordering
forall b c a. (b -> b -> c) -> (a -> b) -> a -> a -> c
`on` ((Double, Double), Double) -> Double
forall a b. (a, b) -> b
snd) ([((Double, Double), Double)] -> [((Double, Double), Double)])
-> ([((Double, Double), (Double, Double))]
    -> [((Double, Double), Double)])
-> [((Double, Double), (Double, Double))]
-> [((Double, Double), Double)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (((Double, Double), (Double, Double))
 -> ((Double, Double), Double))
-> [((Double, Double), (Double, Double))]
-> [((Double, Double), Double)]
forall a b. (a -> b) -> [a] -> [b]
map (EstimationParameters
-> v Double
-> v Double
-> (Double, Double)
-> ((Double, Double), Double)
forall (v :: * -> *).
Vector v Double =>
EstimationParameters
-> v Double
-> v Double
-> (Double, Double)
-> ((Double, Double), Double)
concentrateNSteps EstimationParameters
ep v Double
xs v Double
ys ((Double, Double) -> ((Double, Double), Double))
-> (((Double, Double), (Double, Double)) -> (Double, Double))
-> ((Double, Double), (Double, Double))
-> ((Double, Double), Double)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ((Double, Double), (Double, Double)) -> (Double, Double)
lineParams) ([((Double, Double), (Double, Double))] -> m [(Double, Double)])
-> [((Double, Double), (Double, Double))] -> m [(Double, Double)]
forall a b. (a -> b) -> a -> b
$ [((Double, Double), (Double, Double))]
initial_sets

-- | Find the line passing between two points. This is the initial estimate to use given two random points.
lineParams :: ((Double,Double),(Double,Double)) -> EstimatedRelation
lineParams :: ((Double, Double), (Double, Double)) -> (Double, Double)
lineParams ((Double
x1,Double
y1),(Double
x2,Double
y2)) = (Double
alpha,Double
beta)
    where
        beta :: Double
beta = (Double
y2Double -> Double -> Double
forall a. Num a => a -> a -> a
-Double
y1)Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/(Double
x2Double -> Double -> Double
forall a. Num a => a -> a -> a
-Double
x1)
        alpha :: Double
alpha = Double
y1 Double -> Double -> Double
forall a. Num a => a -> a -> a
- Double
betaDouble -> Double -> Double
forall a. Num a => a -> a -> a
*Double
x1

-- | A list of all possible two-element pairs from a list.
allPairs :: [a] -> [(a,a)]
allPairs :: [a] -> [(a, a)]
allPairs [] = []
allPairs [a
x] = []
allPairs [a
x,a
y] = [(a
x,a
y)]
allPairs (a
x:[a]
xs) = ([a] -> [a] -> [(a, a)]
forall a b. [a] -> [b] -> [(a, b)]
zip [a]
xs ([a] -> [(a, a)]) -> (a -> [a]) -> a -> [(a, a)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. a -> [a]
forall a. a -> [a]
repeat (a -> [(a, a)]) -> a -> [(a, a)]
forall a b. (a -> b) -> a -> b
$ a
x) [(a, a)] -> [(a, a)] -> [(a, a)]
forall a. [a] -> [a] -> [a]
++ [a] -> [(a, a)]
forall a. [a] -> [(a, a)]
allPairs [a]
xs

-- | Get a random subset of a given size.
randomSubset :: MonadRandom m => [a] -> Int -> m [a]
randomSubset :: [a] -> Int -> m [a]
randomSubset [a]
xs Int
size = ([a] -> [a]) -> m [a] -> m [a]
forall (m :: * -> *) a1 r. Monad m => (a1 -> r) -> m a1 -> m r
liftM (Int -> [a] -> [a]
forall a. Int -> [a] -> [a]
take Int
size) (m [a] -> m [a]) -> m [a] -> m [a]
forall a b. (a -> b) -> a -> b
$ [a] -> m [a]
forall (m :: * -> *) a. MonadRandom m => [a] -> m [a]
shuffleM [a]
xs

-- | Split a list into sublists of length n.
splitTo :: Int -> [a] -> [[a]]
splitTo :: Int -> [a] -> [[a]]
splitTo Int
n = ([a] -> [a]) -> [[a]] -> [[a]]
forall a b. (a -> b) -> [a] -> [b]
map (Int -> [a] -> [a]
forall a. Int -> [a] -> [a]
take Int
n) ([[a]] -> [[a]]) -> ([a] -> [[a]]) -> [a] -> [[a]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ([a] -> Bool) -> [[a]] -> [[a]]
forall a. (a -> Bool) -> [a] -> [a]
takeWhile (Bool -> Bool
not (Bool -> Bool) -> ([a] -> Bool) -> [a] -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [a] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null) ([[a]] -> [[a]]) -> ([a] -> [[a]]) -> [a] -> [[a]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ([a] -> [a]) -> [a] -> [[a]]
forall a. (a -> a) -> a -> [a]
iterate (Int -> [a] -> [a]
forall a. Int -> [a] -> [a]
drop Int
n)

-- | Helper function to adjust parameter handling
applyTo :: (a->b->c) -> (a,b) -> c
applyTo :: (a -> b -> c) -> (a, b) -> c
applyTo a -> b -> c
f (a
x,b
y) = a -> b -> c
f a
x b
y

-- $references
--
-- * Two Dimensional Euclidean Regression (Stein) <http://www.dspcsp.com/pubs/euclreg.pdf>
--
-- * Computing LTS Regression For Large Data Sets (Rousseeuw and Driessen) <http://agoras.ua.ac.be/abstract/Comlts99.htm>
--
-- * Applied linear statistical models (Kutner et al.)