276 lines
11 KiB
C
276 lines
11 KiB
C
|
|
/*=========================================================================
|
||
|
|
|
||
|
|
Program: Visualization Toolkit
|
||
|
|
Module: vtkDescriptiveStatistics.h
|
||
|
|
|
||
|
|
Copyright (c) Ken Martin, Will Schroeder, Bill Lorensen
|
||
|
|
All rights reserved.
|
||
|
|
See Copyright.txt or http://www.kitware.com/Copyright.htm for details.
|
||
|
|
|
||
|
|
This software is distributed WITHOUT ANY WARRANTY; without even
|
||
|
|
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||
|
|
PURPOSE. See the above copyright notice for more information.
|
||
|
|
|
||
|
|
=========================================================================*/
|
||
|
|
/*-------------------------------------------------------------------------
|
||
|
|
Copyright 2010 Sandia Corporation.
|
||
|
|
Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||
|
|
the U.S. Government retains certain rights in this software.
|
||
|
|
-------------------------------------------------------------------------*/
|
||
|
|
/**
|
||
|
|
* @class vtkDescriptiveStatistics
|
||
|
|
* @brief A class for univariate descriptive statistics
|
||
|
|
*
|
||
|
|
*
|
||
|
|
* Given a selection of columns of interest in an input data table, this
|
||
|
|
* class provides the following functionalities, depending on the chosen
|
||
|
|
* execution options:
|
||
|
|
* * Learn: calculate extremal values, sample mean, and M2, M3, and M4 aggregates
|
||
|
|
* (cf. P. Pebay, Formulas for robust, one-pass parallel computation of covariances
|
||
|
|
* and Arbitrary-Order Statistical Moments, Sandia Report SAND2008-6212, Sep 2008,
|
||
|
|
* http://infoserve.sandia.gov/sand_doc/2008/086212.pdf for details)
|
||
|
|
* * Derive: calculate unbiased variance estimator, standard deviation estimator,
|
||
|
|
* two skewness estimators, and two kurtosis excess estimators.
|
||
|
|
* * Assess: given an input data set, a reference value and a non-negative deviation,
|
||
|
|
* mark each datum with corresponding relative deviation (1-dimensional Mahalanobis
|
||
|
|
* distance). If the deviation is zero, then mark each datum which is equal to the
|
||
|
|
* reference value with 0, and all others with 1. By default, the reference value
|
||
|
|
* and the deviation are, respectively, the mean and the standard deviation of the
|
||
|
|
* input model.
|
||
|
|
* * Test: calculate Jarque-Bera statistic and, if VTK to R interface is available,
|
||
|
|
* retrieve corresponding p-value for normality testing.
|
||
|
|
*
|
||
|
|
* Among the derived statistics, the variance, the standard deviation, the skewness
|
||
|
|
* and the kurtosis can be estimated in two ways: using the sample version of those
|
||
|
|
* statistics, or the population version. Specify whether a sample estimate or population
|
||
|
|
* estimate is done by setting `SampleEstimate`. By default, `SampleEstimate == true`, hence
|
||
|
|
* the sample version of the statistics is estimated,
|
||
|
|
* which produces unbiased estimators (except for the sample standard deviation).
|
||
|
|
* The sample estimate should be used for input that represents a subset of the whole
|
||
|
|
* population of study. On the other hand, when `SampleEstimate == false`, the population
|
||
|
|
* version of the statistics is estimated. If the input doesn't contain all the samples
|
||
|
|
* from the population of study, then a bias is induced (the variance is slightly smaller than it
|
||
|
|
* should be). One can read about Bessel's correction to understand better where this comes from.
|
||
|
|
* That being said, on very large data, the difference between the 2 estimation formulas
|
||
|
|
* becomes very low, so in those instances,
|
||
|
|
* either state of `SampleEstimate` should yield very similar results
|
||
|
|
* (see explicit formulas below).
|
||
|
|
*
|
||
|
|
* \verbatim
|
||
|
|
*
|
||
|
|
* The formulas used are as follows, writing \f( \bar{X} \f) the mean of \f( X \f) and \f( N \f)
|
||
|
|
* the number of samples:
|
||
|
|
* - Sample estimate:
|
||
|
|
* \f[
|
||
|
|
* Var{X} = s^2 = \frac{\sum_{k=1}^N \left(x_k - \bar{x}\right)^2 }{N - 1}
|
||
|
|
* \f]
|
||
|
|
* \f[
* Skew{X} = \frac{N}{(N - 1)(N - 2)}
* \frac{\sum_{k=1}^N \left(x_k - \bar{x}\right)^3 }{s^3}
* \f]
|
||
|
|
* \f[
* Kurt{X} = \frac{N(N + 1)}{(N - 1)(N - 2)(N - 3)}
* \frac{\sum_{k=1}^N \left(x_k - \bar{x}\right)^4 }{s^4}
* - 3 \frac{(N - 1)^2}{(N - 2)(N - 3)}
* \f]
|
||
|
|
* - Population estimate:
|
||
|
|
* \f[
|
||
|
|
* Var{X} = \sigma^2 = \frac{\sum_{k=1}^N \left(x_k - \bar{x}\right)^2 }{N}
|
||
|
|
* \f]
|
||
|
|
* \f[
|
||
|
|
* Skew{X} = \frac{1}{N}\frac{\sum_{k=1}^N \left(x_k - \bar{x}\right)^3 }{\sigma^3}
|
||
|
|
* \f]
|
||
|
|
* \f[
* Kurt{X} = \frac{1}{N}\frac{\sum_{k=1}^N \left(x_k - \bar{x}\right)^4 }{\sigma^4} - 3
* \f]
|
||
|
|
*
|
||
|
|
* \f(\sigma\f) is the population standard deviation, and \f(s\f) is the sample standard deviation.
|
||
|
|
* Note that the kurtosis is corrected so the kurtosis of a gaussian distribution should yield 0.
|
||
|
|
*
|
||
|
|
* In the instance where \f(\sigma = 0\f) or \f(s = 0\f), the skewness and kurtosis are undefined.
|
||
|
|
* Thus they output a `NaN`. Similarly, if there are no samples, then all derived statistics
|
||
|
|
* yield a `NaN`.
|
||
|
|
*
|
||
|
|
* \endverbatim
|
||
|
|
*
|
||
|
|
* @par Thanks:
|
||
|
|
* Thanks to Philippe Pebay and David Thompson from Sandia National Laboratories
|
||
|
|
* for implementing this class.
|
||
|
|
* Updated by Philippe Pebay, Kitware SAS 2012
|
||
|
|
*/
|
||
|
|
|
||
|
|
#ifndef vtkDescriptiveStatistics_h
|
||
|
|
#define vtkDescriptiveStatistics_h
|
||
|
|
|
||
|
|
#include "vtkDeprecation.h" // For VTK_DEPRECATED_IN_9_2_0
|
||
|
|
#include "vtkFiltersStatisticsModule.h" // For export macro
|
||
|
|
#include "vtkStatisticsAlgorithm.h"
|
||
|
|
|
||
|
|
class vtkMultiBlockDataSet;
|
||
|
|
class vtkStringArray;
|
||
|
|
class vtkTable;
|
||
|
|
class vtkVariant;
|
||
|
|
class vtkDoubleArray;
|
||
|
|
|
||
|
|
class VTKFILTERSSTATISTICS_EXPORT vtkDescriptiveStatistics : public vtkStatisticsAlgorithm
{
public:
  vtkTypeMacro(vtkDescriptiveStatistics, vtkStatisticsAlgorithm);
  void PrintSelf(ostream& os, vtkIndent indent) override;
  static vtkDescriptiveStatistics* New();

  ///@{
  /**
   * @warning THESE METHODS DO NOTHING AND ARE DEPRECATED.
   *
   * To compute an unbiased variance, please set `SampleEstimate` instead. When set to true,
   * the sample variance is computed, which is unbiased.
   */
  VTK_DEPRECATED_IN_9_2_0("Please use SetSampleEstimate instead")
  virtual void SetUnbiasedVariance(vtkTypeBool);
  VTK_DEPRECATED_IN_9_2_0("Please use GetSampleEstimate instead")
  virtual vtkTypeBool GetUnbiasedVariance();
  VTK_DEPRECATED_IN_9_2_0("Please use SetSampleEstimate instead")
  virtual void UnbiasedVarianceOn();
  VTK_DEPRECATED_IN_9_2_0("Please use SetSampleEstimate instead")
  virtual void UnbiasedVarianceOff();
  ///@}

  ///@{
  /**
   * @warning THESE METHODS DO NOTHING AND ARE DEPRECATED.
   *
   * The skewness estimator is picked depending on the state of `SampleEstimate`.
   */
  VTK_DEPRECATED_IN_9_2_0("Please use SetSampleEstimate instead")
  virtual void SetG1Skewness(vtkTypeBool);
  VTK_DEPRECATED_IN_9_2_0("Please use GetSampleEstimate instead")
  virtual vtkTypeBool GetG1Skewness();
  VTK_DEPRECATED_IN_9_2_0("Please use SetSampleEstimate instead")
  virtual void G1SkewnessOn();
  VTK_DEPRECATED_IN_9_2_0("Please use SetSampleEstimate instead")
  virtual void G1SkewnessOff();
  ///@}

  ///@{
  /**
   * @warning THESE METHODS DO NOTHING AND ARE DEPRECATED.
   *
   * The kurtosis estimator is picked depending on the state of `SampleEstimate`.
   */
  VTK_DEPRECATED_IN_9_2_0("Please use SetSampleEstimate instead")
  virtual void SetG2Kurtosis(vtkTypeBool);
  VTK_DEPRECATED_IN_9_2_0("Please use GetSampleEstimate instead")
  virtual vtkTypeBool GetG2Kurtosis();
  VTK_DEPRECATED_IN_9_2_0("Please use SetSampleEstimate instead")
  virtual void G2KurtosisOn();
  VTK_DEPRECATED_IN_9_2_0("Please use SetSampleEstimate instead")
  virtual void G2KurtosisOff();
  ///@}

  ///@{
  /**
   * Getter / Setter on `SampleEstimate`. When turned on, descriptive statistics
   * computed by this filter assume that the input data only holds a sample of the whole
   * population of study. In effect, the sample variance, the sample standard deviation,
   * the sample skewness and the sample kurtosis are estimated. When turned off, the population
   * variance, the population standard deviation, the population skewness and the population
   * kurtosis are estimated instead.
   *
   * In short, if the input data is a full description of the population being studied,
   * `SampleEstimate` should be turned off. If the input data is a sample of the population being
   * studied, then `SampleEstimate` should be turned on. By default, `SampleEstimate` is turned
   * on, as it is the most likely case.
   *
   * @note For large data, the difference between the population estimate and the sample
   * estimate becomes thin, so this parameter becomes of less worry.
   */
  vtkSetMacro(SampleEstimate, bool);
  vtkGetMacro(SampleEstimate, bool);
  vtkBooleanMacro(SampleEstimate, bool);
  ///@}

  ///@{
  /**
   * Set/get whether the deviations returned should be signed, or should
   * only have their magnitude reported.
   * The default is that signed deviations will be computed.
   *
   * NOTE(review): the default is assigned in the constructor, which is not visible in this
   * header -- confirm the statement above against the implementation file.
   */
  vtkSetMacro(SignedDeviations, vtkTypeBool);
  vtkGetMacro(SignedDeviations, vtkTypeBool);
  vtkBooleanMacro(SignedDeviations, vtkTypeBool);
  ///@}

  ///@{
  /**
   * If there is a ghost array in the input, then ghosts matching the `GhostsToSkip` mask
   * will be skipped. It is set to 0xff by default (every ghost type is skipped).
   *
   * @sa
   * vtkDataSetAttributes
   * vtkFieldData
   * vtkPointData
   * vtkCellData
   */
  vtkSetMacro(GhostsToSkip, unsigned char);
  vtkGetMacro(GhostsToSkip, unsigned char);
  ///@}

  /**
   * Given a collection of models, calculate the aggregate model.
   *
   * The parameters are unnamed in this declaration; presumably the collection holds the
   * input models and the multi-block data set receives the aggregate -- confirm against
   * the implementation.
   */
  void Aggregate(vtkDataObjectCollection*, vtkMultiBlockDataSet*) override;

protected:
  vtkDescriptiveStatistics();
  ~vtkDescriptiveStatistics() override;

  /**
   * Execute the calculations required by the Learn option, given some input data.
   * NB: input parameters are unused.
   */
  void Learn(vtkTable*, vtkTable*, vtkMultiBlockDataSet*) override;

  /**
   * Execute the calculations required by the Derive option.
   */
  void Derive(vtkMultiBlockDataSet*) override;

  /**
   * Execute the calculations required by the Test option.
   */
  void Test(vtkTable*, vtkMultiBlockDataSet*, vtkTable*) override;

  /**
   * Execute the calculations required by the Assess option.
   *
   * Simply forwards to the four-argument superclass overload with a trailing argument of 1.
   * NOTE(review): that argument is presumably the number of variables per request
   * (this class is univariate) -- confirm against vtkStatisticsAlgorithm::Assess.
   */
  void Assess(vtkTable* inData, vtkMultiBlockDataSet* inMeta, vtkTable* outData) override
  {
    this->Superclass::Assess(inData, inMeta, outData, 1);
  }

  /**
   * Calculate p-value. This will be overridden using the object factory with an
   * R implementation if R is present.
   */
  virtual vtkDoubleArray* CalculatePValues(vtkDoubleArray*);

  /**
   * Provide the appropriate assessment functor.
   */
  void SelectAssessFunctor(vtkTable* outData, vtkDataObject* inMeta, vtkStringArray* rowNames,
    AssessFunctor*& dfunc) override;

  // Backing storage for the Set/Get macros declared in the public section.
  bool SampleEstimate;          // sample (true) vs. population (false) estimators
  vtkTypeBool SignedDeviations; // signed deviations vs. magnitude only
  unsigned char GhostsToSkip;   // bit mask of ghost types to skip

private:
  // Copying is disabled.
  vtkDescriptiveStatistics(const vtkDescriptiveStatistics&) = delete;
  void operator=(const vtkDescriptiveStatistics&) = delete;
};
|
||
|
|
|
||
|
|
#endif
|