|
// +----------------------------------------------------------------------+
//
// $Id: Stats.php,v 1.10 2003/05/16 22:03:03 jmcastagnetto Exp $
//
include_once "PEAR.php";
/**
* @package Math_Stats
*/
// Constants for defining the statistics to calculate /*{{{*/
/**
* STATS_BASIC to generate the basic descriptive statistics
*/
define("STATS_BASIC", 1);
/**
* STATS_FULL to generate also higher moments, mode, median, etc.
*/
define("STATS_FULL", 2);
/*}}}*/
// Constants describing the data set format /*{{{*/
/**
* STATS_DATA_SIMPLE for an array of numeric values
* e.g. $data = array(2,3,4,5,1,1,6);
*/
define("STATS_DATA_SIMPLE", 0);
/**
* STATS_DATA_CUMMULATIVE for an associative array of frequency values,
* where in each array entry, the index is the data point and the
* value the count (frequency):
* e.g. $data = array(3=>4, 2.3=>5, 1.25=>6, 0.5=>3)
*/
define("STATS_DATA_CUMMULATIVE", 1);
/*}}}*/
// Constants defining how to handle nulls /*{{{*/
/**
* STATS_REJECT_NULL, reject data sets with null values.
* Any non-numeric value is considered a null in this context.
*/
define("STATS_REJECT_NULL", -1);
/**
* STATS_IGNORE_NULL, ignore null values and prune them from the data.
* Any non-numeric value is considered a null in this context.
*/
define("STATS_IGNORE_NULL", -2);
/**
* STATS_USE_NULL_AS_ZERO, assign the value of 0 (zero) to null values.
* Any non-numeric value is considered a null in this context.
*/
define("STATS_USE_NULL_AS_ZERO", -3);
/*}}}*/
/**
* A class to calculate descriptive statistics from a data set.
* Data sets can be simple arrays of data, or a cummulative hash.
* The second form is useful when passing large data set,
* for example the data set:
*
*
* $data1 = array (1,2,1,1,1,1,3,3,4.1,3,2,2,4.1,1,1,2,3,3,2,2,1,1,2,2);
*
*
* can be epxressed more compactly as:
*
*
* $data2 = array("1"=>9, "2"=>8, "3"=>5, "4.1"=>2);
*
*
* Example of use:
*
*
* include_once "Math/Stats.php";
* $s = new Math_Stats();
* $s->setData($data1);
* // or
* // $s->setData($data2, STATS_DATA_CUMMULATIVE);
* $stats = $s->calcBasic();
* echo "Mean: ".$stats["mean"]." StDev: ".$stats["stdev"]."
\n";
*
* // using data with nulls
* // first ignoring them:
* $data3 = array(1.2, "foo", 2.4, 3.1, 4.2, 3.2, null, 5.1, 6.2);
* $s->setNullOption(STATS_IGNORE_NULL);
* $s->setData($data3);
* $stats3 = $s->calcFull();
*
* // and then assuming nulls == 0
* $s->setNullOption(STATS_USE_NULL_AS_ZERO);
* $s->setData($data3);
* $stats3 = $s->calcFull();
*
*
* Originally this class was part of NumPHP (Numeric PHP package)
*
* @author Jesus M. Castagnetto
* @version 0.8
* @access public
* @package Math_Stats
*/
class Math_Stats {/*{{{*/
// properties /*{{{*/
/**
* The simple or cummulative data set.
* Null by default.
*
* @access private
* @var array
*/
var $_data = null;
/**
* Flag for data type, one of STATS_DATA_SIMPLE or
* STATS_DATA_CUMMULATIVE. Null by default.
*
* @access private
* @var int
*/
var $_dataOption = null;
/**
* Flag for null handling options. One of STATS_REJECT_NULL,
* STATS_IGNORE_NULL or STATS_USE_NULL_AS_ZERO
*
* @access private
* @var int
*/
var $_nullOption;
/*}}}*/
/**
* Constructor for the class
*
* @access public
* @param optional int $nullOption how to handle null values
* @return object Math_Stats
*/
function Math_Stats($nullOption=STATS_REJECT_NULL) {/*{{{*/
$this->_nullOption = $nullOption;
}/*}}}*/
/**
* Sets and verifies the data, checking for nulls and using
* the current null handling option
*
* @access public
* @param array $arr the data set
* @param optional int $opt data format: STATS_DATA_CUMMULATIVE or STATS_DATA_SIMPLE (default)
* @return mixed true on success, a PEAR_Error object otherwise
*/
function setData($arr, $opt=STATS_DATA_SIMPLE) {/*{{{*/
$this->_data = null;
$this->_dataOption = null;
if (!is_array($arr))
return PEAR::raiseError("invalid data, an array of numeric data was expected");
if ($opt == STATS_DATA_SIMPLE) {
$this->_dataOption = $opt;
$this->_data = array_values($arr);
} else if ($opt == STATS_DATA_CUMMULATIVE) {
$this->_dataOption = $opt;
$this->_data = $arr;
}
return $this->_validate();
}/*}}}*/
/**
* Returns the data which might have been modified
* according to the current null handling options.
*
* @access public
* @return mixed array of data on success, a PEAR_Error object otherwise
* @see _validate()
*/
function getData() {/*{{{*/
if ($this->_data == null)
return PEAR::raiseError("data has not been set");
return $this->_data;
}/*}}}*/
/**
* Sets the null handling option.
* Must be called before assigning a new data set containing null values
*
* @access public
* @return mixed true on success, a PEAR_Error object otherwise
* @see _validate()
*/
function setNullOption($nullOption) {/*{{{*/
if ($nullOption == STATS_REJECT_NULL
|| $nullOption == STATS_IGNORE_NULL
|| $nullOption == STATS_USE_NULL_AS_ZERO) {
$this->_nullOption = $nullOption;
return true;
} else {
return PEAR::raiseError("invalid null handling option expecting: ".
"STATS_REJECT_NULL, STATS_IGNORE_NULL or STATS_USE_NULL_AS_ZERO");
}
}/*}}}*/
/**
* Calculates the basic or full statistics for the data set
*
* @access public
* @param int $mode one of STATS_BASIC or STATS_FULL
* @return mixed an associative array of statistics on success, a PEAR_Error object otherwise
* @see calcBasic()
* @see calcFull()
*/
function calc($mode) {/*{{{*/
if ($this->_data == null)
return PEAR::raiseError("data has not been set");
if ($mode == STATS_BASIC) {
return array (
"min" => $this->min(),
"max" => $this->max(),
"sum" => $this->sum(),
"sum2" => $this->sum2(),
"count" => $this->count(),
"mean" => $this->mean(),
"stdev" => $this->stDev(),
"variance" => $this->variance()
);
} else if ($mode == STATS_FULL) {
return array (
"min" => $this->min(),
"max" => $this->max(),
"sum" => $this->sum(),
"sum2" => $this->sum2(),
"count" => $this->count(),
"mean" => $this->mean(),
"median" => $this->median(),
"mode" => $this->mode(),
"midrange" => $this->midrange(),
"stdev" => $this->stDev(),
"absdev" => $this->absDev(),
"variance" => $this->variance(),
"std_error_of_mean" => $this->stdErrorOfMean(),
"skewness" => $this->skewness(),
"kurtosis" => $this->kurtosis(),
"coeff_of_variation" => $this->coeffOfVariation(),
"sample_central_moments" => array (
1 => $this->sampleCentralMoment(1),
2 => $this->sampleCentralMoment(2),
3 => $this->sampleCentralMoment(3),
4 => $this->sampleCentralMoment(4),
5 => $this->sampleCentralMoment(5)
),
"sample_raw_moments" => array (
1 => $this->sampleRawMoment(1),
2 => $this->sampleRawMoment(2),
3 => $this->sampleRawMoment(3),
4 => $this->sampleRawMoment(4),
5 => $this->sampleRawMoment(5)
),
"frequency" => $this->frequency()
);
} else {
return PEAR::raiseError("incorrect mode, expected STATS_BASIC or STATS_FULL");
}
}/*}}}*/
/**
* Calculates a basic set of statistics
*
* @access public
* @return mixed an associative array of statistics on success, a PEAR_Error object otherwise
* @see calc()
* @see calcFull()
*/
function calcBasic() {/*{{{*/
return $this->calc(STATS_BASIC);
}/*}}}*/
/**
* Calculates a full set of statistics
*
* @access public
* @return mixed an associative array of statistics on success, a PEAR_Error object otherwise
* @see calc()
* @see calcBasic()
*/
function calcFull() {/*{{{*/
return $this->calc(STATS_FULL);
}/*}}}*/
/**
* Calculates the minimum of a data set.
* Handles cummulative data sets correctly
*
* @access public
* @return mixed the minimum value on success, a PEAR_Error object otherwise
* @see calc()
* @see max()
*/
function min() {/*{{{*/
if ($this->_data == null)
return PEAR::raiseError("data has not been set");
if ($this->_dataOption == STATS_DATA_CUMMULATIVE)
return min(array_keys($this->_data));
else
return min($this->_data);
}/*}}}*/
/**
* Calculates the maximum of a data set.
* Handles cummulative data sets correctly
*
* @access public
* @return mixed the maximum value on success, a PEAR_Error object otherwise
* @see calc()
* @see min()
*/
function max() {/*{{{*/
if ($this->_data == null)
return PEAR::raiseError("data has not been set");
if ($this->_dataOption == STATS_DATA_CUMMULATIVE)
return max(array_keys($this->_data));
else
return max($this->_data);
}/*}}}*/
/**
* Calculates SUM { xi }
* Handles cummulative data sets correctly
*
* @access public
* @return mixed the sum on success, a PEAR_Error object otherwise
* @see calc()
* @see sum2()
* @see sumN()
*/
function sum() {/*{{{*/
return $this->sumN(1);
}/*}}}*/
/**
* Calculates SUM { (xi)^2 }
* Handles cummulative data sets correctly
*
* @access public
* @return mixed the sum on success, a PEAR_Error object otherwise
* @see calc()
* @see sum()
* @see sumN()
*/
function sum2() {/*{{{*/
return $this->sumN(2);
}/*}}}*/
/**
* Calculates SUM { (xi)^n }
* Handles cummulative data sets correctly
*
* @access public
* @param numeric $n the exponent
* @return mixed the sum on success, a PEAR_Error object otherwise
* @see calc()
* @see sum()
* @see sum2()
*/
function sumN($n) {/*{{{*/
if ($this->_data == null)
return PEAR::raiseError("data has not been set");
$sumN = 0;
if ($this->_dataOption == STATS_DATA_CUMMULATIVE) {
foreach($this->_data as $val=>$freq)
$sumN += $freq * pow((double)$val, (double)$n);
} else {
foreach($this->_data as $val)
$sumN += pow((double)$val, (double)$n);
}
return $sumN;
}/*}}}*/
/**
* Calculates the number of data points in the set
* Handles cummulative data sets correctly
*
* @access public
* @return mixed the count on success, a PEAR_Error object otherwise
* @see calc()
*/
function count() {/*{{{*/
if ($this->_data == null)
return PEAR::raiseError("data has not been set");
if ($this->_dataOption == STATS_DATA_CUMMULATIVE) {
foreach($this->_data as $freq)
$count += $freq;
} else {
$count = count($this->_data);
}
return $count;
}/*}}}*/
/**
* Calculates the mean (average) of the data points in the set
* Handles cummulative data sets correctly
*
* @access public
* @return mixed the mean value on success, a PEAR_Error object otherwise
* @see calc()
* @see sum()
* @see count()
*/
function mean() {/*{{{*/
if ($this->_data == null)
return PEAR::raiseError("data has not been set");
return ($this->sum() / $this->count());
}/*}}}*/
/**
* Calculates the variance (unbiased) of the data points in the set
* Handles cummulative data sets correctly
*
* @access public
* @return mixed the variance value on success, a PEAR_Error object otherwise
* @see calc()
* @see __sumdiff()
* @see count()
*/
function variance() {/*{{{*/
if ($this->_data == null)
return PEAR::raiseError("data has not been set");
return $this->__sumdiff(2) / ($this->count() - 1);
}/*}}}*/
/**
* Calculates the standard deviation (unbiased) of the data points in the set
* Handles cummulative data sets correctly
*
* @access public
* @return mixed the standard deviation on success, a PEAR_Error object otherwise
* @see calc()
* @see variance()
*/
function stDev() {/*{{{*/
if ($this->_data == null)
return PEAR::raiseError("data has not been set");
return sqrt($this->variance());
}/*}}}*/
/**
* Calculates the variance (unbiased) of the data points in the set
* given a fixed mean (average) value. Not used in calcBasic(), calcFull()
* or calc().
* Handles cummulative data sets correctly
*
* @access public
* @param numeric $mean the fixed mean value
* @return mixed the variance on success, a PEAR_Error object otherwise
* @see __sumdiff()
* @see count()
* @see variance()
*/
function varianceWithMean($mean) {/*{{{*/
if ($this->_data == null)
return PEAR::raiseError("data has not been set");
return $this->__sumdiff(2, $mean) / ($this->count() - 1);
}/*}}}*/
/**
* Calculates the standard deviation (unbiased) of the data points in the set
* given a fixed mean (average) value. Not used in calcBasic(), calcFull()
* or calc().
* Handles cummulative data sets correctly
*
* @access public
* @param numeric $mean the fixed mean value
* @return mixed the standard deviation on success, a PEAR_Error object otherwise
* @see varianceWithMean()
* @see stDev()
*/
function stDevWithMean($mean) {/*{{{*/
if ($this->_data == null)
return PEAR::raiseError("data has not been set");
return sqrt($this->varianceWithMean($mean));
}/*}}}*/
/**
* Calculates the absolute deviation of the data points in the set
* Handles cummulative data sets correctly
*
* @access public
* @return mixed the absolute deviation on success, a PEAR_Error object otherwise
* @see calc()
* @see __sumabsdev()
* @see count()
* @see absDevWithMean()
*/
function absDev() {/*{{{*/
if ($this->_data == null)
return PEAR::raiseError("data has not been set");
return $this->__sumabsdev() / $this->count();
}/*}}}*/
/**
* Calculates the absolute deviation of the data points in the set
* given a fixed mean (average) value. Not used in calcBasic(), calcFull()
* or calc().
* Handles cummulative data sets correctly
*
* @access public
* @param numeric $mean the fixed mean value
* @return mixed the absolute deviation on success, a PEAR_Error object otherwise
* @see __sumabsdev()
* @see absDev()
*/
function absDevWithMean($mean) {/*{{{*/
if ($this->_data == null)
return PEAR::raiseError("data has not been set");
return $this->__sumabsdev($mean) / $this->count();
}/*}}}*/
/**
* Calculates the skewness of the data distribution in the set
* The skewness measures the degree of asymmetry of a distribution,
* and is related to the third central moment of a distribution.
* A normal distribution has a skewness = 0
* A distribution with a tail off towards the high end of the scale
* (positive skew) has a skewness > 0
* A distribution with a tail off towards the low end of the scale
* (negative skew) has a skewness < 0
* Handles cummulative data sets correctly
*
* @access public
* @return mixed the skewness value on success, a PEAR_Error object otherwise
* @see __sumdiff()
* @see count()
* @see stDev()
* @see calc()
*/
function skewness() {/*{{{*/
if ($this->_data == null)
return PEAR::raiseError("data has not been set");
$skew = ($this->__sumdiff(3) / ($this->count() * pow($this->stDev(), 3)));
return $skew;
}/*}}}*/
/**
* Calculates the kurtosis of the data distribution in the set
* The kurtosis measures the degrees of peakedness of a distribution.
* It is also callesd the "excess" or "excess coefficient", and is
* a normalized form of the fourth central moment of a distribution.
* A normal distributions has kurtosis = 0
* A narrow and peaked (leptokurtic) distribution has a
* kurtosis > 0
* A flat and wide (platykurtic) distribution has a kurtosis < 0
* Handles cummulative data sets correctly
*
* @access public
* @return mixed the kurtosis value on success, a PEAR_Error object otherwise
* @see __sumdiff()
* @see count()
* @see stDev()
* @see calc()
*/
function kurtosis() {/*{{{*/
if ($this->_data == null)
return PEAR::raiseError("data has not been set");
$kurt = ($this->__sumdiff(4) / ($this->count() * pow($this->stDev(), 4))) - 3;
return $kurt;
}/*}}}*/
/**
* Calculates the median of a data set.
* The median is the value such that half of the points are below it
* in a sorted data set.
* If the number of values is odd, it is the middle item.
* If the number of values is even, is the average of the two middle items.
* Handles cummulative data sets correctly
*
* @access public
* @return mixed the median value on success, a PEAR_Error object otherwise
* @see count()
* @see calc()
*/
function median() {/*{{{*/
if ($this->_data == null)
return PEAR::raiseError("data has not been set");
$arr = array();
if ($this->_dataOption == STATS_DATA_CUMMULATIVE)
foreach ($this->_data as $val=>$freq)
$arr = array_pad($arr, count($arr) + $freq, $val);
else
$arr = $this->_data;
sort($arr);
$n = count($arr);
$h = intval($n / 2);
if ($n % 2 == 0) {
$median = ($arr[$h] + $arr[$h - 1]) / 2;
} else {
$median = $arr[$h + 1];
}
return $median;
}/*}}}*/
/**
* Calculates the mode of a data set.
* The mode is the value with the highest frequency in the data set.
* There can be more than one mode.
* Handles cummulative data sets correctly
*
* @access public
* @return mixed an array of mode value on success, a PEAR_Error object otherwise
* @see frequency()
* @see calc()
*/
function mode() {/*{{{*/
if ($this->_data == null)
return PEAR::raiseError("data has not been set");
if ($this->_dataOption == STATS_DATA_CUMMULATIVE)
$arr = $this->_data;
else
$arr = $this->frequency();
arsort($arr);
$mcount = 1;
foreach ($arr as $val=>$freq) {
if ($mcount == 1) {
$mode = array($val);
$mfreq = $freq;
$mcount++;
continue;
}
if ($mfreq == $freq)
$mode[] = $val;
if ($mfreq > $freq)
break;
}
return $mode;
}/*}}}*/
/**
* Calculates the nth central moment (m{n}) of a data set.
*
* The definition of a sample central moment is:
*
* m{n} = 1/N * SUM { (xi - avg)^n }
*
* where: N = sample size, avg = sample mean.
*/
function sampleCentralMoment($n) {/*{{{*/
if ($n == 1) {
return 0;
}
$count = $this->count();
if (PEAR::isError($count)) {
return $count;
}
if ($count == 0) {
return PEAR::raiseError("Cannot calculate {$n}th sample moment, there are zero data entries.");
}
$sum = $this->__sumdiff($n);
if (PEAR::isError($sum)) {
return $sum;
}
return ($sum / $count);
}/*}}}*/
/**
* Calculates the nth raw moment (m{n}) of a data set.
*
* The definition of a sample central moment is:
*
* m{n} = 1/N * SUM { xi^n }
*
* where: N = sample size, avg = sample mean.
*/
function sampleRawMoment($n) {/*{{{*/
$count = $this->count();
if (PEAR::isError($count)) {
return $count;
}
if ($count == 0) {
return PEAR::raiseError("Cannot calculate {$n}th raw moment, there are zero data entries.");
}
$sum = $this->sumN($n);
if (PEAR::isError($sum)) {
return $sum;
}
return ($sum / $count);
}/*}}}*/
/**
* Calculates the midrange of a data set.
* The midrange is the average of the minimum and maximum of the data set.
* Handles cummulative data sets correctly
*
* @access public
* @return mixed the midrange value on success, a PEAR_Error object otherwise
* @see min()
* @see max()
* @see calc()
*/
function midrange() {/*{{{*/
if ($this->_data == null)
return PEAR::raiseError("data has not been set");
return (($this->max() + $this->min()) / 2);
}/*}}}*/
/**
* Calculates the value frequency table of a data set.
* Handles cummulative data sets correctly
*
* @access public
* @return mixed an associative array of value=>frequency items on success, a PEAR_Error object otherwise
* @see min()
* @see max()
* @see calc()
*/
function frequency() {/*{{{*/
if ($this->_data == null)
return PEAR::raiseError("data has not been set");
if ($this->_dataOption == STATS_DATA_CUMMULATIVE) {
return $this->_data;
} else {
$freq = array();
foreach ($this->_data as $val)
$freq["$val"]++;
return $freq;
}
}/*}}}*/
/**
* Calculates the coefficient of variation of a data set.
* The coefficient of variation measures the spread of a set of data
* as a proportion of its mean. It is often expressed as a percentage.
* Handles cummulative data sets correctly
*
* @access public
* @return mixed the coefficient of variation on success, a PEAR_Error object otherwise
* @see stDev()
* @see mean()
* @see calc()
*/
function coeffOfVariation() {/*{{{*/
if ($this->_data == null)
return PEAR::raiseError("data has not been set");
return $this->stDev() / $this->mean();
}/*}}}*/
/**
* Calculates the standard error of the mean.
* It is the standard deviation of the sampling distribution of
* the mean. The formula is:
*
* S.E. Mean = SD / (N)^(1/2)
*
* This formula does not assume a normal distribution, and shows
* that the size of the standard error of the mean is inversely
* proportional to the square root of the sample size.
*
* @access public
* @return mixed the standard error of the mean on success, a PEAR_Error object otherwise
* @see stDev()
* @see count()
* @see calc()
*/
function stdErrorOfMean() {/*{{{*/
if ($this->_data == null)
return PEAR::raiseError("data has not been set");
return $this->stDev() / sqrt($this->count());
}/*}}}*/
/**
* Utility function to calculate: SUM { (xi - mean)^n }
*
* @access private
* @param numeric $power the exponent
* @param optional double $mean the data set mean value
* @return mixed the sum on success, a PEAR_Error object otherwise
*
* @see stDev()
* @see variaceWithMean();
* @see skewness();
* @see kurtosis();
*/
function __sumdiff($power, $mean=null) {/*{{{*/
if ($this->_data == null)
return PEAR::raiseError("data has not been set");
if (is_null($mean))
$mean = $this->mean();
$sdiff = 0;
if ($this->_dataOption == STATS_DATA_CUMMULATIVE) {
foreach ($this->_data as $val=>$freq)
$sdiff += $freq * pow((double)($val - $mean), (double)$power);
} else {
foreach ($this->_data as $val)
$sdiff += pow((double)($val - $mean), (double)$power);
}
return $sdiff;
}/*}}}*/
/**
* Utility function to calculate: SUM { | xi - mean | }
*
* @access private
* @param optional double $mean the mean value for the set or population
* @return mixed the sum on success, a PEAR_Error object otherwise
*
* @see absDev()
* @see absDevWithMean()
*/
function __sumabsdev($mean=null) {/*{{{*/
if ($this->_data == null)
return PEAR::raiseError("data has not been set");
if (is_null($mean))
$mean = $this->mean();
$sdev = 0;
if ($this->_dataOption == STATS_DATA_CUMMULATIVE) {
foreach ($this->_data as $val=>$freq)
$sdev += $freq * abs($val - $mean);
} else {
foreach ($this->_data as $val)
$sdev += abs($val - $mean);
}
return $sdev;
}/*}}}*/
/**
* Utility function to validate the data and modify it
* according to the current null handling option
*
* @access private
* @return mixed true on success, a PEAR_Error object otherwise
*
* @see setData()
*/
function _validate() {/*{{{*/
$flag = ($this->_dataOption == STATS_DATA_CUMMULATIVE);
foreach ($this->_data as $key=>$value) {
$d = ($flag) ? $key : $value;
$v = ($flag) ? $value : $key;
if (!is_numeric($d)) {
switch ($this->_nullOption) {
case STATS_IGNORE_NULL :
unset($this->_data["$key"]);
break;
case STATS_USE_NULL_AS_ZERO:
if ($flag) {
unset($this->_data["$key"]);
$this->_data[0] += $v;
} else {
$this->_data[$key] = 0;
}
break;
case STATS_REJECT_NULL :
default:
return PEAR::raiseError("data rejected, contains NULL values");
break;
}
}
}
return true;
}/*}}}*/
}/*}}}*/
// vim: ts=4:sw=4:et:
// vim6: fdl=0:
?>