1# statistics.tcl -- 2# 3# Package for basic statistical analysis 4# 5# version 0.1: initial implementation, january 2003 6# version 0.1.1: added linear regression, june 2004 7# version 0.1.2: border case in stdev taken care of 8# version 0.1.3: moved initialisation of CDF to first call, november 2004 9# version 0.3: added test for normality (as implemented by Torsten Reincke), march 2006 10# (also fixed an error in the export list) 11# version 0.4: added the multivariate linear regression procedures by 12# Eric Kemp-Benedict, february 2007 13# version 0.5: added the population standard deviation and variance, 14# as suggested by Dimitrios Zachariadis 15# version 0.6: added pdf and cdf procedures for various distributions 16# (provided by Eric Kemp-Benedict) 17 18package provide math::statistics 0.6.3 19package require math 20 21# ::math::statistics -- 22# Namespace holding the procedures and variables 23# 24 25namespace eval ::math::statistics { 26 # 27 # Safer: change to short procedures 28 # 29 namespace export mean min max number var stdev pvar pstdev basic-stats corr \ 30 histogram interval-mean-stdev t-test-mean quantiles \ 31 test-normal lillieforsFit \ 32 autocorr crosscorr filter map samplescount median \ 33 test-2x2 print-2x2 control-xbar test_xbar \ 34 control-Rchart test-Rchart 35 # 36 # Error messages 37 # 38 variable NEGSTDEV {Zero or negative standard deviation} 39 variable TOOFEWDATA {Too few or invalid data} 40 variable OUTOFRANGE {Argument out of range} 41 42 # 43 # Coefficients involved 44 # 45 variable factorNormalPdf 46 set factorNormalPdf [expr {sqrt(8.0*atan(1.0))}] 47 48 # xbar/R-charts: 49 # Data from: 50 # Peter W.M. John: 51 # Statistical methods in engineering and quality assurance 52 # Wiley and Sons, 1990 53 # 54 variable control_factors { 55 A2 {1.880 1.093 0.729 0.577 0.483 0.419 0.419} 56 D3 {0.0 0.0 0.0 0.0 0.0 0.076 0.076} 57 D4 {3.267 2.574 2.282 2.114 2.004 1.924 1.924} 58 } 59} 60 61# mean, min, max, number, var, stdev, pvar, pstdev -- 62# Return the mean (minimum, maximum) value of a list of numbers 63# or number of non-missing values 64# 65# Arguments: 66# type Type of value to be returned 67# values List of values to be examined 68# 69# Results: 70# Value that was required 71# 72# 73namespace eval ::math::statistics { 74 foreach type {mean min max number stdev var pstdev pvar} { 75 proc $type { values } "BasicStats $type \$values" 76 } 77 proc basic-stats { values } "BasicStats all \$values" 78} 79 80# BasicStats -- 81# Return the one or all of the basic statistical properties 82# 83# Arguments: 84# type Type of value to be returned 85# values List of values to be examined 86# 87# Results: 88# Value that was required 89# 90proc ::math::statistics::BasicStats { type values } { 91 variable TOOFEWDATA 92 93 if { [lsearch {all mean min max number stdev var pstdev pvar} $type] < 0 } { 94 return -code error \ 95 -errorcode ARG -errorinfo [list unknown type of statistic -- $type] \ 96 [list unknown type of statistic -- $type] 97 } 98 99 set min {} 100 set max {} 101 set mean {} 102 set stdev {} 103 set var {} 104 105 set sum 0.0 106 set sumsq 0.0 107 set number 0 108 set first {} 109 110 foreach value $values { 111 if { $value == {} } { 112 continue 113 } 114 set value [expr {double($value)}] 115 116 if { $first == {} } { 117 set first $value 118 } 119 120 incr number 121 set sum [expr {$sum+$value}] 122 set sumsq [expr {$sumsq+($value-$first)*($value-$first)}] 123 124 if { $min == {} || $value < $min } { 125 set min $value 126 } 127 if { $max == {} || $value > $max } { 128 set max $value 129 } 130 } 131 132 if { $number > 0 } { 133 set mean [expr {$sum/$number}] 134 } else { 135 return -code error -errorcode DATA -errorinfo $TOOFEWDATA $TOOFEWDATA 136 } 137 138 if { $number > 1 } { 139 set var [expr {($sumsq-($mean-$first)*($sum-$number*$first))/double($number-1)}] 140 # 141 # Take care of a rare situation: uniform data might 142 # cause a tiny negative difference 143 # 144 if { $var < 0.0 } { 145 set var 0.0 146 } 147 set stdev [expr {sqrt($var)}] 148 } 149 set pvar [expr {($sumsq-($mean-$first)*($sum-$number*$first))/double($number)}] 150 # 151 # Take care of a rare situation: uniform data might 152 # cause a tiny negative difference 153 # 154 if { $pvar < 0.0 } { 155 set pvar 0.0 156 } 157 set pstdev [expr {sqrt($pvar)}] 158 159 set all [list $mean $min $max $number $stdev $var $pstdev $pvar] 160 161 # 162 # Return the appropriate value 163 # 164 set $type 165} 166 167# histogram -- 168# Return histogram information from a list of numbers 169# 170# Arguments: 171# limits Upper limits for the buckets (in increasing order) 172# values List of values to be examined 173# 174# Results: 175# List of number of values in each bucket (length is one more than 176# the number of limits) 177# 178# 179proc ::math::statistics::histogram { limits values } { 180 181 if { [llength $limits] < 1 } { 182 return -code error -errorcode ARG -errorinfo {No limits given} {No limits given} 183 } 184 185 set limits [lsort -real -increasing $limits] 186 187 for { set index 0 } { $index <= [llength $limits] } { incr index } { 188 set buckets($index) 0 189 } 190 set last [llength $limits] 191 192 foreach value $values { 193 if { $value == {} } { 194 continue 195 } 196 197 set index 0 198 set found 0 199 foreach limit $limits { 200 if { $value <= $limit } { 201 set found 1 202 incr buckets($index) 203 break 204 } 205 incr index 206 } 207 208 if { $found == 0 } { 209 incr buckets($last) 210 } 211 } 212 213 set result {} 214 for { set index 0 } { $index <= $last } { incr index } { 215 lappend result $buckets($index) 216 } 217 218 return $result 219} 220 221# corr -- 222# Return the correlation coefficient of two sets of data 223# 224# Arguments: 225# data1 List with the first set of data 226# data2 List with the second set of data 227# 228# Result: 229# Correlation coefficient of the two 230# 231proc ::math::statistics::corr { data1 data2 } { 232 variable TOOFEWDATA 233 234 set number 0 235 set sum1 0.0 236 set sum2 0.0 237 set sumsq1 0.0 238 set sumsq2 0.0 239 set sumprod 0.0 240 241 foreach value1 $data1 value2 $data2 { 242 if { $value1 == {} || $value2 == {} } { 243 continue 244 } 245 set value1 [expr {double($value1)}] 246 set value2 [expr {double($value2)}] 247 248 set sum1 [expr {$sum1+$value1}] 249 set sum2 [expr {$sum2+$value2}] 250 set sumsq1 [expr {$sumsq1+$value1*$value1}] 251 set sumsq2 [expr {$sumsq2+$value2*$value2}] 252 set sumprod [expr {$sumprod+$value1*$value2}] 253 incr number 254 } 255 if { $number > 0 } { 256 set numerator [expr {$number*$sumprod-$sum1*$sum2}] 257 set denom1 [expr {sqrt($number*$sumsq1-$sum1*$sum1)}] 258 set denom2 [expr {sqrt($number*$sumsq2-$sum2*$sum2)}] 259 if { $denom1 != 0.0 && $denom2 != 0.0 } { 260 set corr_coeff [expr {$numerator/$denom1/$denom2}] 261 } elseif { $denom1 != 0.0 || $denom2 != 0.0 } { 262 set corr_coeff 0.0 ;# Uniform against non-uniform 263 } else { 264 set corr_coeff 1.0 ;# Both uniform 265 } 266 267 } else { 268 return -code error -errorcode DATA -errorinfo $TOOFEWDATA $TOOFEWDATA 269 } 270 return $corr_coeff 271} 272 273# lillieforsFit -- 274# Calculate the goodness of fit according to Lilliefors 275# (goodness of fit to a normal distribution) 276# 277# Arguments: 278# values List of values to be tested for normality 279# 280# Result: 281# Value of the statistic D 282# 283proc ::math::statistics::lillieforsFit {values} { 284 # 285 # calculate the goodness of fit according to Lilliefors 286 # (goodness of fit to a normal distribution) 287 # 288 # values -> list of values to be tested for normality 289 # (these values are sampled counts) 290 # 291 292 # calculate standard deviation and mean of the sample: 293 set n [llength $values] 294 if { $n < 5 } { 295 return -code error "Insufficient number of data (at least five required)" 296 } 297 set sd [stdev $values] 298 set mean [mean $values] 299 300 # sort the sample for further processing: 301 set values [lsort -real $values] 302 303 # standardize the sample data (Z-scores): 304 foreach x $values { 305 lappend stdData [expr {($x - $mean)/double($sd)}] 306 } 307 308 # compute the value of the distribution function at every sampled point: 309 foreach x $stdData { 310 lappend expData [pnorm $x] 311 } 312 313 # compute D+: 314 set i 0 315 foreach x $expData { 316 incr i 317 lappend dplus [expr {$i/double($n)-$x}] 318 } 319 set dplus [lindex [lsort -real $dplus] end] 320 321 # compute D-: 322 set i 0 323 foreach x $expData { 324 incr i 325 lappend dminus [expr {$x-($i-1)/double($n)}] 326 } 327 set dminus [lindex [lsort -real $dminus] end] 328 329 # Calculate the test statistic D 330 # by finding the maximal vertical difference 331 # between the sample and the expectation: 332 # 333 set D [expr {$dplus > $dminus ? $dplus : $dminus}] 334 335 # We now use the modified statistic Z, 336 # because D is only reliable 337 # if the p-value is smaller than 0.1 338 return [expr {$D * (sqrt($n) - 0.01 + 0.831/sqrt($n))}] 339} 340 341# pnorm -- 342# Calculate the cumulative distribution function (cdf) 343# for the standard normal distribution like in the statistical 344# software 'R' (mean=0 and sd=1) 345# 346# Arguments: 347# x Value fro which the cdf should be calculated 348# 349# Result: 350# Value of the statistic D 351# 352proc ::math::statistics::pnorm {x} { 353 # 354 # cumulative distribution function (cdf) 355 # for the standard normal distribution like in the statistical software 'R' 356 # (mean=0 and sd=1) 357 # 358 # x -> value for which the cdf should be calculated 359 # 360 set sum [expr {double($x)}] 361 set oldSum 0.0 362 set i 1 363 set denom 1.0 364 while {$sum != $oldSum} { 365 set oldSum $sum 366 incr i 2 367 set denom [expr {$denom*$i}] 368 #puts "$i - $denom" 369 set sum [expr {$oldSum + pow($x,$i)/$denom}] 370 } 371 return [expr {0.5 + $sum * exp(-0.5 * $x*$x - 0.91893853320467274178)}] 372} 373 374# pnorm_quicker -- 375# Calculate the cumulative distribution function (cdf) 376# for the standard normal distribution - quicker alternative 377# (less accurate) 378# 379# Arguments: 380# x Value for which the cdf should be calculated 381# 382# Result: 383# Value of the statistic D 384# 385proc ::math::statistics::pnorm_quicker {x} { 386 387 set n [expr {abs($x)}] 388 set n [expr {1.0 + $n*(0.04986735 + $n*(0.02114101 + $n*(0.00327763 \ 389 + $n*(0.0000380036 + $n*(0.0000488906 + $n*0.000005383)))))}] 390 set n [expr {1.0/pow($n,16)}] 391 # 392 if {$x >= 0} { 393 return [expr {1 - $n/2.0}] 394 } else { 395 return [expr {$n/2.0}] 396 } 397} 398 399# test-normal -- 400# Test for normality (using method Lilliefors) 401# 402# Arguments: 403# data Values that need to be tested 404# confidence ... 405# 406# Result: 407# 1 if of the statistic D 408# 409proc ::math::statistics::test-normal {data confidence} { 410 set D [lillieforsFit $data] 411 412 set Dcrit -- 413 if { abs($confidence-0.80) < 0.0001 } { 414 set Dcrit 0.741 415 } 416 if { abs($confidence-0.85) < 0.0001 } { 417 set Dcrit 0.775 418 } 419 if { abs($confidence-0.90) < 0.0001 } { 420 set Dcrit 0.819 421 } 422 if { abs($confidence-0.95) < 0.0001 } { 423 set Dcrit 0.895 424 } 425 if { abs($confidence-0.99) < 0.0001 } { 426 set Dcrit 1.035 427 } 428 if { $Dcrit != "--" } { 429 return [expr {$D > $Dcrit ? 1 : 0 }] 430 } else { 431 return -code error "Confidence level must be one of: 0.80, 0.85, 0.90, 0.95 or 0.99" 432 } 433} 434 435# t-test-mean -- 436# Test whether the mean value of a sample is in accordance with the 437# estimated normal distribution with a certain level of confidence 438# (Student's t test) 439# 440# Arguments: 441# data List of raw data values (small sample) 442# est_mean Estimated mean of the distribution 443# est_stdev Estimated stdev of the distribution 444# confidence Confidence level (0.95 or 0.99 for instance) 445# 446# Result: 447# 1 if the test is positive, 0 otherwise. If there are too few data, 448# returns an empty string 449# 450proc ::math::statistics::t-test-mean { data est_mean est_stdev confidence } { 451 variable NEGSTDEV 452 variable TOOFEWDATA 453 454 if { $est_stdev <= 0.0 } { 455 return -code error -errorcode ARG -errorinfo $NEGSTDEV $NEGSTDEV 456 } 457 458 set allstats [BasicStats all $data] 459 460 set conf2 [expr {(1.0+$confidence)/2.0}] 461 462 set sample_mean [lindex $allstats 0] 463 set sample_number [lindex $allstats 3] 464 465 if { $sample_number > 1 } { 466 set tzero [expr {abs($sample_mean-$est_mean)/$est_stdev * \ 467 sqrt($sample_number-1)}] 468 set degrees [expr {$sample_number-1}] 469 set prob [cdf-students-t $degrees $tzero] 470 471 return [expr {$prob<$conf2}] 472 473 } else { 474 return -code error -errorcode DATA -errorinfo $TOOFEWDATA $TOOFEWDATA 475 } 476} 477 478# interval-mean-stdev -- 479# Return the interval containing the mean value and one 480# containing the standard deviation with a certain 481# level of confidence (assuming a normal distribution) 482# 483# Arguments: 484# data List of raw data values 485# confidence Confidence level (0.95 or 0.99 for instance) 486# 487# Result: 488# List having the following elements: lower and upper bounds of 489# mean, lower and upper bounds of stdev 490# 491# 492proc ::math::statistics::interval-mean-stdev { data confidence } { 493 variable TOOFEWDATA 494 495 set allstats [BasicStats all $data] 496 497 set conf2 [expr {(1.0+$confidence)/2.0}] 498 set mean [lindex $allstats 0] 499 set number [lindex $allstats 3] 500 set stdev [lindex $allstats 4] 501 502 if { $number > 1 } { 503 set degrees [expr {$number-1}] 504 set student_t [expr {sqrt([Inverse-cdf-toms322 1 $degrees $conf2])}] 505 set mean_lower [expr {$mean-$student_t*$stdev/sqrt($number)}] 506 set mean_upper [expr {$mean+$student_t*$stdev/sqrt($number)}] 507 set stdev_lower {} 508 set stdev_upper {} 509 return [list $mean_lower $mean_upper $stdev_lower $stdev_upper] 510 } else { 511 return -code error -errorcode DATA -errorinfo $TOOFEWDATA $TOOFEWDATA 512 } 513} 514 515# quantiles -- 516# Return the quantiles for a given set of data or histogram 517# 518# Arguments: 519# (two arguments) 520# data List of raw data values 521# confidence Confidence level (0.95 or 0.99 for instance) 522# (three arguments) 523# limits List of upper limits from histogram 524# counts List of counts for for each interval in histogram 525# confidence Confidence level (0.95 or 0.99 for instance) 526# 527# Result: 528# List of quantiles 529# 530proc ::math::statistics::quantiles { arg1 arg2 {arg3 {}} } { 531 variable TOOFEWDATA 532 533 if { [catch { 534 if { $arg3 == {} } { 535 set result \ 536 [::math::statistics::QuantilesRawData $arg1 $arg2] 537 } else { 538 set result \ 539 [::math::statistics::QuantilesHistogram $arg1 $arg2 $arg3] 540 } 541 } msg] } { 542 return -code error -errorcode $msg $msg 543 } 544 return $result 545} 546 547# QuantilesRawData -- 548# Return the quantiles based on raw data 549# 550# Arguments: 551# data List of raw data values 552# confidence Confidence level (0.95 or 0.99 for instance) 553# 554# Result: 555# List of quantiles 556# 557proc ::math::statistics::QuantilesRawData { data confidence } { 558 variable TOOFEWDATA 559 variable OUTOFRANGE 560 561 if { [llength $confidence] <= 0 } { 562 return -code error -errorcode ARG "$TOOFEWDATA - quantiles" 563 } 564 565 if { [llength $data] <= 0 } { 566 return -code error -errorcode ARG "$TOOFEWDATA - raw data" 567 } 568 569 foreach cond $confidence { 570 if { $cond <= 0.0 || $cond >= 1.0 } { 571 return -code error -errorcode ARG "$OUTOFRANGE - quantiles" 572 } 573 } 574 575 # 576 # Sort the data first 577 # 578 set sorted_data [lsort -real -increasing $data] 579 580 # 581 # Determine the list element lower or equal to the quantile 582 # and return the corresponding value 583 # 584 set result {} 585 set number_data [llength $sorted_data] 586 foreach cond $confidence { 587 set elem [expr {round($number_data*$cond)-1}] 588 if { $elem < 0 } { 589 set elem 0 590 } 591 lappend result [lindex $sorted_data $elem] 592 } 593 594 return $result 595} 596 597# QuantilesHistogram -- 598# Return the quantiles based on histogram information only 599# 600# Arguments: 601# limits Upper limits for histogram intervals 602# counts Counts for each interval 603# confidence Confidence level (0.95 or 0.99 for instance) 604# 605# Result: 606# List of quantiles 607# 608proc ::math::statistics::QuantilesHistogram { limits counts confidence } { 609 variable TOOFEWDATA 610 variable OUTOFRANGE 611 612 if { [llength $confidence] <= 0 } { 613 return -code error -errorcode ARG "$TOOFEWDATA - quantiles" 614 } 615 616 if { [llength $confidence] <= 0 } { 617 return -code error -errorcode ARG "$TOOFEWDATA - histogram limits" 618 } 619 620 if { [llength $counts] <= [llength $limits] } { 621 return -code error -errorcode ARG "$TOOFEWDATA - histogram counts" 622 } 623 624 foreach cond $confidence { 625 if { $cond <= 0.0 || $cond >= 1.0 } { 626 return -code error -errorcode ARG "$OUTOFRANGE - quantiles" 627 } 628 } 629 630 # 631 # Accumulate the histogram counts first 632 # 633 set sum 0 634 set accumulated_counts {} 635 foreach count $counts { 636 set sum [expr {$sum+$count}] 637 lappend accumulated_counts $sum 638 } 639 set total_counts $sum 640 641 # 642 # Determine the list element lower or equal to the quantile 643 # and return the corresponding value (use interpolation if 644 # possible) 645 # 646 set result {} 647 foreach cond $confidence { 648 set found 0 649 set bound [expr {round($total_counts*$cond)}] 650 set lower_limit {} 651 set lower_count 0 652 foreach acc_count $accumulated_counts limit $limits { 653 if { $acc_count >= $bound } { 654 set found 1 655 break 656 } 657 set lower_limit $limit 658 set lower_count $acc_count 659 } 660 661 if { $lower_limit == {} || $limit == {} || $found == 0 } { 662 set quant $limit 663 if { $limit == {} } { 664 set quant $lower_limit 665 } 666 } else { 667 set quant [expr {$limit+($lower_limit-$limit) * 668 ($acc_count-$bound)/($acc_count-$lower_count)}] 669 } 670 lappend result $quant 671 } 672 673 return $result 674} 675 676# autocorr -- 677# Return the autocorrelation function (assuming equidistance between 678# samples) 679# 680# Arguments: 681# data Raw data for which the autocorrelation must be determined 682# 683# Result: 684# List of autocorrelation values (about 1/2 the number of raw data) 685# 686proc ::math::statistics::autocorr { data } { 687 variable TOOFEWDATA 688 689 if { [llength $data] <= 1 } { 690 return -code error -errorcode ARG "$TOOFEWDATA" 691 } 692 693 return [crosscorr $data $data] 694} 695 696# crosscorr -- 697# Return the cross-correlation function (assuming equidistance 698# between samples) 699# 700# Arguments: 701# data1 First set of raw data 702# data2 Second set of raw data 703# 704# Result: 705# List of cross-correlation values (about 1/2 the number of raw data) 706# 707# Note: 708# The number of data pairs is not kept constant - because tests 709# showed rather awkward results when it was kept constant. 710# 711proc ::math::statistics::crosscorr { data1 data2 } { 712 variable TOOFEWDATA 713 714 if { [llength $data1] <= 1 || [llength $data2] <= 1 } { 715 return -code error -errorcode ARG "$TOOFEWDATA" 716 } 717 718 # 719 # First determine the number of data pairs 720 # 721 set number1 [llength $data1] 722 set number2 [llength $data2] 723 724 set basic_stat1 [basic-stats $data1] 725 set basic_stat2 [basic-stats $data2] 726 set vmean1 [lindex $basic_stat1 0] 727 set vmean2 [lindex $basic_stat2 0] 728 set vvar1 [lindex $basic_stat1 end] 729 set vvar2 [lindex $basic_stat2 end] 730 731 set number_pairs $number1 732 if { $number1 > $number2 } { 733 set number_pairs $number2 734 } 735 set number_values $number_pairs 736 set number_delays [expr {$number_values/2.0}] 737 738 set scale [expr {sqrt($vvar1*$vvar2)}] 739 740 set result {} 741 for { set delay 0 } { $delay < $number_delays } { incr delay } { 742 set sumcross 0.0 743 set no_cross 0 744 for { set idx 0 } { $idx < $number_values } { incr idx } { 745 set value1 [lindex $data1 $idx] 746 set value2 [lindex $data2 [expr {$idx+$delay}]] 747 if { $value1 != {} && $value2 != {} } { 748 set sumcross \ 749 [expr {$sumcross+($value1-$vmean1)*($value2-$vmean2)}] 750 incr no_cross 751 } 752 } 753 lappend result [expr {$sumcross/($no_cross*$scale)}] 754 755 incr number_values -1 756 } 757 758 return $result 759} 760 761# mean-histogram-limits 762# Determine reasonable limits based on mean and standard deviation 763# for a histogram 764# 765# Arguments: 766# mean Mean of the data 767# stdev Standard deviation 768# number Number of limits to generate (defaults to 8) 769# 770# Result: 771# List of limits 772# 773proc ::math::statistics::mean-histogram-limits { mean stdev {number 8} } { 774 variable NEGSTDEV 775 776 if { $stdev <= 0.0 } { 777 return -code error -errorcode ARG "$NEGSTDEV" 778 } 779 if { $number < 1 } { 780 return -code error -errorcode ARG "Number of limits must be positive" 781 } 782 783 # 784 # Always: between mean-3.0*stdev and mean+3.0*stdev 785 # number = 2: -0.25, 0.25 786 # number = 3: -0.25, 0, 0.25 787 # number = 4: -1, -0.25, 0.25, 1 788 # number = 5: -1, -0.25, 0, 0.25, 1 789 # number = 6: -2, -1, -0.25, 0.25, 1, 2 790 # number = 7: -2, -1, -0.25, 0, 0.25, 1, 2 791 # number = 8: -3, -2, -1, -0.25, 0.25, 1, 2, 3 792 # 793 switch -- $number { 794 "1" { set limits {0.0} } 795 "2" { set limits {-0.25 0.25} } 796 "3" { set limits {-0.25 0.0 0.25} } 797 "4" { set limits {-1.0 -0.25 0.25 1.0} } 798 "5" { set limits {-1.0 -0.25 0.0 0.25 1.0} } 799 "6" { set limits {-2.0 -1.0 -0.25 0.25 1.0 2.0} } 800 "7" { set limits {-2.0 -1.0 -0.25 0.0 0.25 1.0 2.0} } 801 "8" { set limits {-3.0 -2.0 -1.0 -0.25 0.25 1.0 2.0 3.0} } 802 "9" { set limits {-3.0 -2.0 -1.0 -0.25 0.0 0.25 1.0 2.0 3.0} } 803 default { 804 set dlim [expr {6.0/double($number-1)}] 805 for {set i 0} {$i <$number} {incr i} { 806 lappend limits [expr {$dlim*($i-($number-1)/2.0)}] 807 } 808 } 809 } 810 811 set result {} 812 foreach limit $limits { 813 lappend result [expr {$mean+$limit*$stdev}] 814 } 815 816 return $result 817} 818 819# minmax-histogram-limits 820# Determine reasonable limits based on minimum and maximum bounds 821# for a histogram 822# 823# Arguments: 824# min Estimated minimum 825# max Estimated maximum 826# number Number of limits to generate (defaults to 8) 827# 828# Result: 829# List of limits 830# 831proc ::math::statistics::minmax-histogram-limits { min max {number 8} } { 832 variable NEGSTDEV 833 834 if { $number < 1 } { 835 return -code error -errorcode ARG "Number of limits must be positive" 836 } 837 if { $min >= $max } { 838 return -code error -errorcode ARG "Minimum must be lower than maximum" 839 } 840 841 set result {} 842 set dlim [expr {($max-$min)/double($number-1)}] 843 for {set i 0} {$i <$number} {incr i} { 844 lappend result [expr {$min+$dlim*$i}] 845 } 846 847 return $result 848} 849 850# linear-model 851# Determine the coefficients for a linear regression between 852# two series of data (the model: Y = A + B*X) 853# 854# Arguments: 855# xdata Series of independent (X) data 856# ydata Series of dependent (Y) data 857# intercept Whether to use an intercept or not (optional) 858# 859# Result: 860# List of the following items: 861# - (Estimate of) Intercept A 862# - (Estimate of) Slope B 863# - Standard deviation of Y relative to fit 864# - Correlation coefficient R2 865# - Number of degrees of freedom df 866# - Standard error of the intercept A 867# - Significance level of A 868# - Standard error of the slope B 869# - Significance level of B 870# 871# 872proc ::math::statistics::linear-model { xdata ydata {intercept 1} } { 873 variable TOOFEWDATA 874 875 if { [llength $xdata] < 3 } { 876 return -code error -errorcode ARG "$TOOFEWDATA: not enough independent data" 877 } 878 if { [llength $ydata] < 3 } { 879 return -code error -errorcode ARG "$TOOFEWDATA: not enough dependent data" 880 } 881 if { [llength $xdata] != [llength $ydata] } { 882 return -code error -errorcode ARG "$TOOFEWDATA: number of dependent data differs from number of independent data" 883 } 884 885 set sumx 0.0 886 set sumy 0.0 887 set sumx2 0.0 888 set sumy2 0.0 889 set sumxy 0.0 890 set df 0 891 foreach x $xdata y $ydata { 892 if { $x != "" && $y != "" } { 893 set sumx [expr {$sumx+$x}] 894 set sumy [expr {$sumy+$y}] 895 set sumx2 [expr {$sumx2+$x*$x}] 896 set sumy2 [expr {$sumy2+$y*$y}] 897 set sumxy [expr {$sumxy+$x*$y}] 898 incr df 899 } 900 } 901 902 if { $df <= 2 } { 903 return -code error -errorcode ARG "$TOOFEWDATA: too few valid data" 904 } 905 if { $sumx2 == 0.0 } { 906 return -code error -errorcode ARG "$TOOFEWDATA: independent values are all the same" 907 } 908 909 # 910 # Calculate the intermediate quantities 911 # 912 set sx [expr {$sumx2-$sumx*$sumx/$df}] 913 set sy [expr {$sumy2-$sumy*$sumy/$df}] 914 set sxy [expr {$sumxy-$sumx*$sumy/$df}] 915 916 # 917 # Calculate the coefficients 918 # 919 if { $intercept } { 920 set B [expr {$sxy/$sx}] 921 set A [expr {($sumy-$B*$sumx)/$df}] 922 } else { 923 set B [expr {$sumxy/$sumx2}] 924 set A 0.0 925 } 926 927 # 928 # Calculate the error estimates 929 # 930 set stdevY 0.0 931 set varY 0.0 932 933 if { $intercept } { 934 set ve [expr {$sy-$B*$sxy}] 935 if { $ve >= 0.0 } { 936 set varY [expr {$ve/($df-2)}] 937 } 938 } else { 939 set ve [expr {$sumy2-$B*$sumxy}] 940 if { $ve >= 0.0 } { 941 set varY [expr {$ve/($df-1)}] 942 } 943 } 944 set seY [expr {sqrt($varY)}] 945 946 if { $intercept } { 947 set R2 [expr {$sxy*$sxy/($sx*$sy)}] 948 set seA [expr {$seY*sqrt(1.0/$df+$sumx*$sumx/($sx*$df*$df))}] 949 set seB [expr {sqrt($varY/$sx)}] 950 set tA {} 951 set tB {} 952 if { $seA != 0.0 } { 953 set tA [expr {$A/$seA*sqrt($df-2)}] 954 } 955 if { $seB != 0.0 } { 956 set tB [expr {$B/$seB*sqrt($df-2)}] 957 } 958 } else { 959 set R2 [expr {$sumxy*$sumxy/($sumx2*$sumy2)}] 960 set seA {} 961 set tA {} 962 set tB {} 963 set seB [expr {sqrt($varY/$sumx2)}] 964 if { $seB != 0.0 } { 965 set tB [expr {$B/$seB*sqrt($df-1)}] 966 } 967 } 968 969 # 970 # Return the list of parameters 971 # 972 return [list $A $B $seY $R2 $df $seA $tA $seB $tB] 973} 974 975# linear-residuals 976# Determine the difference between actual data and predicted from 977# the linear model 978# 979# Arguments: 980# xdata Series of independent (X) data 981# ydata Series of dependent (Y) data 982# intercept Whether to use an intercept or not (optional) 983# 984# Result: 985# List of differences 986# 987proc ::math::statistics::linear-residuals { xdata ydata {intercept 1} } { 988 variable TOOFEWDATA 989 990 if { [llength $xdata] < 3 } { 991 return -code error -errorcode ARG "$TOOFEWDATA: no independent data" 992 } 993 if { [llength $ydata] < 3 } { 994 return -code error -errorcode ARG "$TOOFEWDATA: no dependent data" 995 } 996 if { [llength $xdata] != [llength $ydata] } { 997 return -code error -errorcode ARG "$TOOFEWDATA: number of dependent data differs from number of independent data" 998 } 999 1000 foreach {A B} [linear-model $xdata $ydata $intercept] {break} 1001 1002 set result {} 1003 foreach x $xdata y $ydata { 1004 set residue [expr {$y-$A-$B*$x}] 1005 lappend result $residue 1006 } 1007 return $result 1008} 1009 1010# median 1011# Determine the median from a list of data 1012# 1013# Arguments: 1014# data (Unsorted) list of data 1015# 1016# Result: 1017# Median (either the middle value or the mean of two values in the 1018# middle) 1019# 1020# Note: 1021# Adapted from the Wiki page "Stats", code provided by JPS 1022# 1023proc ::math::statistics::median { data } { 1024 set org_data $data 1025 set data {} 1026 foreach value $org_data { 1027 if { $value != {} } { 1028 lappend data $value 1029 } 1030 } 1031 set len [llength $data] 1032 1033 set data [lsort -real $data] 1034 if { $len % 2 } { 1035 lindex $data [expr {($len-1)/2}] 1036 } else { 1037 expr {([lindex $data [expr {($len / 2) - 1}]] \ 1038 + [lindex $data [expr {$len / 2}]]) / 2.0} 1039 } 1040} 1041 1042# test-2x2 -- 1043# Compute the chi-square statistic for a 2x2 table 1044# 1045# Arguments: 1046# a Element upper-left 1047# b Element upper-right 1048# c Element lower-left 1049# d Element lower-right 1050# Return value: 1051# Chi-square 1052# Note: 1053# There is only one degree of freedom - this is important 1054# when comparing the value to the tabulated values 1055# of chi-square 1056# 1057proc ::math::statistics::test-2x2 { a b c d } { 1058 set ab [expr {$a+$b}] 1059 set ac [expr {$a+$c}] 1060 set bd [expr {$b+$d}] 1061 set cd [expr {$c+$d}] 1062 set N [expr {$a+$b+$c+$d}] 1063 set det [expr {$a*$d-$b*$c}] 1064 set result [expr {double($N*$det*$det)/double($ab*$cd*$ac*$bd)}] 1065} 1066 1067# print-2x2 -- 1068# Print a 2x2 table 1069# 1070# Arguments: 1071# a Element upper-left 1072# b Element upper-right 1073# c Element lower-left 1074# d Element lower-right 1075# Return value: 1076# Printed version with marginals 1077# 1078proc ::math::statistics::print-2x2 { a b c d } { 1079 set ab [expr {$a+$b}] 1080 set ac [expr {$a+$c}] 1081 set bd [expr {$b+$d}] 1082 set cd [expr {$c+$d}] 1083 set N [expr {$a+$b+$c+$d}] 1084 set chisq [test-2x2 $a $b $c $d] 1085 1086 set line [string repeat - 10] 1087 set result [format "%10d%10d | %10d\n" $a $b $ab] 1088 append result [format "%10d%10d | %10d\n" $c $d $cd] 1089 append result [format "%10s%10s + %10s\n" $line $line $line] 1090 append result [format "%10d%10d | %10d\n" $ac $bd $N] 1091 append result "Chisquare = $chisq\n" 1092 append result "Difference is significant?\n" 1093 append result " at 95%: [expr {$chisq<3.84146? "no":"yes"}]\n" 1094 append result " at 99%: [expr {$chisq<6.63490? "no":"yes"}]" 1095} 1096 1097# control-xbar -- 1098# Determine the control lines for an x-bar chart 1099# 1100# Arguments: 1101# data List of observed values (at least 20*nsamples) 1102# nsamples Number of data per subsamples (default: 4) 1103# Return value: 1104# List of: mean, lower limit, upper limit, number of data per 1105# subsample. Can be used in the test-xbar procedure 1106# 1107proc ::math::statistics::control-xbar { data {nsamples 4} } { 1108 variable TOOFEWDATA 1109 variable control_factors 1110 1111 # 1112 # Check the number of data 1113 # 1114 if { $nsamples <= 1 } { 1115 return -code error -errorcode DATA -errorinfo $OUTOFRANGE \ 1116 "Number of data per subsample must be at least 2" 1117 } 1118 if { [llength $data] < 20*$nsamples } { 1119 return -code error -errorcode DATA -errorinfo $TOOFEWDATA $TOOFEWDATA 1120 } 1121 1122 set nogroups [expr {[llength $data]/$nsamples}] 1123 set mrange 0.0 1124 set xmeans 0.0 1125 for { set i 0 } { $i < $nogroups } { incr i } { 1126 set subsample [lrange $data [expr {$i*$nsamples}] [expr {$i*$nsamples+$nsamples-1}]] 1127 1128 set xmean 0.0 1129 set xmin [lindex $subsample 0] 1130 set xmax $xmin 1131 foreach d $subsample { 1132 set xmean [expr {$xmean+$d}] 1133 set xmin [expr {$xmin<$d? $xmin : $d}] 1134 set xmax [expr {$xmax>$d? $xmax : $d}] 1135 } 1136 set xmean [expr {$xmean/double($nsamples)}] 1137 1138 set xmeans [expr {$xmeans+$xmean}] 1139 set mrange [expr {$mrange+($xmax-$xmin)}] 1140 } 1141 1142 # 1143 # Determine the control lines 1144 # 1145 set xmeans [expr {$xmeans/double($nogroups)}] 1146 set mrange [expr {$mrange/double($nogroups)}] 1147 set A2 [lindex [lindex $control_factors 1] $nsamples] 1148 if { $A2 == "" } { set A2 [lindex [lindex $control_factors 1] end] } 1149 1150 return [list $xmeans [expr {$xmeans-$A2*$mrange}] \ 1151 [expr {$xmeans+$A2*$mrange}] $nsamples] 1152} 1153 1154# test-xbar -- 1155# Determine if any data points lie outside the x-bar control limits 1156# 1157# Arguments: 1158# control List returned by control-xbar with control data 1159# data List of observed values 1160# Return value: 1161# Indices of any subsamples that violate the control limits 1162# 1163proc ::math::statistics::test-xbar { control data } { 1164 foreach {xmean xlower xupper nsamples} $control {break} 1165 1166 if { [llength $data] < 1 } { 1167 return -code error -errorcode DATA -errorinfo $TOOFEWDATA $TOOFEWDATA 1168 } 1169 1170 set nogroups [expr {[llength $data]/$nsamples}] 1171 if { $nogroups <= 0 } { 1172 set nogroup 1 1173 set nsamples [llength $data] 1174 } 1175 1176 set result {} 1177 1178 for { set i 0 } { $i < $nogroups } { incr i } { 1179 set subsample [lrange $data [expr {$i*$nsamples}] [expr {$i*$nsamples+$nsamples-1}]] 1180 1181 set xmean 0.0 1182 foreach d $subsample { 1183 set xmean [expr {$xmean+$d}] 1184 } 1185 set xmean [expr {$xmean/double($nsamples)}] 1186 1187 if { $xmean < $xlower } { lappend result $i } 1188 if { $xmean > $xupper } { lappend result $i } 1189 } 1190 1191 return $result 1192} 1193 1194# control-Rchart -- 1195# Determine the control lines for an R chart 1196# 1197# Arguments: 1198# data List of observed values (at least 20*nsamples) 1199# nsamples Number of data per subsamples (default: 4) 1200# Return value: 1201# List of: mean range, lower limit, upper limit, number of data per 1202# subsample. Can be used in the test-Rchart procedure 1203# 1204proc ::math::statistics::control-Rchart { data {nsamples 4} } { 1205 variable TOOFEWDATA 1206 variable control_factors 1207 1208 # 1209 # Check the number of data 1210 # 1211 if { $nsamples <= 1 } { 1212 return -code error -errorcode DATA -errorinfo $OUTOFRANGE \ 1213 "Number of data per subsample must be at least 2" 1214 } 1215 if { [llength $data] < 20*$nsamples } { 1216 return -code error -errorcode DATA -errorinfo $TOOFEWDATA $TOOFEWDATA 1217 } 1218 1219 set nogroups [expr {[llength $data]/$nsamples}] 1220 set mrange 0.0 1221 for { set i 0 } { $i < $nogroups } { incr i } { 1222 set subsample [lrange $data [expr {$i*$nsamples}] [expr {$i*$nsamples+$nsamples-1}]] 1223 1224 set xmin [lindex $subsample 0] 1225 set xmax $xmin 1226 foreach d $subsample { 1227 set xmin [expr {$xmin<$d? $xmin : $d}] 1228 set xmax [expr {$xmax>$d? $xmax : $d}] 1229 } 1230 set mrange [expr {$mrange+($xmax-$xmin)}] 1231 } 1232 1233 # 1234 # Determine the control lines 1235 # 1236 set mrange [expr {$mrange/double($nogroups)}] 1237 set D3 [lindex [lindex $control_factors 3] $nsamples] 1238 set D4 [lindex [lindex $control_factors 5] $nsamples] 1239 if { $D3 == "" } { set D3 [lindex [lindex $control_factors 3] end] } 1240 if { $D4 == "" } { set D4 [lindex [lindex $control_factors 5] end] } 1241 1242 return [list $mrange [expr {$D3*$mrange}] \ 1243 [expr {$D4*$mrange}] $nsamples] 1244} 1245 1246# test-Rchart -- 1247# Determine if any data points lie outside the R-chart control limits 1248# 1249# Arguments: 1250# control List returned by control-xbar with control data 1251# data List of observed values 1252# Return value: 1253# Indices of any subsamples that violate the control limits 1254# 1255proc ::math::statistics::test-Rchart { control data } { 1256 foreach {rmean rlower rupper nsamples} $control {break} 1257 1258 # 1259 # Check the number of data 1260 # 1261 if { [llength $data] < 1 } { 1262 return -code error -errorcode DATA -errorinfo $TOOFEWDATA $TOOFEWDATA 1263 } 1264 1265 set nogroups [expr {[llength $data]/$nsamples}] 1266 1267 set result {} 1268 for { set i 0 } { $i < $nogroups } { incr i } { 1269 set subsample [lrange $data [expr {$i*$nsamples}] [expr {$i*$nsamples+$nsamples-1}]] 1270 1271 set xmin [lindex $subsample 0] 1272 set xmax $xmin 1273 foreach d $subsample { 1274 set xmin [expr {$xmin<$d? $xmin : $d}] 1275 set xmax [expr {$xmax>$d? $xmax : $d}] 1276 } 1277 set range [expr {$xmax-$xmin}] 1278 1279 if { $range < $rlower } { lappend result $i } 1280 if { $range > $rupper } { lappend result $i } 1281 } 1282 1283 return $result 1284} 1285 1286 1287 1288# 1289# Load the auxiliary scripts 1290# 1291source [file join [file dirname [info script]] pdf_stat.tcl] 1292source [file join [file dirname [info script]] plotstat.tcl] 1293source [file join [file dirname [info script]] liststat.tcl] 1294source [file join [file dirname [info script]] mvlinreg.tcl] 1295 1296# 1297# Define the tables 1298# 1299namespace eval ::math::statistics { 1300 variable student_t_table 1301 1302 # set student_t_table [::math::interpolation::defineTable student_t 1303 # {X 80% 90% 95% 98% 99%} 1304 # {X 0.80 0.90 0.95 0.98 0.99 1305 # 1 3.078 6.314 12.706 31.821 63.657 1306 # 2 1.886 2.920 4.303 6.965 9.925 1307 # 3 1.638 2.353 3.182 4.541 5.841 1308 # 5 1.476 2.015 2.571 3.365 4.032 1309 # 10 1.372 1.812 2.228 2.764 3.169 1310 # 15 1.341 1.753 2.131 2.602 2.947 1311 # 20 1.325 1.725 2.086 2.528 2.845 1312 # 30 1.310 1.697 2.042 2.457 2.750 1313 # 60 1.296 1.671 2.000 2.390 2.660 1314 # 1.0e9 1.282 1.645 1.960 2.326 2.576 }] 1315 1316 # PM 1317 #set chi_squared_table [::math::interpolation::defineTable chi_square 1318 # ... 1319} 1320 1321# 1322# Simple test code 1323# 1324if { [info exists ::argv0] && ([file tail [info script]] == [file tail $::argv0]) } { 1325 1326 console show 1327 puts [interp aliases] 1328 1329 set values {1 1 1 1 {}} 1330 puts [::math::statistics::basic-stats $values] 1331 set values {1 2 3 4} 1332 puts [::math::statistics::basic-stats $values] 1333 set values {1 -1 1 -2} 1334 puts [::math::statistics::basic-stats $values] 1335 puts [::math::statistics::mean $values] 1336 puts [::math::statistics::min $values] 1337 puts [::math::statistics::max $values] 1338 puts [::math::statistics::number $values] 1339 puts [::math::statistics::stdev $values] 1340 puts [::math::statistics::var $values] 1341 1342 set novals 100 1343 #set maxvals 100001 1344 set maxvals 1001 1345 while { $novals < $maxvals } { 1346 set values {} 1347 for { set i 0 } { $i < $novals } { incr i } { 1348 lappend values [expr {rand()}] 1349 } 1350 puts [::math::statistics::basic-stats $values] 1351 puts [::math::statistics::histogram {0.0 0.2 0.4 0.6 0.8 1.0} $values] 1352 set novals [expr {$novals*10}] 1353 } 1354 1355 puts "Normal distribution:" 1356 puts "X=0: [::math::statistics::pdf-normal 0.0 1.0 0.0]" 1357 puts "X=1: [::math::statistics::pdf-normal 0.0 1.0 1.0]" 1358 puts "X=-1: [::math::statistics::pdf-normal 0.0 1.0 -1.0]" 1359 1360 set data1 {0.0 1.0 3.0 4.0 100.0 -23.0} 1361 set data2 {1.0 2.0 4.0 5.0 101.0 -22.0} 1362 set data3 {0.0 2.0 6.0 8.0 200.0 -46.0} 1363 set data4 {2.0 6.0 8.0 200.0 -46.0 1.0} 1364 set data5 {100.0 99.0 90.0 93.0 5.0 123.0} 1365 puts "Correlation data1 and data1: [::math::statistics::corr $data1 $data1]" 1366 puts "Correlation data1 and data2: [::math::statistics::corr $data1 $data2]" 1367 puts "Correlation data1 and data3: [::math::statistics::corr $data1 $data3]" 1368 puts "Correlation data1 and data4: [::math::statistics::corr $data1 $data4]" 1369 puts "Correlation data1 and data5: [::math::statistics::corr $data1 $data5]" 1370 1371 # set data {1.0 2.0 2.3 4.0 3.4 1.2 0.6 5.6} 1372 # puts [::math::statistics::basicStats $data] 1373 # puts [::math::statistics::interval-mean-stdev $data 0.90] 1374 # puts [::math::statistics::interval-mean-stdev $data 0.95] 1375 # puts [::math::statistics::interval-mean-stdev $data 0.99] 1376 1377 # puts "\nTest mean values:" 1378 # puts [::math::statistics::test-mean $data 2.0 0.1 0.90] 1379 # puts [::math::statistics::test-mean $data 2.0 0.5 0.90] 1380 # puts [::math::statistics::test-mean $data 2.0 1.0 0.90] 1381 # puts [::math::statistics::test-mean $data 2.0 2.0 0.90] 1382 1383 set rc [catch { 1384 set m [::math::statistics::mean {}] 1385 } msg ] ; # {} 1386 puts "Result: $rc $msg" 1387 1388 puts "\nTest quantiles:" 1389 set data {1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0} 1390 set quantiles {0.11 0.21 0.51 0.91 0.99} 1391 set limits {2.1 4.1 6.1 8.1} 1392 puts [::math::statistics::quantiles $data $quantiles] 1393 1394 set histogram [::math::statistics::histogram $limits $data] 1395 puts [::math::statistics::quantiles $limits $histogram $quantiles] 1396 1397 puts "\nTest autocorrelation:" 1398 set data {1.0 -1.0 1.0 -1.0 1.0 -1.0 1.0 -1.0 1.0} 1399 puts [::math::statistics::autocorr $data] 1400 set data {1.0 -1.1 2.0 -0.6 3.0 -4.0 0.5 0.9 -1.0} 1401 puts [::math::statistics::autocorr $data] 1402 1403 puts "\nTest histogram limits:" 1404 puts [::math::statistics::mean-histogram-limits 1.0 1.0] 1405 puts [::math::statistics::mean-histogram-limits 1.0 1.0 4] 1406 puts [::math::statistics::minmax-histogram-limits 1.0 10.0 10] 1407 1408} 1409 1410# 1411# Test xbar/R-chart procedures 1412# 1413if { 0 } { 1414 set data {} 1415 for { set i 0 } { $i < 500 } { incr i } { 1416 lappend data [expr {rand()}] 1417 } 1418 set limits [::math::statistics::control-xbar $data] 1419 puts $limits 1420 1421 puts "Outliers? [::math::statistics::test-xbar $limits $data]" 1422 1423 set newdata {1.0 1.0 1.0 1.0 0.5 0.5 0.5 0.5 10.0 10.0 10.0 10.0} 1424 puts "Outliers? [::math::statistics::test-xbar $limits $newdata] -- 0 2" 1425 1426 set limits [::math::statistics::control-Rchart $data] 1427 puts $limits 1428 1429 puts "Outliers? [::math::statistics::test-Rchart $limits $data]" 1430 1431 set newdata {0.0 1.0 2.0 1.0 0.4 0.5 0.6 0.5 10.0 0.0 10.0 10.0} 1432 puts "Outliers? [::math::statistics::test-Rchart $limits $newdata] -- 0 2" 1433} 1434 1435