1# statistics.tcl --
2#
3#    Package for basic statistical analysis
4#
5# version 0.1:   initial implementation, january 2003
6# version 0.1.1: added linear regression, june 2004
7# version 0.1.2: border case in stdev taken care of
8# version 0.1.3: moved initialisation of CDF to first call, november 2004
9# version 0.3:   added test for normality (as implemented by Torsten Reincke), march 2006
10#                (also fixed an error in the export list)
11# version 0.4:   added the multivariate linear regression procedures by
12#                Eric Kemp-Benedict, february 2007
13# version 0.5:   added the population standard deviation and variance,
14#                as suggested by Dimitrios Zachariadis
15# version 0.6:   added pdf and cdf procedures for various distributions
16#                (provided by Eric Kemp-Benedict)
17
18package provide math::statistics 0.6.3
19package require math
20
21# ::math::statistics --
22#   Namespace holding the procedures and variables
23#
24
25namespace eval ::math::statistics {
26    #
27    # Safer: change to short procedures
28    #
29    namespace export mean min max number var stdev pvar pstdev basic-stats corr \
30	    histogram interval-mean-stdev t-test-mean quantiles \
31	    test-normal lillieforsFit \
32	    autocorr crosscorr filter map samplescount median \
33	    test-2x2 print-2x2 control-xbar test_xbar \
34	    control-Rchart test-Rchart
35    #
36    # Error messages
37    #
38    variable NEGSTDEV   {Zero or negative standard deviation}
39    variable TOOFEWDATA {Too few or invalid data}
40    variable OUTOFRANGE {Argument out of range}
41
42    #
43    # Coefficients involved
44    #
45    variable factorNormalPdf
46    set factorNormalPdf [expr {sqrt(8.0*atan(1.0))}]
47
48    # xbar/R-charts:
49    # Data from:
50    #    Peter W.M. John:
51    #    Statistical methods in engineering and quality assurance
52    #    Wiley and Sons, 1990
53    #
54    variable control_factors {
55        A2 {1.880 1.093 0.729 0.577 0.483 0.419 0.419}
56        D3 {0.0   0.0   0.0   0.0   0.0   0.076 0.076}
57        D4 {3.267 2.574 2.282 2.114 2.004 1.924 1.924}
58    }
59}
60
61# mean, min, max, number, var, stdev, pvar, pstdev --
62#    Return the mean (minimum, maximum) value of a list of numbers
63#    or number of non-missing values
64#
65# Arguments:
66#    type     Type of value to be returned
67#    values   List of values to be examined
68#
69# Results:
70#    Value that was required
71#
72#
73namespace eval ::math::statistics {
74    foreach type {mean min max number stdev var pstdev pvar} {
75	proc $type { values } "BasicStats $type \$values"
76    }
77    proc basic-stats { values } "BasicStats all \$values"
78}
79
80# BasicStats --
81#    Return the one or all of the basic statistical properties
82#
83# Arguments:
84#    type     Type of value to be returned
85#    values   List of values to be examined
86#
87# Results:
88#    Value that was required
89#
90proc ::math::statistics::BasicStats { type values } {
91    variable TOOFEWDATA
92
93    if { [lsearch {all mean min max number stdev var pstdev pvar} $type] < 0 } {
94	return -code error \
95		-errorcode ARG -errorinfo [list unknown type of statistic -- $type] \
96		[list unknown type of statistic -- $type]
97    }
98
99    set min    {}
100    set max    {}
101    set mean   {}
102    set stdev  {}
103    set var    {}
104
105    set sum    0.0
106    set sumsq  0.0
107    set number 0
108    set first  {}
109
110    foreach value $values {
111	if { $value == {} } {
112	    continue
113	}
114	set value [expr {double($value)}]
115
116	if { $first == {} } {
117	    set first $value
118	}
119
120	incr number
121	set  sum    [expr {$sum+$value}]
122	set  sumsq  [expr {$sumsq+($value-$first)*($value-$first)}]
123
124	if { $min == {} || $value < $min } {
125	    set min $value
126	}
127	if { $max == {} || $value > $max } {
128	    set max $value
129	}
130    }
131
132    if { $number > 0 } {
133	set mean [expr {$sum/$number}]
134    } else {
135	return -code error -errorcode DATA -errorinfo $TOOFEWDATA $TOOFEWDATA
136    }
137
138    if { $number > 1 } {
139	set var    [expr {($sumsq-($mean-$first)*($sum-$number*$first))/double($number-1)}]
140        #
141        # Take care of a rare situation: uniform data might
142        # cause a tiny negative difference
143        #
144        if { $var < 0.0 } {
145           set var 0.0
146        }
147	set stdev  [expr {sqrt($var)}]
148    }
149	set pvar [expr {($sumsq-($mean-$first)*($sum-$number*$first))/double($number)}]
150        #
151        # Take care of a rare situation: uniform data might
152        # cause a tiny negative difference
153        #
154        if { $pvar < 0.0 } {
155           set pvar 0.0
156        }
157	set pstdev  [expr {sqrt($pvar)}]
158
159    set all [list $mean $min $max $number $stdev $var $pstdev $pvar]
160
161    #
162    # Return the appropriate value
163    #
164    set $type
165}
166
167# histogram --
168#    Return histogram information from a list of numbers
169#
170# Arguments:
171#    limits   Upper limits for the buckets (in increasing order)
172#    values   List of values to be examined
173#
174# Results:
175#    List of number of values in each bucket (length is one more than
176#    the number of limits)
177#
178#
179proc ::math::statistics::histogram { limits values } {
180
181    if { [llength $limits] < 1 } {
182	return -code error -errorcode ARG -errorinfo {No limits given} {No limits given}
183    }
184
185    set limits [lsort -real -increasing $limits]
186
187    for { set index 0 } { $index <= [llength $limits] } { incr index } {
188	set buckets($index) 0
189    }
190    set last [llength $limits]
191
192    foreach value $values {
193	if { $value == {} } {
194	    continue
195	}
196
197	set index 0
198	set found 0
199	foreach limit $limits {
200	    if { $value <= $limit } {
201		set found 1
202		incr buckets($index)
203		break
204	    }
205	    incr index
206	}
207
208	if { $found == 0 } {
209	    incr buckets($last)
210	}
211    }
212
213    set result {}
214    for { set index 0 } { $index <= $last } { incr index } {
215	lappend result $buckets($index)
216    }
217
218    return $result
219}
220
221# corr --
222#    Return the correlation coefficient of two sets of data
223#
224# Arguments:
225#    data1    List with the first set of data
226#    data2    List with the second set of data
227#
228# Result:
229#    Correlation coefficient of the two
230#
231proc ::math::statistics::corr { data1 data2 } {
232    variable TOOFEWDATA
233
234    set number  0
235    set sum1    0.0
236    set sum2    0.0
237    set sumsq1  0.0
238    set sumsq2  0.0
239    set sumprod 0.0
240
241    foreach value1 $data1 value2 $data2 {
242	if { $value1 == {} || $value2 == {} } {
243	    continue
244	}
245	set  value1  [expr {double($value1)}]
246	set  value2  [expr {double($value2)}]
247
248	set  sum1    [expr {$sum1+$value1}]
249	set  sum2    [expr {$sum2+$value2}]
250	set  sumsq1  [expr {$sumsq1+$value1*$value1}]
251	set  sumsq2  [expr {$sumsq2+$value2*$value2}]
252	set  sumprod [expr {$sumprod+$value1*$value2}]
253	incr number
254    }
255    if { $number > 0 } {
256	set numerator   [expr {$number*$sumprod-$sum1*$sum2}]
257	set denom1      [expr {sqrt($number*$sumsq1-$sum1*$sum1)}]
258	set denom2      [expr {sqrt($number*$sumsq2-$sum2*$sum2)}]
259	if { $denom1 != 0.0 && $denom2 != 0.0 } {
260	    set corr_coeff  [expr {$numerator/$denom1/$denom2}]
261	} elseif { $denom1 != 0.0 || $denom2 != 0.0 } {
262	    set corr_coeff  0.0 ;# Uniform against non-uniform
263	} else {
264	    set corr_coeff  1.0 ;# Both uniform
265	}
266
267    } else {
268	return -code error -errorcode DATA -errorinfo $TOOFEWDATA $TOOFEWDATA
269    }
270    return $corr_coeff
271}
272
273# lillieforsFit --
274#     Calculate the goodness of fit according to Lilliefors
275#     (goodness of fit to a normal distribution)
276#
277# Arguments:
278#     values          List of values to be tested for normality
279#
280# Result:
281#     Value of the statistic D
282#
283proc ::math::statistics::lillieforsFit {values} {
284    #
285    # calculate the goodness of fit according to Lilliefors
286    # (goodness of fit to a normal distribution)
287    #
288    # values -> list of values to be tested for normality
289    # (these values are sampled counts)
290    #
291
292    # calculate standard deviation and mean of the sample:
293    set n [llength $values]
294    if { $n < 5 } {
295        return -code error "Insufficient number of data (at least five required)"
296    }
297    set sd   [stdev $values]
298    set mean [mean $values]
299
300    # sort the sample for further processing:
301    set values [lsort -real $values]
302
303    # standardize the sample data (Z-scores):
304    foreach x $values {
305        lappend stdData [expr {($x - $mean)/double($sd)}]
306    }
307
308    # compute the value of the distribution function at every sampled point:
309    foreach x $stdData {
310        lappend expData [pnorm $x]
311    }
312
313    # compute D+:
314    set i 0
315    foreach x $expData {
316        incr i
317        lappend dplus [expr {$i/double($n)-$x}]
318    }
319    set dplus [lindex [lsort -real $dplus] end]
320
321    # compute D-:
322    set i 0
323    foreach x $expData {
324        incr i
325        lappend dminus [expr {$x-($i-1)/double($n)}]
326    }
327    set dminus [lindex [lsort -real $dminus] end]
328
329    # Calculate the test statistic D
330    # by finding the maximal vertical difference
331    # between the sample and the expectation:
332    #
333    set D [expr {$dplus > $dminus ? $dplus : $dminus}]
334
335    # We now use the modified statistic Z,
336    # because D is only reliable
337    # if the p-value is smaller than 0.1
338    return [expr {$D * (sqrt($n) - 0.01 + 0.831/sqrt($n))}]
339}
340
341# pnorm --
342#     Calculate the cumulative distribution function (cdf)
343#     for the standard normal distribution like in the statistical
344#     software 'R' (mean=0 and sd=1)
345#
346# Arguments:
347#     x               Value fro which the cdf should be calculated
348#
349# Result:
350#     Value of the statistic D
351#
352proc ::math::statistics::pnorm {x} {
353    #
354    # cumulative distribution function (cdf)
355    # for the standard normal distribution like in the statistical software 'R'
356    # (mean=0 and sd=1)
357    #
358    # x -> value for which the cdf should be calculated
359    #
360    set sum [expr {double($x)}]
361    set oldSum 0.0
362    set i 1
363    set denom 1.0
364    while {$sum != $oldSum} {
365            set oldSum $sum
366            incr i 2
367            set denom [expr {$denom*$i}]
368            #puts "$i - $denom"
369            set sum [expr {$oldSum + pow($x,$i)/$denom}]
370    }
371    return [expr {0.5 + $sum * exp(-0.5 * $x*$x - 0.91893853320467274178)}]
372}
373
374# pnorm_quicker --
375#     Calculate the cumulative distribution function (cdf)
376#     for the standard normal distribution - quicker alternative
377#     (less accurate)
378#
379# Arguments:
380#     x               Value for which the cdf should be calculated
381#
382# Result:
383#     Value of the statistic D
384#
385proc ::math::statistics::pnorm_quicker {x} {
386
387    set n [expr {abs($x)}]
388    set n [expr {1.0 + $n*(0.04986735 + $n*(0.02114101 + $n*(0.00327763 \
389            + $n*(0.0000380036 + $n*(0.0000488906 + $n*0.000005383)))))}]
390    set n [expr {1.0/pow($n,16)}]
391    #
392    if {$x >= 0} {
393        return [expr {1 - $n/2.0}]
394    } else {
395        return [expr {$n/2.0}]
396    }
397}
398
399# test-normal --
400#     Test for normality (using method Lilliefors)
401#
402# Arguments:
403#     data            Values that need to be tested
404#     confidence      ...
405#
406# Result:
407#     1 if of the statistic D
408#
409proc ::math::statistics::test-normal {data confidence} {
410    set D [lillieforsFit $data]
411
412    set Dcrit --
413    if { abs($confidence-0.80) < 0.0001 } {
414        set Dcrit 0.741
415    }
416    if { abs($confidence-0.85) < 0.0001 } {
417        set Dcrit 0.775
418    }
419    if { abs($confidence-0.90) < 0.0001 } {
420        set Dcrit 0.819
421    }
422    if { abs($confidence-0.95) < 0.0001 } {
423        set Dcrit 0.895
424    }
425    if { abs($confidence-0.99) < 0.0001 } {
426        set Dcrit 1.035
427    }
428    if { $Dcrit != "--" } {
429        return [expr {$D > $Dcrit ? 1 : 0 }]
430    } else {
431        return -code error "Confidence level must be one of: 0.80, 0.85, 0.90, 0.95 or 0.99"
432    }
433}
434
435# t-test-mean --
436#    Test whether the mean value of a sample is in accordance with the
437#    estimated normal distribution with a certain level of confidence
438#    (Student's t test)
439#
440# Arguments:
441#    data         List of raw data values (small sample)
442#    est_mean     Estimated mean of the distribution
443#    est_stdev    Estimated stdev of the distribution
444#    confidence   Confidence level (0.95 or 0.99 for instance)
445#
446# Result:
447#    1 if the test is positive, 0 otherwise. If there are too few data,
448#    returns an empty string
449#
450proc ::math::statistics::t-test-mean { data est_mean est_stdev confidence } {
451    variable NEGSTDEV
452    variable TOOFEWDATA
453
454    if { $est_stdev <= 0.0 } {
455	return -code error -errorcode ARG -errorinfo $NEGSTDEV $NEGSTDEV
456    }
457
458    set allstats        [BasicStats all $data]
459
460    set conf2           [expr {(1.0+$confidence)/2.0}]
461
462    set sample_mean     [lindex $allstats 0]
463    set sample_number   [lindex $allstats 3]
464
465    if { $sample_number > 1 } {
466	set tzero   [expr {abs($sample_mean-$est_mean)/$est_stdev * \
467		sqrt($sample_number-1)}]
468	set degrees [expr {$sample_number-1}]
469	set prob    [cdf-students-t $degrees $tzero]
470
471	return [expr {$prob<$conf2}]
472
473    } else {
474	return -code error -errorcode DATA -errorinfo $TOOFEWDATA $TOOFEWDATA
475    }
476}
477
478# interval-mean-stdev --
479#    Return the interval containing the mean value and one
480#    containing the standard deviation with a certain
481#    level of confidence (assuming a normal distribution)
482#
483# Arguments:
484#    data         List of raw data values
485#    confidence   Confidence level (0.95 or 0.99 for instance)
486#
487# Result:
488#    List having the following elements: lower and upper bounds of
489#    mean, lower and upper bounds of stdev
490#
491#
492proc ::math::statistics::interval-mean-stdev { data confidence } {
493    variable TOOFEWDATA
494
495    set allstats [BasicStats all $data]
496
497    set conf2    [expr {(1.0+$confidence)/2.0}]
498    set mean     [lindex $allstats 0]
499    set number   [lindex $allstats 3]
500    set stdev    [lindex $allstats 4]
501
502    if { $number > 1 } {
503	set degrees    [expr {$number-1}]
504	set student_t  [expr {sqrt([Inverse-cdf-toms322 1 $degrees $conf2])}]
505	set mean_lower [expr {$mean-$student_t*$stdev/sqrt($number)}]
506	set mean_upper [expr {$mean+$student_t*$stdev/sqrt($number)}]
507	set stdev_lower {}
508	set stdev_upper {}
509	return [list $mean_lower $mean_upper $stdev_lower $stdev_upper]
510    } else {
511	return -code error -errorcode DATA -errorinfo $TOOFEWDATA $TOOFEWDATA
512    }
513}
514
515# quantiles --
516#    Return the quantiles for a given set of data or histogram
517#
518# Arguments:
519#    (two arguments)
520#    data         List of raw data values
521#    confidence   Confidence level (0.95 or 0.99 for instance)
522#    (three arguments)
523#    limits       List of upper limits from histogram
524#    counts       List of counts for for each interval in histogram
525#    confidence   Confidence level (0.95 or 0.99 for instance)
526#
527# Result:
528#    List of quantiles
529#
530proc ::math::statistics::quantiles { arg1 arg2 {arg3 {}} } {
531    variable TOOFEWDATA
532
533    if { [catch {
534	if { $arg3 == {} } {
535	    set result \
536		    [::math::statistics::QuantilesRawData $arg1 $arg2]
537	} else {
538	    set result \
539		    [::math::statistics::QuantilesHistogram $arg1 $arg2 $arg3]
540	}
541    } msg] } {
542	return -code error -errorcode $msg $msg
543    }
544    return $result
545}
546
547# QuantilesRawData --
548#    Return the quantiles based on raw data
549#
550# Arguments:
551#    data         List of raw data values
552#    confidence   Confidence level (0.95 or 0.99 for instance)
553#
554# Result:
555#    List of quantiles
556#
557proc ::math::statistics::QuantilesRawData { data confidence } {
558    variable TOOFEWDATA
559    variable OUTOFRANGE
560
561    if { [llength $confidence] <= 0 } {
562	return -code error -errorcode ARG "$TOOFEWDATA - quantiles"
563    }
564
565    if { [llength $data] <= 0 } {
566	return -code error -errorcode ARG "$TOOFEWDATA - raw data"
567    }
568
569    foreach cond $confidence {
570	if { $cond <= 0.0 || $cond >= 1.0 } {
571	    return -code error -errorcode ARG "$OUTOFRANGE - quantiles"
572	}
573    }
574
575    #
576    # Sort the data first
577    #
578    set sorted_data [lsort -real -increasing $data]
579
580    #
581    # Determine the list element lower or equal to the quantile
582    # and return the corresponding value
583    #
584    set result      {}
585    set number_data [llength $sorted_data]
586    foreach cond $confidence {
587	set elem [expr {round($number_data*$cond)-1}]
588	if { $elem < 0 } {
589	    set elem 0
590	}
591	lappend result [lindex $sorted_data $elem]
592    }
593
594    return $result
595}
596
597# QuantilesHistogram --
598#    Return the quantiles based on histogram information only
599#
600# Arguments:
601#    limits       Upper limits for histogram intervals
602#    counts       Counts for each interval
603#    confidence   Confidence level (0.95 or 0.99 for instance)
604#
605# Result:
606#    List of quantiles
607#
608proc ::math::statistics::QuantilesHistogram { limits counts confidence } {
609    variable TOOFEWDATA
610    variable OUTOFRANGE
611
612    if { [llength $confidence] <= 0 } {
613	return -code error -errorcode ARG "$TOOFEWDATA - quantiles"
614    }
615
616    if { [llength $confidence] <= 0 } {
617	return -code error -errorcode ARG "$TOOFEWDATA - histogram limits"
618    }
619
620    if { [llength $counts] <= [llength $limits] } {
621	return -code error -errorcode ARG "$TOOFEWDATA - histogram counts"
622    }
623
624    foreach cond $confidence {
625	if { $cond <= 0.0 || $cond >= 1.0 } {
626	    return -code error -errorcode ARG "$OUTOFRANGE - quantiles"
627	}
628    }
629
630    #
631    # Accumulate the histogram counts first
632    #
633    set sum 0
634    set accumulated_counts {}
635    foreach count $counts {
636	set sum [expr {$sum+$count}]
637	lappend accumulated_counts $sum
638    }
639    set total_counts $sum
640
641    #
642    # Determine the list element lower or equal to the quantile
643    # and return the corresponding value (use interpolation if
644    # possible)
645    #
646    set result      {}
647    foreach cond $confidence {
648	set found       0
649	set bound       [expr {round($total_counts*$cond)}]
650	set lower_limit {}
651	set lower_count 0
652	foreach acc_count $accumulated_counts limit $limits {
653	    if { $acc_count >= $bound } {
654		set found 1
655		break
656	    }
657	    set lower_limit $limit
658	    set lower_count $acc_count
659	}
660
661	if { $lower_limit == {} || $limit == {} || $found == 0 } {
662	    set quant $limit
663	    if { $limit == {} } {
664		set quant $lower_limit
665	    }
666	} else {
667	    set quant [expr {$limit+($lower_limit-$limit) *
668	    ($acc_count-$bound)/($acc_count-$lower_count)}]
669	}
670	lappend result $quant
671    }
672
673    return $result
674}
675
676# autocorr --
677#    Return the autocorrelation function (assuming equidistance between
678#    samples)
679#
680# Arguments:
681#    data         Raw data for which the autocorrelation must be determined
682#
683# Result:
684#    List of autocorrelation values (about 1/2 the number of raw data)
685#
686proc ::math::statistics::autocorr { data } {
687    variable TOOFEWDATA
688
689    if { [llength $data] <= 1 } {
690	return -code error -errorcode ARG "$TOOFEWDATA"
691    }
692
693    return [crosscorr $data $data]
694}
695
696# crosscorr --
697#    Return the cross-correlation function (assuming equidistance
698#    between samples)
699#
700# Arguments:
701#    data1        First set of raw data
702#    data2        Second set of raw data
703#
704# Result:
705#    List of cross-correlation values (about 1/2 the number of raw data)
706#
707# Note:
708#    The number of data pairs is not kept constant - because tests
709#    showed rather awkward results when it was kept constant.
710#
711proc ::math::statistics::crosscorr { data1 data2 } {
712    variable TOOFEWDATA
713
714    if { [llength $data1] <= 1 || [llength $data2] <= 1 } {
715	return -code error -errorcode ARG "$TOOFEWDATA"
716    }
717
718    #
719    # First determine the number of data pairs
720    #
721    set number1 [llength $data1]
722    set number2 [llength $data2]
723
724    set basic_stat1 [basic-stats $data1]
725    set basic_stat2 [basic-stats $data2]
726    set vmean1      [lindex $basic_stat1 0]
727    set vmean2      [lindex $basic_stat2 0]
728    set vvar1       [lindex $basic_stat1 end]
729    set vvar2       [lindex $basic_stat2 end]
730
731    set number_pairs $number1
732    if { $number1 > $number2 } {
733	set number_pairs $number2
734    }
735    set number_values $number_pairs
736    set number_delays [expr {$number_values/2.0}]
737
738    set scale [expr {sqrt($vvar1*$vvar2)}]
739
740    set result {}
741    for { set delay 0 } { $delay < $number_delays } { incr delay } {
742	set sumcross 0.0
743	set no_cross 0
744	for { set idx 0 } { $idx < $number_values } { incr idx } {
745	    set value1 [lindex $data1 $idx]
746	    set value2 [lindex $data2 [expr {$idx+$delay}]]
747	    if { $value1 != {} && $value2 != {} } {
748		set  sumcross \
749			[expr {$sumcross+($value1-$vmean1)*($value2-$vmean2)}]
750		incr no_cross
751	    }
752	}
753	lappend result [expr {$sumcross/($no_cross*$scale)}]
754
755	incr number_values -1
756    }
757
758    return $result
759}
760
761# mean-histogram-limits
762#    Determine reasonable limits based on mean and standard deviation
763#    for a histogram
764#
765# Arguments:
766#    mean         Mean of the data
767#    stdev        Standard deviation
768#    number       Number of limits to generate (defaults to 8)
769#
770# Result:
771#    List of limits
772#
773proc ::math::statistics::mean-histogram-limits { mean stdev {number 8} } {
774    variable NEGSTDEV
775
776    if { $stdev <= 0.0 } {
777	return -code error -errorcode ARG "$NEGSTDEV"
778    }
779    if { $number < 1 } {
780	return -code error -errorcode ARG "Number of limits must be positive"
781    }
782
783    #
784    # Always: between mean-3.0*stdev and mean+3.0*stdev
785    # number = 2: -0.25, 0.25
786    # number = 3: -0.25, 0, 0.25
787    # number = 4: -1, -0.25, 0.25, 1
788    # number = 5: -1, -0.25, 0, 0.25, 1
789    # number = 6: -2, -1, -0.25, 0.25, 1, 2
790    # number = 7: -2, -1, -0.25, 0, 0.25, 1, 2
791    # number = 8: -3, -2, -1, -0.25, 0.25, 1, 2, 3
792    #
793    switch -- $number {
794	"1" { set limits {0.0} }
795	"2" { set limits {-0.25 0.25} }
796	"3" { set limits {-0.25 0.0 0.25} }
797	"4" { set limits {-1.0 -0.25 0.25 1.0} }
798	"5" { set limits {-1.0 -0.25 0.0 0.25 1.0} }
799	"6" { set limits {-2.0 -1.0 -0.25 0.25 1.0 2.0} }
800	"7" { set limits {-2.0 -1.0 -0.25 0.0 0.25 1.0 2.0} }
801	"8" { set limits {-3.0 -2.0 -1.0 -0.25 0.25 1.0 2.0 3.0} }
802	"9" { set limits {-3.0 -2.0 -1.0 -0.25 0.0 0.25 1.0 2.0 3.0} }
803	default {
804	    set dlim [expr {6.0/double($number-1)}]
805	    for {set i 0} {$i <$number} {incr i} {
806		lappend limits [expr {$dlim*($i-($number-1)/2.0)}]
807	    }
808	}
809    }
810
811    set result {}
812    foreach limit $limits {
813	lappend result [expr {$mean+$limit*$stdev}]
814    }
815
816    return $result
817}
818
819# minmax-histogram-limits
820#    Determine reasonable limits based on minimum and maximum bounds
821#    for a histogram
822#
823# Arguments:
824#    min          Estimated minimum
825#    max          Estimated maximum
826#    number       Number of limits to generate (defaults to 8)
827#
828# Result:
829#    List of limits
830#
831proc ::math::statistics::minmax-histogram-limits { min max {number 8} } {
832    variable NEGSTDEV
833
834    if { $number < 1 } {
835	return -code error -errorcode ARG "Number of limits must be positive"
836    }
837    if { $min >= $max } {
838	return -code error -errorcode ARG "Minimum must be lower than maximum"
839    }
840
841    set result {}
842    set dlim [expr {($max-$min)/double($number-1)}]
843    for {set i 0} {$i <$number} {incr i} {
844	lappend result [expr {$min+$dlim*$i}]
845    }
846
847    return $result
848}
849
850# linear-model
851#    Determine the coefficients for a linear regression between
852#    two series of data (the model: Y = A + B*X)
853#
854# Arguments:
855#    xdata        Series of independent (X) data
856#    ydata        Series of dependent (Y) data
857#    intercept    Whether to use an intercept or not (optional)
858#
859# Result:
860#    List of the following items:
861#    - (Estimate of) Intercept A
862#    - (Estimate of) Slope B
863#    - Standard deviation of Y relative to fit
864#    - Correlation coefficient R2
865#    - Number of degrees of freedom df
866#    - Standard error of the intercept A
867#    - Significance level of A
868#    - Standard error of the slope B
869#    - Significance level of B
870#
871#
872proc ::math::statistics::linear-model { xdata ydata {intercept 1} } {
873   variable TOOFEWDATA
874
875   if { [llength $xdata] < 3 } {
876      return -code error -errorcode ARG "$TOOFEWDATA: not enough independent data"
877   }
878   if { [llength $ydata] < 3 } {
879      return -code error -errorcode ARG "$TOOFEWDATA: not enough dependent data"
880   }
881   if { [llength $xdata] != [llength $ydata] } {
882      return -code error -errorcode ARG "$TOOFEWDATA: number of dependent data differs from number of independent data"
883   }
884
885   set sumx  0.0
886   set sumy  0.0
887   set sumx2 0.0
888   set sumy2 0.0
889   set sumxy 0.0
890   set df    0
891   foreach x $xdata y $ydata {
892      if { $x != "" && $y != "" } {
893         set sumx  [expr {$sumx+$x}]
894         set sumy  [expr {$sumy+$y}]
895         set sumx2 [expr {$sumx2+$x*$x}]
896         set sumy2 [expr {$sumy2+$y*$y}]
897         set sumxy [expr {$sumxy+$x*$y}]
898         incr df
899      }
900   }
901
902   if { $df <= 2 } {
903      return -code error -errorcode ARG "$TOOFEWDATA: too few valid data"
904   }
905   if { $sumx2 == 0.0 } {
906      return -code error -errorcode ARG "$TOOFEWDATA: independent values are all the same"
907   }
908
909   #
910   # Calculate the intermediate quantities
911   #
912   set sx  [expr {$sumx2-$sumx*$sumx/$df}]
913   set sy  [expr {$sumy2-$sumy*$sumy/$df}]
914   set sxy [expr {$sumxy-$sumx*$sumy/$df}]
915
916   #
917   # Calculate the coefficients
918   #
919   if { $intercept } {
920      set B [expr {$sxy/$sx}]
921      set A [expr {($sumy-$B*$sumx)/$df}]
922   } else {
923      set B [expr {$sumxy/$sumx2}]
924      set A 0.0
925   }
926
927   #
928   # Calculate the error estimates
929   #
930   set stdevY 0.0
931   set varY   0.0
932
933   if { $intercept } {
934      set ve [expr {$sy-$B*$sxy}]
935      if { $ve >= 0.0 } {
936         set varY [expr {$ve/($df-2)}]
937      }
938   } else {
939      set ve [expr {$sumy2-$B*$sumxy}]
940      if { $ve >= 0.0 } {
941         set varY [expr {$ve/($df-1)}]
942      }
943   }
944   set seY [expr {sqrt($varY)}]
945
946   if { $intercept } {
947      set R2    [expr {$sxy*$sxy/($sx*$sy)}]
948      set seA   [expr {$seY*sqrt(1.0/$df+$sumx*$sumx/($sx*$df*$df))}]
949      set seB   [expr {sqrt($varY/$sx)}]
950      set tA    {}
951      set tB    {}
952      if { $seA != 0.0 } {
953         set tA    [expr {$A/$seA*sqrt($df-2)}]
954      }
955      if { $seB != 0.0 } {
956         set tB    [expr {$B/$seB*sqrt($df-2)}]
957      }
958   } else {
959      set R2    [expr {$sumxy*$sumxy/($sumx2*$sumy2)}]
960      set seA   {}
961      set tA    {}
962      set tB    {}
963      set seB   [expr {sqrt($varY/$sumx2)}]
964      if { $seB != 0.0 } {
965         set tB    [expr {$B/$seB*sqrt($df-1)}]
966      }
967   }
968
969   #
970   # Return the list of parameters
971   #
972   return [list $A $B $seY $R2 $df $seA $tA $seB $tB]
973}
974
975# linear-residuals
976#    Determine the difference between actual data and predicted from
977#    the linear model
978#
979# Arguments:
980#    xdata        Series of independent (X) data
981#    ydata        Series of dependent (Y) data
982#    intercept    Whether to use an intercept or not (optional)
983#
984# Result:
985#    List of differences
986#
987proc ::math::statistics::linear-residuals { xdata ydata {intercept 1} } {
988   variable TOOFEWDATA
989
990   if { [llength $xdata] < 3 } {
991      return -code error -errorcode ARG "$TOOFEWDATA: no independent data"
992   }
993   if { [llength $ydata] < 3 } {
994      return -code error -errorcode ARG "$TOOFEWDATA: no dependent data"
995   }
996   if { [llength $xdata] != [llength $ydata] } {
997      return -code error -errorcode ARG "$TOOFEWDATA: number of dependent data differs from number of independent data"
998   }
999
1000   foreach {A B} [linear-model $xdata $ydata $intercept] {break}
1001
1002   set result {}
1003   foreach x $xdata y $ydata {
1004      set residue [expr {$y-$A-$B*$x}]
1005      lappend result $residue
1006   }
1007   return $result
1008}
1009
1010# median
1011#    Determine the median from a list of data
1012#
1013# Arguments:
1014#    data         (Unsorted) list of data
1015#
1016# Result:
1017#    Median (either the middle value or the mean of two values in the
1018#    middle)
1019#
1020# Note:
1021#    Adapted from the Wiki page "Stats", code provided by JPS
1022#
1023proc ::math::statistics::median { data } {
1024    set org_data $data
1025    set data     {}
1026    foreach value $org_data {
1027        if { $value != {} } {
1028            lappend data $value
1029        }
1030    }
1031    set len [llength $data]
1032
1033    set data [lsort -real $data]
1034    if { $len % 2 } {
1035        lindex $data [expr {($len-1)/2}]
1036    } else {
1037        expr {([lindex $data [expr {($len / 2) - 1}]] \
1038		+ [lindex $data [expr {$len / 2}]]) / 2.0}
1039    }
1040}
1041
1042# test-2x2 --
1043#     Compute the chi-square statistic for a 2x2 table
1044#
1045# Arguments:
1046#     a           Element upper-left
1047#     b           Element upper-right
1048#     c           Element lower-left
1049#     d           Element lower-right
1050# Return value:
1051#     Chi-square
1052# Note:
1053#     There is only one degree of freedom - this is important
1054#     when comparing the value to the tabulated values
1055#     of chi-square
1056#
1057proc ::math::statistics::test-2x2 { a b c d } {
1058    set ab     [expr {$a+$b}]
1059    set ac     [expr {$a+$c}]
1060    set bd     [expr {$b+$d}]
1061    set cd     [expr {$c+$d}]
1062    set N      [expr {$a+$b+$c+$d}]
1063    set det    [expr {$a*$d-$b*$c}]
1064    set result [expr {double($N*$det*$det)/double($ab*$cd*$ac*$bd)}]
1065}
1066
1067# print-2x2 --
1068#     Print a 2x2 table
1069#
1070# Arguments:
1071#     a           Element upper-left
1072#     b           Element upper-right
1073#     c           Element lower-left
1074#     d           Element lower-right
1075# Return value:
1076#     Printed version with marginals
1077#
1078proc ::math::statistics::print-2x2 { a b c d } {
1079    set ab     [expr {$a+$b}]
1080    set ac     [expr {$a+$c}]
1081    set bd     [expr {$b+$d}]
1082    set cd     [expr {$c+$d}]
1083    set N      [expr {$a+$b+$c+$d}]
1084    set chisq  [test-2x2 $a $b $c $d]
1085
1086    set    line   [string repeat - 10]
1087    set    result [format "%10d%10d | %10d\n" $a $b $ab]
1088    append result [format "%10d%10d | %10d\n" $c $d $cd]
1089    append result [format "%10s%10s + %10s\n" $line $line $line]
1090    append result [format "%10d%10d | %10d\n" $ac $bd $N]
1091    append result "Chisquare = $chisq\n"
1092    append result "Difference is significant?\n"
1093    append result "   at 95%: [expr {$chisq<3.84146? "no":"yes"}]\n"
1094    append result "   at 99%: [expr {$chisq<6.63490? "no":"yes"}]"
1095}
1096
1097# control-xbar --
1098#     Determine the control lines for an x-bar chart
1099#
1100# Arguments:
1101#     data        List of observed values (at least 20*nsamples)
1102#     nsamples    Number of data per subsamples (default: 4)
1103# Return value:
1104#     List of: mean, lower limit, upper limit, number of data per
1105#     subsample. Can be used in the test-xbar procedure
1106#
1107proc ::math::statistics::control-xbar { data {nsamples 4} } {
1108    variable TOOFEWDATA
1109    variable control_factors
1110
1111    #
1112    # Check the number of data
1113    #
1114    if { $nsamples <= 1 } {
1115        return -code error -errorcode DATA -errorinfo $OUTOFRANGE \
1116            "Number of data per subsample must be at least 2"
1117    }
1118    if { [llength $data] < 20*$nsamples } {
1119        return -code error -errorcode DATA -errorinfo $TOOFEWDATA $TOOFEWDATA
1120    }
1121
1122    set nogroups [expr {[llength $data]/$nsamples}]
1123    set mrange   0.0
1124    set xmeans   0.0
1125    for { set i 0 } { $i < $nogroups } { incr i } {
1126        set subsample [lrange $data [expr {$i*$nsamples}] [expr {$i*$nsamples+$nsamples-1}]]
1127
1128        set xmean 0.0
1129        set xmin  [lindex $subsample 0]
1130        set xmax  $xmin
1131        foreach d $subsample {
1132            set xmean [expr {$xmean+$d}]
1133            set xmin  [expr {$xmin<$d? $xmin : $d}]
1134            set xmax  [expr {$xmax>$d? $xmax : $d}]
1135        }
1136        set xmean [expr {$xmean/double($nsamples)}]
1137
1138        set xmeans [expr {$xmeans+$xmean}]
1139        set mrange [expr {$mrange+($xmax-$xmin)}]
1140    }
1141
1142    #
1143    # Determine the control lines
1144    #
1145    set xmeans [expr {$xmeans/double($nogroups)}]
1146    set mrange [expr {$mrange/double($nogroups)}]
1147    set A2     [lindex [lindex $control_factors 1] $nsamples]
1148    if { $A2 == "" } { set A2 [lindex [lindex $control_factors 1] end] }
1149
1150    return [list $xmeans [expr {$xmeans-$A2*$mrange}] \
1151                         [expr {$xmeans+$A2*$mrange}] $nsamples]
1152}
1153
1154# test-xbar --
1155#     Determine if any data points lie outside the x-bar control limits
1156#
1157# Arguments:
1158#     control     List returned by control-xbar with control data
1159#     data        List of observed values
1160# Return value:
1161#     Indices of any subsamples that violate the control limits
1162#
1163proc ::math::statistics::test-xbar { control data } {
1164    foreach {xmean xlower xupper nsamples} $control {break}
1165
1166    if { [llength $data] < 1 } {
1167        return -code error -errorcode DATA -errorinfo $TOOFEWDATA $TOOFEWDATA
1168    }
1169
1170    set nogroups [expr {[llength $data]/$nsamples}]
1171    if { $nogroups <= 0 } {
1172        set nogroup  1
1173        set nsamples [llength $data]
1174    }
1175
1176    set result {}
1177
1178    for { set i 0 } { $i < $nogroups } { incr i } {
1179        set subsample [lrange $data [expr {$i*$nsamples}] [expr {$i*$nsamples+$nsamples-1}]]
1180
1181        set xmean 0.0
1182        foreach d $subsample {
1183            set xmean [expr {$xmean+$d}]
1184        }
1185        set xmean [expr {$xmean/double($nsamples)}]
1186
1187        if { $xmean < $xlower } { lappend result $i }
1188        if { $xmean > $xupper } { lappend result $i }
1189    }
1190
1191    return $result
1192}
1193
1194# control-Rchart --
1195#     Determine the control lines for an R chart
1196#
1197# Arguments:
1198#     data        List of observed values (at least 20*nsamples)
1199#     nsamples    Number of data per subsamples (default: 4)
1200# Return value:
1201#     List of: mean range, lower limit, upper limit, number of data per
1202#     subsample. Can be used in the test-Rchart procedure
1203#
1204proc ::math::statistics::control-Rchart { data {nsamples 4} } {
1205    variable TOOFEWDATA
1206    variable control_factors
1207
1208    #
1209    # Check the number of data
1210    #
1211    if { $nsamples <= 1 } {
1212        return -code error -errorcode DATA -errorinfo $OUTOFRANGE \
1213            "Number of data per subsample must be at least 2"
1214    }
1215    if { [llength $data] < 20*$nsamples } {
1216        return -code error -errorcode DATA -errorinfo $TOOFEWDATA $TOOFEWDATA
1217    }
1218
1219    set nogroups [expr {[llength $data]/$nsamples}]
1220    set mrange   0.0
1221    for { set i 0 } { $i < $nogroups } { incr i } {
1222        set subsample [lrange $data [expr {$i*$nsamples}] [expr {$i*$nsamples+$nsamples-1}]]
1223
1224        set xmin  [lindex $subsample 0]
1225        set xmax  $xmin
1226        foreach d $subsample {
1227            set xmin  [expr {$xmin<$d? $xmin : $d}]
1228            set xmax  [expr {$xmax>$d? $xmax : $d}]
1229        }
1230        set mrange [expr {$mrange+($xmax-$xmin)}]
1231    }
1232
1233    #
1234    # Determine the control lines
1235    #
1236    set mrange [expr {$mrange/double($nogroups)}]
1237    set D3     [lindex [lindex $control_factors 3] $nsamples]
1238    set D4     [lindex [lindex $control_factors 5] $nsamples]
1239    if { $D3 == "" } { set D3 [lindex [lindex $control_factors 3] end] }
1240    if { $D4 == "" } { set D4 [lindex [lindex $control_factors 5] end] }
1241
1242    return [list $mrange [expr {$D3*$mrange}] \
1243                         [expr {$D4*$mrange}] $nsamples]
1244}
1245
1246# test-Rchart --
1247#     Determine if any data points lie outside the R-chart control limits
1248#
1249# Arguments:
1250#     control     List returned by control-xbar with control data
1251#     data        List of observed values
1252# Return value:
1253#     Indices of any subsamples that violate the control limits
1254#
1255proc ::math::statistics::test-Rchart { control data } {
1256    foreach {rmean rlower rupper nsamples} $control {break}
1257
1258    #
1259    # Check the number of data
1260    #
1261    if { [llength $data] < 1 } {
1262        return -code error -errorcode DATA -errorinfo $TOOFEWDATA $TOOFEWDATA
1263    }
1264
1265    set nogroups [expr {[llength $data]/$nsamples}]
1266
1267    set result {}
1268    for { set i 0 } { $i < $nogroups } { incr i } {
1269        set subsample [lrange $data [expr {$i*$nsamples}] [expr {$i*$nsamples+$nsamples-1}]]
1270
1271        set xmin  [lindex $subsample 0]
1272        set xmax  $xmin
1273        foreach d $subsample {
1274            set xmin  [expr {$xmin<$d? $xmin : $d}]
1275            set xmax  [expr {$xmax>$d? $xmax : $d}]
1276        }
1277        set range [expr {$xmax-$xmin}]
1278
1279        if { $range < $rlower } { lappend result $i }
1280        if { $range > $rupper } { lappend result $i }
1281    }
1282
1283    return $result
1284}
1285
1286
1287
1288#
1289# Load the auxiliary scripts
1290#
1291source [file join [file dirname [info script]] pdf_stat.tcl]
1292source [file join [file dirname [info script]] plotstat.tcl]
1293source [file join [file dirname [info script]] liststat.tcl]
1294source [file join [file dirname [info script]] mvlinreg.tcl]
1295
1296#
1297# Define the tables
1298#
1299namespace eval ::math::statistics {
1300    variable student_t_table
1301
1302    #   set student_t_table [::math::interpolation::defineTable student_t
1303    #          {X        80%    90%    95%    98%    99%}
1304    #          {X      0.80   0.90   0.95   0.98   0.99
1305    #           1      3.078  6.314 12.706 31.821 63.657
1306    #           2      1.886  2.920  4.303  6.965  9.925
1307    #           3      1.638  2.353  3.182  4.541  5.841
1308    #           5      1.476  2.015  2.571  3.365  4.032
1309    #          10      1.372  1.812  2.228  2.764  3.169
1310    #          15      1.341  1.753  2.131  2.602  2.947
1311    #          20      1.325  1.725  2.086  2.528  2.845
1312    #          30      1.310  1.697  2.042  2.457  2.750
1313    #          60      1.296  1.671  2.000  2.390  2.660
1314    #         1.0e9    1.282  1.645  1.960  2.326  2.576 }]
1315
1316    # PM
1317    #set chi_squared_table [::math::interpolation::defineTable chi_square
1318    #   ...
1319}
1320
1321#
1322# Simple test code
1323#
1324if { [info exists ::argv0] && ([file tail [info script]] == [file tail $::argv0]) } {
1325
1326    console show
1327    puts [interp aliases]
1328
1329    set values {1 1 1 1 {}}
1330    puts [::math::statistics::basic-stats $values]
1331    set values {1 2 3 4}
1332    puts [::math::statistics::basic-stats $values]
1333    set values {1 -1 1 -2}
1334    puts [::math::statistics::basic-stats $values]
1335    puts [::math::statistics::mean   $values]
1336    puts [::math::statistics::min    $values]
1337    puts [::math::statistics::max    $values]
1338    puts [::math::statistics::number $values]
1339    puts [::math::statistics::stdev  $values]
1340    puts [::math::statistics::var    $values]
1341
1342    set novals 100
1343    #set maxvals 100001
1344    set maxvals 1001
1345    while { $novals < $maxvals } {
1346	set values {}
1347	for { set i 0 } { $i < $novals } { incr i } {
1348	    lappend values [expr {rand()}]
1349	}
1350	puts [::math::statistics::basic-stats $values]
1351	puts [::math::statistics::histogram {0.0 0.2 0.4 0.6 0.8 1.0} $values]
1352	set novals [expr {$novals*10}]
1353    }
1354
1355    puts "Normal distribution:"
1356    puts "X=0:  [::math::statistics::pdf-normal 0.0 1.0 0.0]"
1357    puts "X=1:  [::math::statistics::pdf-normal 0.0 1.0 1.0]"
1358    puts "X=-1: [::math::statistics::pdf-normal 0.0 1.0 -1.0]"
1359
1360    set data1 {0.0 1.0 3.0 4.0 100.0 -23.0}
1361    set data2 {1.0 2.0 4.0 5.0 101.0 -22.0}
1362    set data3 {0.0 2.0 6.0 8.0 200.0 -46.0}
1363    set data4 {2.0 6.0 8.0 200.0 -46.0 1.0}
1364    set data5 {100.0 99.0 90.0 93.0 5.0 123.0}
1365    puts "Correlation data1 and data1: [::math::statistics::corr $data1 $data1]"
1366    puts "Correlation data1 and data2: [::math::statistics::corr $data1 $data2]"
1367    puts "Correlation data1 and data3: [::math::statistics::corr $data1 $data3]"
1368    puts "Correlation data1 and data4: [::math::statistics::corr $data1 $data4]"
1369    puts "Correlation data1 and data5: [::math::statistics::corr $data1 $data5]"
1370
1371    #   set data {1.0 2.0 2.3 4.0 3.4 1.2 0.6 5.6}
1372    #   puts [::math::statistics::basicStats $data]
1373    #   puts [::math::statistics::interval-mean-stdev $data 0.90]
1374    #   puts [::math::statistics::interval-mean-stdev $data 0.95]
1375    #   puts [::math::statistics::interval-mean-stdev $data 0.99]
1376
1377    #   puts "\nTest mean values:"
1378    #   puts [::math::statistics::test-mean $data 2.0 0.1 0.90]
1379    #   puts [::math::statistics::test-mean $data 2.0 0.5 0.90]
1380    #   puts [::math::statistics::test-mean $data 2.0 1.0 0.90]
1381    #   puts [::math::statistics::test-mean $data 2.0 2.0 0.90]
1382
1383    set rc [catch {
1384	set m [::math::statistics::mean {}]
1385    } msg ] ; # {}
1386    puts "Result: $rc $msg"
1387
1388    puts "\nTest quantiles:"
1389    set data      {1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0}
1390    set quantiles {0.11 0.21 0.51 0.91 0.99}
1391    set limits    {2.1 4.1 6.1 8.1}
1392    puts [::math::statistics::quantiles $data $quantiles]
1393
1394    set histogram [::math::statistics::histogram $limits $data]
1395    puts [::math::statistics::quantiles $limits $histogram $quantiles]
1396
1397    puts "\nTest autocorrelation:"
1398    set data      {1.0 -1.0 1.0 -1.0 1.0 -1.0 1.0 -1.0 1.0}
1399    puts [::math::statistics::autocorr $data]
1400    set data      {1.0 -1.1 2.0 -0.6 3.0 -4.0 0.5 0.9 -1.0}
1401    puts [::math::statistics::autocorr $data]
1402
1403    puts "\nTest histogram limits:"
1404    puts [::math::statistics::mean-histogram-limits   1.0 1.0]
1405    puts [::math::statistics::mean-histogram-limits   1.0 1.0 4]
1406    puts [::math::statistics::minmax-histogram-limits 1.0 10.0 10]
1407
1408}
1409
1410#
1411# Test xbar/R-chart procedures
1412#
1413if { 0 } {
1414    set data {}
1415    for { set i 0 } { $i < 500 } { incr i } {
1416        lappend data [expr {rand()}]
1417    }
1418    set limits [::math::statistics::control-xbar $data]
1419    puts $limits
1420
1421    puts "Outliers? [::math::statistics::test-xbar $limits $data]"
1422
1423    set newdata {1.0 1.0 1.0 1.0 0.5 0.5 0.5 0.5 10.0 10.0 10.0 10.0}
1424    puts "Outliers? [::math::statistics::test-xbar $limits $newdata] -- 0 2"
1425
1426    set limits [::math::statistics::control-Rchart $data]
1427    puts $limits
1428
1429    puts "Outliers? [::math::statistics::test-Rchart $limits $data]"
1430
1431    set newdata {0.0 1.0 2.0 1.0 0.4 0.5 0.6 0.5 10.0  0.0 10.0 10.0}
1432    puts "Outliers? [::math::statistics::test-Rchart $limits $newdata] -- 0 2"
1433}
1434
1435