1# mvreglin.tcl --
2#     Addition to the statistics package
3#     Copyright 2007 Eric Kemp-Benedict
4#     Released under the BSD license under any terms
5#     that allow it to be compatible with tcllib
6
7package require math::linearalgebra 1.0
8package require math::statistics 0.4
9
10# ::math::statistics --
11#     This file adds:
12#     mvlinreg = Multivariate Linear Regression
13#
14namespace eval ::math::statistics {
15    variable epsilon 1.0e-7
16
17    namespace export tstat mv-wls mv-ols
18
19    namespace import -force \
20        ::math::linearalgebra::mkMatrix \
21        ::math::linearalgebra::mkVector \
22        ::math::linearalgebra::mkIdentity \
23        ::math::linearalgebra::mkDiagonal \
24        ::math::linearalgebra::getrow \
25        ::math::linearalgebra::setrow \
26        ::math::linearalgebra::getcol \
27        ::math::linearalgebra::setcol \
28        ::math::linearalgebra::getelem \
29        ::math::linearalgebra::setelem \
30        ::math::linearalgebra::dotproduct \
31        ::math::linearalgebra::matmul \
32        ::math::linearalgebra::add \
33        ::math::linearalgebra::sub \
34        ::math::linearalgebra::solveGauss \
35        ::math::linearalgebra::transpose
36}
37
38# tstats --
39#     Returns inverse of the single-tailed t distribution
40#     given number of degrees of freedom & confidence
41#
42# Arguments:
43#     n        Number of degrees of freedom
44#     alpha    Confidence level (defaults to 0.05)
45#
46# Result:
47#     Inverse of the t-distribution
48#
49# Note:
50#     Iterates until result is within epsilon of the target.
51#     It is possible that the iteration does not converge
52#
53proc ::math::statistics::tstat {n {alpha 0.05}} {
54    variable epsilon
55    variable tvals
56
57    if [info exists tvals($n:$alpha)] {
58        return $tvals($n:$alpha)
59    }
60
61    set deltat [expr {100 * $epsilon}]
62    # For one-tailed distribution,
63    set ptarg [expr {1.000 - $alpha/2.0}]
64    set maxiter 100
65
66    # Initial value for t
67    set t 2.0
68
69    set niter 0
70    while {abs([::math::statistics::cdf-students-t $n $t] - $ptarg) > $epsilon} {
71        set pstar [::math::statistics::cdf-students-t $n $t]
72        set pl [::math::statistics::cdf-students-t $n [expr {$t - $deltat}]]
73        set ph [::math::statistics::cdf-students-t $n [expr {$t + $deltat}]]
74
75        set t [expr {$t + 2.0 * $deltat * ($ptarg - $pstar)/($ph - $pl)}]
76
77        incr niter
78        if {$niter == $maxiter} {
79            return -code error "::math::statistics::tstat: Did not converge after $niter iterations"
80        }
81    }
82
83    # Cache the result to shorten the call in future
84    set tvals($n:$alpha) $t
85
86    return $t
87}
88
89# mv-wls --
90#     Weighted Least Squares
91#
92# Arguments:
93#     data       Alternating list of weights and observations
94#
95# Result:
96#     List containing:
97#     * R-squared
98#     * Adjusted R-squared
99#     * Coefficients of x's in fit
100#     * Standard errors of the coefficients
101#     * 95% confidence bounds for coefficients
102#
103# Note:
104#     The observations are lists starting with the dependent variable y
105#     and then the values of the independent variables (x1, x2, ...):
106#
107#     mv-wls [list w [list y x's] w [list y x's] ...]
108#
109proc ::math::statistics::mv-wls {data} {
110
111    # Fill the matrices of x & y values, and weights
112    # For n points, k coefficients
113
114    # The number of points is equal to half the arguments (n weights, n points)
115    set n [expr {[llength $data]/2}]
116
117    set firstloop true
118    # Sum up all y values to take an average
119    set ysum 0
120    # Add up the weights
121    set wtsum 0
122    # Count over rows (points) as you go
123    set point 0
124    foreach {wt pt} $data {
125
126        # Check inputs
127        if {[string is double $wt] == 0} {
128            return -code error "::math::statistics::mv-wls: Weight \"$wt\" is not a number"
129            return {}
130        }
131
132        ## -- Check dimensions, initialize
133        if $firstloop {
134            # k = num of vals in pt = 1 + number of x's (because of constant)
135            set k [llength $pt]
136            if {$n <= [expr {$k + 1}]} {
137                return -code error "::math::statistics::mv-wls: Too few degrees of freedom: $k variables but only $n points"
138                return {}
139            }
140            set X [mkMatrix $n $k]
141            set y [mkVector $n]
142            set I_x [mkIdentity $k]
143            set I_y [mkIdentity $n]
144
145            set firstloop false
146
147        } else {
148            # Have to have same number of x's for all points
149            if {$k != [llength $pt]} {
150                return -code error "::math::statistics::mv-wls: Point \"$pt\" has wrong number of values (expected $k)"
151                # Clean up
152                return {}
153            }
154        }
155
156        ## -- Extract values from set of points
157        # Make a list of y values
158        set yval [expr {double([lindex $pt 0])}]
159        setelem y $point $yval
160        set ysum [expr {$ysum + $wt * $yval}]
161        set wtsum [expr {$wtsum + $wt}]
162        # Add x-values to the x-matrix
163        set xrow [lrange $pt 1 end]
164        # Add the constant (value = 1.0)
165        lappend xrow 1.0
166        setrow X $point $xrow
167        # Create list of weights & square root of weights
168        lappend w [expr {double($wt)}]
169        lappend sqrtw [expr {sqrt(double($wt))}]
170
171        incr point
172
173    }
174
175    set ymean [expr {double($ysum)/$wtsum}]
176    set W [mkDiagonal $w]
177    set sqrtW [mkDiagonal $sqrtw]
178
179    # Calculate sum os square differences for x's
180    for {set r 0} {$r < $k} {incr r} {
181        set xsqrsum 0.0
182        set xmeansum 0.0
183        # Calculate sum of squared differences as: sum(x^2) - (sum x)^2/n
184        for {set t 0} {$t < $n} {incr t} {
185            set xval [getelem $X $t $r]
186            set xmeansum [expr {$xmeansum + double($xval)}]
187            set xsqrsum [expr {$xsqrsum + double($xval * $xval)}]
188        }
189        lappend xsqr [expr {$xsqrsum - $xmeansum * $xmeansum/$n}]
190    }
191
192    ## -- Set up the X'WX matrix
193    set XtW [matmul [::math::linearalgebra::transpose $X] $W]
194    set XtWX [matmul $XtW $X]
195
196    # Invert
197    set M [solveGauss $XtWX $I_x]
198
199    set beta [matmul $M [matmul $XtW $y]]
200
201    ### -- Residuals & R-squared
202    # 1) Generate list of diagonals of the hat matrix
203    set H [matmul $X [matmul $M $XtW]]
204    for {set i 0} {$i < $n} {incr i} {
205        lappend h_ii [getelem $H $i $i]
206    }
207
208    set R [matmul $sqrtW [matmul [sub $I_y $H] $y]]
209    set yhat [matmul $H $y]
210
211    # 2) Generate list of residuals, sum of squared residuals, r-squared
212    set sstot 0.0
213    set ssreg 0.0
214    # Note: Relying on representation of Vector as a list for y, yhat
215    foreach yval $y wt $w yhatval $yhat {
216        set sstot [expr {$sstot + $wt * ($yval - $ymean) * ($yval - $ymean)}]
217        set ssreg [expr {$ssreg + $wt * ($yhatval - $ymean) * ($yhatval - $ymean)}]
218    }
219    set r2 [expr {double($ssreg)/$sstot}]
220    set adjr2 [expr {1.0 - (1.0 - $r2) * ($n - 1)/($n - $k)}]
221    set sumsqresid [dotproduct $R $R]
222    set s2 [expr {double($sumsqresid) / double($n - $k)}]
223
224    ### -- Confidence intervals for coefficients
225    set tvalue [tstat [expr {$n - $k}]]
226    for {set i 0} {$i < $k} {incr i} {
227        set stderr [expr {sqrt($s2 * [getelem $M $i $i])}]
228        set mid [lindex $beta $i]
229        lappend stderrs $stderr
230        lappend confinterval [list [expr {$mid - $tvalue * $stderr}] [expr {$mid + $tvalue * $stderr}]]
231    }
232
233    return [list $r2 $adjr2 $beta $stderrs $confinterval]
234}
235
236# mv-ols --
237#     Ordinary Least Squares
238#
239# Arguments:
240#     data       List of observations, list of lists
241#
242# Result:
243#     List containing:
244#     * R-squared
245#     * Adjusted R-squared
246#     * Coefficients of x's in fit
247#     * Standard errors of the coefficients
248#     * 95% confidence bounds for coefficients
249#
250# Note:
251#     The observations are lists starting with the dependent variable y
252#     and then the values of the independent variables (x1, x2, ...):
253#
254#     mv-ols [list y x's] [list y x's] ...
255#
256proc ::math::statistics::mv-ols {data} {
257    set newdata {}
258    foreach pt $data {
259        lappend newdata 1 $pt
260    }
261    return [mv-wls $newdata]
262}
263