1# mvreglin.tcl -- 2# Addition to the statistics package 3# Copyright 2007 Eric Kemp-Benedict 4# Released under the BSD license under any terms 5# that allow it to be compatible with tcllib 6 7package require math::linearalgebra 1.0 8package require math::statistics 0.4 9 10# ::math::statistics -- 11# This file adds: 12# mvlinreg = Multivariate Linear Regression 13# 14namespace eval ::math::statistics { 15 variable epsilon 1.0e-7 16 17 namespace export tstat mv-wls mv-ols 18 19 namespace import -force \ 20 ::math::linearalgebra::mkMatrix \ 21 ::math::linearalgebra::mkVector \ 22 ::math::linearalgebra::mkIdentity \ 23 ::math::linearalgebra::mkDiagonal \ 24 ::math::linearalgebra::getrow \ 25 ::math::linearalgebra::setrow \ 26 ::math::linearalgebra::getcol \ 27 ::math::linearalgebra::setcol \ 28 ::math::linearalgebra::getelem \ 29 ::math::linearalgebra::setelem \ 30 ::math::linearalgebra::dotproduct \ 31 ::math::linearalgebra::matmul \ 32 ::math::linearalgebra::add \ 33 ::math::linearalgebra::sub \ 34 ::math::linearalgebra::solveGauss \ 35 ::math::linearalgebra::transpose 36} 37 38# tstats -- 39# Returns inverse of the single-tailed t distribution 40# given number of degrees of freedom & confidence 41# 42# Arguments: 43# n Number of degrees of freedom 44# alpha Confidence level (defaults to 0.05) 45# 46# Result: 47# Inverse of the t-distribution 48# 49# Note: 50# Iterates until result is within epsilon of the target. 51# It is possible that the iteration does not converge 52# 53proc ::math::statistics::tstat {n {alpha 0.05}} { 54 variable epsilon 55 variable tvals 56 57 if [info exists tvals($n:$alpha)] { 58 return $tvals($n:$alpha) 59 } 60 61 set deltat [expr {100 * $epsilon}] 62 # For one-tailed distribution, 63 set ptarg [expr {1.000 - $alpha/2.0}] 64 set maxiter 100 65 66 # Initial value for t 67 set t 2.0 68 69 set niter 0 70 while {abs([::math::statistics::cdf-students-t $n $t] - $ptarg) > $epsilon} { 71 set pstar [::math::statistics::cdf-students-t $n $t] 72 set pl [::math::statistics::cdf-students-t $n [expr {$t - $deltat}]] 73 set ph [::math::statistics::cdf-students-t $n [expr {$t + $deltat}]] 74 75 set t [expr {$t + 2.0 * $deltat * ($ptarg - $pstar)/($ph - $pl)}] 76 77 incr niter 78 if {$niter == $maxiter} { 79 return -code error "::math::statistics::tstat: Did not converge after $niter iterations" 80 } 81 } 82 83 # Cache the result to shorten the call in future 84 set tvals($n:$alpha) $t 85 86 return $t 87} 88 89# mv-wls -- 90# Weighted Least Squares 91# 92# Arguments: 93# data Alternating list of weights and observations 94# 95# Result: 96# List containing: 97# * R-squared 98# * Adjusted R-squared 99# * Coefficients of x's in fit 100# * Standard errors of the coefficients 101# * 95% confidence bounds for coefficients 102# 103# Note: 104# The observations are lists starting with the dependent variable y 105# and then the values of the independent variables (x1, x2, ...): 106# 107# mv-wls [list w [list y x's] w [list y x's] ...] 108# 109proc ::math::statistics::mv-wls {data} { 110 111 # Fill the matrices of x & y values, and weights 112 # For n points, k coefficients 113 114 # The number of points is equal to half the arguments (n weights, n points) 115 set n [expr {[llength $data]/2}] 116 117 set firstloop true 118 # Sum up all y values to take an average 119 set ysum 0 120 # Add up the weights 121 set wtsum 0 122 # Count over rows (points) as you go 123 set point 0 124 foreach {wt pt} $data { 125 126 # Check inputs 127 if {[string is double $wt] == 0} { 128 return -code error "::math::statistics::mv-wls: Weight \"$wt\" is not a number" 129 return {} 130 } 131 132 ## -- Check dimensions, initialize 133 if $firstloop { 134 # k = num of vals in pt = 1 + number of x's (because of constant) 135 set k [llength $pt] 136 if {$n <= [expr {$k + 1}]} { 137 return -code error "::math::statistics::mv-wls: Too few degrees of freedom: $k variables but only $n points" 138 return {} 139 } 140 set X [mkMatrix $n $k] 141 set y [mkVector $n] 142 set I_x [mkIdentity $k] 143 set I_y [mkIdentity $n] 144 145 set firstloop false 146 147 } else { 148 # Have to have same number of x's for all points 149 if {$k != [llength $pt]} { 150 return -code error "::math::statistics::mv-wls: Point \"$pt\" has wrong number of values (expected $k)" 151 # Clean up 152 return {} 153 } 154 } 155 156 ## -- Extract values from set of points 157 # Make a list of y values 158 set yval [expr {double([lindex $pt 0])}] 159 setelem y $point $yval 160 set ysum [expr {$ysum + $wt * $yval}] 161 set wtsum [expr {$wtsum + $wt}] 162 # Add x-values to the x-matrix 163 set xrow [lrange $pt 1 end] 164 # Add the constant (value = 1.0) 165 lappend xrow 1.0 166 setrow X $point $xrow 167 # Create list of weights & square root of weights 168 lappend w [expr {double($wt)}] 169 lappend sqrtw [expr {sqrt(double($wt))}] 170 171 incr point 172 173 } 174 175 set ymean [expr {double($ysum)/$wtsum}] 176 set W [mkDiagonal $w] 177 set sqrtW [mkDiagonal $sqrtw] 178 179 # Calculate sum os square differences for x's 180 for {set r 0} {$r < $k} {incr r} { 181 set xsqrsum 0.0 182 set xmeansum 0.0 183 # Calculate sum of squared differences as: sum(x^2) - (sum x)^2/n 184 for {set t 0} {$t < $n} {incr t} { 185 set xval [getelem $X $t $r] 186 set xmeansum [expr {$xmeansum + double($xval)}] 187 set xsqrsum [expr {$xsqrsum + double($xval * $xval)}] 188 } 189 lappend xsqr [expr {$xsqrsum - $xmeansum * $xmeansum/$n}] 190 } 191 192 ## -- Set up the X'WX matrix 193 set XtW [matmul [::math::linearalgebra::transpose $X] $W] 194 set XtWX [matmul $XtW $X] 195 196 # Invert 197 set M [solveGauss $XtWX $I_x] 198 199 set beta [matmul $M [matmul $XtW $y]] 200 201 ### -- Residuals & R-squared 202 # 1) Generate list of diagonals of the hat matrix 203 set H [matmul $X [matmul $M $XtW]] 204 for {set i 0} {$i < $n} {incr i} { 205 lappend h_ii [getelem $H $i $i] 206 } 207 208 set R [matmul $sqrtW [matmul [sub $I_y $H] $y]] 209 set yhat [matmul $H $y] 210 211 # 2) Generate list of residuals, sum of squared residuals, r-squared 212 set sstot 0.0 213 set ssreg 0.0 214 # Note: Relying on representation of Vector as a list for y, yhat 215 foreach yval $y wt $w yhatval $yhat { 216 set sstot [expr {$sstot + $wt * ($yval - $ymean) * ($yval - $ymean)}] 217 set ssreg [expr {$ssreg + $wt * ($yhatval - $ymean) * ($yhatval - $ymean)}] 218 } 219 set r2 [expr {double($ssreg)/$sstot}] 220 set adjr2 [expr {1.0 - (1.0 - $r2) * ($n - 1)/($n - $k)}] 221 set sumsqresid [dotproduct $R $R] 222 set s2 [expr {double($sumsqresid) / double($n - $k)}] 223 224 ### -- Confidence intervals for coefficients 225 set tvalue [tstat [expr {$n - $k}]] 226 for {set i 0} {$i < $k} {incr i} { 227 set stderr [expr {sqrt($s2 * [getelem $M $i $i])}] 228 set mid [lindex $beta $i] 229 lappend stderrs $stderr 230 lappend confinterval [list [expr {$mid - $tvalue * $stderr}] [expr {$mid + $tvalue * $stderr}]] 231 } 232 233 return [list $r2 $adjr2 $beta $stderrs $confinterval] 234} 235 236# mv-ols -- 237# Ordinary Least Squares 238# 239# Arguments: 240# data List of observations, list of lists 241# 242# Result: 243# List containing: 244# * R-squared 245# * Adjusted R-squared 246# * Coefficients of x's in fit 247# * Standard errors of the coefficients 248# * 95% confidence bounds for coefficients 249# 250# Note: 251# The observations are lists starting with the dependent variable y 252# and then the values of the independent variables (x1, x2, ...): 253# 254# mv-ols [list y x's] [list y x's] ... 255# 256proc ::math::statistics::mv-ols {data} { 257 set newdata {} 258 foreach pt $data { 259 lappend newdata 1 $pt 260 } 261 return [mv-wls $newdata] 262} 263