191174Stmm/* 291174Stmm * Copyright (c) 1992, 1993 391174Stmm * The Regents of the University of California. All rights reserved. 491174Stmm * 591174Stmm * This software was developed by the Computer Systems Engineering group 691174Stmm * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and 791174Stmm * contributed to Berkeley. 891174Stmm * 991174Stmm * All advertising materials mentioning features or use of this software 1091174Stmm * must display the following acknowledgement: 1191174Stmm * This product includes software developed by the University of 1291174Stmm * California, Lawrence Berkeley Laboratory. 1391174Stmm * 1491174Stmm * Redistribution and use in source and binary forms, with or without 1591174Stmm * modification, are permitted provided that the following conditions 1691174Stmm * are met: 1791174Stmm * 1. Redistributions of source code must retain the above copyright 1891174Stmm * notice, this list of conditions and the following disclaimer. 1991174Stmm * 2. Redistributions in binary form must reproduce the above copyright 2091174Stmm * notice, this list of conditions and the following disclaimer in the 2191174Stmm * documentation and/or other materials provided with the distribution. 2291174Stmm * 4. Neither the name of the University nor the names of its contributors 2391174Stmm * may be used to endorse or promote products derived from this software 2491174Stmm * without specific prior written permission. 2591174Stmm * 2691174Stmm * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 2791174Stmm * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 2891174Stmm * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 2991174Stmm * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 3091174Stmm * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 3191174Stmm * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 3291174Stmm * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 3391174Stmm * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 3491174Stmm * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 3591174Stmm * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 3691174Stmm * SUCH DAMAGE. 3791174Stmm * 3891174Stmm * @(#)fpu_sqrt.c 8.1 (Berkeley) 6/11/93 3992986Sobrien * $NetBSD: fpu_sqrt.c,v 1.2 1994/11/20 20:52:46 deraadt Exp $ 4091174Stmm */ 4191174Stmm 4292986Sobrien#include <sys/cdefs.h> 4392986Sobrien__FBSDID("$FreeBSD$"); 4492986Sobrien 4591174Stmm/* 4691174Stmm * Perform an FPU square root (return sqrt(x)). 4791174Stmm */ 4891174Stmm 4991174Stmm#include <sys/types.h> 5091174Stmm 5191174Stmm#include <machine/frame.h> 5291174Stmm#include <machine/fp.h> 5391174Stmm 5491174Stmm#include "fpu_arith.h" 5591174Stmm#include "fpu_emu.h" 5691174Stmm#include "fpu_extern.h" 5791174Stmm 5891174Stmm/* 5991174Stmm * Our task is to calculate the square root of a floating point number x0. 6091174Stmm * This number x normally has the form: 6191174Stmm * 6291174Stmm * exp 6391174Stmm * x = mant * 2 (where 1 <= mant < 2 and exp is an integer) 6491174Stmm * 6591174Stmm * This can be left as it stands, or the mantissa can be doubled and the 6691174Stmm * exponent decremented: 6791174Stmm * 6891174Stmm * exp-1 6991174Stmm * x = (2 * mant) * 2 (where 2 <= 2 * mant < 4) 7091174Stmm * 7191174Stmm * If the exponent `exp' is even, the square root of the number is best 7291174Stmm * handled using the first form, and is by definition equal to: 7391174Stmm * 7491174Stmm * exp/2 7591174Stmm * sqrt(x) = sqrt(mant) * 2 7691174Stmm * 7791174Stmm * If exp is odd, on the other hand, it is convenient to use the second 7891174Stmm * form, giving: 7991174Stmm * 8091174Stmm * (exp-1)/2 8191174Stmm * sqrt(x) = sqrt(2 * mant) * 2 8291174Stmm * 8391174Stmm * In the first case, we have 8491174Stmm * 8591174Stmm * 1 <= mant < 2 8691174Stmm * 8791174Stmm * and therefore 8891174Stmm * 8991174Stmm * sqrt(1) <= sqrt(mant) < sqrt(2) 9091174Stmm * 9191174Stmm * while in the second case we have 9291174Stmm * 9391174Stmm * 2 <= 2*mant < 4 9491174Stmm * 9591174Stmm * and therefore 9691174Stmm * 9791174Stmm * sqrt(2) <= sqrt(2*mant) < sqrt(4) 9891174Stmm * 9991174Stmm * so that in any case, we are sure that 10091174Stmm * 10191174Stmm * sqrt(1) <= sqrt(n * mant) < sqrt(4), n = 1 or 2 10291174Stmm * 10391174Stmm * or 10491174Stmm * 10591174Stmm * 1 <= sqrt(n * mant) < 2, n = 1 or 2. 10691174Stmm * 10791174Stmm * This root is therefore a properly formed mantissa for a floating 10891174Stmm * point number. The exponent of sqrt(x) is either exp/2 or (exp-1)/2 10991174Stmm * as above. This leaves us with the problem of finding the square root 11091174Stmm * of a fixed-point number in the range [1..4). 11191174Stmm * 11291174Stmm * Though it may not be instantly obvious, the following square root 11391174Stmm * algorithm works for any integer x of an even number of bits, provided 11491174Stmm * that no overflows occur: 11591174Stmm * 11691174Stmm * let q = 0 11791174Stmm * for k = NBITS-1 to 0 step -1 do -- for each digit in the answer... 11891174Stmm * x *= 2 -- multiply by radix, for next digit 11991174Stmm * if x >= 2q + 2^k then -- if adding 2^k does not 12091174Stmm * x -= 2q + 2^k -- exceed the correct root, 12191174Stmm * q += 2^k -- add 2^k and adjust x 12291174Stmm * fi 12391174Stmm * done 12491174Stmm * sqrt = q / 2^(NBITS/2) -- (and any remainder is in x) 12591174Stmm * 12691174Stmm * If NBITS is odd (so that k is initially even), we can just add another 12791174Stmm * zero bit at the top of x. Doing so means that q is not going to acquire 12891174Stmm * a 1 bit in the first trip around the loop (since x0 < 2^NBITS). If the 12991174Stmm * final value in x is not needed, or can be off by a factor of 2, this is 13091174Stmm * equivalant to moving the `x *= 2' step to the bottom of the loop: 13191174Stmm * 13291174Stmm * for k = NBITS-1 to 0 step -1 do if ... fi; x *= 2; done 13391174Stmm * 13491174Stmm * and the result q will then be sqrt(x0) * 2^floor(NBITS / 2). 13591174Stmm * (Since the algorithm is destructive on x, we will call x's initial 13691174Stmm * value, for which q is some power of two times its square root, x0.) 13791174Stmm * 13891174Stmm * If we insert a loop invariant y = 2q, we can then rewrite this using 13991174Stmm * C notation as: 14091174Stmm * 14191174Stmm * q = y = 0; x = x0; 14291174Stmm * for (k = NBITS; --k >= 0;) { 14391174Stmm * #if (NBITS is even) 14491174Stmm * x *= 2; 14591174Stmm * #endif 14691174Stmm * t = y + (1 << k); 14791174Stmm * if (x >= t) { 14891174Stmm * x -= t; 14991174Stmm * q += 1 << k; 15091174Stmm * y += 1 << (k + 1); 15191174Stmm * } 15291174Stmm * #if (NBITS is odd) 15391174Stmm * x *= 2; 15491174Stmm * #endif 15591174Stmm * } 15691174Stmm * 15791174Stmm * If x0 is fixed point, rather than an integer, we can simply alter the 15891174Stmm * scale factor between q and sqrt(x0). As it happens, we can easily arrange 15991174Stmm * for the scale factor to be 2**0 or 1, so that sqrt(x0) == q. 16091174Stmm * 16191174Stmm * In our case, however, x0 (and therefore x, y, q, and t) are multiword 16291174Stmm * integers, which adds some complication. But note that q is built one 16391174Stmm * bit at a time, from the top down, and is not used itself in the loop 16491174Stmm * (we use 2q as held in y instead). This means we can build our answer 16591174Stmm * in an integer, one word at a time, which saves a bit of work. Also, 16691174Stmm * since 1 << k is always a `new' bit in q, 1 << k and 1 << (k+1) are 16791174Stmm * `new' bits in y and we can set them with an `or' operation rather than 16891174Stmm * a full-blown multiword add. 16991174Stmm * 17091174Stmm * We are almost done, except for one snag. We must prove that none of our 17191174Stmm * intermediate calculations can overflow. We know that x0 is in [1..4) 17291174Stmm * and therefore the square root in q will be in [1..2), but what about x, 17391174Stmm * y, and t? 17491174Stmm * 17591174Stmm * We know that y = 2q at the beginning of each loop. (The relation only 17691174Stmm * fails temporarily while y and q are being updated.) Since q < 2, y < 4. 17791174Stmm * The sum in t can, in our case, be as much as y+(1<<1) = y+2 < 6, and. 17891174Stmm * Furthermore, we can prove with a bit of work that x never exceeds y by 17991174Stmm * more than 2, so that even after doubling, 0 <= x < 8. (This is left as 18091174Stmm * an exercise to the reader, mostly because I have become tired of working 18191174Stmm * on this comment.) 18291174Stmm * 18391174Stmm * If our floating point mantissas (which are of the form 1.frac) occupy 18491174Stmm * B+1 bits, our largest intermediary needs at most B+3 bits, or two extra. 18591174Stmm * In fact, we want even one more bit (for a carry, to avoid compares), or 18691174Stmm * three extra. There is a comment in fpu_emu.h reminding maintainers of 18791174Stmm * this, so we have some justification in assuming it. 18891174Stmm */ 18991174Stmmstruct fpn * 19091174Stmm__fpu_sqrt(fe) 19191174Stmm struct fpemu *fe; 19291174Stmm{ 19392889Sobrien struct fpn *x = &fe->fe_f1; 19492889Sobrien u_int bit, q, tt; 19592889Sobrien u_int x0, x1, x2, x3; 19692889Sobrien u_int y0, y1, y2, y3; 19792889Sobrien u_int d0, d1, d2, d3; 19892889Sobrien int e; 19991174Stmm 20091174Stmm /* 20191174Stmm * Take care of special cases first. In order: 20291174Stmm * 20391174Stmm * sqrt(NaN) = NaN 20491174Stmm * sqrt(+0) = +0 20591174Stmm * sqrt(-0) = -0 20691174Stmm * sqrt(x < 0) = NaN (including sqrt(-Inf)) 20791174Stmm * sqrt(+Inf) = +Inf 20891174Stmm * 20991174Stmm * Then all that remains are numbers with mantissas in [1..2). 21091174Stmm */ 21191174Stmm if (ISNAN(x) || ISZERO(x)) 21291174Stmm return (x); 21391174Stmm if (x->fp_sign) 21491174Stmm return (__fpu_newnan(fe)); 21591174Stmm if (ISINF(x)) 21691174Stmm return (x); 21791174Stmm 21891174Stmm /* 21991174Stmm * Calculate result exponent. As noted above, this may involve 22091174Stmm * doubling the mantissa. We will also need to double x each 22191174Stmm * time around the loop, so we define a macro for this here, and 22291174Stmm * we break out the multiword mantissa. 22391174Stmm */ 22491174Stmm#ifdef FPU_SHL1_BY_ADD 22591174Stmm#define DOUBLE_X { \ 22691174Stmm FPU_ADDS(x3, x3, x3); FPU_ADDCS(x2, x2, x2); \ 22791174Stmm FPU_ADDCS(x1, x1, x1); FPU_ADDC(x0, x0, x0); \ 22891174Stmm} 22991174Stmm#else 23091174Stmm#define DOUBLE_X { \ 23191174Stmm x0 = (x0 << 1) | (x1 >> 31); x1 = (x1 << 1) | (x2 >> 31); \ 23291174Stmm x2 = (x2 << 1) | (x3 >> 31); x3 <<= 1; \ 23391174Stmm} 23491174Stmm#endif 23591174Stmm#if (FP_NMANT & 1) != 0 23691174Stmm# define ODD_DOUBLE DOUBLE_X 23791174Stmm# define EVEN_DOUBLE /* nothing */ 23891174Stmm#else 23991174Stmm# define ODD_DOUBLE /* nothing */ 24091174Stmm# define EVEN_DOUBLE DOUBLE_X 24191174Stmm#endif 24291174Stmm x0 = x->fp_mant[0]; 24391174Stmm x1 = x->fp_mant[1]; 24491174Stmm x2 = x->fp_mant[2]; 24591174Stmm x3 = x->fp_mant[3]; 24691174Stmm e = x->fp_exp; 24791174Stmm if (e & 1) /* exponent is odd; use sqrt(2mant) */ 24891174Stmm DOUBLE_X; 24991174Stmm /* THE FOLLOWING ASSUMES THAT RIGHT SHIFT DOES SIGN EXTENSION */ 25091174Stmm x->fp_exp = e >> 1; /* calculates (e&1 ? (e-1)/2 : e/2 */ 25191174Stmm 25291174Stmm /* 25391174Stmm * Now calculate the mantissa root. Since x is now in [1..4), 25491174Stmm * we know that the first trip around the loop will definitely 25591174Stmm * set the top bit in q, so we can do that manually and start 25691174Stmm * the loop at the next bit down instead. We must be sure to 25791174Stmm * double x correctly while doing the `known q=1.0'. 25891174Stmm * 25991174Stmm * We do this one mantissa-word at a time, as noted above, to 260261455Seadler * save work. To avoid `(1U << 31) << 1', we also do the top bit 26191174Stmm * outside of each per-word loop. 26291174Stmm * 26391174Stmm * The calculation `t = y + bit' breaks down into `t0 = y0, ..., 26491174Stmm * t3 = y3, t? |= bit' for the appropriate word. Since the bit 26591174Stmm * is always a `new' one, this means that three of the `t?'s are 26691174Stmm * just the corresponding `y?'; we use `#define's here for this. 26791174Stmm * The variable `tt' holds the actual `t?' variable. 26891174Stmm */ 26991174Stmm 27091174Stmm /* calculate q0 */ 27191174Stmm#define t0 tt 27291174Stmm bit = FP_1; 27391174Stmm EVEN_DOUBLE; 27491174Stmm /* if (x >= (t0 = y0 | bit)) { */ /* always true */ 27591174Stmm q = bit; 27691174Stmm x0 -= bit; 27791174Stmm y0 = bit << 1; 27891174Stmm /* } */ 27991174Stmm ODD_DOUBLE; 28091174Stmm while ((bit >>= 1) != 0) { /* for remaining bits in q0 */ 28191174Stmm EVEN_DOUBLE; 28291174Stmm t0 = y0 | bit; /* t = y + bit */ 28391174Stmm if (x0 >= t0) { /* if x >= t then */ 28491174Stmm x0 -= t0; /* x -= t */ 28591174Stmm q |= bit; /* q += bit */ 28691174Stmm y0 |= bit << 1; /* y += bit << 1 */ 28791174Stmm } 28891174Stmm ODD_DOUBLE; 28991174Stmm } 29091174Stmm x->fp_mant[0] = q; 29191174Stmm#undef t0 29291174Stmm 29391174Stmm /* calculate q1. note (y0&1)==0. */ 29491174Stmm#define t0 y0 29591174Stmm#define t1 tt 29691174Stmm q = 0; 29791174Stmm y1 = 0; 29891174Stmm bit = 1 << 31; 29991174Stmm EVEN_DOUBLE; 30091174Stmm t1 = bit; 30191174Stmm FPU_SUBS(d1, x1, t1); 30291174Stmm FPU_SUBC(d0, x0, t0); /* d = x - t */ 30391174Stmm if ((int)d0 >= 0) { /* if d >= 0 (i.e., x >= t) then */ 30491174Stmm x0 = d0, x1 = d1; /* x -= t */ 30591174Stmm q = bit; /* q += bit */ 30691174Stmm y0 |= 1; /* y += bit << 1 */ 30791174Stmm } 30891174Stmm ODD_DOUBLE; 30991174Stmm while ((bit >>= 1) != 0) { /* for remaining bits in q1 */ 31091174Stmm EVEN_DOUBLE; /* as before */ 31191174Stmm t1 = y1 | bit; 31291174Stmm FPU_SUBS(d1, x1, t1); 31391174Stmm FPU_SUBC(d0, x0, t0); 31491174Stmm if ((int)d0 >= 0) { 31591174Stmm x0 = d0, x1 = d1; 31691174Stmm q |= bit; 31791174Stmm y1 |= bit << 1; 31891174Stmm } 31991174Stmm ODD_DOUBLE; 32091174Stmm } 32191174Stmm x->fp_mant[1] = q; 32291174Stmm#undef t1 32391174Stmm 32491174Stmm /* calculate q2. note (y1&1)==0; y0 (aka t0) is fixed. */ 32591174Stmm#define t1 y1 32691174Stmm#define t2 tt 32791174Stmm q = 0; 32891174Stmm y2 = 0; 32991174Stmm bit = 1 << 31; 33091174Stmm EVEN_DOUBLE; 33191174Stmm t2 = bit; 33291174Stmm FPU_SUBS(d2, x2, t2); 33391174Stmm FPU_SUBCS(d1, x1, t1); 33491174Stmm FPU_SUBC(d0, x0, t0); 33591174Stmm if ((int)d0 >= 0) { 33691174Stmm x0 = d0, x1 = d1, x2 = d2; 337178139Sdas q = bit; 33891174Stmm y1 |= 1; /* now t1, y1 are set in concrete */ 33991174Stmm } 34091174Stmm ODD_DOUBLE; 34191174Stmm while ((bit >>= 1) != 0) { 34291174Stmm EVEN_DOUBLE; 34391174Stmm t2 = y2 | bit; 34491174Stmm FPU_SUBS(d2, x2, t2); 34591174Stmm FPU_SUBCS(d1, x1, t1); 34691174Stmm FPU_SUBC(d0, x0, t0); 34791174Stmm if ((int)d0 >= 0) { 34891174Stmm x0 = d0, x1 = d1, x2 = d2; 34991174Stmm q |= bit; 35091174Stmm y2 |= bit << 1; 35191174Stmm } 35291174Stmm ODD_DOUBLE; 35391174Stmm } 35491174Stmm x->fp_mant[2] = q; 35591174Stmm#undef t2 35691174Stmm 35791174Stmm /* calculate q3. y0, t0, y1, t1 all fixed; y2, t2, almost done. */ 35891174Stmm#define t2 y2 35991174Stmm#define t3 tt 36091174Stmm q = 0; 36191174Stmm y3 = 0; 36291174Stmm bit = 1 << 31; 36391174Stmm EVEN_DOUBLE; 36491174Stmm t3 = bit; 36591174Stmm FPU_SUBS(d3, x3, t3); 36691174Stmm FPU_SUBCS(d2, x2, t2); 36791174Stmm FPU_SUBCS(d1, x1, t1); 36891174Stmm FPU_SUBC(d0, x0, t0); 36991174Stmm if ((int)d0 >= 0) { 370178139Sdas x0 = d0, x1 = d1, x2 = d2; x3 = d3; 371178139Sdas q = bit; 37291174Stmm y2 |= 1; 37391174Stmm } 374178139Sdas ODD_DOUBLE; 37591174Stmm while ((bit >>= 1) != 0) { 37691174Stmm EVEN_DOUBLE; 37791174Stmm t3 = y3 | bit; 37891174Stmm FPU_SUBS(d3, x3, t3); 37991174Stmm FPU_SUBCS(d2, x2, t2); 38091174Stmm FPU_SUBCS(d1, x1, t1); 38191174Stmm FPU_SUBC(d0, x0, t0); 38291174Stmm if ((int)d0 >= 0) { 383178139Sdas x0 = d0, x1 = d1, x2 = d2; x3 = d3; 38491174Stmm q |= bit; 38591174Stmm y3 |= bit << 1; 38691174Stmm } 38791174Stmm ODD_DOUBLE; 38891174Stmm } 38991174Stmm x->fp_mant[3] = q; 39091174Stmm 39191174Stmm /* 39291174Stmm * The result, which includes guard and round bits, is exact iff 39391174Stmm * x is now zero; any nonzero bits in x represent sticky bits. 39491174Stmm */ 39591174Stmm x->fp_sticky = x0 | x1 | x2 | x3; 39691174Stmm return (x); 39791174Stmm} 398