191174Stmm/* 291174Stmm * Copyright (c) 1992, 1993 391174Stmm * The Regents of the University of California. All rights reserved. 491174Stmm * 591174Stmm * This software was developed by the Computer Systems Engineering group 691174Stmm * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and 791174Stmm * contributed to Berkeley. 891174Stmm * 991174Stmm * All advertising materials mentioning features or use of this software 1091174Stmm * must display the following acknowledgement: 1191174Stmm * This product includes software developed by the University of 1291174Stmm * California, Lawrence Berkeley Laboratory. 1391174Stmm * 1491174Stmm * Redistribution and use in source and binary forms, with or without 1591174Stmm * modification, are permitted provided that the following conditions 1691174Stmm * are met: 1791174Stmm * 1. Redistributions of source code must retain the above copyright 1891174Stmm * notice, this list of conditions and the following disclaimer. 1991174Stmm * 2. Redistributions in binary form must reproduce the above copyright 2091174Stmm * notice, this list of conditions and the following disclaimer in the 2191174Stmm * documentation and/or other materials provided with the distribution. 2291174Stmm * 4. Neither the name of the University nor the names of its contributors 2391174Stmm * may be used to endorse or promote products derived from this software 2491174Stmm * without specific prior written permission. 2591174Stmm * 2691174Stmm * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 2791174Stmm * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 2891174Stmm * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 2991174Stmm * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 3091174Stmm * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 3191174Stmm * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 3291174Stmm * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 3391174Stmm * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 3491174Stmm * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 3591174Stmm * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 3691174Stmm * SUCH DAMAGE. 3791174Stmm * 3891174Stmm * @(#)fpu_mul.c 8.1 (Berkeley) 6/11/93 3992986Sobrien * $NetBSD: fpu_mul.c,v 1.2 1994/11/20 20:52:44 deraadt Exp $ 4091174Stmm */ 4191174Stmm 4292986Sobrien#include <sys/cdefs.h> 4392986Sobrien__FBSDID("$FreeBSD$"); 4492986Sobrien 4591174Stmm/* 4691174Stmm * Perform an FPU multiply (return x * y). 4791174Stmm */ 4891174Stmm 4991174Stmm#include <sys/types.h> 5091174Stmm 5191174Stmm#include <machine/frame.h> 5291174Stmm#include <machine/fp.h> 5391174Stmm 5491174Stmm#include "fpu_arith.h" 5591174Stmm#include "fpu_emu.h" 5691174Stmm#include "fpu_extern.h" 5791174Stmm 5891174Stmm/* 5991174Stmm * The multiplication algorithm for normal numbers is as follows: 6091174Stmm * 6191174Stmm * The fraction of the product is built in the usual stepwise fashion. 6291174Stmm * Each step consists of shifting the accumulator right one bit 6391174Stmm * (maintaining any guard bits) and, if the next bit in y is set, 6491174Stmm * adding the multiplicand (x) to the accumulator. Then, in any case, 6591174Stmm * we advance one bit leftward in y. Algorithmically: 6691174Stmm * 6791174Stmm * A = 0; 6891174Stmm * for (bit = 0; bit < FP_NMANT; bit++) { 6991174Stmm * sticky |= A & 1, A >>= 1; 7091174Stmm * if (Y & (1 << bit)) 7191174Stmm * A += X; 7291174Stmm * } 7391174Stmm * 7491174Stmm * (X and Y here represent the mantissas of x and y respectively.) 7591174Stmm * The resultant accumulator (A) is the product's mantissa. It may 7691174Stmm * be as large as 11.11111... in binary and hence may need to be 7791174Stmm * shifted right, but at most one bit. 7891174Stmm * 7991174Stmm * Since we do not have efficient multiword arithmetic, we code the 8091174Stmm * accumulator as four separate words, just like any other mantissa. 8191174Stmm * We use local `register' variables in the hope that this is faster 8291174Stmm * than memory. We keep x->fp_mant in locals for the same reason. 8391174Stmm * 8491174Stmm * In the algorithm above, the bits in y are inspected one at a time. 8591174Stmm * We will pick them up 32 at a time and then deal with those 32, one 8691174Stmm * at a time. Note, however, that we know several things about y: 8791174Stmm * 8891174Stmm * - the guard and round bits at the bottom are sure to be zero; 8991174Stmm * 9091174Stmm * - often many low bits are zero (y is often from a single or double 9191174Stmm * precision source); 9291174Stmm * 9391174Stmm * - bit FP_NMANT-1 is set, and FP_1*2 fits in a word. 9491174Stmm * 9591174Stmm * We can also test for 32-zero-bits swiftly. In this case, the center 9691174Stmm * part of the loop---setting sticky, shifting A, and not adding---will 9791174Stmm * run 32 times without adding X to A. We can do a 32-bit shift faster 9891174Stmm * by simply moving words. Since zeros are common, we optimize this case. 9991174Stmm * Furthermore, since A is initially zero, we can omit the shift as well 10091174Stmm * until we reach a nonzero word. 10191174Stmm */ 10291174Stmmstruct fpn * 10391174Stmm__fpu_mul(fe) 10492889Sobrien struct fpemu *fe; 10591174Stmm{ 10692889Sobrien struct fpn *x = &fe->fe_f1, *y = &fe->fe_f2; 10792889Sobrien u_int a3, a2, a1, a0, x3, x2, x1, x0, bit, m; 10892889Sobrien int sticky; 10991174Stmm FPU_DECL_CARRY 11091174Stmm 11191174Stmm /* 11291174Stmm * Put the `heavier' operand on the right (see fpu_emu.h). 11391174Stmm * Then we will have one of the following cases, taken in the 11491174Stmm * following order: 11591174Stmm * 11691174Stmm * - y = NaN. Implied: if only one is a signalling NaN, y is. 11791174Stmm * The result is y. 11891174Stmm * - y = Inf. Implied: x != NaN (is 0, number, or Inf: the NaN 11991174Stmm * case was taken care of earlier). 12091174Stmm * If x = 0, the result is NaN. Otherwise the result 12191174Stmm * is y, with its sign reversed if x is negative. 12291174Stmm * - x = 0. Implied: y is 0 or number. 12391174Stmm * The result is 0 (with XORed sign as usual). 12491174Stmm * - other. Implied: both x and y are numbers. 12591174Stmm * The result is x * y (XOR sign, multiply bits, add exponents). 12691174Stmm */ 12791174Stmm ORDER(x, y); 128230193Sdas if (ISNAN(y)) 12991174Stmm return (y); 13091174Stmm if (ISINF(y)) { 13191174Stmm if (ISZERO(x)) 13291174Stmm return (__fpu_newnan(fe)); 13391174Stmm y->fp_sign ^= x->fp_sign; 13491174Stmm return (y); 13591174Stmm } 13691174Stmm if (ISZERO(x)) { 13791174Stmm x->fp_sign ^= y->fp_sign; 13891174Stmm return (x); 13991174Stmm } 14091174Stmm 14191174Stmm /* 14291174Stmm * Setup. In the code below, the mask `m' will hold the current 14391174Stmm * mantissa byte from y. The variable `bit' denotes the bit 14491174Stmm * within m. We also define some macros to deal with everything. 14591174Stmm */ 14691174Stmm x3 = x->fp_mant[3]; 14791174Stmm x2 = x->fp_mant[2]; 14891174Stmm x1 = x->fp_mant[1]; 14991174Stmm x0 = x->fp_mant[0]; 15091174Stmm sticky = a3 = a2 = a1 = a0 = 0; 15191174Stmm 15291174Stmm#define ADD /* A += X */ \ 15391174Stmm FPU_ADDS(a3, a3, x3); \ 15491174Stmm FPU_ADDCS(a2, a2, x2); \ 15591174Stmm FPU_ADDCS(a1, a1, x1); \ 15691174Stmm FPU_ADDC(a0, a0, x0) 15791174Stmm 15891174Stmm#define SHR1 /* A >>= 1, with sticky */ \ 15991174Stmm sticky |= a3 & 1, a3 = (a3 >> 1) | (a2 << 31), \ 16091174Stmm a2 = (a2 >> 1) | (a1 << 31), a1 = (a1 >> 1) | (a0 << 31), a0 >>= 1 16191174Stmm 16291174Stmm#define SHR32 /* A >>= 32, with sticky */ \ 16391174Stmm sticky |= a3, a3 = a2, a2 = a1, a1 = a0, a0 = 0 16491174Stmm 16591174Stmm#define STEP /* each 1-bit step of the multiplication */ \ 16691174Stmm SHR1; if (bit & m) { ADD; }; bit <<= 1 16791174Stmm 16891174Stmm /* 16991174Stmm * We are ready to begin. The multiply loop runs once for each 17091174Stmm * of the four 32-bit words. Some words, however, are special. 17191174Stmm * As noted above, the low order bits of Y are often zero. Even 17291174Stmm * if not, the first loop can certainly skip the guard bits. 17391174Stmm * The last word of y has its highest 1-bit in position FP_NMANT-1, 17491174Stmm * so we stop the loop when we move past that bit. 17591174Stmm */ 17691174Stmm if ((m = y->fp_mant[3]) == 0) { 17791174Stmm /* SHR32; */ /* unneeded since A==0 */ 17891174Stmm } else { 17991174Stmm bit = 1 << FP_NG; 18091174Stmm do { 18191174Stmm STEP; 18291174Stmm } while (bit != 0); 18391174Stmm } 18491174Stmm if ((m = y->fp_mant[2]) == 0) { 18591174Stmm SHR32; 18691174Stmm } else { 18791174Stmm bit = 1; 18891174Stmm do { 18991174Stmm STEP; 19091174Stmm } while (bit != 0); 19191174Stmm } 19291174Stmm if ((m = y->fp_mant[1]) == 0) { 19391174Stmm SHR32; 19491174Stmm } else { 19591174Stmm bit = 1; 19691174Stmm do { 19791174Stmm STEP; 19891174Stmm } while (bit != 0); 19991174Stmm } 20091174Stmm m = y->fp_mant[0]; /* definitely != 0 */ 20191174Stmm bit = 1; 20291174Stmm do { 20391174Stmm STEP; 20491174Stmm } while (bit <= m); 20591174Stmm 20691174Stmm /* 20791174Stmm * Done with mantissa calculation. Get exponent and handle 20891174Stmm * 11.111...1 case, then put result in place. We reuse x since 20991174Stmm * it already has the right class (FP_NUM). 21091174Stmm */ 21191174Stmm m = x->fp_exp + y->fp_exp; 21291174Stmm if (a0 >= FP_2) { 21391174Stmm SHR1; 21491174Stmm m++; 21591174Stmm } 21691174Stmm x->fp_sign ^= y->fp_sign; 21791174Stmm x->fp_exp = m; 21891174Stmm x->fp_sticky = sticky; 21991174Stmm x->fp_mant[3] = a3; 22091174Stmm x->fp_mant[2] = a2; 22191174Stmm x->fp_mant[1] = a1; 22291174Stmm x->fp_mant[0] = a0; 22391174Stmm return (x); 22491174Stmm} 225