mpn/sparc64/dive_1.c

/* UltraSPARC 64 mpn_divexact_1 -- mpn by limb exact division.

   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
   FUTURE GNU MP RELEASES.

Copyright 2000, 2001, 2003, 2019 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */
148456Spjd
148456Spjd#include "gmp-impl.h"
148456Spjd#include "longlong.h"
148456Spjd
148456Spjd#include "mpn/sparc64/sparc64.h"
148456Spjd
148456Spjd
/*                 64-bit divisor   32-bit divisor
                    cycles/limb      cycles/limb
                     (approx)         (approx)
   Ultrasparc 2i:      110               70
*/
148456Spjd
148456Spjd
/* There are two key ideas here to reduce mulx's.  Firstly when the divisor
   is 32-bits the high of q*d can be calculated without the two 32x32->64
   cross-products involving the high 32-bits of the divisor, that being zero
   of course.  Secondly umul_ppmm_lowequal and umul_ppmm_half_lowequal save
   one mulx (each) knowing the low of q*d is equal to the input limb l.

   For size==1, a simple udivx is used.  This is faster than calculating an
   inverse.

   For a 32-bit divisor and small sizes, an attempt was made at a simple
   udivx loop (two per 64-bit limb), but it turned out to be slower than
   mul-by-inverse.  At size==2 the inverse is about 260 cycles total
   compared to a udivx at 291.  Perhaps the latter would suit when size==2
   but the high 32-bits of the second limb is zero (saving one udivx), but
   it doesn't seem worth a special case just for that.  */
213070Spjd
148456Spjdvoid
213067Spjdmpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor)
148456Spjd{
161127Spjd  mp_limb_t  inverse, s, s_next, c, l, ls, q;
148456Spjd  unsigned   rshift, lshift;
161220Spjd  mp_limb_t  lshift_mask;
148456Spjd  mp_limb_t  divisor_h;
161220Spjd
148456Spjd  ASSERT (size >= 1);
161220Spjd  ASSERT (divisor != 0);
148456Spjd  ASSERT (MPN_SAME_OR_SEPARATE_P (dst, src, size));
161220Spjd  ASSERT_MPN (src, size);
159307Spjd  ASSERT_LIMB (divisor);
161220Spjd
161127Spjd  s = *src++;                 /* src low limb */
161220Spjd  size--;
161127Spjd  if (size == 0)
148456Spjd    {
161220Spjd      *dst = s / divisor;
148456Spjd      return;
161220Spjd    }
161220Spjd
161220Spjd  if ((divisor & 1) == 0)
213067Spjd    {
213067Spjd      count_trailing_zeros (rshift, divisor);
214118Spjd      divisor >>= rshift;
214118Spjd      lshift = 64 - rshift;
148456Spjd
214118Spjd      lshift_mask = MP_LIMB_T_MAX;
214118Spjd    }
148456Spjd  else
159307Spjd    {
148456Spjd      rshift = 0;
148456Spjd
148456Spjd      /* rshift==0 means no shift, so must mask out other part in this case */
148456Spjd      lshift = 0;
148456Spjd      lshift_mask = 0;
159307Spjd    }
148456Spjd
148456Spjd  binvert_limb (inverse, divisor);
148456Spjd
148456Spjd  c = 0;
148456Spjd  divisor_h = HIGH32 (divisor);
213062Spjd
213067Spjd  if (divisor_h == 0)
213067Spjd    {
148456Spjd      /* 32-bit divisor */
148456Spjd      do
213165Spjd        {
148456Spjd          s_next = *src++;
159307Spjd          ls = (s >> rshift) | ((s_next << lshift) & lshift_mask);
148456Spjd          s = s_next;
148456Spjd
148456Spjd          SUBC_LIMB (c, l, ls, c);
148456Spjd
148456Spjd          q = l * inverse;
148456Spjd          *dst++ = q;
148456Spjd
148456Spjd          umul_ppmm_half_lowequal (l, q, divisor, l);
148456Spjd          c += l;
148456Spjd
148456Spjd          size--;
148456Spjd        }
148456Spjd      while (size != 0);
148456Spjd
148456Spjd      ls = s >> rshift;
148456Spjd      l = ls - c;
148456Spjd      q = l * inverse;
148456Spjd      *dst = q;
148456Spjd    }
148456Spjd  else
148456Spjd    {
148456Spjd      /* 64-bit divisor */
148456Spjd      mp_limb_t  divisor_l = LOW32 (divisor);
148456Spjd      do
148456Spjd        {
148456Spjd          s_next = *src++;
148456Spjd          ls = (s >> rshift) | ((s_next << lshift) & lshift_mask);
148456Spjd          s = s_next;
148456Spjd
148456Spjd          SUBC_LIMB (c, l, ls, c);
148456Spjd
148456Spjd          q = l * inverse;
214118Spjd          *dst++ = q;
148456Spjd
148456Spjd          umul_ppmm_lowequal (l, q, divisor, divisor_h, divisor_l, l);
148456Spjd          c += l;
148456Spjd
213067Spjd          size--;
213067Spjd        }
213067Spjd      while (size != 0);
213067Spjd
213067Spjd      ls = s >> rshift;
213067Spjd      l = ls - c;
213067Spjd      q = l * inverse;
213067Spjd      *dst = q;
213067Spjd    }
213067Spjd}
213067Spjd