mpn/sparc64/dive_1.c

/* UltraSPARC 64 mpn_divexact_1 -- mpn by limb exact division.

   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
   FUTURE GNU MP RELEASES.

Copyright 2000, 2001, 2003, 2019 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */
158115Sume
158115Sume#include "gmp-impl.h"
158115Sume#include "longlong.h"
158115Sume
158115Sume#include "mpn/sparc64/sparc64.h"
158115Sume
158115Sume
/*                 64-bit divisor   32-bit divisor
                    cycles/limb      cycles/limb
                     (approx)         (approx)
   Ultrasparc 2i:      110               70
*/
158115Sume
158115Sume
/* There are two key ideas here to reduce mulx's.  Firstly, when the divisor
   is 32 bits the high limb of q*d can be calculated without the two
   32x32->64 cross-products involving the high 32 bits of the divisor, that
   being zero of course.  Secondly, umul_ppmm_lowequal and
   umul_ppmm_half_lowequal save one mulx (each) knowing the low limb of q*d
   is equal to the input limb l.

   For size==1, a simple udivx is used.  This is faster than calculating an
   inverse.

   For a 32-bit divisor and small sizes, an attempt was made at a simple
   udivx loop (two per 64-bit limb), but it turned out to be slower than
   mul-by-inverse.  At size==2 the inverse is about 260 cycles total
   compared to a udivx at 291.  Perhaps the latter would suit when size==2
   but the high 32 bits of the second limb are zero (saving one udivx), but
   it doesn't seem worth a special case just for that.  */
158115Sume
158115Sumevoid
158115Sumempn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor)
158115Sume{
158115Sume  mp_limb_t  inverse, s, s_next, c, l, ls, q;
158115Sume  unsigned   rshift, lshift;
158115Sume  mp_limb_t  lshift_mask;
158115Sume  mp_limb_t  divisor_h;
158115Sume
158115Sume  ASSERT (size >= 1);
158115Sume  ASSERT (divisor != 0);
158115Sume  ASSERT (MPN_SAME_OR_SEPARATE_P (dst, src, size));
164882Sume  ASSERT_MPN (src, size);
158115Sume  ASSERT_LIMB (divisor);
158115Sume
248252Sjilles  s = *src++;                 /* src low limb */
158115Sume  size--;
248252Sjilles  if (size == 0)
248252Sjilles    {
158115Sume      *dst = s / divisor;
158115Sume      return;
158115Sume    }
158115Sume
158115Sume  if ((divisor & 1) == 0)
158115Sume    {
158115Sume      count_trailing_zeros (rshift, divisor);
158115Sume      divisor >>= rshift;
158115Sume      lshift = 64 - rshift;
158115Sume
158115Sume      lshift_mask = MP_LIMB_T_MAX;
158115Sume    }
158115Sume  else
158115Sume    {
158115Sume      rshift = 0;
158115Sume
158115Sume      /* rshift==0 means no shift, so must mask out other part in this case */
158115Sume      lshift = 0;
158115Sume      lshift_mask = 0;
158115Sume    }
158115Sume
158115Sume  binvert_limb (inverse, divisor);
158115Sume
158115Sume  c = 0;
158115Sume  divisor_h = HIGH32 (divisor);
158115Sume
158115Sume  if (divisor_h == 0)
158115Sume    {
158115Sume      /* 32-bit divisor */
158115Sume      do
158115Sume        {
158115Sume          s_next = *src++;
158115Sume          ls = (s >> rshift) | ((s_next << lshift) & lshift_mask);
158115Sume          s = s_next;
158115Sume
158115Sume          SUBC_LIMB (c, l, ls, c);
164882Sume
158115Sume          q = l * inverse;
158115Sume          *dst++ = q;
158115Sume
158115Sume          umul_ppmm_half_lowequal (l, q, divisor, l);
158115Sume          c += l;
158115Sume
158115Sume          size--;
158115Sume        }
158115Sume      while (size != 0);
158115Sume
158115Sume      ls = s >> rshift;
158115Sume      l = ls - c;
158115Sume      q = l * inverse;
158115Sume      *dst = q;
158115Sume    }
158115Sume  else
158115Sume    {
158115Sume      /* 64-bit divisor */
158115Sume      mp_limb_t  divisor_l = LOW32 (divisor);
158115Sume      do
158115Sume        {
158115Sume          s_next = *src++;
158115Sume          ls = (s >> rshift) | ((s_next << lshift) & lshift_mask);
158115Sume          s = s_next;
158115Sume
158115Sume          SUBC_LIMB (c, l, ls, c);
158115Sume
158115Sume          q = l * inverse;
158115Sume          *dst++ = q;
158115Sume
158115Sume          umul_ppmm_lowequal (l, q, divisor, divisor_h, divisor_l, l);
158115Sume          c += l;
158115Sume
158115Sume          size--;
158115Sume        }
158115Sume      while (size != 0);
158257Sume
158115Sume      ls = s >> rshift;
158115Sume      l = ls - c;
158115Sume      q = l * inverse;
158257Sume      *dst = q;
158115Sume    }
158115Sume}
158115Sume