1158115Sume/* UltraSPARC 64 mpn_divexact_1 -- mpn by limb exact division.
2158115Sume
3158115Sume   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
4158115Sume   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
5158115Sume   FUTURE GNU MP RELEASES.
6158115Sume
7158115SumeCopyright 2000, 2001, 2003, 2019 Free Software Foundation, Inc.
8158115Sume
9158115SumeThis file is part of the GNU MP Library.
10158115Sume
11158115SumeThe GNU MP Library is free software; you can redistribute it and/or modify
12158115Sumeit under the terms of either:
13158115Sume
14158115Sume  * the GNU Lesser General Public License as published by the Free
15158115Sume    Software Foundation; either version 3 of the License, or (at your
16158115Sume    option) any later version.
17158115Sume
18158115Sumeor
19158115Sume
20158115Sume  * the GNU General Public License as published by the Free Software
21158115Sume    Foundation; either version 2 of the License, or (at your option) any
22158115Sume    later version.
23158115Sume
24158115Sumeor both in parallel, as here.
25158115Sume
26158115SumeThe GNU MP Library is distributed in the hope that it will be useful, but
27158115SumeWITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
28158115Sumeor FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
29158115Sumefor more details.
30158115Sume
31158115SumeYou should have received copies of the GNU General Public License and the
32158115SumeGNU Lesser General Public License along with the GNU MP Library.  If not,
33158115Sumesee https://www.gnu.org/licenses/.  */
34158115Sume
35158115Sume#include "gmp-impl.h"
36158115Sume#include "longlong.h"
37158115Sume
38158115Sume#include "mpn/sparc64/sparc64.h"
39158115Sume
40158115Sume
41158115Sume/*                 64-bit divisor   32-bit divisor
42158115Sume                    cycles/limb      cycles/limb
43158115Sume                     (approx)         (approx)
44158115Sume   Ultrasparc 2i:      110               70
45158115Sume*/
46158115Sume
47158115Sume
48158115Sume/* There are two key ideas here to reduce mulx's.  Firstly when the divisor
49158115Sume   is 32-bits the high of q*d can be calculated without the two 32x32->64
50158115Sume   cross-products involving the high 32-bits of the divisor, that being zero
51158115Sume   of course.  Secondly umul_ppmm_lowequal and umul_ppmm_half_lowequal save
52158115Sume   one mulx (each) knowing the low of q*d is equal to the input limb l.
53158115Sume
54158115Sume   For size==1, a simple udivx is used.  This is faster than calculating an
55158115Sume   inverse.
56158115Sume
57158115Sume   For a 32-bit divisor and small sizes, an attempt was made at a simple
58158115Sume   udivx loop (two per 64-bit limb), but it turned out to be slower than
59158115Sume   mul-by-inverse.  At size==2 the inverse is about 260 cycles total
60158115Sume   compared to a udivx at 291.  Perhaps the latter would suit when size==2
61158115Sume   but the high 32-bits of the second limb is zero (saving one udivx), but
62158115Sume   it doesn't seem worth a special case just for that.  */
63158115Sume
64158115Sumevoid
65158115Sumempn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor)
66158115Sume{
67158115Sume  mp_limb_t  inverse, s, s_next, c, l, ls, q;
68158115Sume  unsigned   rshift, lshift;
69158115Sume  mp_limb_t  lshift_mask;
70158115Sume  mp_limb_t  divisor_h;
71158115Sume
72158115Sume  ASSERT (size >= 1);
73158115Sume  ASSERT (divisor != 0);
74158115Sume  ASSERT (MPN_SAME_OR_SEPARATE_P (dst, src, size));
75164882Sume  ASSERT_MPN (src, size);
76158115Sume  ASSERT_LIMB (divisor);
77158115Sume
78248252Sjilles  s = *src++;                 /* src low limb */
79158115Sume  size--;
80248252Sjilles  if (size == 0)
81248252Sjilles    {
82158115Sume      *dst = s / divisor;
83158115Sume      return;
84158115Sume    }
85158115Sume
86158115Sume  if ((divisor & 1) == 0)
87158115Sume    {
88158115Sume      count_trailing_zeros (rshift, divisor);
89158115Sume      divisor >>= rshift;
90158115Sume      lshift = 64 - rshift;
91158115Sume
92158115Sume      lshift_mask = MP_LIMB_T_MAX;
93158115Sume    }
94158115Sume  else
95158115Sume    {
96158115Sume      rshift = 0;
97158115Sume
98158115Sume      /* rshift==0 means no shift, so must mask out other part in this case */
99158115Sume      lshift = 0;
100158115Sume      lshift_mask = 0;
101158115Sume    }
102158115Sume
103158115Sume  binvert_limb (inverse, divisor);
104158115Sume
105158115Sume  c = 0;
106158115Sume  divisor_h = HIGH32 (divisor);
107158115Sume
108158115Sume  if (divisor_h == 0)
109158115Sume    {
110158115Sume      /* 32-bit divisor */
111158115Sume      do
112158115Sume        {
113158115Sume          s_next = *src++;
114158115Sume          ls = (s >> rshift) | ((s_next << lshift) & lshift_mask);
115158115Sume          s = s_next;
116158115Sume
117158115Sume          SUBC_LIMB (c, l, ls, c);
118164882Sume
119158115Sume          q = l * inverse;
120158115Sume          *dst++ = q;
121158115Sume
122158115Sume          umul_ppmm_half_lowequal (l, q, divisor, l);
123158115Sume          c += l;
124158115Sume
125158115Sume          size--;
126158115Sume        }
127158115Sume      while (size != 0);
128158115Sume
129158115Sume      ls = s >> rshift;
130158115Sume      l = ls - c;
131158115Sume      q = l * inverse;
132158115Sume      *dst = q;
133158115Sume    }
134158115Sume  else
135158115Sume    {
136158115Sume      /* 64-bit divisor */
137158115Sume      mp_limb_t  divisor_l = LOW32 (divisor);
138158115Sume      do
139158115Sume        {
140158115Sume          s_next = *src++;
141158115Sume          ls = (s >> rshift) | ((s_next << lshift) & lshift_mask);
142158115Sume          s = s_next;
143158115Sume
144158115Sume          SUBC_LIMB (c, l, ls, c);
145158115Sume
146158115Sume          q = l * inverse;
147158115Sume          *dst++ = q;
148158115Sume
149158115Sume          umul_ppmm_lowequal (l, q, divisor, divisor_h, divisor_l, l);
150158115Sume          c += l;
151158115Sume
152158115Sume          size--;
153158115Sume        }
154158115Sume      while (size != 0);
155158257Sume
156158115Sume      ls = s >> rshift;
157158115Sume      l = ls - c;
158158115Sume      q = l * inverse;
159158257Sume      *dst = q;
160158115Sume    }
161158115Sume}
162158115Sume