1148456Spjd/* UltraSPARC 64 mpn_divexact_1 -- mpn by limb exact division.
2213072Spjd
3148456Spjd   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
4148456Spjd   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
5148456Spjd   FUTURE GNU MP RELEASES.
6148456Spjd
7148456SpjdCopyright 2000, 2001, 2003, 2019 Free Software Foundation, Inc.
8148456Spjd
9148456SpjdThis file is part of the GNU MP Library.
10148456Spjd
11148456SpjdThe GNU MP Library is free software; you can redistribute it and/or modify
12148456Spjdit under the terms of either:
13155174Spjd
14148456Spjd  * the GNU Lesser General Public License as published by the Free
15148456Spjd    Software Foundation; either version 3 of the License, or (at your
16148456Spjd    option) any later version.
17148456Spjd
18148456Spjdor
19148456Spjd
20148456Spjd  * the GNU General Public License as published by the Free Software
21148456Spjd    Foundation; either version 2 of the License, or (at your option) any
22148456Spjd    later version.
23148456Spjd
24148456Spjdor both in parallel, as here.
25148456Spjd
26148456SpjdThe GNU MP Library is distributed in the hope that it will be useful, but
27148456SpjdWITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
28148456Spjdor FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
29148456Spjdfor more details.
30148456Spjd
31148456SpjdYou should have received copies of the GNU General Public License and the
32148456SpjdGNU Lesser General Public License along with the GNU MP Library.  If not,
33148456Spjdsee https://www.gnu.org/licenses/.  */
34148456Spjd
35148456Spjd#include "gmp-impl.h"
36148456Spjd#include "longlong.h"
37148456Spjd
38148456Spjd#include "mpn/sparc64/sparc64.h"
39148456Spjd
40148456Spjd
41148456Spjd/*                 64-bit divisor   32-bit divisor
42148456Spjd                    cycles/limb      cycles/limb
43148456Spjd                     (approx)         (approx)
44148867Spjd   Ultrasparc 2i:      110               70
45148456Spjd*/
46148456Spjd
47148456Spjd
48148456Spjd/* There are two key ideas here to reduce mulx's.  Firstly when the divisor
49148456Spjd   is 32-bits the high of q*d can be calculated without the two 32x32->64
50148456Spjd   cross-products involving the high 32-bits of the divisor, that being zero
51148456Spjd   of course.  Secondly umul_ppmm_lowequal and umul_ppmm_half_lowequal save
52148456Spjd   one mulx (each) knowing the low of q*d is equal to the input limb l.
53148456Spjd
54148456Spjd   For size==1, a simple udivx is used.  This is faster than calculating an
55148456Spjd   inverse.
56148456Spjd
57159307Spjd   For a 32-bit divisor and small sizes, an attempt was made at a simple
58159307Spjd   udivx loop (two per 64-bit limb), but it turned out to be slower than
59161217Spjd   mul-by-inverse.  At size==2 the inverse is about 260 cycles total
60211927Spjd   compared to a udivx at 291.  Perhaps the latter would suit when size==2
61211927Spjd   but the high 32-bits of the second limb is zero (saving one udivx), but
62161220Spjd   it doesn't seem worth a special case just for that.  */
63213070Spjd
64148456Spjdvoid
65213067Spjdmpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor)
66148456Spjd{
67161127Spjd  mp_limb_t  inverse, s, s_next, c, l, ls, q;
68148456Spjd  unsigned   rshift, lshift;
69161220Spjd  mp_limb_t  lshift_mask;
70148456Spjd  mp_limb_t  divisor_h;
71161220Spjd
72148456Spjd  ASSERT (size >= 1);
73161220Spjd  ASSERT (divisor != 0);
74148456Spjd  ASSERT (MPN_SAME_OR_SEPARATE_P (dst, src, size));
75161220Spjd  ASSERT_MPN (src, size);
76159307Spjd  ASSERT_LIMB (divisor);
77161220Spjd
78161127Spjd  s = *src++;                 /* src low limb */
79161220Spjd  size--;
80161127Spjd  if (size == 0)
81148456Spjd    {
82161220Spjd      *dst = s / divisor;
83148456Spjd      return;
84161220Spjd    }
85161220Spjd
86161220Spjd  if ((divisor & 1) == 0)
87213067Spjd    {
88213067Spjd      count_trailing_zeros (rshift, divisor);
89214118Spjd      divisor >>= rshift;
90214118Spjd      lshift = 64 - rshift;
91148456Spjd
92214118Spjd      lshift_mask = MP_LIMB_T_MAX;
93214118Spjd    }
94148456Spjd  else
95159307Spjd    {
96148456Spjd      rshift = 0;
97148456Spjd
98148456Spjd      /* rshift==0 means no shift, so must mask out other part in this case */
99148456Spjd      lshift = 0;
100148456Spjd      lshift_mask = 0;
101159307Spjd    }
102148456Spjd
103148456Spjd  binvert_limb (inverse, divisor);
104148456Spjd
105148456Spjd  c = 0;
106148456Spjd  divisor_h = HIGH32 (divisor);
107213062Spjd
108213067Spjd  if (divisor_h == 0)
109213067Spjd    {
110148456Spjd      /* 32-bit divisor */
111148456Spjd      do
112213165Spjd        {
113148456Spjd          s_next = *src++;
114159307Spjd          ls = (s >> rshift) | ((s_next << lshift) & lshift_mask);
115148456Spjd          s = s_next;
116148456Spjd
117148456Spjd          SUBC_LIMB (c, l, ls, c);
118148456Spjd
119148456Spjd          q = l * inverse;
120148456Spjd          *dst++ = q;
121148456Spjd
122148456Spjd          umul_ppmm_half_lowequal (l, q, divisor, l);
123148456Spjd          c += l;
124148456Spjd
125148456Spjd          size--;
126148456Spjd        }
127148456Spjd      while (size != 0);
128148456Spjd
129148456Spjd      ls = s >> rshift;
130148456Spjd      l = ls - c;
131148456Spjd      q = l * inverse;
132148456Spjd      *dst = q;
133148456Spjd    }
134148456Spjd  else
135148456Spjd    {
136148456Spjd      /* 64-bit divisor */
137148456Spjd      mp_limb_t  divisor_l = LOW32 (divisor);
138148456Spjd      do
139148456Spjd        {
140148456Spjd          s_next = *src++;
141148456Spjd          ls = (s >> rshift) | ((s_next << lshift) & lshift_mask);
142148456Spjd          s = s_next;
143148456Spjd
144148456Spjd          SUBC_LIMB (c, l, ls, c);
145148456Spjd
146148456Spjd          q = l * inverse;
147214118Spjd          *dst++ = q;
148148456Spjd
149148456Spjd          umul_ppmm_lowequal (l, q, divisor, divisor_h, divisor_l, l);
150148456Spjd          c += l;
151148456Spjd
152213067Spjd          size--;
153213067Spjd        }
154213067Spjd      while (size != 0);
155213067Spjd
156213067Spjd      ls = s >> rshift;
157213067Spjd      l = ls - c;
158213067Spjd      q = l * inverse;
159213067Spjd      *dst = q;
160213067Spjd    }
161213067Spjd}
162213067Spjd