1/* UltraSparc 64 mpn_divrem_1 -- mpn by limb division.
2
3Copyright 1991, 1993, 1994, 1996, 1998, 1999, 2000, 2001, 2003 Free Software
4Foundation, Inc.
5
6This file is part of the GNU MP Library.
7
8The GNU MP Library is free software; you can redistribute it and/or modify
9it under the terms of the GNU Lesser General Public License as published by
10the Free Software Foundation; either version 3 of the License, or (at your
11option) any later version.
12
13The GNU MP Library is distributed in the hope that it will be useful, but
14WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16License for more details.
17
18You should have received a copy of the GNU Lesser General Public License
19along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
20
21#include "gmp.h"
22#include "gmp-impl.h"
23#include "longlong.h"
24
25#include "mpn/sparc64/sparc64.h"
26
27
28/*                   64-bit divisor       32-bit divisor
29                       cycles/limb          cycles/limb
30                        (approx)             (approx)
31                   integer  fraction    integer  fraction
32   Ultrasparc 2i:    160      160          122      96
33*/
34
35
36/* 32-bit divisors are treated in special case code.  This requires 4 mulx
37   per limb instead of 8 in the general case.
38
39   For big endian systems we need HALF_ENDIAN_ADJ included in the src[i]
40   addressing, to get the two halves of each limb read in the correct order.
41   This is kept in an adj variable.  Doing that measures about 4 c/l faster
42   than just writing HALF_ENDIAN_ADJ(i) in the integer loop.  The latter
43   shouldn't be 6 cycles worth of work, but perhaps it doesn't schedule well
44   (on gcc 3.2.1 at least).  The fraction loop doesn't seem affected, but we
45   still use a variable since that ought to work out best.  */
46
/* Divide {ap_limbptr, size_limbs}, extended by xsize_limbs zero limbs of
   "fraction", by d_limb.  The xsize_limbs fraction limbs of the quotient are
   stored at qp_limbptr[0 .. xsize_limbs-1] and the size_limbs integer limbs
   above them; the return value is the remainder (0 <= r < d_limb).

   Divisors that fit in 32 bits take a special path working on 32-bit half
   limbs (4 mulx per limb instead of 8 -- see the file comments above);
   HALF_ENDIAN_ADJ supplies the +-1 index tweak needed on big-endian systems
   to read/write the two halves of each 64-bit limb in the right order.  */
mp_limb_t
mpn_divrem_1 (mp_ptr qp_limbptr, mp_size_t xsize_limbs,
              mp_srcptr ap_limbptr, mp_size_t size_limbs, mp_limb_t d_limb)
{
  mp_size_t  total_size_limbs;
  mp_size_t  i;

  ASSERT (xsize_limbs >= 0);
  ASSERT (size_limbs >= 0);
  ASSERT (d_limb != 0);
  /* FIXME: What's the correct overlap rule when xsize!=0? */
  ASSERT (MPN_SAME_OR_SEPARATE_P (qp_limbptr + xsize_limbs,
                                  ap_limbptr, size_limbs));

  total_size_limbs = size_limbs + xsize_limbs;
  if (UNLIKELY (total_size_limbs == 0))
    return 0;

  /* udivx is good for total_size==1, and no need to bother checking
     limb<divisor, since if that's likely the caller should check */
  if (UNLIKELY (total_size_limbs == 1))
    {
      mp_limb_t  a, q;
      /* size_limbs==0 means the single quotient limb is pure fraction,
         i.e. dividend 0.  */
      a = (LIKELY (size_limbs != 0) ? ap_limbptr[0] : 0);
      q = a / d_limb;
      qp_limbptr[0] = q;
      return a - q*d_limb;
    }

  if (d_limb <= CNST_LIMB(0xFFFFFFFF))
    {
      /* 32-bit divisor: reinterpret the limb arrays as arrays of 32-bit
         halves and run the division on those.  All sizes below are counted
         in halves, hence always even (2x the limb counts).  */
      mp_size_t  size, xsize, total_size, adj;
      unsigned   *qp, n1, n0, q, r, nshift, norm_rmask;
      mp_limb_t  dinv_limb;
      const unsigned *ap;
      int        norm, norm_rshift;

      size = 2 * size_limbs;
      xsize = 2 * xsize_limbs;
      total_size = size + xsize;

      ap = (unsigned *) ap_limbptr;
      qp = (unsigned *) qp_limbptr;

      qp += xsize;
      r = 0;        /* initial remainder */

      if (LIKELY (size != 0))
        {
          /* size is even, so the top half-limb has odd index size-1.  */
          n1 = ap[size-1 + HALF_ENDIAN_ADJ(1)];

          /* If the length of the source is uniformly distributed, then
             there's a 50% chance of the high 32-bits being zero, which we
             can skip.  */
          if (n1 == 0)
            {
              n1 = ap[size-2 + HALF_ENDIAN_ADJ(0)];
              total_size--;
              size--;
              ASSERT (size > 0);  /* because always even */
              /* size is now odd, so index size gets the odd-index adjust.  */
              qp[size + HALF_ENDIAN_ADJ(1)] = 0;
            }

          /* Skip a division if high < divisor (high quotient 0).  Testing
             here before normalizing will still skip as often as
             possible.  */
          if (n1 < d_limb)
            {
              r = n1;
              size--;
              qp[size + HALF_ENDIAN_ADJ(size)] = 0;
              total_size--;
              if (total_size == 0)
                return r;
            }
        }

      /* Normalize the divisor so its high (32nd) bit is set, as required by
         the pre-inverted division.  NOTE(review): count_leading_zeros_32
         appears to return a count relative to a 64-bit word (range 32..63),
         hence the -32 to get a 0..31 shift -- confirm against sparc64.h.  */
      count_leading_zeros_32 (norm, d_limb);
      norm -= 32;
      d_limb <<= norm;
      r <<= norm;

      /* norm_rmask zeroes the ">> norm_rshift" term when norm==0, since
         shifting a 32-bit value by 32 would be undefined.  */
      norm_rshift = 32 - norm;
      norm_rmask = (norm == 0 ? 0 : 0xFFFFFFFF);

      invert_half_limb (dinv_limb, d_limb);

      if (LIKELY (size != 0))
        {
          /* Integer part: one udiv_qrnnd_half_preinv per half-limb, high to
             low, feeding each half shifted up by norm with the top bits of
             the next half below it.  adj alternates sign each step, tracking
             the even/odd index parity (see the file comments above on why a
             variable beats recomputing HALF_ENDIAN_ADJ(i)).  */
          i = size - 1;
          adj = HALF_ENDIAN_ADJ (i);
          n1 = ap[i + adj];
          adj = -adj;
          r |= ((n1 >> norm_rshift) & norm_rmask);
          for ( ; i > 0; i--)
            {
              n0 = ap[i-1 + adj];
              adj = -adj;
              nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask);
              udiv_qrnnd_half_preinv (q, r, r, nshift, d_limb, dinv_limb);
              qp[i + adj] = q;
              n1 = n0;
            }
          /* Lowest half-limb: nothing below it to shift in.  */
          nshift = n1 << norm;
          udiv_qrnnd_half_preinv (q, r, r, nshift, d_limb, dinv_limb);
          qp[0 + HALF_ENDIAN_ADJ(0)] = q;
        }
      /* Fraction part: continue the division with zero half-limbs.  */
      qp -= xsize;
      adj = HALF_ENDIAN_ADJ (0);
      for (i = xsize-1; i >= 0; i--)
        {
          udiv_qrnnd_half_preinv (q, r, r, 0, d_limb, dinv_limb);
          adj = -adj;
          qp[i + adj] = q;
        }

      /* Undo the normalization to get the true remainder.  */
      return r >> norm;
    }
  else
    {
      /* General case: full 64-bit divisor, standard normalize + pre-inverted
         udiv_qrnnd loop over whole limbs.  */
      mp_srcptr  ap;
      mp_ptr     qp;
      mp_size_t  size, xsize, total_size;
      mp_limb_t  d, n1, n0, q, r, dinv, nshift, norm_rmask;
      int        norm, norm_rshift;

      ap = ap_limbptr;
      qp = qp_limbptr;
      size = size_limbs;
      xsize = xsize_limbs;
      total_size = total_size_limbs;
      d = d_limb;

      qp += total_size;   /* above high limb */
      r = 0;              /* initial remainder */

      if (LIKELY (size != 0))
        {
          /* Skip a division if high < divisor (high quotient 0).  Testing
             here before normalizing will still skip as often as
             possible.  */
          n1 = ap[size-1];
          if (n1 < d)
            {
              r = n1;
              *--qp = 0;
              total_size--;
              if (total_size == 0)
                return r;
              size--;
            }
        }

      /* Normalize so the divisor's high bit is set.  */
      count_leading_zeros (norm, d);
      d <<= norm;
      r <<= norm;

      /* norm_rmask avoids an undefined shift by GMP_LIMB_BITS when
         norm==0.  */
      norm_rshift = GMP_LIMB_BITS - norm;
      norm_rmask = (norm == 0 ? 0 : ~CNST_LIMB(0));

      invert_limb (dinv, d);

      if (LIKELY (size != 0))
        {
          /* Integer part, high limb to low.  */
          n1 = ap[size-1];
          r |= ((n1 >> norm_rshift) & norm_rmask);
          for (i = size-2; i >= 0; i--)
            {
              n0 = ap[i];
              nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask);
              udiv_qrnnd_preinv (q, r, r, nshift, d, dinv);
              *--qp = q;
              n1 = n0;
            }
          /* Lowest limb: nothing below it to shift in.  */
          nshift = n1 << norm;
          udiv_qrnnd_preinv (q, r, r, nshift, d, dinv);
          *--qp = q;
        }
      /* Fraction part: continue the division with zero limbs.  */
      for (i = 0; i < xsize; i++)
        {
          udiv_qrnnd_preinv (q, r, r, CNST_LIMB(0), d, dinv);
          *--qp = q;
        }
      /* Undo the normalization to get the true remainder.  */
      return r >> norm;
    }
}
233