1/* mpn_divexact_by3c -- mpn exact division by 3.
2
3Copyright 2000-2003, 2008 Free Software Foundation, Inc.
4
5This file is part of the GNU MP Library.
6
7The GNU MP Library is free software; you can redistribute it and/or modify
8it under the terms of either:
9
10  * the GNU Lesser General Public License as published by the Free
11    Software Foundation; either version 3 of the License, or (at your
12    option) any later version.
13
14or
15
16  * the GNU General Public License as published by the Free Software
17    Foundation; either version 2 of the License, or (at your option) any
18    later version.
19
20or both in parallel, as here.
21
22The GNU MP Library is distributed in the hope that it will be useful, but
23WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25for more details.
26
27You should have received copies of the GNU General Public License and the
28GNU Lesser General Public License along with the GNU MP Library.  If not,
29see https://www.gnu.org/licenses/.  */
30
31#include "gmp-impl.h"
32
33#if DIVEXACT_BY3_METHOD == 0
34
35mp_limb_t
36mpn_divexact_by3c (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_limb_t c)
37{
38  mp_limb_t r;
39  r = mpn_bdiv_dbm1c (rp, up, un, GMP_NUMB_MASK / 3, GMP_NUMB_MASK / 3 * c);
40
41  /* Possible bdiv_dbm1 return values are C * (GMP_NUMB_MASK / 3), 0 <= C < 3.
42     We want to return C.  We compute the remainder mod 4 and notice that the
43     inverse of (2^(2k)-1)/3 mod 4 is 1.  */
44  return r & 3;
45}
46
47#endif
48
49#if DIVEXACT_BY3_METHOD == 1
50
51/* The algorithm here is basically the same as mpn_divexact_1, as described
52   in the manual.  Namely at each step q = (src[i]-c)*inverse, and new c =
53   borrow(src[i]-c) + high(divisor*q).  But because the divisor is just 3,
54   high(divisor*q) can be determined with two comparisons instead of a
55   multiply.
56
57   The "c += ..."s add the high limb of 3*l to c.  That high limb will be 0,
58   1 or 2.  Doing two separate "+="s seems to give better code on gcc (as of
59   2.95.2 at least).
60
61   It will be noted that the new c is formed by adding three values each 0
62   or 1.  But the total is only 0, 1 or 2.  When the subtraction src[i]-c
63   causes a borrow, that leaves a limb value of either 0xFF...FF or
64   0xFF...FE.  The multiply by MODLIMB_INVERSE_3 gives 0x55...55 or
65   0xAA...AA respectively, and in those cases high(3*q) is only 0 or 1
66   respectively, hence a total of no more than 2.
67
68   Alternatives:
69
70   This implementation has each multiply on the dependent chain, due to
71   "l=s-c".  See below for alternative code which avoids that.  */
72
73mp_limb_t
74mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t c)
75{
76  mp_limb_t  l, q, s;
77  mp_size_t  i;
78
79  ASSERT (un >= 1);
80  ASSERT (c == 0 || c == 1 || c == 2);
81  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un));
82
83  i = 0;
84  do
85    {
86      s = up[i];
87      SUBC_LIMB (c, l, s, c);
88
89      q = (l * MODLIMB_INVERSE_3) & GMP_NUMB_MASK;
90      rp[i] = q;
91
92      c += (q >= GMP_NUMB_CEIL_MAX_DIV3);
93      c += (q >= GMP_NUMB_CEIL_2MAX_DIV3);
94    }
95  while (++i < un);
96
97  ASSERT (c == 0 || c == 1 || c == 2);
98  return c;
99}
100
101
102#endif
103
104#if DIVEXACT_BY3_METHOD == 2
105
106/* The following alternative code re-arranges the quotient calculation from
107   (src[i]-c)*inverse to instead
108
109       q = src[i]*inverse - c*inverse
110
111   thereby allowing src[i]*inverse to be scheduled back as far as desired,
112   making full use of multiplier throughput and leaving just some carry
113   handing on the dependent chain.
114
115   The carry handling consists of determining the c for the next iteration.
116   This is the same as described above, namely look for any borrow from
117   src[i]-c, and at the high of 3*q.
118
119   high(3*q) is done with two comparisons as above (in c2 and c3).  The
120   borrow from src[i]-c is incorporated into those by noting that if there's
121   a carry then then we have src[i]-c == 0xFF..FF or 0xFF..FE, in turn
122   giving q = 0x55..55 or 0xAA..AA.  Adding 1 to either of those q values is
123   enough to make high(3*q) come out 1 bigger, as required.
124
125   l = -c*inverse is calculated at the same time as c, since for most chips
126   it can be more conveniently derived from separate c1/c2/c3 values than
127   from a combined c equal to 0, 1 or 2.
128
129   The net effect is that with good pipelining this loop should be able to
130   run at perhaps 4 cycles/limb, depending on available execute resources
131   etc.
132
133   Usage:
134
135   This code is not used by default, since we really can't rely on the
136   compiler generating a good software pipeline, nor on such an approach
137   even being worthwhile on all CPUs.
138
139   Itanium is one chip where this algorithm helps though, see
140   mpn/ia64/diveby3.asm.  */
141
142mp_limb_t
143mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t cy)
144{
145  mp_limb_t  s, sm, cl, q, qx, c2, c3;
146  mp_size_t  i;
147
148  ASSERT (un >= 1);
149  ASSERT (cy == 0 || cy == 1 || cy == 2);
150  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un));
151
152  cl = cy == 0 ? 0 : cy == 1 ? -MODLIMB_INVERSE_3 : -2*MODLIMB_INVERSE_3;
153
154  for (i = 0; i < un; i++)
155    {
156      s = up[i];
157      sm = (s * MODLIMB_INVERSE_3) & GMP_NUMB_MASK;
158
159      q = (cl + sm) & GMP_NUMB_MASK;
160      rp[i] = q;
161      qx = q + (s < cy);
162
163      c2 = qx >= GMP_NUMB_CEIL_MAX_DIV3;
164      c3 = qx >= GMP_NUMB_CEIL_2MAX_DIV3 ;
165
166      cy = c2 + c3;
167      cl = (-c2 & -MODLIMB_INVERSE_3) + (-c3 & -MODLIMB_INVERSE_3);
168    }
169
170  return cy;
171}
172
173#endif
174