1/* mpn_divexact_by3c -- mpn exact division by 3.
2
3Copyright 2000, 2001, 2002, 2003, 2008 Free Software Foundation, Inc.
4
5This file is part of the GNU MP Library.
6
7The GNU MP Library is free software; you can redistribute it and/or modify
8it under the terms of the GNU Lesser General Public License as published by
9the Free Software Foundation; either version 3 of the License, or (at your
10option) any later version.
11
12The GNU MP Library is distributed in the hope that it will be useful, but
13WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15License for more details.
16
17You should have received a copy of the GNU Lesser General Public License
18along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
19
20#include "gmp.h"
21#include "gmp-impl.h"
22
23#if DIVEXACT_BY3_METHOD == 0
24
25mp_limb_t
26mpn_divexact_by3c (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_limb_t c)
27{
28  mp_limb_t r;
29  r = mpn_bdiv_dbm1c (rp, up, un, GMP_NUMB_MASK / 3, GMP_NUMB_MASK / 3 * c);
30
31  /* Possible bdiv_dbm1 return values are C * (GMP_NUMB_MASK / 3), 0 <= C < 3.
32     We want to return C.  We compute the remainder mod 4 and notice that the
33     inverse of (2^(2k)-1)/3 mod 4 is 1.  */
34  return r & 3;
35}
36
37#endif
38
39#if DIVEXACT_BY3_METHOD == 1
40
41/* The algorithm here is basically the same as mpn_divexact_1, as described
42   in the manual.  Namely at each step q = (src[i]-c)*inverse, and new c =
43   borrow(src[i]-c) + high(divisor*q).  But because the divisor is just 3,
44   high(divisor*q) can be determined with two comparisons instead of a
45   multiply.
46
47   The "c += ..."s add the high limb of 3*l to c.  That high limb will be 0,
48   1 or 2.  Doing two separate "+="s seems to give better code on gcc (as of
49   2.95.2 at least).
50
51   It will be noted that the new c is formed by adding three values each 0
52   or 1.  But the total is only 0, 1 or 2.  When the subtraction src[i]-c
53   causes a borrow, that leaves a limb value of either 0xFF...FF or
54   0xFF...FE.  The multiply by MODLIMB_INVERSE_3 gives 0x55...55 or
55   0xAA...AA respectively, and in those cases high(3*q) is only 0 or 1
56   respectively, hence a total of no more than 2.
57
58   Alternatives:
59
60   This implementation has each multiply on the dependent chain, due to
61   "l=s-c".  See below for alternative code which avoids that.  */
62
63mp_limb_t
64mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t c)
65{
66  mp_limb_t  l, q, s;
67  mp_size_t  i;
68
69  ASSERT (un >= 1);
70  ASSERT (c == 0 || c == 1 || c == 2);
71  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un));
72
73  i = 0;
74  do
75    {
76      s = up[i];
77      SUBC_LIMB (c, l, s, c);
78
79      q = (l * MODLIMB_INVERSE_3) & GMP_NUMB_MASK;
80      rp[i] = q;
81
82      c += (q >= GMP_NUMB_CEIL_MAX_DIV3);
83      c += (q >= GMP_NUMB_CEIL_2MAX_DIV3);
84    }
85  while (++i < un);
86
87  ASSERT (c == 0 || c == 1 || c == 2);
88  return c;
89}
90
91
92#endif
93
94#if DIVEXACT_BY3_METHOD == 2
95
96/* The following alternative code re-arranges the quotient calculation from
97   (src[i]-c)*inverse to instead
98
99       q = src[i]*inverse - c*inverse
100
101   thereby allowing src[i]*inverse to be scheduled back as far as desired,
102   making full use of multiplier throughput and leaving just some carry
103   handing on the dependent chain.
104
105   The carry handling consists of determining the c for the next iteration.
106   This is the same as described above, namely look for any borrow from
107   src[i]-c, and at the high of 3*q.
108
109   high(3*q) is done with two comparisons as above (in c2 and c3).  The
110   borrow from src[i]-c is incorporated into those by noting that if there's
111   a carry then then we have src[i]-c == 0xFF..FF or 0xFF..FE, in turn
112   giving q = 0x55..55 or 0xAA..AA.  Adding 1 to either of those q values is
113   enough to make high(3*q) come out 1 bigger, as required.
114
115   l = -c*inverse is calculated at the same time as c, since for most chips
116   it can be more conveniently derived from separate c1/c2/c3 values than
117   from a combined c equal to 0, 1 or 2.
118
119   The net effect is that with good pipelining this loop should be able to
120   run at perhaps 4 cycles/limb, depending on available execute resources
121   etc.
122
123   Usage:
124
125   This code is not used by default, since we really can't rely on the
126   compiler generating a good software pipeline, nor on such an approach
127   even being worthwhile on all CPUs.
128
129   Itanium is one chip where this algorithm helps though, see
130   mpn/ia64/diveby3.asm.  */
131
132mp_limb_t
133mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t cy)
134{
135  mp_limb_t  s, sm, cl, q, qx, c2, c3;
136  mp_size_t  i;
137
138  ASSERT (un >= 1);
139  ASSERT (cy == 0 || cy == 1 || cy == 2);
140  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un));
141
142  cl = cy == 0 ? 0 : cy == 1 ? -MODLIMB_INVERSE_3 : -2*MODLIMB_INVERSE_3;
143
144  for (i = 0; i < un; i++)
145    {
146      s = up[i];
147      sm = (s * MODLIMB_INVERSE_3) & GMP_NUMB_MASK;
148
149      q = (cl + sm) & GMP_NUMB_MASK;
150      rp[i] = q;
151      qx = q + (s < cy);
152
153      c2 = qx >= GMP_NUMB_CEIL_MAX_DIV3;
154      c3 = qx >= GMP_NUMB_CEIL_2MAX_DIV3 ;
155
156      cy = c2 + c3;
157      cl = (-c2 & -MODLIMB_INVERSE_3) + (-c3 & -MODLIMB_INVERSE_3);
158    }
159
160  return cy;
161}
162
163#endif
164