/* mpn_mod_1s_4p (ap, n, b, cps)
   Divide (ap,,n) by b.  Return the single-limb remainder.
   Requires that b < B / 4.

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2008, 2009 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"
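
/* Method: mpn_mod_1s_4p_cps precomputes B^k mod b for k = 1..5, where
   B = 2^GMP_LIMB_BITS, together with a reciprocal of the normalized
   divisor.  mpn_mod_1s_4p then folds four dividend limbs per loop
   iteration into a two-limb accumulator, multiplying each limb by the
   matching B^k mod b, which preserves the value modulo b; one final
   udiv_qrnnd_preinv step extracts the remainder.  */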

void
mpn_mod_1s_4p_cps (mp_limb_t cps[7], mp_limb_t b)
{
  mp_limb_t bi;
  mp_limb_t B1modb, B2modb, B3modb, B4modb, B5modb;
  int cnt;

  ASSERT (b <= (~(mp_limb_t) 0) / 4);

  count_leading_zeros (cnt, b);

  b <<= cnt;
  invert_limb (bi, b);

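  /* With b now normalized and bi its reciprocal (invert_limb gives
     bi = floor ((B^2 - 1) / b) - B), the next line computes B mod b,
     scaled into the shifted domain.  The result may equal b rather
     than 0, hence "not fully reduced" in the assertion below.  */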
  B1modb = -b * ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt));
  ASSERT (B1modb <= b);		/* NB: not fully reduced mod b */
  udiv_rnd_preinv (B2modb, B1modb, b, bi);
  udiv_rnd_preinv (B3modb, B2modb, b, bi);
  udiv_rnd_preinv (B4modb, B3modb, b, bi);
  udiv_rnd_preinv (B5modb, B4modb, b, bi);

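  /* cps[] layout: cps[0] is the reciprocal of the normalized divisor,
     cps[1] the normalization shift count, and cps[2..6] hold
     B^1..B^5 mod b, shifted back down to the unnormalized domain.  */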
  cps[0] = bi;
  cps[1] = cnt;
  cps[2] = B1modb >> cnt;
  cps[3] = B2modb >> cnt;
  cps[4] = B3modb >> cnt;
  cps[5] = B4modb >> cnt;
  cps[6] = B5modb >> cnt;

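  /* Check that B^1 mod b + ... + B^5 mod b fits in one limb; this is
     what keeps the six-term accumulation in mpn_mod_1s_4p's main loop
     below B^2.  Each ASSERT fails exactly when an addition wrapped.  */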
#if WANT_ASSERT
  {
    int i;
    b = cps[2];
    for (i = 3; i <= 6; i++)
      {
	b += cps[i];
	ASSERT (b >= cps[i]);
      }
  }
#endif
}

mp_limb_t
mpn_mod_1s_4p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t cps[7])
{
  mp_limb_t rh, rl, bi, q, ph, pl, ch, cl, r;
  mp_limb_t B1modb, B2modb, B3modb, B4modb, B5modb;
  mp_size_t i;
  int cnt;

  ASSERT (n >= 1);

  B1modb = cps[2];
  B2modb = cps[3];
  B3modb = cps[4];
  B4modb = cps[5];
  B5modb = cps[6];

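  /* Peel off the high n mod 4 limbs (a full four-limb group when
     n % 4 == 0) so the main loop always consumes exactly four limbs
     per iteration; {rh,rl} is seeded with a two-limb value congruent
     to those high limbs mod b.  */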
  switch (n & 3)
    {
    case 0:
      umul_ppmm (ph, pl, ap[n - 3], B1modb);
      add_ssaaaa (ph, pl, ph, pl, 0, ap[n - 4]);
      umul_ppmm (ch, cl, ap[n - 2], B2modb);
      add_ssaaaa (ph, pl, ph, pl, ch, cl);
      umul_ppmm (rh, rl, ap[n - 1], B3modb);
      add_ssaaaa (rh, rl, rh, rl, ph, pl);
      n -= 4;
      break;
    case 1:
      rh = 0;
      rl = ap[n - 1];
      n -= 1;
      break;
    case 2:
      umul_ppmm (ph, pl, ap[n - 1], B1modb);
      add_ssaaaa (rh, rl, ph, pl, 0, ap[n - 2]);
      n -= 2;
      break;
    case 3:
      umul_ppmm (ph, pl, ap[n - 2], B1modb);
      add_ssaaaa (ph, pl, ph, pl, 0, ap[n - 3]);
      umul_ppmm (rh, rl, ap[n - 1], B2modb);
      add_ssaaaa (rh, rl, rh, rl, ph, pl);
      n -= 3;
      break;
    }
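
  /* Each iteration replaces the four limbs ap[i..i+3] plus the old
     accumulator (at weights B^4 and B^5) by a new two-limb value that
     is congruent to the same quantity mod b.  Because the residues
     B^1..B^5 mod b sum to less than B (asserted in the cps function),
     the six-term sum below never reaches B^2, so no carry is lost.  */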

  for (i = n - 4; i >= 0; i -= 4)
    {
      /* rr = ap[i]				< B
	    + ap[i+1] * (B mod b)		<= (B-1)(b-1)
	    + ap[i+2] * (B^2 mod b)		<= (B-1)(b-1)
	    + ap[i+3] * (B^3 mod b)		<= (B-1)(b-1)
	    + LO(rr)  * (B^4 mod b)		<= (B-1)(b-1)
	    + HI(rr)  * (B^5 mod b)		<= (B-1)(b-1)
      */
      umul_ppmm (ph, pl, ap[i + 1], B1modb);
      add_ssaaaa (ph, pl, ph, pl, 0, ap[i + 0]);

      umul_ppmm (ch, cl, ap[i + 2], B2modb);
      add_ssaaaa (ph, pl, ph, pl, ch, cl);

      umul_ppmm (ch, cl, ap[i + 3], B3modb);
      add_ssaaaa (ph, pl, ph, pl, ch, cl);

      umul_ppmm (ch, cl, rl, B4modb);
      add_ssaaaa (ph, pl, ph, pl, ch, cl);

      umul_ppmm (rh, rl, rh, B5modb);
      add_ssaaaa (rh, rl, rh, rl, ph, pl);
    }
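
  /* Reduce the two-limb accumulator to a single limb: fold rh into rl
     via B mod b, shift the result into the normalized domain, and do
     one reciprocal-based division step.  Since b < B/4 implies
     cnt >= 2, the shifts by GMP_LIMB_BITS - cnt are well defined.  */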
  bi = cps[0];
  cnt = cps[1];

#if 1
  umul_ppmm (rh, cl, rh, B1modb);
  add_ssaaaa (rh, rl, rh, rl, 0, cl);
  r = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt));
#else
  udiv_qrnnd_preinv (q, r, rh >> (GMP_LIMB_BITS - cnt),
		     (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt)), b, bi);
  ASSERT (q <= 4);	/* optimize for small quotient? */
#endif

  udiv_qrnnd_preinv (q, r, r, rl << cnt, b, bi);

  return r >> cnt;
}
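
/* Illustrative call sequence (an added sketch, not part of the GMP API;
   these internal functions are normally reached via mpn_mod_1):

     mp_limb_t cps[7], r;
     mp_limb_t b = 12345;          (divisor, must satisfy b < B/4)
     mpn_mod_1s_4p_cps (cps, b);   (one-time precomputation for b)
     r = mpn_mod_1s_4p (ap, n, b << cps[1], cps);

   Note that the divisor argument to mpn_mod_1s_4p is passed pre-shifted
   by the normalization count cps[1]; this matches how the dispatch code
   in mod_1.c invokes this routine.  */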