/* mpn_toom53_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 5/3
   times as large as bn.  Or more accurately, (4/3)bn < an < (5/2)bn.

   Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

   The idea of applying Toom to unbalanced multiplication is due to Marco
   Bodrato and Alberto Zanoni.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006, 2007, 2008 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */


#include "gmp.h"
#include "gmp-impl.h"

/* Evaluate in: 0, +1, -1, +2, -2, 1/2, +inf

  <-s-><--n--><--n--><--n--><--n-->
   ___ ______ ______ ______ ______
  |a4_|___a3_|___a2_|___a1_|___a0_|
               |__b2|___b1_|___b0_|
               <-t--><--n--><--n-->

  v0  =    a0                  *  b0          #    A(0)*B(0)
  v1  = (  a0+ a1+ a2+ a3+  a4)*( b0+ b1+ b2) #    A(1)*B(1)      ah  <= 4   bh <= 2
  vm1 = (  a0- a1+ a2- a3+  a4)*( b0- b1+ b2) #   A(-1)*B(-1)    |ah| <= 2  |bh|<= 1
  v2  = (  a0+2a1+4a2+8a3+16a4)*( b0+2b1+4b2) #    A(2)*B(2)      ah  <= 30  bh <= 6
  vm2 = (  a0-2a1+4a2-8a3+16a4)*( b0-2b1+4b2) #   A(-2)*B(-2)    -9<=ah<=20 -1<=bh<=4
  vh  = (16a0+8a1+4a2+2a3+  a4)*(4b0+2b1+ b2) #  A(1/2)*B(1/2)    ah  <= 30  bh <= 6
  vinf=                     a4 *          b2  #  A(inf)*B(inf)
*/
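/* The product A(x)*B(x) has degree 4 + 2 = 6, i.e. seven coefficients, so the
   seven values above determine it uniquely; mpn_toom_interpolate_7pts
   recovers the coefficients from these values at the end.  */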

void
mpn_toom53_mul (mp_ptr pp,
                mp_srcptr ap, mp_size_t an,
                mp_srcptr bp, mp_size_t bn,
                mp_ptr scratch)
{
  mp_size_t n, s, t;
  mp_limb_t cy;
  mp_ptr gp;
  mp_ptr as1, asm1, as2, asm2, ash;
  mp_ptr bs1, bsm1, bs2, bsm2, bsh;
  enum toom7_flags flags;
  TMP_DECL;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define a3  (ap + 3*n)
#define a4  (ap + 4*n)
#define b0  bp
#define b1  (bp + n)
#define b2  (bp + 2*n)

  n = 1 + (3 * an >= 5 * bn ? (an - 1) / (size_t) 5 : (bn - 1) / (size_t) 3);

  s = an - 4 * n;
  t = bn - 2 * n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);
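  /* For illustration: with an = 10 and bn = 6 we get n = 2, s = an - 4*n = 2
     and t = bn - 2*n = 2, i.e. A is split into five 2-limb pieces and B into
     three 2-limb pieces, the top pieces having s resp. t limbs.  */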

  TMP_MARK;

  as1  = TMP_SALLOC_LIMBS (n + 1);
  asm1 = TMP_SALLOC_LIMBS (n + 1);
  as2  = TMP_SALLOC_LIMBS (n + 1);
  asm2 = TMP_SALLOC_LIMBS (n + 1);
  ash  = TMP_SALLOC_LIMBS (n + 1);

  bs1  = TMP_SALLOC_LIMBS (n + 1);
  bsm1 = TMP_SALLOC_LIMBS (n + 1);
  bs2  = TMP_SALLOC_LIMBS (n + 1);
  bsm2 = TMP_SALLOC_LIMBS (n + 1);
  bsh  = TMP_SALLOC_LIMBS (n + 1);

  gp = pp;

  /* Compute as1 and asm1.  */
  flags = toom7_w3_neg & mpn_toom_eval_pm1 (as1, asm1, 4, ap, n, s, gp);

  /* Compute as2 and asm2.  */
  flags |= toom7_w1_neg & mpn_toom_eval_pm2 (as2, asm2, 4, ap, n, s, gp);

  /* Compute ash = 16 a0 + 8 a1 + 4 a2 + 2 a3 + a4
     = 2*(2*(2*(2*a0 + a1) + a2) + a3) + a4  */
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (ash, a1, a0, n);
  cy = 2*cy + mpn_addlsh1_n (ash, a2, ash, n);
  cy = 2*cy + mpn_addlsh1_n (ash, a3, ash, n);
  if (s < n)
    {
      mp_limb_t cy2;
      cy2 = mpn_addlsh1_n (ash, a4, ash, s);
      ash[n] = 2*cy + mpn_lshift (ash + s, ash + s, n - s, 1);
      MPN_INCR_U (ash + s, n+1-s, cy2);
    }
  else
    ash[n] = 2*cy + mpn_addlsh1_n (ash, a4, ash, n);
#else
  cy = mpn_lshift (ash, a0, n, 1);
  cy += mpn_add_n (ash, ash, a1, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  cy += mpn_add_n (ash, ash, a2, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  cy += mpn_add_n (ash, ash, a3, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  ash[n] = cy + mpn_add (ash, ash, n, a4, s);
#endif

  /* Compute bs1 and bsm1.  */
  bs1[n] = mpn_add (bs1, b0, n, b2, t);         /* b0 + b2 */
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (bs1[n] == 0 && mpn_cmp (bs1, b1, n) < 0)
    {
      bs1[n] = mpn_add_n_sub_n (bs1, bsm1, b1, bs1, n) >> 1;
      bsm1[n] = 0;
      flags ^= toom7_w3_neg;
    }
  else
    {
      cy = mpn_add_n_sub_n (bs1, bsm1, bs1, b1, n);
      bsm1[n] = bs1[n] - (cy & 1);
      bs1[n] += (cy >> 1);
    }
#else
  if (bs1[n] == 0 && mpn_cmp (bs1, b1, n) < 0)
    {
      mpn_sub_n (bsm1, b1, bs1, n);
      bsm1[n] = 0;
      flags ^= toom7_w3_neg;
    }
  else
    {
      bsm1[n] = bs1[n] - mpn_sub_n (bsm1, bs1, b1, n);
    }
  bs1[n] += mpn_add_n (bs1, bs1, b1, n);  /* b0+b1+b2 */
#endif
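  /* bs1 is now B(1) = b0+b1+b2 and bsm1 is |B(-1)| = |b0-b1+b2|; the
     toom7_w3_neg bit was toggled above when B(-1) < 0, so combined with the
     toggle from the A side it records the sign of A(-1)*B(-1).  */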

  /* Compute bs2 and bsm2.  */
#if HAVE_NATIVE_mpn_addlsh_n || HAVE_NATIVE_mpn_addlsh2_n
#if HAVE_NATIVE_mpn_addlsh2_n
  cy = mpn_addlsh2_n (bs2, b0, b2, t);
#else /* HAVE_NATIVE_mpn_addlsh_n */
  cy = mpn_addlsh_n (bs2, b0, b2, t, 2);
#endif
  if (t < n)
    cy = mpn_add_1 (bs2 + t, b0 + t, n - t, cy);
  bs2[n] = cy;
#else
  cy = mpn_lshift (gp, b2, t, 2);
  bs2[n] = mpn_add (bs2, b0, n, gp, t);
  MPN_INCR_U (bs2 + t, n+1-t, cy);
#endif

  gp[n] = mpn_lshift (gp, b1, n, 1);

#if HAVE_NATIVE_mpn_add_n_sub_n
  if (mpn_cmp (bs2, gp, n+1) < 0)
    {
      ASSERT_NOCARRY (mpn_add_n_sub_n (bs2, bsm2, gp, bs2, n+1));
      flags ^= toom7_w1_neg;
    }
  else
    {
      ASSERT_NOCARRY (mpn_add_n_sub_n (bs2, bsm2, bs2, gp, n+1));
    }
#else
  if (mpn_cmp (bs2, gp, n+1) < 0)
    {
      ASSERT_NOCARRY (mpn_sub_n (bsm2, gp, bs2, n+1));
      flags ^= toom7_w1_neg;
    }
  else
    {
      ASSERT_NOCARRY (mpn_sub_n (bsm2, bs2, gp, n+1));
    }
  mpn_add_n (bs2, bs2, gp, n+1);
#endif
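  /* Likewise bs2 = B(2) = b0+2b1+4b2 and bsm2 = |B(-2)| = |b0-2b1+4b2|, with
     the toom7_w1_neg bit recording the sign of A(-2)*B(-2).  */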

  /* Compute bsh = 4 b0 + 2 b1 + b2 = 2*(2*b0 + b1) + b2.  */
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (bsh, b1, b0, n);
  if (t < n)
    {
      mp_limb_t cy2;
      cy2 = mpn_addlsh1_n (bsh, b2, bsh, t);
      bsh[n] = 2*cy + mpn_lshift (bsh + t, bsh + t, n - t, 1);
      MPN_INCR_U (bsh + t, n+1-t, cy2);
    }
  else
    bsh[n] = 2*cy + mpn_addlsh1_n (bsh, b2, bsh, n);
#else
  cy = mpn_lshift (bsh, b0, n, 1);
  cy += mpn_add_n (bsh, bsh, b1, n);
  cy = 2*cy + mpn_lshift (bsh, bsh, n, 1);
  bsh[n] = cy + mpn_add (bsh, bsh, n, b2, t);
#endif
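  /* ash = 16*A(1/2) and bsh = 4*B(1/2), so vh below is 64*A(1/2)*B(1/2);
     working with these scaled values keeps the 1/2 evaluation in integers.  */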

  ASSERT (as1[n] <= 4);
  ASSERT (bs1[n] <= 2);
  ASSERT (asm1[n] <= 2);
  ASSERT (bsm1[n] <= 1);
  ASSERT (as2[n] <= 30);
  ASSERT (bs2[n] <= 6);
  ASSERT (asm2[n] <= 20);
  ASSERT (bsm2[n] <= 4);
  ASSERT (ash[n] <= 30);
  ASSERT (bsh[n] <= 6);

#define v0    pp                          /* 2n */
#define v1    (pp + 2 * n)                /* 2n+1 */
#define vinf  (pp + 6 * n)                /* s+t */
#define v2    scratch                     /* 2n+1 */
#define vm2   (scratch + 2 * n + 1)       /* 2n+1 */
#define vh    (scratch + 4 * n + 2)       /* 2n+1 */
#define vm1   (scratch + 6 * n + 3)       /* 2n+1 */
#define scratch_out (scratch + 8 * n + 4) /* 2n+1 */
  /* Total scratch need: 10*n+5 */

  /* Must be in allocation order, as they overwrite one limb beyond
   * 2n+1. */
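  /* (mpn_mul_n on n+1 limbs writes 2*n+2 limbs, one past the 2n+1-limb slot
     reserved for each product, so computing the products in increasing
     address order lets each spilled limb be overwritten harmlessly by the
     next product.)  */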
  mpn_mul_n (v2, as2, bs2, n + 1);              /* v2, 2n+1 limbs */
  mpn_mul_n (vm2, asm2, bsm2, n + 1);           /* vm2, 2n+1 limbs */
  mpn_mul_n (vh, ash, bsh, n + 1);              /* vh, 2n+1 limbs */

  /* vm1, 2n+1 limbs */
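  /* With SMALLER_RECURSION defined, the recursive product uses only the low
     n limbs of the operands, and the contributions of the (small) top limbs
     are added in by hand; otherwise a single (n+1)-limb product is used
     whenever either top limb is nonzero.  */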
#ifdef SMALLER_RECURSION
  mpn_mul_n (vm1, asm1, bsm1, n);
  if (asm1[n] == 1)
    {
      cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
    }
  else if (asm1[n] == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * bsm1[n] + mpn_addlsh1_n (vm1 + n, vm1 + n, bsm1, n);
#else
      cy = 2 * bsm1[n] + mpn_addmul_1 (vm1 + n, bsm1, n, CNST_LIMB(2));
#endif
    }
  else
    cy = 0;
  if (bsm1[n] != 0)
    cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
  vm1[2 * n] = cy;
#else /* SMALLER_RECURSION */
  vm1[2 * n] = 0;
  mpn_mul_n (vm1, asm1, bsm1, n + ((asm1[n] | bsm1[n]) != 0));
#endif /* SMALLER_RECURSION */

  /* v1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
  mpn_mul_n (v1, as1, bs1, n);
  if (as1[n] == 1)
    {
      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
    }
  else if (as1[n] == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
#else
      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
#endif
    }
  else if (as1[n] != 0)
    {
      cy = as1[n] * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, as1[n]);
    }
  else
    cy = 0;
  if (bs1[n] == 1)
    {
      cy += mpn_add_n (v1 + n, v1 + n, as1, n);
    }
  else if (bs1[n] == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy += mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
#else
      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  v1[2 * n] = cy;
#else /* SMALLER_RECURSION */
  v1[2 * n] = 0;
  mpn_mul_n (v1, as1, bs1, n + ((as1[n] | bs1[n]) != 0));
#endif /* SMALLER_RECURSION */

  mpn_mul_n (v0, a0, b0, n);                    /* v0, 2n limbs */

  /* vinf, s+t limbs */
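  /* mpn_mul requires its first operand to be at least as long as its second,
     hence the branch on s > t.  */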
  if (s > t)  mpn_mul (vinf, a4, s, b2, t);
  else        mpn_mul (vinf, b2, t, a4, s);

  mpn_toom_interpolate_7pts (pp, n, flags, vm2, vm1, v2, vh, s + t,
                             scratch_out);

  TMP_FREE;
}