1/* mpn_toom52_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 4/3
2   times as large as bn.  Or more accurately, bn < an < 2 bn.
3
4   Contributed to the GNU project by Marco Bodrato.
5
6   The idea of applying toom to unbalanced multiplication is due to Marco
7   Bodrato and Alberto Zanoni.
8
9   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
10   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
11   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
12
13Copyright 2009 Free Software Foundation, Inc.
14
15This file is part of the GNU MP Library.
16
17The GNU MP Library is free software; you can redistribute it and/or modify
18it under the terms of the GNU Lesser General Public License as published by
19the Free Software Foundation; either version 3 of the License, or (at your
20option) any later version.
21
22The GNU MP Library is distributed in the hope that it will be useful, but
23WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
25License for more details.
26
27You should have received a copy of the GNU Lesser General Public License
28along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
29
30
31#include "gmp.h"
32#include "gmp-impl.h"
33
34/* Evaluate in: -2, -1, 0, +1, +2, +inf
35
36  <-s-><--n--><--n--><--n--><--n-->
37   ___ ______ ______ ______ ______
38  |a4_|___a3_|___a2_|___a1_|___a0_|
39			|b1|___b0_|
40			<t-><--n-->
41
42  v0  =  a0                  * b0      #   A(0)*B(0)
43  v1  = (a0+ a1+ a2+ a3+  a4)*(b0+ b1) #   A(1)*B(1)      ah  <= 4   bh <= 1
44  vm1 = (a0- a1+ a2- a3+  a4)*(b0- b1) #  A(-1)*B(-1)    |ah| <= 2   bh  = 0
45  v2  = (a0+2a1+4a2+8a3+16a4)*(b0+2b1) #   A(2)*B(2)      ah  <= 30  bh <= 2
46  vm2 = (a0-2a1+4a2-8a3+16a4)*(b0-2b1) #  A(-2)*B(-2)    |ah| <= 20 |bh|<= 1
47  vinf=                   a4 *     b1  # A(inf)*B(inf)
48
49  Some slight optimization in evaluation are taken from the paper:
50  "Towards Optimal Toom-Cook Multiplication for Univariate and
51  Multivariate Polynomials in Characteristic 2 and 0."
52*/
53
54void
55mpn_toom52_mul (mp_ptr pp,
56		mp_srcptr ap, mp_size_t an,
57		mp_srcptr bp, mp_size_t bn, mp_ptr scratch)
58{
59  mp_size_t n, s, t;
60  enum toom6_flags flags;
61
62#define a0  ap
63#define a1  (ap + n)
64#define a2  (ap + 2 * n)
65#define a3  (ap + 3 * n)
66#define a4  (ap + 4 * n)
67#define b0  bp
68#define b1  (bp + n)
69
70  n = 1 + (2 * an >= 5 * bn ? (an - 1) / (size_t) 5 : (bn - 1) >> 1);
71
72  s = an - 4 * n;
73  t = bn - n;
74
75  ASSERT (0 < s && s <= n);
76  ASSERT (0 < t && t <= n);
77
78  /* Ensures that 5 values of n+1 limbs each fits in the product area.
79     Borderline cases are an = 32, bn = 8, n = 7, and an = 36, bn = 9,
80     n = 8. */
81  ASSERT (s+t >= 5);
82
83#define v0    pp				/* 2n */
84#define vm1   (scratch)				/* 2n+1 */
85#define v1    (pp + 2 * n)			/* 2n+1 */
86#define vm2   (scratch + 2 * n + 1)		/* 2n+1 */
87#define v2    (scratch + 4 * n + 2)		/* 2n+1 */
88#define vinf  (pp + 5 * n)			/* s+t */
89#define bs1    pp				/* n+1 */
90#define bsm1  (scratch + 2 * n + 2)		/* n   */
91#define asm1  (scratch + 3 * n + 3)		/* n+1 */
92#define asm2  (scratch + 4 * n + 4)		/* n+1 */
93#define bsm2  (pp + n + 1)			/* n+1 */
94#define bs2   (pp + 2 * n + 2)			/* n+1 */
95#define as2   (pp + 3 * n + 3)			/* n+1 */
96#define as1   (pp + 4 * n + 4)			/* n+1 */
97
98  /* Scratch need is 6 * n + 3 + 1. We need one extra limb, because
99     products will overwrite 2n+2 limbs. */
100
101#define a0a2  scratch
102#define a1a3  asm1
103
104  /* Compute as2 and asm2.  */
105  flags = toom6_vm2_neg & mpn_toom_eval_pm2 (as2, asm2, 4, ap, n, s, a1a3);
106
107  /* Compute bs1 and bsm1.  */
108  if (t == n)
109    {
110#if HAVE_NATIVE_mpn_add_n_sub_n
111      mp_limb_t cy;
112
113      if (mpn_cmp (b0, b1, n) < 0)
114	{
115	  cy = mpn_add_n_sub_n (bs1, bsm1, b1, b0, n);
116	  flags ^= toom6_vm1_neg;
117	}
118      else
119	{
120	  cy = mpn_add_n_sub_n (bs1, bsm1, b0, b1, n);
121	}
122      bs1[n] = cy >> 1;
123#else
124      bs1[n] = mpn_add_n (bs1, b0, b1, n);
125      if (mpn_cmp (b0, b1, n) < 0)
126	{
127	  mpn_sub_n (bsm1, b1, b0, n);
128	  flags ^= toom6_vm1_neg;
129	}
130      else
131	{
132	  mpn_sub_n (bsm1, b0, b1, n);
133	}
134#endif
135    }
136  else
137    {
138      bs1[n] = mpn_add (bs1, b0, n, b1, t);
139      if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
140	{
141	  mpn_sub_n (bsm1, b1, b0, t);
142	  MPN_ZERO (bsm1 + t, n - t);
143	  flags ^= toom6_vm1_neg;
144	}
145      else
146	{
147	  mpn_sub (bsm1, b0, n, b1, t);
148	}
149    }
150
151  /* Compute bs2 and bsm2, recycling bs1 and bsm1. bs2=bs1+b1; bsm2=bsm1-b1  */
152  mpn_add (bs2, bs1, n+1, b1, t);
153  if (flags & toom6_vm1_neg )
154    {
155      bsm2[n] = mpn_add (bsm2, bsm1, n, b1, t);
156      flags ^= toom6_vm2_neg;
157    }
158  else
159    {
160      bsm2[n] = 0;
161      if (t == n)
162	{
163	  if (mpn_cmp (bsm1, b1, n) < 0)
164	    {
165	      mpn_sub_n (bsm2, b1, bsm1, n);
166	      flags ^= toom6_vm2_neg;
167	    }
168	  else
169	    {
170	      mpn_sub_n (bsm2, bsm1, b1, n);
171	    }
172	}
173      else
174	{
175	  if (mpn_zero_p (bsm1 + t, n - t) && mpn_cmp (bsm1, b1, t) < 0)
176	    {
177	      mpn_sub_n (bsm2, b1, bsm1, t);
178	      MPN_ZERO (bsm2 + t, n - t);
179	      flags ^= toom6_vm2_neg;
180	    }
181	  else
182	    {
183	      mpn_sub (bsm2, bsm1, n, b1, t);
184	    }
185	}
186    }
187
188  /* Compute as1 and asm1.  */
189  flags ^= toom6_vm1_neg & mpn_toom_eval_pm1 (as1, asm1, 4, ap, n, s, a0a2);
190
191  ASSERT (as1[n] <= 4);
192  ASSERT (bs1[n] <= 1);
193  ASSERT (asm1[n] <= 2);
194/*   ASSERT (bsm1[n] <= 1); */
195  ASSERT (as2[n] <=30);
196  ASSERT (bs2[n] <= 2);
197  ASSERT (asm2[n] <= 20);
198  ASSERT (bsm2[n] <= 1);
199
200  /* vm1, 2n+1 limbs */
201  mpn_mul (vm1, asm1, n+1, bsm1, n);  /* W4 */
202
203  /* vm2, 2n+1 limbs */
204  mpn_mul_n (vm2, asm2, bsm2, n+1);  /* W2 */
205
206  /* v2, 2n+1 limbs */
207  mpn_mul_n (v2, as2, bs2, n+1);  /* W1 */
208
209  /* v1, 2n+1 limbs */
210  mpn_mul_n (v1, as1, bs1, n+1);  /* W3 */
211
212  /* vinf, s+t limbs */   /* W0 */
213  if (s > t)  mpn_mul (vinf, a4, s, b1, t);
214  else        mpn_mul (vinf, b1, t, a4, s);
215
216  /* v0, 2n limbs */
217  mpn_mul_n (v0, ap, bp, n);  /* W5 */
218
219  mpn_toom_interpolate_6pts (pp, n, flags, vm1, vm2, v2, t + s);
220
221#undef v0
222#undef vm1
223#undef v1
224#undef vm2
225#undef v2
226#undef vinf
227#undef bs1
228#undef bs2
229#undef bsm1
230#undef bsm2
231#undef asm1
232#undef asm2
233#undef as1
234#undef as2
235#undef a0a2
236#undef b0b2
237#undef a1a3
238#undef a0
239#undef a1
240#undef a2
241#undef a3
242#undef b0
243#undef b1
244#undef b2
245
246}
247