1/* mpn_mul -- Multiply two natural numbers.
2
3   Contributed to the GNU project by Torbjorn Granlund.
4
5Copyright 1991, 1993, 1994, 1996, 1997, 1999, 2000, 2001, 2002, 2003, 2005,
62006, 2007, 2009, 2010 Free Software Foundation, Inc.
7
8This file is part of the GNU MP Library.
9
10The GNU MP Library is free software; you can redistribute it and/or modify
11it under the terms of the GNU Lesser General Public License as published by
12the Free Software Foundation; either version 3 of the License, or (at your
13option) any later version.
14
15The GNU MP Library is distributed in the hope that it will be useful, but
16WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
17or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
18License for more details.
19
20You should have received a copy of the GNU Lesser General Public License
21along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
22
23#include "gmp.h"
24#include "gmp-impl.h"
25
26
27#ifndef MUL_BASECASE_MAX_UN
28#define MUL_BASECASE_MAX_UN 500
29#endif
30
31#define TOOM33_OK(an,bn) (6 + 2 * an < 3 * bn)
32#define TOOM44_OK(an,bn) (12 + 3 * an < 4 * bn)
33
34/* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v
35   (pointed to by VP, with VN limbs), and store the result at PRODP.  The
36   result is UN + VN limbs.  Return the most significant limb of the result.
37
38   NOTE: The space pointed to by PRODP is overwritten before finished with U
39   and V, so overlap is an error.
40
41   Argument constraints:
42   1. UN >= VN.
43   2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from
44      the multiplier and the multiplicand.  */
45
46/*
47  * The cutoff lines in the toomX2 and toomX3 code are now exactly between the
48    ideal lines of the surrounding algorithms.  Is that optimal?
49
50  * The toomX3 code now uses a structure similar to the one of toomX2, except
51    that it loops longer in the unbalanced case.  The result is that the
52    remaining area might have un < vn.  Should we fix the toomX2 code in a
53    similar way?
54
55  * The toomX3 code is used for the largest non-FFT unbalanced operands.  It
56    therefore calls mpn_mul recursively for certain cases.
57
58  * Allocate static temp space using THRESHOLD variables (except for toom44
59    when !WANT_FFT).  That way, we can typically have no TMP_ALLOC at all.
60
61  * We sort ToomX2 algorithms together, assuming the toom22, toom32, toom42
62    have the same vn threshold.  This is not true, we should actually use
63    mul_basecase for slightly larger operands for toom32 than for toom22, and
64    even larger for toom42.
65
66  * That problem is even more prevalent for toomX3.  We therefore use special
67    THRESHOLD variables there.
68
69  * Is our ITCH allocation correct?
70*/
71
72#define ITCH (16*vn + 100)
73
74mp_limb_t
75mpn_mul (mp_ptr prodp,
76	 mp_srcptr up, mp_size_t un,
77	 mp_srcptr vp, mp_size_t vn)
78{
79  ASSERT (un >= vn);
80  ASSERT (vn >= 1);
81  ASSERT (! MPN_OVERLAP_P (prodp, un+vn, up, un));
82  ASSERT (! MPN_OVERLAP_P (prodp, un+vn, vp, vn));
83
84  if (un == vn)
85    {
86      if (up == vp)
87	mpn_sqr (prodp, up, un);
88      else
89	mpn_mul_n (prodp, up, vp, un);
90    }
91  else if (vn < MUL_TOOM22_THRESHOLD)
92    { /* plain schoolbook multiplication */
93
94      /* Unless un is very large, or else if have an applicable mpn_mul_N,
95	 perform basecase multiply directly.  */
96      if (un <= MUL_BASECASE_MAX_UN
97#if HAVE_NATIVE_mpn_mul_2
98	  || vn <= 2
99#else
100	  || vn == 1
101#endif
102	  )
103	mpn_mul_basecase (prodp, up, un, vp, vn);
104      else
105	{
106	  /* We have un >> MUL_BASECASE_MAX_UN > vn.  For better memory
107	     locality, split up[] into MUL_BASECASE_MAX_UN pieces and multiply
108	     these pieces with the vp[] operand.  After each such partial
109	     multiplication (but the last) we copy the most significant vn
110	     limbs into a temporary buffer since that part would otherwise be
111	     overwritten by the next multiplication.  After the next
112	     multiplication, we add it back.  This illustrates the situation:
113
114                                                    -->vn<--
115                                                      |  |<------- un ------->|
116                                                         _____________________|
117                                                        X                    /|
118                                                      /XX__________________/  |
119                                    _____________________                     |
120                                   X                    /                     |
121                                 /XX__________________/                       |
122               _____________________                                          |
123              /                    /                                          |
124            /____________________/                                            |
125	    ==================================================================
126
127	    The parts marked with X are the parts whose sums are copied into
128	    the temporary buffer.  */
129
130	  mp_limb_t tp[MUL_TOOM22_THRESHOLD_LIMIT];
131	  mp_limb_t cy;
132	  ASSERT (MUL_TOOM22_THRESHOLD <= MUL_TOOM22_THRESHOLD_LIMIT);
133
134	  mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
135	  prodp += MUL_BASECASE_MAX_UN;
136	  MPN_COPY (tp, prodp, vn);		/* preserve high triangle */
137	  up += MUL_BASECASE_MAX_UN;
138	  un -= MUL_BASECASE_MAX_UN;
139	  while (un > MUL_BASECASE_MAX_UN)
140	    {
141	      mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
142	      cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
143	      mpn_incr_u (prodp + vn, cy);
144	      prodp += MUL_BASECASE_MAX_UN;
145	      MPN_COPY (tp, prodp, vn);		/* preserve high triangle */
146	      up += MUL_BASECASE_MAX_UN;
147	      un -= MUL_BASECASE_MAX_UN;
148	    }
149	  if (un > vn)
150	    {
151	      mpn_mul_basecase (prodp, up, un, vp, vn);
152	    }
153	  else
154	    {
155	      ASSERT (un > 0);
156	      mpn_mul_basecase (prodp, vp, vn, up, un);
157	    }
158	  cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
159	  mpn_incr_u (prodp + vn, cy);
160	}
161    }
162  else if (BELOW_THRESHOLD (vn, MUL_TOOM33_THRESHOLD))
163    {
164      /* Use ToomX2 variants */
165      mp_ptr scratch;
166      TMP_SDECL; TMP_SMARK;
167
168      scratch = TMP_SALLOC_LIMBS (ITCH);
169
170      /* FIXME: This condition (repeated in the loop below) leaves from a vn*vn
171	 square to a (3vn-1)*vn rectangle.  Leaving such a rectangle is hardly
172	 wise; we would get better balance by slightly moving the bound.  We
173	 will sometimes end up with un < vn, like the the X3 arm below.  */
174      if (un >= 3 * vn)
175	{
176	  mp_limb_t cy;
177	  mp_ptr ws;
178
179	  /* The maximum ws usage is for the mpn_mul result.  */
180	  ws = TMP_SALLOC_LIMBS (4 * vn);
181
182	  mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
183	  un -= 2 * vn;
184	  up += 2 * vn;
185	  prodp += 2 * vn;
186
187	  while (un >= 3 * vn)
188	    {
189	      mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
190	      un -= 2 * vn;
191	      up += 2 * vn;
192	      cy = mpn_add_n (prodp, prodp, ws, vn);
193	      MPN_COPY (prodp + vn, ws + vn, 2 * vn);
194	      mpn_incr_u (prodp + vn, cy);
195	      prodp += 2 * vn;
196	    }
197
198	  /* vn <= un < 3vn */
199
200	  if (4 * un < 5 * vn)
201	    mpn_toom22_mul (ws, up, un, vp, vn, scratch);
202	  else if (4 * un < 7 * vn)
203	    mpn_toom32_mul (ws, up, un, vp, vn, scratch);
204	  else
205	    mpn_toom42_mul (ws, up, un, vp, vn, scratch);
206
207	  cy = mpn_add_n (prodp, prodp, ws, vn);
208	  MPN_COPY (prodp + vn, ws + vn, un);
209	  mpn_incr_u (prodp + vn, cy);
210	}
211      else
212	{
213	  if (4 * un < 5 * vn)
214	    mpn_toom22_mul (prodp, up, un, vp, vn, scratch);
215	  else if (4 * un < 7 * vn)
216	    mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
217	  else
218	    mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
219	}
220      TMP_SFREE;
221    }
222  else if (BELOW_THRESHOLD ((un + vn) >> 1, MUL_FFT_THRESHOLD) ||
223	   BELOW_THRESHOLD (3 * vn, MUL_FFT_THRESHOLD))
224    {
225      /* Handle the largest operands that are not in the FFT range.  The 2nd
226	 condition makes very unbalanced operands avoid the FFT code (except
227	 perhaps as coefficient products of the Toom code.  */
228
229      if (BELOW_THRESHOLD (vn, MUL_TOOM44_THRESHOLD) || !TOOM44_OK (un, vn))
230	{
231	  /* Use ToomX3 variants */
232	  mp_ptr scratch;
233	  TMP_SDECL; TMP_SMARK;
234
235	  scratch = TMP_SALLOC_LIMBS (ITCH);
236
237	  if (2 * un >= 5 * vn)
238	    {
239	      mp_limb_t cy;
240	      mp_ptr ws;
241
242	      /* The maximum ws usage is for the mpn_mul result.  */
243	      ws = TMP_SALLOC_LIMBS (7 * vn >> 1);
244
245	      if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
246		mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
247	      else
248		mpn_toom63_mul (prodp, up, 2 * vn, vp, vn, scratch);
249	      un -= 2 * vn;
250	      up += 2 * vn;
251	      prodp += 2 * vn;
252
253	      while (2 * un >= 5 * vn)	/* un >= 2.5vn */
254		{
255		  if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
256		    mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
257		  else
258		    mpn_toom63_mul (ws, up, 2 * vn, vp, vn, scratch);
259		  un -= 2 * vn;
260		  up += 2 * vn;
261		  cy = mpn_add_n (prodp, prodp, ws, vn);
262		  MPN_COPY (prodp + vn, ws + vn, 2 * vn);
263		  mpn_incr_u (prodp + vn, cy);
264		  prodp += 2 * vn;
265		}
266
267	      /* vn / 2 <= un < 2.5vn */
268
269	      if (un < vn)
270		mpn_mul (ws, vp, vn, up, un);
271	      else
272		mpn_mul (ws, up, un, vp, vn);
273
274	      cy = mpn_add_n (prodp, prodp, ws, vn);
275	      MPN_COPY (prodp + vn, ws + vn, un);
276	      mpn_incr_u (prodp + vn, cy);
277	    }
278	  else
279	    {
280	      if (6 * un < 7 * vn)
281		mpn_toom33_mul (prodp, up, un, vp, vn, scratch);
282	      else if (2 * un < 3 * vn)
283		{
284		  if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM43_THRESHOLD))
285		    mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
286		  else
287		    mpn_toom43_mul (prodp, up, un, vp, vn, scratch);
288		}
289	      else if (6 * un < 11 * vn)
290		{
291		  if (4 * un < 7 * vn)
292		    {
293		      if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM53_THRESHOLD))
294			mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
295		      else
296			mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
297		    }
298		  else
299		    {
300		      if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM53_THRESHOLD))
301			mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
302		      else
303			mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
304		    }
305		}
306	      else
307		{
308		  if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
309		    mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
310		  else
311		    mpn_toom63_mul (prodp, up, un, vp, vn, scratch);
312		}
313	    }
314	  TMP_SFREE;
315	}
316      else
317	{
318	  mp_ptr scratch;
319	  TMP_DECL; TMP_MARK;
320
321	  if (BELOW_THRESHOLD (vn, MUL_TOOM6H_THRESHOLD))
322	    {
323	      scratch = TMP_ALLOC_LIMBS (mpn_toom44_mul_itch (un, vn));
324	      mpn_toom44_mul (prodp, up, un, vp, vn, scratch);
325	    }
326	  else if (BELOW_THRESHOLD (vn, MUL_TOOM8H_THRESHOLD))
327	    {
328	      scratch = TMP_ALLOC_LIMBS (mpn_toom6h_mul_itch (un, vn));
329	      mpn_toom6h_mul (prodp, up, un, vp, vn, scratch);
330	    }
331	  else
332	    {
333	      scratch = TMP_ALLOC_LIMBS (mpn_toom8h_mul_itch (un, vn));
334	      mpn_toom8h_mul (prodp, up, un, vp, vn, scratch);
335	    }
336	  TMP_FREE;
337	}
338    }
339  else
340    {
341      if (un >= 8 * vn)
342	{
343	  mp_limb_t cy;
344	  mp_ptr ws;
345	  TMP_DECL; TMP_MARK;
346
347	  /* The maximum ws usage is for the mpn_mul result.  */
348	  ws = TMP_BALLOC_LIMBS (9 * vn >> 1);
349
350	  mpn_fft_mul (prodp, up, 3 * vn, vp, vn);
351	  un -= 3 * vn;
352	  up += 3 * vn;
353	  prodp += 3 * vn;
354
355	  while (2 * un >= 7 * vn)	/* un >= 3.5vn  */
356	    {
357	      mpn_fft_mul (ws, up, 3 * vn, vp, vn);
358	      un -= 3 * vn;
359	      up += 3 * vn;
360	      cy = mpn_add_n (prodp, prodp, ws, vn);
361	      MPN_COPY (prodp + vn, ws + vn, 3 * vn);
362	      mpn_incr_u (prodp + vn, cy);
363	      prodp += 3 * vn;
364	    }
365
366	  /* vn / 2 <= un < 3.5vn */
367
368	  if (un < vn)
369	    mpn_mul (ws, vp, vn, up, un);
370	  else
371	    mpn_mul (ws, up, un, vp, vn);
372
373	  cy = mpn_add_n (prodp, prodp, ws, vn);
374	  MPN_COPY (prodp + vn, ws + vn, un);
375	  mpn_incr_u (prodp + vn, cy);
376
377	  TMP_FREE;
378	}
379      else
380	mpn_fft_mul (prodp, up, un, vp, vn);
381    }
382
383  return prodp[un + vn - 1];	/* historic */
384}
385