1/* mpn_mul -- Multiply two natural numbers.
2
3   Contributed to the GNU project by Torbjorn Granlund.
4
5Copyright 1991, 1993, 1994, 1996, 1997, 1999-2003, 2005-2007, 2009, 2010, 2012,
62014, 2019 Free Software Foundation, Inc.
7
8This file is part of the GNU MP Library.
9
10The GNU MP Library is free software; you can redistribute it and/or modify
11it under the terms of either:
12
13  * the GNU Lesser General Public License as published by the Free
14    Software Foundation; either version 3 of the License, or (at your
15    option) any later version.
16
17or
18
19  * the GNU General Public License as published by the Free Software
20    Foundation; either version 2 of the License, or (at your option) any
21    later version.
22
23or both in parallel, as here.
24
25The GNU MP Library is distributed in the hope that it will be useful, but
26WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
27or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
28for more details.
29
30You should have received copies of the GNU General Public License and the
31GNU Lesser General Public License along with the GNU MP Library.  If not,
32see https://www.gnu.org/licenses/.  */
33
34#include "gmp-impl.h"
35
36
37#ifndef MUL_BASECASE_MAX_UN
38#define MUL_BASECASE_MAX_UN 500
39#endif
40
41/* Areas where the different toom algorithms can be called (extracted
42   from the t-toom*.c files, and ignoring small constant offsets):
43
44   1/6  1/5 1/4 4/13 1/3 3/8 2/5 5/11 1/2 3/5 2/3 3/4 4/5   1 vn/un
45                                        4/7              6/7
46				       6/11
47                                       |--------------------| toom22 (small)
48                                                           || toom22 (large)
49                                                       |xxxx| toom22 called
50                      |-------------------------------------| toom32
51                                         |xxxxxxxxxxxxxxxx| | toom32 called
52                                               |------------| toom33
53                                                          |x| toom33 called
54             |---------------------------------|            | toom42
55	              |xxxxxxxxxxxxxxxxxxxxxxxx|            | toom42 called
56                                       |--------------------| toom43
57                                               |xxxxxxxxxx|   toom43 called
58         |-----------------------------|                      toom52 (unused)
59                                                   |--------| toom44
60						   |xxxxxxxx| toom44 called
61                              |--------------------|        | toom53
62                                        |xxxxxx|              toom53 called
63    |-------------------------|                               toom62 (unused)
64                                           |----------------| toom54 (unused)
65                      |--------------------|                  toom63
66	                      |xxxxxxxxx|                   | toom63 called
67                          |---------------------------------| toom6h
68						   |xxxxxxxx| toom6h called
69                                  |-------------------------| toom8h (32 bit)
70                 |------------------------------------------| toom8h (64 bit)
71						   |xxxxxxxx| toom8h called
72*/
73
74#define TOOM33_OK(an,bn) (6 + 2 * an < 3 * bn)
75#define TOOM44_OK(an,bn) (12 + 3 * an < 4 * bn)
76
77/* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v
78   (pointed to by VP, with VN limbs), and store the result at PRODP.  The
79   result is UN + VN limbs.  Return the most significant limb of the result.
80
81   NOTE: The space pointed to by PRODP is overwritten before finished with U
82   and V, so overlap is an error.
83
84   Argument constraints:
85   1. UN >= VN.
86   2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from
87      the multiplier and the multiplicand.  */
88
89/*
90  * The cutoff lines in the toomX2 and toomX3 code are now exactly between the
91    ideal lines of the surrounding algorithms.  Is that optimal?
92
93  * The toomX3 code now uses a structure similar to the one of toomX2, except
94    that it loops longer in the unbalanced case.  The result is that the
95    remaining area might have un < vn.  Should we fix the toomX2 code in a
96    similar way?
97
98  * The toomX3 code is used for the largest non-FFT unbalanced operands.  It
99    therefore calls mpn_mul recursively for certain cases.
100
101  * Allocate static temp space using THRESHOLD variables (except for toom44
102    when !WANT_FFT).  That way, we can typically have no TMP_ALLOC at all.
103
104  * We sort ToomX2 algorithms together, assuming the toom22, toom32, toom42
105    have the same vn threshold.  This is not true, we should actually use
106    mul_basecase for slightly larger operands for toom32 than for toom22, and
107    even larger for toom42.
108
109  * That problem is even more prevalent for toomX3.  We therefore use special
110    THRESHOLD variables there.
111*/
112
113mp_limb_t
114mpn_mul (mp_ptr prodp,
115	 mp_srcptr up, mp_size_t un,
116	 mp_srcptr vp, mp_size_t vn)
117{
118  ASSERT (un >= vn);
119  ASSERT (vn >= 1);
120  ASSERT (! MPN_OVERLAP_P (prodp, un+vn, up, un));
121  ASSERT (! MPN_OVERLAP_P (prodp, un+vn, vp, vn));
122
123  if (BELOW_THRESHOLD (un, MUL_TOOM22_THRESHOLD))
124    {
125      /* When un (and thus vn) is below the toom22 range, do mul_basecase.
126	 Test un and not vn here not to thwart the un >> vn code below.
127	 This special case is not necessary, but cuts the overhead for the
128	 smallest operands. */
129      mpn_mul_basecase (prodp, up, un, vp, vn);
130    }
131  else if (un == vn)
132    {
133      mpn_mul_n (prodp, up, vp, un);
134    }
135  else if (vn < MUL_TOOM22_THRESHOLD)
136    { /* plain schoolbook multiplication */
137
138      /* Unless un is very large, or else if have an applicable mpn_mul_N,
139	 perform basecase multiply directly.  */
140      if (un <= MUL_BASECASE_MAX_UN
141#if HAVE_NATIVE_mpn_mul_2
142	  || vn <= 2
143#else
144	  || vn == 1
145#endif
146	  )
147	mpn_mul_basecase (prodp, up, un, vp, vn);
148      else
149	{
150	  /* We have un >> MUL_BASECASE_MAX_UN > vn.  For better memory
151	     locality, split up[] into MUL_BASECASE_MAX_UN pieces and multiply
152	     these pieces with the vp[] operand.  After each such partial
153	     multiplication (but the last) we copy the most significant vn
154	     limbs into a temporary buffer since that part would otherwise be
155	     overwritten by the next multiplication.  After the next
156	     multiplication, we add it back.  This illustrates the situation:
157
158                                                    -->vn<--
159                                                      |  |<------- un ------->|
160                                                         _____________________|
161                                                        X                    /|
162                                                      /XX__________________/  |
163                                    _____________________                     |
164                                   X                    /                     |
165                                 /XX__________________/                       |
166               _____________________                                          |
167              /                    /                                          |
168            /____________________/                                            |
169	    ==================================================================
170
171	    The parts marked with X are the parts whose sums are copied into
172	    the temporary buffer.  */
173
174	  mp_limb_t tp[MUL_TOOM22_THRESHOLD_LIMIT];
175	  mp_limb_t cy;
176	  ASSERT (MUL_TOOM22_THRESHOLD <= MUL_TOOM22_THRESHOLD_LIMIT);
177
178	  mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
179	  prodp += MUL_BASECASE_MAX_UN;
180	  MPN_COPY (tp, prodp, vn);		/* preserve high triangle */
181	  up += MUL_BASECASE_MAX_UN;
182	  un -= MUL_BASECASE_MAX_UN;
183	  while (un > MUL_BASECASE_MAX_UN)
184	    {
185	      mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
186	      cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
187	      mpn_incr_u (prodp + vn, cy);
188	      prodp += MUL_BASECASE_MAX_UN;
189	      MPN_COPY (tp, prodp, vn);		/* preserve high triangle */
190	      up += MUL_BASECASE_MAX_UN;
191	      un -= MUL_BASECASE_MAX_UN;
192	    }
193	  if (un > vn)
194	    {
195	      mpn_mul_basecase (prodp, up, un, vp, vn);
196	    }
197	  else
198	    {
199	      ASSERT (un > 0);
200	      mpn_mul_basecase (prodp, vp, vn, up, un);
201	    }
202	  cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
203	  mpn_incr_u (prodp + vn, cy);
204	}
205    }
206  else if (BELOW_THRESHOLD (vn, MUL_TOOM33_THRESHOLD))
207    {
208      /* Use ToomX2 variants */
209      mp_ptr scratch;
210      TMP_SDECL; TMP_SMARK;
211
212#define ITCH_TOOMX2 (9 * vn / 2 + GMP_NUMB_BITS * 2)
213      scratch = TMP_SALLOC_LIMBS (ITCH_TOOMX2);
214      ASSERT (mpn_toom22_mul_itch ((5*vn-1)/4, vn) <= ITCH_TOOMX2); /* 5vn/2+ */
215      ASSERT (mpn_toom32_mul_itch ((7*vn-1)/4, vn) <= ITCH_TOOMX2); /* 7vn/6+ */
216      ASSERT (mpn_toom42_mul_itch (3 * vn - 1, vn) <= ITCH_TOOMX2); /* 9vn/2+ */
217#undef ITCH_TOOMX2
218
219      /* FIXME: This condition (repeated in the loop below) leaves from a vn*vn
220	 square to a (3vn-1)*vn rectangle.  Leaving such a rectangle is hardly
221	 wise; we would get better balance by slightly moving the bound.  We
222	 will sometimes end up with un < vn, like in the X3 arm below.  */
223      if (un >= 3 * vn)
224	{
225	  mp_limb_t cy;
226	  mp_ptr ws;
227
228	  /* The maximum ws usage is for the mpn_mul result.  */
229	  ws = TMP_SALLOC_LIMBS (4 * vn);
230
231	  mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
232	  un -= 2 * vn;
233	  up += 2 * vn;
234	  prodp += 2 * vn;
235
236	  while (un >= 3 * vn)
237	    {
238	      mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
239	      un -= 2 * vn;
240	      up += 2 * vn;
241	      cy = mpn_add_n (prodp, prodp, ws, vn);
242	      MPN_COPY (prodp + vn, ws + vn, 2 * vn);
243	      mpn_incr_u (prodp + vn, cy);
244	      prodp += 2 * vn;
245	    }
246
247	  /* vn <= un < 3vn */
248
249	  if (4 * un < 5 * vn)
250	    mpn_toom22_mul (ws, up, un, vp, vn, scratch);
251	  else if (4 * un < 7 * vn)
252	    mpn_toom32_mul (ws, up, un, vp, vn, scratch);
253	  else
254	    mpn_toom42_mul (ws, up, un, vp, vn, scratch);
255
256	  cy = mpn_add_n (prodp, prodp, ws, vn);
257	  MPN_COPY (prodp + vn, ws + vn, un);
258	  mpn_incr_u (prodp + vn, cy);
259	}
260      else
261	{
262	  if (4 * un < 5 * vn)
263	    mpn_toom22_mul (prodp, up, un, vp, vn, scratch);
264	  else if (4 * un < 7 * vn)
265	    mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
266	  else
267	    mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
268	}
269      TMP_SFREE;
270    }
271  else if (BELOW_THRESHOLD ((un + vn) >> 1, MUL_FFT_THRESHOLD) ||
272	   BELOW_THRESHOLD (3 * vn, MUL_FFT_THRESHOLD))
273    {
274      /* Handle the largest operands that are not in the FFT range.  The 2nd
275	 condition makes very unbalanced operands avoid the FFT code (except
276	 perhaps as coefficient products of the Toom code.  */
277
278      if (BELOW_THRESHOLD (vn, MUL_TOOM44_THRESHOLD) || !TOOM44_OK (un, vn))
279	{
280	  /* Use ToomX3 variants */
281	  mp_ptr scratch;
282	  TMP_DECL; TMP_MARK;
283
284#define ITCH_TOOMX3 (4 * vn + GMP_NUMB_BITS)
285	  scratch = TMP_ALLOC_LIMBS (ITCH_TOOMX3);
286	  ASSERT (mpn_toom33_mul_itch ((7*vn-1)/6, vn) <= ITCH_TOOMX3); /* 7vn/2+ */
287	  ASSERT (mpn_toom43_mul_itch ((3*vn-1)/2, vn) <= ITCH_TOOMX3); /* 9vn/4+ */
288	  ASSERT (mpn_toom32_mul_itch ((7*vn-1)/4, vn) <= ITCH_TOOMX3); /* 7vn/6+ */
289	  ASSERT (mpn_toom53_mul_itch ((11*vn-1)/6, vn) <= ITCH_TOOMX3); /* 11vn/3+ */
290	  ASSERT (mpn_toom42_mul_itch ((5*vn-1)/2, vn) <= ITCH_TOOMX3); /* 15vn/4+ */
291	  ASSERT (mpn_toom63_mul_itch ((5*vn-1)/2, vn) <= ITCH_TOOMX3); /* 15vn/4+ */
292#undef ITCH_TOOMX3
293
294	  if (2 * un >= 5 * vn)
295	    {
296	      mp_limb_t cy;
297	      mp_ptr ws;
298
299	      /* The maximum ws usage is for the mpn_mul result.  */
300	      ws = TMP_ALLOC_LIMBS (7 * vn >> 1);
301
302	      if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
303		mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
304	      else
305		mpn_toom63_mul (prodp, up, 2 * vn, vp, vn, scratch);
306	      un -= 2 * vn;
307	      up += 2 * vn;
308	      prodp += 2 * vn;
309
310	      while (2 * un >= 5 * vn)	/* un >= 2.5vn */
311		{
312		  if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
313		    mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
314		  else
315		    mpn_toom63_mul (ws, up, 2 * vn, vp, vn, scratch);
316		  un -= 2 * vn;
317		  up += 2 * vn;
318		  cy = mpn_add_n (prodp, prodp, ws, vn);
319		  MPN_COPY (prodp + vn, ws + vn, 2 * vn);
320		  mpn_incr_u (prodp + vn, cy);
321		  prodp += 2 * vn;
322		}
323
324	      /* vn / 2 <= un < 2.5vn */
325
326	      if (un < vn)
327		mpn_mul (ws, vp, vn, up, un);
328	      else
329		mpn_mul (ws, up, un, vp, vn);
330
331	      cy = mpn_add_n (prodp, prodp, ws, vn);
332	      MPN_COPY (prodp + vn, ws + vn, un);
333	      mpn_incr_u (prodp + vn, cy);
334	    }
335	  else
336	    {
337	      if (6 * un < 7 * vn)
338		mpn_toom33_mul (prodp, up, un, vp, vn, scratch);
339	      else if (2 * un < 3 * vn)
340		{
341		  if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM43_THRESHOLD))
342		    mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
343		  else
344		    mpn_toom43_mul (prodp, up, un, vp, vn, scratch);
345		}
346	      else if (6 * un < 11 * vn)
347		{
348		  if (4 * un < 7 * vn)
349		    {
350		      if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM53_THRESHOLD))
351			mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
352		      else
353			mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
354		    }
355		  else
356		    {
357		      if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM53_THRESHOLD))
358			mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
359		      else
360			mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
361		    }
362		}
363	      else
364		{
365		  if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
366		    mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
367		  else
368		    mpn_toom63_mul (prodp, up, un, vp, vn, scratch);
369		}
370	    }
371	  TMP_FREE;
372	}
373      else
374	{
375	  mp_ptr scratch;
376	  TMP_DECL; TMP_MARK;
377
378	  if (BELOW_THRESHOLD (vn, MUL_TOOM6H_THRESHOLD))
379	    {
380	      scratch = TMP_SALLOC_LIMBS (mpn_toom44_mul_itch (un, vn));
381	      mpn_toom44_mul (prodp, up, un, vp, vn, scratch);
382	    }
383	  else if (BELOW_THRESHOLD (vn, MUL_TOOM8H_THRESHOLD))
384	    {
385	      scratch = TMP_SALLOC_LIMBS (mpn_toom6h_mul_itch (un, vn));
386	      mpn_toom6h_mul (prodp, up, un, vp, vn, scratch);
387	    }
388	  else
389	    {
390	      scratch = TMP_ALLOC_LIMBS (mpn_toom8h_mul_itch (un, vn));
391	      mpn_toom8h_mul (prodp, up, un, vp, vn, scratch);
392	    }
393	  TMP_FREE;
394	}
395    }
396  else
397    {
398      if (un >= 8 * vn)
399	{
400	  mp_limb_t cy;
401	  mp_ptr ws;
402	  TMP_DECL; TMP_MARK;
403
404	  /* The maximum ws usage is for the mpn_mul result.  */
405	  ws = TMP_BALLOC_LIMBS (9 * vn >> 1);
406
407	  mpn_fft_mul (prodp, up, 3 * vn, vp, vn);
408	  un -= 3 * vn;
409	  up += 3 * vn;
410	  prodp += 3 * vn;
411
412	  while (2 * un >= 7 * vn)	/* un >= 3.5vn  */
413	    {
414	      mpn_fft_mul (ws, up, 3 * vn, vp, vn);
415	      un -= 3 * vn;
416	      up += 3 * vn;
417	      cy = mpn_add_n (prodp, prodp, ws, vn);
418	      MPN_COPY (prodp + vn, ws + vn, 3 * vn);
419	      mpn_incr_u (prodp + vn, cy);
420	      prodp += 3 * vn;
421	    }
422
423	  /* vn / 2 <= un < 3.5vn */
424
425	  if (un < vn)
426	    mpn_mul (ws, vp, vn, up, un);
427	  else
428	    mpn_mul (ws, up, un, vp, vn);
429
430	  cy = mpn_add_n (prodp, prodp, ws, vn);
431	  MPN_COPY (prodp + vn, ws + vn, un);
432	  mpn_incr_u (prodp + vn, cy);
433
434	  TMP_FREE;
435	}
436      else
437	mpn_fft_mul (prodp, up, un, vp, vn);
438    }
439
440  return prodp[un + vn - 1];	/* historic */
441}
442