1227825Stheraven/* Implementation of the squaring algorithm with Toom-Cook 8.5-way.
2227825Stheraven
3227825Stheraven   Contributed to the GNU project by Marco Bodrato.
4227825Stheraven
5227825Stheraven   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
6227825Stheraven   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
7227825Stheraven   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
8227825Stheraven
9227825StheravenCopyright 2009, 2012 Free Software Foundation, Inc.
10227825Stheraven
11227825StheravenThis file is part of the GNU MP Library.
12227825Stheraven
13227825StheravenThe GNU MP Library is free software; you can redistribute it and/or modify
14227825Stheravenit under the terms of either:
15227825Stheraven
16227825Stheraven  * the GNU Lesser General Public License as published by the Free
17227825Stheraven    Software Foundation; either version 3 of the License, or (at your
18227825Stheraven    option) any later version.
19227825Stheraven
20227825Stheravenor
21227825Stheraven
22227825Stheraven  * the GNU General Public License as published by the Free Software
23227825Stheraven    Foundation; either version 2 of the License, or (at your option) any
24227825Stheraven    later version.
25227825Stheraven
26227825Stheravenor both in parallel, as here.
27227825Stheraven
28227825StheravenThe GNU MP Library is distributed in the hope that it will be useful, but
29227825StheravenWITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
30227825Stheravenor FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
31227825Stheravenfor more details.
32262801Sdim
33262801SdimYou should have received copies of the GNU General Public License and the
34227825StheravenGNU Lesser General Public License along with the GNU MP Library.  If not,
35227825Stheravensee https://www.gnu.org/licenses/.  */
36227825Stheraven
37227825Stheraven
38227825Stheraven#include "gmp-impl.h"
39227825Stheraven
40227825Stheraven#if GMP_NUMB_BITS < 29
41262801Sdim#error Not implemented.
42262801Sdim#endif
43227825Stheraven
44227825Stheraven#if GMP_NUMB_BITS < 43
45227825Stheraven#define BIT_CORRECTION 1
46227825Stheraven#define CORRECTION_BITS GMP_NUMB_BITS
47227825Stheraven#else
48227825Stheraven#define BIT_CORRECTION 0
49227825Stheraven#define CORRECTION_BITS 0
50262801Sdim#endif
51262801Sdim
52227825Stheraven#ifndef SQR_TOOM8_THRESHOLD
53227825Stheraven#define SQR_TOOM8_THRESHOLD MUL_TOOM8H_THRESHOLD
54227825Stheraven#endif
55227825Stheraven
56227825Stheraven#ifndef SQR_TOOM6_THRESHOLD
57227825Stheraven#define SQR_TOOM6_THRESHOLD MUL_TOOM6H_THRESHOLD
58227825Stheraven#endif
59227825Stheraven
60227825Stheraven#if TUNE_PROGRAM_BUILD
61227825Stheraven#define MAYBE_sqr_basecase 1
62227825Stheraven#define MAYBE_sqr_above_basecase   1
63227825Stheraven#define MAYBE_sqr_toom2   1
64227825Stheraven#define MAYBE_sqr_above_toom2   1
65227825Stheraven#define MAYBE_sqr_toom3   1
66227825Stheraven#define MAYBE_sqr_above_toom3   1
67227825Stheraven#define MAYBE_sqr_toom4   1
68227825Stheraven#define MAYBE_sqr_above_toom4   1
69227825Stheraven#define MAYBE_sqr_above_toom6   1
70227825Stheraven#else
71227825Stheraven#define SQR_TOOM8_MAX					\
72227825Stheraven  ((SQR_FFT_THRESHOLD <= MP_SIZE_T_MAX - (8*2-1+7)) ?	\
73227825Stheraven   ((SQR_FFT_THRESHOLD+8*2-1+7)/8)			\
74227825Stheraven   : MP_SIZE_T_MAX )
75227825Stheraven#define MAYBE_sqr_basecase					\
76227825Stheraven  (SQR_TOOM8_THRESHOLD < 8 * SQR_TOOM2_THRESHOLD)
77227825Stheraven#define MAYBE_sqr_above_basecase				\
78227825Stheraven  (SQR_TOOM8_MAX >= SQR_TOOM2_THRESHOLD)
79262801Sdim#define MAYBE_sqr_toom2						\
80227825Stheraven  (SQR_TOOM8_THRESHOLD < 8 * SQR_TOOM3_THRESHOLD)
81227825Stheraven#define MAYBE_sqr_above_toom2					\
82227825Stheraven  (SQR_TOOM8_MAX >= SQR_TOOM3_THRESHOLD)
83227825Stheraven#define MAYBE_sqr_toom3						\
84227825Stheraven  (SQR_TOOM8_THRESHOLD < 8 * SQR_TOOM4_THRESHOLD)
85227825Stheraven#define MAYBE_sqr_above_toom3					\
86227825Stheraven  (SQR_TOOM8_MAX >= SQR_TOOM4_THRESHOLD)
87227825Stheraven#define MAYBE_sqr_toom4						\
88227825Stheraven  (SQR_TOOM8_THRESHOLD < 8 * SQR_TOOM6_THRESHOLD)
89227825Stheraven#define MAYBE_sqr_above_toom4					\
90227825Stheraven  (SQR_TOOM8_MAX >= SQR_TOOM6_THRESHOLD)
91227825Stheraven#define MAYBE_sqr_above_toom6					\
92227825Stheraven  (SQR_TOOM8_MAX >= SQR_TOOM8_THRESHOLD)
93227825Stheraven#endif
94227825Stheraven
95227825Stheraven#define TOOM8_SQR_REC(p, a, f, p2, a2, n, ws)				\
96227825Stheraven  do {									\
97227825Stheraven    if (MAYBE_sqr_basecase && ( !MAYBE_sqr_above_basecase		\
98227825Stheraven	|| BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD))) {			\
99227825Stheraven      mpn_sqr_basecase (p, a, n);					\
100227825Stheraven      if (f) mpn_sqr_basecase (p2, a2, n);				\
101227825Stheraven    } else if (MAYBE_sqr_toom2 && ( !MAYBE_sqr_above_toom2		\
102227825Stheraven	     || BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))) {		\
103227825Stheraven      mpn_toom2_sqr (p, a, n, ws);					\
104227825Stheraven      if (f) mpn_toom2_sqr (p2, a2, n, ws);				\
105227825Stheraven    } else if (MAYBE_sqr_toom3 && ( !MAYBE_sqr_above_toom3		\
106227825Stheraven	     || BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD))) {		\
107227825Stheraven      mpn_toom3_sqr (p, a, n, ws);					\
108227825Stheraven      if (f) mpn_toom3_sqr (p2, a2, n, ws);				\
109227825Stheraven    } else if (MAYBE_sqr_toom4 && ( !MAYBE_sqr_above_toom4		\
110227825Stheraven	     || BELOW_THRESHOLD (n, SQR_TOOM6_THRESHOLD))) {		\
111227825Stheraven      mpn_toom4_sqr (p, a, n, ws);					\
112227825Stheraven      if (f) mpn_toom4_sqr (p2, a2, n, ws);				\
113227825Stheraven    } else if (! MAYBE_sqr_above_toom6					\
114262801Sdim	     || BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD)) {		\
115227825Stheraven      mpn_toom6_sqr (p, a, n, ws);					\
116227825Stheraven      if (f) mpn_toom6_sqr (p2, a2, n, ws);				\
117227825Stheraven    } else {								\
118227825Stheraven      mpn_toom8_sqr (p, a, n, ws);					\
119227825Stheraven      if (f) mpn_toom8_sqr (p2, a2, n, ws);				\
120227825Stheraven    }									\
121227825Stheraven  } while (0)
122227825Stheraven
123227825Stheravenvoid
124227825Stheravenmpn_toom8_sqr  (mp_ptr pp, mp_srcptr ap, mp_size_t an, mp_ptr scratch)
125227825Stheraven{
126227825Stheraven  mp_size_t n, s;
127227825Stheraven
128227825Stheraven  /***************************** decomposition *******************************/
129227825Stheraven
130227825Stheraven  ASSERT ( an >= 40 );
131227825Stheraven
132227825Stheraven  n = 1 + ((an - 1)>>3);
133227825Stheraven
134227825Stheraven  s = an - 7 * n;
135227825Stheraven
136227825Stheraven  ASSERT (0 < s && s <= n);
137227825Stheraven  ASSERT ( s + s > 3 );
138227825Stheraven
139227825Stheraven#define   r6    (pp + 3 * n)			/* 3n+1 */
140227825Stheraven#define   r4    (pp + 7 * n)			/* 3n+1 */
141227825Stheraven#define   r2    (pp +11 * n)			/* 3n+1 */
142227825Stheraven#define   r0    (pp +15 * n)			/* s+t <= 2*n */
143227825Stheraven#define   r7    (scratch)			/* 3n+1 */
144227825Stheraven#define   r5    (scratch + 3 * n + 1)		/* 3n+1 */
145227825Stheraven#define   r3    (scratch + 6 * n + 2)		/* 3n+1 */
146227825Stheraven#define   r1    (scratch + 9 * n + 3)		/* 3n+1 */
147227825Stheraven#define   v0    (pp +11 * n)			/* n+1 */
148227825Stheraven#define   v2    (pp +13 * n+2)			/* n+1 */
149262801Sdim#define   wse   (scratch +12 * n + 4)		/* 3n+1 */
150227825Stheraven
151227825Stheraven  /* Alloc also 3n+1 limbs for ws... toom_interpolate_16pts may
152227825Stheraven     need all of them, when DO_mpn_sublsh_n usea a scratch  */
153227825Stheraven/*   if (scratch == NULL) */
154227825Stheraven/*     scratch = TMP_SALLOC_LIMBS (30 * n + 6); */
155227825Stheraven
156227825Stheraven  /********************** evaluation and recursive calls *********************/
157227825Stheraven  /* $\pm1/8$ */
158227825Stheraven  mpn_toom_eval_pm2rexp (v2, v0, 7, ap, n, s, 3, pp);
159227825Stheraven  /* A(-1/8)*B(-1/8)*8^. */ /* A(+1/8)*B(+1/8)*8^. */
160227825Stheraven  TOOM8_SQR_REC(pp, v0, 2, r7, v2, n + 1, wse);
161227825Stheraven  mpn_toom_couple_handling (r7, 2 * n + 1 + BIT_CORRECTION, pp, 0, n, 3, 0);
162227825Stheraven
163227825Stheraven  /* $\pm1/4$ */
164227825Stheraven  mpn_toom_eval_pm2rexp (v2, v0, 7, ap, n, s, 2, pp);
165227825Stheraven  /* A(-1/4)*B(-1/4)*4^. */ /* A(+1/4)*B(+1/4)*4^. */
166227825Stheraven  TOOM8_SQR_REC(pp, v0, 2, r5, v2, n + 1, wse);
167227825Stheraven  mpn_toom_couple_handling (r5, 2 * n + 1, pp, 0, n, 2, 0);
168227825Stheraven
169227825Stheraven  /* $\pm2$ */
170227825Stheraven  mpn_toom_eval_pm2 (v2, v0, 7, ap, n, s, pp);
171227825Stheraven  /* A(-2)*B(-2) */ /* A(+2)*B(+2) */
172227825Stheraven  TOOM8_SQR_REC(pp, v0, 2, r3, v2, n + 1, wse);
173227825Stheraven  mpn_toom_couple_handling (r3, 2 * n + 1, pp, 0, n, 1, 2);
174227825Stheraven
175227825Stheraven  /* $\pm8$ */
176227825Stheraven  mpn_toom_eval_pm2exp (v2, v0, 7, ap, n, s, 3, pp);
177227825Stheraven  /* A(-8)*B(-8) */ /* A(+8)*B(+8) */
178227825Stheraven  TOOM8_SQR_REC(pp, v0, 2, r1, v2, n + 1, wse);
179227825Stheraven  mpn_toom_couple_handling (r1, 2 * n + 1 + BIT_CORRECTION, pp, 0, n, 3, 6);
180227825Stheraven
181227825Stheraven  /* $\pm1/2$ */
182227825Stheraven  mpn_toom_eval_pm2rexp (v2, v0, 7, ap, n, s, 1, pp);
183227825Stheraven  /* A(-1/2)*B(-1/2)*2^. */ /* A(+1/2)*B(+1/2)*2^. */
184227825Stheraven  TOOM8_SQR_REC(pp, v0, 2, r6, v2, n + 1, wse);
185262801Sdim  mpn_toom_couple_handling (r6, 2 * n + 1, pp, 0, n, 1, 0);
186227825Stheraven
187227825Stheraven  /* $\pm1$ */
188227825Stheraven  mpn_toom_eval_pm1 (v2, v0, 7, ap, n, s,    pp);
189227825Stheraven  /* A(-1)*B(-1) */ /* A(1)*B(1) */
190227825Stheraven  TOOM8_SQR_REC(pp, v0, 2, r4, v2, n + 1, wse);
191227825Stheraven  mpn_toom_couple_handling (r4, 2 * n + 1, pp, 0, n, 0, 0);
192227825Stheraven
193227825Stheraven  /* $\pm4$ */
194227825Stheraven  mpn_toom_eval_pm2exp (v2, v0, 7, ap, n, s, 2, pp);
195227825Stheraven  /* A(-4)*B(-4) */ /* A(+4)*B(+4) */
196227825Stheraven  TOOM8_SQR_REC(pp, v0, 2, r2, v2, n + 1, wse);
197227825Stheraven  mpn_toom_couple_handling (r2, 2 * n + 1, pp, 0, n, 2, 4);
198227825Stheraven
199227825Stheraven#undef v0
200227825Stheraven#undef v2
201227825Stheraven
202262801Sdim  /* A(0)*B(0) */
203227825Stheraven  TOOM8_SQR_REC(pp, ap, 0, pp, ap, n, wse);
204227825Stheraven
205227825Stheraven  mpn_toom_interpolate_16pts (pp, r1, r3, r5, r7, n, 2 * s, 0, wse);
206227825Stheraven
207227825Stheraven#undef r0
208227825Stheraven#undef r1
209227825Stheraven#undef r2
210227825Stheraven#undef r3
211227825Stheraven#undef r4
212227825Stheraven#undef r5
213227825Stheraven#undef r6
214227825Stheraven#undef wse
215227825Stheraven
216227825Stheraven}
217227825Stheraven
218227825Stheraven#undef TOOM8_SQR_REC
219227825Stheraven#undef MAYBE_sqr_basecase
220227825Stheraven#undef MAYBE_sqr_above_basecase
221227825Stheraven#undef MAYBE_sqr_toom2
222227825Stheraven#undef MAYBE_sqr_above_toom2
223227825Stheraven#undef MAYBE_sqr_toom3
224227825Stheraven#undef MAYBE_sqr_above_toom3
225227825Stheraven#undef MAYBE_sqr_above_toom4
226227825Stheraven