1/* Interpolation for the algorithm Toom-Cook 6.5-way.
2
3   Contributed to the GNU project by Marco Bodrato.
4
5   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
6   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
7   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
8
9Copyright 2009, 2010, 2012, 2015, 2020 Free Software Foundation, Inc.
10
11This file is part of the GNU MP Library.
12
13The GNU MP Library is free software; you can redistribute it and/or modify
14it under the terms of either:
15
16  * the GNU Lesser General Public License as published by the Free
17    Software Foundation; either version 3 of the License, or (at your
18    option) any later version.
19
20or
21
22  * the GNU General Public License as published by the Free Software
23    Foundation; either version 2 of the License, or (at your option) any
24    later version.
25
26or both in parallel, as here.
27
28The GNU MP Library is distributed in the hope that it will be useful, but
29WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
30or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
31for more details.
32
33You should have received copies of the GNU General Public License and the
34GNU Lesser General Public License along with the GNU MP Library.  If not,
35see https://www.gnu.org/licenses/.  */
36
37
38#include "gmp-impl.h"
39
40
41#if GMP_NUMB_BITS < 21
42#error Not implemented: Both sublsh_n(,,,20) should be corrected.
43#endif
44
45#if GMP_NUMB_BITS < 16
46#error Not implemented: divexact_by42525 needs splitting.
47#endif
48
49#if GMP_NUMB_BITS < 12
50#error Not implemented: Hard to adapt...
51#endif
52
53
54/* FIXME: tuneup should decide the best variant */
55#ifndef AORSMUL_FASTER_AORS_AORSLSH
56#define AORSMUL_FASTER_AORS_AORSLSH 1
57#endif
58#ifndef AORSMUL_FASTER_AORS_2AORSLSH
59#define AORSMUL_FASTER_AORS_2AORSLSH 1
60#endif
61#ifndef AORSMUL_FASTER_2AORSLSH
62#define AORSMUL_FASTER_2AORSLSH 1
63#endif
64#ifndef AORSMUL_FASTER_3AORSLSH
65#define AORSMUL_FASTER_3AORSLSH 1
66#endif
67
68
69#if HAVE_NATIVE_mpn_sublsh_n
70#define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s)
71#else
72static mp_limb_t
73DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
74{
75#if USE_MUL_1 && 0
76  return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
77#else
78  mp_limb_t __cy;
79  __cy = mpn_lshift(ws,src,n,s);
80  return    __cy + mpn_sub_n(dst,dst,ws,n);
81#endif
82}
83#endif
84
85#if HAVE_NATIVE_mpn_addlsh_n
86#define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
87#else
88#if !defined (AORSMUL_FASTER_2AORSLSH) && !defined (AORSMUL_FASTER_AORS_2AORSLSH)
89static mp_limb_t
90DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
91{
92#if USE_MUL_1 && 0
93  return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
94#else
95  mp_limb_t __cy;
96  __cy = mpn_lshift(ws,src,n,s);
97  return    __cy + mpn_add_n(dst,dst,ws,n);
98#endif
99}
100#endif
101#endif
102
103#if HAVE_NATIVE_mpn_subrsh
104#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s)
105#else
106/* FIXME: This is not a correct definition, it assumes no carry */
107#define DO_mpn_subrsh(dst,nd,src,ns,s,ws)				\
108do {									\
109  mp_limb_t __cy;							\
110  MPN_DECR_U (dst, nd, src[0] >> s);					\
111  __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws);	\
112  MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy);				\
113} while (0)
114#endif
115
116
117#define BINVERT_9 \
118  ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)
119
120#define BINVERT_255 \
121  (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8)))
122
123  /* FIXME: find some more general expressions for 2835^-1, 42525^-1 */
124#if GMP_LIMB_BITS == 32
125#define BINVERT_2835  (GMP_NUMB_MASK &		CNST_LIMB(0x53E3771B))
126#define BINVERT_42525 (GMP_NUMB_MASK &		CNST_LIMB(0x9F314C35))
127#else
128#if GMP_LIMB_BITS == 64
129#define BINVERT_2835  (GMP_NUMB_MASK &	CNST_LIMB(0x938CC70553E3771B))
130#define BINVERT_42525 (GMP_NUMB_MASK &	CNST_LIMB(0xE7B40D449F314C35))
131#endif
132#endif
133
134#ifndef mpn_divexact_by255
135#if GMP_NUMB_BITS % 8 == 0
136#define mpn_divexact_by255(dst,src,size) \
137  (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255)))
138#else
139#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
140#define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0)
141#else
142#define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255))
143#endif
144#endif
145#endif
146
147#ifndef mpn_divexact_by9x4
148#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
149#define mpn_divexact_by9x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,2)
150#else
151#define mpn_divexact_by9x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<2)
152#endif
153#endif
154
155#ifndef mpn_divexact_by42525
156#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525)
157#define mpn_divexact_by42525(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,0)
158#else
159#define mpn_divexact_by42525(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525))
160#endif
161#endif
162
163#ifndef mpn_divexact_by2835x4
164#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835)
165#define mpn_divexact_by2835x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,2)
166#else
167#define mpn_divexact_by2835x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<2)
168#endif
169#endif
170
171/* Interpolation for Toom-6.5 (or Toom-6), using the evaluation
172   points: infinity(6.5 only), +-4, +-2, +-1, +-1/4, +-1/2, 0. More precisely,
173   we want to compute f(2^(GMP_NUMB_BITS * n)) for a polynomial f of
174   degree 11 (or 10), given the 12 (rsp. 11) values:
175
176     r0 = limit at infinity of f(x) / x^11,
177     r1 = f(4),f(-4),
178     r2 = f(2),f(-2),
179     r3 = f(1),f(-1),
180     r4 = f(1/4),f(-1/4),
181     r5 = f(1/2),f(-1/2),
182     r6 = f(0).
183
184   All couples of the form f(n),f(-n) must be already mixed with
185   toom_couple_handling(f(n),...,f(-n),...)
186
187   The result is stored in {pp, spt + 7*n (or 6*n)}.
188   At entry, r6 is stored at {pp, 2n},
189   r4 is stored at {pp + 3n, 3n + 1}.
190   r2 is stored at {pp + 7n, 3n + 1}.
191   r0 is stored at {pp +11n, spt}.
192
193   The other values are 3n+1 limbs each (with most significant limbs small).
194
195   Negative intermediate results are stored two-complemented.
196   Inputs are destroyed.
197*/
198
199void
200mpn_toom_interpolate_12pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5,
201			mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
202{
203  mp_limb_t cy;
204  mp_size_t n3;
205  mp_size_t n3p1;
206  n3 = 3 * n;
207  n3p1 = n3 + 1;
208
209#define   r4    (pp + n3)			/* 3n+1 */
210#define   r2    (pp + 7 * n)			/* 3n+1 */
211#define   r0    (pp +11 * n)			/* s+t <= 2*n */
212
213  /******************************* interpolation *****************************/
214  if (half != 0) {
215    cy = mpn_sub_n (r3, r3, r0, spt);
216    MPN_DECR_U (r3 + spt, n3p1 - spt, cy);
217
218    cy = DO_mpn_sublsh_n (r2, r0, spt, 10, wsi);
219    MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
220    DO_mpn_subrsh(r5, n3p1, r0, spt, 2, wsi);
221
222    cy = DO_mpn_sublsh_n (r1, r0, spt, 20, wsi);
223    MPN_DECR_U (r1 + spt, n3p1 - spt, cy);
224    DO_mpn_subrsh(r4, n3p1, r0, spt, 4, wsi);
225  };
226
227  r4[n3] -= DO_mpn_sublsh_n (r4 + n, pp, 2 * n, 20, wsi);
228  DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 4, wsi);
229
230#if HAVE_NATIVE_mpn_add_n_sub_n
231  mpn_add_n_sub_n (r1, r4, r4, r1, n3p1);
232#else
233  ASSERT_NOCARRY(mpn_add_n (wsi, r1, r4, n3p1));
234  mpn_sub_n (r4, r4, r1, n3p1); /* can be negative */
235  MP_PTR_SWAP(r1, wsi);
236#endif
237
238  r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 10, wsi);
239  DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 2, wsi);
240
241#if HAVE_NATIVE_mpn_add_n_sub_n
242  mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
243#else
244  mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
245  ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
246  MP_PTR_SWAP(r5, wsi);
247#endif
248
249  r3[n3] -= mpn_sub_n (r3+n, r3+n, pp, 2 * n);
250
251#if AORSMUL_FASTER_AORS_AORSLSH
252  mpn_submul_1 (r4, r5, n3p1, 257); /* can be negative */
253#else
254  mpn_sub_n (r4, r4, r5, n3p1); /* can be negative */
255  DO_mpn_sublsh_n (r4, r5, n3p1, 8, wsi); /* can be negative */
256#endif
257  /* A division by 2835x4 follows. Warning: the operand can be negative! */
258  mpn_divexact_by2835x4(r4, r4, n3p1);
259  if ((r4[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0)
260    r4[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2));
261
262#if AORSMUL_FASTER_2AORSLSH
263  mpn_addmul_1 (r5, r4, n3p1, 60); /* can be negative */
264#else
265  DO_mpn_sublsh_n (r5, r4, n3p1, 2, wsi); /* can be negative */
266  DO_mpn_addlsh_n (r5, r4, n3p1, 6, wsi); /* can give a carry */
267#endif
268  mpn_divexact_by255(r5, r5, n3p1);
269
270  ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r3, n3p1, 5, wsi));
271
272#if AORSMUL_FASTER_3AORSLSH
273  ASSERT_NOCARRY(mpn_submul_1 (r1, r2, n3p1, 100));
274#else
275  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 6, wsi));
276  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 5, wsi));
277  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 2, wsi));
278#endif
279  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r3, n3p1, 9, wsi));
280  mpn_divexact_by42525(r1, r1, n3p1);
281
282#if AORSMUL_FASTER_AORS_2AORSLSH
283  ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 225));
284#else
285  ASSERT_NOCARRY(mpn_sub_n (r2, r2, r1, n3p1));
286  ASSERT_NOCARRY(DO_mpn_addlsh_n (r2, r1, n3p1, 5, wsi));
287  ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r1, n3p1, 8, wsi));
288#endif
289  mpn_divexact_by9x4(r2, r2, n3p1);
290
291  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r2, n3p1));
292
293#ifdef HAVE_NATIVE_mpn_rsh1sub_n
294  mpn_rsh1sub_n (r4, r2, r4, n3p1);
295  r4 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
296#else
297  mpn_sub_n (r4, r2, r4, n3p1);
298  ASSERT_NOCARRY(mpn_rshift(r4, r4, n3p1, 1));
299#endif
300  ASSERT_NOCARRY(mpn_sub_n (r2, r2, r4, n3p1));
301
302#ifdef HAVE_NATIVE_mpn_rsh1add_n
303  mpn_rsh1add_n (r5, r5, r1, n3p1);
304  r5 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
305#else
306  mpn_add_n (r5, r5, r1, n3p1);
307  ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));
308#endif
309
310  /* last interpolation steps... */
311  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
312  ASSERT_NOCARRY(mpn_sub_n (r1, r1, r5, n3p1));
313  /* ... could be mixed with recomposition
314	||H-r5|M-r5|L-r5|   ||H-r1|M-r1|L-r1|
315  */
316
317  /***************************** recomposition *******************************/
318  /*
319    pp[] prior to operations:
320    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
321
322    summation scheme for remaining operations:
323    |__12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
324    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
325	||H r1|M r1|L r1|   ||H r3|M r3|L r3|   ||H_r5|M_r5|L_r5|
326  */
327
328  cy = mpn_add_n (pp + n, pp + n, r5, n);
329  cy = mpn_add_1 (pp + 2 * n, r5 + n, n, cy);
330#if HAVE_NATIVE_mpn_add_nc
331  cy = r5[n3] + mpn_add_nc(pp + n3, pp + n3, r5 + 2 * n, n, cy);
332#else
333  MPN_INCR_U (r5 + 2 * n, n + 1, cy);
334  cy = r5[n3] + mpn_add_n (pp + n3, pp + n3, r5 + 2 * n, n);
335#endif
336  MPN_INCR_U (pp + n3 + n, 2 * n + 1, cy);
337
338  pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r3, n);
339  cy = mpn_add_1 (pp + 2 * n3, r3 + n, n, pp[2 * n3]);
340#if HAVE_NATIVE_mpn_add_nc
341  cy = r3[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r3 + 2 * n, n, cy);
342#else
343  MPN_INCR_U (r3 + 2 * n, n + 1, cy);
344  cy = r3[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r3 + 2 * n, n);
345#endif
346  MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);
347
348  pp[10*n]+=mpn_add_n (pp + 9 * n, pp + 9 * n, r1, n);
349  if (half) {
350    cy = mpn_add_1 (pp + 10 * n, r1 + n, n, pp[10 * n]);
351#if HAVE_NATIVE_mpn_add_nc
352    if (LIKELY (spt > n)) {
353      cy = r1[n3] + mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, n, cy);
354      MPN_INCR_U (pp + 4 * n3, spt - n, cy);
355    } else {
356      ASSERT_NOCARRY(mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt, cy));
357    }
358#else
359    MPN_INCR_U (r1 + 2 * n, n + 1, cy);
360    if (LIKELY (spt > n)) {
361      cy = r1[n3] + mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, n);
362      MPN_INCR_U (pp + 4 * n3, spt - n, cy);
363    } else {
364      ASSERT_NOCARRY(mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt));
365    }
366#endif
367  } else {
368    ASSERT_NOCARRY(mpn_add_1 (pp + 10 * n, r1 + n, spt, pp[10 * n]));
369  }
370
371#undef   r0
372#undef   r2
373#undef   r4
374}
375