1/* mpn_toom_interpolate_6pts -- Interpolate for toom43, 52
2
3   Contributed to the GNU project by Marco Bodrato.
4
5   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
6   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
7   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
8
9Copyright 2009, 2010 Free Software Foundation, Inc.
10
11This file is part of the GNU MP Library.
12
13The GNU MP Library is free software; you can redistribute it and/or modify
14it under the terms of the GNU Lesser General Public License as published by
15the Free Software Foundation; either version 3 of the License, or (at your
16option) any later version.
17
18The GNU MP Library is distributed in the hope that it will be useful, but
19WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
20or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
21License for more details.
22
23You should have received a copy of the GNU Lesser General Public License
24along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
25
26#include "gmp.h"
27#include "gmp-impl.h"
28
29/* For odd divisors, mpn_divexact_1 works fine with two's complement. */
30#ifndef mpn_divexact_by3
31#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && MODLIMB_INVERSE_3
32#define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,MODLIMB_INVERSE_3,0)
33#else
34#define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3)
35#endif
36#endif
37
38/* Interpolation for Toom-3.5, using the evaluation points: infinity,
39   1, -1, 2, -2. More precisely, we want to compute
40   f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 5, given the
41   six values
42
43     w5 = f(0),
44     w4 = f(-1),
45     w3 = f(1)
46     w2 = f(-2),
47     w1 = f(2),
48     w0 = limit at infinity of f(x) / x^5,
49
50   The result is stored in {pp, 5*n + w0n}. At entry, w5 is stored at
51   {pp, 2n}, w3 is stored at {pp + 2n, 2n+1}, and w0 is stored at
52   {pp + 5n, w0n}. The other values are 2n + 1 limbs each (with most
53   significant limbs small). f(-1) and f(-2) may be negative, signs
54   determined by the flag bits. All intermediate results are positive.
55   Inputs are destroyed.
56
57   Interpolation sequence was taken from the paper: "Integer and
58   Polynomial Multiplication: Towards Optimal Toom-Cook Matrices".
59   Some slight variations were introduced: adaptation to "gmp
60   instruction set", and a final saving of an operation by interlacing
61   interpolation and recomposition phases.
62*/
63
64void
65mpn_toom_interpolate_6pts (mp_ptr pp, mp_size_t n, enum toom6_flags flags,
66			   mp_ptr w4, mp_ptr w2, mp_ptr w1,
67			   mp_size_t w0n)
68{
69  mp_limb_t cy;
70  /* cy6 can be stored in w1[2*n], cy4 in w4[0], embankment in w2[0] */
71  mp_limb_t cy4, cy6, embankment;
72
73  ASSERT( n > 0 );
74  ASSERT( 2*n >= w0n && w0n > 0 );
75
76#define w5  pp					/* 2n   */
77#define w3  (pp + 2 * n)			/* 2n+1 */
78#define w0  (pp + 5 * n)			/* w0n  */
79
80  /* Interpolate with sequence:
81     W2 =(W1 - W2)>>2
82     W1 =(W1 - W5)>>1
83     W1 =(W1 - W2)>>1
84     W4 =(W3 - W4)>>1
85     W2 =(W2 - W4)/3
86     W3 = W3 - W4 - W5
87     W1 =(W1 - W3)/3
88     // Last steps are mixed with recomposition...
89     W2 = W2 - W0<<2
90     W4 = W4 - W2
91     W3 = W3 - W1
92     W2 = W2 - W0
93  */
94
95  /* W2 =(W1 - W2)>>2 */
96  if (flags & toom6_vm2_neg)
97    mpn_add_n (w2, w1, w2, 2 * n + 1);
98  else
99    mpn_sub_n (w2, w1, w2, 2 * n + 1);
100  mpn_rshift (w2, w2, 2 * n + 1, 2);
101
102  /* W1 =(W1 - W5)>>1 */
103  w1[2*n] -= mpn_sub_n (w1, w1, w5, 2*n);
104  mpn_rshift (w1, w1, 2 * n + 1, 1);
105
106  /* W1 =(W1 - W2)>>1 */
107#if HAVE_NATIVE_mpn_rsh1sub_n
108  mpn_rsh1sub_n (w1, w1, w2, 2 * n + 1);
109#else
110  mpn_sub_n (w1, w1, w2, 2 * n + 1);
111  mpn_rshift (w1, w1, 2 * n + 1, 1);
112#endif
113
114  /* W4 =(W3 - W4)>>1 */
115  if (flags & toom6_vm1_neg)
116    {
117#if HAVE_NATIVE_mpn_rsh1add_n
118      mpn_rsh1add_n (w4, w3, w4, 2 * n + 1);
119#else
120      mpn_add_n (w4, w3, w4, 2 * n + 1);
121      mpn_rshift (w4, w4, 2 * n + 1, 1);
122#endif
123    }
124  else
125    {
126#if HAVE_NATIVE_mpn_rsh1sub_n
127      mpn_rsh1sub_n (w4, w3, w4, 2 * n + 1);
128#else
129      mpn_sub_n (w4, w3, w4, 2 * n + 1);
130      mpn_rshift (w4, w4, 2 * n + 1, 1);
131#endif
132    }
133
134  /* W2 =(W2 - W4)/3 */
135  mpn_sub_n (w2, w2, w4, 2 * n + 1);
136  mpn_divexact_by3 (w2, w2, 2 * n + 1);
137
138  /* W3 = W3 - W4 - W5 */
139  mpn_sub_n (w3, w3, w4, 2 * n + 1);
140  w3[2 * n] -= mpn_sub_n (w3, w3, w5, 2 * n);
141
142  /* W1 =(W1 - W3)/3 */
143  mpn_sub_n (w1, w1, w3, 2 * n + 1);
144  mpn_divexact_by3 (w1, w1, 2 * n + 1);
145
146  /*
147    [1 0 0 0 0 0;
148     0 1 0 0 0 0;
149     1 0 1 0 0 0;
150     0 1 0 1 0 0;
151     1 0 1 0 1 0;
152     0 0 0 0 0 1]
153
154    pp[] prior to operations:
155     |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__|
156
157    summation scheme for remaining operations:
158     |______________5|n_____4|n_____3|n_____2|n______|n______|pp
159     |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__|
160				    || H w4  | L w4  |
161		    || H w2  | L w2  |
162	    || H w1  | L w1  |
163			    ||-H w1  |-L w1  |
164		     |-H w0  |-L w0 ||-H w2  |-L w2  |
165  */
166  cy = mpn_add_n (pp + n, pp + n, w4, 2 * n + 1);
167  MPN_INCR_U (pp + 3 * n + 1, n, cy);
168
169  /* W2 -= W0<<2 */
170#if HAVE_NATIVE_mpn_sublsh_n || HAVE_NATIVE_mpn_sublsh2_n
171#if HAVE_NATIVE_mpn_sublsh2_n
172  cy = mpn_sublsh2_n(w2, w2, w0, w0n);
173#else
174  cy = mpn_sublsh_n(w2, w2, w0, w0n, 2);
175#endif
176#else
177  /* {W4,2*n+1} is now free and can be overwritten. */
178  cy = mpn_lshift(w4, w0, w0n, 2);
179  cy+= mpn_sub_n(w2, w2, w4, w0n);
180#endif
181  MPN_DECR_U (w2 + w0n, 2 * n + 1 - w0n, cy);
182
183  /* W4L = W4L - W2L */
184  cy = mpn_sub_n (pp + n, pp + n, w2, n);
185  MPN_DECR_U (w3, 2 * n + 1, cy);
186
187  /* W3H = W3H + W2L */
188  cy4 = w3[2 * n] + mpn_add_n (pp + 3 * n, pp + 3 * n, w2, n);
189  /* W1L + W2H */
190  cy = w2[2 * n] + mpn_add_n (pp + 4 * n, w1, w2 + n, n);
191  MPN_INCR_U (w1 + n, n + 1, cy);
192
193  /* W0 = W0 + W1H */
194  if (LIKELY (w0n > n))
195    cy6 = w1[2 * n] + mpn_add_n (w0, w0, w1 + n, n);
196  else
197    cy6 = mpn_add_n (w0, w0, w1 + n, w0n);
198
199  /*
200    summation scheme for the next operation:
201     |...____5|n_____4|n_____3|n_____2|n______|n______|pp
202     |...w0___|_w1_w2_|_H w3__|_L w3__|_H w5__|_L w5__|
203		     ...-w0___|-w1_w2 |
204  */
205  /* if(LIKELY(w0n>n)) the two operands below DO overlap! */
206  cy = mpn_sub_n (pp + 2 * n, pp + 2 * n, pp + 4 * n, n + w0n);
207
208  /* embankment is a "dirty trick" to avoid carry/borrow propagation
209     beyond allocated memory */
210  embankment = w0[w0n - 1] - 1;
211  w0[w0n - 1] = 1;
212  if (LIKELY (w0n > n)) {
213    if ( LIKELY(cy4 > cy6) )
214      MPN_INCR_U (pp + 4 * n, w0n + n, cy4 - cy6);
215    else
216      MPN_DECR_U (pp + 4 * n, w0n + n, cy6 - cy4);
217    MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy);
218    MPN_INCR_U (w0 + n, w0n - n, cy6);
219  } else {
220    MPN_INCR_U (pp + 4 * n, w0n + n, cy4);
221    MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy + cy6);
222  }
223  w0[w0n - 1] += embankment;
224
225#undef w5
226#undef w3
227#undef w0
228
229}
230