1/* mpn_toom_interpolate_7pts -- Interpolate for toom44, 53, 62. 2 3 Contributed to the GNU project by Niels M�ller. 4 Improvements by Marco Bodrato. 5 6 THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY 7 SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST 8 GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. 9 10Copyright 2006, 2007, 2009 Free Software Foundation, Inc. 11 12This file is part of the GNU MP Library. 13 14The GNU MP Library is free software; you can redistribute it and/or modify 15it under the terms of the GNU Lesser General Public License as published by 16the Free Software Foundation; either version 3 of the License, or (at your 17option) any later version. 18 19The GNU MP Library is distributed in the hope that it will be useful, but 20WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 21or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 22License for more details. 23 24You should have received a copy of the GNU Lesser General Public License 25along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ 26 27#include "gmp.h" 28#include "gmp-impl.h" 29 30#define BINVERT_3 MODLIMB_INVERSE_3 31 32#define BINVERT_9 \ 33 ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39) 34 35#define BINVERT_15 \ 36 ((((GMP_NUMB_MAX >> (GMP_NUMB_BITS % 4)) / 15) * 14 * 16 & GMP_NUMB_MAX) + 15)) 37 38/* For the various mpn_divexact_byN here, fall back to using either 39 mpn_pi1_bdiv_q_1 or mpn_divexact_1. The former has less overhead and is 40 many faster if it is native. For now, since mpn_divexact_1 is native on 41 several platforms where mpn_pi1_bdiv_q_1 does not yet exist, do not use 42 mpn_pi1_bdiv_q_1 unconditionally. FIXME. */ 43 44/* For odd divisors, mpn_divexact_1 works fine with two's complement. */ 45#ifndef mpn_divexact_by3 46#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 47#define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,BINVERT_3,0) 48#else 49#define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3) 50#endif 51#endif 52 53#ifndef mpn_divexact_by9 54#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 55#define mpn_divexact_by9(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,9,BINVERT_9,0) 56#else 57#define mpn_divexact_by9(dst,src,size) mpn_divexact_1(dst,src,size,9) 58#endif 59#endif 60 61#ifndef mpn_divexact_by15 62#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 63#define mpn_divexact_by15(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,15,BINVERT_15,0) 64#else 65#define mpn_divexact_by15(dst,src,size) mpn_divexact_1(dst,src,size,15) 66#endif 67#endif 68 69/* Interpolation for toom4, using the evaluation points 0, infinity, 70 1, -1, 2, -2, 1/2. More precisely, we want to compute 71 f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 6, given the 72 seven values 73 74 w0 = f(0), 75 w1 = f(-2), 76 w2 = f(1), 77 w3 = f(-1), 78 w4 = f(2) 79 w5 = 64 * f(1/2) 80 w6 = limit at infinity of f(x) / x^6, 81 82 The result is 6*n + w6n limbs. At entry, w0 is stored at {rp, 2n }, 83 w2 is stored at { rp + 2n, 2n+1 }, and w6 is stored at { rp + 6n, 84 w6n }. The other values are 2n + 1 limbs each (with most 85 significant limbs small). f(-1) and f(-1/2) may be negative, signs 86 determined by the flag bits. Inputs are destroyed. 87 88 Needs (2*n + 1) limbs of temporary storage. 89*/ 90 91void 92mpn_toom_interpolate_7pts (mp_ptr rp, mp_size_t n, enum toom7_flags flags, 93 mp_ptr w1, mp_ptr w3, mp_ptr w4, mp_ptr w5, 94 mp_size_t w6n, mp_ptr tp) 95{ 96 mp_size_t m; 97 mp_limb_t cy; 98 99 m = 2*n + 1; 100#define w0 rp 101#define w2 (rp + 2*n) 102#define w6 (rp + 6*n) 103 104 ASSERT (w6n > 0); 105 ASSERT (w6n <= 2*n); 106 107 /* Using formulas similar to Marco Bodrato's 108 109 W5 = W5 + W4 110 W1 =(W4 - W1)/2 111 W4 = W4 - W0 112 W4 =(W4 - W1)/4 - W6*16 113 W3 =(W2 - W3)/2 114 W2 = W2 - W3 115 116 W5 = W5 - W2*65 May be negative. 117 W2 = W2 - W6 - W0 118 W5 =(W5 + W2*45)/2 Now >= 0 again. 119 W4 =(W4 - W2)/3 120 W2 = W2 - W4 121 122 W1 = W5 - W1 May be negative. 123 W5 =(W5 - W3*8)/9 124 W3 = W3 - W5 125 W1 =(W1/15 + W5)/2 Now >= 0 again. 126 W5 = W5 - W1 127 128 where W0 = f(0), W1 = f(-2), W2 = f(1), W3 = f(-1), 129 W4 = f(2), W5 = f(1/2), W6 = f(oo), 130 131 Note that most intermediate results are positive; the ones that 132 may be negative are represented in two's complement. We must 133 never shift right a value that may be negative, since that would 134 invalidate the sign bit. On the other hand, divexact by odd 135 numbers work fine with two's complement. 136 */ 137 138 mpn_add_n (w5, w5, w4, m); 139 if (flags & toom7_w1_neg) 140 { 141#ifdef HAVE_NATIVE_mpn_rsh1add_n 142 mpn_rsh1add_n (w1, w1, w4, m); 143#else 144 mpn_add_n (w1, w1, w4, m); ASSERT (!(w1[0] & 1)); 145 mpn_rshift (w1, w1, m, 1); 146#endif 147 } 148 else 149 { 150#ifdef HAVE_NATIVE_mpn_rsh1sub_n 151 mpn_rsh1sub_n (w1, w4, w1, m); 152#else 153 mpn_sub_n (w1, w4, w1, m); ASSERT (!(w1[0] & 1)); 154 mpn_rshift (w1, w1, m, 1); 155#endif 156 } 157 mpn_sub (w4, w4, m, w0, 2*n); 158 mpn_sub_n (w4, w4, w1, m); ASSERT (!(w4[0] & 3)); 159 mpn_rshift (w4, w4, m, 2); /* w4>=0 */ 160 161 tp[w6n] = mpn_lshift (tp, w6, w6n, 4); 162 mpn_sub (w4, w4, m, tp, w6n+1); 163 164 if (flags & toom7_w3_neg) 165 { 166#ifdef HAVE_NATIVE_mpn_rsh1add_n 167 mpn_rsh1add_n (w3, w3, w2, m); 168#else 169 mpn_add_n (w3, w3, w2, m); ASSERT (!(w3[0] & 1)); 170 mpn_rshift (w3, w3, m, 1); 171#endif 172 } 173 else 174 { 175#ifdef HAVE_NATIVE_mpn_rsh1sub_n 176 mpn_rsh1sub_n (w3, w2, w3, m); 177#else 178 mpn_sub_n (w3, w2, w3, m); ASSERT (!(w3[0] & 1)); 179 mpn_rshift (w3, w3, m, 1); 180#endif 181 } 182 183 mpn_sub_n (w2, w2, w3, m); 184 185 mpn_submul_1 (w5, w2, m, 65); 186 mpn_sub (w2, w2, m, w6, w6n); 187 mpn_sub (w2, w2, m, w0, 2*n); 188 189 mpn_addmul_1 (w5, w2, m, 45); ASSERT (!(w5[0] & 1)); 190 mpn_rshift (w5, w5, m, 1); 191 mpn_sub_n (w4, w4, w2, m); 192 193 mpn_divexact_by3 (w4, w4, m); 194 mpn_sub_n (w2, w2, w4, m); 195 196 mpn_sub_n (w1, w5, w1, m); 197 mpn_lshift (tp, w3, m, 3); 198 mpn_sub_n (w5, w5, tp, m); 199 mpn_divexact_by9 (w5, w5, m); 200 mpn_sub_n (w3, w3, w5, m); 201 202 mpn_divexact_by15 (w1, w1, m); 203 mpn_add_n (w1, w1, w5, m); ASSERT (!(w1[0] & 1)); 204 mpn_rshift (w1, w1, m, 1); /* w1>=0 now */ 205 mpn_sub_n (w5, w5, w1, m); 206 207 /* These bounds are valid for the 4x4 polynomial product of toom44, 208 * and they are conservative for toom53 and toom62. */ 209 ASSERT (w1[2*n] < 2); 210 ASSERT (w2[2*n] < 3); 211 ASSERT (w3[2*n] < 4); 212 ASSERT (w4[2*n] < 3); 213 ASSERT (w5[2*n] < 2); 214 215 /* Addition chain. Note carries and the 2n'th limbs that need to be 216 * added in. 217 * 218 * Special care is needed for w2[2n] and the corresponding carry, 219 * since the "simple" way of adding it all together would overwrite 220 * the limb at wp[2*n] and rp[4*n] (same location) with the sum of 221 * the high half of w3 and the low half of w4. 222 * 223 * 7 6 5 4 3 2 1 0 224 * | | | | | | | | | 225 * ||w3 (2n+1)| 226 * ||w4 (2n+1)| 227 * ||w5 (2n+1)| ||w1 (2n+1)| 228 * + | w6 (w6n)| ||w2 (2n+1)| w0 (2n) | (share storage with r) 229 * ----------------------------------------------- 230 * r | | | | | | | | | 231 * c7 c6 c5 c4 c3 Carries to propagate 232 */ 233 234 cy = mpn_add_n (rp + n, rp + n, w1, m); 235 MPN_INCR_U (w2 + n + 1, n , cy); 236 cy = mpn_add_n (rp + 3*n, rp + 3*n, w3, n); 237 MPN_INCR_U (w3 + n, n + 1, w2[2*n] + cy); 238 cy = mpn_add_n (rp + 4*n, w3 + n, w4, n); 239 MPN_INCR_U (w4 + n, n + 1, w3[2*n] + cy); 240 cy = mpn_add_n (rp + 5*n, w4 + n, w5, n); 241 MPN_INCR_U (w5 + n, n + 1, w4[2*n] + cy); 242 if (w6n > n + 1) 243 ASSERT_NOCARRY (mpn_add (rp + 6*n, rp + 6*n, w6n, w5 + n, n + 1)); 244 else 245 { 246 ASSERT_NOCARRY (mpn_add_n (rp + 6*n, rp + 6*n, w5 + n, w6n)); 247#if WANT_ASSERT 248 { 249 mp_size_t i; 250 for (i = w6n; i <= n; i++) 251 ASSERT (w5[n + i] == 0); 252 } 253#endif 254 } 255} 256