1/* Interpolation for the algorithm Toom-Cook 6.5-way. 2 3 Contributed to the GNU project by Marco Bodrato. 4 5 THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY 6 SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST 7 GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. 8 9Copyright 2009, 2010, 2012, 2015, 2020 Free Software Foundation, Inc. 10 11This file is part of the GNU MP Library. 12 13The GNU MP Library is free software; you can redistribute it and/or modify 14it under the terms of either: 15 16 * the GNU Lesser General Public License as published by the Free 17 Software Foundation; either version 3 of the License, or (at your 18 option) any later version. 19 20or 21 22 * the GNU General Public License as published by the Free Software 23 Foundation; either version 2 of the License, or (at your option) any 24 later version. 25 26or both in parallel, as here. 27 28The GNU MP Library is distributed in the hope that it will be useful, but 29WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 30or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 31for more details. 32 33You should have received copies of the GNU General Public License and the 34GNU Lesser General Public License along with the GNU MP Library. If not, 35see https://www.gnu.org/licenses/. */ 36 37 38#include "gmp-impl.h" 39 40 41#if GMP_NUMB_BITS < 21 42#error Not implemented: Both sublsh_n(,,,20) should be corrected. 43#endif 44 45#if GMP_NUMB_BITS < 16 46#error Not implemented: divexact_by42525 needs splitting. 47#endif 48 49#if GMP_NUMB_BITS < 12 50#error Not implemented: Hard to adapt... 
#endif


/* The AORSMUL_FASTER_* flags select, for several interpolation steps below,
   between a single mpn_{add,sub}mul_1 by a small constant and an equivalent
   sequence of plain and shifted add/sub operations (see the paired #if/#else
   branches in mpn_toom_interpolate_12pts).
   FIXME: tuneup should decide the best variant */
#ifndef AORSMUL_FASTER_AORS_AORSLSH
#define AORSMUL_FASTER_AORS_AORSLSH 1
#endif
#ifndef AORSMUL_FASTER_AORS_2AORSLSH
#define AORSMUL_FASTER_AORS_2AORSLSH 1
#endif
#ifndef AORSMUL_FASTER_2AORSLSH
#define AORSMUL_FASTER_2AORSLSH 1
#endif
#ifndef AORSMUL_FASTER_3AORSLSH
#define AORSMUL_FASTER_3AORSLSH 1
#endif


/* DO_mpn_sublsh_n(dst,src,n,s,ws): {dst,n} -= {src,n} << s; returns the
   borrow out of the most significant limb.  The generic version needs n
   limbs of scratch at ws; the native mpn_sublsh_n needs none.  */
#if HAVE_NATIVE_mpn_sublsh_n
#define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s)
#else
static mp_limb_t
DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
{
#if USE_MUL_1 && 0 /* submul_1-by-2^s path kept for reference, disabled by "&& 0" */
  return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
#else
  mp_limb_t __cy;
  __cy = mpn_lshift(ws,src,n,s);
  return __cy + mpn_sub_n(dst,dst,ws,n);
#endif
}
#endif

/* DO_mpn_addlsh_n(dst,src,n,s,ws): {dst,n} += {src,n} << s; returns the
   carry out of the most significant limb.  The generic fallback is only
   compiled when no AORSMUL_FASTER_*2AORSLSH variant supplies the same
   operation via addmul_1 (see the guards just below).  */
#if HAVE_NATIVE_mpn_addlsh_n
#define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
#else
#if !defined (AORSMUL_FASTER_2AORSLSH) && !defined (AORSMUL_FASTER_AORS_2AORSLSH)
static mp_limb_t
DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
{
#if USE_MUL_1 && 0 /* addmul_1-by-2^s path kept for reference, disabled by "&& 0" */
  return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
#else
  mp_limb_t __cy;
  __cy = mpn_lshift(ws,src,n,s);
  return __cy + mpn_add_n(dst,dst,ws,n);
#endif
}
#endif
#endif

/* DO_mpn_subrsh(dst,nd,src,ns,s,ws): {dst,nd} -= {src,ns} >> s, i.e. a
   subtraction of src shifted right by s bits (the s low bits of src[0]
   are handled first via MPN_DECR_U, the rest via a left shift by
   GMP_NUMB_BITS - s of {src+1, ns-1}).  */
#if HAVE_NATIVE_mpn_subrsh
#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s)
#else
/* FIXME: This is not a correct definition, it assumes no carry */
#define DO_mpn_subrsh(dst,nd,src,ns,s,ws)				\
do {									\
  mp_limb_t __cy;							\
  MPN_DECR_U (dst, nd, src[0] >> s);					\
  __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws);	\
  MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy);				\
} while (0)
#endif


/* Binary (2-adic) inverses of the small odd divisors used below, masked to
   GMP_NUMB_BITS; they are passed as the pre-inverted divisor argument of
   mpn_pi1_bdiv_q_1.  */
#define BINVERT_9 \
  ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)

#define BINVERT_255 \
  (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8)))

 /* FIXME: find some more general expressions for 2835^-1, 42525^-1 */
#if GMP_LIMB_BITS == 32
#define BINVERT_2835  (GMP_NUMB_MASK &		CNST_LIMB(0x53E3771B))
#define BINVERT_42525 (GMP_NUMB_MASK &		CNST_LIMB(0x9F314C35))
#else
#if GMP_LIMB_BITS == 64
#define BINVERT_2835  (GMP_NUMB_MASK &	CNST_LIMB(0x938CC70553E3771B))
#define BINVERT_42525 (GMP_NUMB_MASK &	CNST_LIMB(0xE7B40D449F314C35))
#endif
#endif

/* Exact-division helpers.  Each divides {src,size} by the fixed divisor in
   its name ("x4" variants divide by the odd part times 4), choosing among
   mpn_bdiv_dbm1, mpn_pi1_bdiv_q_1 and mpn_divexact_1 according to what is
   available/applicable for this build.  */
#ifndef mpn_divexact_by255
#if GMP_NUMB_BITS % 8 == 0
/* 255 = 2^8 - 1 divides 2^GMP_NUMB_BITS - 1 here, so bdiv_dbm1 applies.  */
#define mpn_divexact_by255(dst,src,size) \
  (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255)))
#else
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
#define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0)
#else
#define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255))
#endif
#endif
#endif

#ifndef mpn_divexact_by9x4
/* Division by 36 = 9 * 2^2 (shift count 2 in the pi1_bdiv_q_1 call).  */
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
#define mpn_divexact_by9x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,2)
#else
#define mpn_divexact_by9x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<2)
#endif
#endif

#ifndef mpn_divexact_by42525
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525)
#define mpn_divexact_by42525(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,0)
#else
#define mpn_divexact_by42525(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525))
#endif
#endif

#ifndef mpn_divexact_by2835x4
/* Division by 11340 = 2835 * 2^2 (shift count 2 in the pi1_bdiv_q_1 call).  */
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835)
#define mpn_divexact_by2835x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,2)
#else
#define mpn_divexact_by2835x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<2)
#endif
#endif

/* Interpolation for Toom-6.5 (or Toom-6), using the evaluation
   points: infinity(6.5 only), +-4, +-2, +-1,
+-1/4, +-1/2, 0. More precisely,
   we want to compute f(2^(GMP_NUMB_BITS * n)) for a polynomial f of
   degree 11 (or 10), given the 12 (resp. 11) values:

     r0 = limit at infinity of f(x) / x^11,
     r1 = f(4),f(-4),
     r2 = f(2),f(-2),
     r3 = f(1),f(-1),
     r4 = f(1/4),f(-1/4),
     r5 = f(1/2),f(-1/2),
     r6 = f(0).

   All couples of the form f(n),f(-n) must be already mixed with
   toom_couple_handling(f(n),...,f(-n),...)

   The result is stored in {pp, spt + 7*n (or 6*n)}.
   At entry, r6 is stored at {pp, 2n},
   r4 is stored at {pp + 3n, 3n + 1}.
   r2 is stored at {pp + 7n, 3n + 1}.
   r0 is stored at {pp +11n, spt}.

   The other values are 3n+1 limbs each (with most significant limbs small).

   Negative intermediate results are stored in two's complement.
   Inputs are destroyed.
*/

void
mpn_toom_interpolate_12pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5,
			    mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
{
  mp_limb_t cy;
  mp_size_t n3;
  mp_size_t n3p1;
  n3 = 3 * n;      /* each of r1..r5 is 3n+1 limbs */
  n3p1 = n3 + 1;

  /* r4, r2 and r0 live inside the product area pp, per the layout above.  */
#define   r4    (pp + n3)			/* 3n+1 */
#define   r2    (pp + 7 * n)			/* 3n+1 */
#define   r0    (pp +11 * n)			/* s+t <= 2*n */

  /******************************* interpolation *****************************/
  /* When the infinity point is present (half != 0, Toom-6.5), subtract the
     r0 contribution, scaled per evaluation point, from the finite values:
     r3 -= r0;  r2 -= 2^10 r0;  r5 -= r0/2^2;  r1 -= 2^20 r0;  r4 -= r0/2^4.
     Each MPN_DECR_U propagates the borrow over the remaining high limbs.  */
  if (half != 0) {
    cy = mpn_sub_n (r3, r3, r0, spt);
    MPN_DECR_U (r3 + spt, n3p1 - spt, cy);

    cy = DO_mpn_sublsh_n (r2, r0, spt, 10, wsi);
    MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
    DO_mpn_subrsh(r5, n3p1, r0, spt, 2, wsi);

    cy = DO_mpn_sublsh_n (r1, r0, spt, 20, wsi);
    MPN_DECR_U (r1 + spt, n3p1 - spt, cy);
    DO_mpn_subrsh(r4, n3p1, r0, spt, 4, wsi);
  };

  /* Subtract the r6 = f(0) contribution ({pp, 2n}), offset by n limbs:
     r4 -= 2^20 r6 x^n and r1 -= (r6/2^4) x^n.  */
  r4[n3] -= DO_mpn_sublsh_n (r4 + n, pp, 2 * n, 20, wsi);
  DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 4, wsi);

  /* Butterfly: replace (r1, r4) by (r1 + r4, r4 - r1); the difference may
     be negative (kept in two's complement).  The generic branch computes
     the sum into wsi and swaps pointers so r1 ends up holding the sum.  */
#if HAVE_NATIVE_mpn_add_n_sub_n
  mpn_add_n_sub_n (r1, r4, r4, r1, n3p1);
#else
  ASSERT_NOCARRY(mpn_add_n (wsi, r1, r4, n3p1));
  mpn_sub_n (r4, r4, r1, n3p1); /* can be negative */
  MP_PTR_SWAP(r1, wsi);
#endif

  /* Same for the +-2 / +-1/2 pair: r5 -= 2^10 r6 x^n, r2 -= (r6/2^2) x^n.  */
  r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 10, wsi);
  DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 2, wsi);

  /* Butterfly: replace (r2, r5) by (r2 + r5, r5 - r2).  */
#if HAVE_NATIVE_mpn_add_n_sub_n
  mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
#else
  mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
  ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
  MP_PTR_SWAP(r5, wsi);
#endif

  /* r3 -= r6 x^n.  */
  r3[n3] -= mpn_sub_n (r3+n, r3+n, pp, 2 * n);

  /* r4 -= 257 r5  (257 = 1 + 2^8, cf. the #else variant).  */
#if AORSMUL_FASTER_AORS_AORSLSH
  mpn_submul_1 (r4, r5, n3p1, 257); /* can be negative */
#else
  mpn_sub_n (r4, r4, r5, n3p1); /* can be negative */
  DO_mpn_sublsh_n (r4, r5, n3p1, 8, wsi); /* can be negative */
#endif
  /* A division by 2835x4 follows. Warning: the operand can be negative! */
  mpn_divexact_by2835x4(r4, r4, n3p1);
  /* The exact division treats its operand as unsigned; if the quotient is
     negative (high bits of the top limb set), patch the top limb so the
     value reads as proper two's complement again.  */
  if ((r4[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0)
    r4[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2));

  /* r5 += 60 r4  (60 = 2^6 - 2^2), then r5 /= 255.  */
#if AORSMUL_FASTER_2AORSLSH
  mpn_addmul_1 (r5, r4, n3p1, 60); /* can be negative */
#else
  DO_mpn_sublsh_n (r5, r4, n3p1, 2, wsi); /* can be negative */
  DO_mpn_addlsh_n (r5, r4, n3p1, 6, wsi); /* can give a carry */
#endif
  mpn_divexact_by255(r5, r5, n3p1);

  /* r2 -= 2^5 r3.  */
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r3, n3p1, 5, wsi));

  /* r1 -= 100 r2  (100 = 2^6 + 2^5 + 2^2), then r1 -= 2^9 r3, r1 /= 42525. */
#if AORSMUL_FASTER_3AORSLSH
  ASSERT_NOCARRY(mpn_submul_1 (r1, r2, n3p1, 100));
#else
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 6, wsi));
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 5, wsi));
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 2, wsi));
#endif
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r3, n3p1, 9, wsi));
  mpn_divexact_by42525(r1, r1, n3p1);

  /* r2 -= 225 r1  (225 = 1 - 2^5 + 2^8, cf. the #else variant),
     then r2 /= 9*4.  */
#if AORSMUL_FASTER_AORS_2AORSLSH
  ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 225));
#else
  ASSERT_NOCARRY(mpn_sub_n (r2, r2, r1, n3p1));
  ASSERT_NOCARRY(DO_mpn_addlsh_n (r2, r1, n3p1, 5, wsi));
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r1, n3p1, 8, wsi));
#endif
  mpn_divexact_by9x4(r2, r2, n3p1);

  /* r3 -= r2.  */
  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r2, n3p1));

  /* r4 = (r2 - r4) / 2.  NOTE(review): "#ifdef" here and below, vs "#if"
     used for the other HAVE_NATIVE_* tests above — equivalent only as long
     as HAVE_NATIVE_* macros are defined exactly when the native routine
     exists; confirm against the configure machinery.  */
#ifdef HAVE_NATIVE_mpn_rsh1sub_n
  mpn_rsh1sub_n (r4, r2, r4, n3p1);
  r4 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
#else
  mpn_sub_n (r4, r2, r4, n3p1);
  ASSERT_NOCARRY(mpn_rshift(r4, r4, n3p1, 1));
#endif
  /* r2 -= r4.  */
  ASSERT_NOCARRY(mpn_sub_n (r2, r2, r4, n3p1));

  /* r5 = (r5 + r1) / 2.  */
#ifdef HAVE_NATIVE_mpn_rsh1add_n
  mpn_rsh1add_n (r5, r5, r1, n3p1);
  r5 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
#else
  mpn_add_n (r5, r5, r1, n3p1);
  ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));
#endif

  /* last interpolation steps: r3 -= r1; r1 -= r5 ... */
  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
  ASSERT_NOCARRY(mpn_sub_n (r1, r1, r5, n3p1));
  /* ... could be mixed with recomposition
	||H-r5|M-r5|L-r5|   ||H-r1|M-r1|L-r1|
  */

  /***************************** recomposition *******************************/
  /*
    pp[] prior to operations:
    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp

    summation scheme for remaining operations:
    |__12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
	||H r1|M r1|L r1|   ||H r3|M r3|L r3|   ||H_r5|M_r5|L_r5|
  */

  /* Add r5 (3n+1 limbs) into pp at offset n, one n-limb chunk at a time,
     threading the carry through; r5's middle chunk overlaps r4's low part
     already in pp, hence the add_nc / INCR_U fix-ups.  */
  cy = mpn_add_n (pp + n, pp + n, r5, n);
  cy = mpn_add_1 (pp + 2 * n, r5 + n, n, cy);
#if HAVE_NATIVE_mpn_add_nc
  cy = r5[n3] + mpn_add_nc(pp + n3, pp + n3, r5 + 2 * n, n, cy);
#else
  MPN_INCR_U (r5 + 2 * n, n + 1, cy);
  cy = r5[n3] + mpn_add_n (pp + n3, pp + n3, r5 + 2 * n, n);
#endif
  MPN_INCR_U (pp + n3 + n, 2 * n + 1, cy);

  /* Add r3 into pp at offset 5n, same chunked scheme.  */
  pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r3, n);
  cy = mpn_add_1 (pp + 2 * n3, r3 + n, n, pp[2 * n3]);
#if HAVE_NATIVE_mpn_add_nc
  cy = r3[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r3 + 2 * n, n, cy);
#else
  MPN_INCR_U (r3 + 2 * n, n + 1, cy);
  cy = r3[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r3 + 2 * n, n);
#endif
  MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);

  /* Add r1 into pp at offset 9n.  In the half (Toom-6.5) case the result
     extends spt limbs past 11n; otherwise only spt limbs of r1's middle
     chunk remain to be added.  */
  pp[10*n]+=mpn_add_n (pp + 9 * n, pp + 9 * n, r1, n);
  if (half) {
    cy = mpn_add_1 (pp + 10 * n, r1 + n, n, pp[10 * n]);
#if HAVE_NATIVE_mpn_add_nc
    if (LIKELY (spt > n)) {
      cy = r1[n3] + mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, n, cy);
      MPN_INCR_U (pp + 4 * n3, spt - n, cy);
    } else {
      ASSERT_NOCARRY(mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt, cy));
    }
#else
    MPN_INCR_U (r1 + 2 * n, n + 1, cy);
    if (LIKELY (spt > n)) {
      cy = r1[n3] + mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, n);
      MPN_INCR_U (pp + 4 * n3, spt - n, cy);
    } else {
      ASSERT_NOCARRY(mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt));
    }
#endif
  } else {
    ASSERT_NOCARRY(mpn_add_1 (pp + 10 * n, r1 + n, spt, pp[10 * n]));
  }

#undef r0
#undef r2
#undef r4
}