1227825Stheraven/* Implementation of the squaring algorithm with Toom-Cook 8.5-way. 2227825Stheraven 3227825Stheraven Contributed to the GNU project by Marco Bodrato. 4227825Stheraven 5227825Stheraven THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY 6227825Stheraven SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST 7227825Stheraven GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. 8227825Stheraven 9227825StheravenCopyright 2009, 2012 Free Software Foundation, Inc. 10227825Stheraven 11227825StheravenThis file is part of the GNU MP Library. 12227825Stheraven 13227825StheravenThe GNU MP Library is free software; you can redistribute it and/or modify 14227825Stheravenit under the terms of either: 15227825Stheraven 16227825Stheraven * the GNU Lesser General Public License as published by the Free 17227825Stheraven Software Foundation; either version 3 of the License, or (at your 18227825Stheraven option) any later version. 19227825Stheraven 20227825Stheravenor 21227825Stheraven 22227825Stheraven * the GNU General Public License as published by the Free Software 23227825Stheraven Foundation; either version 2 of the License, or (at your option) any 24227825Stheraven later version. 25227825Stheraven 26227825Stheravenor both in parallel, as here. 27227825Stheraven 28227825StheravenThe GNU MP Library is distributed in the hope that it will be useful, but 29227825StheravenWITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 30227825Stheravenor FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 31227825Stheravenfor more details. 32262801Sdim 33262801SdimYou should have received copies of the GNU General Public License and the 34227825StheravenGNU Lesser General Public License along with the GNU MP Library. If not, 35227825Stheravensee https://www.gnu.org/licenses/. */ 36227825Stheraven 37227825Stheraven 38227825Stheraven#include "gmp-impl.h" 39227825Stheraven 40227825Stheraven#if GMP_NUMB_BITS < 29 41262801Sdim#error Not implemented. 42262801Sdim#endif 43227825Stheraven 44227825Stheraven#if GMP_NUMB_BITS < 43 45227825Stheraven#define BIT_CORRECTION 1 46227825Stheraven#define CORRECTION_BITS GMP_NUMB_BITS 47227825Stheraven#else 48227825Stheraven#define BIT_CORRECTION 0 49227825Stheraven#define CORRECTION_BITS 0 50262801Sdim#endif 51262801Sdim 52227825Stheraven#ifndef SQR_TOOM8_THRESHOLD 53227825Stheraven#define SQR_TOOM8_THRESHOLD MUL_TOOM8H_THRESHOLD 54227825Stheraven#endif 55227825Stheraven 56227825Stheraven#ifndef SQR_TOOM6_THRESHOLD 57227825Stheraven#define SQR_TOOM6_THRESHOLD MUL_TOOM6H_THRESHOLD 58227825Stheraven#endif 59227825Stheraven 60227825Stheraven#if TUNE_PROGRAM_BUILD 61227825Stheraven#define MAYBE_sqr_basecase 1 62227825Stheraven#define MAYBE_sqr_above_basecase 1 63227825Stheraven#define MAYBE_sqr_toom2 1 64227825Stheraven#define MAYBE_sqr_above_toom2 1 65227825Stheraven#define MAYBE_sqr_toom3 1 66227825Stheraven#define MAYBE_sqr_above_toom3 1 67227825Stheraven#define MAYBE_sqr_toom4 1 68227825Stheraven#define MAYBE_sqr_above_toom4 1 69227825Stheraven#define MAYBE_sqr_above_toom6 1 70227825Stheraven#else 71227825Stheraven#define SQR_TOOM8_MAX \ 72227825Stheraven ((SQR_FFT_THRESHOLD <= MP_SIZE_T_MAX - (8*2-1+7)) ? \ 73227825Stheraven ((SQR_FFT_THRESHOLD+8*2-1+7)/8) \ 74227825Stheraven : MP_SIZE_T_MAX ) 75227825Stheraven#define MAYBE_sqr_basecase \ 76227825Stheraven (SQR_TOOM8_THRESHOLD < 8 * SQR_TOOM2_THRESHOLD) 77227825Stheraven#define MAYBE_sqr_above_basecase \ 78227825Stheraven (SQR_TOOM8_MAX >= SQR_TOOM2_THRESHOLD) 79262801Sdim#define MAYBE_sqr_toom2 \ 80227825Stheraven (SQR_TOOM8_THRESHOLD < 8 * SQR_TOOM3_THRESHOLD) 81227825Stheraven#define MAYBE_sqr_above_toom2 \ 82227825Stheraven (SQR_TOOM8_MAX >= SQR_TOOM3_THRESHOLD) 83227825Stheraven#define MAYBE_sqr_toom3 \ 84227825Stheraven (SQR_TOOM8_THRESHOLD < 8 * SQR_TOOM4_THRESHOLD) 85227825Stheraven#define MAYBE_sqr_above_toom3 \ 86227825Stheraven (SQR_TOOM8_MAX >= SQR_TOOM4_THRESHOLD) 87227825Stheraven#define MAYBE_sqr_toom4 \ 88227825Stheraven (SQR_TOOM8_THRESHOLD < 8 * SQR_TOOM6_THRESHOLD) 89227825Stheraven#define MAYBE_sqr_above_toom4 \ 90227825Stheraven (SQR_TOOM8_MAX >= SQR_TOOM6_THRESHOLD) 91227825Stheraven#define MAYBE_sqr_above_toom6 \ 92227825Stheraven (SQR_TOOM8_MAX >= SQR_TOOM8_THRESHOLD) 93227825Stheraven#endif 94227825Stheraven 95227825Stheraven#define TOOM8_SQR_REC(p, a, f, p2, a2, n, ws) \ 96227825Stheraven do { \ 97227825Stheraven if (MAYBE_sqr_basecase && ( !MAYBE_sqr_above_basecase \ 98227825Stheraven || BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD))) { \ 99227825Stheraven mpn_sqr_basecase (p, a, n); \ 100227825Stheraven if (f) mpn_sqr_basecase (p2, a2, n); \ 101227825Stheraven } else if (MAYBE_sqr_toom2 && ( !MAYBE_sqr_above_toom2 \ 102227825Stheraven || BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))) { \ 103227825Stheraven mpn_toom2_sqr (p, a, n, ws); \ 104227825Stheraven if (f) mpn_toom2_sqr (p2, a2, n, ws); \ 105227825Stheraven } else if (MAYBE_sqr_toom3 && ( !MAYBE_sqr_above_toom3 \ 106227825Stheraven || BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD))) { \ 107227825Stheraven mpn_toom3_sqr (p, a, n, ws); \ 108227825Stheraven if (f) mpn_toom3_sqr (p2, a2, n, ws); \ 109227825Stheraven } else if (MAYBE_sqr_toom4 && ( !MAYBE_sqr_above_toom4 \ 110227825Stheraven || BELOW_THRESHOLD (n, SQR_TOOM6_THRESHOLD))) { \ 111227825Stheraven mpn_toom4_sqr (p, a, n, ws); \ 112227825Stheraven if (f) mpn_toom4_sqr (p2, a2, n, ws); \ 113227825Stheraven } else if (! MAYBE_sqr_above_toom6 \ 114262801Sdim || BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD)) { \ 115227825Stheraven mpn_toom6_sqr (p, a, n, ws); \ 116227825Stheraven if (f) mpn_toom6_sqr (p2, a2, n, ws); \ 117227825Stheraven } else { \ 118227825Stheraven mpn_toom8_sqr (p, a, n, ws); \ 119227825Stheraven if (f) mpn_toom8_sqr (p2, a2, n, ws); \ 120227825Stheraven } \ 121227825Stheraven } while (0) 122227825Stheraven 123227825Stheravenvoid 124227825Stheravenmpn_toom8_sqr (mp_ptr pp, mp_srcptr ap, mp_size_t an, mp_ptr scratch) 125227825Stheraven{ 126227825Stheraven mp_size_t n, s; 127227825Stheraven 128227825Stheraven /***************************** decomposition *******************************/ 129227825Stheraven 130227825Stheraven ASSERT ( an >= 40 ); 131227825Stheraven 132227825Stheraven n = 1 + ((an - 1)>>3); 133227825Stheraven 134227825Stheraven s = an - 7 * n; 135227825Stheraven 136227825Stheraven ASSERT (0 < s && s <= n); 137227825Stheraven ASSERT ( s + s > 3 ); 138227825Stheraven 139227825Stheraven#define r6 (pp + 3 * n) /* 3n+1 */ 140227825Stheraven#define r4 (pp + 7 * n) /* 3n+1 */ 141227825Stheraven#define r2 (pp +11 * n) /* 3n+1 */ 142227825Stheraven#define r0 (pp +15 * n) /* s+t <= 2*n */ 143227825Stheraven#define r7 (scratch) /* 3n+1 */ 144227825Stheraven#define r5 (scratch + 3 * n + 1) /* 3n+1 */ 145227825Stheraven#define r3 (scratch + 6 * n + 2) /* 3n+1 */ 146227825Stheraven#define r1 (scratch + 9 * n + 3) /* 3n+1 */ 147227825Stheraven#define v0 (pp +11 * n) /* n+1 */ 148227825Stheraven#define v2 (pp +13 * n+2) /* n+1 */ 149262801Sdim#define wse (scratch +12 * n + 4) /* 3n+1 */ 150227825Stheraven 151227825Stheraven /* Alloc also 3n+1 limbs for ws... toom_interpolate_16pts may 152227825Stheraven need all of them, when DO_mpn_sublsh_n usea a scratch */ 153227825Stheraven/* if (scratch == NULL) */ 154227825Stheraven/* scratch = TMP_SALLOC_LIMBS (30 * n + 6); */ 155227825Stheraven 156227825Stheraven /********************** evaluation and recursive calls *********************/ 157227825Stheraven /* $\pm1/8$ */ 158227825Stheraven mpn_toom_eval_pm2rexp (v2, v0, 7, ap, n, s, 3, pp); 159227825Stheraven /* A(-1/8)*B(-1/8)*8^. */ /* A(+1/8)*B(+1/8)*8^. */ 160227825Stheraven TOOM8_SQR_REC(pp, v0, 2, r7, v2, n + 1, wse); 161227825Stheraven mpn_toom_couple_handling (r7, 2 * n + 1 + BIT_CORRECTION, pp, 0, n, 3, 0); 162227825Stheraven 163227825Stheraven /* $\pm1/4$ */ 164227825Stheraven mpn_toom_eval_pm2rexp (v2, v0, 7, ap, n, s, 2, pp); 165227825Stheraven /* A(-1/4)*B(-1/4)*4^. */ /* A(+1/4)*B(+1/4)*4^. */ 166227825Stheraven TOOM8_SQR_REC(pp, v0, 2, r5, v2, n + 1, wse); 167227825Stheraven mpn_toom_couple_handling (r5, 2 * n + 1, pp, 0, n, 2, 0); 168227825Stheraven 169227825Stheraven /* $\pm2$ */ 170227825Stheraven mpn_toom_eval_pm2 (v2, v0, 7, ap, n, s, pp); 171227825Stheraven /* A(-2)*B(-2) */ /* A(+2)*B(+2) */ 172227825Stheraven TOOM8_SQR_REC(pp, v0, 2, r3, v2, n + 1, wse); 173227825Stheraven mpn_toom_couple_handling (r3, 2 * n + 1, pp, 0, n, 1, 2); 174227825Stheraven 175227825Stheraven /* $\pm8$ */ 176227825Stheraven mpn_toom_eval_pm2exp (v2, v0, 7, ap, n, s, 3, pp); 177227825Stheraven /* A(-8)*B(-8) */ /* A(+8)*B(+8) */ 178227825Stheraven TOOM8_SQR_REC(pp, v0, 2, r1, v2, n + 1, wse); 179227825Stheraven mpn_toom_couple_handling (r1, 2 * n + 1 + BIT_CORRECTION, pp, 0, n, 3, 6); 180227825Stheraven 181227825Stheraven /* $\pm1/2$ */ 182227825Stheraven mpn_toom_eval_pm2rexp (v2, v0, 7, ap, n, s, 1, pp); 183227825Stheraven /* A(-1/2)*B(-1/2)*2^. */ /* A(+1/2)*B(+1/2)*2^. */ 184227825Stheraven TOOM8_SQR_REC(pp, v0, 2, r6, v2, n + 1, wse); 185262801Sdim mpn_toom_couple_handling (r6, 2 * n + 1, pp, 0, n, 1, 0); 186227825Stheraven 187227825Stheraven /* $\pm1$ */ 188227825Stheraven mpn_toom_eval_pm1 (v2, v0, 7, ap, n, s, pp); 189227825Stheraven /* A(-1)*B(-1) */ /* A(1)*B(1) */ 190227825Stheraven TOOM8_SQR_REC(pp, v0, 2, r4, v2, n + 1, wse); 191227825Stheraven mpn_toom_couple_handling (r4, 2 * n + 1, pp, 0, n, 0, 0); 192227825Stheraven 193227825Stheraven /* $\pm4$ */ 194227825Stheraven mpn_toom_eval_pm2exp (v2, v0, 7, ap, n, s, 2, pp); 195227825Stheraven /* A(-4)*B(-4) */ /* A(+4)*B(+4) */ 196227825Stheraven TOOM8_SQR_REC(pp, v0, 2, r2, v2, n + 1, wse); 197227825Stheraven mpn_toom_couple_handling (r2, 2 * n + 1, pp, 0, n, 2, 4); 198227825Stheraven 199227825Stheraven#undef v0 200227825Stheraven#undef v2 201227825Stheraven 202262801Sdim /* A(0)*B(0) */ 203227825Stheraven TOOM8_SQR_REC(pp, ap, 0, pp, ap, n, wse); 204227825Stheraven 205227825Stheraven mpn_toom_interpolate_16pts (pp, r1, r3, r5, r7, n, 2 * s, 0, wse); 206227825Stheraven 207227825Stheraven#undef r0 208227825Stheraven#undef r1 209227825Stheraven#undef r2 210227825Stheraven#undef r3 211227825Stheraven#undef r4 212227825Stheraven#undef r5 213227825Stheraven#undef r6 214227825Stheraven#undef wse 215227825Stheraven 216227825Stheraven} 217227825Stheraven 218227825Stheraven#undef TOOM8_SQR_REC 219227825Stheraven#undef MAYBE_sqr_basecase 220227825Stheraven#undef MAYBE_sqr_above_basecase 221227825Stheraven#undef MAYBE_sqr_toom2 222227825Stheraven#undef MAYBE_sqr_above_toom2 223227825Stheraven#undef MAYBE_sqr_toom3 224227825Stheraven#undef MAYBE_sqr_above_toom3 225227825Stheraven#undef MAYBE_sqr_above_toom4 226227825Stheraven