1/* mpn_perfect_square_p(u,usize) -- Return non-zero if U is a perfect square, 2 zero otherwise. 3 4Copyright 1991, 1993, 1994, 1996, 1997, 2000-2002, 2005, 2012 Free Software 5Foundation, Inc. 6 7This file is part of the GNU MP Library. 8 9The GNU MP Library is free software; you can redistribute it and/or modify 10it under the terms of either: 11 12 * the GNU Lesser General Public License as published by the Free 13 Software Foundation; either version 3 of the License, or (at your 14 option) any later version. 15 16or 17 18 * the GNU General Public License as published by the Free Software 19 Foundation; either version 2 of the License, or (at your option) any 20 later version. 21 22or both in parallel, as here. 23 24The GNU MP Library is distributed in the hope that it will be useful, but 25WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 26or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 27for more details. 28 29You should have received copies of the GNU General Public License and the 30GNU Lesser General Public License along with the GNU MP Library. If not, 31see https://www.gnu.org/licenses/. */ 32 33#include <stdio.h> /* for NULL */ 34#include "gmp-impl.h" 35#include "longlong.h" 36 37#include "perfsqr.h" 38 39 40/* change this to "#define TRACE(x) x" for diagnostics */ 41#define TRACE(x) 42 43 44 45/* PERFSQR_MOD_* detects non-squares using residue tests. 46 47 A macro PERFSQR_MOD_TEST is setup by gen-psqr.c in perfsqr.h. It takes 48 {up,usize} modulo a selected modulus to get a remainder r. For 32-bit or 49 64-bit limbs this modulus will be 2^24-1 or 2^48-1 using PERFSQR_MOD_34, 50 or for other limb or nail sizes a PERFSQR_PP is chosen and PERFSQR_MOD_PP 51 used. PERFSQR_PP_NORM and PERFSQR_PP_INVERTED are pre-calculated in this 52 case too. 53 54 PERFSQR_MOD_TEST then makes various calls to PERFSQR_MOD_1 or 55 PERFSQR_MOD_2 with divisors d which are factors of the modulus, and table 56 data indicating residues and non-residues modulo those divisors. The 57 table data is in 1 or 2 limbs worth of bits respectively, per the size of 58 each d. 59 60 A "modexact" style remainder is taken to reduce r modulo d. 61 PERFSQR_MOD_IDX implements this, producing an index "idx" for use with 62 the table data. Notice there's just one multiplication by a constant 63 "inv", for each d. 64 65 The modexact doesn't produce a true r%d remainder, instead idx satisfies 66 "-(idx<<PERFSQR_MOD_BITS) == r mod d". Because d is odd, this factor 67 -2^PERFSQR_MOD_BITS is a one-to-one mapping between r and idx, and is 68 accounted for by having the table data suitably permuted. 69 70 The remainder r fits within PERFSQR_MOD_BITS which is less than a limb. 71 In fact the GMP_LIMB_BITS - PERFSQR_MOD_BITS spare bits are enough to fit 72 each divisor d meaning the modexact multiply can take place entirely 73 within one limb, giving the compiler the chance to optimize it, in a way 74 that say umul_ppmm would not give. 75 76 There's no need for the divisors d to be prime, in fact gen-psqr.c makes 77 a deliberate effort to combine factors so as to reduce the number of 78 separate tests done on r. But such combining is limited to d <= 79 2*GMP_LIMB_BITS so that the table data fits in at most 2 limbs. 80 81 Alternatives: 82 83 It'd be possible to use bigger divisors d, and more than 2 limbs of table 84 data, but this doesn't look like it would be of much help to the prime 85 factors in the usual moduli 2^24-1 or 2^48-1. 86 87 The moduli 2^24-1 or 2^48-1 are nothing particularly special, they're 88 just easy to calculate (see mpn_mod_34lsub1) and have a nice set of prime 89 factors. 2^32-1 and 2^64-1 would be equally easy to calculate, but have 90 fewer prime factors. 91 92 The nails case usually ends up using mpn_mod_1, which is a lot slower 93 than mpn_mod_34lsub1. Perhaps other such special moduli could be found 94 for the nails case. Two-term things like 2^30-2^15-1 might be 95 candidates. Or at worst some on-the-fly de-nailing would allow the plain 96 2^24-1 to be used. Currently nails are too preliminary to be worried 97 about. 98 99*/ 100 101#define PERFSQR_MOD_MASK ((CNST_LIMB(1) << PERFSQR_MOD_BITS) - 1) 102 103#define MOD34_BITS (GMP_NUMB_BITS / 4 * 3) 104#define MOD34_MASK ((CNST_LIMB(1) << MOD34_BITS) - 1) 105 106#define PERFSQR_MOD_34(r, up, usize) \ 107 do { \ 108 (r) = mpn_mod_34lsub1 (up, usize); \ 109 (r) = ((r) & MOD34_MASK) + ((r) >> MOD34_BITS); \ 110 } while (0) 111 112/* FIXME: The %= here isn't good, and might destroy any savings from keeping 113 the PERFSQR_MOD_IDX stuff within a limb (rather than needing umul_ppmm). 114 Maybe a new sort of mpn_preinv_mod_1 could accept an unnormalized divisor 115 and a shift count, like mpn_preinv_divrem_1. But mod_34lsub1 is our 116 normal case, so lets not worry too much about mod_1. */ 117#define PERFSQR_MOD_PP(r, up, usize) \ 118 do { \ 119 if (BELOW_THRESHOLD (usize, PREINV_MOD_1_TO_MOD_1_THRESHOLD)) \ 120 { \ 121 (r) = mpn_preinv_mod_1 (up, usize, PERFSQR_PP_NORM, \ 122 PERFSQR_PP_INVERTED); \ 123 (r) %= PERFSQR_PP; \ 124 } \ 125 else \ 126 { \ 127 (r) = mpn_mod_1 (up, usize, PERFSQR_PP); \ 128 } \ 129 } while (0) 130 131#define PERFSQR_MOD_IDX(idx, r, d, inv) \ 132 do { \ 133 mp_limb_t q; \ 134 ASSERT ((r) <= PERFSQR_MOD_MASK); \ 135 ASSERT ((((inv) * (d)) & PERFSQR_MOD_MASK) == 1); \ 136 ASSERT (MP_LIMB_T_MAX / (d) >= PERFSQR_MOD_MASK); \ 137 \ 138 q = ((r) * (inv)) & PERFSQR_MOD_MASK; \ 139 ASSERT (r == ((q * (d)) & PERFSQR_MOD_MASK)); \ 140 (idx) = (q * (d)) >> PERFSQR_MOD_BITS; \ 141 } while (0) 142 143#define PERFSQR_MOD_1(r, d, inv, mask) \ 144 do { \ 145 unsigned idx; \ 146 ASSERT ((d) <= GMP_LIMB_BITS); \ 147 PERFSQR_MOD_IDX(idx, r, d, inv); \ 148 TRACE (printf (" PERFSQR_MOD_1 d=%u r=%lu idx=%u\n", \ 149 d, r%d, idx)); \ 150 if ((((mask) >> idx) & 1) == 0) \ 151 { \ 152 TRACE (printf (" non-square\n")); \ 153 return 0; \ 154 } \ 155 } while (0) 156 157/* The expression "(int) idx - GMP_LIMB_BITS < 0" lets the compiler use the 158 sign bit from "idx-GMP_LIMB_BITS", which might help avoid a branch. */ 159#define PERFSQR_MOD_2(r, d, inv, mhi, mlo) \ 160 do { \ 161 mp_limb_t m; \ 162 unsigned idx; \ 163 ASSERT ((d) <= 2*GMP_LIMB_BITS); \ 164 \ 165 PERFSQR_MOD_IDX (idx, r, d, inv); \ 166 TRACE (printf (" PERFSQR_MOD_2 d=%u r=%lu idx=%u\n", \ 167 d, r%d, idx)); \ 168 m = ((int) idx - GMP_LIMB_BITS < 0 ? (mlo) : (mhi)); \ 169 idx %= GMP_LIMB_BITS; \ 170 if (((m >> idx) & 1) == 0) \ 171 { \ 172 TRACE (printf (" non-square\n")); \ 173 return 0; \ 174 } \ 175 } while (0) 176 177 178int 179mpn_perfect_square_p (mp_srcptr up, mp_size_t usize) 180{ 181 ASSERT (usize >= 1); 182 183 TRACE (gmp_printf ("mpn_perfect_square_p %Nd\n", up, usize)); 184 185 /* The first test excludes 212/256 (82.8%) of the perfect square candidates 186 in O(1) time. */ 187 { 188 unsigned idx = up[0] % 0x100; 189 if (((sq_res_0x100[idx / GMP_LIMB_BITS] 190 >> (idx % GMP_LIMB_BITS)) & 1) == 0) 191 return 0; 192 } 193 194#if 0 195 /* Check that we have even multiplicity of 2, and then check that the rest is 196 a possible perfect square. Leave disabled until we can determine this 197 really is an improvement. It it is, it could completely replace the 198 simple probe above, since this should throw out more non-squares, but at 199 the expense of somewhat more cycles. */ 200 { 201 mp_limb_t lo; 202 int cnt; 203 lo = up[0]; 204 while (lo == 0) 205 up++, lo = up[0], usize--; 206 count_trailing_zeros (cnt, lo); 207 if ((cnt & 1) != 0) 208 return 0; /* return of not even multiplicity of 2 */ 209 lo >>= cnt; /* shift down to align lowest non-zero bit */ 210 lo >>= 1; /* shift away lowest non-zero bit */ 211 if ((lo & 3) != 0) 212 return 0; 213 } 214#endif 215 216 217 /* The second test uses mpn_mod_34lsub1 or mpn_mod_1 to detect non-squares 218 according to their residues modulo small primes (or powers of 219 primes). See perfsqr.h. */ 220 PERFSQR_MOD_TEST (up, usize); 221 222 223 /* For the third and last test, we finally compute the square root, 224 to make sure we've really got a perfect square. */ 225 { 226 mp_ptr root_ptr; 227 int res; 228 TMP_DECL; 229 230 TMP_MARK; 231 root_ptr = TMP_ALLOC_LIMBS ((usize + 1) / 2); 232 233 /* Iff mpn_sqrtrem returns zero, the square is perfect. */ 234 res = ! mpn_sqrtrem (root_ptr, NULL, up, usize); 235 TMP_FREE; 236 237 return res; 238 } 239} 240