1/* mpn_divexact_by3c -- mpn exact division by 3. 2 3Copyright 2000-2003, 2008 Free Software Foundation, Inc. 4 5This file is part of the GNU MP Library. 6 7The GNU MP Library is free software; you can redistribute it and/or modify 8it under the terms of either: 9 10 * the GNU Lesser General Public License as published by the Free 11 Software Foundation; either version 3 of the License, or (at your 12 option) any later version. 13 14or 15 16 * the GNU General Public License as published by the Free Software 17 Foundation; either version 2 of the License, or (at your option) any 18 later version. 19 20or both in parallel, as here. 21 22The GNU MP Library is distributed in the hope that it will be useful, but 23WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25for more details. 26 27You should have received copies of the GNU General Public License and the 28GNU Lesser General Public License along with the GNU MP Library. If not, 29see https://www.gnu.org/licenses/. */ 30 31#include "gmp-impl.h" 32 33#if DIVEXACT_BY3_METHOD == 0 34 35mp_limb_t 36mpn_divexact_by3c (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_limb_t c) 37{ 38 mp_limb_t r; 39 r = mpn_bdiv_dbm1c (rp, up, un, GMP_NUMB_MASK / 3, GMP_NUMB_MASK / 3 * c); 40 41 /* Possible bdiv_dbm1 return values are C * (GMP_NUMB_MASK / 3), 0 <= C < 3. 42 We want to return C. We compute the remainder mod 4 and notice that the 43 inverse of (2^(2k)-1)/3 mod 4 is 1. */ 44 return r & 3; 45} 46 47#endif 48 49#if DIVEXACT_BY3_METHOD == 1 50 51/* The algorithm here is basically the same as mpn_divexact_1, as described 52 in the manual. Namely at each step q = (src[i]-c)*inverse, and new c = 53 borrow(src[i]-c) + high(divisor*q). But because the divisor is just 3, 54 high(divisor*q) can be determined with two comparisons instead of a 55 multiply. 56 57 The "c += ..."s add the high limb of 3*l to c. That high limb will be 0, 58 1 or 2. Doing two separate "+="s seems to give better code on gcc (as of 59 2.95.2 at least). 60 61 It will be noted that the new c is formed by adding three values each 0 62 or 1. But the total is only 0, 1 or 2. When the subtraction src[i]-c 63 causes a borrow, that leaves a limb value of either 0xFF...FF or 64 0xFF...FE. The multiply by MODLIMB_INVERSE_3 gives 0x55...55 or 65 0xAA...AA respectively, and in those cases high(3*q) is only 0 or 1 66 respectively, hence a total of no more than 2. 67 68 Alternatives: 69 70 This implementation has each multiply on the dependent chain, due to 71 "l=s-c". See below for alternative code which avoids that. */ 72 73mp_limb_t 74mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t c) 75{ 76 mp_limb_t l, q, s; 77 mp_size_t i; 78 79 ASSERT (un >= 1); 80 ASSERT (c == 0 || c == 1 || c == 2); 81 ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un)); 82 83 i = 0; 84 do 85 { 86 s = up[i]; 87 SUBC_LIMB (c, l, s, c); 88 89 q = (l * MODLIMB_INVERSE_3) & GMP_NUMB_MASK; 90 rp[i] = q; 91 92 c += (q >= GMP_NUMB_CEIL_MAX_DIV3); 93 c += (q >= GMP_NUMB_CEIL_2MAX_DIV3); 94 } 95 while (++i < un); 96 97 ASSERT (c == 0 || c == 1 || c == 2); 98 return c; 99} 100 101 102#endif 103 104#if DIVEXACT_BY3_METHOD == 2 105 106/* The following alternative code re-arranges the quotient calculation from 107 (src[i]-c)*inverse to instead 108 109 q = src[i]*inverse - c*inverse 110 111 thereby allowing src[i]*inverse to be scheduled back as far as desired, 112 making full use of multiplier throughput and leaving just some carry 113 handing on the dependent chain. 114 115 The carry handling consists of determining the c for the next iteration. 116 This is the same as described above, namely look for any borrow from 117 src[i]-c, and at the high of 3*q. 118 119 high(3*q) is done with two comparisons as above (in c2 and c3). The 120 borrow from src[i]-c is incorporated into those by noting that if there's 121 a carry then then we have src[i]-c == 0xFF..FF or 0xFF..FE, in turn 122 giving q = 0x55..55 or 0xAA..AA. Adding 1 to either of those q values is 123 enough to make high(3*q) come out 1 bigger, as required. 124 125 l = -c*inverse is calculated at the same time as c, since for most chips 126 it can be more conveniently derived from separate c1/c2/c3 values than 127 from a combined c equal to 0, 1 or 2. 128 129 The net effect is that with good pipelining this loop should be able to 130 run at perhaps 4 cycles/limb, depending on available execute resources 131 etc. 132 133 Usage: 134 135 This code is not used by default, since we really can't rely on the 136 compiler generating a good software pipeline, nor on such an approach 137 even being worthwhile on all CPUs. 138 139 Itanium is one chip where this algorithm helps though, see 140 mpn/ia64/diveby3.asm. */ 141 142mp_limb_t 143mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t cy) 144{ 145 mp_limb_t s, sm, cl, q, qx, c2, c3; 146 mp_size_t i; 147 148 ASSERT (un >= 1); 149 ASSERT (cy == 0 || cy == 1 || cy == 2); 150 ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un)); 151 152 cl = cy == 0 ? 0 : cy == 1 ? -MODLIMB_INVERSE_3 : -2*MODLIMB_INVERSE_3; 153 154 for (i = 0; i < un; i++) 155 { 156 s = up[i]; 157 sm = (s * MODLIMB_INVERSE_3) & GMP_NUMB_MASK; 158 159 q = (cl + sm) & GMP_NUMB_MASK; 160 rp[i] = q; 161 qx = q + (s < cy); 162 163 c2 = qx >= GMP_NUMB_CEIL_MAX_DIV3; 164 c3 = qx >= GMP_NUMB_CEIL_2MAX_DIV3 ; 165 166 cy = c2 + c3; 167 cl = (-c2 & -MODLIMB_INVERSE_3) + (-c3 & -MODLIMB_INVERSE_3); 168 } 169 170 return cy; 171} 172 173#endif 174