/* UltraSPARC 64 mpn_divexact_1 -- mpn by limb exact division.

   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
   FUTURE GNU MP RELEASES.

Copyright 2000, 2001, 2003, 2019 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.
*/

#include "gmp-impl.h"
#include "longlong.h"

#include "mpn/sparc64/sparc64.h"


/*                 64-bit divisor       32-bit divisor
                    cycles/limb          cycles/limb
                     (approx)             (approx)
   Ultrasparc 2i:      110                   70
*/


/* There are two key ideas here to reduce mulx's.  Firstly when the divisor
   is 32-bits the high of q*d can be calculated without the two 32x32->64
   cross-products involving the high 32-bits of the divisor, that being zero
   of course.  Secondly umul_ppmm_lowequal and umul_ppmm_half_lowequal save
   one mulx (each) knowing the low of q*d is equal to the input limb l.

   For size==1, a simple udivx is used.  This is faster than calculating an
   inverse.

   For a 32-bit divisor and small sizes, an attempt was made at a simple
   udivx loop (two per 64-bit limb), but it turned out to be slower than
   mul-by-inverse.  At size==2 the inverse is about 260 cycles total
   compared to a udivx at 291.  Perhaps the latter would suit when size==2
   but the high 32-bits of the second limb is zero (saving one udivx), but
   it doesn't seem worth a special case just for that.
*/

/* Divide {src,size} exactly by divisor, storing the size-limb quotient at
   dst.  "Exactly" means the caller guarantees divisor divides the input
   with no remainder; the result is undefined otherwise.

   The division is done by multiplying by the modular inverse of the (odd
   part of the) divisor, working from low limb to high and propagating a
   borrow c, so only one multiplication per limb is needed instead of a
   udivx.  An even divisor is handled by shifting it down to odd and
   folding the corresponding right-shift into the limbs as they stream by.

   dst and src must be identical or fully separate (no partial overlap),
   per MPN_SAME_OR_SEPARATE_P.  */

void
mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor)
{
  mp_limb_t  inverse, s, s_next, c, l, ls, q;
  unsigned   rshift, lshift;
  mp_limb_t  lshift_mask;
  mp_limb_t  divisor_h;

  ASSERT (size >= 1);
  ASSERT (divisor != 0);
  ASSERT (MPN_SAME_OR_SEPARATE_P (dst, src, size));
  ASSERT_MPN (src, size);
  ASSERT_LIMB (divisor);

  s = *src++;                   /* src low limb */
  size--;
  if (size == 0)
    {
      /* Single limb: a plain hardware divide beats computing an inverse.  */
      *dst = s / divisor;
      return;
    }

  if ((divisor & 1) == 0)
    {
      /* Even divisor: strip the factor of 2^rshift from the divisor and
         right-shift the dividend limbs by the same amount as they are
         consumed below.  */
      count_trailing_zeros (rshift, divisor);
      divisor >>= rshift;
      lshift = 64 - rshift;

      lshift_mask = MP_LIMB_T_MAX;
    }
  else
    {
      rshift = 0;

      /* rshift==0 means no shift, so must mask out other part in this case */
      lshift = 0;
      lshift_mask = 0;
    }

  binvert_limb (inverse, divisor);

  c = 0;
  divisor_h = HIGH32 (divisor);

  if (divisor_h == 0)
    {
      /* 32-bit divisor: the high half of q*d needs no cross-products with
         the (zero) high 32 bits of the divisor.  */
      do
        {
          s_next = *src++;
          ls = (s >> rshift) | ((s_next << lshift) & lshift_mask);
          s = s_next;

          SUBC_LIMB (c, l, ls, c);   /* subtract the running borrow */

          q = l * inverse;
          *dst++ = q;

          /* high of q*divisor, knowing low(q*divisor) == l */
          umul_ppmm_half_lowequal (l, q, divisor, l);
          c += l;

          size--;
        }
      while (size != 0);

      /* Final limb: exactness guarantees ls - c needs no further borrow.  */
      ls = s >> rshift;
      l = ls - c;
      q = l * inverse;
      *dst = q;
    }
  else
    {
      /* 64-bit divisor */
      mp_limb_t  divisor_l = LOW32 (divisor);
      do
        {
          s_next = *src++;
          ls = (s >> rshift) | ((s_next << lshift) & lshift_mask);
          s = s_next;

          SUBC_LIMB (c, l, ls, c);   /* subtract the running borrow */

          q = l * inverse;
          *dst++ = q;

          /* high of q*divisor, knowing low(q*divisor) == l */
          umul_ppmm_lowequal (l, q, divisor, divisor_h, divisor_l, l);
          c += l;

          size--;
        }
      while (size != 0);

      /* Final limb: exactness guarantees ls - c needs no further borrow.  */
      ls = s >> rshift;
      l = ls - c;
      q = l * inverse;
      *dst = q;
    }
}