1148456Spjd/* UltraSPARC 64 mpn_divexact_1 -- mpn by limb exact division. 2213072Spjd 3148456Spjd THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST 4148456Spjd CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN 5148456Spjd FUTURE GNU MP RELEASES. 6148456Spjd 7148456SpjdCopyright 2000, 2001, 2003, 2019 Free Software Foundation, Inc. 8148456Spjd 9148456SpjdThis file is part of the GNU MP Library. 10148456Spjd 11148456SpjdThe GNU MP Library is free software; you can redistribute it and/or modify 12148456Spjdit under the terms of either: 13155174Spjd 14148456Spjd * the GNU Lesser General Public License as published by the Free 15148456Spjd Software Foundation; either version 3 of the License, or (at your 16148456Spjd option) any later version. 17148456Spjd 18148456Spjdor 19148456Spjd 20148456Spjd * the GNU General Public License as published by the Free Software 21148456Spjd Foundation; either version 2 of the License, or (at your option) any 22148456Spjd later version. 23148456Spjd 24148456Spjdor both in parallel, as here. 25148456Spjd 26148456SpjdThe GNU MP Library is distributed in the hope that it will be useful, but 27148456SpjdWITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 28148456Spjdor FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 29148456Spjdfor more details. 30148456Spjd 31148456SpjdYou should have received copies of the GNU General Public License and the 32148456SpjdGNU Lesser General Public License along with the GNU MP Library. If not, 33148456Spjdsee https://www.gnu.org/licenses/. */ 34148456Spjd 35148456Spjd#include "gmp-impl.h" 36148456Spjd#include "longlong.h" 37148456Spjd 38148456Spjd#include "mpn/sparc64/sparc64.h" 39148456Spjd 40148456Spjd 41148456Spjd/* 64-bit divisor 32-bit divisor 42148456Spjd cycles/limb cycles/limb 43148456Spjd (approx) (approx) 44148867Spjd Ultrasparc 2i: 110 70 45148456Spjd*/ 46148456Spjd 47148456Spjd 48148456Spjd/* There are two key ideas here to reduce mulx's. Firstly when the divisor 49148456Spjd is 32-bits the high of q*d can be calculated without the two 32x32->64 50148456Spjd cross-products involving the high 32-bits of the divisor, that being zero 51148456Spjd of course. Secondly umul_ppmm_lowequal and umul_ppmm_half_lowequal save 52148456Spjd one mulx (each) knowing the low of q*d is equal to the input limb l. 53148456Spjd 54148456Spjd For size==1, a simple udivx is used. This is faster than calculating an 55148456Spjd inverse. 56148456Spjd 57159307Spjd For a 32-bit divisor and small sizes, an attempt was made at a simple 58159307Spjd udivx loop (two per 64-bit limb), but it turned out to be slower than 59161217Spjd mul-by-inverse. At size==2 the inverse is about 260 cycles total 60211927Spjd compared to a udivx at 291. Perhaps the latter would suit when size==2 61211927Spjd but the high 32-bits of the second limb is zero (saving one udivx), but 62161220Spjd it doesn't seem worth a special case just for that. */ 63213070Spjd 64148456Spjdvoid 65213067Spjdmpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor) 66148456Spjd{ 67161127Spjd mp_limb_t inverse, s, s_next, c, l, ls, q; 68148456Spjd unsigned rshift, lshift; 69161220Spjd mp_limb_t lshift_mask; 70148456Spjd mp_limb_t divisor_h; 71161220Spjd 72148456Spjd ASSERT (size >= 1); 73161220Spjd ASSERT (divisor != 0); 74148456Spjd ASSERT (MPN_SAME_OR_SEPARATE_P (dst, src, size)); 75161220Spjd ASSERT_MPN (src, size); 76159307Spjd ASSERT_LIMB (divisor); 77161220Spjd 78161127Spjd s = *src++; /* src low limb */ 79161220Spjd size--; 80161127Spjd if (size == 0) 81148456Spjd { 82161220Spjd *dst = s / divisor; 83148456Spjd return; 84161220Spjd } 85161220Spjd 86161220Spjd if ((divisor & 1) == 0) 87213067Spjd { 88213067Spjd count_trailing_zeros (rshift, divisor); 89214118Spjd divisor >>= rshift; 90214118Spjd lshift = 64 - rshift; 91148456Spjd 92214118Spjd lshift_mask = MP_LIMB_T_MAX; 93214118Spjd } 94148456Spjd else 95159307Spjd { 96148456Spjd rshift = 0; 97148456Spjd 98148456Spjd /* rshift==0 means no shift, so must mask out other part in this case */ 99148456Spjd lshift = 0; 100148456Spjd lshift_mask = 0; 101159307Spjd } 102148456Spjd 103148456Spjd binvert_limb (inverse, divisor); 104148456Spjd 105148456Spjd c = 0; 106148456Spjd divisor_h = HIGH32 (divisor); 107213062Spjd 108213067Spjd if (divisor_h == 0) 109213067Spjd { 110148456Spjd /* 32-bit divisor */ 111148456Spjd do 112213165Spjd { 113148456Spjd s_next = *src++; 114159307Spjd ls = (s >> rshift) | ((s_next << lshift) & lshift_mask); 115148456Spjd s = s_next; 116148456Spjd 117148456Spjd SUBC_LIMB (c, l, ls, c); 118148456Spjd 119148456Spjd q = l * inverse; 120148456Spjd *dst++ = q; 121148456Spjd 122148456Spjd umul_ppmm_half_lowequal (l, q, divisor, l); 123148456Spjd c += l; 124148456Spjd 125148456Spjd size--; 126148456Spjd } 127148456Spjd while (size != 0); 128148456Spjd 129148456Spjd ls = s >> rshift; 130148456Spjd l = ls - c; 131148456Spjd q = l * inverse; 132148456Spjd *dst = q; 133148456Spjd } 134148456Spjd else 135148456Spjd { 136148456Spjd /* 64-bit divisor */ 137148456Spjd mp_limb_t divisor_l = LOW32 (divisor); 138148456Spjd do 139148456Spjd { 140148456Spjd s_next = *src++; 141148456Spjd ls = (s >> rshift) | ((s_next << lshift) & lshift_mask); 142148456Spjd s = s_next; 143148456Spjd 144148456Spjd SUBC_LIMB (c, l, ls, c); 145148456Spjd 146148456Spjd q = l * inverse; 147214118Spjd *dst++ = q; 148148456Spjd 149148456Spjd umul_ppmm_lowequal (l, q, divisor, divisor_h, divisor_l, l); 150148456Spjd c += l; 151148456Spjd 152213067Spjd size--; 153213067Spjd } 154213067Spjd while (size != 0); 155213067Spjd 156213067Spjd ls = s >> rshift; 157213067Spjd l = ls - c; 158213067Spjd q = l * inverse; 159213067Spjd *dst = q; 160213067Spjd } 161213067Spjd} 162213067Spjd