divdf3.S revision 1.1.1.5
1/* Copyright (C) 2008-2017 Free Software Foundation, Inc. 2 Contributor: Joern Rennecke <joern.rennecke@embecosm.com> 3 on behalf of Synopsys Inc. 4 5This file is part of GCC. 6 7GCC is free software; you can redistribute it and/or modify it under 8the terms of the GNU General Public License as published by the Free 9Software Foundation; either version 3, or (at your option) any later 10version. 11 12GCC is distributed in the hope that it will be useful, but WITHOUT ANY 13WARRANTY; without even the implied warranty of MERCHANTABILITY or 14FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 15for more details. 16 17Under Section 7 of GPL version 3, you are granted additional 18permissions described in the GCC Runtime Library Exception, version 193.1, as published by the Free Software Foundation. 20 21You should have received a copy of the GNU General Public License and 22a copy of the GCC Runtime Library Exception along with this program; 23see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 24<http://www.gnu.org/licenses/>. */ 25 26/* 27 to calculate a := b/x as b*y, with y := 1/x: 28 - x is in the range [1..2) 29 - calculate 15..18 bit inverse y0 using a table of approximating polynoms. 30 Precision is higher for polynoms used to evaluate input with larger 31 value. 32 - Do one newton-raphson iteration step to double the precision, 33 then multiply this with the divisor 34 -> more time to decide if dividend is subnormal 35 - the worst error propagation is on the side of the value range 36 with the least initial defect, thus giving us about 30 bits precision. 37 The truncation error for the either is less than 1 + x/2 ulp. 38 A 31 bit inverse can be simply calculated by using x with implicit 1 39 and chaining the multiplies. For a 32 bit inverse, we multiply y0^2 40 with the bare fraction part of x, then add in y0^2 for the implicit 41 1 of x. 42 - If calculating a 31 bit inverse, the systematic error is less than 43 -1 ulp; likewise, for 32 bit, it is less than -2 ulp. 44 - If we calculate our seed with a 32 bit fraction, we can archive a 45 tentative result strictly better than -2 / +2.5 (1) ulp/128, i.e. we 46 only need to take the step to calculate the 2nd stage rest and 47 rounding adjust 1/32th of the time. However, if we use a 20 bit 48 fraction for the seed, the negative error can exceed -2 ulp/128, (2) 49 thus for a simple add / tst check, we need to do the 2nd stage 50 rest calculation/ rounding adjust 1/16th of the time. 51 (1): The inexactness of the 32 bit inverse contributes an error in the 52 range of (-1 .. +(1+x/2) ) ulp/128. Leaving out the low word of the 53 rest contributes an error < +1/x ulp/128 . In the interval [1,2), 54 x/2 + 1/x <= 1.5 . 55 (2): Unless proven otherwise. I have not actually looked for an 56 example where -2 ulp/128 is exceeded, and my calculations indicate 57 that the excess, if existent, is less than -1/512 ulp. 58 ??? The algorithm is still based on the ARC700 optimized code. 59 Maybe we could make better use of 32x16 bit multiply, or 64 bit multiply 60 results. 61 */ 62#include "../arc-ieee-754.h" 63#define mlo acc2 64#define mhi acc1 65#define mul64(b,c) mullw 0,b,c` machlw 0,b,c 66#define mulu64(b,c) mululw 0,b,c` machulw 0,b,c 67 68/* N.B. fp-bit.c does double rounding on denormal numbers. */ 69#if 0 /* DEBUG */ 70 .global __divdf3 71 FUNC(__divdf3) 72 .balign 4 73__divdf3: 74 push_s blink 75 push_s r2 76 push_s r3 77 push_s r0 78 bl.d __divdf3_c 79 push_s r1 80 ld_s r2,[sp,12] 81 ld_s r3,[sp,8] 82 st_s r0,[sp,12] 83 st_s r1,[sp,8] 84 pop_s r1 85 bl.d __divdf3_asm 86 pop_s r0 87 pop_s r3 88 pop_s r2 89 pop_s blink 90 cmp r0,r2 91 cmp.eq r1,r3 92 jeq_s [blink] 93 and r12,DBL0H,DBL1H 94 bic.f 0,0x7ff80000,r12 ; both NaN -> OK 95 jeq_s [blink] 96 bl abort 97 ENDFUNC(__divdf3) 98#define __divdf3 __divdf3_asm 99#endif /* DEBUG */ 100 101 FUNC(__divdf3) 102 .balign 4 103.L7ff00000: 104 .long 0x7ff00000 105.Ldivtab: 106 .long 0xfc0fffe1 107 .long 0xf46ffdfb 108 .long 0xed1ffa54 109 .long 0xe61ff515 110 .long 0xdf7fee75 111 .long 0xd91fe680 112 .long 0xd2ffdd52 113 .long 0xcd1fd30c 114 .long 0xc77fc7cd 115 .long 0xc21fbbb6 116 .long 0xbcefaec0 117 .long 0xb7efa100 118 .long 0xb32f92bf 119 .long 0xae8f83b7 120 .long 0xaa2f7467 121 .long 0xa5ef6479 122 .long 0xa1cf53fa 123 .long 0x9ddf433e 124 .long 0x9a0f3216 125 .long 0x965f2091 126 .long 0x92df0f11 127 .long 0x8f6efd05 128 .long 0x8c1eeacc 129 .long 0x88eed876 130 .long 0x85dec615 131 .long 0x82eeb3b9 132 .long 0x800ea10b 133 .long 0x7d3e8e0f 134 .long 0x7a8e7b3f 135 .long 0x77ee6836 136 .long 0x756e5576 137 .long 0x72fe4293 138 .long 0x709e2f93 139 .long 0x6e4e1c7f 140 .long 0x6c0e095e 141 .long 0x69edf6c5 142 .long 0x67cde3a5 143 .long 0x65cdd125 144 .long 0x63cdbe25 145 .long 0x61ddab3f 146 .long 0x600d991f 147 .long 0x5e3d868c 148 .long 0x5c6d7384 149 .long 0x5abd615f 150 .long 0x590d4ecd 151 .long 0x576d3c83 152 .long 0x55dd2a89 153 .long 0x545d18e9 154 .long 0x52dd06e9 155 .long 0x516cf54e 156 .long 0x4ffce356 157 .long 0x4e9cd1ce 158 .long 0x4d3cbfec 159 .long 0x4becae86 160 .long 0x4aac9da4 161 .long 0x496c8c73 162 .long 0x483c7bd3 163 .long 0x470c6ae8 164 .long 0x45dc59af 165 .long 0x44bc4915 166 .long 0x43ac3924 167 .long 0x428c27fb 168 .long 0x418c187a 169 .long 0x407c07bd 170 171__divdf3_support: /* This label makes debugger output saner. */ 172 .balign 4 173.Ldenorm_dbl1: 174 brge r6, \ 175 0x43500000,.Linf_NaN ; large number / denorm -> Inf 176 bmsk.f r12,DBL1H,19 177 mov.eq r12,DBL1L 178 mov.eq DBL1L,0 179 sub.eq r7,r7,32 180 norm.f r11,r12 ; flag for x/0 -> Inf check 181 beq_s .Linf_NaN 182 mov.mi r11,0 183 add.pl r11,r11,1 184 add_s r12,r12,r12 185 asl r8,r12,r11 186 rsub r12,r11,31 187 lsr r12,DBL1L,r12 188 tst_s DBL1H,DBL1H 189 or r8,r8,r12 190 lsr r4,r8,26 191 lsr DBL1H,r8,12 192 ld.as r4,[r10,r4] 193 bxor.mi DBL1H,DBL1H,31 194 sub r11,r11,11 195 asl DBL1L,DBL1L,r11 196 sub r11,r11,1 197 mulu64 (r4,r8) 198 sub r7,r7,r11 199 b.d .Lpast_denorm_dbl1 200 asl r7,r7,20 201 202.Linf_NaN: 203 tst_s DBL0L,DBL0L ; 0/0 -> NaN 204 xor_s DBL1H,DBL1H,DBL0H 205 bclr.eq.f DBL0H,DBL0H,31 206 bmsk DBL0H,DBL1H,30 207 xor_s DBL0H,DBL0H,DBL1H 208 sub.eq DBL0H,DBL0H,1 209 mov_s DBL0L,0 210 j_s.d [blink] 211 or DBL0H,DBL0H,r9 212 .balign 4 213.Lret0_2: 214 xor_s DBL1H,DBL1H,DBL0H 215 mov_s DBL0L,0 216 bmsk DBL0H,DBL1H,30 217 j_s.d [blink] 218 xor_s DBL0H,DBL0H,DBL1H 219 .balign 4 220 .global __divdf3 221/* N.B. the spacing between divtab and the sub3 to get its address must 222 be a multiple of 8. */ 223__divdf3: 224 asl r8,DBL1H,12 225 lsr r4,r8,26 226 sub3 r10,pcl,51;(.-.Ldivtab) >> 3 227 ld.as r9,[pcl,-104]; [pcl,(-((.-.L7ff00000) >> 2))] ; 0x7ff00000 228 ld.as r4,[r10,r4] 229 lsr r12,DBL1L,20 230 and.f r7,DBL1H,r9 231 or r8,r8,r12 232 mulu64 (r4,r8) 233 beq.d .Ldenorm_dbl1 234.Lpast_denorm_dbl1: 235 and.f r6,DBL0H,r9 236 breq.d r7,r9,.Linf_nan_dbl1 237 asl r4,r4,12 238 sub r4,r4,mhi 239 mululw 0,r4,r4 240 machulw r5,r4,r4 241 bne.d .Lnormal_dbl0 242 lsr r8,r8,1 243 244 .balign 4 245.Ldenorm_dbl0: 246 bmsk.f r12,DBL0H,19 247 ; wb stall 248 mov.eq r12,DBL0L 249 sub.eq r6,r6,32 250 norm.f r11,r12 ; flag for 0/x -> 0 check 251 brge r7, \ 252 0x43500000, .Lret0_2 ; denorm/large number -> 0 253 beq_s .Lret0_2 254 mov.mi r11,0 255 add.pl r11,r11,1 256 asl r12,r12,r11 257 sub r6,r6,r11 258 add.f 0,r6,31 259 lsr r10,DBL0L,r6 260 mov.mi r10,0 261 add r6,r6,11+32 262 neg.f r11,r6 263 asl DBL0L,DBL0L,r11 264 mov.pl DBL0L,0 265 sub r6,r6,32-1 266 b.d .Lpast_denorm_dbl0 267 asl r6,r6,20 268 269 .balign 4 270.Linf_nan_dbl1: ; 0/Inf -> NaN Inf/Inf -> NaN x/Inf-> 0 x/NaN -> NaN 271 or.f 0,r6,DBL0L 272 cmp.ne r6,r9 273 not_s DBL0L,DBL1H 274 sub_s.ne DBL0L,DBL0L,DBL0L 275 tst_s DBL0H,DBL0H 276 add_s DBL0H,DBL1H,DBL0L 277 j_s.d [blink] 278 bxor.mi DBL0H,DBL0H,31 279 280 .balign 4 281.Lnormal_dbl0: 282 breq.d r6,r9,.Linf_nan_dbl0 283 asl r12,DBL0H,11 284 lsr r10,DBL0L,21 285.Lpast_denorm_dbl0: 286 bset r8,r8,31 287 mulu64 (r5,r8) 288 add_s r12,r12,r10 289 bset r5,r12,31 290 cmp r5,r8 291 cmp.eq DBL0L,DBL1L 292 lsr.cc r5,r5,1 293 sub r4,r4,mhi ; u1.31 inverse, about 30 bit 294 mululw 0,r5,r4 295 machulw r11,r5,r4 ; result fraction highpart 296 lsr r8,r8,2 ; u3.29 297 add r5,r6, /* wait for immediate */ \ 298 0x3fe00000 299 mulu64 (r11,r8) ; u-28.31 300 asl_s DBL1L,DBL1L,9 ; u-29.23:9 301 sbc r6,r5,r7 302 mov r12,mlo ; u-28.31 303 mulu64 (r11,DBL1L) ; mhi: u-28.23:9 304 add.cs DBL0L,DBL0L,DBL0L 305 asl_s DBL0L,DBL0L,6 ; u-26.25:7 306 asl r10,r11,23 307 sub_l DBL0L,DBL0L,r12 308 lsr r7,r11,9 309 sub r5,DBL0L,mhi ; rest msw ; u-26.31:0 310 mul64 (r5,r4) ; mhi: result fraction lowpart 311 xor.f 0,DBL0H,DBL1H 312 and DBL0H,r6,r9 313 add_s DBL0H,DBL0H,r7 314 bclr r12,r9,20 ; 0x7fe00000 315 brhs.d r6,r12,.Linf_denorm 316 bxor.mi DBL0H,DBL0H,31 317 add.f r12,mhi,0x11 318 asr r9,r12,5 319 sub.mi DBL0H,DBL0H,1 320 add.f DBL0L,r9,r10 321 tst r12,0x1c 322 jne.d [blink] 323 add.cs DBL0H,DBL0H,1 324 /* work out exact rounding if we fall through here. */ 325 /* We know that the exact result cannot be represented in double 326 precision. Find the mid-point between the two nearest 327 representable values, multiply with the divisor, and check if 328 the result is larger than the dividend. Since we want to know 329 only the sign bit, it is sufficient to calculate only the 330 highpart of the lower 64 bits. */ 331 mulu64 (r11,DBL1L) ; rest before considering r12 in r5 : -mlo 332 sub.f DBL0L,DBL0L,1 333 asl r12,r9,2 ; u-22.30:2 334 sub.cs DBL0H,DBL0H,1 335 sub.f r12,r12,2 336 mov r10,mlo ; rest before considering r12 in r5 : -r10 337 mululw 0,r12,DBL1L 338 machulw r7,r12,DBL1L ; mhi: u-51.32 339 asl r5,r5,25 ; s-51.7:25 340 lsr r10,r10,7 ; u-51.30:2 341 mulu64 (r12,r8) ; mlo: u-51.31:1 342 sub r5,r5,r10 343 add.mi r5,r5,DBL1L ; signed multiply adjust for r12*DBL1L 344 bset r7,r7,0 ; make sure that the result is not zero, and that 345 sub r5,r5,r7 ; a highpart zero appears negative 346 sub.f r5,r5,mlo ; rest msw 347 add.pl.f DBL0L,DBL0L,1 348 j_s.d [blink] 349 add.eq DBL0H,DBL0H,1 350 351.Linf_nan_dbl0: 352 tst_s DBL1H,DBL1H 353 j_s.d [blink] 354 bxor.mi DBL0H,DBL0H,31 355 .balign 4 356.Linf_denorm: 357 lsr r12,r6,28 358 brlo.d r12,0xc,.Linf 359.Ldenorm: 360 asr r6,r6,20 361 neg r9,r6 362 mov_s DBL0H,0 363 brhs.d r9,54,.Lret0 364 bxor.mi DBL0H,DBL0H,31 365 add r12,mhi,1 366 and r12,r12,-4 367 rsub r7,r6,5 368 asr r10,r12,28 369 bmsk r4,r12,27 370 min r7,r7,31 371 asr DBL0L,r4,r7 372 add DBL1H,r11,r10 373 abs.f r10,r4 374 sub.mi r10,r10,1 375 add.f r7,r6,32-5 376 asl r4,r4,r7 377 mov.mi r4,r10 378 add.f r10,r6,23 379 rsub r7,r6,9 380 lsr r7,DBL1H,r7 381 asl r10,DBL1H,r10 382 or.pnz DBL0H,DBL0H,r7 383 or.mi r4,r4,r10 384 mov.mi r10,r7 385 add.f DBL0L,r10,DBL0L 386 add.cs.f DBL0H,DBL0H,1 ; carry clear after this point 387 bxor.f 0,r4,31 388 add.pnz.f DBL0L,DBL0L,1 389 add.cs.f DBL0H,DBL0H,1 390 jne_s [blink] 391 /* Calculation so far was not conclusive; calculate further rest. */ 392 mulu64 (r11,DBL1L) ; rest before considering r12 in r5 : -mlo 393 asr.f r12,r12,3 394 asl r5,r5,25 ; s-51.7:25 395 mov r11,mlo ; rest before considering r12 in r5 : -r11 396 mulu64 (r12,r8) ; u-51.31:1 397 and r9,DBL0L,1 ; tie-breaker: round to even 398 lsr r11,r11,7 ; u-51.30:2 399 mov DBL1H,mlo ; u-51.31:1 400 mulu64 (r12,DBL1L) ; u-51.62:2 401 sub.mi r11,r11,DBL1L ; signed multiply adjust for r12*DBL1L 402 add_s DBL1H,DBL1H,r11 403 sub DBL1H,DBL1H,r5 ; -rest msw 404 add_s DBL1H,DBL1H,mhi ; -rest msw 405 add.f 0,DBL1H,DBL1H ; can't ror.f by 32 :-( 406 tst_s DBL1H,DBL1H 407 cmp.eq mlo,r9 408 add.cs.f DBL0L,DBL0L,1 409 j_s.d [blink] 410 add.cs DBL0H,DBL0H,1 411 412.Lret0: 413 /* return +- 0 */ 414 j_s.d [blink] 415 mov_s DBL0L,0 416.Linf: 417 mov_s DBL0H,r9 418 mov_s DBL0L,0 419 j_s.d [blink] 420 bxor.mi DBL0H,DBL0H,31 421 ENDFUNC(__divdf3) 422