divdf3.S revision 1.5
1/* Copyright (C) 2008-2019 Free Software Foundation, Inc. 2 Contributor: Joern Rennecke <joern.rennecke@embecosm.com> 3 on behalf of Synopsys Inc. 4 5This file is part of GCC. 6 7GCC is free software; you can redistribute it and/or modify it under 8the terms of the GNU General Public License as published by the Free 9Software Foundation; either version 3, or (at your option) any later 10version. 11 12GCC is distributed in the hope that it will be useful, but WITHOUT ANY 13WARRANTY; without even the implied warranty of MERCHANTABILITY or 14FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 15for more details. 16 17Under Section 7 of GPL version 3, you are granted additional 18permissions described in the GCC Runtime Library Exception, version 193.1, as published by the Free Software Foundation. 20 21You should have received a copy of the GNU General Public License and 22a copy of the GCC Runtime Library Exception along with this program; 23see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 24<http://www.gnu.org/licenses/>. */ 25 26/* 27 to calculate a := b/x as b*y, with y := 1/x: 28 - x is in the range [1..2) 29 - calculate 15..18 bit inverse y0 using a table of approximating polynoms. 30 Precision is higher for polynoms used to evaluate input with larger 31 value. 32 - Do one newton-raphson iteration step to double the precision, 33 then multiply this with the divisor 34 -> more time to decide if dividend is subnormal 35 - the worst error propagation is on the side of the value range 36 with the least initial defect, thus giving us about 30 bits precision. 37 The truncation error for the either is less than 1 + x/2 ulp. 38 A 31 bit inverse can be simply calculated by using x with implicit 1 39 and chaining the multiplies. For a 32 bit inverse, we multiply y0^2 40 with the bare fraction part of x, then add in y0^2 for the implicit 41 1 of x. 42 - If calculating a 31 bit inverse, the systematic error is less than 43 -1 ulp; likewise, for 32 bit, it is less than -2 ulp. 44 - If we calculate our seed with a 32 bit fraction, we can archive a 45 tentative result strictly better than -2 / +2.5 (1) ulp/128, i.e. we 46 only need to take the step to calculate the 2nd stage rest and 47 rounding adjust 1/32th of the time. However, if we use a 20 bit 48 fraction for the seed, the negative error can exceed -2 ulp/128, (2) 49 thus for a simple add / tst check, we need to do the 2nd stage 50 rest calculation/ rounding adjust 1/16th of the time. 51 (1): The inexactness of the 32 bit inverse contributes an error in the 52 range of (-1 .. +(1+x/2) ) ulp/128. Leaving out the low word of the 53 rest contributes an error < +1/x ulp/128 . In the interval [1,2), 54 x/2 + 1/x <= 1.5 . 55 (2): Unless proven otherwise. I have not actually looked for an 56 example where -2 ulp/128 is exceeded, and my calculations indicate 57 that the excess, if existent, is less than -1/512 ulp. 58 */ 59#include "arc-ieee-754.h" 60 61/* N.B. fp-bit.c does double rounding on denormal numbers. */ 62#if 0 /* DEBUG */ 63 .global __divdf3 64 FUNC(__divdf3) 65 .balign 4 66__divdf3: 67 push_s blink 68 push_s r2 69 push_s r3 70 push_s r0 71 bl.d __divdf3_c 72 push_s r1 73 ld_s r2,[sp,12] 74 ld_s r3,[sp,8] 75 st_s r0,[sp,12] 76 st_s r1,[sp,8] 77 pop_s r1 78 bl.d __divdf3_asm 79 pop_s r0 80 pop_s r3 81 pop_s r2 82 pop_s blink 83 cmp r0,r2 84 cmp.eq r1,r3 85 jeq_s [blink] 86 and r12,DBL0H,DBL1H 87 bic.f 0,0x7ff80000,r12 ; both NaN -> OK 88 jeq_s [blink] 89 bl abort 90 ENDFUNC(__divdf3) 91#define __divdf3 __divdf3_asm 92#endif /* DEBUG */ 93 94 FUNC(__divdf3) 95__divdf3_support: /* This label makes debugger output saner. */ 96 .balign 4 97.Ldenorm_dbl1: 98 brge r6, \ 99 0x43500000,.Linf_NaN ; large number / denorm -> Inf 100 bmsk.f r12,DBL1H,19 101 mov.eq r12,DBL1L 102 mov.eq DBL1L,0 103 sub.eq r7,r7,32 104 norm.f r11,r12 ; flag for x/0 -> Inf check 105 beq_s .Linf_NaN 106 mov.mi r11,0 107 add.pl r11,r11,1 108 add_s r12,r12,r12 109 asl r8,r12,r11 110 rsub r12,r11,31 111 lsr r12,DBL1L,r12 112 tst_s DBL1H,DBL1H 113 or r8,r8,r12 114 lsr r4,r8,26 115 lsr DBL1H,r8,12 116 ld.as r4,[r10,r4] 117 bxor.mi DBL1H,DBL1H,31 118 sub r11,r11,11 119 asl DBL1L,DBL1L,r11 120 sub r11,r11,1 121 MPYHU r5,r4,r8 122 sub r7,r7,r11 123 asl r4,r4,12 124 b.d .Lpast_denorm_dbl1 125 asl r7,r7,20 126 ; wb stall 127 128 .balign 4 129.Ldenorm_dbl0: 130 bmsk.f r12,DBL0H,19 131 ; wb stall 132 mov.eq r12,DBL0L 133 sub.eq r6,r6,32 134 norm.f r11,r12 ; flag for 0/x -> 0 check 135 brge r7, \ 136 0x43500000, .Lret0_NaN ; denorm/large number -> 0 137 beq_s .Lret0_NaN 138 mov.mi r11,0 139 add.pl r11,r11,1 140 asl r12,r12,r11 141 sub r6,r6,r11 142 add.f 0,r6,31 143 lsr r10,DBL0L,r6 144 mov.mi r10,0 145 add r6,r6,11+32 146 neg.f r11,r6 147 asl DBL0L,DBL0L,r11 148 mov.pl DBL0L,0 149 sub r6,r6,32-1 150 b.d .Lpast_denorm_dbl0 151 asl r6,r6,20 152 153.Linf_NaN: 154 tst_s DBL0L,DBL0L ; 0/0 -> NaN 155 xor_s DBL1H,DBL1H,DBL0H 156 bclr.eq.f DBL0H,DBL0H,31 157 bmsk DBL0H,DBL1H,30 158 xor_s DBL0H,DBL0H,DBL1H 159 sub.eq DBL0H,DBL0H,1 160 mov_s DBL0L,0 161 j_s.d [blink] 162 or DBL0H,DBL0H,r9 163 .balign 4 164.Lret0_NaN: 165 xor_s DBL1H,DBL1H,DBL0H 166 cmp_s r12,r9 167 mov_s DBL0L,0 168 bmsk DBL0H,DBL1H,30 169 xor_s DBL0H,DBL0H,DBL1H 170 j_s.d [blink] 171 sub.hi DBL0H,DBL0H,1 172.Linf_nan_dbl1: ; Inf/Inf -> NaN x/Inf-> 0 x/NaN -> NaN 173 not_s DBL0L,DBL1H 174 cmp r6,r9 175 sub_s.ne DBL0L,DBL0L,DBL0L 176 tst_s DBL0H,DBL0H 177 add_s DBL0H,DBL1H,DBL0L 178 j_s.d [blink] 179 bxor.mi DBL0H,DBL0H,31 180.Linf_nan_dbl0: 181 tst_s DBL1H,DBL1H 182 j_s.d [blink] 183 bxor.mi DBL0H,DBL0H,31 184 .balign 4 185 .global __divdf3 186/* N.B. the spacing between divtab and the add3 to get its address must 187 be a multiple of 8. */ 188__divdf3: 189 asl r8,DBL1H,12 190 lsr r12,DBL1L,20 191 lsr r4,r8,26 192#if defined (__ARCHS__) || defined (__ARCEM__) 193 add3 r10,pcl,60 ; (.Ldivtab-.) >> 3 194#else 195 add3 r10,pcl,59 ; (.Ldivtab-.) >> 3 196#endif 197 ld.as r4,[r10,r4] 198#if defined (__ARCHS__) || defined (__ARCEM__) 199 ld.as r9,[pcl,182]; [pcl,(-((.-.L7ff00000) >> 2))] ; 0x7ff00000 200#else 201 ld.as r9,[pcl,180]; [pcl,(-((.-.L7ff00000) >> 2))] ; 0x7ff00000 202#endif 203 or r8,r8,r12 204 MPYHU r5,r4,r8 205 and.f r7,DBL1H,r9 206 asl r4,r4,12 ; having the asl here is a concession to the XMAC pipeline. 207 beq.d .Ldenorm_dbl1 208 and r6,DBL0H,r9 209.Lpast_denorm_dbl1: ; wb stall 210 sub r4,r4,r5 211 MPYHU r5,r4,r4 212 breq.d r6,0,.Ldenorm_dbl0 213 lsr r8,r8,1 214 asl r12,DBL0H,11 215 lsr r10,DBL0L,21 216.Lpast_denorm_dbl0: ; wb stall 217 bset r8,r8,31 218 MPYHU r11,r5,r8 219 add_s r12,r12,r10 220 bset r5,r12,31 221 cmp r5,r8 222 cmp.eq DBL0L,DBL1L 223 ; wb stall 224 lsr.cc r5,r5,1 225 sub r4,r4,r11 ; u1.31 inverse, about 30 bit 226 MPYHU r11,r5,r4 ; result fraction highpart 227 breq r7,r9,.Linf_nan_dbl1 228 lsr r8,r8,2 ; u3.29 229 add r5,r6, /* wait for immediate / XMAC wb stall */ \ 230 0x3fe00000 231 ; wb stall (not for XMAC) 232 breq r6,r9,.Linf_nan_dbl0 233 mpyu r12,r11,r8 ; u-28.31 234 asl_s DBL1L,DBL1L,9 ; u-29.23:9 235 sbc r6,r5,r7 236 ; resource conflict (not for XMAC) 237 MPYHU r5,r11,DBL1L ; u-28.23:9 238 add.cs DBL0L,DBL0L,DBL0L 239 asl_s DBL0L,DBL0L,6 ; u-26.25:7 240 asl r10,r11,23 241 sub_l DBL0L,DBL0L,r12 242 ; wb stall (before 'and' for XMAC) 243 lsr r7,r11,9 244 sub r5,DBL0L,r5 ; rest msw ; u-26.31:0 245 MPYH r12,r5,r4 ; result fraction lowpart 246 xor.f 0,DBL0H,DBL1H 247 and DBL0H,r6,r9 248 add_s DBL0H,DBL0H,r7 ; (XMAC wb stall) 249 bxor.mi DBL0H,DBL0H,31 250 brhs r6, /* wb stall / wait for immediate */ \ 251 0x7fe00000,.Linf_denorm 252 add.f r12,r12,0x11 253 asr r9,r12,5 254 sub.mi DBL0H,DBL0H,1 255 add.f DBL0L,r9,r10 256 tst r12,0x1c 257 jne.d [blink] 258 add.cs DBL0H,DBL0H,1 259 /* work out exact rounding if we fall through here. */ 260 /* We know that the exact result cannot be represented in double 261 precision. Find the mid-point between the two nearest 262 representable values, multiply with the divisor, and check if 263 the result is larger than the dividend. Since we want to know 264 only the sign bit, it is sufficient to calculate only the 265 highpart of the lower 64 bits. */ 266 sub.f DBL0L,DBL0L,1 267 asl r12,r9,2 ; u-22.30:2 268 mpyu r10,r11,DBL1L ; rest before considering r12 in r5 : -r10 269 sub.cs DBL0H,DBL0H,1 270 sub.f r12,r12,2 271 ; resource conflict (not for XMAC) 272 MPYHU r7,r12,DBL1L ; u-51.32 273 asl r5,r5,25 ; s-51.7:25 274 lsr r10,r10,7 ; u-51.30:2 275 ; resource conflict (not for XMAC) 276 ; resource conflict (not for XMAC) 277 mpyu r9,r12,r8 ; u-51.31:1 278 sub r5,r5,r10 279 add.mi r5,r5,DBL1L ; signed multiply adjust for r12*DBL1L 280 bset r7,r7,0 ; make sure that the result is not zero, and that 281 ; wb stall (one earlier for XMAC) 282 sub r5,r5,r7 ; a highpart zero appears negative 283 sub.f r5,r5,r9 ; rest msw 284 add.pl.f DBL0L,DBL0L,1 285 j_s.d [blink] 286 add.eq DBL0H,DBL0H,1 287 288 .balign 4 289.Linf_denorm: 290 brlo r6,0xc0000000,.Linf 291.Ldenorm: 292 asr r6,r6,20 293 neg r9,r6 294 mov_s DBL0H,0 295 brhs.d r9,54,.Lret0 296 bxor.mi DBL0H,DBL0H,31 297 add_l r12,r12,1 298 and r12,r12,-4 299 rsub r7,r6,5 300 asr r10,r12,28 301 bmsk r4,r12,27 302#if defined (__ARCHS__) || defined (__ARCEM__) 303 min r7, r7, 31 304 asr DBL0L, r4, r7 305#else 306 asrs DBL0L,r4,r7 307#endif 308 add DBL1H,r11,r10 309#if defined (__ARCHS__) || defined (__ARCEM__) 310 abs.f r10, r4 311 sub.mi r10, r10, 1 312#endif 313 add.f r7,r6,32-5 314#ifdef __ARC700__ 315 abss r10,r4 316#endif 317 asl r4,r4,r7 318 mov.mi r4,r10 319 add.f r10,r6,23 320 rsub r7,r6,9 321 lsr r7,DBL1H,r7 322 asl r10,DBL1H,r10 323 or.pnz DBL0H,DBL0H,r7 324 or.mi r4,r4,r10 325 mov.mi r10,r7 326 add.f DBL0L,r10,DBL0L 327 add.cs.f DBL0H,DBL0H,1 ; carry clear after this point 328 bxor.f 0,r4,31 329 add.pnz.f DBL0L,DBL0L,1 330 add.cs.f DBL0H,DBL0H,1 331 jne_l [blink] 332 /* Calculation so far was not conclusive; calculate further rest. */ 333 mpyu r11,r11,DBL1L ; rest before considering r12 in r5 : -r11 334 asr.f r12,r12,3 335 asl r5,r5,25 ; s-51.7:25 336 ; resource conflict (not for XMAC) 337 mpyu DBL1H,r12,r8 ; u-51.31:1 338 and r9,DBL0L,1 ; tie-breaker: round to even 339 lsr r11,r11,7 ; u-51.30:2 340 ; resource conflict (not for XMAC) 341 MPYHU r8,r12,DBL1L ; u-51.32 342 sub.mi r11,r11,DBL1L ; signed multiply adjust for r12*DBL1L 343 add_s DBL1H,DBL1H,r11 344 ; resource conflict (not for XMAC) 345 ; resource conflict (not for XMAC) 346 mpyu r12,r12,DBL1L ; u-83.30:2 347 sub DBL1H,DBL1H,r5 ; -rest msw 348 add_s DBL1H,DBL1H,r8 ; -rest msw 349 add.f 0,DBL1H,DBL1H ; can't ror.f by 32 :-( 350 ; wb stall (XMAC: Before add.f) 351 tst_s DBL1H,DBL1H 352 cmp.eq r12,r9 353 add.cs.f DBL0L,DBL0L,1 354 j_s.d [blink] 355 add.cs DBL0H,DBL0H,1 356 357.Lret0: 358 /* return +- 0 */ 359 j_s.d [blink] 360 mov_s DBL0L,0 361.Linf: 362 mov_s DBL0H,r9 363 mov_s DBL0L,0 364 j_s.d [blink] 365 bxor.mi DBL0H,DBL0H,31 366 367 .balign 4 368.Ldivtab: 369 .long 0xfc0fffe1 370 .long 0xf46ffdfb 371 .long 0xed1ffa54 372 .long 0xe61ff515 373 .long 0xdf7fee75 374 .long 0xd91fe680 375 .long 0xd2ffdd52 376 .long 0xcd1fd30c 377 .long 0xc77fc7cd 378 .long 0xc21fbbb6 379 .long 0xbcefaec0 380 .long 0xb7efa100 381 .long 0xb32f92bf 382 .long 0xae8f83b7 383 .long 0xaa2f7467 384 .long 0xa5ef6479 385 .long 0xa1cf53fa 386 .long 0x9ddf433e 387 .long 0x9a0f3216 388 .long 0x965f2091 389 .long 0x92df0f11 390 .long 0x8f6efd05 391 .long 0x8c1eeacc 392 .long 0x88eed876 393 .long 0x85dec615 394 .long 0x82eeb3b9 395 .long 0x800ea10b 396 .long 0x7d3e8e0f 397 .long 0x7a8e7b3f 398 .long 0x77ee6836 399 .long 0x756e5576 400 .long 0x72fe4293 401 .long 0x709e2f93 402 .long 0x6e4e1c7f 403 .long 0x6c0e095e 404 .long 0x69edf6c5 405 .long 0x67cde3a5 406 .long 0x65cdd125 407 .long 0x63cdbe25 408 .long 0x61ddab3f 409 .long 0x600d991f 410 .long 0x5e3d868c 411 .long 0x5c6d7384 412 .long 0x5abd615f 413 .long 0x590d4ecd 414 .long 0x576d3c83 415 .long 0x55dd2a89 416 .long 0x545d18e9 417 .long 0x52dd06e9 418 .long 0x516cf54e 419 .long 0x4ffce356 420 .long 0x4e9cd1ce 421 .long 0x4d3cbfec 422 .long 0x4becae86 423 .long 0x4aac9da4 424 .long 0x496c8c73 425 .long 0x483c7bd3 426 .long 0x470c6ae8 427 .long 0x45dc59af 428 .long 0x44bc4915 429 .long 0x43ac3924 430 .long 0x428c27fb 431 .long 0x418c187a 432 .long 0x407c07bd 433.L7ff00000: 434 .long 0x7ff00000 435 ENDFUNC(__divdf3) 436