/* lib1funcs-Os-4-200.S revision 1.1.1.9 */
/* Copyright (C) 2006-2020 Free Software Foundation, Inc.

This file is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

This file is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */

/* Moderately Space-optimized libgcc routines for the Renesas SH /
   STMicroelectronics ST40 CPUs.
   Contributed by J"orn Rennecke joern.rennecke@st.com. */

/* Overview of this file.

   Two entry points are provided, each in up to two variants that are
   selected by preprocessor symbols:

     udivsi3_i4i  - 32-bit unsigned division, r0 = r4 / r5
     sdivsi3_i4i  - 32-bit signed division,   r0 = r4 / r5

   FPU variant (__SH_FPU_DOUBLE__ or __SH4_SINGLE_ONLY__): the operands
   are converted to double, divided with fdiv and truncated with ftrc.
   dr0, dr2, fpscr and fpul are saved and restored; r1 is clobbered.

   Integer variant (neither symbol defined, and not __sh1__): built from
   div1 steps.  r4 and r5 are pushed on entry and popped before return;
   r1 is clobbered, pr is overwritten by the internal bsr calls, and the
   negative-result path of sdivsi3_i4i parks r2 in macl.

   GLOBAL, LOCAL, FUNC, ENDFUNC and the DRxx register names are not
   defined here; they are expected to come from lib1funcs.h.

   Reminder for readers: on SH the instruction that follows a delayed
   branch (bt/s, bf/s, bra, bsr, jmp, rts) is executed before the branch
   takes effect.  Several comments below are marked "(delay slot)".  */

#include "lib1funcs.h"

#ifdef L_udivsi3_i4i

/* 88 bytes; sh4-200 cycle counts:
   divisor >= 2G: 11 cycles
   dividend < 2G: 48 cycles
   dividend >= 2G, divisor != 1: 54 cycles
   dividend >= 2G, divisor == 1: 22 cycles */
#if defined (__SH_FPU_DOUBLE__) || defined (__SH4_SINGLE_ONLY__)
!! args in r4 and r5, result in r0, clobber r1

/* Unsigned division through the FPU.
   float/ftrc only handle signed 32-bit integers, so three cases are
   distinguished:
     - divisor >= 2^31: the quotient can only be 0 or 1 (huge_divisor).
     - dividend < 2^31: plain signed conversion is already correct.
     - dividend >= 2^31: the signed conversion is 2^32 too small, so 2^32
       is added, except when the divisor is 1, where the division is
       skipped and ftrc of the unadjusted value is returned directly.
   Stack layout while dividing (top first): dr2, dr0, caller fpscr,
   caller fpul.  */

	.global GLOBAL(udivsi3_i4i)
	FUNC(GLOBAL(udivsi3_i4i))
GLOBAL(udivsi3_i4i):
	mova L1,r0		/* r0 -> constant table at L1 */
	cmp/pz r5		/* T = divisor has bit 31 clear */
	sts fpscr,r1		/* r1 = caller fpscr */
	lds.l @r0+,fpscr	/* fpscr = first table word; r0 -> .double */
	sts.l fpul,@-r15	/* push caller fpul */
	bf LOCAL(huge_divisor)
	mov.l r1,@-r15		/* push caller fpscr */
	lds r4,fpul
	cmp/pz r4		/* T = dividend has bit 31 clear */
#ifdef FMOVD_WORKS
	fmov.d dr0,@-r15	/* save dr0 */
	float fpul,dr0		/* dr0 = (double) (signed) dividend */
	fmov.d dr2,@-r15	/* save dr2 */
	bt LOCAL(dividend_adjusted)
	mov #1,r1
	fmov.d @r0,dr2		/* dr2 = 4294967296.0 */
	cmp/eq r1,r5
	bt LOCAL(div_by_1)	/* divisor == 1: skip the adjust and fdiv */
	fadd dr2,dr0		/* undo the signed interpretation: += 2^32 */
LOCAL(dividend_adjusted):
	lds r5,fpul
	float fpul,dr2		/* dr2 = (double) divisor */
	fdiv dr2,dr0
LOCAL(div_by_1):
	fmov.d @r15+,dr2	/* restore dr2 */
	ftrc dr0,fpul		/* fpul = truncated quotient */
	fmov.d @r15+,dr0	/* restore dr0 */
#else /* !FMOVD_WORKS */
	/* Same algorithm, but dr0/dr2 are moved as two single-precision
	   halves each.  NOTE(review): DR00/DR01/DR20/DR21 presumably name
	   the halves of dr0/dr2 in memory order; confirm in lib1funcs.h.  */
	fmov.s DR01,@-r15
	mov #1,r1
	fmov.s DR00,@-r15
	float fpul,dr0		/* dr0 = (double) (signed) dividend */
	fmov.s DR21,@-r15
	bt/s LOCAL(dividend_adjusted)
	fmov.s DR20,@-r15	/* (delay slot) finish saving dr2 */
	cmp/eq r1,r5
	bt LOCAL(div_by_1)	/* divisor == 1: skip the adjust and fdiv */
	fmov.s @r0+,DR20	/* load the 2^32 constant in two halves */
	fmov.s @r0,DR21
	fadd dr2,dr0		/* undo the signed interpretation: += 2^32 */
LOCAL(dividend_adjusted):
	lds r5,fpul
	float fpul,dr2		/* dr2 = (double) divisor */
	fdiv dr2,dr0
LOCAL(div_by_1):
	fmov.s @r15+,DR20	/* restore dr2 */
	fmov.s @r15+,DR21
	ftrc dr0,fpul		/* fpul = truncated quotient */
	fmov.s @r15+,DR00	/* restore dr0 */
	fmov.s @r15+,DR01
#endif /* !FMOVD_WORKS */
	lds.l @r15+,fpscr	/* restore caller fpscr */
	sts fpul,r0		/* r0 = quotient */
	rts
	lds.l @r15+,fpul	/* (delay slot) restore caller fpul */

#ifdef FMOVD_WORKS
	.p2align 3 ! make double below 8 byte aligned.
#endif
/* Divisor has bit 31 set: the quotient is 1 if dividend >= divisor
   (unsigned), otherwise 0.  Only fpul has been pushed at this point and
   it has not been modified, so its stack slot is simply dropped.  */
LOCAL(huge_divisor):
	lds r1,fpscr		/* restore caller fpscr */
	add #4,r15		/* discard the pushed fpul */
	cmp/hs r5,r4		/* T = (r4 >= r5), unsigned */
	rts
	movt r0			/* (delay slot) r0 = T */

/* Constant table: fpscr value used during the division, followed by
   2^32 as a double.
   NOTE(review): 0x80000 looks like FPSCR.PR (double precision) and
   0x180000 like PR|SZ (additionally 64-bit fmov.d transfers); confirm
   against the SH-4 manual.  */
	.p2align 2
L1:
#ifndef FMOVD_WORKS
	.long 0x80000
#else
	.long 0x180000
#endif
	.double 4294967296

	ENDFUNC(GLOBAL(udivsi3_i4i))
#elif !defined (__sh1__) /* !__SH_FPU_DOUBLE__ */

#if 0
/* With 36 bytes, the following would probably be the most compact
   implementation, but with 139 cycles on an sh4-200, it is extremely slow.
   */
GLOBAL(udivsi3_i4i):
	mov.l r2,@-r15
	mov #0,r1
	div0u
	mov r1,r2
	mov.l r3,@-r15
	mov r1,r3
	sett
	mov r4,r0
LOCAL(loop):
	rotcr r2
	;
	bt/s LOCAL(end)
	cmp/gt r2,r3
	rotcl r0
	bra LOCAL(loop)
	div1 r5,r1
LOCAL(end):
	rotcl r0
	mov.l @r15+,r3
	rts
	mov.l @r15+,r2
#endif /* 0 */

/* Size: 186 bytes jointly for udivsi3_i4i and sdivsi3_i4i
   sh4-200 run times:
   udiv small divisor: 55 cycles
   udiv large divisor: 52 cycles
   sdiv small divisor, positive result: 59 cycles
   sdiv large divisor, positive result: 56 cycles
   sdiv small divisor, negative result: 65 cycles (*)
   sdiv large divisor, negative result: 62 cycles (*)
   (*): r2 is restored in the rts delay slot and has a lingering latency
   of two more cycles. */

/* Integer-only unsigned division.
   In:   r4 = dividend, r5 = divisor.
   Out:  r0 = quotient.  r4 and r5 are pushed on entry and popped again
         before returning.
   Uses: r1 holds the address to jump to when the division is done (the
         caller return address here; sdivsi3_i4i may substitute its own
         fix-up code).  pr is overwritten by the bsr calls below.
   The divisor is classified as "small" when it fits in 16 bits, and the
   two cases use different div1 schedules.  */
	.balign 4
	.global GLOBAL(udivsi3_i4i)
	FUNC(GLOBAL(udivsi3_i4i))
	FUNC(GLOBAL(sdivsi3_i4i))
GLOBAL(udivsi3_i4i):
	sts pr,r1		/* r1 = return address; bsr clobbers pr */
	mov.l r4,@-r15		/* save dividend */
	extu.w r5,r0
	cmp/eq r5,r0		/* T = divisor fits in 16 bits */
	swap.w r4,r0		/* r0 = dividend with its halves swapped */
	shlr16 r4		/* r4 = upper half of the dividend */
	bf/s LOCAL(large_divisor)
	div0u			/* (delay slot) set up unsigned div1 sequence */
	mov.l r5,@-r15		/* save divisor */
	shll16 r5		/* align the 16-bit divisor for div1 */
/* Entered here from sdivsi3_i4i as well, with r4/r5 already pushed,
   r5 shifted, r0/r4 prepared as above and r1 set to the exit address.  */
LOCAL(sdiv_small_divisor):
	/* First 16 div1 steps: 2 x (div1 + delay-slot div1 + div6).  */
	div1 r5,r4
	bsr LOCAL(div6)
	div1 r5,r4
	div1 r5,r4
	bsr LOCAL(div6)
	div1 r5,r4
	/* Regroup partial results between r0 and r4 before the second
	   half.  NOTE(review): the exact bit layout after these xtrct /
	   swap.w steps is not spelled out here; trace before changing.  */
	xtrct r4,r0
	xtrct r0,r4
	bsr LOCAL(div7)
	swap.w r4,r4		/* (delay slot) */
	/* Second 16 div1 steps: div7 above + div1 + delay-slot div1 + div7.  */
	div1 r5,r4
	bsr LOCAL(div7)
	div1 r5,r4
	xtrct r4,r0
	mov.l @r15+,r5		/* restore divisor */
	swap.w r0,r0
	mov.l @r15+,r4		/* restore dividend */
	jmp @r1			/* return, or continue at the fix-up in r1 */
	rotcl r0		/* (delay slot) shift in the last quotient bit */
/* Helpers: seven resp. six consecutive div1 steps; the last step sits
   in the rts delay slot.  */
LOCAL(div7):
	div1 r5,r4
LOCAL(div6):
	div1 r5,r4; div1 r5,r4; div1 r5,r4
	div1 r5,r4; div1 r5,r4; rts; div1 r5,r4

/* Helper for the large-divisor path: three (rotcl r0, div1) pairs, each
   rotcl moving the T bit left by the preceding div1 into r0.  */
LOCAL(divx3):
	rotcl r0
	div1 r5,r4
	rotcl r0
	div1 r5,r4
	rotcl r0
	rts
	div1 r5,r4		/* (delay slot) */

LOCAL(large_divisor):
	mov.l r5,@-r15		/* save divisor */
/* Entered here from sdivsi3_i4i as well, with r4/r5 already pushed.  */
LOCAL(sdiv_large_divisor):
	xor r4,r0		/* r0 = lower half of the dividend << 16 */
	/* 4 x (rotcl + delay-slot div1 + divx3) = 16 div1 steps.  */
	.rept 4
	rotcl r0
	bsr LOCAL(divx3)
	div1 r5,r4
	.endr
	mov.l @r15+,r5		/* restore divisor */
	mov.l @r15+,r4		/* restore dividend */
	jmp @r1			/* return, or continue at the fix-up in r1 */
	rotcl r0		/* (delay slot) shift in the last quotient bit */
	ENDFUNC(GLOBAL(udivsi3_i4i))

/* Integer-only signed division.
   In:   r4 = dividend, r5 = divisor.
   Out:  r0 = quotient.  r4 and r5 are pushed on entry and popped by the
         shared unsigned code before returning.
   Both operands are replaced by their negation where negative, then the
   unsigned code above is reused.  When exactly one operand was negative,
   r1 is pointed at negate_result so the quotient is negated on the way
   out; on that path r2 carries the real return address and the caller
   value of r2 is parked in macl (so macl is overwritten).  */
	.global GLOBAL(sdivsi3_i4i)
GLOBAL(sdivsi3_i4i):
	mov.l r4,@-r15		/* save dividend */
	cmp/pz r5		/* T = divisor >= 0 */
	mov.l r5,@-r15		/* save divisor */
	bt/s LOCAL(pos_divisor)
	cmp/pz r4		/* (delay slot) T = dividend >= 0 */
	neg r5,r5		/* divisor was negative: negate it */
	extu.w r5,r0
	bt/s LOCAL(neg_result)	/* dividend >= 0, divisor < 0 */
	cmp/eq r5,r0		/* (delay slot) T = divisor fits in 16 bits */
	neg r4,r4		/* both negative: negate the dividend too */
LOCAL(pos_result):
	swap.w r4,r0		/* r0 = dividend with its halves swapped */
	bra LOCAL(sdiv_check_divisor)
	sts pr,r1		/* (delay slot) exit straight to the caller */
LOCAL(pos_divisor):
	extu.w r5,r0
	bt/s LOCAL(pos_result)	/* dividend >= 0, divisor >= 0 */
	cmp/eq r5,r0		/* (delay slot) T = divisor fits in 16 bits */
	neg r4,r4		/* dividend was negative: negate it */
LOCAL(neg_result):
	mova LOCAL(negate_result),r0
	;
	mov r0,r1		/* exit through negate_result */
	swap.w r4,r0		/* r0 = dividend with its halves swapped */
	lds r2,macl		/* park caller r2 in macl */
	sts pr,r2		/* r2 = real return address */
LOCAL(sdiv_check_divisor):
	shlr16 r4		/* r4 = upper half of the dividend */
	bf/s LOCAL(sdiv_large_divisor)
	div0u			/* (delay slot) set up unsigned div1 sequence */
	bra LOCAL(sdiv_small_divisor)
	shll16 r5		/* (delay slot) align the 16-bit divisor */
	.balign 4		/* mova above needs a 4-byte aligned target */
LOCAL(negate_result):
	neg r0,r0		/* quotient = -quotient */
	jmp @r2			/* return to the caller */
	sts macl,r2		/* (delay slot) restore caller r2 */
	ENDFUNC(GLOBAL(sdivsi3_i4i))
#endif /* !__SH_FPU_DOUBLE__ */
#endif /* L_udivsi3_i4i */

#ifdef L_sdivsi3_i4i
#if defined (__SH_FPU_DOUBLE__) || defined (__SH4_SINGLE_ONLY__)
/* 48 bytes, 45 cycles on sh4-200 */
!! args in r4 and r5, result in r0, clobber r1

/* Signed division through the FPU: both operands are converted to
   double, divided, and the quotient is truncated back with ftrc.
   fpscr is saved on the stack, fpul in r1; dr0 and dr2 are saved on the
   stack and restored before returning.  */

	.global GLOBAL(sdivsi3_i4i)
	FUNC(GLOBAL(sdivsi3_i4i))
GLOBAL(sdivsi3_i4i):
	sts.l fpscr,@-r15	/* push caller fpscr */
	sts fpul,r1		/* r1 = caller fpul */
	mova L1,r0
	lds.l @r0+,fpscr	/* fpscr = table word at L1 */
	lds r4,fpul
#ifdef FMOVD_WORKS
	fmov.d dr0,@-r15	/* save dr0 */
	float fpul,dr0		/* dr0 = (double) dividend */
	lds r5,fpul
	fmov.d dr2,@-r15	/* save dr2 */
#else
	fmov.s DR01,@-r15	/* save dr0 as two halves */
	fmov.s DR00,@-r15
	float fpul,dr0		/* dr0 = (double) dividend */
	lds r5,fpul
	fmov.s DR21,@-r15	/* save dr2 as two halves */
	fmov.s DR20,@-r15
#endif
	float fpul,dr2		/* dr2 = (double) divisor */
	fdiv dr2,dr0
#ifdef FMOVD_WORKS
	fmov.d @r15+,dr2	/* restore dr2 */
#else
	fmov.s @r15+,DR20	/* restore dr2 */
	fmov.s @r15+,DR21
#endif
	ftrc dr0,fpul		/* fpul = truncated quotient */
#ifdef FMOVD_WORKS
	fmov.d @r15+,dr0	/* restore dr0 */
#else
	fmov.s @r15+,DR00	/* restore dr0 */
	fmov.s @r15+,DR01
#endif
	lds.l @r15+,fpscr	/* restore caller fpscr */
	sts fpul,r0		/* r0 = quotient */
	rts
	lds r1,fpul		/* (delay slot) restore caller fpul */

/* fpscr value used during the division; same values as in the
   udivsi3_i4i table.  */
	.p2align 2
L1:
#ifndef FMOVD_WORKS
	.long 0x80000
#else
	.long 0x180000
#endif

	ENDFUNC(GLOBAL(sdivsi3_i4i))
#endif /* __SH_FPU_DOUBLE__ */
#endif /* L_sdivsi3_i4i */