/* lib1funcs-Os-4-200.S revision 1.1.1.3 */
/* Copyright (C) 2006-2016 Free Software Foundation, Inc.

This file is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

This file is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */

/* Moderately Space-optimized libgcc routines for the Renesas SH /
   STMicroelectronics ST40 CPUs.
   Contributed by J"orn Rennecke joern.rennecke@st.com. */

#include "lib1funcs.h"

#if !__SHMEDIA__
#ifdef L_udivsi3_i4i

/* udivsi3_i4i, FPU flavour: unsigned 32-bit divide done with one
   double-precision fdiv.

   In:       r4 = dividend, r5 = divisor
   Out:      r0 = quotient
   Clobbers: r1 (fpscr, fpul, dr0 and dr2 are saved and restored)

   88 bytes; sh4-200 cycle counts:
     divisor  >= 2G:                11 cycles
     dividend <  2G:                48 cycles
     dividend >= 2G, divisor != 1:  54 cycles
     dividend >= 2G, divisor == 1:  22 cycles  */
#if defined (__SH_FPU_DOUBLE__) || defined (__SH4_SINGLE_ONLY__)

	.global	GLOBAL(udivsi3_i4i)
	FUNC(GLOBAL(udivsi3_i4i))
GLOBAL(udivsi3_i4i):
	mova	L1,r0			! r0 -> fpscr word, followed by the 2^32 double
	cmp/pz	r5			! T = divisor has bit 31 clear
	sts	fpscr,r1		! r1 = fpscr on entry
	lds.l	@r0+,fpscr		! switch FPU mode, r0 now -> 2^32 constant
	sts.l	fpul,@-r15		! push fpul
	bf	LOCAL(huge_divisor)	! divisor >= 2G is answered without the FPU
	mov.l	r1,@-r15		! push fpscr on entry
	lds	r4,fpul
	cmp/pz	r4			! T = dividend has bit 31 clear
#ifdef FMOVD_WORKS
	fmov.d	dr0,@-r15		! push dr0
	float	fpul,dr0		! dr0 = dividend read as a signed integer
	fmov.d	dr2,@-r15		! push dr2
	bt	LOCAL(dividend_adjusted)
	mov	#1,r1
	fmov.d	@r0,dr2			! dr2 = 4294967296.0
	cmp/eq	r1,r5
	bt	LOCAL(div_by_1)		! divisor 1: no bias and no fdiv, ftrc of the
					! signed value already gives the result bits
	fadd	dr2,dr0			! bias by 2^32 to obtain the unsigned value
LOCAL(dividend_adjusted):
	lds	r5,fpul
	float	fpul,dr2		! dr2 = divisor
	fdiv	dr2,dr0
LOCAL(div_by_1):
	fmov.d	@r15+,dr2		! pop dr2
	ftrc	dr0,fpul		! fpul = quotient truncated to an integer
	fmov.d	@r15+,dr0		! pop dr0
#else /* !FMOVD_WORKS */
	fmov.s	DR01,@-r15		! push dr0, one 32-bit half at a time
	mov	#1,r1
	fmov.s	DR00,@-r15
	float	fpul,dr0		! dr0 = dividend read as a signed integer
	fmov.s	DR21,@-r15		! push dr2, one 32-bit half at a time
	bt/s	LOCAL(dividend_adjusted)
	fmov.s	DR20,@-r15		! delay slot: executed on both paths
	cmp/eq	r1,r5
	bt	LOCAL(div_by_1)		! divisor 1: no bias and no fdiv, ftrc of the
					! signed value already gives the result bits
	fmov.s	@r0+,DR20		! dr2 = 4294967296.0, loaded in two halves
	fmov.s	@r0,DR21
	fadd	dr2,dr0			! bias by 2^32 to obtain the unsigned value
LOCAL(dividend_adjusted):
	lds	r5,fpul
	float	fpul,dr2		! dr2 = divisor
	fdiv	dr2,dr0
LOCAL(div_by_1):
	fmov.s	@r15+,DR20		! pop dr2
	fmov.s	@r15+,DR21
	ftrc	dr0,fpul		! fpul = quotient truncated to an integer
	fmov.s	@r15+,DR00		! pop dr0
	fmov.s	@r15+,DR01
#endif /* !FMOVD_WORKS */
	lds.l	@r15+,fpscr		! pop fpscr
	sts	fpul,r0			! r0 = quotient
	rts
	lds.l	@r15+,fpul		! delay slot: pop fpul

#ifdef FMOVD_WORKS
	.p2align 3			! make double below 8 byte aligned.
#endif
/* Divisor has bit 31 set, so the quotient can only be 0 or 1.  Only fpul
   has been pushed so far and it was never modified, so that stack slot is
   simply dropped.  */
LOCAL(huge_divisor):
	lds	r1,fpscr		! put back fpscr from entry
	add	#4,r15			! discard the saved fpul word
	cmp/hs	r5,r4			! T = (dividend >= divisor), unsigned
	rts
	movt	r0			! delay slot: quotient = T

	.p2align 2
/* Constant pool: the fpscr word loaded on entry, then 2^32 as a double.
   Per the SH-4 manual 0x80000 is FPSCR.PR (double precision) and 0x100000
   is FPSCR.SZ (64-bit fmov), the latter only set when FMOVD_WORKS.  */
L1:
#ifndef FMOVD_WORKS
	.long	0x80000
#else
	.long	0x180000
#endif
	.double	4294967296

	ENDFUNC(GLOBAL(udivsi3_i4i))
#elif !defined (__sh1__) /* !__SH_FPU_DOUBLE__ */

#if 0
/* With 36 bytes, the following would probably be the most compact
   implementation, but with 139 cycles on an sh4-200, it is extremely slow.
   Kept disabled, for reference only.  */
GLOBAL(udivsi3_i4i):
	mov.l	r2,@-r15
	mov	#0,r1
	div0u
	mov	r1,r2
	mov.l	r3,@-r15
	mov	r1,r3
	sett
	mov	r4,r0
LOCAL(loop):
	rotcr	r2
	;
	bt/s	LOCAL(end)
	cmp/gt	r2,r3
	rotcl	r0
	bra	LOCAL(loop)
	div1	r5,r1
LOCAL(end):
	rotcl	r0
	mov.l	@r15+,r3
	rts
	mov.l	@r15+,r2
#endif /* 0 */

/* Integer-only udivsi3_i4i and sdivsi3_i4i, built on div1 stepping.
   Both entry points share the stepping code below.

   In:       r4 = dividend, r5 = divisor
   Out:      r0 = quotient
   r4 and r5 are pushed on entry and popped again before returning.
   r1 carries the continuation address, because the bsr calls to the
   local helpers overwrite pr; pr itself is not reloaded.

   Size: 186 bytes jointly for udivsi3_i4i and sdivsi3_i4i
   sh4-200 run times:
     udiv small divisor:                   55 cycles
     udiv large divisor:                   52 cycles
     sdiv small divisor, positive result:  59 cycles
     sdiv large divisor, positive result:  56 cycles
     sdiv small divisor, negative result:  65 cycles (*)
     sdiv large divisor, negative result:  62 cycles (*)
   (*): r2 is restored in the rts delay slot and has a lingering latency
        of two more cycles.  */
	.balign	4
	.global	GLOBAL(udivsi3_i4i)
	FUNC(GLOBAL(udivsi3_i4i))
	FUNC(GLOBAL(sdivsi3_i4i))
GLOBAL(udivsi3_i4i):
	sts	pr,r1			! r1 = return address
	mov.l	r4,@-r15		! push dividend register
	extu.w	r5,r0
	cmp/eq	r5,r0			! T = divisor fits in 16 bits
	swap.w	r4,r0			! r0 = dividend with its halves swapped
	shlr16	r4			! r4 = upper 16 bits of the dividend
	bf/s	LOCAL(large_divisor)
	div0u				! delay slot: clear the division state flags
	mov.l	r5,@-r15		! push divisor register
	shll16	r5			! divisor moved to the upper half for div1
/* Small divisor: two rounds of 16 div1 steps each.  Entered from the
   signed routine as well, with r4/r5 already pushed and r5 shifted.  */
LOCAL(sdiv_small_divisor):
	div1	r5,r4
	bsr	LOCAL(div6)
	div1	r5,r4			! delay slot
	div1	r5,r4
	bsr	LOCAL(div6)
	div1	r5,r4			! delay slot
	xtrct	r4,r0
	xtrct	r0,r4
	bsr	LOCAL(div7)
	swap.w	r4,r4			! delay slot
	div1	r5,r4
	bsr	LOCAL(div7)
	div1	r5,r4			! delay slot
	xtrct	r4,r0
	mov.l	@r15+,r5		! pop divisor register
	swap.w	r0,r0
	mov.l	@r15+,r4		! pop dividend register
	jmp	@r1			! continue at the address held in r1
	rotcl	r0			! delay slot: shift in the last T bit

/* Helpers: div7 performs seven div1 steps, div6 six; in both the last
   step sits in the rts delay slot.  */
LOCAL(div7):
	div1	r5,r4
LOCAL(div6):
	div1	r5,r4
	div1	r5,r4
	div1	r5,r4
	div1	r5,r4
	div1	r5,r4
	rts
	div1	r5,r4			! delay slot

/* Helper: three rotcl/div1 pairs, the last div1 in the rts delay slot.  */
LOCAL(divx3):
	rotcl	r0
	div1	r5,r4
	rotcl	r0
	div1	r5,r4
	rotcl	r0
	rts
	div1	r5,r4			! delay slot

/* Large divisor (does not fit in 16 bits): four rounds of four
   rotcl/div1 pairs.  The signed routine enters at sdiv_large_divisor
   with r4/r5 already pushed.  */
LOCAL(large_divisor):
	mov.l	r5,@-r15		! push divisor register
LOCAL(sdiv_large_divisor):
	xor	r4,r0
	.rept	4
	rotcl	r0
	bsr	LOCAL(divx3)
	div1	r5,r4			! delay slot
	.endr
	mov.l	@r15+,r5		! pop divisor register
	mov.l	@r15+,r4		! pop dividend register
	jmp	@r1			! continue at the address held in r1
	rotcl	r0			! delay slot: shift in the last T bit
	ENDFUNC(GLOBAL(udivsi3_i4i))

/* sdivsi3_i4i, integer flavour: negate negative operands, run the
   unsigned stepping code above, and negate the quotient when exactly one
   operand was negative.  On that path r2 holds the real return address
   while macl keeps the previous r2 value, so macl is overwritten.  */
	.global	GLOBAL(sdivsi3_i4i)
GLOBAL(sdivsi3_i4i):
	mov.l	r4,@-r15		! push dividend register
	cmp/pz	r5			! T = divisor >= 0
	mov.l	r5,@-r15		! push divisor register
	bt/s	LOCAL(pos_divisor)
	cmp/pz	r4			! delay slot: T = dividend >= 0
	neg	r5,r5			! divisor < 0: use its negation
	extu.w	r5,r0
	bt/s	LOCAL(neg_result)	! dividend >= 0 with divisor < 0
	cmp/eq	r5,r0			! delay slot: T = divisor fits in 16 bits
	neg	r4,r4			! both negative: use negated dividend
LOCAL(pos_result):
	swap.w	r4,r0			! r0 = dividend with its halves swapped
	bra	LOCAL(sdiv_check_divisor)
	sts	pr,r1			! delay slot: return straight to the caller
LOCAL(pos_divisor):
	extu.w	r5,r0
	bt/s	LOCAL(pos_result)	! dividend >= 0 with divisor >= 0
	cmp/eq	r5,r0			! delay slot: T = divisor fits in 16 bits
	neg	r4,r4			! dividend < 0: use its negation
LOCAL(neg_result):
	mova	LOCAL(negate_result),r0
	;
	mov	r0,r1			! stepping code will continue at negate_result
	swap.w	r4,r0			! r0 = dividend with its halves swapped
	lds	r2,macl			! park r2 in macl
	sts	pr,r2			! r2 = return address
LOCAL(sdiv_check_divisor):
	shlr16	r4			! r4 = upper 16 bits of the dividend
	bf/s	LOCAL(sdiv_large_divisor)
	div0u				! delay slot: clear the division state flags
	bra	LOCAL(sdiv_small_divisor)
	shll16	r5			! delay slot: divisor to the upper half
	.balign	4			! target of the mova above
LOCAL(negate_result):
	neg	r0,r0			! quotient = -quotient
	jmp	@r2			! return to the caller
	sts	macl,r2			! delay slot: bring r2 back from macl
	ENDFUNC(GLOBAL(sdivsi3_i4i))
#endif /* !__SH_FPU_DOUBLE__ */
#endif /* L_udivsi3_i4i */

#ifdef L_sdivsi3_i4i
#if defined (__SH_FPU_DOUBLE__) || defined (__SH4_SINGLE_ONLY__)
/* sdivsi3_i4i, FPU flavour: signed 32-bit divide done with one
   double-precision fdiv.

   In:       r4 = dividend, r5 = divisor
   Out:      r0 = quotient
   Clobbers: r1 (fpscr, fpul, dr0 and dr2 are saved and restored)

   48 bytes, 45 cycles on sh4-200  */

	.global	GLOBAL(sdivsi3_i4i)
	FUNC(GLOBAL(sdivsi3_i4i))
GLOBAL(sdivsi3_i4i):
	sts.l	fpscr,@-r15		! push fpscr
	sts	fpul,r1			! r1 = fpul on entry
	mova	L1,r0			! r0 -> fpscr word
	lds.l	@r0+,fpscr		! switch FPU mode
	lds	r4,fpul
#ifdef FMOVD_WORKS
	fmov.d	dr0,@-r15		! push dr0
	float	fpul,dr0		! dr0 = dividend
	lds	r5,fpul
	fmov.d	dr2,@-r15		! push dr2
#else
	fmov.s	DR01,@-r15		! push dr0, one 32-bit half at a time
	fmov.s	DR00,@-r15
	float	fpul,dr0		! dr0 = dividend
	lds	r5,fpul
	fmov.s	DR21,@-r15		! push dr2, one 32-bit half at a time
	fmov.s	DR20,@-r15
#endif
	float	fpul,dr2		! dr2 = divisor
	fdiv	dr2,dr0
#ifdef FMOVD_WORKS
	fmov.d	@r15+,dr2		! pop dr2
#else
	fmov.s	@r15+,DR20		! pop dr2
	fmov.s	@r15+,DR21
#endif
	ftrc	dr0,fpul		! fpul = quotient truncated to an integer
#ifdef FMOVD_WORKS
	fmov.d	@r15+,dr0		! pop dr0
#else
	fmov.s	@r15+,DR00		! pop dr0
	fmov.s	@r15+,DR01
#endif
	lds.l	@r15+,fpscr		! pop fpscr
	sts	fpul,r0			! r0 = quotient
	rts
	lds	r1,fpul			! delay slot: put back fpul from entry

	.p2align 2
/* fpscr word loaded on entry.  Per the SH-4 manual 0x80000 is FPSCR.PR
   (double precision) and 0x100000 is FPSCR.SZ (64-bit fmov), the latter
   only set when FMOVD_WORKS.  */
L1:
#ifndef FMOVD_WORKS
	.long	0x80000
#else
	.long	0x180000
#endif

	ENDFUNC(GLOBAL(sdivsi3_i4i))
#endif /* __SH_FPU_DOUBLE__ */
#endif /* L_sdivsi3_i4i */
#endif /* !__SHMEDIA__ */