/*===-- udivsi3.S - 32-bit unsigned integer divide ------------------------===//
 *
 *                     The LLVM Compiler Infrastructure
 *
 * This file is dual licensed under the MIT and the University of Illinois Open
 * Source Licenses. See LICENSE.TXT for details.
 *
 *===----------------------------------------------------------------------===//
 *
 * This file implements the __udivsi3 (32-bit unsigned integer divide)
 * function for the ARM architecture.  A naive digit-by-digit computation is
 * employed for simplicity.
 *
 *===----------------------------------------------------------------------===*/

#include "../assembly.h"

//  Stack-frame helpers used by the software-divide path of __udivsi3.
//
//  ESTABLISH_FRAME pushes r7 and lr, then copies sp into r7 so that r7
//  addresses the saved pair.  Because lr is saved here, the routine is free
//  to overwrite lr afterwards.
//
//  CLEAR_FRAME_AND_RETURN pops the saved pair back into r7 and pc, which
//  restores r7 and returns to the caller in a single instruction.
#define ESTABLISH_FRAME \
    push   {r7, lr}    ;\
    mov     r7,     sp
#define CLEAR_FRAME_AND_RETURN \
    pop    {r7, pc}

//  Register roles for the software-divide path of __udivsi3:
//
//    a   (r0)  dividend on entry; holds the running remainder inside the loop
//    b   (r1)  divisor
//    r   (r2)  trial difference a - (b << i); r2 is also written directly
//              with clz(a) before the loop starts
//    i   (r3)  current shift amount / quotient bit index; r3 is also written
//              directly with clz(b) before the loop starts
//    q   (ip)  quotient accumulator, copied to r0 on return
//    one (lr)  the constant 1, shifted to form each quotient bit
//
//  NOTE: these are plain preprocessor macros with very short names.  Any
//  later identifier token spelled a, b, r, i, q or one in this file is
//  rewritten, and that includes an unconditional "b" branch mnemonic.
#define a r0
#define b r1
#define r r2
#define i r3
#define q ip
#define one lr

.syntax unified
.align 3
//  __udivsi3 / __aeabi_uidiv: 32-bit unsigned integer divide.
//
//    In:    r0 = dividend, r1 = divisor
//    Out:   r0 = quotient; 0 when the divisor is zero
//    Uses:  hardware path  - r0 and the condition flags only
//           software path  - r0, r2, r3, ip and the condition flags; r7 and
//                            lr are saved on entry and restored on return
//
// Ok, APCS and AAPCS agree on 32 bit args, so it is safe to use the same routine.
DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_uidiv, __udivsi3)
DEFINE_COMPILERRT_FUNCTION(__udivsi3)
#if __ARM_ARCH_7S__
//  Hardware-divide path.  The divisor is tested first so that a zero divisor
//  never reaches udiv; that case returns 0 explicitly instead.
	tst	r1,r1
	beq	LOCAL_LABEL(divzero)
	udiv	r0, r0, r1
	bx  	lr
	LOCAL_LABEL(divzero):
	mov	r0,#0
	bx	lr
#else
//  We use a simple digit by digit algorithm; before we get into the actual
//  divide loop, we must calculate the left-shift amount necessary to align
//  the MSB of the divisor with that of the dividend (If this shift is
//  negative, then the result is zero, and we early out). We also conjure a
//  bit mask of 1 to use in constructing the quotient, and initialize the
//  quotient to zero.
//
//  The flags written by tst are still intact at the beq below, because the
//  clz and mov in between do not update the condition flags.
    ESTABLISH_FRAME
    clz     r2,     a
    tst     b,      b   // detect divide-by-zero
    clz     r3,     b
    mov     q,      #0
    beq     LOCAL_LABEL(return)    // return 0 if b is zero.
    mov     one,    #1
    subs    i,      r3, r2         // i = clz(b) - clz(a)
    blt     LOCAL_LABEL(return)    // return 0 if MSB(a) < MSB(b)

LOCAL_LABEL(mainLoop):
//  This loop basically implements the following:
//
//  do {
//      if (a >= b << i) {
//          a -= b << i;
//          q |= 1 << i;
//          if (a == 0) break;
//      }
//  } while (--i)
//
//  Note that this does not perform the final iteration (i == 0); by doing it
//  this way, we can merge the two branches which is a substantial win for
//  such a tight loop on current ARM architectures.
//
//  How the merged branch works:
//    - subs leaves C set (hs) when a >= b << i, and Z set when they are equal.
//    - orrhs and movhs record the quotient bit and commit the subtraction;
//      neither of them changes the flags.
//    - subsne decrements i unless the remainder just became zero.  When it is
//      skipped, Z is still set, so bhi falls through and the loop ends.
//    - bhi needs C set and Z clear, which after the decrement means the new
//      i is greater than zero.
//
//  If the loop is entered with i == 0, the body does run once at i == 0 and
//  the step after the loop tries the same bit again.  That is harmless: the
//  retry works on the already reduced dividend, and orr cannot set a bit
//  twice.
    subs    r,      a,  b, lsl i
    orrhs   q,      q,one, lsl i
    movhs   a,      r
    subsne  i,      i, #1
    bhi     LOCAL_LABEL(mainLoop)

//  Do the final test subtraction and update of quotient (i == 0), as it is
//  not performed in the main loop.
    subs    r,      a,  b
    orrhs   q,      #1

LOCAL_LABEL(return):
//  Move the quotient to r0 and return.
    mov     r0,     q
    CLEAR_FRAME_AND_RETURN
#endif