//===-- umodsi3.S - 32-bit unsigned integer modulus -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the __umodsi3 (32-bit unsigned integer modulus)
// function for the ARM 32-bit architecture.
//
//===----------------------------------------------------------------------===//

#include "../assembly.h"

	// Use the unified ARM/Thumb assembler syntax for everything below.
	.syntax unified
	.text
	// NOTE(review): DEFINE_CODE_STATE comes from ../assembly.h; presumably it
	// selects the ARM or Thumb instruction set for this file -- confirm.
	DEFINE_CODE_STATE

@ unsigned int __umodsi3(unsigned int dividend, unsigned int divisor)
@   Calculate and return the remainder of the (unsigned) division.
//
// In:      r0 = dividend, r1 = divisor.
// Out:     r0 = dividend % divisor. For a zero divisor r0 is set to 0 and
//          control goes to __aeabi_idiv0 (EABI builds) or back to the caller.
// Scratch: r2, r3, ip and the condition flags.

	.p2align 2
DEFINE_COMPILERRT_FUNCTION(__umodsi3)
#if __ARM_ARCH_EXT_IDIV__
	// Hardware divide available: remainder = r0 - (r0 / r1) * r1.
	tst	r1, r1
	beq	LOCAL_LABEL(divby0)
	udiv	r2, r0, r1
	mls	r0, r2, r1, r0
	bx	lr
#else
	// Trivial cases first:
	//   divisor == 0        -> divby0
	//   divisor == 1        -> remainder is 0
	//   dividend < divisor  -> r0 already holds the remainder
	cmp	r1, #1
	bcc	LOCAL_LABEL(divby0)
	ITT(eq)
	moveq	r0, #0
	JMPc(lr, eq)
	cmp	r0, r1
	IT(cc)
	JMPc(lr, cc)

	// Compute the remainder using the binary long division algorithm.
	//
	// r0 is the numerator, r1 the denominator.
	//
	// The code before the jump computes a shift I such that (r1 << I)
	// does not overflow and r0 < 2 * (r1 << I), then enters the unrolled
	// block() sequence at block(I):
	//   ip := LOCAL_LABEL(div0block) - BLOCK_SIZE * I
	// This depends on the fixed instruction size of block:
	// 8 bytes in ARM mode, 10 bytes in Thumb mode.
	//
	// block(shift) implements the test-and-subtract core:
	//   if (r0 >= (r1 << shift)) r0 -= (r1 << shift);
	// It assumes (r1 << shift) can be computed without overflow and that
	// r0 < 2 * (r1 << shift). No quotient is accumulated here; once
	// block(0) has run, r0 holds the remainder.

#  ifdef __ARM_FEATURE_CLZ
	clz	ip, r0
	clz	r3, r1
	// r0 >= r1 implies clz(r0) <= clz(r1), so ip <= r3.
	// r3 becomes I, the shift that lines up the highest set bit of r1
	// with the highest set bit of r0.
	sub	r3, r3, ip
#    if defined(USE_THUMB_2)
	// Thumb blocks are 10 bytes: subtract 2 * I here and 8 * I below.
	// The +1 keeps bit 0 of the target set so that bx stays in Thumb state.
	adr	ip, LOCAL_LABEL(div0block) + 1
	sub	ip, ip, r3, lsl #1
#    else
	adr	ip, LOCAL_LABEL(div0block)
#    endif
	sub	ip, ip, r3, lsl #3
	bx	ip
#  else
#    if defined(USE_THUMB_2)
#    error THUMB mode requires CLZ or UDIV
#    endif
	// No CLZ: binary-search the shift in steps of 16, 8, 4, 2 and 1.
	// r2 is a working copy of r0. Whenever (r2 >> step) is still >= r1,
	// r2 is replaced by the shifted value and ip moves back by `step`
	// blocks of 8 bytes each (this path is ARM mode only).
	mov	r2, r0
	adr	ip, LOCAL_LABEL(div0block)

	lsr	r3, r2, #16
	cmp	r3, r1
	movhs	r2, r3
	subhs	ip, ip, #(16 * 8)

	lsr	r3, r2, #8
	cmp	r3, r1
	movhs	r2, r3
	subhs	ip, ip, #(8 * 8)

	lsr	r3, r2, #4
	cmp	r3, r1
	movhs	r2, r3
	subhs	ip, ip, #(4 * 8)

	lsr	r3, r2, #2
	cmp	r3, r1
	movhs	r2, r3
	subhs	ip, ip, #(2 * 8)

	// Last block, no need to update r2 or r3.
	cmp	r1, r2, lsr #1
	subls	ip, ip, #(1 * 8)

	JMP(ip)
#  endif

// IMM lets block() emit a literal # for the immediate operand; a bare # in
// the body of a function-like macro would be read as the stringizing operator.
#define	IMM	#

#define block(shift)                                                           \
	cmp	r0, r1, lsl IMM shift;                                         \
	IT(hs);                                                                \
	WIDE(subhs)	r0, r0, r1, lsl IMM shift

	// Unrolled reduction steps, highest shift first. The computed jump
	// above lands on block(I); execution then falls through to block(0).
	// Do not insert anything between these blocks: the jump arithmetic
	// relies on every block having the same size.
	block(31)
	block(30)
	block(29)
	block(28)
	block(27)
	block(26)
	block(25)
	block(24)
	block(23)
	block(22)
	block(21)
	block(20)
	block(19)
	block(18)
	block(17)
	block(16)
	block(15)
	block(14)
	block(13)
	block(12)
	block(11)
	block(10)
	block(9)
	block(8)
	block(7)
	block(6)
	block(5)
	block(4)
	block(3)
	block(2)
	block(1)
LOCAL_LABEL(div0block):
	block(0)
	JMP(lr)
#endif // __ARM_ARCH_EXT_IDIV__

LOCAL_LABEL(divby0):
	// Division by zero: the result is 0. EABI targets branch to
	// __aeabi_idiv0 with that value in r0; other targets return it directly.
	mov	r0, #0
#ifdef __ARM_EABI__
	b	__aeabi_idiv0
#else
	JMP(lr)
#endif

END_COMPILERRT_FUNCTION(__umodsi3)

// NOTE(review): macro from ../assembly.h; presumably emits the section marker
// that tells the linker this object does not need an executable stack -- confirm.
NO_EXEC_STACK_DIRECTIVE

157