/* SPDX-License-Identifier: GPL-2.0-or-later WITH GCC-exception-2.0 */
#include <linux/linkage.h>
#include <asm/asmmacro.h>
#include <asm/core.h>

#if XCHAL_HAVE_MUL16 || XCHAL_HAVE_MUL32 || XCHAL_HAVE_MAC16
#define XCHAL_NO_MUL 0
#else
#define XCHAL_NO_MUL 1
#endif

ENTRY(__umulsidi3)

#ifdef __XTENSA_CALL0_ABI__
	abi_entry(32)
	s32i	a12, sp, 16
	s32i	a13, sp, 20
	s32i	a14, sp, 24
	s32i	a15, sp, 28
#elif XCHAL_NO_MUL
	/* This is not really a leaf function; allocate enough stack space
	   to allow CALL12s to a helper function.  */
	abi_entry(32)
#else
	abi_entry_default
#endif

#ifdef __XTENSA_EB__
#define wh a2
#define wl a3
#else
#define wh a3
#define wl a2
#endif /* __XTENSA_EB__ */

	/* This code is taken from the mulsf3 routine in ieee754-sf.S.
	   See more comments there.  */

#if XCHAL_HAVE_MUL32_HIGH
	mull	a6, a2, a3
	muluh	wh, a2, a3
	mov	wl, a6

#else /* ! MUL32_HIGH */

#if defined(__XTENSA_CALL0_ABI__) && XCHAL_NO_MUL
	/* a0 and a8 will be clobbered by calling the multiply function
	   but a8 is not used here and need not be saved.  */
	s32i	a0, sp, 0
#endif

#if XCHAL_HAVE_MUL16 || XCHAL_HAVE_MUL32

#define a2h a4
#define a3h a5

	/* Get the high halves of the inputs into registers.  */
	srli	a2h, a2, 16
	srli	a3h, a3, 16

#define a2l a2
#define a3l a3

#if XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MUL16
	/* Clear the high halves of the inputs.  This does not matter
	   for MUL16 because the high bits are ignored.  */
	extui	a2, a2, 0, 16
	extui	a3, a3, 0, 16
#endif
#endif /* MUL16 || MUL32 */


#if XCHAL_HAVE_MUL16

#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
	mul16u	dst, xreg ## xhalf, yreg ## yhalf

#elif XCHAL_HAVE_MUL32

#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
	mull	dst, xreg ## xhalf, yreg ## yhalf

#elif XCHAL_HAVE_MAC16

/* The preprocessor insists on inserting a space when concatenating after
   a period in the definition of do_mul below.  These macros are a
   workaround using underscores instead of periods when doing the
   concatenation.  */
#define umul_aa_ll umul.aa.ll
#define umul_aa_lh umul.aa.lh
#define umul_aa_hl umul.aa.hl
#define umul_aa_hh umul.aa.hh

#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
	umul_aa_ ## xhalf ## yhalf	xreg, yreg; \
	rsr	dst, ACCLO

#else /* no multiply hardware */

#define set_arg_l(dst, src) \
	extui	dst, src, 0, 16
#define set_arg_h(dst, src) \
	srli	dst, src, 16

#ifdef __XTENSA_CALL0_ABI__
#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
	set_arg_ ## xhalf (a13, xreg); \
	set_arg_ ## yhalf (a14, yreg); \
	call0	.Lmul_mulsi3; \
	mov	dst, a12
#else
#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
	set_arg_ ## xhalf (a14, xreg); \
	set_arg_ ## yhalf (a15, yreg); \
	call12	.Lmul_mulsi3; \
	mov	dst, a14
#endif /* __XTENSA_CALL0_ABI__ */

#endif /* no multiply hardware */

	/* Add pp1 and pp2 into a6 with carry-out in a9.  */
	do_mul(a6, a2, l, a3, h)	/* pp 1 */
	do_mul(a11, a2, h, a3, l)	/* pp 2 */
	movi	a9, 0
	add	a6, a6, a11
	bgeu	a6, a11, 1f
	addi	a9, a9, 1
1:
	/* Shift the high half of a9/a6 into position in a9.  Note that
	   this value can be safely incremented without any carry-outs.  */
	ssai	16
	src	a9, a9, a6

	/* Compute the low word into a6.  */
	do_mul(a11, a2, l, a3, l)	/* pp 0 */
	sll	a6, a6
	add	a6, a6, a11
	bgeu	a6, a11, 1f
	addi	a9, a9, 1
1:
	/* Compute the high word into wh.  */
	do_mul(wh, a2, h, a3, h)	/* pp 3 */
	add	wh, wh, a9
	mov	wl, a6

#endif /* !MUL32_HIGH */

#if defined(__XTENSA_CALL0_ABI__) && XCHAL_NO_MUL
	/* Restore the original return address.  */
	l32i	a0, sp, 0
#endif

#ifdef __XTENSA_CALL0_ABI__
	l32i	a12, sp, 16
	l32i	a13, sp, 20
	l32i	a14, sp, 24
	l32i	a15, sp, 28
	abi_ret(32)
#else
	abi_ret_default
#endif

#if XCHAL_NO_MUL

	.macro	do_addx2 dst, as, at, tmp
#if XCHAL_HAVE_ADDX
	addx2	\dst, \as, \at
#else
	slli	\tmp, \as, 1
	add	\dst, \tmp, \at
#endif
	.endm

	.macro	do_addx4 dst, as, at, tmp
#if XCHAL_HAVE_ADDX
	addx4	\dst, \as, \at
#else
	slli	\tmp, \as, 2
	add	\dst, \tmp, \at
#endif
	.endm

	.macro	do_addx8 dst, as, at, tmp
#if XCHAL_HAVE_ADDX
	addx8	\dst, \as, \at
#else
	slli	\tmp, \as, 3
	add	\dst, \tmp, \at
#endif
	.endm

	/* For Xtensa processors with no multiply hardware, this simplified
	   version of _mulsi3 is used for multiplying 16-bit chunks of
	   the floating-point mantissas.  When using CALL0, this function
	   uses a custom ABI: the inputs are passed in a13 and a14, the
	   result is returned in a12, and a8 and a15 are clobbered.  */

	.align	4
.Lmul_mulsi3:
	abi_entry_default

	.macro	mul_mulsi3_body dst, src1, src2, tmp1, tmp2
	movi	\dst, 0
1:	add	\tmp1, \src2, \dst
	extui	\tmp2, \src1, 0, 1
	movnez	\dst, \tmp1, \tmp2

	do_addx2 \tmp1, \src2, \dst, \tmp1
	extui	\tmp2, \src1, 1, 1
	movnez	\dst, \tmp1, \tmp2

	do_addx4 \tmp1, \src2, \dst, \tmp1
	extui	\tmp2, \src1, 2, 1
	movnez	\dst, \tmp1, \tmp2

	do_addx8 \tmp1, \src2, \dst, \tmp1
	extui	\tmp2, \src1, 3, 1
	movnez	\dst, \tmp1, \tmp2

	srli	\src1, \src1, 4
	slli	\src2, \src2, 4
	bnez	\src1, 1b
	.endm

#ifdef __XTENSA_CALL0_ABI__
	mul_mulsi3_body a12, a13, a14, a15, a8
#else
	/* The result will be written into a2, so save that argument in a4.  */
	mov	a4, a2
	mul_mulsi3_body a2, a4, a3, a5, a6
#endif

	abi_ret_default

#endif /* XCHAL_NO_MUL */

ENDPROC(__umulsidi3)
EXPORT_SYMBOL(__umulsidi3)
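
/* Illustrative reference only, not part of the build: the routine above
 * forms the 64-bit product from four 16-bit partial products, propagating
 * the carries by hand because only 32-bit registers are available.  A
 * minimal C sketch of the same decomposition (the function name is
 * hypothetical, chosen here just for illustration) might look like:
 *
 *	unsigned long long umulsidi3_ref(unsigned int a, unsigned int b)
 *	{
 *		unsigned int al = a & 0xffff, ah = a >> 16;
 *		unsigned int bl = b & 0xffff, bh = b >> 16;
 *
 *		// pp0..pp3 correspond to the do_mul() partial products above
 *		unsigned long long pp0 = (unsigned long long)al * bl;
 *		unsigned long long pp1 = (unsigned long long)al * bh;
 *		unsigned long long pp2 = (unsigned long long)ah * bl;
 *		unsigned long long pp3 = (unsigned long long)ah * bh;
 *
 *		// the assembly adds pp1 + pp2 first and tracks the carry in a9
 *		return pp0 + ((pp1 + pp2) << 16) + (pp3 << 32);
 *	}
 */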