dnl  x86/k7/aors_n.asm

dnl  AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract.

dnl  Copyright 1999, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C K7: 1.64 cycles/limb (at 16 limbs/loop).


dnl  K7: UNROLL_COUNT cycles/limb
dnl           8           1.9
dnl          16           1.64
dnl          32           1.7
dnl          64           2.0
dnl  Maximum possible with the current code is 64.
dnl
dnl  UNROLL_COUNT is the number of limbs handled by one pass of the unrolled
dnl  loop (the loop body is generated UNROLL_COUNT/CHUNK_COUNT times, each
dnl  copy handling CHUNK_COUNT limbs).  The code below also uses UNROLL_LOG2,
dnl  UNROLL_MASK and UNROLL_BYTES, which are not defined in this file;
dnl  presumably they are derived from UNROLL_COUNT by the shared m4
dnl  definitions, in which case UNROLL_COUNT has to be a power of 2 --
dnl  TODO confirm.

deflit(UNROLL_COUNT, 16)


dnl  Select the operation from whichever of OPERATION_add_n or
dnl  OPERATION_sub_n is defined when this file is processed.  add_n is
dnl  tested first, so it wins if both are defined; if neither is defined
dnl  m4_error is invoked.  The names set up are:
dnl
dnl    M4_inst         carry-propagating instruction applied to every limb
dnl    M4_function_n   name of the entry point without a carry-in
dnl    M4_function_nc  name of the entry point taking a carry-in
dnl    M4_description  word substituted into the description comment below
ifdef(`OPERATION_add_n', `
	define(M4_inst,        adcl)
	define(M4_function_n,  mpn_add_n)
	define(M4_function_nc, mpn_add_nc)
	define(M4_description, add)
',`ifdef(`OPERATION_sub_n', `
	define(M4_inst,        sbbl)
	define(M4_function_n,  mpn_sub_n)
	define(M4_function_nc, mpn_sub_nc)
	define(M4_description, subtract)
',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
')')')

dnl  Lists all four functions this one source file can be built into.
dnl  NOTE(review): presumably read by the build system to decide which
dnl  OPERATION_ symbols to compile with -- confirm against the configure
dnl  machinery.
MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)


C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C                         mp_size_t size);
C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C	                   mp_size_t size, mp_limb_t carry);
C
C Calculate src1,size M4_description src2,size, and store the result in
C dst,size.  The return value is the carry bit from the top of the result (1
C or 0).
C
C The _nc version accepts 1 or 0 for an initial carry into the low limb of
C the calculation.  Note values other than 1 or 0 here will lead to garbage
C results.
C
C This code runs at 1.64 cycles/limb, which might be the best possible with
C plain integer operations.  Each limb is 2 loads and 1 store, any 2 of
C which can be done each cycle, leading to 1.5 c/l.

dnl  Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
dnl
dnl  Sizes below UNROLL_THRESHOLD limbs are handled by the simple loop,
dnl  sizes at or above it by the unrolled loop.  The PIC and non-PIC values
dnl  are set separately so they can be tuned independently, but are
dnl  currently the same.
ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 8)
',`
deflit(UNROLL_THRESHOLD, 8)
')

dnl  Stack frame layout.
dnl
dnl  PARAM_ names are the incoming stack arguments, at positive offsets
dnl  above the return address.  PARAM_CARRY is only read by the _nc entry
dnl  point.
dnl
dnl  SAVE_ names are the four slots, at negative offsets, in which the code
dnl  keeps the registers it restores before returning.  STACK_SPACE is the
dnl  number of bytes subtracted from esp to make room for them.
dnl
dnl  NOTE(review): presumably defframe adjusts each offset by the current
dnl  value of FRAME so the names stay valid after esp is moved -- confirm
dnl  against the defframe definition.
defframe(PARAM_CARRY,20)
defframe(PARAM_SIZE, 16)
defframe(PARAM_SRC2, 12)
defframe(PARAM_SRC1, 8)
defframe(PARAM_DST,  4)

defframe(SAVE_EBP, -4)
defframe(SAVE_ESI, -8)
defframe(SAVE_EBX, -12)
defframe(SAVE_EDI, -16)
deflit(STACK_SPACE, 16)

	TEXT
	ALIGN(32)
deflit(`FRAME',0)

C M4_function_nc (dst, src1, src2, size, carry)
C
C Carry-in entry point.  Loads the carry argument into eax and jumps to
C L(start) in M4_function_n, which is the point just after that function
C has cleared eax for the no-carry case.  Everything else is shared.
C
C esp has not been moved yet here, hence FRAME is 0 for the PARAM_CARRY
C reference.
PROLOGUE(M4_function_nc)
	movl	PARAM_CARRY, %eax
	jmp	L(start)
EPILOGUE()

C -----------------------------------------------------------------------------
C M4_function_n (dst, src1, src2, size)
C
C Entry point without a carry-in.  M4_function_nc joins at L(start) with
C its carry argument already in eax.
C
C Arguments are read from the stack through the PARAM_ names.  ebx and edi
C (and, on the unrolled path, esi and ebp) are stored to the SAVE_ slots of
C a STACK_SPACE byte frame and reloaded before returning.  The return value
C in eax is the carry or borrow out of the top limb, 0 or 1.
C
C size < UNROLL_THRESHOLD runs the one limb per iteration L(simple) loop,
C anything larger runs the unrolled loop entered by a computed jump at
C L(unroll).  size is assumed to be at least 1: the simple loop tests its
C counter only after a limb has been processed.
C
C The running carry lives in CF for the whole of either loop.  Every
C instruction between the shrl that loads CF and the setc that reads it
C therefore has to leave CF alone, which is why the code uses incl, decl,
C leal and movl $0 where addl, subl or xorl might otherwise be expected.
PROLOGUE(M4_function_n)

	xorl	%eax, %eax	C carry
L(start):
	C eax is the incoming carry, 0 or 1.
	movl	PARAM_SIZE, %ecx
	subl	$STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

	movl	%edi, SAVE_EDI
	movl	%ebx, SAVE_EBX
	cmpl	$UNROLL_THRESHOLD, %ecx

	movl	PARAM_SRC2, %edx
	movl	PARAM_SRC1, %ebx
	jae	L(unroll)

	C Simple loop setup: point all three registers at the end of their
	C operands and run a negative index up towards zero, so the incl
	C doubles as the loop termination test.
	movl	PARAM_DST, %edi
	leal	(%ebx,%ecx,4), %ebx
	leal	(%edx,%ecx,4), %edx

	leal	(%edi,%ecx,4), %edi
	negl	%ecx
	shrl	%eax		C carry-in bit to CF

	C This loop is in a single 16 byte code block already, so no
	C alignment necessary.
L(simple):
	C eax	scratch
	C ebx	src1
	C ecx	counter
	C edx	src2
	C esi
	C edi	dst
	C ebp

	movl	(%ebx,%ecx,4), %eax
	M4_inst	(%edx,%ecx,4), %eax
	movl	%eax, (%edi,%ecx,4)
	incl	%ecx
	jnz	L(simple)

	movl	$0, %eax	C not xorl, CF still holds the carry-out
	movl	SAVE_EDI, %edi

	movl	SAVE_EBX, %ebx
	setc	%al
	addl	$STACK_SPACE, %esp

	ret


C -----------------------------------------------------------------------------
	C This is at 0x55, close enough to aligned.
L(unroll):
deflit(`FRAME',STACK_SPACE)
	C The unrolled loop works on pairs of limbs.  ecx becomes the size
	C rounded down to even, and the low bit of the size is left in
	C PARAM_SIZE (overwriting the argument slot on the stack) for the
	C odd limb handling after the loop.
	movl	%ebp, SAVE_EBP
	andl	$-2, %ecx		C size low bit masked out
	andl	$1, PARAM_SIZE		C size low bit kept

	movl	%ecx, %edi
	decl	%ecx
	movl	PARAM_DST, %ebp

	C Assuming UNROLL_MASK is UNROLL_COUNT-1 and UNROLL_LOG2 is the
	C matching shift count:
	C   ecx = (even size - 1) >> UNROLL_LOG2, the number of passes less 1
	C   edi = (- even size) & UNROLL_MASK, the number of limbs to skip at
	C         the start of the first pass
	shrl	$UNROLL_LOG2, %ecx
	negl	%edi
	movl	%esi, SAVE_ESI

	andl	$UNROLL_MASK, %edi

	C esi = L(entry) + 9*edi, the address of the first limb pair to
	C execute.  This relies on every limb of the unrolled body assembling
	C to exactly 9 bytes.  The PIC variant gets the same value without an
	C absolute address, via the return address pushed by the call.
ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leal	L(entry) (%edi,%edi,8), %esi	C 9 bytes per
')
	C edi goes back to a negative limb count and the three pointers are
	C moved down by that many limbs, so that the displacements used from
	C the computed entry point onwards address limb 0.  When UNROLL_BYTES
	C is 256 the pointers get an extra 128, matching the -128 folded into
	C the displacements of the unrolled body.
	negl	%edi
	shrl	%eax			C carry-in bit to CF, must follow negl

	leal	ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx
	leal	ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx
	leal	ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi

	jmp	*%esi


	C PIC helper: on entry (%esp) is the return address, which is
	C L(here), so the sum below is L(entry) + 9*edi.
ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	leal	(%edi,%edi,8), %esi
	addl	$L(entry)-L(here), %esi
	addl	(%esp), %esi
	ret_internal
')


C -----------------------------------------------------------------------------
	ALIGN(32)
L(top):
	C eax	zero
	C ebx	src1
	C ecx	counter
	C edx	src2
	C esi	scratch (was computed jump)
	C edi	dst
	C ebp	scratch

	C edx is stepped here at the top rather than with ebx and edi at the
	C bottom, so on leaving the loop it is one pass behind the other two.
	leal	UNROLL_BYTES(%edx), %edx

L(entry):
	C Unrolled body, two limbs per generated chunk.  Nothing may be added
	C or reordered in here without revisiting the 9 bytes per limb used
	C by the computed jump.
deflit(CHUNK_COUNT, 2)
forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
	deflit(`disp1', eval(disp0 + 4))

Zdisp(	movl,	disp0,(%ebx), %esi)
	movl	disp1(%ebx), %ebp
Zdisp(	M4_inst,disp0,(%edx), %esi)
Zdisp(	movl,	%esi, disp0,(%edi))
	M4_inst	disp1(%edx), %ebp
	movl	%ebp, disp1(%edi)
')

	decl	%ecx
	leal	UNROLL_BYTES(%ebx), %ebx
	leal	UNROLL_BYTES(%edi), %edi
	jns	L(top)


	movl	PARAM_SIZE, %esi	C 1 if an odd limb remains, else 0
	movl	SAVE_EBP, %ebp
	movl	$0, %eax		C not xorl, CF still holds the carry

	decl	%esi
	js	L(even)

	C Odd limb.  ebx and edi have been stepped past everything processed
	C and edx is one pass short of that, hence the UNROLL_BYTES on edx.
	C When UNROLL_BYTES is 256 all three also still include the 128 byte
	C bias applied before the computed jump, so it is taken off again
	C here.  For any other UNROLL_BYTES these operands come out as plain
	C (%ebx), UNROLL_BYTES(%edx) and (%edi).
	movl	ifelse(UNROLL_BYTES,256,-128)(%ebx), %ecx
	M4_inst	eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128))(%edx), %ecx
	movl	%ecx, ifelse(UNROLL_BYTES,256,-128)(%edi)
L(even):

	movl	SAVE_EDI, %edi
	movl	SAVE_EBX, %ebx
	setc	%al

	movl	SAVE_ESI, %esi
	addl	$STACK_SPACE, %esp

	ret

EPILOGUE()