mpn/x86_64/lshsub_n.asm

254721Semastednl  AMD64 mpn_lshsub_n.  R = 2^k(U - V).
254721Semaste
254721Semastednl  Copyright 2006, 2011, 2012 Free Software Foundation, Inc.
254721Semaste
254721Semastednl  This file is part of the GNU MP Library.
254721Semastednl
254721Semastednl  The GNU MP Library is free software; you can redistribute it and/or modify
254721Semastednl  it under the terms of either:
254721Semastednl
254721Semastednl    * the GNU Lesser General Public License as published by the Free
254721Semastednl      Software Foundation; either version 3 of the License, or (at your
254721Semastednl      option) any later version.
254721Semastednl
263363Semastednl  or
263363Semastednl
254721Semastednl    * the GNU General Public License as published by the Free Software
254721Semastednl      Foundation; either version 2 of the License, or (at your option) any
254721Semastednl      later version.
254721Semastednl
254721Semastednl  or both in parallel, as here.
254721Semastednl
254721Semastednl  The GNU MP Library is distributed in the hope that it will be useful, but
254721Semastednl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
254721Semastednl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
263363Semastednl  for more details.
254721Semastednl
254721Semastednl  You should have received copies of the GNU General Public License and the
254721Semastednl  GNU Lesser General Public License along with the GNU MP Library.  If not,
254721Semastednl  see https://www.gnu.org/licenses/.
254721Semaste
254721Semasteinclude(`../config.m4')
254721Semaste
254721Semaste
254721SemasteC	     cycles/limb
263363SemasteC AMD K8,K9	 3.15	(mpn_sub_n + mpn_lshift costs about 4 c/l)
254721SemasteC AMD K10	 3.15	(mpn_sub_n + mpn_lshift costs about 4 c/l)
254721SemasteC Intel P4	16.5
254721SemasteC Intel core2	 4.35
254721SemasteC Intel corei	 ?
263363SemasteC Intel atom	 ?
263363SemasteC VIA nano	 ?
263363Semaste
263363SemasteC This was written quickly and not optimized at all, but it runs very well on
254721SemasteC K8.  But perhaps one could get under 3 c/l.  Ideas:
254721SemasteC   1) Use indexing to save the 3 LEA
254721SemasteC   2) Write reasonable feed-in code
254721SemasteC   3) Be more clever about register usage
254721SemasteC   4) Unroll more; handling CL negation and carry save/restore costs a lot now
254721SemasteC   5) Reschedule
254721Semaste
254721SemasteC INPUT PARAMETERS
C m4 names for the argument registers.  n and cnt are only read at function
C entry: n is copied to rax and the low 32 bits of r8 (cnt) to ecx, after
C which rcx and r8 are reused for other purposes inside the function.
254721Semastedefine(`rp',	`%rdi')		C destination limb pointer (results are stored through it)
254721Semastedefine(`up',	`%rsi')		C source limbs that are loaded first (minuend)
254721Semastedefine(`vp',	`%rdx')		C source limbs subtracted with borrow (subtrahend)
254721Semastedefine(`n',	`%rcx')		C limb count
254721Semastedefine(`cnt',	`%r8')		C left shift count
254721Semaste
C NOTE(review): ABI_SUPPORT, ASM_START, TEXT and ALIGN are macros defined
C outside this file (presumably reached through ../config.m4), so the remarks
C below are assumptions to confirm, not facts shown here.
C Presumably declares the calling conventions this file can be built for.
254721SemasteABI_SUPPORT(DOS64)
254721SemasteABI_SUPPORT(STD64)
254721Semaste
C Presumably starts the assembly output, selects the code section and aligns
C the function entry to 16 bytes.
254721SemasteASM_START()
254721Semaste	TEXT
254721Semaste	ALIGN(16)
C mpn_lshsub_n -- subtract two limb vectors and shift the difference left.
C
C Per the file header: R = 2^k(U - V).  With the names from the INPUT
C PARAMETERS defines: rp = result, up = minuend, vp = subtrahend, n = limb
C count, cnt = shift count k.  Limbs are 64 bits (every pointer steps by 8).
C
C Value returned in rax: (final borrow << cnt) + the bits shifted out of the
C top difference limb (computed at L(end)).
C
C Register roles after setup:
C   rax      limb count, only ever decremented by 4 and tested via borrow
C   ebx      saved borrow, 0 or all-ones; the carry flag cannot stay live
C            across the shl/shr/or/neg instructions, so it is parked here
C   cl       shift count; negated temporarily to get the complementary count
C   r15      bits shifted out of the previous limb, OR-ed into the next one
C   r11d     counter of the one-limb loop, later reused as a data register
C   r8-r11   difference limbs being shifted left
C   r12-r14  unshifted copies of the difference limbs (source of carry bits)
C
C NOTE(review): FUNC_ENTRY, IFDOS and FUNC_EXIT are defined outside this
C file.  Presumably they adapt the DOS64 calling convention to the register
C assignment above, with the IFDOS line fetching the fifth argument (cnt)
C from the stack into r8d -- confirm.
C NOTE(review): cnt is not range-checked.  The code relies on a shift by
C -cnt acting as a shift by 64-cnt, which only holds for 1 <= cnt <= 63;
C presumably callers guarantee that -- confirm.
254721SemastePROLOGUE(mpn_lshsub_n)
254721Semaste	FUNC_ENTRY(4)
254721SemasteIFDOS(`	mov	56(%rsp), %r8d	')
263363Semaste
C Save the callee-saved registers used as scratch; they are popped in
C reverse order at L(end).
263367Semaste	push	%r12
269024Semaste	push	%r13
269024Semaste	push	%r14
269024Semaste	push	%r15
254721Semaste	push	%rbx
254721Semaste
254721Semaste	mov	n, %rax			C keep limb count; rcx is about to hold the shift count
254721Semaste	xor	R32(%rbx), R32(%rbx)	C clear carry save register
254721Semaste	mov	R32(%r8), R32(%rcx)	C shift count
254721Semaste	xor	R32(%r15), R32(%r15)	C limb carry
254721Semaste
C The n mod 4 leftover limbs are handled first, one per iteration.
C r11d = (n mod 4) - 1, and the loop ends when its decrement borrows.
254721Semaste	mov	R32(%rax), R32(%r11)
254721Semaste	and	$3, R32(%r11)
254721Semaste	je	L(4)			C no leftover limbs
263363Semaste	sub	$1, R32(%r11)
263363Semaste
254721SemasteL(oopette):
254721Semaste	add	R32(%rbx), R32(%rbx)	C restore carry flag
254721Semaste	mov	0(up), %r8
254721Semaste	lea	8(up), up		C mov and lea leave the carry flag intact for the sbb
254721Semaste	sbb	0(vp), %r8		C r8 = u limb - v limb - borrow
254721Semaste	mov	%r8, %r12		C keep the unshifted difference
269024Semaste	sbb	R32(%rbx), R32(%rbx)	C save carry flag
254721Semaste	shl	R8(%rcx), %r8
254721Semaste	or	%r15, %r8		C bring in the bits shifted out of the previous limb
254721Semaste	mov	%r12, %r15
254721Semaste	lea	8(vp), vp
254721Semaste	neg	R8(%rcx)		C cl = -cnt; 64-bit shifts use the count mod 64
254721Semaste	shr	R8(%rcx), %r15		C r15 = difference >> (64 - cnt), for the next limb
254721Semaste	neg	R8(%rcx)		C cl back to cnt
263363Semaste	mov	%r8, 0(rp)
263363Semaste	lea	8(rp), rp
263363Semaste	sub	$1, R32(%r11)
263363Semaste	jnc	L(oopette)		C repeat until the counter borrows
269024Semaste
C Main loop, four limbs per iteration.  rax still holds the full n; it is
C decremented by 4 before each pass and the loop stops on borrow, giving
C floor(n/4) iterations.
269024SemasteL(4):
254721Semaste	sub	$4, %rax
254721Semaste	jc	L(end)			C fewer than 4 limbs in total
254721Semaste
254721Semaste	ALIGN(16)
254721SemasteL(oop):
254721Semaste	add	R32(%rbx), R32(%rbx)	C restore carry flag
254721Semaste
254721Semaste	mov	0(up), %r8
254721Semaste	mov	8(up), %r9
254721Semaste	mov	16(up), %r10
254721Semaste	mov	24(up), %r11
254721Semaste
254721Semaste	lea	32(up), up
254721Semaste
C Borrow chain across the four limbs; the interleaved mov instructions copy
C the unshifted differences without touching the flags.
254721Semaste	sbb	0(vp), %r8
263367Semaste	mov	%r8, %r12
263367Semaste	sbb	8(vp), %r9
254721Semaste	mov	%r9, %r13
254721Semaste	sbb	16(vp), %r10
254721Semaste	mov	%r10, %r14
263367Semaste	sbb	24(vp), %r11
269024Semaste
269024Semaste	sbb	R32(%rbx), R32(%rbx)	C save carry flag
269024Semaste
269024Semaste	shl	R8(%rcx), %r8
269024Semaste	shl	R8(%rcx), %r9
269024Semaste	shl	R8(%rcx), %r10
269024Semaste	or	%r15, %r8		C consume the carry bits from the previous limb ...
269024Semaste	mov	%r11, %r15		C ... before r15 is reloaded with the unshifted top limb
263367Semaste	shl	R8(%rcx), %r11
263367Semaste
263367Semaste	lea	32(vp), vp
263367Semaste
263367Semaste	neg	R8(%rcx)		C cl = -cnt, i.e. a right shift by 64 - cnt
263367Semaste
269024Semaste	shr	R8(%rcx), %r12
269024Semaste	shr	R8(%rcx), %r13
269024Semaste	shr	R8(%rcx), %r14
269024Semaste	shr	R8(%rcx), %r15		C used next loop
269024Semaste
C Each limb receives the high bits of the limb below it.
263367Semaste	or	%r12, %r9
263367Semaste	or	%r13, %r10
263367Semaste	or	%r14, %r11
263367Semaste
263367Semaste	neg	R8(%rcx)		C cl back to cnt
263367Semaste
263367Semaste	mov	%r8, 0(rp)
263367Semaste	mov	%r9, 8(rp)
263367Semaste	mov	%r10, 16(rp)
263367Semaste	mov	%r11, 24(rp)
263367Semaste
263367Semaste	lea	32(rp), rp
263367Semaste
254721Semaste	sub	$4, %rax
254721Semaste	jnc	L(oop)
C Build the return value from the saved borrow and the bits shifted out of
C the top limb.
C NOTE(review): the adc also adds the carry flag left by the shl.  Shifting
C a 0/1 value by 1..63 moves no set bit out, so the flag should be clear;
C a count of 0 mod 64 would instead leave the flag set by the neg -- confirm
C that callers never pass such a count.
254721SemasteL(end):
263367Semaste	neg	R32(%rbx)		C all-ones -> 1, 0 -> 0; the 32-bit write zero-extends into rbx
269024Semaste	shl	R8(%rcx), %rbx		C rbx = borrow << cnt
269024Semaste	adc	%r15, %rbx		C add the bits shifted out of the top limb
269024Semaste	mov	%rbx, %rax		C return value
269024Semaste	pop	%rbx
269024Semaste	pop	%r15
269024Semaste	pop	%r14
269024Semaste	pop	%r13
269024Semaste	pop	%r12
269024Semaste
269024Semaste	FUNC_EXIT()
269024Semaste	ret
269024SemasteEPILOGUE()
269024Semaste