dnl  AMD64 mpn_copyi optimised for CPUs with fast SSE copying and SSSE3.

dnl  Copyright 2012, 2013, 2015 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb     cycles/limb     cycles/limb      good
C              aligned	      unaligned	      best seen	     for cpu?
C AMD K8,K9	 2.0		 illop		1.0/1.0		N
C AMD K10	 0.85		 illop				Y/N
C AMD bd1	 0.70		 0.66				Y
C AMD bd2	 0.68		 0.66				Y
C AMD bd3	 ?		 ?
C AMD bd4	 ?		 ?
C AMD bt1	 1.97		 8.16		1.5/1.5		N
C AMD bt2	 0.77		 0.93		0.65/opt	N/Y
C AMD zn1	 ?		 ?
C AMD zn2	 ?		 ?
C Intel P4	 2.26		 illop				Y/N
C Intel CNR	 0.52		 0.64		opt/opt		Y
C Intel NHM	 0.52		 0.71		0.50/0.67	N
C Intel SBR	 0.51		 0.54		opt/0.51	Y
C Intel IBR	 0.50		 0.54		opt/opt		Y
C Intel HWL	 0.50		 0.51		opt/opt		Y
C Intel BWL	 0.55		 0.55		opt/opt		Y
C Intel atom	 1.16		 1.61		opt/opt		Y
C Intel SLM	 1.02		 1.07		opt/opt		Y
C VIA nano	 1.09		 1.08		opt/opt		Y

C We use only 16-byte operations, except for unaligned top-most and bottom-most
C limbs.  We use the SSSE3 palignr instruction when rp - up = 8 (mod 16).  That
C instruction is better adapted to mpn_copyd's needs, so we need to contort the
C code to use it here.
C
C For operands of at most COPYI_SSE_THRESHOLD limbs, we use a plain 64-bit
C loop, taken from the x86_64 default code.
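C
C For reference, the operation is a plain limb copy in increasing index
C order; a minimal C sketch (ref_copyi is just an illustrative name, not
C part of GMP):
C
C	void
C	ref_copyi (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
C	{
C	  mp_size_t i;
C	  for (i = 0; i < n; i++)
C	    rp[i] = up[i];
C	}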

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

C There are three instructions for loading an aligned 128-bit quantity.  We use
C movaps, since it has the shortest coding.
dnl define(`movdqa', ``movaps'')
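C (The three are movdqa, movapd, and movaps; movaps lacks the 66h
C operand-size prefix and so has the shortest encoding.)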

ifdef(`COPYI_SSE_THRESHOLD',`',`define(`COPYI_SSE_THRESHOLD', 7)')

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_copyi)
	FUNC_ENTRY(3)

	cmp	$COPYI_SSE_THRESHOLD, n
	jbe	L(bc)

	test	$8, R8(rp)		C is rp 16-byte aligned?
	jz	L(rp_aligned)		C jump if rp aligned

	movsq				C copy one limb
	dec	n

L(rp_aligned):
	test	$8, R8(up)
	jnz	L(uent)

ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
`	sub	$8, n',
`	jmp	L(am)')
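C (If the threshold guarantees that at least 8 limbs remain at this point,
C we can fall straight into the unrolled loop; otherwise enter at the size
C check at L(am).)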

	ALIGN(16)
L(atop):movdqa	0(up), %xmm0
	movdqa	16(up), %xmm1
	movdqa	32(up), %xmm2
	movdqa	48(up), %xmm3
	lea	64(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	movdqa	%xmm2, 32(rp)
	movdqa	%xmm3, 48(rp)
	lea	64(rp), rp
L(am):	sub	$8, n
	jnc	L(atop)

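C Copy the remaining n mod 8 limbs in 4-, 2- and 1-limb pieces.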
	test	$4, R8(n)
	jz	1f
	movdqa	(up), %xmm0
	movdqa	16(up), %xmm1
	lea	32(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	lea	32(rp), rp

1:	test	$2, R8(n)
	jz	1f
	movdqa	(up), %xmm0
	lea	16(up), up
	movdqa	%xmm0, (rp)
	lea	16(rp), rp

1:	test	$1, R8(n)
	jz	1f
	mov	(up), %r8
	mov	%r8, (rp)

1:	FUNC_EXIT()
	ret

L(uent):
C Code handling up - rp = 8 (mod 16)
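C
C Each aligned 16-byte store below is assembled from two aligned loads:
C palignr($8, %xmm6, %xmm7), for instance, sets the low qword of %xmm7 to
C the high qword of %xmm6 and the high qword of %xmm7 to its old low qword,
C i.e. the 16 source bytes starting 8 bytes below the address %xmm7 was
C loaded from.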

	cmp	$16, n
	jc	L(ued0)

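C Win64 ABI: %xmm6-%xmm8 are callee-saved, so spill them (IFDOS only).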
IFDOS(`	add	$-56, %rsp	')
IFDOS(`	movdqa	%xmm6, (%rsp)	')
IFDOS(`	movdqa	%xmm7, 16(%rsp)	')
IFDOS(`	movdqa	%xmm8, 32(%rsp)	')

	movaps	120(up), %xmm7
	movaps	104(up), %xmm6
	movaps	88(up), %xmm5
	movaps	72(up), %xmm4
	movaps	56(up), %xmm3
	movaps	40(up), %xmm2
	lea	128(up), up
	sub	$32, n
	jc	L(ued1)

	ALIGN(16)
L(utop):movaps	-104(up), %xmm1
	sub	$16, n
	movaps	-120(up), %xmm0
	palignr($8, %xmm6, %xmm7)
	movaps	-136(up), %xmm8
	movdqa	%xmm7, 112(rp)
	palignr($8, %xmm5, %xmm6)
	movaps	120(up), %xmm7
	movdqa	%xmm6, 96(rp)
	palignr($8, %xmm4, %xmm5)
	movaps	104(up), %xmm6
	movdqa	%xmm5, 80(rp)
	palignr($8, %xmm3, %xmm4)
	movaps	88(up), %xmm5
	movdqa	%xmm4, 64(rp)
	palignr($8, %xmm2, %xmm3)
	movaps	72(up), %xmm4
	movdqa	%xmm3, 48(rp)
	palignr($8, %xmm1, %xmm2)
	movaps	56(up), %xmm3
	movdqa	%xmm2, 32(rp)
	palignr($8, %xmm0, %xmm1)
	movaps	40(up), %xmm2
	movdqa	%xmm1, 16(rp)
	palignr($8, %xmm8, %xmm0)
	lea	128(up), up
	movdqa	%xmm0, (rp)
	lea	128(rp), rp
	jnc	L(utop)

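C Wind down: finish the 16-limb block already partly held in registers,
C without loading past the end of the operand.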
L(ued1):movaps	-104(up), %xmm1
	movaps	-120(up), %xmm0
	movaps	-136(up), %xmm8
	palignr($8, %xmm6, %xmm7)
	movdqa	%xmm7, 112(rp)
	palignr($8, %xmm5, %xmm6)
	movdqa	%xmm6, 96(rp)
	palignr($8, %xmm4, %xmm5)
	movdqa	%xmm5, 80(rp)
	palignr($8, %xmm3, %xmm4)
	movdqa	%xmm4, 64(rp)
	palignr($8, %xmm2, %xmm3)
	movdqa	%xmm3, 48(rp)
	palignr($8, %xmm1, %xmm2)
	movdqa	%xmm2, 32(rp)
	palignr($8, %xmm0, %xmm1)
	movdqa	%xmm1, 16(rp)
	palignr($8, %xmm8, %xmm0)
	movdqa	%xmm0, (rp)
	lea	128(rp), rp

IFDOS(`	movdqa	(%rsp), %xmm6	')
IFDOS(`	movdqa	16(%rsp), %xmm7	')
IFDOS(`	movdqa	32(%rsp), %xmm8	')
IFDOS(`	add	$56, %rsp	')

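C Copy the remaining n mod 16 limbs in 8-, 4-, 2- and 1-limb pieces.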
L(ued0):test	$8, R8(n)
	jz	1f
	movaps	56(up), %xmm3
	movaps	40(up), %xmm2
	movaps	24(up), %xmm1
	movaps	8(up), %xmm0
	movaps	-8(up), %xmm4
	palignr($8, %xmm2, %xmm3)
	movdqa	%xmm3, 48(rp)
	palignr($8, %xmm1, %xmm2)
	movdqa	%xmm2, 32(rp)
	palignr($8, %xmm0, %xmm1)
	movdqa	%xmm1, 16(rp)
	palignr($8, %xmm4, %xmm0)
	lea	64(up), up
	movdqa	%xmm0, (rp)
	lea	64(rp), rp

1:	test	$4, R8(n)
	jz	1f
	movaps	24(up), %xmm1
	movaps	8(up), %xmm0
	palignr($8, %xmm0, %xmm1)
	movaps	-8(up), %xmm3
	movdqa	%xmm1, 16(rp)
	palignr($8, %xmm3, %xmm0)
	lea	32(up), up
	movdqa	%xmm0, (rp)
	lea	32(rp), rp

1:	test	$2, R8(n)
	jz	1f
	movdqa	8(up), %xmm0
	movdqa	-8(up), %xmm3
	palignr($8, %xmm3, %xmm0)
	lea	16(up), up
	movdqa	%xmm0, (rp)
	lea	16(rp), rp

1:	test	$1, R8(n)
	jz	1f
	mov	(up), %r8
	mov	%r8, (rp)

1:	FUNC_EXIT()
	ret

C Basecase code.  Needed for good speed on small operands, not for
C correctness as the above code is currently written.
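C (Reached when n <= COPYI_SSE_THRESHOLD, i.e. n <= 7 with the default
C threshold above.)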

L(bc):	lea	-8(rp), rp
	sub	$4, R32(n)
	jc	L(end)

	ALIGN(16)
L(top):	mov	(up), %r8
	mov	8(up), %r9
	lea	32(rp), rp
	mov	16(up), %r10
	mov	24(up), %r11
	lea	32(up), up
	mov	%r8, -24(rp)
	mov	%r9, -16(rp)
ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
`	sub	$4, R32(n)')
	mov	%r10, -8(rp)
	mov	%r11, (rp)
ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
`	jnc	L(top)')

L(end):	test	$1, R8(n)
	jz	1f
	mov	(up), %r8
	mov	%r8, 8(rp)
	lea	8(rp), rp
	lea	8(up), up
1:	test	$2, R8(n)
	jz	1f
	mov	(up), %r8
	mov	8(up), %r9
	mov	%r8, 8(rp)
	mov	%r9, 16(rp)
1:	FUNC_EXIT()
	ret
EPILOGUE()