dnl  x86_64/bd1/mul_1.asm

dnl  AMD64 mpn_mul_1 optimised for AMD Bulldozer.

dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9      3.65
C AMD K10        3.30    3.68
C AMD bull       4.04    4.29
C AMD pile       4.33
C AMD steam
C AMD excavator
C AMD bobcat     5.73
C AMD jaguar     5.87
C Intel P4      12.5
C Intel core2    4.38
C Intel NHM      4.28
C Intel SBR      2.69
C Intel IBR      2.55
C Intel HWL      2.41
C Intel BWL      2.49
C Intel SKL      2.50
C Intel atom    20.3
C Intel SLM      7.8
C VIA nano       4.25

C The loop of this code is the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.

C TODO
C  * Move loop code into feed-in blocks, to save insn for zeroing regs.

C Argument registers.  The definitions below are the STD64 (System V) ones;
C the trailing comment on each line names the register that carries the same
C argument under DOS64 (Microsoft x64).
define(`rp',      `%rdi')   C rcx
define(`up',      `%rsi')   C rdx
define(`n_param', `%rdx')   C r8
define(`v0',      `%rcx')   C r9

C Limb count / loop index used by the code body under STD64.  rbx is
C callee-saved, which is why both entry points push it and the shared exit
C pops it.
define(`n',       `%rbx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C DOS64 overrides.  rp, v0 and n are remapped to the registers in which DOS64
C passes them (rcx, r9, r8); up stays rsi because the entry code copies the
C incoming rdx into rsi.  The bare names r8 and r9 are redefined to rbx and
C rdi.
C NOTE(review): the doubled quotes look intended to stop the replacement text
C from being rescanned, so that v0 and n keep meaning the real r9 and r8 while
C a literal r8 or r9 written in the code body turns into rbx or rdi.  Confirm
C against the IFDOS definition pulled in from config.m4.
IFDOS(`	define(`up', ``%rsi'')	') dnl
IFDOS(`	define(`rp', ``%rcx'')	') dnl
IFDOS(`	define(`v0', ``%r9'')	') dnl
IFDOS(`	define(`r9', ``rdi'')	') dnl
IFDOS(`	define(`n',  ``%r8'')	') dnl
IFDOS(`	define(`r8', ``rbx'')	') dnl

ASM_START()
	TEXT
	ALIGN(16)
C mpn_mul_1c -- variant of mpn_mul_1 that takes an extra carry-in limb.
C
C Arguments use the register definitions at the top of this file: rp, up,
C n_param, v0.  The carry-in limb is the fifth argument: r8 under STD64, a
C stack slot under DOS64.
C
C This entry only forms the first product (first limb at up times v0) in
C rdx:rax, adds the carry-in to it, and then jumps to L(common) inside
C mpn_mul_1.  All stores, the loop, the register restore and the ret happen
C there, so the prologue here must push exactly what that shared exit pops.
PROLOGUE(mpn_mul_1c)
C DOS64 only: rsi and rdi are callee-saved in that ABI, so save them, then
C move the source pointer from its argument register rdx into rsi.
IFDOS(``push	%rsi		'')
IFDOS(``push	%rdi		'')
IFDOS(``mov	%rdx, %rsi	'')

	mov	(up), %rax		C read first u limb early
	push	%rbx
IFSTD(`	mov	n_param, %r11	')	C move away n from rdx, mul uses it
IFDOS(`	mov	n, %r11		')
	mul	v0			C rdx:rax = first limb * v0

C Fold the carry-in into the low half and propagate the carry to the high
C half before joining the common path.
IFSTD(` add	%r8, %rax	')
IFDOS(` add	64(%rsp), %rax	')	C 40 + 3*8  (3 push insns)
	adc	$0, %rdx
	jmp	L(common)

EPILOGUE()

279377Simp	ALIGN(16)
279377SimpPROLOGUE(mpn_mul_1)
279377SimpIFDOS(``push	%rsi		'')
279377SimpIFDOS(``push	%rdi		'')
279377SimpIFDOS(``mov	%rdx, %rsi	'')
279377Simp
279377Simp	mov	(up), %rax		C read first u limb early
279377Simp	push	%rbx
279377SimpIFSTD(`	mov	n_param, %r11	')	C move away n from rdx, mul uses it
279377SimpIFDOS(`	mov	n, %r11		')
279377Simp	mul	v0
279377Simp
279377SimpL(common):
279377SimpIFSTD(`	mov	%r11, n		')
279377Simp
279377Simp	and	$3, R32(%r11)
279377Simp	lea	-16(rp,n,8), rp
279377Simp	jz	L(b0)
279377Simp	cmp	$2, R32(%r11)
279377Simp	jb	L(b1)
279377Simp	jz	L(b2)
279377Simp
279377SimpL(b3):	mov	%rax, %r10
279377Simp	mov	%rdx, %r11
279377Simp	mov	8(up), %rax
279377Simp	mul	v0
279377Simp	lea	(up,n,8), up
279377Simp	not	n
279377Simp	jmp	L(L3)
279377Simp
279377SimpL(b0):	mov	%rax, %r9
279377Simp	mov	%rdx, %r10
279377Simp	mov	8(up), %rax
279377Simp	lea	(up,n,8), up
279377Simp	neg	n
279377Simp	jmp	L(L0)
279377Simp
279377SimpL(b1):	mov	%rax, %r8
279377Simp	cmp	$1, n
279377Simp	jz	L(n1)
279377Simp	mov	%rdx, %r9
279377Simp	lea	(up,n,8), up
279377Simp	neg	n
279377Simp	mov	%r8, 16(rp,n,8)
279377Simp	inc	n
279377Simp	jmp	L(L1)
279377Simp
279377SimpL(b2):	mov	%rax, %r11
279377Simp	mov	%rdx, %r8
279377Simp	mov	8(up), %rax
279377Simp	lea	(up,n,8), up
279377Simp	neg	n
279377Simp	add	$2, n
279377Simp	jns	L(end)
279377Simp
279377Simp	ALIGN(16)
279377SimpL(top):	mul	v0
279377Simp	mov	%rdx, %r9
279377Simp	add	%rax, %r8
279377Simp	adc	$0, %r9
279377Simp	mov	%r8, 8(rp,n,8)
279377Simp	mov	%r11, (rp,n,8)
279377SimpL(L1):	mov	(up,n,8), %rax
279377Simp	mul	v0
279377Simp	add	%rax, %r9
279377Simp	mov	%rdx, %r10
279377Simp	mov	8(up,n,8), %rax
279377Simp	adc	$0, %r10
279377SimpL(L0):	mul	v0
279377Simp	add	%rax, %r10
279377Simp	mov	%rdx, %r11
279377Simp	mov	16(up,n,8), %rax
279377Simp	adc	$0, %r11
279377Simp	mul	v0
279377Simp	mov	%r9, 16(rp,n,8)
279377SimpL(L3):	add	%rax, %r11
279377Simp	mov	%r10, 24(rp,n,8)
279377Simp	mov	%rdx, %r8
279377Simp	adc	$0, %r8
279377Simp	add	$4, n
279377Simp	mov	-8(up,n,8), %rax
279377Simp	js	L(top)
279377Simp
279377SimpL(end):	mul	v0
279377Simp	add	%rax, %r8
279377Simp	adc	$0, %rdx
279377Simp	mov	%r11, (rp)
279377SimpL(n1):	mov	%r8, 8(rp)
279377Simp	mov	%rdx, %rax

	pop	%rbx
IFDOS(``pop	%rdi		'')
IFDOS(``pop	%rsi		'')
	ret
EPILOGUE()
ASM_END()