dnl  x86-64 mpn_addmul_1 and mpn_submul_1, optimized for "Core 2".

dnl  Copyright 2003, 2004, 2005, 2007, 2008, 2009 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C K8,K9:	 4
C K10:		 4
C P4:		 ?
C P6 core2:	 4.3-4.5 (fluctuating)
C P6 corei7:	 5

C INPUT PARAMETERS
C   rp   limb vector that is read and updated in place
C   up   limb vector that is only read
C   n    number of limbs handled in each vector
C   v0   the single limb every up limb is multiplied by
define(`rp',	`%rdi')
define(`up',	`%rsi')
define(`n',	`%rdx')
define(`v0',	`%rcx')

C Whichever of OPERATION_addmul_1 and OPERATION_submul_1 is defined selects
C the entry point name (func) and the instruction used to combine each
C product limb with the rp limb (ADDSUB).
ifdef(`OPERATION_addmul_1',`
      define(`ADDSUB',        `add')
      define(`func',  `mpn_addmul_1')
')
ifdef(`OPERATION_submul_1',`
      define(`ADDSUB',        `sub')
      define(`func',  `mpn_submul_1')
')

MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)

C func (rp = rdi, up = rsi, n = rdx, v0 = rcx), result in rax.
C
C For i = 0 .. n-1, lowest limb first, the 128-bit product up[i] * v0 is
C formed with mul.  Its low limb, plus whatever was carried up from limb
C i-1, is combined into rp[i] with ADDSUB (add or sub).  The high limb plus
C the carry (borrow) flag of that step is carried up to limb i+1.  What is
C carried out of the last limb is returned in rax.
C
C up[0] and rp[0] are loaded before n is examined, so n >= 1 is assumed.
C
C Register roles:
C   rbx       limb index biased by -n; it runs up to 0
C   rdx:rax   product delivered by the latest mul (rdx holds n only until
C             the first mul)
C   r8, r9    low, high limb of one product awaiting accumulation
C   r11, rbp  low, high limb of the other product awaiting accumulation
C   r10       the rp limb being updated
C rbx and rbp are saved on the stack on entry and restored before ret.
C
C Register copies are written as lea (reg), reg.  Like mov, lea leaves the
C flags alone, so the carry chains below run across them undisturbed.

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)
	push	%rbx
	push	%rbp
	lea	(%rdx), %rbx		C rbx = n
	neg	%rbx			C rbx = -n

	mov	(up), %rax		C rax = up[0]
	mov	(rp), %r10		C r10 = rp[0]

	C Rebase the pointers: with rbx = -n, (up,%rbx,8) is up[0] and
	C 16(rp,%rbx,8) is rp[0].  After this, (rp) and 8(rp) are the two
	C most significant rp limbs.
	lea	-16(rp,%rdx,8), rp
	lea	(up,%rdx,8), up
	mul	%rcx			C rdx:rax = up[0] * v0; n in rdx is gone

	bt	$0, R32(%rbx)		C CF = low bit of -n, same as low bit of n
	jc	L(odd)

	C n even: park the first product in rbp:r11 and form the second.
	lea	(%rax), %r11
	mov	8(up,%rbx,8), %rax	C rax = up[1]
	lea	(%rdx), %rbp
	mul	%rcx			C rdx:rax = up[1] * v0
	add	$2, %rbx
	jns	L(n2)			C n = 2: only the two-limb tail is left

	lea	(%rax), %r8		C r9:r8 = up[1] * v0
	mov	(up,%rbx,8), %rax	C rax = up[2]
	lea	(%rdx), %r9
	jmp	L(mid)

L(odd):	add	$1, %rbx
	jns	L(n1)			C n = 1: only the one-limb tail is left

	C n odd and >= 3: park the first product in r9:r8, form the second
	C into rbp:r11 and preload the next up limb.
	lea	(%rax), %r8
	mov	(up,%rbx,8), %rax	C rax = up[1]
	lea	(%rdx), %r9
	mul	%rcx			C rdx:rax = up[1] * v0
	lea	(%rax), %r11
	mov	8(up,%rbx,8), %rax	C rax = up[2]
	lea	(%rdx), %rbp
	jmp	L(e)

	C Main loop, two limbs per pass.  Each half issues the next mul first
	C (mul rewrites the flags) and then runs one carry chain:
	C   ADDSUB  pending low limb into r10
	C   adc     pending high limb into the low limb of the following product
	C   adc $0  into the high limb of that following product
	C The two halves swap the roles of r9:r8 and rbp:r11.
	ALIGN(16)
L(top):	mul	%rcx
	ADDSUB	%r8, %r10
	lea	(%rax), %r8		C r8 = low limb of the new product
	mov	(up,%rbx,8), %rax	C next up limb for the mul at L(mid)
	adc	%r9, %r11
	mov	%r10, -8(rp,%rbx,8)	C store the finished rp limb
	mov	(rp,%rbx,8), %r10	C fetch the next rp limb
	lea	(%rdx), %r9		C r9 = high limb of the new product
	adc	$0, %rbp
L(mid):	mul	%rcx
	ADDSUB	%r11, %r10
	lea	(%rax), %r11		C r11 = low limb of the new product
	mov	8(up,%rbx,8), %rax	C next up limb for the following mul
	adc	%rbp, %r8
	mov	%r10, (rp,%rbx,8)	C store the finished rp limb
	mov	8(rp,%rbx,8), %r10	C fetch the next rp limb
	lea	(%rdx), %rbp		C rbp = high limb of the new product
	adc	$0, %r9
L(e):	add	$2, %rbx
	js	L(top)

	C Wind down: form the last product, then drain r9:r8, rbp:r11 and
	C finally rdx:rax into the three most significant rp limbs.
	mul	%rcx
	ADDSUB	%r8, %r10
	adc	%r9, %r11
	mov	%r10, -8(rp)
	adc	$0, %rbp
L(n2):	mov	(rp), %r10		C second most significant rp limb
	ADDSUB	%r11, %r10
	adc	%rbp, %rax
	mov	%r10, (rp)
	adc	$0, %rdx
L(n1):	mov	8(rp), %r10		C most significant rp limb
	ADDSUB	%rax, %r10
	mov	%r10, 8(rp)
	mov	R32(%rbx), R32(%rax)	C zero rax: rbx is 0 here (n >= 1); mov keeps CF
	adc	%rdx, %rax		C result = last high limb + final carry/borrow
	pop	%rbp
	pop	%rbx
	ret
EPILOGUE()