1dnl  x86 mpn_gcd_11 optimised for AMD K7.
2
3dnl  Contributed to the GNU project by Kevin Ryde.  Rehacked by Torbjorn
4dnl  Granlund.
5
6dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012, 2014, 2015 Free Software
7dnl  Foundation, Inc.
8
9dnl  This file is part of the GNU MP Library.
10dnl
11dnl  The GNU MP Library is free software; you can redistribute it and/or modify
12dnl  it under the terms of either:
13dnl
14dnl    * the GNU Lesser General Public License as published by the Free
15dnl      Software Foundation; either version 3 of the License, or (at your
16dnl      option) any later version.
17dnl
18dnl  or
19dnl
20dnl    * the GNU General Public License as published by the Free Software
21dnl      Foundation; either version 2 of the License, or (at your option) any
22dnl      later version.
23dnl
24dnl  or both in parallel, as here.
25dnl
26dnl  The GNU MP Library is distributed in the hope that it will be useful, but
27dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
28dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
29dnl  for more details.
30dnl
31dnl  You should have received copies of the GNU General Public License and the
32dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
33dnl  see https://www.gnu.org/licenses/.
34
35include(`../config.m4')
36
37
38C	     cycles/bit (approx)
39C AMD K7	 5.31
40C AMD K8,K9	 5.33
41C AMD K10	 5.30
42C AMD bd1	 ?
43C AMD bobcat	 7.02
44C Intel P4-2	10.1
45C Intel P4-3/4	10.0
46C Intel P6/13	 5.88
47C Intel core2	 6.26
48C Intel NHM	 6.83
49C Intel SBR	 8.50
50C Intel atom	 8.90
51C VIA nano	 ?
52C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1
53
54
55C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
56
C MAXSHIFT is the number of low bits looked at per table lookup.  MASK is
C built from it with m4_lshift (presumably (1 << MAXSHIFT) - 1, i.e. the bit
C mask selecting those low bits).
57deflit(MAXSHIFT, 6)
58deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
59
C Byte table indexed by the low MAXSHIFT bits of a word.  Entry 0 holds
C MAXSHIFT; entry i, for i = 1 .. MASK, holds the trailing zero count of i.
C NOTE(review): the second DEF_OBJECT argument is presumably the alignment of
C the object -- confirm against the macro definition.
60DEF_OBJECT(ctz_table,64)
61	.byte	MAXSHIFT
62forloop(i,1,MASK,
63`	.byte	m4_count_trailing_zeros(i)
64')
65END_OBJECT(ctz_table)
66
67
C Symbolic names for the two operand registers.  The code visible in this
C file does not reference u0/v0; it names %eax and %edx directly.
68define(`u0',    `%eax')
69define(`v0',    `%edx')
70
71ASM_START()
72	TEXT
73	ALIGN(16)
C mpn_gcd_11 -- greatest common divisor of two single 32-bit words, computed
C by repeated subtraction and removal of trailing zero bits.
C
C In:   two words on the stack, directly above the return address
C Out:  %eax = result
C Uses: %ecx, %edx and flags as scratch; %edi and %esi are saved on entry and
C       restored before returning.
C
C NOTE(review): the entry label L(odd) suggests both inputs are expected to be
C odd on entry -- confirm against the callers.
C
C Register roles inside the loop:
C   %eax  u, then u - v, then |u - v| with its trailing zero bits shifted out
C   %edx  v, replaced by min(u,v) each round
C   %ecx  v - u, then the ctz_table index, then the shift count
C   %edi  copy of u taken before the subtraction
C   %esi  address of ctz_table
74PROLOGUE(mpn_gcd_11)
75	push	%edi
76	push	%esi
77
C After the two pushes above the return address sits at 8(%esp), so these
C are the first and second stack arguments (assuming PROLOGUE itself pushes
C nothing -- it is defined outside this file).
78	mov	12(%esp), %eax
79	mov	16(%esp), %edx
80
C NOTE(review): LEAL is a macro defined outside this file; presumably it
C loads the address of ctz_table into %esi.
81	LEAL(	ctz_table, %esi)
82	jmp	L(odd)
83
C cmovc() is a macro defined outside this file, presumably a conditional move
C taken when CF is set.  CF comes from the sub at the bottom of the loop, so
C it is set exactly when u < v (unsigned).
84	ALIGN(16)			C
85L(top):	cmovc(	%ecx, %eax)		C u = |v - u|
86	cmovc(	%edi, %edx)		C v = min(u,v)
C %ecx still holds v - u.  v - u and u - v have the same number of trailing
C zero bits, so %ecx can be used for the lookup whichever way the difference
C came out.
87L(mid):	and	$MASK, %ecx		C low MAXSHIFT bits; ZF set if all zero
88	movzbl	(%esi,%ecx), %ecx	C shift count from ctz_table; flags kept
89	jz	L(shift_alot)		C uses ZF from the and above
90	shr	%cl, %eax		C strip the trailing zero bits of u
91L(odd):	mov	%eax, %edi		C keep u for the cmovc into v
92	mov	%edx, %ecx		C
93	sub	%eax, %ecx		C ecx = v - u
94	sub	%edx, %eax		C eax = u - v; CF if u < v, ZF if u == v
95	jnz	L(top)			C loop until u == v
96
C Here u == v, and that common value in %edx is the result.
97L(end):	mov	%edx, %eax
98	pop	%esi
99	pop	%edi
100	ret
101
C Reached when the low MAXSHIFT bits of the difference are all zero.  Shift
C MAXSHIFT bits out at once and go back to examine the next low bits; the
C table value just loaded into %ecx is overwritten here and not used.
102L(shift_alot):
103	shr	$MAXSHIFT, %eax
104	mov	%eax, %ecx
105	jmp	L(mid)
106EPILOGUE()
107ASM_END()
108