1dnl  Alpha ev67 mpn_gcd_1 -- Nx1 greatest common divisor.
2
3dnl  Copyright 2003, 2004 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C ev67: 3.4 cycles/bitpair for 1x1 part
24
25
26C mp_limb_t mpn_gcd_1 (mp_srcptr xp, mp_size_t xsize, mp_limb_t y);
27C
28C In the 1x1 part, the algorithm is to change x,y to abs(x-y),min(x,y) and
29C strip trailing zeros from abs(x-y) to maintain x and y both odd.
30C
31C The trailing zeros are calculated from just x-y, since in twos-complement
32C there's the same number of trailing zeros on d or -d.  This means the cttz
33C runs in parallel with abs(x-y).
34C
35C The loop takes 5 cycles, and at 0.68 iterations per bit for two N-bit
36C operands with this algorithm gives the measured 3.4 c/l.
37C
38C The slottings shown are for SVR4 style systems, Unicos differs in the
39C initial gp setup and the LEA.
40C
41C Enhancement:
42C
43C On the jsr, !lituse_jsr! (when available) would allow the linker to relax
44C it to a bsr, but probably only in a static binary.  Plain "jsr foo" gives
45C the right object code for relaxation, and ought to be available
46C everywhere, but we prefer to schedule the GOT ldq (LEA) back earlier, for
47C the usual case of running in a shared library.
48C
49C bsr could perhaps be used explicitly anyway.  We should be able to assume
50C modexact is in the same module as us (ie. shared library or mainline).
51C Would there be any worries about the size of the displacement?  Could
52C always put modexact and gcd_1 in the same .o to be certain.
53
54ASM_START()
55PROLOGUE(mpn_gcd_1, gp)
56
57	C r16	xp
58	C r17	size
59	C r18	y
60
61	C ldah				C l
62	C lda				C u
63
64	ldq	r0, 0(r16)		C L   x = xp[0]
65	lda	r30, -32(r30)		C u   alloc stack
66
67	LEA(  r27, mpn_modexact_1c_odd)	C L   modexact addr, ldq (gp)
68	stq	r10, 16(r30)		C L   save r10
69	cttz	r18, r10		C U0  y twos
70	cmpeq	r17, 1, r5		C u   test size==1
71
72	stq	r9, 8(r30)		C L   save r9
73	clr	r19			C u   zero c for modexact
74	unop
75	unop
76
77	cttz	r0, r6			C U0  x twos
78	stq	r26, 0(r30)		C L   save ra
79
80	srl	r18, r10, r18		C U   y odd
81
82	mov	r18, r9			C l   hold y across call
83
84	cmpult	r6, r10, r2		C u   test x_twos < y_twos
85
86	cmovne	r2, r6, r10		C l   common_twos = min(x_twos,y_twos)
87	bne	r5, L(one)		C U   no modexact if size==1
88	jsr	r26, (r27), mpn_modexact_1c_odd   C L0
89
90	LDGP(	r29, 0(r26))		C u,l ldah,lda
91	cttz	r0, r6			C U0  new x twos
92	ldq	r26, 0(r30)		C L   restore ra
93
94L(one):
95	mov	r9, r1			C u   y
96	ldq	r9, 8(r30)		C L   restore r9
97	mov	r10, r2			C u   common twos
98	ldq	r10, 16(r30)		C L   restore r10
99
100	lda	r30, 32(r30)		C l   free stack
101	beq	r0, L(done)		C U   return y if x%y==0
102
103	srl	r0, r6, r0		C U   x odd
104	unop
105
106	ALIGN(16)
107L(top):
108	C r0	x
109	C r1	y
110	C r2	common twos, for use at end
111
112	subq	r0, r1, r7		C l0  d = x - y
113	cmpult	r0, r1, r16		C u0  test x >= y
114
115	subq	r1, r0, r4		C l0  new_x = y - x
116	cttz	r7, r8			C U0  d twos
117
118	cmoveq	r16, r7, r4		C l0  new_x = d if x>=y
119	cmovne	r16, r0, r1		C u0  y = x if x<y
120	unop				C l   \ force cmoveq into l0
121	unop				C u   /
122
123	C				C cmoveq2 L0, cmovne2 U0
124
125	srl	r4, r8, r0		C U0  x = new_x >> twos
126	bne	r7, L(top)		C U1  stop when d==0
127
128
129L(done):
130	sll	r1, r2, r0		C U0  return y << common_twos
131	ret	r31, (r26), 1		C L0
132
133EPILOGUE()
134ASM_END()
135