1dnl  HP-PA 2.0 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and
2dnl  subtract the result from a second limb vector.
3
4dnl  Copyright 1998, 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of the GNU Lesser General Public License as published
10dnl  by the Free Software Foundation; either version 3 of the License, or (at
11dnl  your option) any later version.
12
13dnl  The GNU MP Library is distributed in the hope that it will be useful, but
14dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16dnl  License for more details.
17
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23C		    cycles/limb
24C 8000,8200:		7
25C 8500,8600,8700:	6.5
26
27C  The feed-in and wind-down code has not yet been scheduled.  Many cycles
28C  could be saved there per call.
29
30C  DESCRIPTION:
31C  The main loop "BIG" is 4-way unrolled, mainly to allow
32C  effective use of ADD,DC.  Delays in moving data via the cache from the FP
33C  registers to the IU registers have demanded a deep software pipeline, and
34C  a lot of stack slots for partial products in flight.
35C
36C  CODE STRUCTURE:
37C  save-some-registers
38C  do 0, 1, 2, or 3 limbs
39C  if done, restore-some-regs and return
40C  save-many-regs
41C  do 4, 8, ... limbs
42C  restore-all-regs
43
44C  STACK LAYOUT:
45C  HP-PA stack grows upwards.  We could allocate 8 fewer slots by using the
46C  slots marked FREE, as well as some slots in the caller's "frame marker".
47C
48C -00 <- r30
49C -08  FREE
50C -10  tmp
51C -18  tmp
52C -20  tmp
53C -28  tmp
54C -30  tmp
55C -38  tmp
56C -40  tmp
57C -48  tmp
58C -50  tmp
59C -58  tmp
60C -60  tmp
61C -68  tmp
62C -70  tmp
63C -78  tmp
64C -80  tmp
65C -88  tmp
66C -90  FREE
67C -98  FREE
68C -a0  FREE
69C -a8  FREE
70C -b0  r13
71C -b8  r12
72C -c0  r11
73C -c8  r10
74C -d0  r9
75C -d8  r8
76C -e0  r7
77C -e8  r6
78C -f0  r5
79C -f8  r4
80C -100 r3
81C  Previous frame:
82C  [unused area]
83C -38/-138 vlimb home slot.  For 2.0N, the vlimb arg will arrive here.
84
85
86include(`../config.m4')
87
88C INPUT PARAMETERS:
89define(`rp',`%r26')	C limb vector that is loaded, subtracted from and stored back
90define(`up',`%r25')	C source limb vector whose limbs are multiplied
91define(`n',`%r24')	C limb count
92define(`vlimb',`%r23')	C the single multiplier limb
93
94define(`climb',`%r23')	C carry limb passed between limbs and returned; same register as vlimb
95
C Select the assembler level matching the ABI being built.
96ifdef(`HAVE_ABI_2_0w',
97`	.level	2.0w
98',`	.level	2.0
99')
C Entry: get vlimb into %fr8, save r3-r5, clear the carry limb, then dispatch
C on n mod 4.
100PROLOGUE(mpn_submul_1)
101
102ifdef(`HAVE_ABI_2_0w',
103`	std		vlimb, -0x38(%r30)	C store vlimb into "home" slot
104')
105	std,ma		%r3, 0x100(%r30)	C save r3, then advance r30 by 0x100
106	std		%r4, -0xf8(%r30)	C save r4
107	std		%r5, -0xf0(%r30)	C save r5
108	ldo		0(%r0), climb		C clear climb
109	fldd		-0x138(%r30), %fr8	C put vlimb in fp register
110
C Register aliases used by the code that handles the n mod 4 leftover limbs.
111define(`p032a1',`%r1')	C mid product reloaded from -0x78
112define(`p032a2',`%r19')	C mid product reloaded from -0x70
113
114define(`m032',`%r20')	C sum of the two mid products
115define(`m096',`%r21')	C carry out of that sum
116
117define(`p000a',`%r22')	C low product reloaded from -0x80
118define(`p064a',`%r29')	C high product reloaded from -0x68
119
120define(`s000',`%r31')	C result limb being assembled
121
122define(`ma000',`%r4')	C low 32 bits of m032 moved to the upper half
123define(`ma064',`%r20')	C upper 32 bits of m032 with m096 above them; same register as m032
124
125define(`r000',`%r3')	C limb loaded from rp
126
127	extrd,u		n, 63, 2, %r5	C r5 = low two bits of n
128	cmpb,=		%r5, %r0, L(BIG)	C nothing left over: go to the unrolled code
129	nop	C fills the branch delay slot
130
C First leftover limb: form its four 32x32-bit partial products with vlimb
C and store them in stack temporaries.
131	fldd		0(up), %fr4
132	ldo		8(up), up
133	xmpyu		%fr8R, %fr4L, %fr22
134	xmpyu		%fr8L, %fr4R, %fr23
135	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
136	xmpyu		%fr8R, %fr4R, %fr24
137	xmpyu		%fr8L, %fr4L, %fr25
138	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
139	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
140	addib,<>	-1, %r5, L(two_or_more)	C decrement leftover count, branch if limbs remain
141	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
C Exactly one leftover limb: reload its four products into integer registers
C and finish at 0_one_out.
142LDEF(one)
143	ldd		-0x78(%r30), p032a1
144	ldd		-0x70(%r30), p032a2
145	ldd		-0x80(%r30), p000a
146	b		L(0_one_out)
147	ldd		-0x68(%r30), p064a	C sits in the branch delay slot
148
C Second leftover limb: multiply it while reloading the products of the first
C limb.  Each ldd reads a stack slot just before the fstd that overwrites it.
149LDEF(two_or_more)
150	fldd		0(up), %fr4
151	ldo		8(up), up
152	xmpyu		%fr8R, %fr4L, %fr22
153	xmpyu		%fr8L, %fr4R, %fr23
154	ldd		-0x78(%r30), p032a1
155	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
156	xmpyu		%fr8R, %fr4R, %fr24
157	xmpyu		%fr8L, %fr4L, %fr25
158	ldd		-0x70(%r30), p032a2
159	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
160	ldd		-0x80(%r30), p000a
161	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
162	ldd		-0x68(%r30), p064a
163	addib,<>	-1, %r5, L(three_or_more)	C decrement leftover count, branch if a third limb remains
164	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
C Exactly two leftover limbs: combine the mid products of the first limb,
C fetch rp[0] and continue at 0_two_out.
165LDEF(two)
166	add		p032a1, p032a2, m032	C sum the two mid products
167	add,dc		%r0, %r0, m096	C capture the carry of that sum
168	depd,z		m032, 31, 32, ma000	C low half of m032 into upper half of ma000, rest zero
169	extrd,u		m032, 31, 32, ma064	C upper half of m032 into lower half of ma064
170	ldd		0(rp), r000
171	b		L(0_two_out)
172	depd		m096, 31, 32, ma064	C carry above it; sits in the branch delay slot
173
C Third leftover limb: load it and combine the mid products of the first limb.
C The loop0 body below is entirely commented out, so execution falls through
C from here into 0_out.
174LDEF(three_or_more)
175	fldd		0(up), %fr4
176	add		p032a1, p032a2, m032	C sum the two mid products
177	add,dc		%r0, %r0, m096	C capture the carry of that sum
178	depd,z		m032, 31, 32, ma000	C low half of m032 into upper half of ma000, rest zero
179	extrd,u		m032, 31, 32, ma064	C upper half of m032 into lower half of ma064
180	ldd		0(rp), r000
181C	addib,=		-1, %r5, L(0_out)
182	depd		m096, 31, 32, ma064	C carry goes above the extracted half
183LDEF(loop0)
184C	xmpyu		%fr8R, %fr4L, %fr22
185C	xmpyu		%fr8L, %fr4R, %fr23
186C	ldd		-0x78(%r30), p032a1
187C	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
188C
189C	xmpyu		%fr8R, %fr4R, %fr24
190C	xmpyu		%fr8L, %fr4L, %fr25
191C	ldd		-0x70(%r30), p032a2
192C	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
193C
194C	ldo		8(rp), rp
195C	add		climb, p000a, s000
196C	ldd		-0x80(%r30), p000a
197C	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
198C
199C	add,dc		p064a, %r0, climb
200C	ldo		8(up), up
201C	ldd		-0x68(%r30), p064a
202C	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
203C
204C	add		ma000, s000, s000
205C	add,dc		ma064, climb, climb
206C	fldd		0(up), %fr4
207C
208C	sub		r000, s000, s000
209C	sub,db		%r0, climb, climb
210C	sub		%r0, climb, climb
211C	std		s000, -8(rp)
212C
213C	add		p032a1, p032a2, m032
214C	add,dc		%r0, %r0, m096
215C
216C	depd,z		m032, 31, 32, ma000
217C	extrd,u		m032, 31, 32, ma064
218C	ldd		0(rp), r000
219C	addib,<>	-1, %r5, L(loop0)
220C	depd		m096, 31, 32, ma064
C Wind-down step used when there are three leftover limbs: multiply the limb
C already in %fr4, finish and store one result limb, and prepare the mid
C product sum for the next limb.
221LDEF(0_out)
222	ldo		8(up), up
223	xmpyu		%fr8R, %fr4L, %fr22
224	xmpyu		%fr8L, %fr4R, %fr23
225	ldd		-0x78(%r30), p032a1
226	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
227	xmpyu		%fr8R, %fr4R, %fr24
228	xmpyu		%fr8L, %fr4L, %fr25
229	ldd		-0x70(%r30), p032a2
230	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
231	ldo		8(rp), rp
232	add		climb, p000a, s000	C carry limb plus low product
233	ldd		-0x80(%r30), p000a
234	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
235	add,dc		p064a, %r0, climb	C high product plus carry becomes the new carry limb
236	ldd		-0x68(%r30), p064a
237	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
238	add		ma000, s000, s000	C add the realigned mid sum, low part
239	add,dc		ma064, climb, climb	C add the realigned mid sum, high part
240	sub		r000, s000, s000	C s000 = r000 - s000
241	sub,db		%r0, climb, climb	C climb = 0 - climb - borrow
242	sub		%r0, climb, climb	C negate back: climb now includes the borrow
243	std		s000, -8(rp)	C rp was already advanced above
244	add		p032a1, p032a2, m032	C mid product sum for the next limb
245	add,dc		%r0, %r0, m096
246	depd,z		m032, 31, 32, ma000
247	extrd,u		m032, 31, 32, ma064
248	ldd		0(rp), r000
249	depd		m096, 31, 32, ma064
C Wind-down step with two limbs outstanding: reload the products of the last
C multiplied limb, then finish and store the limb before it.
250LDEF(0_two_out)
251	ldd		-0x78(%r30), p032a1
252	ldd		-0x70(%r30), p032a2
253	ldo		8(rp), rp
254	add		climb, p000a, s000	C carry limb plus low product
255	ldd		-0x80(%r30), p000a
256	add,dc		p064a, %r0, climb	C high product plus carry becomes the new carry limb
257	ldd		-0x68(%r30), p064a
258	add		ma000, s000, s000	C add the realigned mid sum, low part
259	add,dc		ma064, climb, climb	C add the realigned mid sum, high part
260	sub		r000, s000, s000	C s000 = r000 - s000
261	sub,db		%r0, climb, climb	C climb = 0 - climb - borrow
262	sub		%r0, climb, climb	C negate back: climb now includes the borrow
263	std		s000, -8(rp)	C rp was already advanced above
C Last leftover limb: combine its four products with the carry limb, subtract
C from the limb at rp and store.  Then leave if n was less than 4.
264LDEF(0_one_out)
265	add		p032a1, p032a2, m032	C sum the two mid products
266	add,dc		%r0, %r0, m096	C capture the carry of that sum
267	depd,z		m032, 31, 32, ma000	C low half of m032 into upper half of ma000, rest zero
268	extrd,u		m032, 31, 32, ma064	C upper half of m032 into lower half of ma064
269	ldd		0(rp), r000
270	depd		m096, 31, 32, ma064	C carry goes above the extracted half
271
272	add		climb, p000a, s000	C carry limb plus low product
273	add,dc		p064a, %r0, climb	C high product plus carry becomes the new carry limb
274	add		ma000, s000, s000	C add the realigned mid sum, low part
275	add,dc		ma064, climb, climb	C add the realigned mid sum, high part
276	sub		r000, s000, s000	C s000 = r000 - s000
277	sub,db		%r0, climb, climb	C climb = 0 - climb - borrow
278	sub		%r0, climb, climb	C negate back: climb now includes the borrow
279	std		s000, 0(rp)
280
281	cmpib,>=	4, n, L(done)	C branch to done when 4 >= n; n is not a multiple of 4 here
282	ldo		8(rp), rp	C step past the stored limb; sits in the branch delay slot
283
284C 4-way unrolled code.
285
C Setup for the 4-way unrolled code: rebind the register aliases, save
C r6-r13 and convert n into a count of 4-limb groups.
286LDEF(BIG)
287
288define(`p032a1',`%r1')	C mid product of limb 0, slot -0x78
289define(`p032a2',`%r19')	C mid product of limb 0, slot -0x70
290define(`p096b1',`%r20')	C mid product of limb 1, slot -0x38
291define(`p096b2',`%r21')	C mid product of limb 1, slot -0x30
292define(`p160c1',`%r22')	C mid product of limb 2, slot -0x58
293define(`p160c2',`%r29')	C mid product of limb 2, slot -0x50
294define(`p224d1',`%r31')	C mid product of limb 3, slot -0x18
295define(`p224d2',`%r3')	C mid product of limb 3, slot -0x10
296			C
297define(`m032',`%r4')	C mid product sum of limb 0
298define(`m096',`%r5')	C mid product sum of limb 1
299define(`m160',`%r6')	C mid product sum of limb 2
300define(`m224',`%r7')	C mid product sum of limb 3
301define(`m288',`%r8')	C carry out of the mid product sum chain
302			C
303define(`p000a',`%r1')	C low product of limb 0, slot -0x80
304define(`p064a',`%r19')	C high product of limb 0, slot -0x68
305define(`p064b',`%r20')	C low product of limb 1, slot -0x40
306define(`p128b',`%r21')	C high product of limb 1, slot -0x28
307define(`p128c',`%r22')	C low product of limb 2, slot -0x60
308define(`p192c',`%r29')	C high product of limb 2, slot -0x48
309define(`p192d',`%r31')	C low product of limb 3, slot -0x20
310define(`p256d',`%r3')	C high product of limb 3, slot -0x88
311			C
312define(`s000',`%r10')	C result limb 0 being assembled
313define(`s064',`%r11')	C result limb 1 being assembled
314define(`s128',`%r12')	C result limb 2 being assembled
315define(`s192',`%r13')	C result limb 3 being assembled
316			C
317define(`ma000',`%r9')	C mid sums realigned by 32 bits, part for limb 0
318define(`ma064',`%r4')	C realigned mid sums, part for limb 1
319define(`ma128',`%r5')	C realigned mid sums, part for limb 2
320define(`ma192',`%r6')	C realigned mid sums, part for limb 3
321define(`ma256',`%r7')	C realigned mid sums, part added into climb
322			C
323define(`r000',`%r1')	C limb loaded from 0(rp)
324define(`r064',`%r19')	C limb loaded from 8(rp)
325define(`r128',`%r20')	C limb loaded from 16(rp)
326define(`r192',`%r21')	C limb loaded from 24(rp)
327
C Save the additional registers used by the unrolled code.
328	std		%r6, -0xe8(%r30)
329	std		%r7, -0xe0(%r30)
330	std		%r8, -0xd8(%r30)
331	std		%r9, -0xd0(%r30)
332	std		%r10, -0xc8(%r30)
333	std		%r11, -0xc0(%r30)
334	std		%r12, -0xb8(%r30)
335	std		%r13, -0xb0(%r30)
336
C Turn n into the number of 4-limb groups.
337ifdef(`HAVE_ABI_2_0w',
338`	extrd,u		n, 61, 62, n		C right shift 2
339',`	extrd,u		n, 61, 30, n		C right shift 2, zero extend
340')
341
C Feed-in for the unrolled code: load four limbs of up and form all sixteen
C 32x32-bit partial products.  If this is the only group, store every
C product, reload the mid products and finish at end1.
342LDEF(4_or_more)
343	fldd		0(up), %fr4
344	fldd		8(up), %fr5
345	fldd		16(up), %fr6
346	fldd		24(up), %fr7
347	xmpyu		%fr8R, %fr4L, %fr22
348	xmpyu		%fr8L, %fr4R, %fr23
349	xmpyu		%fr8R, %fr5L, %fr24
350	xmpyu		%fr8L, %fr5R, %fr25
351	xmpyu		%fr8R, %fr6L, %fr26
352	xmpyu		%fr8L, %fr6R, %fr27
353	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
354	xmpyu		%fr8R, %fr7L, %fr28
355	xmpyu		%fr8L, %fr7R, %fr29
356	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
357	xmpyu		%fr8R, %fr4R, %fr30
358	xmpyu		%fr8L, %fr4L, %fr31
359	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
360	xmpyu		%fr8R, %fr5R, %fr22
361	xmpyu		%fr8L, %fr5L, %fr23
362	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
363	xmpyu		%fr8R, %fr6R, %fr24
364	xmpyu		%fr8L, %fr6L, %fr25
365	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
366	xmpyu		%fr8R, %fr7R, %fr26
367	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
368	addib,<>	-1, n, L(8_or_more)	C decrement group count, branch if more groups remain
369	xmpyu		%fr8L, %fr7L, %fr27	C sits in the branch delay slot
370	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
371	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
372	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
373	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
374	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
375	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
376	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
377	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
378	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
379	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
380	ldd		-0x78(%r30), p032a1
381	ldd		-0x70(%r30), p032a2
382	ldd		-0x38(%r30), p096b1
383	ldd		-0x30(%r30), p096b2
384	ldd		-0x58(%r30), p160c1
385	ldd		-0x50(%r30), p160c2
386	ldd		-0x18(%r30), p224d1
387	ldd		-0x10(%r30), p224d2
388	b		L(end1)
389	nop	C fills the branch delay slot
390
C At least two groups: store the remaining products of the first group, then
C multiply the second group while reloading the mid products of the first.
C Each ldd of a mid product slot comes before the fstd that overwrites it.
391LDEF(8_or_more)
392	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
393	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
394	ldo		32(up), up	C advance up to the next group
395	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
396	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
397	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
398	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
399	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
400	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
401	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
402	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
403	fldd		0(up), %fr4
404	fldd		8(up), %fr5
405	fldd		16(up), %fr6
406	fldd		24(up), %fr7
407	xmpyu		%fr8R, %fr4L, %fr22
408	ldd		-0x78(%r30), p032a1
409	xmpyu		%fr8L, %fr4R, %fr23
410	xmpyu		%fr8R, %fr5L, %fr24
411	ldd		-0x70(%r30), p032a2
412	xmpyu		%fr8L, %fr5R, %fr25
413	xmpyu		%fr8R, %fr6L, %fr26
414	ldd		-0x38(%r30), p096b1
415	xmpyu		%fr8L, %fr6R, %fr27
416	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
417	xmpyu		%fr8R, %fr7L, %fr28
418	ldd		-0x30(%r30), p096b2
419	xmpyu		%fr8L, %fr7R, %fr29
420	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
421	xmpyu		%fr8R, %fr4R, %fr30
422	ldd		-0x58(%r30), p160c1
423	xmpyu		%fr8L, %fr4L, %fr31
424	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
425	xmpyu		%fr8R, %fr5R, %fr22
426	ldd		-0x50(%r30), p160c2
427	xmpyu		%fr8L, %fr5L, %fr23
428	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
429	xmpyu		%fr8R, %fr6R, %fr24
430	ldd		-0x18(%r30), p224d1
431	xmpyu		%fr8L, %fr6L, %fr25
432	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
433	xmpyu		%fr8R, %fr7R, %fr26
434	ldd		-0x10(%r30), p224d2
435	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
436	addib,=		-1, n, L(end2)	C decrement group count, skip the loop when it reaches zero
437	xmpyu		%fr8L, %fr7L, %fr27	C sits in the branch delay slot
C Main loop, one 4-limb group per pass.  Each pass combines the partial
C products already on the stack into four result limbs and stores them,
C spills the products still held in FP registers, and loads and multiplies
C the next four limbs of up.  Instruction order and register reuse are tight
C here; an alias is only reloaded after its previous value has been consumed.
438LDEF(loop)
439	add		p032a1, p032a2, m032	C start the mid product sum chain
440	ldd		-0x80(%r30), p000a
441	add,dc		p096b1, p096b2, m096
442	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
443
444	add,dc		p160c1, p160c2, m160
445	ldd		-0x68(%r30), p064a
446	add,dc		p224d1, p224d2, m224
447	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
448
449	add,dc		%r0, %r0, m288	C carry out of the mid product sum chain
450	ldd		-0x40(%r30), p064b
451	ldo		32(up), up	C advance up to the next group
452	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
453
454	depd,z		m032, 31, 32, ma000	C realign the mid sums by 32 bits, starting here
455	ldd		-0x28(%r30), p128b
456	extrd,u		m032, 31, 32, ma064
457	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
458
459	depd		m096, 31, 32, ma064
460	ldd		-0x60(%r30), p128c
461	extrd,u		m096, 31, 32, ma128
462	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
463
464	depd		m160, 31, 32, ma128
465	ldd		-0x48(%r30), p192c
466	extrd,u		m160, 31, 32, ma192
467	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
468
469	depd		m224, 31, 32, ma192
470	ldd		-0x20(%r30), p192d
471	extrd,u		m224, 31, 32, ma256
472	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
473
474	depd		m288, 31, 32, ma256
475	ldd		-0x88(%r30), p256d
476	add		climb, p000a, s000	C incoming carry limb plus low product of limb 0
477	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
478
479	add,dc		p064a, p064b, s064	C high of limb 0 plus low of limb 1
480	ldd		0(rp), r000
481	add,dc		p128b, p128c, s128	C high of limb 1 plus low of limb 2
482	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
483
484	add,dc		p192c, p192d, s192	C high of limb 2 plus low of limb 3
485	ldd		8(rp), r064
486	add,dc		p256d, %r0, climb	C high of limb 3 plus carry becomes the new carry limb
487	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
488
489	ldd		16(rp), r128
490	add		ma000, s000, s000	C accum mid 0
491	ldd		24(rp), r192
492	add,dc		ma064, s064, s064	C accum mid 1
493
494	add,dc		ma128, s128, s128	C accum mid 2
495	fldd		0(up), %fr4
496	add,dc		ma192, s192, s192	C accum mid 3
497	fldd		8(up), %fr5
498
499	add,dc		ma256, climb, climb
500	fldd		16(up), %fr6
501	sub		r000, s000, s000	C accum rlimb 0
502	fldd		24(up), %fr7
503
504	sub,db		r064, s064, s064	C accum rlimb 1
505	sub,db		r128, s128, s128	C accum rlimb 2
506	std		s000, 0(rp)
507
508	sub,db		r192, s192, s192	C accum rlimb 3
509	sub,db		%r0, climb, climb	C climb = 0 - climb - borrow
510	sub		%r0, climb, climb	C negate back: climb now includes the borrow
511	std		s064, 8(rp)
512
513	xmpyu		%fr8R, %fr4L, %fr22
514	ldd		-0x78(%r30), p032a1
515	xmpyu		%fr8L, %fr4R, %fr23
516	std		s128, 16(rp)
517
518	xmpyu		%fr8R, %fr5L, %fr24
519	ldd		-0x70(%r30), p032a2
520	xmpyu		%fr8L, %fr5R, %fr25
521	std		s192, 24(rp)
522
523	xmpyu		%fr8R, %fr6L, %fr26
524	ldd		-0x38(%r30), p096b1
525	xmpyu		%fr8L, %fr6R, %fr27
526	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
527
528	xmpyu		%fr8R, %fr7L, %fr28
529	ldd		-0x30(%r30), p096b2
530	xmpyu		%fr8L, %fr7R, %fr29
531	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
532
533	xmpyu		%fr8R, %fr4R, %fr30
534	ldd		-0x58(%r30), p160c1
535	xmpyu		%fr8L, %fr4L, %fr31
536	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
537
538	xmpyu		%fr8R, %fr5R, %fr22
539	ldd		-0x50(%r30), p160c2
540	xmpyu		%fr8L, %fr5L, %fr23
541	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
542
543	xmpyu		%fr8R, %fr6R, %fr24
544	ldd		-0x18(%r30), p224d1
545	xmpyu		%fr8L, %fr6L, %fr25
546	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
547
548	xmpyu		%fr8R, %fr7R, %fr26
549	ldd		-0x10(%r30), p224d2
550	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
551	xmpyu		%fr8L, %fr7L, %fr27
552
553	addib,<>	-1, n, L(loop)	C decrement group count, repeat while nonzero
554	ldo		32(rp), rp	C advance rp past the stored group; sits in the branch delay slot
555
C Wind-down with two groups in flight: finish and store the group whose
C products are on the stack, spill the products of the last group from the
C FP registers, and reload its mid products for end1.  No new limbs of up
C are loaded here.
556LDEF(end2)
557	add		p032a1, p032a2, m032	C start the mid product sum chain
558	ldd		-0x80(%r30), p000a
559	add,dc		p096b1, p096b2, m096
560	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
561	add,dc		p160c1, p160c2, m160
562	ldd		-0x68(%r30), p064a
563	add,dc		p224d1, p224d2, m224
564	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
565	add,dc		%r0, %r0, m288	C carry out of the mid product sum chain
566	ldd		-0x40(%r30), p064b
567	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
568	depd,z		m032, 31, 32, ma000	C realign the mid sums by 32 bits, starting here
569	ldd		-0x28(%r30), p128b
570	extrd,u		m032, 31, 32, ma064
571	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
572	depd		m096, 31, 32, ma064
573	ldd		-0x60(%r30), p128c
574	extrd,u		m096, 31, 32, ma128
575	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
576	depd		m160, 31, 32, ma128
577	ldd		-0x48(%r30), p192c
578	extrd,u		m160, 31, 32, ma192
579	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
580	depd		m224, 31, 32, ma192
581	ldd		-0x20(%r30), p192d
582	extrd,u		m224, 31, 32, ma256
583	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
584	depd		m288, 31, 32, ma256
585	ldd		-0x88(%r30), p256d
586	add		climb, p000a, s000	C incoming carry limb plus low product of limb 0
587	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
588	add,dc		p064a, p064b, s064
589	ldd		0(rp), r000
590	add,dc		p128b, p128c, s128
591	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
592	add,dc		p192c, p192d, s192
593	ldd		8(rp), r064
594	add,dc		p256d, %r0, climb	C high of limb 3 plus carry becomes the new carry limb
595	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
596	ldd		16(rp), r128
597	add		ma000, s000, s000	C accum mid 0
598	ldd		24(rp), r192
599	add,dc		ma064, s064, s064	C accum mid 1
600	add,dc		ma128, s128, s128	C accum mid 2
601	add,dc		ma192, s192, s192	C accum mid 3
602	add,dc		ma256, climb, climb
603	sub		r000, s000, s000	C accum rlimb 0
604	sub,db		r064, s064, s064	C accum rlimb 1
605	sub,db		r128, s128, s128	C accum rlimb 2
606	std		s000, 0(rp)
607	sub,db		r192, s192, s192	C accum rlimb 3
608	sub,db		%r0, climb, climb	C climb = 0 - climb - borrow
609	sub		%r0, climb, climb	C negate back: climb now includes the borrow
610	std		s064, 8(rp)
611	ldd		-0x78(%r30), p032a1
612	std		s128, 16(rp)
613	ldd		-0x70(%r30), p032a2
614	std		s192, 24(rp)
615	ldd		-0x38(%r30), p096b1
616	ldd		-0x30(%r30), p096b2
617	ldd		-0x58(%r30), p160c1
618	ldd		-0x50(%r30), p160c2
619	ldd		-0x18(%r30), p224d1
620	ldd		-0x10(%r30), p224d2
621	ldo		32(rp), rp	C advance rp to the final group
622
C Final group: all sixteen partial products are on the stack and the mid
C products are already in registers.  Combine them, subtract from the four
C limbs at rp, store, and leave the outgoing carry in climb.
623LDEF(end1)
624	add		p032a1, p032a2, m032	C start the mid product sum chain
625	ldd		-0x80(%r30), p000a
626	add,dc		p096b1, p096b2, m096
627	add,dc		p160c1, p160c2, m160
628	ldd		-0x68(%r30), p064a
629	add,dc		p224d1, p224d2, m224
630	add,dc		%r0, %r0, m288	C carry out of the mid product sum chain
631	ldd		-0x40(%r30), p064b
632	depd,z		m032, 31, 32, ma000	C realign the mid sums by 32 bits, starting here
633	ldd		-0x28(%r30), p128b
634	extrd,u		m032, 31, 32, ma064
635	depd		m096, 31, 32, ma064
636	ldd		-0x60(%r30), p128c
637	extrd,u		m096, 31, 32, ma128
638	depd		m160, 31, 32, ma128
639	ldd		-0x48(%r30), p192c
640	extrd,u		m160, 31, 32, ma192
641	depd		m224, 31, 32, ma192
642	ldd		-0x20(%r30), p192d
643	extrd,u		m224, 31, 32, ma256
644	depd		m288, 31, 32, ma256
645	ldd		-0x88(%r30), p256d
646	add		climb, p000a, s000	C incoming carry limb plus low product of limb 0
647	add,dc		p064a, p064b, s064
648	ldd		0(rp), r000
649	add,dc		p128b, p128c, s128
650	add,dc		p192c, p192d, s192
651	ldd		8(rp), r064
652	add,dc		p256d, %r0, climb	C high of limb 3 plus carry becomes the new carry limb
653	ldd		16(rp), r128
654	add		ma000, s000, s000	C accum mid 0
655	ldd		24(rp), r192
656	add,dc		ma064, s064, s064	C accum mid 1
657	add,dc		ma128, s128, s128	C accum mid 2
658	add,dc		ma192, s192, s192	C accum mid 3
659	add,dc		ma256, climb, climb
660	sub		r000, s000, s000	C accum rlimb 0
661	sub,db		r064, s064, s064	C accum rlimb 1
662	sub,db		r128, s128, s128	C accum rlimb 2
663	std		s000, 0(rp)
664	sub,db		r192, s192, s192	C accum rlimb 3
665	sub,db		%r0, climb, climb	C climb = 0 - climb - borrow
666	sub		%r0, climb, climb	C negate back: climb now includes the borrow
667	std		s064, 8(rp)
668	std		s128, 16(rp)
669	std		s192, 24(rp)
670
C Restore the registers saved for the unrolled code.
671	ldd		-0xb0(%r30), %r13
672	ldd		-0xb8(%r30), %r12
673	ldd		-0xc0(%r30), %r11
674	ldd		-0xc8(%r30), %r10
675	ldd		-0xd0(%r30), %r9
676	ldd		-0xd8(%r30), %r8
677	ldd		-0xe0(%r30), %r7
678	ldd		-0xe8(%r30), %r6
C Common exit.  The n < 4 path branches here directly, having saved only
C r3-r5.  The 2.0w build returns climb whole in r28; otherwise its low 32
C bits go to r29 and its high 32 bits to r28.
679LDEF(done)
680ifdef(`HAVE_ABI_2_0w',
681`	copy		climb, %r28
682',`	extrd,u		climb, 63, 32, %r29
683	extrd,u		climb, 31, 32, %r28
684')
685	ldd		-0xf0(%r30), %r5	C restore r5
686	ldd		-0xf8(%r30), %r4	C restore r4
687	bve		(%r2)	C return through r2
688	ldd,mb		-0x100(%r30), %r3	C step r30 back by 0x100 and restore r3; sits in the branch delay slot
689EPILOGUE(mpn_submul_1)
690