1dnl  Intel P5 mpn_sqr_basecase -- square an mpn number.
2
3dnl  Copyright 1999-2002 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular
35C product at around 20x20 limbs.
36
37
38C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
39C
40C Calculate src,size squared, storing the result in dst,2*size.
41C
42C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
43C lot of function call overheads are avoided, especially when the size is
44C small.
45
46defframe(PARAM_SIZE,12)
47defframe(PARAM_SRC, 8)
48defframe(PARAM_DST, 4)
49
50	TEXT
51	ALIGN(8)
52PROLOGUE(mpn_sqr_basecase)
53deflit(`FRAME',0)
54
55	movl	PARAM_SIZE, %edx
56	movl	PARAM_SRC, %eax
57
58	cmpl	$2, %edx
59	movl	PARAM_DST, %ecx
60
61	je	L(two_limbs)
62
63	movl	(%eax), %eax
64	ja	L(three_or_more)
65
66C -----------------------------------------------------------------------------
67C one limb only
68	C eax	src
69	C ebx
70	C ecx	dst
71	C edx
72
73	mull	%eax
74
75	movl	%eax, (%ecx)
76	movl	%edx, 4(%ecx)
77
78	ret
79
80C -----------------------------------------------------------------------------
81	ALIGN(8)
82L(two_limbs):
83	C eax	src
84	C ebx
85	C ecx	dst
86	C edx	size
87
88	pushl	%ebp
89	pushl	%edi
90
91	pushl	%esi
92	pushl	%ebx
93
94	movl	%eax, %ebx
95	movl	(%eax), %eax
96
97	mull	%eax		C src[0]^2
98
99	movl	%eax, (%ecx)	C dst[0]
100	movl	%edx, %esi	C dst[1]
101
102	movl	4(%ebx), %eax
103
104	mull	%eax		C src[1]^2
105
106	movl	%eax, %edi	C dst[2]
107	movl	%edx, %ebp	C dst[3]
108
109	movl	(%ebx), %eax
110
111	mull	4(%ebx)		C src[0]*src[1]
112
113	addl	%eax, %esi
114	popl	%ebx
115
116	adcl	%edx, %edi
117
118	adcl	$0, %ebp
119	addl	%esi, %eax
120
121	adcl	%edi, %edx
122	movl	%eax, 4(%ecx)
123
124	adcl	$0, %ebp
125	popl	%esi
126
127	movl	%edx, 8(%ecx)
128	movl	%ebp, 12(%ecx)
129
130	popl	%edi
131	popl	%ebp
132
133	ret
134
135
136C -----------------------------------------------------------------------------
137	ALIGN(8)
138L(three_or_more):
139	C eax	src low limb
140	C ebx
141	C ecx	dst
142	C edx	size
143
144	cmpl	$4, %edx
145	pushl	%ebx
146deflit(`FRAME',4)
147
148	movl	PARAM_SRC, %ebx
149	jae	L(four_or_more)
150
151
152C -----------------------------------------------------------------------------
153C three limbs
154	C eax	src low limb
155	C ebx	src
156	C ecx	dst
157	C edx	size
158
159	pushl	%ebp
160	pushl	%edi
161
162	mull	%eax		C src[0] ^ 2
163
164	movl	%eax, (%ecx)
165	movl	%edx, 4(%ecx)
166
167	movl	4(%ebx), %eax
168	xorl	%ebp, %ebp
169
170	mull	%eax		C src[1] ^ 2
171
172	movl	%eax, 8(%ecx)
173	movl	%edx, 12(%ecx)
174
175	movl	8(%ebx), %eax
176	pushl	%esi		C risk of cache bank clash
177
178	mull	%eax		C src[2] ^ 2
179
180	movl	%eax, 16(%ecx)
181	movl	%edx, 20(%ecx)
182
183	movl	(%ebx), %eax
184
185	mull	4(%ebx)		C src[0] * src[1]
186
187	movl	%eax, %esi
188	movl	%edx, %edi
189
190	movl	(%ebx), %eax
191
192	mull	8(%ebx)		C src[0] * src[2]
193
194	addl	%eax, %edi
195	movl	%edx, %ebp
196
197	adcl	$0, %ebp
198	movl	4(%ebx), %eax
199
200	mull	8(%ebx)		C src[1] * src[2]
201
202	xorl	%ebx, %ebx
203	addl	%eax, %ebp
204
205	C eax
206	C ebx	zero, will be dst[5]
207	C ecx	dst
208	C edx	dst[4]
209	C esi	dst[1]
210	C edi	dst[2]
211	C ebp	dst[3]
212
213	adcl	$0, %edx
214	addl	%esi, %esi
215
216	adcl	%edi, %edi
217
218	adcl	%ebp, %ebp
219
220	adcl	%edx, %edx
221	movl	4(%ecx), %eax
222
223	adcl	$0, %ebx
224	addl	%esi, %eax
225
226	movl	%eax, 4(%ecx)
227	movl	8(%ecx), %eax
228
229	adcl	%edi, %eax
230	movl	12(%ecx), %esi
231
232	adcl	%ebp, %esi
233	movl	16(%ecx), %edi
234
235	movl	%eax, 8(%ecx)
236	movl	%esi, 12(%ecx)
237
238	adcl	%edx, %edi
239	popl	%esi
240
241	movl	20(%ecx), %eax
242	movl	%edi, 16(%ecx)
243
244	popl	%edi
245	popl	%ebp
246
247	adcl	%ebx, %eax	C no carry out of this
248	popl	%ebx
249
250	movl	%eax, 20(%ecx)
251
252	ret
253
254
255C -----------------------------------------------------------------------------
256	ALIGN(8)
257L(four_or_more):
258	C eax	src low limb
259	C ebx	src
260	C ecx	dst
261	C edx	size
262	C esi
263	C edi
264	C ebp
265	C
266	C First multiply src[0]*src[1..size-1] and store at dst[1..size].
267
268deflit(`FRAME',4)
269
270	pushl	%edi
271FRAME_pushl()
272	pushl	%esi
273FRAME_pushl()
274
275	pushl	%ebp
276FRAME_pushl()
277	leal	(%ecx,%edx,4), %edi	C dst end of this mul1
278
279	leal	(%ebx,%edx,4), %esi	C src end
280	movl	%ebx, %ebp		C src
281
282	negl	%edx			C -size
283	xorl	%ebx, %ebx		C clear carry limb and carry flag
284
285	leal	1(%edx), %ecx		C -(size-1)
286
287L(mul1):
288	C eax	scratch
289	C ebx	carry
290	C ecx	counter, negative
291	C edx	scratch
292	C esi	&src[size]
293	C edi	&dst[size]
294	C ebp	src
295
296	adcl	$0, %ebx
297	movl	(%esi,%ecx,4), %eax
298
299	mull	(%ebp)
300
301	addl	%eax, %ebx
302
303	movl	%ebx, (%edi,%ecx,4)
304	incl	%ecx
305
306	movl	%edx, %ebx
307	jnz	L(mul1)
308
309
310	C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for
311	C n=1..size-2.
312	C
313	C The last two products, which are the end corner of the product
314	C triangle, are handled separately to save looping overhead.  These
315	C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1].
316	C If size is 4 then it's only these that need to be done.
317	C
318	C In the outer loop %esi is a constant, and %edi just advances by 1
319	C limb each time.  The size of the operation decreases by 1 limb
320	C each time.
321
322	C eax
323	C ebx	carry (needing carry flag added)
324	C ecx
325	C edx
326	C esi	&src[size]
327	C edi	&dst[size]
328	C ebp
329
330	adcl	$0, %ebx
331	movl	PARAM_SIZE, %edx
332
333	movl	%ebx, (%edi)
334	subl	$4, %edx
335
336	negl	%edx
337	jz	L(corner)
338
339
340L(outer):
341	C ebx	previous carry limb to store
342	C edx	outer loop counter (negative)
343	C esi	&src[size]
344	C edi	dst, pointing at stored carry limb of previous loop
345
346	pushl	%edx			C new outer loop counter
347	leal	-2(%edx), %ecx
348
349	movl	%ebx, (%edi)
350	addl	$4, %edi
351
352	addl	$4, %ebp
353	xorl	%ebx, %ebx		C initial carry limb, clear carry flag
354
355L(inner):
356	C eax	scratch
357	C ebx	carry (needing carry flag added)
358	C ecx	counter, negative
359	C edx	scratch
360	C esi	&src[size]
361	C edi	dst end of this addmul
362	C ebp	&src[j]
363
364	adcl	$0, %ebx
365	movl	(%esi,%ecx,4), %eax
366
367	mull	(%ebp)
368
369	addl	%ebx, %eax
370	movl	(%edi,%ecx,4), %ebx
371
372	adcl	$0, %edx
373	addl	%eax, %ebx
374
375	movl	%ebx, (%edi,%ecx,4)
376	incl	%ecx
377
378	movl	%edx, %ebx
379	jnz	L(inner)
380
381
382	adcl	$0, %ebx
383	popl	%edx		C outer loop counter
384
385	incl	%edx
386	jnz	L(outer)
387
388
389	movl	%ebx, (%edi)
390
391L(corner):
392	C esi	&src[size]
393	C edi	&dst[2*size-4]
394
395	movl	-8(%esi), %eax
396	movl	-4(%edi), %ebx		C risk of data cache bank clash here
397
398	mull	-12(%esi)		C src[size-2]*src[size-3]
399
400	addl	%eax, %ebx
401	movl	%edx, %ecx
402
403	adcl	$0, %ecx
404	movl	-4(%esi), %eax
405
406	mull	-12(%esi)		C src[size-1]*src[size-3]
407
408	addl	%ecx, %eax
409	movl	(%edi), %ecx
410
411	adcl	$0, %edx
412	movl	%ebx, -4(%edi)
413
414	addl	%eax, %ecx
415	movl	%edx, %ebx
416
417	adcl	$0, %ebx
418	movl	-4(%esi), %eax
419
420	mull	-8(%esi)		C src[size-1]*src[size-2]
421
422	movl	%ecx, (%edi)
423	addl	%eax, %ebx
424
425	adcl	$0, %edx
426	movl	PARAM_SIZE, %eax
427
428	negl	%eax
429	movl	%ebx, 4(%edi)
430
431	addl	$1, %eax		C -(size-1) and clear carry
432	movl	%edx, 8(%edi)
433
434
435C -----------------------------------------------------------------------------
436C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
437
438L(lshift):
439	C eax	counter, negative
440	C ebx	next limb
441	C ecx
442	C edx
443	C esi
444	C edi	&dst[2*size-4]
445	C ebp
446
447	movl	12(%edi,%eax,8), %ebx
448
449	rcll	%ebx
450	movl	16(%edi,%eax,8), %ecx
451
452	rcll	%ecx
453	movl	%ebx, 12(%edi,%eax,8)
454
455	movl	%ecx, 16(%edi,%eax,8)
456	incl	%eax
457
458	jnz	L(lshift)
459
460
461	adcl	%eax, %eax		C high bit out
462	movl	PARAM_SRC, %esi
463
464	movl	PARAM_SIZE, %ecx	C risk of cache bank clash
465	movl	%eax, 12(%edi)		C dst most significant limb
466
467
468C -----------------------------------------------------------------------------
469C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
470C src[size-1]^2.  dst[0] hasn't yet been set at all yet, and just gets the
471C low limb of src[0]^2.
472
473	movl	(%esi), %eax		C src[0]
474	leal	(%esi,%ecx,4), %esi	C src end
475
476	negl	%ecx
477
478	mull	%eax
479
480	movl	%eax, 16(%edi,%ecx,8)	C dst[0]
481	movl	%edx, %ebx
482
483	addl	$1, %ecx		C size-1 and clear carry
484
485L(diag):
486	C eax	scratch (low product)
487	C ebx	carry limb
488	C ecx	counter, negative
489	C edx	scratch (high product)
490	C esi	&src[size]
491	C edi	&dst[2*size-4]
492	C ebp	scratch (fetched dst limbs)
493
494	movl	(%esi,%ecx,4), %eax
495	adcl	$0, %ebx
496
497	mull	%eax
498
499	movl	16-4(%edi,%ecx,8), %ebp
500
501	addl	%ebp, %ebx
502	movl	16(%edi,%ecx,8), %ebp
503
504	adcl	%eax, %ebp
505	movl	%ebx, 16-4(%edi,%ecx,8)
506
507	movl	%ebp, 16(%edi,%ecx,8)
508	incl	%ecx
509
510	movl	%edx, %ebx
511	jnz	L(diag)
512
513
514	adcl	$0, %edx
515	movl	16-4(%edi), %eax	C dst most significant limb
516
517	addl	%eax, %edx
518	popl	%ebp
519
520	movl	%edx, 16-4(%edi)
521	popl	%esi		C risk of cache bank clash
522
523	popl	%edi
524	popl	%ebx
525
526	ret
527
528EPILOGUE()
529