#ifdef L__divtf3
// Compute an 80-bit IEEE double-extended quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend (a).  farg1 holds the divisor (b).
// The quotient is returned in fret0.
//
// frcpa writes y0, an approximation of 1/b, to f10 and sets p6 when that
// approximation has to be refined.  When frcpa clears p6 the refinement is
// skipped and f10 is returned unchanged through the p7 path.
//
// Notation used in the comments below:
//   y = reciprocal estimate	e = 1 - b*y (error of y)
//   q = quotient estimate	r = a - b*q (remainder of q)
// Scratch registers: f10-f14, p6, p7.

	.text
	.align 16
	.global __divtf3
	.proc __divtf3
__divtf3:
	// Start with p7 = 1 (special case) until frcpa reports otherwise.
	cmp.eq p7, p0 = r0, r0
	frcpa.s0 f10, p6 = farg0, farg1		// y0
	;;
	// Refinement needed: clear p7 so exactly one of p6/p7 is set.
(p6)	cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
(p6)	fnma.s1 f11 = farg1, f10, f1		// e  = 1 - b*y0
(p6)	fma.s1 f12 = farg0, f10, f0		// q0 = a*y0
	;;
(p6)	fma.s1 f13 = f11, f11, f0		// e^2
(p6)	fma.s1 f14 = f11, f11, f11		// e + e^2
	;;
(p6)	fma.s1 f11 = f13, f13, f11		// e + e^4
(p6)	fma.s1 f13 = f14, f10, f10		// y1 = y0 + y0*(e + e^2)
	;;
(p6)	fma.s1 f10 = f13, f11, f10		// y2 = y0 + y1*(e + e^4)
(p6)	fnma.s1 f11 = farg1, f12, farg0		// r0 = a - b*q0
	;;
(p6)	fma.s1 f11 = f11, f10, f12		// q1 = q0 + r0*y2
(p6)	fnma.s1 f12 = farg1, f10, f1		// E  = 1 - b*y2
	;;
(p6)	fma.s1 f10 = f12, f10, f10		// y3 = y2 + E*y2
(p6)	fnma.s1 f12 = farg1, f11, farg0		// r1 = a - b*q1
	;;
	// Final correction step; the only refinement op issued with .s0.
(p6)	fma.s0 fret0 = f12, f10, f11		// q  = q1 + r1*y3
(p7)	mov fret0 = f10
	br.ret.sptk rp
	;;
	.endp __divtf3
#endif
42
#ifdef L__divdf3
// Compute a 64-bit IEEE double quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend (a).  farg1 holds the divisor (b).
// The quotient is returned in fret0.
//
// frcpa writes y0, an approximation of 1/b, to f10 and sets p6 when that
// approximation has to be refined.  When frcpa clears p6 the refinement is
// skipped and f10 is returned unchanged through the p7 path.
//
// Notation used in the comments below:
//   y = reciprocal estimate	e = 1 - b*y0 (error of y0)
//   q = quotient estimate	r = a - b*q  (remainder of q)
// Scratch registers: f10-f13, p6, p7.  f8 (farg0/fret0) is reused as a
// temporary for the remainder.

	.text
	.align 16
	.global __divdf3
	.proc __divdf3
__divdf3:
	// Start with p7 = 1 (special case) until frcpa reports otherwise.
	cmp.eq p7, p0 = r0, r0
	frcpa.s0 f10, p6 = farg0, farg1		// y0
	;;
	// Refinement needed: clear p7 so exactly one of p6/p7 is set.
(p6)	cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
(p6)	fmpy.s1 f11 = farg0, f10		// q0 = a*y0
(p6)	fnma.s1 f12 = farg1, f10, f1		// e  = 1 - b*y0
	;;
(p6)	fma.s1 f11 = f12, f11, f11		// q1 = q0 + e*q0
(p6)	fmpy.s1 f13 = f12, f12			// e^2
	;;
(p6)	fma.s1 f10 = f12, f10, f10		// y1 = y0 + e*y0
(p6)	fma.s1 f11 = f13, f11, f11		// q2 = q1 + e^2*q1
	;;
(p6)	fmpy.s1 f12 = f13, f13			// e^4
(p6)	fma.s1 f10 = f13, f10, f10		// y2 = y1 + e^2*y1
	;;
(p6)	fma.d.s1 f11 = f12, f11, f11		// q3 = q2 + e^4*q2 (double)
(p6)	fma.s1 f10 = f12, f10, f10		// y3 = y2 + e^4*y2
	;;
	// The dividend register is overwritten here; a is not read afterwards.
(p6)	fnma.d.s1 f8 = farg1, f11, farg0	// r  = a - b*q3
	;;
	// Final correction step, rounded to double in the default status field.
(p6)	fma.d fret0 = f8, f10, f11		// q  = q3 + r*y3
(p7)	mov fret0 = f10
	br.ret.sptk rp
	;;
	.endp __divdf3
#endif
84
#ifdef L__divsf3
// Compute a 32-bit IEEE float quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend (a).  farg1 holds the divisor (b).
// The quotient is returned in fret0.
//
// frcpa writes y0, an approximation of 1/b, to f10 and sets p6 when that
// approximation has to be refined.  When frcpa clears p6 the refinement is
// skipped and f10 is returned unchanged through the p7 path.
//
// The argument registers f8 and f9 are reused as temporaries for the
// quotient estimate (q) and the powers of the error e = 1 - b*y0.
// Scratch registers: f8-f10, p6, p7.

	.text
	.align 16
	.global __divsf3
	.proc __divsf3
__divsf3:
	// Start with p7 = 1 (special case) until frcpa reports otherwise.
	cmp.eq p7, p0 = r0, r0
	frcpa.s0 f10, p6 = farg0, farg1		// y0
	;;
	// Refinement needed: clear p7 so exactly one of p6/p7 is set.
(p6)	cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
(p6)	fmpy.s1 f8 = farg0, f10			// q0 = a*y0
(p6)	fnma.s1 f9 = farg1, f10, f1		// e  = 1 - b*y0
	;;
(p6)	fma.s1 f8 = f9, f8, f8			// q1 = q0 + e*q0
(p6)	fmpy.s1 f9 = f9, f9			// e^2
	;;
(p6)	fma.s1 f8 = f9, f8, f8			// q2 = q1 + e^2*q1
(p6)	fmpy.s1 f9 = f9, f9			// e^4
	;;
(p6)	fma.d.s1 f10 = f9, f8, f8		// q3 = q2 + e^4*q2 (double)
	;;
	// Round the double-precision estimate to single precision.
(p6)	fnorm.s.s0 fret0 = f10
(p7)	mov fret0 = f10
	br.ret.sptk rp
	;;
	.endp __divsf3
#endif
120
#ifdef L__divdi3
// Compute a 64-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).
// The quotient, truncated toward zero, is returned in ret0.
// A zero divisor raises break 1 (integer divide by zero) instead of
// returning an undefined value.
//
// Notation used in the comments below:
//   y = reciprocal estimate	e = 1 - b*y0 (error of y0)
//   q = quotient estimate	r = a - b*q  (remainder of q)
// When frcpa clears p6 the refinement is skipped and f10 is converted
// as delivered by frcpa.
// Scratch registers: f8-f13, p6, p7.

	.text
	.align 16
	.global __divdi3
	.proc __divdi3
__divdi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero: p7 = (b == 0).
	cmp.eq p7, p0 = r0, in1
	;;
	// Convert the inputs to FP, so that they are not treated as unsigned.
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
(p7)	break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9		// y0
	;;
	// 3 Newton-Raphson iterations.
(p6)	fnma.s1 f11 = f9, f10, f1		// e  = 1 - b*y0
(p6)	fmpy.s1 f12 = f8, f10			// q0 = a*y0
	;;
(p6)	fmpy.s1 f13 = f11, f11			// e^2
(p6)	fma.s1 f12 = f11, f12, f12		// q1 = q0 + e*q0
	;;
(p6)	fma.s1 f10 = f11, f10, f10		// y1 = y0 + e*y0
(p6)	fma.s1 f11 = f13, f12, f12		// q2 = q1 + e^2*q1
	;;
(p6)	fma.s1 f10 = f13, f10, f10		// y2 = y1 + e^2*y1
(p6)	fnma.s1 f12 = f9, f11, f8		// r  = a - b*q2
	;;
(p6)	fma.s1 f10 = f12, f10, f11		// q3 = q2 + r*y2
	;;
	// Round quotient to an integer.
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __divdi3
#endif
170
#ifdef L__moddi3
// Compute a 64-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).
// The remainder a - trunc(a/b)*b is returned in ret0.
// A zero divisor raises break 1 (integer divide by zero) instead of
// returning an undefined value.
//
// The quotient is computed exactly as in the 64-bit signed divide; the
// remainder is then formed with one integer multiply-add.  f14 keeps
// the integer image of a for that last step, and in1/f9 are reused to
// hold -b.
// Scratch registers: f8-f14, p6, p7.  in1 is clobbered.

	.text
	.align 16
	.global __moddi3
	.proc __moddi3
__moddi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f14 = in0
	setf.sig f9 = in1
	// Check divide by zero: p7 = (b == 0).
	cmp.eq p7, p0 = r0, in1
	;;
	// Convert the inputs to FP, so that they are not treated as unsigned.
	fcvt.xf f8 = f14
	fcvt.xf f9 = f9
(p7)	break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9		// y0
	;;
	// 3 Newton-Raphson iterations.
(p6)	fmpy.s1 f12 = f8, f10			// q0 = a*y0
(p6)	fnma.s1 f11 = f9, f10, f1		// e  = 1 - b*y0
	;;
(p6)	fma.s1 f12 = f11, f12, f12		// q1 = q0 + e*q0
(p6)	fmpy.s1 f13 = f11, f11			// e^2
	;;
(p6)	fma.s1 f10 = f11, f10, f10		// y1 = y0 + e*y0
(p6)	fma.s1 f11 = f13, f12, f12		// q2 = q1 + e^2*q1
	;;
	// Negate the divisor in the integer unit while the FP chain runs.
	sub in1 = r0, in1
(p6)	fma.s1 f10 = f13, f10, f10		// y2 = y1 + e^2*y1
(p6)	fnma.s1 f12 = f9, f11, f8		// r  = a - b*q2
	;;
	// The FP image of b is dead now; f9 becomes the integer -b.
	setf.sig f9 = in1
(p6)	fma.s1 f10 = f12, f10, f11		// q3 = q2 + r*y2
	;;
	// Round quotient to an integer.
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f14
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __moddi3
#endif
224
#ifdef L__udivdi3
// Compute a 64-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).
// The quotient, truncated toward zero, is returned in ret0.
// A zero divisor raises break 1 (integer divide by zero) instead of
// returning an undefined value.
//
// Notation used in the comments below:
//   y = reciprocal estimate	e = 1 - b*y0 (error of y0)
//   q = quotient estimate	r = a - b*q  (remainder of q)
// When frcpa clears p6 the refinement is skipped and f10 is converted
// as delivered by frcpa.
// Scratch registers: f8-f13, p6, p7.

	.text
	.align 16
	.global __udivdi3
	.proc __udivdi3
__udivdi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero: p7 = (b == 0).
	cmp.eq p7, p0 = r0, in1
	;;
	// Convert the inputs to FP, to avoid FP software-assist faults.
	fcvt.xuf.s1 f8 = f8
	fcvt.xuf.s1 f9 = f9
(p7)	break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9		// y0
	;;
	// 3 Newton-Raphson iterations.
(p6)	fnma.s1 f11 = f9, f10, f1		// e  = 1 - b*y0
(p6)	fmpy.s1 f12 = f8, f10			// q0 = a*y0
	;;
(p6)	fmpy.s1 f13 = f11, f11			// e^2
(p6)	fma.s1 f12 = f11, f12, f12		// q1 = q0 + e*q0
	;;
(p6)	fma.s1 f10 = f11, f10, f10		// y1 = y0 + e*y0
(p6)	fma.s1 f11 = f13, f12, f12		// q2 = q1 + e^2*q1
	;;
(p6)	fma.s1 f10 = f13, f10, f10		// y2 = y1 + e^2*y1
(p6)	fnma.s1 f12 = f9, f11, f8		// r  = a - b*q2
	;;
(p6)	fma.s1 f10 = f12, f10, f11		// q3 = q2 + r*y2
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __udivdi3
#endif
274
#ifdef L__umoddi3
// Compute a 64-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).
// The remainder a - trunc(a/b)*b is returned in ret0.
// A zero divisor raises break 1 (integer divide by zero) instead of
// returning an undefined value.
//
// The quotient is computed exactly as in the 64-bit unsigned divide; the
// remainder is then formed with one integer multiply-add.  f14 keeps
// the integer image of a for that last step, and in1/f9 are reused to
// hold -b.
// Scratch registers: f8-f14, p6, p7.  in1 is clobbered.

	.text
	.align 16
	.global __umoddi3
	.proc __umoddi3
__umoddi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f14 = in0
	setf.sig f9 = in1
	// Check divide by zero: p7 = (b == 0).
	cmp.eq p7, p0 = r0, in1
	;;
	// Convert the inputs to FP, to avoid FP software assist faults.
	fcvt.xuf.s1 f8 = f14
	fcvt.xuf.s1 f9 = f9
(p7)	break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9		// y0
	;;
	// 3 Newton-Raphson iterations.
(p6)	fmpy.s1 f12 = f8, f10			// q0 = a*y0
(p6)	fnma.s1 f11 = f9, f10, f1		// e  = 1 - b*y0
	;;
(p6)	fma.s1 f12 = f11, f12, f12		// q1 = q0 + e*q0
(p6)	fmpy.s1 f13 = f11, f11			// e^2
	;;
(p6)	fma.s1 f10 = f11, f10, f10		// y1 = y0 + e*y0
(p6)	fma.s1 f11 = f13, f12, f12		// q2 = q1 + e^2*q1
	;;
	// Negate the divisor in the integer unit while the FP chain runs.
	sub in1 = r0, in1
(p6)	fma.s1 f10 = f13, f10, f10		// y2 = y1 + e^2*y1
(p6)	fnma.s1 f12 = f9, f11, f8		// r  = a - b*q2
	;;
	// The FP image of b is dead now; f9 becomes the integer -b.
	setf.sig f9 = in1
(p6)	fma.s1 f10 = f12, f10, f11		// q3 = q2 + r*y2
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f14
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __umoddi3
#endif
329
#ifdef L__divsi3
// Compute a 32-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).  Only the low
// 32 bits of each are used; both are sign-extended first.
// The quotient, truncated toward zero, is returned in ret0.
// A zero divisor raises break 1 (integer divide by zero) instead of
// returning an undefined value.
//
// 0x0ffdd is the register-format exponent of 2^-34 (bias 0xffff); the
// constant is added to e^2 in the last refinement step.
// Scratch registers: r2, f8-f11, p6, p7.  in0 and in1 are clobbered.

	.text
	.align 16
	.global __divsi3
	.proc __divsi3
__divsi3:
	.regstk 2,0,0,0
	sxt4 in0 = in0
	sxt4 in1 = in1
	;;
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero on the extended value: p7 = (b == 0).
	cmp.eq p7, p0 = r0, in1
	;;
	mov r2 = 0x0ffdd
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
(p7)	break 1
	;;
	setf.exp f11 = r2			// 2^-34
	frcpa.s1 f10, p6 = f8, f9		// y0
	;;
(p6)	fmpy.s1 f8 = f8, f10			// q0 = a*y0
(p6)	fnma.s1 f9 = f9, f10, f1		// e  = 1 - b*y0
	;;
(p6)	fma.s1 f8 = f9, f8, f8			// q1 = q0 + e*q0
(p6)	fma.s1 f9 = f9, f9, f11			// e1 = e^2 + 2^-34
	;;
(p6)	fma.s1 f10 = f9, f8, f8			// q2 = q1 + e1*q1
	;;
	// Round quotient to an integer.
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __divsi3
#endif
372
#ifdef L__modsi3
// Compute a 32-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).  Only the low
// 32 bits of each are used; both are sign-extended first.
// The remainder a - trunc(a/b)*b is returned in ret0.
// A zero divisor raises break 1 (integer divide by zero) instead of
// returning an undefined value.
//
// 0x0ffdd is the register-format exponent of 2^-34 (bias 0xffff); the
// constant is added to e^2 in the last refinement step.  f13 keeps the
// integer image of a, and in1/f9 are reused to hold -b for the final
// integer multiply-add.
// Scratch registers: r2, f8-f13, p6, p7.  in0 and in1 are clobbered.

	.text
	.align 16
	.global __modsi3
	.proc __modsi3
__modsi3:
	.regstk 2,0,0,0
	mov r2 = 0x0ffdd
	sxt4 in0 = in0
	sxt4 in1 = in1
	;;
	// Transfer inputs to FP registers.
	setf.sig f13 = in0
	setf.sig f9 = in1
	// Check divide by zero before in1 is negated: p7 = (b == 0).
	cmp.eq p7, p0 = r0, in1
	;;
	sub in1 = r0, in1			// in1 = -b
	fcvt.xf f8 = f13
	fcvt.xf f9 = f9
(p7)	break 1
	;;
	setf.exp f11 = r2			// 2^-34
	frcpa.s1 f10, p6 = f8, f9		// y0
	;;
(p6)	fmpy.s1 f12 = f8, f10			// q0 = a*y0
(p6)	fnma.s1 f10 = f9, f10, f1		// e  = 1 - b*y0
	;;
	// The FP image of b is dead now; f9 becomes the integer -b.
	setf.sig f9 = in1
(p6)	fma.s1 f12 = f10, f12, f12		// q1 = q0 + e*q0
(p6)	fma.s1 f10 = f10, f10, f11		// e1 = e^2 + 2^-34
	;;
(p6)	fma.s1 f10 = f10, f12, f12		// q2 = q1 + e1*q1
	;;
	// Round quotient to an integer.
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f13
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __modsi3
#endif
419
#ifdef L__udivsi3
// Compute a 32-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).  Only the low
// 32 bits of each are used; both are zero-extended first, so the signed
// conversion fcvt.xf sees non-negative 64-bit values.
// The quotient, truncated toward zero, is returned in ret0.
// A zero divisor raises break 1 (integer divide by zero) instead of
// returning an undefined value.
//
// 0x0ffdd is the register-format exponent of 2^-34 (bias 0xffff); the
// constant is added to e^2 in the last refinement step.
// Scratch registers: r2, f8-f11, p6, p7.  in0 and in1 are clobbered.

	.text
	.align 16
	.global __udivsi3
	.proc __udivsi3
__udivsi3:
	.regstk 2,0,0,0
	mov r2 = 0x0ffdd
	zxt4 in0 = in0
	zxt4 in1 = in1
	;;
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero on the extended value: p7 = (b == 0).
	cmp.eq p7, p0 = r0, in1
	;;
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
(p7)	break 1
	;;
	setf.exp f11 = r2			// 2^-34
	frcpa.s1 f10, p6 = f8, f9		// y0
	;;
(p6)	fmpy.s1 f8 = f8, f10			// q0 = a*y0
(p6)	fnma.s1 f9 = f9, f10, f1		// e  = 1 - b*y0
	;;
(p6)	fma.s1 f8 = f9, f8, f8			// q1 = q0 + e*q0
(p6)	fma.s1 f9 = f9, f9, f11			// e1 = e^2 + 2^-34
	;;
(p6)	fma.s1 f10 = f9, f8, f8			// q2 = q1 + e1*q1
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __udivsi3
#endif
462
#ifdef L__umodsi3
// Compute a 32-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).  Only the low
// 32 bits of each are used; both are zero-extended first, so the signed
// conversion fcvt.xf sees non-negative 64-bit values.
// The remainder a - trunc(a/b)*b is returned in ret0.
// A zero divisor raises break 1 (integer divide by zero) instead of
// returning an undefined value.
//
// 0x0ffdd is the register-format exponent of 2^-34 (bias 0xffff); the
// constant is added to e^2 in the last refinement step.  f13 keeps the
// integer image of a, and in1/f9 are reused to hold -b for the final
// integer multiply-add.
// Scratch registers: r2, f8-f13, p6, p7.  in0 and in1 are clobbered.

	.text
	.align 16
	.global __umodsi3
	.proc __umodsi3
__umodsi3:
	.regstk 2,0,0,0
	mov r2 = 0x0ffdd
	zxt4 in0 = in0
	zxt4 in1 = in1
	;;
	// Transfer inputs to FP registers.
	setf.sig f13 = in0
	setf.sig f9 = in1
	// Check divide by zero before in1 is negated: p7 = (b == 0).
	cmp.eq p7, p0 = r0, in1
	;;
	sub in1 = r0, in1			// in1 = -b
	fcvt.xf f8 = f13
	fcvt.xf f9 = f9
(p7)	break 1
	;;
	setf.exp f11 = r2			// 2^-34
	frcpa.s1 f10, p6 = f8, f9		// y0
	;;
(p6)	fmpy.s1 f12 = f8, f10			// q0 = a*y0
(p6)	fnma.s1 f10 = f9, f10, f1		// e  = 1 - b*y0
	;;
	// The FP image of b is dead now; f9 becomes the integer -b.
	setf.sig f9 = in1
(p6)	fma.s1 f12 = f10, f12, f12		// q1 = q0 + e*q0
(p6)	fma.s1 f10 = f10, f10, f11		// e1 = e^2 + 2^-34
	;;
(p6)	fma.s1 f10 = f10, f12, f12		// q2 = q1 + e1*q1
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f13
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __umodsi3
#endif
509
#ifdef L__save_stack_nonlocal
// Notes on save/restore stack nonlocal: We read ar.bsp but write
// ar.bspstore.  This is because ar.bsp can be read at all times
// (independent of the RSE mode) but since it is read-only we need to
// restore the value via ar.bspstore.  This is OK because
// ar.bsp==ar.bspstore after executing "flushrs".

// void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)
//
// Save-area layout written here (byte offsets from save_area):
//	 +0	stack_pointer (in1)
//	 +8	ar.bsp
//	+16	ar.rnat
//	+24	ar.pfs as returned by alloc
//
// ar.rsc is masked with 0x1c, which clears its two low (mode) bits, for
// the duration of the ar.bsp/ar.rnat reads; on exit it is rewritten with
// those two bits set.
// Scratch registers: r2, r16-r19.  in0 is clobbered.

	.text
	.align 16
	.global __ia64_save_stack_nonlocal
	.proc __ia64_save_stack_nonlocal
__ia64_save_stack_nonlocal:
	{ .mmf
	  alloc r18 = ar.pfs, 2, 0, 0, 0	// r18 = ar.pfs
	  mov r19 = ar.rsc
	  ;;
	}
	{ .mmi
	  flushrs
	  st8 [in0] = in1, 24			// [+0] = sp; in0 -> +24
	  and r19 = 0x1c, r19			// clear the RSE mode bits
	  ;;
	}
	{ .mmi
	  st8 [in0] = r18, -16			// [+24] = pfs; in0 -> +8
	  mov ar.rsc = r19
	  or r19 = 0x3, r19			// value written back on exit
	  ;;
	}
	{ .mmi
	  mov r16 = ar.bsp
	  mov r17 = ar.rnat
	  adds r2 = 8, in0			// r2 -> +16
	  ;;
	}
	{ .mmi
	  st8 [in0] = r16			// [+8]  = bsp
	  st8 [r2] = r17			// [+16] = rnat
	}
	{ .mib
	  mov ar.rsc = r19
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_save_stack_nonlocal
#endif
558
#ifdef L__nonlocal_goto
// void __ia64_nonlocal_goto(void *target_label, void *save_area,
//			     void *static_chain);
//
// Reload the state stored in save_area and return to target_label.
// The save area is read at byte offsets +0 (into r12), +8 (into
// ar.bspstore), +16 (into ar.rnat) and +24 (into ar.pfs).
// static_chain is copied to r15 before the jump.
//
// ar.rsc is masked with 0x1c, which clears its two low (mode) bits,
// while ar.bspstore and ar.rnat are written; it is rewritten with those
// two bits set just before the return.
// Scratch registers: r16-r20.  in1 is clobbered.

	.text
	.align 16
	.global __ia64_nonlocal_goto
	.proc __ia64_nonlocal_goto
__ia64_nonlocal_goto:
	{ .mmi
	  alloc r20 = ar.pfs, 3, 0, 0, 0
	  ld8 r12 = [in1], 8			// r12 = [+0]
	  // The final br.ret at .L0 then transfers to target_label.
	  mov.ret.sptk rp = in0, .L0
	  ;;
	}
	{ .mmf
	  ld8 r16 = [in1], 8			// r16 = [+8], for ar.bspstore
	  mov r19 = ar.rsc
	  ;;
	}
	{ .mmi
	  flushrs
	  ld8 r17 = [in1], 8			// r17 = [+16], for ar.rnat
	  and r19 = 0x1c, r19			// clear the RSE mode bits
	  ;;
	}
	{ .mmi
	  ld8 r18 = [in1]			// r18 = [+24], for ar.pfs
	  mov ar.rsc = r19
	  or r19 = 0x3, r19			// value written back on exit
	  ;;
	}
	{ .mmi
	  mov ar.bspstore = r16
	  ;;
	  mov ar.rnat = r17
	  ;;
	}
	{ .mmi
	  loadrs
	  invala
	  mov r15 = in2				// static chain
	  ;;
	}
.L0:	{ .mib
	  mov ar.rsc = r19
	  mov ar.pfs = r18
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_nonlocal_goto
#endif
611
#ifdef L__restore_stack_nonlocal
// This is mostly the same as nonlocal_goto above.
// ??? This has not been tested yet.

// void __ia64_restore_stack_nonlocal(void *save_area)
//
// Reload the state stored in save_area and return to the caller.
// The save area is read at byte offsets +0 (into r12), +8 (into
// ar.bspstore), +16 (into ar.rnat) and +24 (into ar.pfs).
//
// ar.rsc is masked with 0x1c, which clears its two low (mode) bits,
// while ar.bspstore and ar.rnat are written; it is rewritten with those
// two bits set just before the return.
// Scratch registers: r16-r20.  in0 is clobbered.
//
// NOTE(review): alloc declares four input registers although only in0
// is read; kept as is, confirm whether this is intentional.

	.text
	.align 16
	.global __ia64_restore_stack_nonlocal
	.proc __ia64_restore_stack_nonlocal
__ia64_restore_stack_nonlocal:
	{ .mmf
	  alloc r20 = ar.pfs, 4, 0, 0, 0
	  ld8 r12 = [in0], 8			// r12 = [+0]
	  ;;
	}
	{ .mmb
	  ld8 r16=[in0], 8			// r16 = [+8], for ar.bspstore
	  mov r19 = ar.rsc
	  ;;
	}
	{ .mmi
	  flushrs
	  ld8 r17 = [in0], 8			// r17 = [+16], for ar.rnat
	  and r19 = 0x1c, r19			// clear the RSE mode bits
	  ;;
	}
	{ .mmf
	  ld8 r18 = [in0]			// r18 = [+24], for ar.pfs
	  mov ar.rsc = r19
	  ;;
	}
	{ .mmi
	  mov ar.bspstore = r16
	  ;;
	  mov ar.rnat = r17
	  or r19 = 0x3, r19			// value written back on exit
	  ;;
	}
	{ .mmf
	  loadrs
	  invala
	  ;;
	}
	{ .mib
	  mov ar.rsc = r19
	  mov ar.pfs = r18
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_restore_stack_nonlocal
#endif
664
#ifdef L__trampoline
// Implement the nested function trampoline.  This is out of line
// so that we don't have to bother with flushing the icache, as
// well as making the on-stack trampoline smaller.
//
// The trampoline has the following form:
//
//		+-------------------+ >
//	TRAMP:	| __ia64_trampoline | |
//		+-------------------+  > fake function descriptor
//		| TRAMP+16          | |
//		+-------------------+ >
//		| target descriptor |
//		+-------------------+
//		| static link	    |
//		+-------------------+
//
// On entry r1 holds the second word of the fake descriptor, that is
// TRAMP+16.  The code loads the target descriptor address and the
// static link from there, then loads the entry address and the r1 value
// from the target descriptor and branches to the entry address.
// Registers written: r1, r2, r3, r15, b6.

	.text
	.align 16
	.global __ia64_trampoline
	.proc __ia64_trampoline
__ia64_trampoline:
	{ .mmi
	  ld8 r2 = [r1], 8			// r2 = address of target descriptor
	  ;;
	  ld8 r15 = [r1]			// r15 = static link
	}
	{ .mmi
	  ld8 r3 = [r2], 8			// r3 = first word of target descriptor
	  ;;
	  ld8 r1 = [r2]				// r1 = second word of target descriptor
	  mov b6 = r3
	}
	{ .bbb
	  br.sptk.many b6
	  ;;
	}
	.endp __ia64_trampoline
#endif
704