#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for PowerPC FPU.
#
# June 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# and improvement coefficients relative to gcc-generated code.
#
# Freescale e300	9.78/+30%
# PPC74x0		6.92/+50%
# PPC970		6.03/+80%
# POWER7		3.50/+30%
# POWER8		3.75/+10%

# Command-line handling.
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
# Both are left undefined when the corresponding argument is absent.  The
# bare pop/shift below run at file scope and therefore operate on @ARGV.
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Select the ABI-dependent building blocks from the flavour string:
#   $SIZE_T  - size of a general-purpose register/pointer in bytes
#   $LRSAVE  - offset of the link-register save slot above the frame
#              (used below as FRAME+LRSAVE)
#   $UCMP    - unsigned compare of the native width
#   $STU     - store-with-update used to allocate a stack frame
#   $POP     - native-width load, $PUSH - native-width store
# A flavour mentioning neither 64 nor 32 is rejected.
if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$UCMP	="cmpld";
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$UCMP	="cmplw";
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
} else { die "nonsense $flavour"; }

# $LITTLE_ENDIAN is 4 for flavours ending in "le" and 0 otherwise.  It is
# not used as a plain boolean only: the templates XOR it into the byte
# offset (0 or 4) of a 32-bit word inside an 8-byte slot, which selects the
# high or low half of a double independently of byte order.
$LITTLE_ENDIAN = ($flavour=~/le$/) ? 4 : 0;

# Indexed load of a little-endian 32-bit word: a plain lwzx on little-endian
# targets, the byte-reversing lwbrx on big-endian ones.
$LWXLE = $LITTLE_ENDIAN ? "lwzx" : "lwbrx";

# Locate the ppc-xlate.pl translator: first next to this script, then in
# ../../perlasm relative to it.  $dir keeps its trailing path separator and
# is empty when the script was invoked without a directory component.
# $1 is consulted only when the match succeeded, so a capture left behind by
# an earlier, unrelated match can never leak into the path.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# From here on everything written to STDOUT is piped through the translator,
# which is started with the flavour and the quoted output file name.
open STDOUT,"| $^X $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";

# Stack frame layout used by the templates below.
#   $LOCALS - six native-width words at the bottom of the frame; this is also
#             the complete frame of poly1305_init_fpu
#   $FRAME  - $LOCALS, then six 8-byte scratch slots (poly1305_blocks_fpu
#             uses slots 0..4), then 18 8-byte slots in which f14-f31 are
#             saved
$LOCALS=6*$SIZE_T;
$FRAME=$LOCALS+6*8+18*8;

my $sp="r1";

# General-purpose registers: the four arguments live in r3..r6.
# $in0..$in3 are r7..r10, $i1/$i2 are r11/r12 and $i3 is r6, i.e. it shares
# a register with $padbit (poly1305_blocks_fpu consumes $padbit before it
# loads $i3).
my ($ctx,$inp,$len,$padbit) = map("r$_",(3..6));
my ($in0,$in1,$in2,$in3,$i1,$i2,$i3) = map("r$_",(7..12,6));

# Floating-point registers f0..f31, assigned in the order listed.
my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,
    $two0,$two32,$two64,$two96,$two130,$five_two130,
    $r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi,
    $s2lo,$s2hi,$s3lo,$s3hi,
    $c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("f$_",(0..31));
# borrowings: these names alias the carry registers declared above, which is
# why the main loop reloads r3/s1 from the context on every iteration.
my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi);
my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi);
my ($y0,$y1,$y2,$y3) = ($c3lo,$c3hi,$c1lo,$c1hi);

# poly1305_init_fpu
#
# Arguments: $ctx (r3) points at the context, $inp (r4) at the 16-byte key
# or is zero.  $len (r5) is borrowed as the pointer to the constant table
# returned by LPICmeup, $padbit (r6) as the holder of the saved link
# register.
#
# The context is treated as an array of 8-byte slots:
#    0..3   hash value, stored "biased" (initially the bare 2^52-scaled
#           constants, i.e. a biased zero)
#    4..11  key limbs r0..r3 as lo/hi pairs
#   12..17  s1..s3 (limb times 5/2^130) as lo/hi pairs
#   18..23  copies of the first six table constants
# Slots 12..15 double as temporary storage while the split is computed.
#
# When a key is supplied, its four little-endian words are masked with
# 0x0fffffff/0x0ffffffc, written into the low halves of the biased
# constants ("template"), and de-biased by subtraction to obtain the limbs
# as doubles.  Each limb and each s value is then split into a hi and a lo
# part by adding and subtracting a large power of two.  FPSCR is switched to
# the table's "truncate, no exceptions" value for the duration and restored
# afterwards.  The function returns zero in r3.
$code.=<<___;
.machine	"any"
.text

.globl	.poly1305_init_fpu
.align	6
.poly1305_init_fpu:
	$STU	$sp,-$LOCALS($sp)		# minimal frame
	mflr	$padbit
	$PUSH	$padbit,`$LOCALS+$LRSAVE`($sp)

	bl	LPICmeup

	xor	r0,r0,r0
	mtlr	$padbit				# restore lr

	lfd	$two0,8*0($len)			# load constants
	lfd	$two32,8*1($len)
	lfd	$two64,8*2($len)
	lfd	$two96,8*3($len)
	lfd	$two130,8*4($len)
	lfd	$five_two130,8*5($len)

	stfd	$two0,8*0($ctx)			# initial hash value, biased 0
	stfd	$two32,8*1($ctx)
	stfd	$two64,8*2($ctx)
	stfd	$two96,8*3($ctx)

	$UCMP	$inp,r0
	beq-	Lno_key

	lfd	$h3lo,8*13($len)		# new fpscr
	mffs	$h3hi				# old fpscr

	stfd	$two0,8*4($ctx)			# key "template"
	stfd	$two32,8*5($ctx)
	stfd	$two64,8*6($ctx)
	stfd	$two96,8*7($ctx)

	li	$in1,4
	li	$in2,8
	li	$in3,12
	$LWXLE	$in0,0,$inp			# load key
	$LWXLE	$in1,$in1,$inp
	$LWXLE	$in2,$in2,$inp
	$LWXLE	$in3,$in3,$inp

	lis	$i1,0xf000			#   0xf0000000
	ori	$i2,$i1,3			#   0xf0000003
	andc	$in0,$in0,$i1			# &=0x0fffffff
	andc	$in1,$in1,$i2			# &=0x0ffffffc
	andc	$in2,$in2,$i2
	andc	$in3,$in3,$i2

	stw	$in0,`8*4+(4^$LITTLE_ENDIAN)`($ctx)	# fill "template"
	stw	$in1,`8*5+(4^$LITTLE_ENDIAN)`($ctx)
	stw	$in2,`8*6+(4^$LITTLE_ENDIAN)`($ctx)
	stw	$in3,`8*7+(4^$LITTLE_ENDIAN)`($ctx)

	mtfsf	255,$h3lo			# fpscr
	stfd	$two0,8*18($ctx)		# copy constants to context
	stfd	$two32,8*19($ctx)
	stfd	$two64,8*20($ctx)
	stfd	$two96,8*21($ctx)
	stfd	$two130,8*22($ctx)
	stfd	$five_two130,8*23($ctx)

	lfd	$h0lo,8*4($ctx)			# load [biased] key
	lfd	$h1lo,8*5($ctx)
	lfd	$h2lo,8*6($ctx)
	lfd	$h3lo,8*7($ctx)

	fsub	$h0lo,$h0lo,$two0		# r0
	fsub	$h1lo,$h1lo,$two32		# r1
	fsub	$h2lo,$h2lo,$two64		# r2
	fsub	$h3lo,$h3lo,$two96		# r3

	lfd	$two0,8*6($len)			# more constants
	lfd	$two32,8*7($len)
	lfd	$two64,8*8($len)
	lfd	$two96,8*9($len)

	fmul	$h1hi,$h1lo,$five_two130	# s1
	fmul	$h2hi,$h2lo,$five_two130	# s2
	 stfd	$h3hi,8*15($ctx)		# borrow slot for original fpscr
	fmul	$h3hi,$h3lo,$five_two130	# s3

	fadd	$h0hi,$h0lo,$two0
	 stfd	$h1hi,8*12($ctx)		# put aside for now
	fadd	$h1hi,$h1lo,$two32
	 stfd	$h2hi,8*13($ctx)
	fadd	$h2hi,$h2lo,$two64
	 stfd	$h3hi,8*14($ctx)
	fadd	$h3hi,$h3lo,$two96

	fsub	$h0hi,$h0hi,$two0
	fsub	$h1hi,$h1hi,$two32
	fsub	$h2hi,$h2hi,$two64
	fsub	$h3hi,$h3hi,$two96

	lfd	$two0,8*10($len)		# more constants
	lfd	$two32,8*11($len)
	lfd	$two64,8*12($len)

	fsub	$h0lo,$h0lo,$h0hi
	fsub	$h1lo,$h1lo,$h1hi
	fsub	$h2lo,$h2lo,$h2hi
	fsub	$h3lo,$h3lo,$h3hi

	stfd	$h0hi,8*5($ctx)			# r0hi
	stfd	$h1hi,8*7($ctx)			# r1hi
	stfd	$h2hi,8*9($ctx)			# r2hi
	stfd	$h3hi,8*11($ctx)		# r3hi

	stfd	$h0lo,8*4($ctx)			# r0lo
	stfd	$h1lo,8*6($ctx)			# r1lo
	stfd	$h2lo,8*8($ctx)			# r2lo
	stfd	$h3lo,8*10($ctx)		# r3lo

	lfd	$h1lo,8*12($ctx)		# s1
	lfd	$h2lo,8*13($ctx)		# s2
	lfd	$h3lo,8*14($ctx)		# s3
	lfd	$h0lo,8*15($ctx)		# pull original fpscr

	fadd	$h1hi,$h1lo,$two0
	fadd	$h2hi,$h2lo,$two32
	fadd	$h3hi,$h3lo,$two64

	fsub	$h1hi,$h1hi,$two0
	fsub	$h2hi,$h2hi,$two32
	fsub	$h3hi,$h3hi,$two64

	fsub	$h1lo,$h1lo,$h1hi
	fsub	$h2lo,$h2lo,$h2hi
	fsub	$h3lo,$h3lo,$h3hi

	stfd	$h1hi,8*13($ctx)		# s1hi
	stfd	$h2hi,8*15($ctx)		# s2hi
	stfd	$h3hi,8*17($ctx)		# s3hi

	stfd	$h1lo,8*12($ctx)		# s1lo
	stfd	$h2lo,8*14($ctx)		# s2lo
	stfd	$h3lo,8*16($ctx)		# s3lo

	mtfsf	255,$h0lo			# restore fpscr
Lno_key:
	xor	r3,r3,r3
	addi	$sp,$sp,$LOCALS
	blr
	.long	0
	.byte	0,12,4,1,0x80,0,2,0
.size	.poly1305_init_fpu,.-.poly1305_init_fpu

# poly1305_blocks_fpu
#
# Arguments: r3 = context, r4 = input, r5 = length in bytes, r6 = pad bit.
# The length is divided by 16 to get the block count; the routine returns
# at once when that count is zero.
#
# f14-f31 are saved in the frame and restored before returning.  FPSCR is
# replaced by the value 1 built in a stack slot, and the original FPSCR is
# put back at the end.
#
# Each 32-bit input word is stored into the low half of a biased constant
# on the stack (the "template"), reloaded as a double and de-biased by
# subtraction.  The pad bit is merged into the high word of the fourth
# template slot, which gives it the weight 2^128.
#
# The loop is modulo-scheduled: the words of the following block are
# loaded while the current one is being multiplied.  When the negated block
# counter reaches zero the input pointer is moved back by 16 bytes
# ("conditional rewind"), apparently so that the look-ahead load never
# reads beyond the last block.
#
# On exit the hash value is re-biased and written back to context slots
# 0..3.
.globl	.poly1305_blocks_fpu
.align	4
.poly1305_blocks_fpu:
	srwi.	$len,$len,4
	beq-	Labort

	$STU	$sp,-$FRAME($sp)
	mflr	r0
	stfd	f14,`$FRAME-8*18`($sp)
	stfd	f15,`$FRAME-8*17`($sp)
	stfd	f16,`$FRAME-8*16`($sp)
	stfd	f17,`$FRAME-8*15`($sp)
	stfd	f18,`$FRAME-8*14`($sp)
	stfd	f19,`$FRAME-8*13`($sp)
	stfd	f20,`$FRAME-8*12`($sp)
	stfd	f21,`$FRAME-8*11`($sp)
	stfd	f22,`$FRAME-8*10`($sp)
	stfd	f23,`$FRAME-8*9`($sp)
	stfd	f24,`$FRAME-8*8`($sp)
	stfd	f25,`$FRAME-8*7`($sp)
	stfd	f26,`$FRAME-8*6`($sp)
	stfd	f27,`$FRAME-8*5`($sp)
	stfd	f28,`$FRAME-8*4`($sp)
	stfd	f29,`$FRAME-8*3`($sp)
	stfd	f30,`$FRAME-8*2`($sp)
	stfd	f31,`$FRAME-8*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	xor	r0,r0,r0
	li	$in3,1
	mtctr	$len
	neg	$len,$len
	stw	r0,`$LOCALS+8*4+(0^$LITTLE_ENDIAN)`($sp)
	stw	$in3,`$LOCALS+8*4+(4^$LITTLE_ENDIAN)`($sp)

	lfd	$two0,8*18($ctx)		# load constants
	lfd	$two32,8*19($ctx)
	lfd	$two64,8*20($ctx)
	lfd	$two96,8*21($ctx)
	lfd	$two130,8*22($ctx)
	lfd	$five_two130,8*23($ctx)

	lfd	$h0lo,8*0($ctx)			# load [biased] hash value
	lfd	$h1lo,8*1($ctx)
	lfd	$h2lo,8*2($ctx)
	lfd	$h3lo,8*3($ctx)

	stfd	$two0,`$LOCALS+8*0`($sp)	# input "template"
	oris	$in3,$padbit,`(1023+52+96)<<4`
	stfd	$two32,`$LOCALS+8*1`($sp)
	stfd	$two64,`$LOCALS+8*2`($sp)
	stw	$in3,`$LOCALS+8*3+(0^$LITTLE_ENDIAN)`($sp)

	li	$i1,4
	li	$i2,8
	li	$i3,12
	$LWXLE	$in0,0,$inp			# load input
	$LWXLE	$in1,$i1,$inp
	$LWXLE	$in2,$i2,$inp
	$LWXLE	$in3,$i3,$inp
	addi	$inp,$inp,16

	stw	$in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp)	# fill "template"
	stw	$in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
	stw	$in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
	stw	$in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)

	mffs	$x0				# original fpscr
	lfd	$x1,`$LOCALS+8*4`($sp)		# new fpscr
	lfd	$r0lo,8*4($ctx)			# load key
	lfd	$r0hi,8*5($ctx)
	lfd	$r1lo,8*6($ctx)
	lfd	$r1hi,8*7($ctx)
	lfd	$r2lo,8*8($ctx)
	lfd	$r2hi,8*9($ctx)
	lfd	$r3lo,8*10($ctx)
	lfd	$r3hi,8*11($ctx)
	lfd	$s1lo,8*12($ctx)
	lfd	$s1hi,8*13($ctx)
	lfd	$s2lo,8*14($ctx)
	lfd	$s2hi,8*15($ctx)
	lfd	$s3lo,8*16($ctx)
	lfd	$s3hi,8*17($ctx)

	stfd	$x0,`$LOCALS+8*4`($sp)		# save original fpscr
	mtfsf	255,$x1

	addic	$len,$len,1
	addze	r0,r0
	slwi.	r0,r0,4
	sub	$inp,$inp,r0			# conditional rewind

	lfd	$x0,`$LOCALS+8*0`($sp)
	lfd	$x1,`$LOCALS+8*1`($sp)
	lfd	$x2,`$LOCALS+8*2`($sp)
	lfd	$x3,`$LOCALS+8*3`($sp)

	fsub	$h0lo,$h0lo,$two0		# de-bias hash value
	 $LWXLE	$in0,0,$inp			# modulo-scheduled input load
	fsub	$h1lo,$h1lo,$two32
	 $LWXLE	$in1,$i1,$inp
	fsub	$h2lo,$h2lo,$two64
	 $LWXLE	$in2,$i2,$inp
	fsub	$h3lo,$h3lo,$two96
	 $LWXLE	$in3,$i3,$inp

	fsub	$x0,$x0,$two0			# de-bias input
	 addi	$inp,$inp,16
	fsub	$x1,$x1,$two32
	fsub	$x2,$x2,$two64
	fsub	$x3,$x3,$two96

	fadd	$x0,$x0,$h0lo			# accumulate input
	 stw	$in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp)
	fadd	$x1,$x1,$h1lo
	 stw	$in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
	fadd	$x2,$x2,$h2lo
	 stw	$in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
	fadd	$x3,$x3,$h3lo
	 stw	$in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)

	b	Lentry

.align	4
Loop:
	fsub	$y0,$y0,$two0			# de-bias input
	 addic	$len,$len,1
	fsub	$y1,$y1,$two32
	 addze	r0,r0
	fsub	$y2,$y2,$two64
	 slwi.	r0,r0,4
	fsub	$y3,$y3,$two96
	 sub	$inp,$inp,r0			# conditional rewind

	fadd	$h0lo,$h0lo,$y0			# accumulate input
	fadd	$h0hi,$h0hi,$y1
	fadd	$h2lo,$h2lo,$y2
	fadd	$h2hi,$h2hi,$y3

	######################################### base 2^48 -> base 2^32
	fadd	$c1lo,$h1lo,$two64
	 $LWXLE	$in0,0,$inp			# modulo-scheduled input load
	fadd	$c1hi,$h1hi,$two64
	 $LWXLE	$in1,$i1,$inp
	fadd	$c3lo,$h3lo,$two130
	 $LWXLE	$in2,$i2,$inp
	fadd	$c3hi,$h3hi,$two130
	 $LWXLE	$in3,$i3,$inp
	fadd	$c0lo,$h0lo,$two32
	 addi	$inp,$inp,16
	fadd	$c0hi,$h0hi,$two32
	fadd	$c2lo,$h2lo,$two96
	fadd	$c2hi,$h2hi,$two96

	fsub	$c1lo,$c1lo,$two64
	 stw	$in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp)	# fill "template"
	fsub	$c1hi,$c1hi,$two64
	 stw	$in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
	fsub	$c3lo,$c3lo,$two130
	 stw	$in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
	fsub	$c3hi,$c3hi,$two130
	 stw	$in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
	fsub	$c0lo,$c0lo,$two32
	fsub	$c0hi,$c0hi,$two32
	fsub	$c2lo,$c2lo,$two96
	fsub	$c2hi,$c2hi,$two96

	fsub	$h1lo,$h1lo,$c1lo
	fsub	$h1hi,$h1hi,$c1hi
	fsub	$h3lo,$h3lo,$c3lo
	fsub	$h3hi,$h3hi,$c3hi
	fsub	$h2lo,$h2lo,$c2lo
	fsub	$h2hi,$h2hi,$c2hi
	fsub	$h0lo,$h0lo,$c0lo
	fsub	$h0hi,$h0hi,$c0hi

	fadd	$h1lo,$h1lo,$c0lo
	fadd	$h1hi,$h1hi,$c0hi
	fadd	$h3lo,$h3lo,$c2lo
	fadd	$h3hi,$h3hi,$c2hi
	fadd	$h2lo,$h2lo,$c1lo
	fadd	$h2hi,$h2hi,$c1hi
	fmadd	$h0lo,$c3lo,$five_two130,$h0lo
	fmadd	$h0hi,$c3hi,$five_two130,$h0hi

	fadd	$x1,$h1lo,$h1hi
	 lfd	$s1lo,8*12($ctx)		# reload constants
	fadd	$x3,$h3lo,$h3hi
	 lfd	$s1hi,8*13($ctx)
	fadd	$x2,$h2lo,$h2hi
	 lfd	$r3lo,8*10($ctx)
	fadd	$x0,$h0lo,$h0hi
	 lfd	$r3hi,8*11($ctx)
Lentry:
	fmul	$h0lo,$s3lo,$x1
	fmul	$h0hi,$s3hi,$x1
	fmul	$h2lo,$r1lo,$x1
	fmul	$h2hi,$r1hi,$x1
	fmul	$h1lo,$r0lo,$x1
	fmul	$h1hi,$r0hi,$x1
	fmul	$h3lo,$r2lo,$x1
	fmul	$h3hi,$r2hi,$x1

	fmadd	$h0lo,$s1lo,$x3,$h0lo
	fmadd	$h0hi,$s1hi,$x3,$h0hi
	fmadd	$h2lo,$s3lo,$x3,$h2lo
	fmadd	$h2hi,$s3hi,$x3,$h2hi
	fmadd	$h1lo,$s2lo,$x3,$h1lo
	fmadd	$h1hi,$s2hi,$x3,$h1hi
	fmadd	$h3lo,$r0lo,$x3,$h3lo
	fmadd	$h3hi,$r0hi,$x3,$h3hi

	fmadd	$h0lo,$s2lo,$x2,$h0lo
	fmadd	$h0hi,$s2hi,$x2,$h0hi
	fmadd	$h2lo,$r0lo,$x2,$h2lo
	fmadd	$h2hi,$r0hi,$x2,$h2hi
	fmadd	$h1lo,$s3lo,$x2,$h1lo
	fmadd	$h1hi,$s3hi,$x2,$h1hi
	fmadd	$h3lo,$r1lo,$x2,$h3lo
	fmadd	$h3hi,$r1hi,$x2,$h3hi

	fmadd	$h0lo,$r0lo,$x0,$h0lo
	 lfd	$y0,`$LOCALS+8*0`($sp)		# load [biased] input
	fmadd	$h0hi,$r0hi,$x0,$h0hi
	 lfd	$y1,`$LOCALS+8*1`($sp)
	fmadd	$h2lo,$r2lo,$x0,$h2lo
	 lfd	$y2,`$LOCALS+8*2`($sp)
	fmadd	$h2hi,$r2hi,$x0,$h2hi
	 lfd	$y3,`$LOCALS+8*3`($sp)
	fmadd	$h1lo,$r1lo,$x0,$h1lo
	fmadd	$h1hi,$r1hi,$x0,$h1hi
	fmadd	$h3lo,$r3lo,$x0,$h3lo
	fmadd	$h3hi,$r3hi,$x0,$h3hi

	bdnz	Loop

	######################################### base 2^48 -> base 2^32
	fadd	$c0lo,$h0lo,$two32
	fadd	$c0hi,$h0hi,$two32
	fadd	$c2lo,$h2lo,$two96
	fadd	$c2hi,$h2hi,$two96
	fadd	$c1lo,$h1lo,$two64
	fadd	$c1hi,$h1hi,$two64
	fadd	$c3lo,$h3lo,$two130
	fadd	$c3hi,$h3hi,$two130

	fsub	$c0lo,$c0lo,$two32
	fsub	$c0hi,$c0hi,$two32
	fsub	$c2lo,$c2lo,$two96
	fsub	$c2hi,$c2hi,$two96
	fsub	$c1lo,$c1lo,$two64
	fsub	$c1hi,$c1hi,$two64
	fsub	$c3lo,$c3lo,$two130
	fsub	$c3hi,$c3hi,$two130

	fsub	$h1lo,$h1lo,$c1lo
	fsub	$h1hi,$h1hi,$c1hi
	fsub	$h3lo,$h3lo,$c3lo
	fsub	$h3hi,$h3hi,$c3hi
	fsub	$h2lo,$h2lo,$c2lo
	fsub	$h2hi,$h2hi,$c2hi
	fsub	$h0lo,$h0lo,$c0lo
	fsub	$h0hi,$h0hi,$c0hi

	fadd	$h1lo,$h1lo,$c0lo
	fadd	$h1hi,$h1hi,$c0hi
	fadd	$h3lo,$h3lo,$c2lo
	fadd	$h3hi,$h3hi,$c2hi
	fadd	$h2lo,$h2lo,$c1lo
	fadd	$h2hi,$h2hi,$c1hi
	fmadd	$h0lo,$c3lo,$five_two130,$h0lo
	fmadd	$h0hi,$c3hi,$five_two130,$h0hi

	fadd	$x1,$h1lo,$h1hi
	fadd	$x3,$h3lo,$h3hi
	fadd	$x2,$h2lo,$h2hi
	fadd	$x0,$h0lo,$h0hi

	lfd	$h0lo,`$LOCALS+8*4`($sp)	# pull saved fpscr
	fadd	$x1,$x1,$two32			# bias
	fadd	$x3,$x3,$two96
	fadd	$x2,$x2,$two64
	fadd	$x0,$x0,$two0

	stfd	$x1,8*1($ctx)			# store [biased] hash value
	stfd	$x3,8*3($ctx)
	stfd	$x2,8*2($ctx)
	stfd	$x0,8*0($ctx)

	mtfsf	255,$h0lo			# restore original fpscr
	lfd	f14,`$FRAME-8*18`($sp)
	lfd	f15,`$FRAME-8*17`($sp)
	lfd	f16,`$FRAME-8*16`($sp)
	lfd	f17,`$FRAME-8*15`($sp)
	lfd	f18,`$FRAME-8*14`($sp)
	lfd	f19,`$FRAME-8*13`($sp)
	lfd	f20,`$FRAME-8*12`($sp)
	lfd	f21,`$FRAME-8*11`($sp)
	lfd	f22,`$FRAME-8*10`($sp)
	lfd	f23,`$FRAME-8*9`($sp)
	lfd	f24,`$FRAME-8*8`($sp)
	lfd	f25,`$FRAME-8*7`($sp)
	lfd	f26,`$FRAME-8*6`($sp)
	lfd	f27,`$FRAME-8*5`($sp)
	lfd	f28,`$FRAME-8*4`($sp)
	lfd	f29,`$FRAME-8*3`($sp)
	lfd	f30,`$FRAME-8*2`($sp)
	lfd	f31,`$FRAME-8*1`($sp)
	addi	$sp,$sp,$FRAME
Labort:
	blr
	.long	0
	.byte	0,12,4,1,0x80,0,4,0
.size	.poly1305_blocks_fpu,.-.poly1305_blocks_fpu
___
# poly1305_emit_fpu
#
# Arguments: $ctx (r3) context, $mac (r4) 16-byte output buffer, $nonce (r5)
# four 32-bit nonce words.  $mac/$nonce reuse the registers of $inp/$len and
# r6 ($padbit) serves as a scratch register.
#
# The four biased doubles of the hash value are read back as pairs of 32-bit
# words.  The low words are taken as they are; the high words have their
# sign/exponent bits masked off and supply the part that overlaps the next
# limb.  Bits of the top word above bit 1 are folded back into the lowest
# limb as (top & ~3) + (top >> 2), leaving the two lowest bits as h4.
# After carry propagation the value plus 5 is computed as well, and bit 2 of
# its top limb decides through an all-ones/all-zeroes mask which of the two
# is kept.  Finally the nonce is added and the 16-byte result is stored in
# little-endian byte order.
#
# The 32-bit and 64-bit flavours use separate carry chains; the choice is
# made at generation time from $SIZE_T.
{
my ($mac,$nonce)=($inp,$len);

# $h0..$h4 are r7..r11, $d0..$d3 are r28..r31 (saved and restored here).
my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3
   ) = map("r$_",(7..11,28..31));
my $mask = "r0";
# Local frame size, shadowing the file-level $FRAME inside this block only.
my $FRAME = (6+4)*$SIZE_T;

$code.=<<___;
.globl	.poly1305_emit_fpu
.align	4
.poly1305_emit_fpu:
	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	lwz	$d0,`8*0+(0^$LITTLE_ENDIAN)`($ctx)	# load hash
	lwz	$h0,`8*0+(4^$LITTLE_ENDIAN)`($ctx)
	lwz	$d1,`8*1+(0^$LITTLE_ENDIAN)`($ctx)
	lwz	$h1,`8*1+(4^$LITTLE_ENDIAN)`($ctx)
	lwz	$d2,`8*2+(0^$LITTLE_ENDIAN)`($ctx)
	lwz	$h2,`8*2+(4^$LITTLE_ENDIAN)`($ctx)
	lwz	$d3,`8*3+(0^$LITTLE_ENDIAN)`($ctx)
	lwz	$h3,`8*3+(4^$LITTLE_ENDIAN)`($ctx)

	lis	$mask,0xfff0
	andc	$d0,$d0,$mask			# mask exponent
	andc	$d1,$d1,$mask
	andc	$d2,$d2,$mask
	andc	$d3,$d3,$mask			# can be partially reduced...
	li	$mask,3

	srwi	$padbit,$d3,2			# ... so reduce
	and	$h4,$d3,$mask
	andc	$d3,$d3,$mask
	add	$d3,$d3,$padbit
___
						if ($SIZE_T==4) {
$code.=<<___;
	addc	$h0,$h0,$d3
	adde	$h1,$h1,$d0
	adde	$h2,$h2,$d1
	adde	$h3,$h3,$d2
	addze	$h4,$h4

	addic	$d0,$h0,5			# compare to modulus
	addze	$d1,$h1
	addze	$d2,$h2
	addze	$d3,$h3
	addze	$mask,$h4

	srwi	$mask,$mask,2			# did it carry/borrow?
	neg	$mask,$mask
	srawi	$mask,$mask,31			# mask

	andc	$h0,$h0,$mask
	and	$d0,$d0,$mask
	andc	$h1,$h1,$mask
	and	$d1,$d1,$mask
	or	$h0,$h0,$d0
	lwz	$d0,0($nonce)			# load nonce
	andc	$h2,$h2,$mask
	and	$d2,$d2,$mask
	or	$h1,$h1,$d1
	lwz	$d1,4($nonce)
	andc	$h3,$h3,$mask
	and	$d3,$d3,$mask
	or	$h2,$h2,$d2
	lwz	$d2,8($nonce)
	or	$h3,$h3,$d3
	lwz	$d3,12($nonce)

	addc	$h0,$h0,$d0			# accumulate nonce
	adde	$h1,$h1,$d1
	adde	$h2,$h2,$d2
	adde	$h3,$h3,$d3
___
						} else {
$code.=<<___;
	add	$h0,$h0,$d3
	add	$h1,$h1,$d0
	add	$h2,$h2,$d1
	add	$h3,$h3,$d2

	srdi	$d0,$h0,32
	add	$h1,$h1,$d0
	srdi	$d1,$h1,32
	add	$h2,$h2,$d1
	srdi	$d2,$h2,32
	add	$h3,$h3,$d2
	srdi	$d3,$h3,32
	add	$h4,$h4,$d3

	insrdi	$h0,$h1,32,0
	insrdi	$h2,$h3,32,0

	addic	$d0,$h0,5			# compare to modulus
	addze	$d1,$h2
	addze	$d2,$h4

	srdi	$mask,$d2,2			# did it carry/borrow?
	neg	$mask,$mask
	sradi	$mask,$mask,63			# mask
	ld	$d2,0($nonce)			# load nonce
	ld	$d3,8($nonce)

	andc	$h0,$h0,$mask
	and	$d0,$d0,$mask
	andc	$h2,$h2,$mask
	and	$d1,$d1,$mask
	or	$h0,$h0,$d0
	or	$h2,$h2,$d1
___
$code.=<<___	if (!$LITTLE_ENDIAN);
	rotldi	$d2,$d2,32			# flip nonce words
	rotldi	$d3,$d3,32
___
$code.=<<___;
	addc	$h0,$h0,$d2			# accumulate nonce
	adde	$h2,$h2,$d3

	srdi	$h1,$h0,32
	srdi	$h3,$h2,32
___
						}
# Result store: plain word stores on little-endian targets, byte-reversing
# stores on big-endian ones.
$code.=<<___	if ($LITTLE_ENDIAN);
	stw	$h0,0($mac)			# write result
	stw	$h1,4($mac)
	stw	$h2,8($mac)
	stw	$h3,12($mac)
___
$code.=<<___	if (!$LITTLE_ENDIAN);
	li	$d1,4
	stwbrx	$h0,0,$mac			# write result
	li	$d2,8
	stwbrx	$h1,$d1,$mac
	li	$d3,12
	stwbrx	$h2,$d2,$mac
	stwbrx	$h3,$d3,$mac
___
$code.=<<___;
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,1,0x80,4,3,0
.size	.poly1305_emit_fpu,.-.poly1305_emit_fpu
___
}
# Ugly hack here, because PPC assembler syntax seem to vary too
# much from platforms to platform...
#
# LPICmeup returns the run-time address of the constant table in $len (r5)
# without relying on relocations.  The bcl to the next instruction places
# the address of the following mflr, i.e. LPICmeup+8, in the link register;
# adding 64-8 to it yields LPICmeup+64.  The routine occupies nine 32-bit
# words (six instructions plus the three-word trailer), so the .space of
# 64-9*4 bytes pads it to exactly 64 bytes and the table starts there.
# The caller's link register is preserved through r0.
#
# Table layout in 8-byte entries, as indexed by poly1305_init_fpu:
#   0..4    2^52-scaled biases for 2^0, 2^32, 2^64, 2^96 and 2^130
#   5       5/2^130
#   6..9    constants used to split r0..r3 into hi/lo parts
#   10..12  constants used to split s1..s3 into hi/lo parts
#   13      FPSCR image (truncate, no exceptions)
$code.=<<___;
.align	6
LPICmeup:
	mflr	r0
	bcl	20,31,\$+4
	mflr	$len	# vvvvvv "distance" between . and 1st data entry
	addi	$len,$len,`64-8`	# borrow $len
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
	.space	`64-9*4`

.quad	0x4330000000000000		# 2^(52+0)
.quad	0x4530000000000000		# 2^(52+32)
.quad	0x4730000000000000		# 2^(52+64)
.quad	0x4930000000000000		# 2^(52+96)
.quad	0x4b50000000000000		# 2^(52+130)

.quad	0x37f4000000000000		# 5/2^130

.quad	0x4430000000000000		# 2^(52+16+0)
.quad	0x4630000000000000		# 2^(52+16+32)
.quad	0x4830000000000000		# 2^(52+16+64)
.quad	0x4a30000000000000		# 2^(52+16+96)
.quad	0x3e30000000000000		# 2^(52+16+0-96)
.quad	0x4030000000000000		# 2^(52+16+32-96)
.quad	0x4230000000000000		# 2^(52+16+64-96)

.quad	0x0000000000000001		# fpscr: truncate, no exceptions
.asciz	"Poly1305 for PPC FPU, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

# Replace every backtick-quoted expression in the accumulated template with
# the result of evaluating it as Perl (this is how the frame offsets are
# computed), then send the text through the translator pipe.  Closing STDOUT
# is checked because a failure there is how a translator error surfaces.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";
744