#!/usr/bin/env perl

######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.

# CBC encrypt/decrypt performance in cycles per byte processed with
# 128-bit key.
#
#		aes-ppc.pl		this
# G4e		35.5/52.1/(23.8)	11.9(*)/15.4
# POWER6	42.7/54.3/(28.2)	63.0/92.8(**)
# POWER7	32.3/42.9/(18.4)	18.5/23.3
#
# (*)	This is ~10% worse than reported in the paper. The reason is
#	twofold. First, this module doesn't make any assumptions about
#	key schedule (or data, for that matter) alignment and handles
#	it in-line. Second, being transliterated from vpaes-x86_64.pl,
#	it relies on "nested inversion", which is better suited for
#	Intel CPUs.
# (**)	Inadequate POWER6 performance is due to astronomical AltiVec
#	latency, 9 cycles per simple logical operation.

30$flavour = shift;
31
32if ($flavour =~ /64/) {
33	$SIZE_T	=8;
34	$LRSAVE	=2*$SIZE_T;
35	$STU	="stdu";
36	$POP	="ld";
37	$PUSH	="std";
38	$UCMP	="cmpld";
39} elsif ($flavour =~ /32/) {
40	$SIZE_T	=4;
41	$LRSAVE	=$SIZE_T;
42	$STU	="stwu";
43	$POP	="lwz";
44	$PUSH	="stw";
45	$UCMP	="cmplw";
46} else { die "nonsense $flavour"; }
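
# A minimal usage sketch (editor's note; the exact flavour strings and the
# output-file handling come from the surrounding build system, not from this
# file):
#
#	perl vpaes-ppc.pl linux64le vpaes-ppc.s
#
# "64"/"32" in the flavour picks the ABI parameters above, and a trailing
# "le" enables the little-endian fixups applied near the bottom of this
# script.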
47
48$sp="r1";
49$FRAME=6*$SIZE_T+13*16;	# 13*16 is for v20-v31 offload
50
51$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
52( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
53( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
54die "can't locate ppc-xlate.pl";
55
56open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
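
# Everything emitted below is first accumulated into $code as ppc-xlate-style
# assembly and then post-processed by the loop at the bottom of this file
# before being written to the pipe opened above.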
57
58$code.=<<___;
59.machine	"any"
60
61.text
62
63.align	7	# totally strategic alignment
64_vpaes_consts:
65Lk_mc_forward:	# mc_forward
66	.long	0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c	?inv
67	.long	0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300	?inv
68	.long	0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704	?inv
69	.long	0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08	?inv
70Lk_mc_backward:	# mc_backward
71	.long	0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e	?inv
72	.long	0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a	?inv
73	.long	0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506	?inv
74	.long	0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102	?inv
75Lk_sr:		# sr
76	.long	0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f	?inv
77	.long	0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b	?inv
78	.long	0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07	?inv
79	.long	0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603	?inv
80
81##
82## "Hot" constants
83##
84Lk_inv:		# inv, inva
85	.long	0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704	?rev
86	.long	0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03	?rev
87Lk_ipt:		# input transform (lo, hi)
88	.long	0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca	?rev
89	.long	0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd	?rev
90Lk_sbo:		# sbou, sbot
91	.long	0x00c7bd6f, 0x176dd2d0, 0x78a802c5, 0x7abfaa15	?rev
92	.long	0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e	?rev
93Lk_sb1:		# sb1u, sb1t
94	.long	0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b	?rev
95	.long	0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5	?rev
96Lk_sb2:		# sb2u, sb2t
97	.long	0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2	?rev
98	.long	0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e	?rev
99
100##
101##  Decryption stuff
102##
103Lk_dipt:	# decryption input transform
104	.long	0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15	?rev
105	.long	0x00650560, 0xe683e386, 0x94f191f4, 0x72177712	?rev
106Lk_dsbo:	# decryption sbox final output
107	.long	0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7	?rev
108	.long	0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca	?rev
109Lk_dsb9:	# decryption sbox output *9*u, *9*t
110	.long	0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca	?rev
111	.long	0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72	?rev
112Lk_dsbd:	# decryption sbox output *D*u, *D*t
113	.long	0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5	?rev
114	.long	0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129	?rev
115Lk_dsbb:	# decryption sbox output *B*u, *B*t
116	.long	0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660	?rev
117	.long	0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3	?rev
118Lk_dsbe:	# decryption sbox output *E*u, *E*t
119	.long	0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222	?rev
120	.long	0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794	?rev
121
122##
123##  Key schedule constants
124##
125Lk_dksd:	# decryption key schedule: invskew x*D
126	.long	0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007	?rev
127	.long	0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f	?rev
128Lk_dksb:	# decryption key schedule: invskew x*B
129	.long	0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603	?rev
130	.long	0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9	?rev
131Lk_dkse:	# decryption key schedule: invskew x*E + 0x63
132	.long	0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553	?rev
133	.long	0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd	?rev
134Lk_dks9:	# decryption key schedule: invskew x*9
135	.long	0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a	?rev
136	.long	0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b	?rev
137
138Lk_rcon:	# rcon
139	.long	0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70	?asis
140Lk_s63:
141	.long	0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b	?asis
142
143Lk_opt:		# output transform
144	.long	0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7	?rev
145	.long	0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1	?rev
146Lk_deskew:	# deskew tables: inverts the sbox's "skew"
147	.long	0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d	?rev
148	.long	0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128	?rev
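##
##  Lconsts
##
##  Position-independent lookup of _vpaes_consts (editorial note): the
##  bcl below sets the link register to the address of the following
##  mflr, which sits 0x308 bytes past _vpaes_consts, so subtracting
##  0x308 leaves the table base in r12 for callers to index from.
##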
149.align	5
150Lconsts:
151	mflr	r0
152	bcl	20,31,\$+4
153	mflr	r12	#vvvvv "distance between . and _vpaes_consts
154	addi	r12,r12,-0x308
155	mtlr	r0
156	blr
157	.long	0
158	.byte	0,12,0x14,0,0,0,0,0
159.asciz  "Vector Permutation AES for AltiVec, Mike Hamburg (Stanford University)"
160.align	6
161___
162
163my ($inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm) = map("v$_",(26..31));
164{
165my ($inp,$out,$key) = map("r$_",(3..5));
166
167my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_",(10..15));
168my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_",(16..19));
169my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_",(16..23));
170
171$code.=<<___;
##
##  _vpaes_encrypt_preheat
##
##  Sets r12 -> _vpaes_consts (via Lconsts, so the code stays
##  position-independent) and fills v7-v19 with the constants the
##  encryption code below expects (the x86 original fills %r10 and
##  %xmm9-%xmm15).
##
178.align	4
179_vpaes_encrypt_preheat:
180	mflr	r8
181	bl	Lconsts
182	mtlr	r8
183	li	r11, 0xc0		# Lk_inv
184	li	r10, 0xd0
185	li	r9,  0xe0		# Lk_ipt
186	li	r8,  0xf0
187	vxor	v7, v7, v7		# 0x00..00
188	vspltisb	v8,4		# 0x04..04
189	vspltisb	v9,0x0f		# 0x0f..0f
190	lvx	$invlo, r12, r11
191	li	r11, 0x100
192	lvx	$invhi, r12, r10
193	li	r10, 0x110
194	lvx	$iptlo, r12, r9
195	li	r9,  0x120
196	lvx	$ipthi, r12, r8
197	li	r8,  0x130
198	lvx	$sbou, r12, r11
199	li	r11, 0x140
200	lvx	$sbot, r12, r10
201	li	r10, 0x150
202	lvx	$sb1u, r12, r9
203	lvx	$sb1t, r12, r8
204	lvx	$sb2u, r12, r11
205	lvx	$sb2t, r12, r10
206	blr
207	.long	0
208	.byte	0,12,0x14,0,0,0,0,0
209
##
##  _vpaes_encrypt_core
##
##  AES-encrypt v0 (%xmm0 in the x86 original).
##
##  Inputs:
##     v0 = input block
##     v7-v19 as set up by _vpaes_encrypt_preheat
##     r5 = scheduled keys, r12 = _vpaes_consts,
##     v31 = key-alignment permutation
##
##  Output in v0
##  Clobbers v1-v6, r8-r11, ctr
##
224.align 5
225_vpaes_encrypt_core:
226	lwz	r8, 240($key)		# pull rounds
227	li	r9, 16
228	lvx	v5, 0, $key		# vmovdqu	(%r9),	%xmm5		# round0 key
229	li	r11, 0x10
230	lvx	v6, r9, $key
231	addi	r9, r9, 16
232	?vperm	v5, v5, v6, $keyperm	# align round key
233	addi	r10, r11, 0x40
234	vsrb	v1, v0, v8		# vpsrlb	\$4,	%xmm0,	%xmm0
235	vperm	v0, $iptlo, $iptlo, v0	# vpshufb	%xmm1,	%xmm2,	%xmm1
236	vperm	v1, $ipthi, $ipthi, v1	# vpshufb	%xmm0,	%xmm3,	%xmm2
237	vxor	v0, v0, v5		# vpxor	%xmm5,	%xmm1,	%xmm0
238	vxor	v0, v0, v1		# vpxor	%xmm2,	%xmm0,	%xmm0
239	mtctr	r8
240	b	Lenc_entry
241
242.align 4
243Lenc_loop:
244	# middle of middle round
245	vperm	v4, $sb1t, v7, v2	# vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
246	lvx	v1, r12, r11		# vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
247	addi	r11, r11, 16
248	vperm	v0, $sb1u, v7, v3	# vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
249	vxor	v4, v4, v5		# vpxor		%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
250	andi.	r11, r11, 0x30		# and		\$0x30, %r11	# ... mod 4
251	vperm	v5, $sb2t, v7, v2	# vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
252	vxor	v0, v0, v4		# vpxor		%xmm4,	%xmm0,	%xmm0	# 0 = A
253	vperm	v2, $sb2u, v7, v3	# vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
254	lvx	v4, r12, r10		# vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
255	addi	r10, r11, 0x40
256	vperm	v3, v0, v7, v1		# vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
257	vxor	v2, v2, v5		# vpxor		%xmm5,	%xmm2,	%xmm2	# 2 = 2A
258	vperm	v0, v0, v7, v4		# vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
259	vxor	v3, v3, v2		# vpxor		%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
260	vperm	v4, v3, v7, v1		# vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
261	vxor	v0, v0, v3		# vpxor		%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
262	vxor	v0, v0, v4		# vpxor		%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
263
264Lenc_entry:
265	# top of round
266	vsrb	v1, v0, v8		# vpsrlb	\$4,	%xmm0,	%xmm0	# 1 = i
267	vperm	v5, $invhi, $invhi, v0	# vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
268	vxor	v0, v0, v1		# vpxor		%xmm0,	%xmm1,	%xmm1	# 0 = j
269	vperm	v3, $invlo, $invlo, v1	# vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
270	vperm	v4, $invlo, $invlo, v0	# vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
271	vand	v0, v0, v9
272	vxor	v3, v3, v5		# vpxor		%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
273	vxor	v4, v4, v5		# vpxor		%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
274	vperm	v2, $invlo, v7, v3	# vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
275	vmr	v5, v6
276	lvx	v6, r9, $key		# vmovdqu	(%r9), %xmm5
277	vperm	v3, $invlo, v7, v4	# vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
278	addi	r9, r9, 16
279	vxor	v2, v2, v0		# vpxor		%xmm1,	%xmm2,	%xmm2  	# 2 = io
280	?vperm	v5, v5, v6, $keyperm	# align round key
281	vxor	v3, v3, v1		# vpxor		%xmm0,	%xmm3,	%xmm3	# 3 = jo
282	bdnz	Lenc_loop
283
284	# middle of last round
285	addi	r10, r11, 0x80
286					# vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
287					# vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
288	vperm	v4, $sbou, v7, v2	# vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
289	lvx	v1, r12, r10		# vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
290	vperm	v0, $sbot, v7, v3	# vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
291	vxor	v4, v4, v5		# vpxor		%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
292	vxor	v0, v0, v4		# vpxor		%xmm4,	%xmm0,	%xmm0	# 0 = A
293	vperm	v0, v0, v7, v1		# vpshufb	%xmm1,	%xmm0,	%xmm0
294	blr
295	.long	0
296	.byte	0,12,0x14,0,0,0,0,0
297
298.globl	.vpaes_encrypt
299.align	5
300.vpaes_encrypt:
301	$STU	$sp,-$FRAME($sp)
302	li	r10,`15+6*$SIZE_T`
303	li	r11,`31+6*$SIZE_T`
304	mflr	r6
305	mfspr	r7, 256			# save vrsave
306	stvx	v20,r10,$sp
307	addi	r10,r10,32
308	stvx	v21,r11,$sp
309	addi	r11,r11,32
310	stvx	v22,r10,$sp
311	addi	r10,r10,32
312	stvx	v23,r11,$sp
313	addi	r11,r11,32
314	stvx	v24,r10,$sp
315	addi	r10,r10,32
316	stvx	v25,r11,$sp
317	addi	r11,r11,32
318	stvx	v26,r10,$sp
319	addi	r10,r10,32
320	stvx	v27,r11,$sp
321	addi	r11,r11,32
322	stvx	v28,r10,$sp
323	addi	r10,r10,32
324	stvx	v29,r11,$sp
325	addi	r11,r11,32
326	stvx	v30,r10,$sp
327	stvx	v31,r11,$sp
328	stw	r7,`$FRAME-4`($sp)	# save vrsave
329	li	r0, -1
330	$PUSH	r6,`$FRAME+$LRSAVE`($sp)
331	mtspr	256, r0			# preserve all AltiVec registers
332
333	bl	_vpaes_encrypt_preheat
334
335	?lvsl	$inpperm, 0, $inp	# prepare for unaligned access
336	lvx	v0, 0, $inp
337	addi	$inp, $inp, 15		# 15 is not a typo
338	 ?lvsr	$outperm, 0, $out
339	?lvsl	$keyperm, 0, $key	# prepare for unaligned access
340	 vnor	$outmask, v7, v7	# 0xff..ff
341	lvx	$inptail, 0, $inp	# redundant in aligned case
342	 ?vperm	$outmask, v7, $outmask, $outperm
343	 lvx	$outhead, 0, $out
344	?vperm	v0, v0, $inptail, $inpperm
345
346	bl	_vpaes_encrypt_core
347
348	vperm	v0, v0, v0, $outperm	# rotate right/left
349	vsel	v1, $outhead, v0, $outmask
350	vmr	$outhead, v0
351	stvx	v1, 0, $out
352	addi	$out, $out, 15		# 15 is not a typo
353	########
354
355	lvx	v1, 0, $out		# redundant in aligned case
356	vsel	v1, $outhead, v1, $outmask
357	stvx	v1, 0, $out
358
359	li	r10,`15+6*$SIZE_T`
360	li	r11,`31+6*$SIZE_T`
361	mtlr	r6
362	mtspr	256, r7			# restore vrsave
363	lvx	v20,r10,$sp
364	addi	r10,r10,32
365	lvx	v21,r11,$sp
366	addi	r11,r11,32
367	lvx	v22,r10,$sp
368	addi	r10,r10,32
369	lvx	v23,r11,$sp
370	addi	r11,r11,32
371	lvx	v24,r10,$sp
372	addi	r10,r10,32
373	lvx	v25,r11,$sp
374	addi	r11,r11,32
375	lvx	v26,r10,$sp
376	addi	r10,r10,32
377	lvx	v27,r11,$sp
378	addi	r11,r11,32
379	lvx	v28,r10,$sp
380	addi	r10,r10,32
381	lvx	v29,r11,$sp
382	addi	r11,r11,32
383	lvx	v30,r10,$sp
384	lvx	v31,r11,$sp
385	addi	$sp,$sp,$FRAME
386	blr
387	.long	0
388	.byte	0,12,0x04,1,0x80,0,3,0
389	.long	0
390.size	.vpaes_encrypt,.-.vpaes_encrypt
391
392.align	4
393_vpaes_decrypt_preheat:
394	mflr	r8
395	bl	Lconsts
396	mtlr	r8
397	li	r11, 0xc0		# Lk_inv
398	li	r10, 0xd0
399	li	r9,  0x160		# Ldipt
400	li	r8,  0x170
401	vxor	v7, v7, v7		# 0x00..00
402	vspltisb	v8,4		# 0x04..04
403	vspltisb	v9,0x0f		# 0x0f..0f
404	lvx	$invlo, r12, r11
405	li	r11, 0x180
406	lvx	$invhi, r12, r10
407	li	r10, 0x190
408	lvx	$iptlo, r12, r9
409	li	r9,  0x1a0
410	lvx	$ipthi, r12, r8
411	li	r8,  0x1b0
412	lvx	$sbou, r12, r11
413	li	r11, 0x1c0
414	lvx	$sbot, r12, r10
415	li	r10, 0x1d0
416	lvx	$sb9u, r12, r9
417	li	r9,  0x1e0
418	lvx	$sb9t, r12, r8
419	li	r8,  0x1f0
420	lvx	$sbdu, r12, r11
421	li	r11, 0x200
422	lvx	$sbdt, r12, r10
423	li	r10, 0x210
424	lvx	$sbbu, r12, r9
425	lvx	$sbbt, r12, r8
426	lvx	$sbeu, r12, r11
427	lvx	$sbet, r12, r10
428	blr
429	.long	0
430	.byte	0,12,0x14,0,0,0,0,0
431
432##
433##  Decryption core
434##
435##  Same API as encryption core.
436##
437.align	4
438_vpaes_decrypt_core:
439	lwz	r8, 240($key)		# pull rounds
440	li	r9, 16
441	lvx	v5, 0, $key		# vmovdqu	(%r9),	%xmm4		# round0 key
442	li	r11, 0x30
443	lvx	v6, r9, $key
444	addi	r9, r9, 16
445	?vperm	v5, v5, v6, $keyperm	# align round key
446	vsrb	v1, v0, v8		# vpsrlb	\$4,	%xmm0,	%xmm0
447	vperm	v0, $iptlo, $iptlo, v0	# vpshufb	%xmm1,	%xmm2,	%xmm2
448	vperm	v1, $ipthi, $ipthi, v1	# vpshufb	%xmm0,	%xmm1,	%xmm0
449	vxor	v0, v0, v5		# vpxor	%xmm4,	%xmm2,	%xmm2
450	vxor	v0, v0, v1		# vpxor	%xmm2,	%xmm0,	%xmm0
451	mtctr	r8
452	b	Ldec_entry
453
454.align 4
455Ldec_loop:
456#
457#  Inverse mix columns
458#
459	lvx	v0, r12, r11		# v5 and v0 are flipped
460					# vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
461					# vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
462	vperm	v4, $sb9u, v7, v2	# vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
463	subi	r11, r11, 16
464	vperm	v1, $sb9t, v7, v3	# vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
465	andi.	r11, r11, 0x30
466	vxor	v5, v5, v4		# vpxor		%xmm4,	%xmm0,	%xmm0
467					# vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
468	vxor	v5, v5, v1		# vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
469					# vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
470
471	vperm	v4, $sbdu, v7, v2	# vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
472	vperm 	v5, v5, v7, v0		# vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
473	vperm	v1, $sbdt, v7, v3	# vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
474	vxor	v5, v5, v4		# vpxor		%xmm4,	%xmm0,	%xmm0		# 4 = ch
475					# vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
476	vxor	v5, v5, v1		# vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
477					# vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
478
479	vperm	v4, $sbbu, v7, v2	# vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
480	vperm	v5, v5, v7, v0		# vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
481	vperm	v1, $sbbt, v7, v3	# vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
482	vxor	v5, v5, v4		# vpxor		%xmm4,	%xmm0,	%xmm0		# 4 = ch
483					# vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
484	vxor	v5, v5, v1		# vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
485					# vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet
486
487	vperm	v4, $sbeu, v7, v2	# vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
488	vperm	v5, v5, v7, v0		# vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
489	vperm	v1, $sbet, v7, v3	# vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
490	vxor	v0, v5, v4		# vpxor		%xmm4,	%xmm0,	%xmm0		# 4 = ch
491	vxor	v0, v0, v1		# vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
492
493Ldec_entry:
494	# top of round
495	vsrb	v1, v0, v8		# vpsrlb	\$4,	%xmm0,	%xmm0	# 1 = i
496	vperm	v2, $invhi, $invhi, v0	# vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
497	vxor	v0, v0, v1		# vpxor		%xmm0,	%xmm1,	%xmm1	# 0 = j
498	vperm	v3, $invlo, $invlo, v1	# vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
499	vperm	v4, $invlo, $invlo, v0	# vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
500	vand	v0, v0, v9
501	vxor	v3, v3, v2		# vpxor		%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
502	vxor	v4, v4, v2		# vpxor		%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
503	vperm	v2, $invlo, v7, v3	# vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
504	vmr	v5, v6
505	lvx	v6, r9, $key		# vmovdqu	(%r9),	%xmm0
506	vperm	v3, $invlo, v7, v4	# vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
507	addi	r9, r9, 16
508	vxor	v2, v2, v0		# vpxor		%xmm1,	%xmm2,	%xmm2	# 2 = io
509	?vperm	v5, v5, v6, $keyperm	# align round key
510	vxor	v3, v3, v1		# vpxor		%xmm0,  %xmm3,	%xmm3	# 3 = jo
511	bdnz	Ldec_loop
512
513	# middle of last round
514	addi	r10, r11, 0x80
515					# vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
516	vperm	v4, $sbou, v7, v2	# vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
517					# vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
518	lvx	v2, r12, r10		# vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
519	vperm	v1, $sbot, v7, v3	# vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
520	vxor	v4, v4, v5		# vpxor		%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
521	vxor	v0, v1, v4		# vpxor		%xmm4,	%xmm1,	%xmm0	# 0 = A
522	vperm	v0, v0, v7, v2		# vpshufb	%xmm2,	%xmm0,	%xmm0
523	blr
524	.long	0
525	.byte	0,12,0x14,0,0,0,0,0
526
527.globl	.vpaes_decrypt
528.align	5
529.vpaes_decrypt:
530	$STU	$sp,-$FRAME($sp)
531	li	r10,`15+6*$SIZE_T`
532	li	r11,`31+6*$SIZE_T`
533	mflr	r6
534	mfspr	r7, 256			# save vrsave
535	stvx	v20,r10,$sp
536	addi	r10,r10,32
537	stvx	v21,r11,$sp
538	addi	r11,r11,32
539	stvx	v22,r10,$sp
540	addi	r10,r10,32
541	stvx	v23,r11,$sp
542	addi	r11,r11,32
543	stvx	v24,r10,$sp
544	addi	r10,r10,32
545	stvx	v25,r11,$sp
546	addi	r11,r11,32
547	stvx	v26,r10,$sp
548	addi	r10,r10,32
549	stvx	v27,r11,$sp
550	addi	r11,r11,32
551	stvx	v28,r10,$sp
552	addi	r10,r10,32
553	stvx	v29,r11,$sp
554	addi	r11,r11,32
555	stvx	v30,r10,$sp
556	stvx	v31,r11,$sp
557	stw	r7,`$FRAME-4`($sp)	# save vrsave
558	li	r0, -1
559	$PUSH	r6,`$FRAME+$LRSAVE`($sp)
560	mtspr	256, r0			# preserve all AltiVec registers
561
562	bl	_vpaes_decrypt_preheat
563
564	?lvsl	$inpperm, 0, $inp	# prepare for unaligned access
565	lvx	v0, 0, $inp
566	addi	$inp, $inp, 15		# 15 is not a typo
567	 ?lvsr	$outperm, 0, $out
568	?lvsl	$keyperm, 0, $key
569	 vnor	$outmask, v7, v7	# 0xff..ff
570	lvx	$inptail, 0, $inp	# redundant in aligned case
571	 ?vperm	$outmask, v7, $outmask, $outperm
572	 lvx	$outhead, 0, $out
573	?vperm	v0, v0, $inptail, $inpperm
574
575	bl	_vpaes_decrypt_core
576
577	vperm	v0, v0, v0, $outperm	# rotate right/left
578	vsel	v1, $outhead, v0, $outmask
579	vmr	$outhead, v0
580	stvx	v1, 0, $out
581	addi	$out, $out, 15		# 15 is not a typo
582	########
583
584	lvx	v1, 0, $out		# redundant in aligned case
585	vsel	v1, $outhead, v1, $outmask
586	stvx	v1, 0, $out
587
588	li	r10,`15+6*$SIZE_T`
589	li	r11,`31+6*$SIZE_T`
590	mtlr	r6
591	mtspr	256, r7			# restore vrsave
592	lvx	v20,r10,$sp
593	addi	r10,r10,32
594	lvx	v21,r11,$sp
595	addi	r11,r11,32
596	lvx	v22,r10,$sp
597	addi	r10,r10,32
598	lvx	v23,r11,$sp
599	addi	r11,r11,32
600	lvx	v24,r10,$sp
601	addi	r10,r10,32
602	lvx	v25,r11,$sp
603	addi	r11,r11,32
604	lvx	v26,r10,$sp
605	addi	r10,r10,32
606	lvx	v27,r11,$sp
607	addi	r11,r11,32
608	lvx	v28,r10,$sp
609	addi	r10,r10,32
610	lvx	v29,r11,$sp
611	addi	r11,r11,32
612	lvx	v30,r10,$sp
613	lvx	v31,r11,$sp
614	addi	$sp,$sp,$FRAME
615	blr
616	.long	0
617	.byte	0,12,0x04,1,0x80,0,3,0
618	.long	0
619.size	.vpaes_decrypt,.-.vpaes_decrypt
620
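##
##  vpaes_cbc_encrypt
##
##  Editorial note: the register usage below corresponds to the usual
##  AES_cbc_encrypt-style prototype, presumably
##	void vpaes_cbc_encrypt(const unsigned char *inp, unsigned char *out,
##				size_t len, const AES_KEY *key,
##				unsigned char *ivec, int enc);
##  i.e. r3 = inp, r4 = out, r5 = len, r6 = key, r7 = ivec, r8 = enc.
##  Lengths below 16 return immediately and len is rounded down to a
##  multiple of 16 bytes.
##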
621.globl	.vpaes_cbc_encrypt
622.align	5
623.vpaes_cbc_encrypt:
624	${UCMP}i r5,16
625	bltlr-
626
627	$STU	$sp,-`($FRAME+2*$SIZE_T)`($sp)
628	mflr	r0
629	li	r10,`15+6*$SIZE_T`
630	li	r11,`31+6*$SIZE_T`
631	mfspr	r12, 256
632	stvx	v20,r10,$sp
633	addi	r10,r10,32
634	stvx	v21,r11,$sp
635	addi	r11,r11,32
636	stvx	v22,r10,$sp
637	addi	r10,r10,32
638	stvx	v23,r11,$sp
639	addi	r11,r11,32
640	stvx	v24,r10,$sp
641	addi	r10,r10,32
642	stvx	v25,r11,$sp
643	addi	r11,r11,32
644	stvx	v26,r10,$sp
645	addi	r10,r10,32
646	stvx	v27,r11,$sp
647	addi	r11,r11,32
648	stvx	v28,r10,$sp
649	addi	r10,r10,32
650	stvx	v29,r11,$sp
651	addi	r11,r11,32
652	stvx	v30,r10,$sp
653	stvx	v31,r11,$sp
654	stw	r12,`$FRAME-4`($sp)	# save vrsave
655	$PUSH	r30,`$FRAME+$SIZE_T*0`($sp)
656	$PUSH	r31,`$FRAME+$SIZE_T*1`($sp)
657	li	r9, -16
658	$PUSH	r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
659
660	and	r30, r5, r9		# copy length&-16
661	mr	r5, r6			# copy pointer to key
662	mr	r31, r7			# copy pointer to iv
663	blt	Lcbc_abort
664	cmpwi	r8, 0			# test direction
665	li	r6, -1
666	mr	r7, r12			# copy vrsave
667	mtspr	256, r6			# preserve all AltiVec registers
668
669	lvx	v24, 0, r31		# load [potentially unaligned] iv
670	li	r9, 15
671	?lvsl	$inpperm, 0, r31
672	lvx	v25, r9, r31
673	?vperm	v24, v24, v25, $inpperm
674
675	neg	r8, $inp		# prepare for unaligned access
676	 vxor	v7, v7, v7
677	?lvsl	$keyperm, 0, $key
678	 ?lvsr	$outperm, 0, $out
679	?lvsr	$inpperm, 0, r8		# -$inp
680	 vnor	$outmask, v7, v7	# 0xff..ff
681	lvx	$inptail, 0, $inp
682	 ?vperm	$outmask, v7, $outmask, $outperm
683	addi	$inp, $inp, 15		# 15 is not a typo
684	 lvx	$outhead, 0, $out
685
686	beq	Lcbc_decrypt
687
688	bl	_vpaes_encrypt_preheat
689	li	r0, 16
690
691Lcbc_enc_loop:
692	vmr	v0, $inptail
693	lvx	$inptail, 0, $inp
694	addi	$inp, $inp, 16
695	?vperm	v0, v0, $inptail, $inpperm
696	vxor	v0, v0, v24		# ^= iv
697
698	bl	_vpaes_encrypt_core
699
700	vmr	v24, v0			# put aside iv
701	sub.	r30, r30, r0		# len -= 16
702	vperm	v0, v0, v0, $outperm	# rotate right/left
703	vsel	v1, $outhead, v0, $outmask
704	vmr	$outhead, v0
705	stvx	v1, 0, $out
706	addi	$out, $out, 16
707	bne	Lcbc_enc_loop
708
709	b	Lcbc_done
710
711.align	5
712Lcbc_decrypt:
713	bl	_vpaes_decrypt_preheat
714	li	r0, 16
715
716Lcbc_dec_loop:
717	vmr	v0, $inptail
718	lvx	$inptail, 0, $inp
719	addi	$inp, $inp, 16
720	?vperm	v0, v0, $inptail, $inpperm
721	vmr	v25, v0			# put aside input
722
723	bl	_vpaes_decrypt_core
724
725	vxor	v0, v0, v24		# ^= iv
726	vmr	v24, v25
727	sub.	r30, r30, r0		# len -= 16
728	vperm	v0, v0, v0, $outperm	# rotate right/left
729	vsel	v1, $outhead, v0, $outmask
730	vmr	$outhead, v0
731	stvx	v1, 0, $out
732	addi	$out, $out, 16
733	bne	Lcbc_dec_loop
734
735Lcbc_done:
736	addi	$out, $out, -1
737	lvx	v1, 0, $out		# redundant in aligned case
738	vsel	v1, $outhead, v1, $outmask
739	stvx	v1, 0, $out
740
741	neg	r8, r31			# write [potentially unaligned] iv
742	?lvsl	$outperm, 0, r8
743	li	r6, 15
744	vnor	$outmask, v7, v7	# 0xff..ff
745	?vperm	$outmask, v7, $outmask, $outperm
746	lvx	$outhead, 0, r31
747	vperm	v24, v24, v24, $outperm	# rotate right/left
748	vsel	v0, $outhead, v24, $outmask
749	lvx	v1, r6, r31
750	stvx	v0, 0, r31
751	vsel	v1, v24, v1, $outmask
752	stvx	v1, r6, r31
753
754	mtspr	256, r7			# restore vrsave
755	li	r10,`15+6*$SIZE_T`
756	li	r11,`31+6*$SIZE_T`
757	lvx	v20,r10,$sp
758	addi	r10,r10,32
759	lvx	v21,r11,$sp
760	addi	r11,r11,32
761	lvx	v22,r10,$sp
762	addi	r10,r10,32
763	lvx	v23,r11,$sp
764	addi	r11,r11,32
765	lvx	v24,r10,$sp
766	addi	r10,r10,32
767	lvx	v25,r11,$sp
768	addi	r11,r11,32
769	lvx	v26,r10,$sp
770	addi	r10,r10,32
771	lvx	v27,r11,$sp
772	addi	r11,r11,32
773	lvx	v28,r10,$sp
774	addi	r10,r10,32
775	lvx	v29,r11,$sp
776	addi	r11,r11,32
777	lvx	v30,r10,$sp
778	lvx	v31,r11,$sp
779Lcbc_abort:
780	$POP	r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
781	$POP	r30,`$FRAME+$SIZE_T*0`($sp)
782	$POP	r31,`$FRAME+$SIZE_T*1`($sp)
783	mtlr	r0
784	addi	$sp,$sp,`$FRAME+$SIZE_T*2`
785	blr
786	.long	0
787	.byte	0,12,0x04,1,0x80,2,6,0
788	.long	0
789.size	.vpaes_cbc_encrypt,.-.vpaes_cbc_encrypt
790___
791}
792{
793my ($inp,$bits,$out)=map("r$_",(3..5));
794my $dir="cr1";
795my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_",(10..13,24));
796
797$code.=<<___;
798########################################################
799##                                                    ##
800##                  AES key schedule                  ##
801##                                                    ##
802########################################################
803.align	4
804_vpaes_key_preheat:
805	mflr	r8
806	bl	Lconsts
807	mtlr	r8
808	li	r11, 0xc0		# Lk_inv
809	li	r10, 0xd0
810	li	r9,  0xe0		# L_ipt
811	li	r8,  0xf0
812
813	vspltisb	v8,4		# 0x04..04
814	vxor	v9,v9,v9		# 0x00..00
815	lvx	$invlo, r12, r11	# Lk_inv
816	li	r11, 0x120
817	lvx	$invhi, r12, r10
818	li	r10, 0x130
819	lvx	$iptlo, r12, r9		# Lk_ipt
820	li	r9, 0x220
821	lvx	$ipthi, r12, r8
822	li	r8, 0x230
823
824	lvx	v14, r12, r11		# Lk_sb1
825	li	r11, 0x240
826	lvx	v15, r12, r10
827	li	r10, 0x250
828
829	lvx	v16, r12, r9		# Lk_dksd
830	li	r9, 0x260
831	lvx	v17, r12, r8
832	li	r8, 0x270
833	lvx	v18, r12, r11		# Lk_dksb
834	li	r11, 0x280
835	lvx	v19, r12, r10
836	li	r10, 0x290
837	lvx	v20, r12, r9		# Lk_dkse
838	li	r9, 0x2a0
839	lvx	v21, r12, r8
840	li	r8, 0x2b0
841	lvx	v22, r12, r11		# Lk_dks9
842	lvx	v23, r12, r10
843
844	lvx	v24, r12, r9		# Lk_rcon
845	lvx	v25, 0, r12		# Lk_mc_forward[0]
846	lvx	v26, r12, r8		# Lks63
847	blr
848	.long	0
849	.byte	0,12,0x14,0,0,0,0,0
850
851.align	4
852_vpaes_schedule_core:
853	mflr	r7
854
855	bl	_vpaes_key_preheat	# load the tables
856
857	#lvx	v0, 0, $inp		# vmovdqu	(%rdi),	%xmm0		# load key (unaligned)
858	neg	r8, $inp		# prepare for unaligned access
859	lvx	v0, 0, $inp
	addi	$inp, $inp, 15		# 15 is not a typo
861	?lvsr	$inpperm, 0, r8		# -$inp
862	lvx	v6, 0, $inp		# v6 serves as inptail
863	addi	$inp, $inp, 8
864	?vperm	v0, v0, v6, $inpperm
865
866	# input transform
867	vmr	v3, v0			# vmovdqa	%xmm0,	%xmm3
868	bl	_vpaes_schedule_transform
869	vmr	v7, v0			# vmovdqa	%xmm0,	%xmm7
870
871	bne	$dir, Lschedule_am_decrypting
872
873	# encrypting, output zeroth round key after transform
874	li	r8, 0x30		# mov	\$0x30,%r8d
875	addi	r10, r12, 0x80		# lea	.Lk_sr(%rip),%r10
876
877	?lvsr	$outperm, 0, $out	# prepare for unaligned access
878	vnor	$outmask, v9, v9	# 0xff..ff
879	lvx	$outhead, 0, $out
880	?vperm	$outmask, v9, $outmask, $outperm
881
882	#stvx	v0, 0, $out		# vmovdqu	%xmm0,	(%rdx)
883	vperm	v1, v0, v0, $outperm	# rotate right/left
884	vsel	v2, $outhead, v1, $outmask
885	vmr	$outhead, v1
886	stvx	v2, 0, $out
887	b	Lschedule_go
888
889Lschedule_am_decrypting:
890	srwi	r8, $bits, 1		# shr	\$1,%r8d
891	andi.	r8, r8, 32		# and	\$32,%r8d
892	xori	r8, r8, 32		# xor	\$32,%r8d	# nbits==192?0:32
893	addi	r10, r12, 0x80		# lea	.Lk_sr(%rip),%r10
894	# decrypting, output zeroth round key after shiftrows
895	lvx	v1, r8, r10		# vmovdqa	(%r8,%r10),	%xmm1
896	vperm	v4, v3, v3, v1		# vpshufb	%xmm1,	%xmm3,	%xmm3
897
898	neg	r0, $out		# prepare for unaligned access
899	?lvsl	$outperm, 0, r0
	addi	$out, $out, 15		# 15 is not a typo
901	vnor	$outmask, v9, v9	# 0xff..ff
902	lvx	$outhead, 0, $out
903	?vperm	$outmask, $outmask, v9, $outperm
904
905	#stvx	v4, 0, $out		# vmovdqu	%xmm3,	(%rdx)
906	vperm	v4, v4, v4, $outperm	# rotate right/left
907	vsel	v2, $outhead, v4, $outmask
908	vmr	$outhead, v4
909	stvx	v2, 0, $out
910	xori	r8, r8, 0x30		# xor	\$0x30, %r8
911
912Lschedule_go:
913	cmplwi	$bits, 192		# cmp	\$192,	%esi
914	bgt	Lschedule_256
915	beq	Lschedule_192
	# 128: fall through
917
918##
919##  .schedule_128
920##
921##  128-bit specific part of key schedule.
922##
923##  This schedule is really simple, because all its parts
924##  are accomplished by the subroutines.
925##
926Lschedule_128:
927	li	r0, 10			# mov	\$10, %esi
928	mtctr	r0
929
930Loop_schedule_128:
931	bl 	_vpaes_schedule_round
932	bdz 	Lschedule_mangle_last	# dec	%esi
933	bl	_vpaes_schedule_mangle	# write output
934	b 	Loop_schedule_128
935
936##
937##  .aes_schedule_192
938##
939##  192-bit specific part of key schedule.
940##
941##  The main body of this schedule is the same as the 128-bit
942##  schedule, but with more smearing.  The long, high side is
943##  stored in %xmm7 as before, and the short, low side is in
944##  the high bits of %xmm6.
945##
946##  This schedule is somewhat nastier, however, because each
947##  round produces 192 bits of key material, or 1.5 round keys.
948##  Therefore, on each cycle we do 2 rounds and produce 3 round
949##  keys.
950##
951.align	4
952Lschedule_192:
953	li	r0, 4			# mov	\$4,	%esi
954	lvx	v0, 0, $inp
955	?vperm	v0, v6, v0, $inpperm
956	?vsldoi	v0, v3, v0, 8		# vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
957	bl	_vpaes_schedule_transform	# input transform
958	?vsldoi	v6, v0, v9, 8
959	?vsldoi	v6, v9, v6, 8		# clobber "low" side with zeros
960	mtctr	r0
961
962Loop_schedule_192:
963	bl	_vpaes_schedule_round
964	?vsldoi	v0, v6, v0, 8		# vpalignr	\$8,%xmm6,%xmm0,%xmm0
965	bl	_vpaes_schedule_mangle	# save key n
966	bl	_vpaes_schedule_192_smear
967	bl	_vpaes_schedule_mangle	# save key n+1
968	bl	_vpaes_schedule_round
969	bdz 	Lschedule_mangle_last	# dec	%esi
970	bl	_vpaes_schedule_mangle	# save key n+2
971	bl	_vpaes_schedule_192_smear
972	b	Loop_schedule_192
973
974##
975##  .aes_schedule_256
976##
977##  256-bit specific part of key schedule.
978##
979##  The structure here is very similar to the 128-bit
980##  schedule, but with an additional "low side" in
981##  %xmm6.  The low side's rounds are the same as the
982##  high side's, except no rcon and no rotation.
983##
984.align	4
985Lschedule_256:
986	li	r0, 7			# mov	\$7, %esi
987	addi	$inp, $inp, 8
988	lvx	v0, 0, $inp		# vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
989	?vperm	v0, v6, v0, $inpperm
990	bl	_vpaes_schedule_transform	# input transform
991	mtctr	r0
992
993Loop_schedule_256:
994	bl	_vpaes_schedule_mangle	# output low result
995	vmr	v6, v0			# vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6
996
997	# high round
998	bl	_vpaes_schedule_round
999	bdz 	Lschedule_mangle_last	# dec	%esi
1000	bl	_vpaes_schedule_mangle
1001
1002	# low round. swap xmm7 and xmm6
1003	?vspltw	v0, v0, 3		# vpshufd	\$0xFF,	%xmm0,	%xmm0
1004	vmr	v5, v7			# vmovdqa	%xmm7,	%xmm5
1005	vmr	v7, v6			# vmovdqa	%xmm6,	%xmm7
1006	bl	_vpaes_schedule_low_round
1007	vmr	v7, v5			# vmovdqa	%xmm5,	%xmm7
1008
1009	b	Loop_schedule_256
1010##
1011##  .aes_schedule_mangle_last
1012##
1013##  Mangler for last round of key schedule
1014##  Mangles %xmm0
1015##    when encrypting, outputs out(%xmm0) ^ 63
1016##    when decrypting, outputs unskew(%xmm0)
1017##
1018##  Always called right before return... jumps to cleanup and exits
1019##
1020.align	4
1021Lschedule_mangle_last:
1022	# schedule last round key from xmm0
1023	li	r11, 0x2e0		# lea	.Lk_deskew(%rip),%r11
1024	li	r9,  0x2f0
1025	bne	$dir, Lschedule_mangle_last_dec
1026
1027	# encrypting
1028	lvx	v1, r8, r10		# vmovdqa	(%r8,%r10),%xmm1
1029	li	r11, 0x2c0		# lea		.Lk_opt(%rip),	%r11	# prepare to output transform
1030	li	r9,  0x2d0		# prepare to output transform
1031	vperm	v0, v0, v0, v1		# vpshufb	%xmm1,	%xmm0,	%xmm0	# output permute
1032
1033	lvx	$iptlo, r11, r12	# reload $ipt
1034	lvx	$ipthi, r9, r12
1035	addi	$out, $out, 16		# add	\$16,	%rdx
1036	vxor	v0, v0, v26		# vpxor		.Lk_s63(%rip),	%xmm0,	%xmm0
1037	bl	_vpaes_schedule_transform	# output transform
1038
1039	#stvx	v0, r0, $out		# vmovdqu	%xmm0,	(%rdx)		# save last key
1040	vperm	v0, v0, v0, $outperm	# rotate right/left
1041	vsel	v2, $outhead, v0, $outmask
1042	vmr	$outhead, v0
1043	stvx	v2, 0, $out
1044
	addi	$out, $out, 15		# 15 is not a typo
1046	lvx	v1, 0, $out		# redundant in aligned case
1047	vsel	v1, $outhead, v1, $outmask
1048	stvx	v1, 0, $out
1049	b	Lschedule_mangle_done
1050
1051.align	4
1052Lschedule_mangle_last_dec:
1053	lvx	$iptlo, r11, r12	# reload $ipt
1054	lvx	$ipthi, r9,  r12
1055	addi	$out, $out, -16		# add	\$-16,	%rdx
1056	vxor	v0, v0, v26		# vpxor	.Lk_s63(%rip),	%xmm0,	%xmm0
1057	bl	_vpaes_schedule_transform	# output transform
1058
1059	#stvx	v0, r0, $out		# vmovdqu	%xmm0,	(%rdx)		# save last key
1060	vperm	v0, v0, v0, $outperm	# rotate right/left
1061	vsel	v2, $outhead, v0, $outmask
1062	vmr	$outhead, v0
1063	stvx	v2, 0, $out
1064
	addi	$out, $out, -15		# -15 is not a typo
1066	lvx	v1, 0, $out		# redundant in aligned case
1067	vsel	v1, $outhead, v1, $outmask
1068	stvx	v1, 0, $out
1069
1070Lschedule_mangle_done:
1071	mtlr	r7
1072	# cleanup
1073	vxor	v0, v0, v0		# vpxor		%xmm0,	%xmm0,	%xmm0
1074	vxor	v1, v1, v1		# vpxor		%xmm1,	%xmm1,	%xmm1
1075	vxor	v2, v2, v2		# vpxor		%xmm2,	%xmm2,	%xmm2
1076	vxor	v3, v3, v3		# vpxor		%xmm3,	%xmm3,	%xmm3
1077	vxor	v4, v4, v4		# vpxor		%xmm4,	%xmm4,	%xmm4
1078	vxor	v5, v5, v5		# vpxor		%xmm5,	%xmm5,	%xmm5
1079	vxor	v6, v6, v6		# vpxor		%xmm6,	%xmm6,	%xmm6
1080	vxor	v7, v7, v7		# vpxor		%xmm7,	%xmm7,	%xmm7
1081
1082	blr
1083	.long	0
1084	.byte	0,12,0x14,0,0,0,0,0
1085
1086##
1087##  .aes_schedule_192_smear
1088##
1089##  Smear the short, low side in the 192-bit key schedule.
1090##
1091##  Inputs:
1092##    %xmm7: high side, b  a  x  y
1093##    %xmm6:  low side, d  c  0  0
1094##    %xmm13: 0
1095##
1096##  Outputs:
1097##    %xmm6: b+c+d  b+c  0  0
1098##    %xmm0: b+c+d  b+c  b  a
1099##
1100.align	4
1101_vpaes_schedule_192_smear:
1102	?vspltw	v0, v7, 3
1103	?vsldoi	v1, v9, v6, 12		# vpshufd	\$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
1104	?vsldoi	v0, v7, v0, 8		# vpshufd	\$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
1105	vxor	v6, v6, v1		# vpxor		%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
1106	vxor	v6, v6, v0		# vpxor		%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
1107	vmr	v0, v6
1108	?vsldoi	v6, v6, v9, 8
1109	?vsldoi	v6, v9, v6, 8		# clobber low side with zeros
1110	blr
1111	.long	0
1112	.byte	0,12,0x14,0,0,0,0,0
1113
1114##
1115##  .aes_schedule_round
1116##
1117##  Runs one main round of the key schedule on %xmm0, %xmm7
1118##
1119##  Specifically, runs subbytes on the high dword of %xmm0
1120##  then rotates it by one byte and xors into the low dword of
1121##  %xmm7.
1122##
1123##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
1124##  next rcon.
1125##
1126##  Smears the dwords of %xmm7 by xoring the low into the
1127##  second low, result into third, result into highest.
1128##
1129##  Returns results in %xmm7 = %xmm0.
1130##  Clobbers %xmm1-%xmm4, %r11.
1131##
1132.align	4
1133_vpaes_schedule_round:
1134	# extract rcon from xmm8
1135	#vxor	v4, v4, v4		# vpxor		%xmm4,	%xmm4,	%xmm4
1136	?vsldoi	v1, $rcon, v9, 15	# vpalignr	\$15,	%xmm8,	%xmm4,	%xmm1
1137	?vsldoi	$rcon, $rcon, $rcon, 15	# vpalignr	\$15,	%xmm8,	%xmm8,	%xmm8
1138	vxor	v7, v7, v1		# vpxor		%xmm1,	%xmm7,	%xmm7
1139
1140	# rotate
1141	?vspltw	v0, v0, 3		# vpshufd	\$0xFF,	%xmm0,	%xmm0
1142	?vsldoi	v0, v0, v0, 1		# vpalignr	\$1,	%xmm0,	%xmm0,	%xmm0
1143
1144	# fall through...
1145
1146	# low round: same as high round, but no rotation and no rcon.
1147_vpaes_schedule_low_round:
1148	# smear xmm7
1149	?vsldoi	v1, v9, v7, 12		# vpslldq	\$4,	%xmm7,	%xmm1
1150	vxor	v7, v7, v1		# vpxor		%xmm1,	%xmm7,	%xmm7
1151	vspltisb	v1, 0x0f	# 0x0f..0f
1152	?vsldoi	v4, v9, v7, 8		# vpslldq	\$8,	%xmm7,	%xmm4
1153
1154	# subbytes
1155	vand	v1, v1, v0		# vpand		%xmm9,	%xmm0,	%xmm1		# 0 = k
1156	vsrb	v0, v0, v8		# vpsrlb	\$4,	%xmm0,	%xmm0		# 1 = i
1157	 vxor	v7, v7, v4		# vpxor		%xmm4,	%xmm7,	%xmm7
1158	vperm	v2, $invhi, v9, v1	# vpshufb	%xmm1,	%xmm11,	%xmm2		# 2 = a/k
1159	vxor	v1, v1, v0		# vpxor		%xmm0,	%xmm1,	%xmm1		# 0 = j
1160	vperm	v3, $invlo, v9, v0	# vpshufb	%xmm0, 	%xmm10,	%xmm3		# 3 = 1/i
1161	vxor	v3, v3, v2		# vpxor		%xmm2,	%xmm3,	%xmm3		# 3 = iak = 1/i + a/k
1162	vperm	v4, $invlo, v9, v1	# vpshufb	%xmm1,	%xmm10,	%xmm4		# 4 = 1/j
1163	 vxor	v7, v7, v26		# vpxor		.Lk_s63(%rip),	%xmm7,	%xmm7
1164	vperm	v3, $invlo, v9, v3	# vpshufb	%xmm3,	%xmm10,	%xmm3		# 2 = 1/iak
1165	vxor	v4, v4, v2		# vpxor		%xmm2,	%xmm4,	%xmm4		# 4 = jak = 1/j + a/k
1166	vperm	v2, $invlo, v9, v4	# vpshufb	%xmm4,	%xmm10,	%xmm2		# 3 = 1/jak
1167	vxor	v3, v3, v1		# vpxor		%xmm1,	%xmm3,	%xmm3		# 2 = io
1168	vxor	v2, v2, v0		# vpxor		%xmm0,	%xmm2,	%xmm2		# 3 = jo
1169	vperm	v4, v15, v9, v3		# vpshufb	%xmm3,	%xmm13,	%xmm4		# 4 = sbou
1170	vperm	v1, v14, v9, v2		# vpshufb	%xmm2,	%xmm12,	%xmm1		# 0 = sb1t
1171	vxor	v1, v1, v4		# vpxor		%xmm4,	%xmm1,	%xmm1		# 0 = sbox output
1172
1173	# add in smeared stuff
1174	vxor	v0, v1, v7		# vpxor		%xmm7,	%xmm1,	%xmm0
1175	vxor	v7, v1, v7		# vmovdqa	%xmm0,	%xmm7
1176	blr
1177	.long	0
1178	.byte	0,12,0x14,0,0,0,0,0
1179
1180##
1181##  .aes_schedule_transform
1182##
1183##  Linear-transform %xmm0 according to tables at (%r11)
1184##
1185##  Requires that %xmm9 = 0x0F0F... as in preheat
1186##  Output in %xmm0
1187##  Clobbers %xmm2
1188##
1189.align	4
1190_vpaes_schedule_transform:
1191	#vand	v1, v0, v9		# vpand		%xmm9,	%xmm0,	%xmm1
1192	vsrb	v2, v0, v8		# vpsrlb	\$4,	%xmm0,	%xmm0
1193					# vmovdqa	(%r11),	%xmm2 	# lo
1194	vperm	v0, $iptlo, $iptlo, v0	# vpshufb	%xmm1,	%xmm2,	%xmm2
1195					# vmovdqa	16(%r11),	%xmm1 # hi
1196	vperm	v2, $ipthi, $ipthi, v2	# vpshufb	%xmm0,	%xmm1,	%xmm0
1197	vxor	v0, v0, v2		# vpxor		%xmm2,	%xmm0,	%xmm0
1198	blr
1199	.long	0
1200	.byte	0,12,0x14,0,0,0,0,0
1201
1202##
1203##  .aes_schedule_mangle
1204##
1205##  Mangle xmm0 from (basis-transformed) standard version
1206##  to our version.
1207##
1208##  On encrypt,
1209##    xor with 0x63
1210##    multiply by circulant 0,1,1,1
1211##    apply shiftrows transform
1212##
1213##  On decrypt,
1214##    xor with 0x63
1215##    multiply by "inverse mixcolumns" circulant E,B,D,9
1216##    deskew
1217##    apply shiftrows transform
1218##
1219##
1220##  Writes out to (%rdx), and increments or decrements it
1221##  Keeps track of round number mod 4 in %r8
1222##  Preserves xmm0
1223##  Clobbers xmm1-xmm5
1224##
1225.align	4
1226_vpaes_schedule_mangle:
1227	#vmr	v4, v0			# vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
1228					# vmovdqa	.Lk_mc_forward(%rip),%xmm5
1229	bne	$dir, Lschedule_mangle_dec
1230
1231	# encrypting
1232	vxor	v4, v0, v26		# vpxor	.Lk_s63(%rip),	%xmm0,	%xmm4
1233	addi	$out, $out, 16		# add	\$16,	%rdx
1234	vperm	v4, v4, v4, v25		# vpshufb	%xmm5,	%xmm4,	%xmm4
1235	vperm	v1, v4, v4, v25		# vpshufb	%xmm5,	%xmm4,	%xmm1
1236	vperm	v3, v1, v1, v25		# vpshufb	%xmm5,	%xmm1,	%xmm3
1237	vxor	v4, v4, v1		# vpxor		%xmm1,	%xmm4,	%xmm4
1238	lvx	v1, r8, r10		# vmovdqa	(%r8,%r10),	%xmm1
1239	vxor	v3, v3, v4		# vpxor		%xmm4,	%xmm3,	%xmm3
1240
1241	vperm	v3, v3, v3, v1		# vpshufb	%xmm1,	%xmm3,	%xmm3
1242	addi	r8, r8, -16		# add	\$-16,	%r8
1243	andi.	r8, r8, 0x30		# and	\$0x30,	%r8
1244
1245	#stvx	v3, 0, $out		# vmovdqu	%xmm3,	(%rdx)
1246	vperm	v1, v3, v3, $outperm	# rotate right/left
1247	vsel	v2, $outhead, v1, $outmask
1248	vmr	$outhead, v1
1249	stvx	v2, 0, $out
1250	blr
1251
1252.align	4
1253Lschedule_mangle_dec:
1254	# inverse mix columns
1255					# lea	.Lk_dksd(%rip),%r11
1256	vsrb	v1, v0, v8		# vpsrlb	\$4,	%xmm4,	%xmm1	# 1 = hi
1257	#and	v4, v0, v9		# vpand		%xmm9,	%xmm4,	%xmm4	# 4 = lo
1258
1259					# vmovdqa	0x00(%r11),	%xmm2
1260	vperm	v2, v16, v16, v0	# vpshufb	%xmm4,	%xmm2,	%xmm2
1261					# vmovdqa	0x10(%r11),	%xmm3
1262	vperm	v3, v17, v17, v1	# vpshufb	%xmm1,	%xmm3,	%xmm3
1263	vxor	v3, v3, v2		# vpxor		%xmm2,	%xmm3,	%xmm3
1264	vperm	v3, v3, v9, v25		# vpshufb	%xmm5,	%xmm3,	%xmm3
1265
1266					# vmovdqa	0x20(%r11),	%xmm2
1267	vperm	v2, v18, v18, v0	# vpshufb	%xmm4,	%xmm2,	%xmm2
1268	vxor	v2, v2, v3		# vpxor		%xmm3,	%xmm2,	%xmm2
1269					# vmovdqa	0x30(%r11),	%xmm3
1270	vperm	v3, v19, v19, v1	# vpshufb	%xmm1,	%xmm3,	%xmm3
1271	vxor	v3, v3, v2		# vpxor		%xmm2,	%xmm3,	%xmm3
1272	vperm	v3, v3, v9, v25		# vpshufb	%xmm5,	%xmm3,	%xmm3
1273
1274					# vmovdqa	0x40(%r11),	%xmm2
1275	vperm	v2, v20, v20, v0	# vpshufb	%xmm4,	%xmm2,	%xmm2
1276	vxor	v2, v2, v3		# vpxor		%xmm3,	%xmm2,	%xmm2
1277					# vmovdqa	0x50(%r11),	%xmm3
1278	vperm	v3, v21, v21, v1	# vpshufb	%xmm1,	%xmm3,	%xmm3
1279	vxor	v3, v3, v2		# vpxor		%xmm2,	%xmm3,	%xmm3
1280
1281					# vmovdqa	0x60(%r11),	%xmm2
1282	vperm	v2, v22, v22, v0	# vpshufb	%xmm4,	%xmm2,	%xmm2
1283	vperm	v3, v3, v9, v25		# vpshufb	%xmm5,	%xmm3,	%xmm3
1284					# vmovdqa	0x70(%r11),	%xmm4
1285	vperm	v4, v23, v23, v1	# vpshufb	%xmm1,	%xmm4,	%xmm4
1286	lvx	v1, r8, r10		# vmovdqa	(%r8,%r10),	%xmm1
1287	vxor	v2, v2, v3		# vpxor		%xmm3,	%xmm2,	%xmm2
1288	vxor	v3, v4, v2		# vpxor		%xmm2,	%xmm4,	%xmm3
1289
1290	addi	$out, $out, -16		# add	\$-16,	%rdx
1291
1292	vperm	v3, v3, v3, v1		# vpshufb	%xmm1,	%xmm3,	%xmm3
1293	addi	r8, r8, -16		# add	\$-16,	%r8
1294	andi.	r8, r8, 0x30		# and	\$0x30,	%r8
1295
1296	#stvx	v3, 0, $out		# vmovdqu	%xmm3,	(%rdx)
1297	vperm	v1, v3, v3, $outperm	# rotate right/left
1298	vsel	v2, $outhead, v1, $outmask
1299	vmr	$outhead, v1
1300	stvx	v2, 0, $out
1301	blr
1302	.long	0
1303	.byte	0,12,0x14,0,0,0,0,0
1304
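##
##  vpaes_set_encrypt_key / vpaes_set_decrypt_key
##
##  Editorial note: both entry points below take the user key in r3, the
##  key size in bits in r4 and the AES_KEY pointer in r5, presumably
##  matching the usual OpenSSL set-key prototype
##	int vpaes_set_encrypt_key(const unsigned char *userKey,
##				int bits, AES_KEY *key);
##  and both return 0 in r3.
##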
1305.globl	.vpaes_set_encrypt_key
1306.align	5
1307.vpaes_set_encrypt_key:
1308	$STU	$sp,-$FRAME($sp)
1309	li	r10,`15+6*$SIZE_T`
1310	li	r11,`31+6*$SIZE_T`
1311	mflr	r0
1312	mfspr	r6, 256			# save vrsave
1313	stvx	v20,r10,$sp
1314	addi	r10,r10,32
1315	stvx	v21,r11,$sp
1316	addi	r11,r11,32
1317	stvx	v22,r10,$sp
1318	addi	r10,r10,32
1319	stvx	v23,r11,$sp
1320	addi	r11,r11,32
1321	stvx	v24,r10,$sp
1322	addi	r10,r10,32
1323	stvx	v25,r11,$sp
1324	addi	r11,r11,32
1325	stvx	v26,r10,$sp
1326	addi	r10,r10,32
1327	stvx	v27,r11,$sp
1328	addi	r11,r11,32
1329	stvx	v28,r10,$sp
1330	addi	r10,r10,32
1331	stvx	v29,r11,$sp
1332	addi	r11,r11,32
1333	stvx	v30,r10,$sp
1334	stvx	v31,r11,$sp
1335	stw	r6,`$FRAME-4`($sp)	# save vrsave
1336	li	r7, -1
1337	$PUSH	r0, `$FRAME+$LRSAVE`($sp)
1338	mtspr	256, r7			# preserve all AltiVec registers
1339
1340	srwi	r9, $bits, 5		# shr	\$5,%eax
1341	addi	r9, r9, 6		# add	\$5,%eax
1342	stw	r9, 240($out)		# mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
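	# Editorial note: the value stored above is bits/32 + 6, i.e. the
	# standard AES round count (128 -> 10, 192 -> 12, 256 -> 14), which
	# the encrypt/decrypt cores later feed straight into mtctr.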
1343
1344	cmplw	$dir, $bits, $bits	# set encrypt direction
1345	li	r8, 0x30		# mov	\$0x30,%r8d
1346	bl	_vpaes_schedule_core
1347
1348	$POP	r0, `$FRAME+$LRSAVE`($sp)
1349	li	r10,`15+6*$SIZE_T`
1350	li	r11,`31+6*$SIZE_T`
1351	mtspr	256, r6			# restore vrsave
1352	mtlr	r0
1353	xor	r3, r3, r3
1354	lvx	v20,r10,$sp
1355	addi	r10,r10,32
1356	lvx	v21,r11,$sp
1357	addi	r11,r11,32
1358	lvx	v22,r10,$sp
1359	addi	r10,r10,32
1360	lvx	v23,r11,$sp
1361	addi	r11,r11,32
1362	lvx	v24,r10,$sp
1363	addi	r10,r10,32
1364	lvx	v25,r11,$sp
1365	addi	r11,r11,32
1366	lvx	v26,r10,$sp
1367	addi	r10,r10,32
1368	lvx	v27,r11,$sp
1369	addi	r11,r11,32
1370	lvx	v28,r10,$sp
1371	addi	r10,r10,32
1372	lvx	v29,r11,$sp
1373	addi	r11,r11,32
1374	lvx	v30,r10,$sp
1375	lvx	v31,r11,$sp
1376	addi	$sp,$sp,$FRAME
1377	blr
1378	.long	0
1379	.byte	0,12,0x04,1,0x80,0,3,0
1380	.long	0
1381.size	.vpaes_set_encrypt_key,.-.vpaes_set_encrypt_key
1382
1383.globl	.vpaes_set_decrypt_key
1384.align	4
1385.vpaes_set_decrypt_key:
1386	$STU	$sp,-$FRAME($sp)
1387	li	r10,`15+6*$SIZE_T`
1388	li	r11,`31+6*$SIZE_T`
1389	mflr	r0
1390	mfspr	r6, 256			# save vrsave
1391	stvx	v20,r10,$sp
1392	addi	r10,r10,32
1393	stvx	v21,r11,$sp
1394	addi	r11,r11,32
1395	stvx	v22,r10,$sp
1396	addi	r10,r10,32
1397	stvx	v23,r11,$sp
1398	addi	r11,r11,32
1399	stvx	v24,r10,$sp
1400	addi	r10,r10,32
1401	stvx	v25,r11,$sp
1402	addi	r11,r11,32
1403	stvx	v26,r10,$sp
1404	addi	r10,r10,32
1405	stvx	v27,r11,$sp
1406	addi	r11,r11,32
1407	stvx	v28,r10,$sp
1408	addi	r10,r10,32
1409	stvx	v29,r11,$sp
1410	addi	r11,r11,32
1411	stvx	v30,r10,$sp
1412	stvx	v31,r11,$sp
1413	stw	r6,`$FRAME-4`($sp)	# save vrsave
1414	li	r7, -1
1415	$PUSH	r0, `$FRAME+$LRSAVE`($sp)
1416	mtspr	256, r7			# preserve all AltiVec registers
1417
1418	srwi	r9, $bits, 5		# shr	\$5,%eax
1419	addi	r9, r9, 6		# add	\$5,%eax
1420	stw	r9, 240($out)		# mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
1421
1422	slwi	r9, r9, 4		# shl	\$4,%eax
1423	add	$out, $out, r9		# lea	(%rdx,%rax),%rdx
1424
1425	cmplwi	$dir, $bits, 0		# set decrypt direction
1426	srwi	r8, $bits, 1		# shr	\$1,%r8d
1427	andi.	r8, r8, 32		# and	\$32,%r8d
1428	xori	r8, r8, 32		# xor	\$32,%r8d	# nbits==192?0:32
1429	bl	_vpaes_schedule_core
1430
1431	$POP	r0,  `$FRAME+$LRSAVE`($sp)
1432	li	r10,`15+6*$SIZE_T`
1433	li	r11,`31+6*$SIZE_T`
1434	mtspr	256, r6			# restore vrsave
1435	mtlr	r0
1436	xor	r3, r3, r3
1437	lvx	v20,r10,$sp
1438	addi	r10,r10,32
1439	lvx	v21,r11,$sp
1440	addi	r11,r11,32
1441	lvx	v22,r10,$sp
1442	addi	r10,r10,32
1443	lvx	v23,r11,$sp
1444	addi	r11,r11,32
1445	lvx	v24,r10,$sp
1446	addi	r10,r10,32
1447	lvx	v25,r11,$sp
1448	addi	r11,r11,32
1449	lvx	v26,r10,$sp
1450	addi	r10,r10,32
1451	lvx	v27,r11,$sp
1452	addi	r11,r11,32
1453	lvx	v28,r10,$sp
1454	addi	r10,r10,32
1455	lvx	v29,r11,$sp
1456	addi	r11,r11,32
1457	lvx	v30,r10,$sp
1458	lvx	v31,r11,$sp
1459	addi	$sp,$sp,$FRAME
1460	blr
1461	.long	0
1462	.byte	0,12,0x04,1,0x80,0,3,0
1463	.long	0
1464.size	.vpaes_set_decrypt_key,.-.vpaes_set_decrypt_key
1465___
1466}
1467
1468my $consts=1;
1469foreach  (split("\n",$code)) {
1470	s/\`([^\`]*)\`/eval $1/geo;
1471
1472	# constants table endian-specific conversion
1473	if ($consts && m/\.long\s+(.+)\s+(\?[a-z]*)$/o) {
1474	    my $conv=$2;
1475	    my @bytes=();
1476
1477	    # convert to endian-agnostic format
1478	    foreach (split(/,\s+/,$1)) {
1479		my $l = /^0/?oct:int;
1480		push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
1481	    }
1482
1483	    # little-endian conversion
1484	    if ($flavour =~ /le$/o) {
1485		SWITCH: for($conv)  {
1486		    /\?inv/ && do   { @bytes=map($_^0xf,@bytes); last; };
1487		    /\?rev/ && do   { @bytes=reverse(@bytes);    last; };
1488		}
1489	    }
1490
1491	    #emit
1492	    print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
1493	    next;
1494	}
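
	# Example (editor's note): the first Lk_sr row in the constants
	# table, ".long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f ?inv",
	# is emitted verbatim as bytes 0x00,0x01,...,0x0f on big-endian
	# builds, but with every byte XOR-ed with 0x0f (0x0f,0x0e,...,0x00)
	# on little-endian ones; "?rev" rows are byte-reversed instead and
	# "?asis" rows are left untouched.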
1495	$consts=0 if (m/Lconsts:/o);	# end of table
1496
1497	# instructions prefixed with '?' are endian-specific and need
1498	# to be adjusted accordingly...
1499	if ($flavour =~ /le$/o) {	# little-endian
1500	    s/\?lvsr/lvsl/o or
1501	    s/\?lvsl/lvsr/o or
1502	    s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
1503	    s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
1504	    s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
1505	} else {			# big-endian
1506	    s/\?([a-z]+)/$1/o;
1507	}
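
	# For instance (editor's note): on little-endian targets a "?vperm"
	# gets its two source registers swapped, "?lvsl" and "?lvsr" trade
	# places, "?vsldoi" swaps its sources and turns a shift of n into
	# 16-n, and "?vspltw" selects word 3-n; on big-endian targets the
	# "?" prefix is simply stripped.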
1508
1509	print $_,"\n";
1510}
1511
1512close STDOUT;
1513