1#! /usr/bin/env perl
2# SPDX-License-Identifier: GPL-2.0
3
4# This code is taken from CRYPTOGAMs[1] and is included here using the option
5# in the license to distribute the code under the GPL. Therefore this program
6# is free software; you can redistribute it and/or modify it under the terms of
7# the GNU General Public License version 2 as published by the Free Software
8# Foundation.
9#
10# [1] https://www.openssl.org/~appro/cryptogams/
11
12# Copyright (c) 2006-2017, CRYPTOGAMS by <appro@openssl.org>
13# All rights reserved.
14#
15# Redistribution and use in source and binary forms, with or without
16# modification, are permitted provided that the following conditions
17# are met:
18#
19#       * Redistributions of source code must retain copyright notices,
20#         this list of conditions and the following disclaimer.
21#
22#       * Redistributions in binary form must reproduce the above
23#         copyright notice, this list of conditions and the following
24#         disclaimer in the documentation and/or other materials
25#         provided with the distribution.
26#
27#       * Neither the name of the CRYPTOGAMS nor the names of its
28#         copyright holder and contributors may be used to endorse or
29#         promote products derived from this software without specific
30#         prior written permission.
31#
32# ALTERNATIVELY, provided that this notice is retained in full, this
33# product may be distributed under the terms of the GNU General Public
34# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
35# those given above.
36#
37# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
38# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
40# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
41# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
45# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
47# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48
49# ====================================================================
50# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
51# project. The module is, however, dual licensed under OpenSSL and
52# CRYPTOGAMS licenses depending on where you obtain it. For further
53# details see https://www.openssl.org/~appro/cryptogams/.
54# ====================================================================
55#
56# This module implements support for AES instructions as per PowerISA
57# specification version 2.07, first implemented by POWER8 processor.
58# The module is endian-agnostic in sense that it supports both big-
59# and little-endian cases. Data alignment in parallelizable modes is
60# handled with VSX loads and stores, which implies MSR.VSX flag being
61# set. It should also be noted that ISA specification doesn't prohibit
62# alignment exceptions for these instructions on page boundaries.
63# Initially alignment was handled in pure AltiVec/VMX way [when data
64# is aligned programmatically, which in turn guarantees exception-
65# free execution], but it turned to hamper performance when vcipher
66# instructions are interleaved. It's reckoned that eventual
67# misalignment penalties at page boundaries are in average lower
68# than additional overhead in pure AltiVec approach.
69#
70# May 2016
71#
72# Add XTS subroutine, 9x on little- and 12x improvement on big-endian
73# systems were measured.
74#
75######################################################################
76# Current large-block performance in cycles per byte processed with
77# 128-bit key (less is better).
78#
79#		CBC en-/decrypt	CTR	XTS
80# POWER8[le]	3.96/0.72	0.74	1.1
81# POWER8[be]	3.75/0.65	0.66	1.0
82
83$flavour = shift;
84
85if ($flavour =~ /64/) {
86	$SIZE_T	=8;
87	$LRSAVE	=2*$SIZE_T;
88	$STU	="stdu";
89	$POP	="ld";
90	$PUSH	="std";
91	$UCMP	="cmpld";
92	$SHL	="sldi";
93} elsif ($flavour =~ /32/) {
94	$SIZE_T	=4;
95	$LRSAVE	=$SIZE_T;
96	$STU	="stwu";
97	$POP	="lwz";
98	$PUSH	="stw";
99	$UCMP	="cmplw";
100	$SHL	="slwi";
101} else { die "nonsense $flavour"; }
102
103$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
104
105$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
106( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
107( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
108die "can't locate ppc-xlate.pl";
109
110open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
111
112$FRAME=8*$SIZE_T;
113$prefix="aes_p8";
114
115$sp="r1";
116$vrsave="r12";
117
118#########################################################################
119{{{	# Key setup procedures						#
120my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
121my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
122my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));
123
124$code.=<<___;
125.machine	"any"
126
127.text
128
129.align	7
130rcon:
131.long	0x01000000, 0x01000000, 0x01000000, 0x01000000	?rev
132.long	0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000	?rev
133.long	0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c	?rev
134.long	0,0,0,0						?asis
135.long	0x0f102132, 0x43546576, 0x8798a9ba, 0xcbdcedfe
136Lconsts:
137	mflr	r0
138	bcl	20,31,\$+4
139	mflr	$ptr	 #vvvvv "distance between . and rcon
140	addi	$ptr,$ptr,-0x58
141	mtlr	r0
142	blr
143	.long	0
144	.byte	0,12,0x14,0,0,0,0,0
145.asciz	"AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
146
147.globl	.${prefix}_set_encrypt_key
148Lset_encrypt_key:
149	mflr		r11
150	$PUSH		r11,$LRSAVE($sp)
151
152	li		$ptr,-1
153	${UCMP}i	$inp,0
154	beq-		Lenc_key_abort		# if ($inp==0) return -1;
155	${UCMP}i	$out,0
156	beq-		Lenc_key_abort		# if ($out==0) return -1;
157	li		$ptr,-2
158	cmpwi		$bits,128
159	blt-		Lenc_key_abort
160	cmpwi		$bits,256
161	bgt-		Lenc_key_abort
162	andi.		r0,$bits,0x3f
163	bne-		Lenc_key_abort
164
165	lis		r0,0xfff0
166	mfspr		$vrsave,256
167	mtspr		256,r0
168
169	bl		Lconsts
170	mtlr		r11
171
172	neg		r9,$inp
173	lvx		$in0,0,$inp
174	addi		$inp,$inp,15		# 15 is not typo
175	lvsr		$key,0,r9		# borrow $key
176	li		r8,0x20
177	cmpwi		$bits,192
178	lvx		$in1,0,$inp
179	le?vspltisb	$mask,0x0f		# borrow $mask
180	lvx		$rcon,0,$ptr
181	le?vxor		$key,$key,$mask		# adjust for byte swap
182	lvx		$mask,r8,$ptr
183	addi		$ptr,$ptr,0x10
184	vperm		$in0,$in0,$in1,$key	# align [and byte swap in LE]
185	li		$cnt,8
186	vxor		$zero,$zero,$zero
187	mtctr		$cnt
188
189	?lvsr		$outperm,0,$out
190	vspltisb	$outmask,-1
191	lvx		$outhead,0,$out
192	?vperm		$outmask,$zero,$outmask,$outperm
193
194	blt		Loop128
195	addi		$inp,$inp,8
196	beq		L192
197	addi		$inp,$inp,8
198	b		L256
199
200.align	4
201Loop128:
202	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
203	vsldoi		$tmp,$zero,$in0,12	# >>32
204	 vperm		$outtail,$in0,$in0,$outperm	# rotate
205	 vsel		$stage,$outhead,$outtail,$outmask
206	 vmr		$outhead,$outtail
207	vcipherlast	$key,$key,$rcon
208	 stvx		$stage,0,$out
209	 addi		$out,$out,16
210
211	vxor		$in0,$in0,$tmp
212	vsldoi		$tmp,$zero,$tmp,12	# >>32
213	vxor		$in0,$in0,$tmp
214	vsldoi		$tmp,$zero,$tmp,12	# >>32
215	vxor		$in0,$in0,$tmp
216	 vadduwm	$rcon,$rcon,$rcon
217	vxor		$in0,$in0,$key
218	bdnz		Loop128
219
220	lvx		$rcon,0,$ptr		# last two round keys
221
222	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
223	vsldoi		$tmp,$zero,$in0,12	# >>32
224	 vperm		$outtail,$in0,$in0,$outperm	# rotate
225	 vsel		$stage,$outhead,$outtail,$outmask
226	 vmr		$outhead,$outtail
227	vcipherlast	$key,$key,$rcon
228	 stvx		$stage,0,$out
229	 addi		$out,$out,16
230
231	vxor		$in0,$in0,$tmp
232	vsldoi		$tmp,$zero,$tmp,12	# >>32
233	vxor		$in0,$in0,$tmp
234	vsldoi		$tmp,$zero,$tmp,12	# >>32
235	vxor		$in0,$in0,$tmp
236	 vadduwm	$rcon,$rcon,$rcon
237	vxor		$in0,$in0,$key
238
239	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
240	vsldoi		$tmp,$zero,$in0,12	# >>32
241	 vperm		$outtail,$in0,$in0,$outperm	# rotate
242	 vsel		$stage,$outhead,$outtail,$outmask
243	 vmr		$outhead,$outtail
244	vcipherlast	$key,$key,$rcon
245	 stvx		$stage,0,$out
246	 addi		$out,$out,16
247
248	vxor		$in0,$in0,$tmp
249	vsldoi		$tmp,$zero,$tmp,12	# >>32
250	vxor		$in0,$in0,$tmp
251	vsldoi		$tmp,$zero,$tmp,12	# >>32
252	vxor		$in0,$in0,$tmp
253	vxor		$in0,$in0,$key
254	 vperm		$outtail,$in0,$in0,$outperm	# rotate
255	 vsel		$stage,$outhead,$outtail,$outmask
256	 vmr		$outhead,$outtail
257	 stvx		$stage,0,$out
258
259	addi		$inp,$out,15		# 15 is not typo
260	addi		$out,$out,0x50
261
262	li		$rounds,10
263	b		Ldone
264
265.align	4
266L192:
267	lvx		$tmp,0,$inp
268	li		$cnt,4
269	 vperm		$outtail,$in0,$in0,$outperm	# rotate
270	 vsel		$stage,$outhead,$outtail,$outmask
271	 vmr		$outhead,$outtail
272	 stvx		$stage,0,$out
273	 addi		$out,$out,16
274	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
275	vspltisb	$key,8			# borrow $key
276	mtctr		$cnt
277	vsububm		$mask,$mask,$key	# adjust the mask
278
279Loop192:
280	vperm		$key,$in1,$in1,$mask	# roate-n-splat
281	vsldoi		$tmp,$zero,$in0,12	# >>32
282	vcipherlast	$key,$key,$rcon
283
284	vxor		$in0,$in0,$tmp
285	vsldoi		$tmp,$zero,$tmp,12	# >>32
286	vxor		$in0,$in0,$tmp
287	vsldoi		$tmp,$zero,$tmp,12	# >>32
288	vxor		$in0,$in0,$tmp
289
290	 vsldoi		$stage,$zero,$in1,8
291	vspltw		$tmp,$in0,3
292	vxor		$tmp,$tmp,$in1
293	vsldoi		$in1,$zero,$in1,12	# >>32
294	 vadduwm	$rcon,$rcon,$rcon
295	vxor		$in1,$in1,$tmp
296	vxor		$in0,$in0,$key
297	vxor		$in1,$in1,$key
298	 vsldoi		$stage,$stage,$in0,8
299
300	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
301	vsldoi		$tmp,$zero,$in0,12	# >>32
302	 vperm		$outtail,$stage,$stage,$outperm	# rotate
303	 vsel		$stage,$outhead,$outtail,$outmask
304	 vmr		$outhead,$outtail
305	vcipherlast	$key,$key,$rcon
306	 stvx		$stage,0,$out
307	 addi		$out,$out,16
308
309	 vsldoi		$stage,$in0,$in1,8
310	vxor		$in0,$in0,$tmp
311	vsldoi		$tmp,$zero,$tmp,12	# >>32
312	 vperm		$outtail,$stage,$stage,$outperm	# rotate
313	 vsel		$stage,$outhead,$outtail,$outmask
314	 vmr		$outhead,$outtail
315	vxor		$in0,$in0,$tmp
316	vsldoi		$tmp,$zero,$tmp,12	# >>32
317	vxor		$in0,$in0,$tmp
318	 stvx		$stage,0,$out
319	 addi		$out,$out,16
320
321	vspltw		$tmp,$in0,3
322	vxor		$tmp,$tmp,$in1
323	vsldoi		$in1,$zero,$in1,12	# >>32
324	 vadduwm	$rcon,$rcon,$rcon
325	vxor		$in1,$in1,$tmp
326	vxor		$in0,$in0,$key
327	vxor		$in1,$in1,$key
328	 vperm		$outtail,$in0,$in0,$outperm	# rotate
329	 vsel		$stage,$outhead,$outtail,$outmask
330	 vmr		$outhead,$outtail
331	 stvx		$stage,0,$out
332	 addi		$inp,$out,15		# 15 is not typo
333	 addi		$out,$out,16
334	bdnz		Loop192
335
336	li		$rounds,12
337	addi		$out,$out,0x20
338	b		Ldone
339
340.align	4
341L256:
342	lvx		$tmp,0,$inp
343	li		$cnt,7
344	li		$rounds,14
345	 vperm		$outtail,$in0,$in0,$outperm	# rotate
346	 vsel		$stage,$outhead,$outtail,$outmask
347	 vmr		$outhead,$outtail
348	 stvx		$stage,0,$out
349	 addi		$out,$out,16
350	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
351	mtctr		$cnt
352
353Loop256:
354	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
355	vsldoi		$tmp,$zero,$in0,12	# >>32
356	 vperm		$outtail,$in1,$in1,$outperm	# rotate
357	 vsel		$stage,$outhead,$outtail,$outmask
358	 vmr		$outhead,$outtail
359	vcipherlast	$key,$key,$rcon
360	 stvx		$stage,0,$out
361	 addi		$out,$out,16
362
363	vxor		$in0,$in0,$tmp
364	vsldoi		$tmp,$zero,$tmp,12	# >>32
365	vxor		$in0,$in0,$tmp
366	vsldoi		$tmp,$zero,$tmp,12	# >>32
367	vxor		$in0,$in0,$tmp
368	 vadduwm	$rcon,$rcon,$rcon
369	vxor		$in0,$in0,$key
370	 vperm		$outtail,$in0,$in0,$outperm	# rotate
371	 vsel		$stage,$outhead,$outtail,$outmask
372	 vmr		$outhead,$outtail
373	 stvx		$stage,0,$out
374	 addi		$inp,$out,15		# 15 is not typo
375	 addi		$out,$out,16
376	bdz		Ldone
377
378	vspltw		$key,$in0,3		# just splat
379	vsldoi		$tmp,$zero,$in1,12	# >>32
380	vsbox		$key,$key
381
382	vxor		$in1,$in1,$tmp
383	vsldoi		$tmp,$zero,$tmp,12	# >>32
384	vxor		$in1,$in1,$tmp
385	vsldoi		$tmp,$zero,$tmp,12	# >>32
386	vxor		$in1,$in1,$tmp
387
388	vxor		$in1,$in1,$key
389	b		Loop256
390
391.align	4
392Ldone:
393	lvx		$in1,0,$inp		# redundant in aligned case
394	vsel		$in1,$outhead,$in1,$outmask
395	stvx		$in1,0,$inp
396	li		$ptr,0
397	mtspr		256,$vrsave
398	stw		$rounds,0($out)
399
400Lenc_key_abort:
401	mr		r3,$ptr
402	blr
403	.long		0
404	.byte		0,12,0x14,1,0,0,3,0
405	.long		0
406.size	.${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key
407
408.globl	.${prefix}_set_decrypt_key
409	$STU		$sp,-$FRAME($sp)
410	mflr		r10
411	$PUSH		r10,$FRAME+$LRSAVE($sp)
412	bl		Lset_encrypt_key
413	mtlr		r10
414
415	cmpwi		r3,0
416	bne-		Ldec_key_abort
417
418	slwi		$cnt,$rounds,4
419	subi		$inp,$out,240		# first round key
420	srwi		$rounds,$rounds,1
421	add		$out,$inp,$cnt		# last round key
422	mtctr		$rounds
423
424Ldeckey:
425	lwz		r0, 0($inp)
426	lwz		r6, 4($inp)
427	lwz		r7, 8($inp)
428	lwz		r8, 12($inp)
429	addi		$inp,$inp,16
430	lwz		r9, 0($out)
431	lwz		r10,4($out)
432	lwz		r11,8($out)
433	lwz		r12,12($out)
434	stw		r0, 0($out)
435	stw		r6, 4($out)
436	stw		r7, 8($out)
437	stw		r8, 12($out)
438	subi		$out,$out,16
439	stw		r9, -16($inp)
440	stw		r10,-12($inp)
441	stw		r11,-8($inp)
442	stw		r12,-4($inp)
443	bdnz		Ldeckey
444
445	xor		r3,r3,r3		# return value
446Ldec_key_abort:
447	addi		$sp,$sp,$FRAME
448	blr
449	.long		0
450	.byte		0,12,4,1,0x80,0,3,0
451	.long		0
452.size	.${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
453___
454}}}
455#########################################################################
456{{{	# Single block en- and decrypt procedures			#
457sub gen_block () {
458my $dir = shift;
459my $n   = $dir eq "de" ? "n" : "";
460my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));
461
462$code.=<<___;
463.globl	.${prefix}_${dir}crypt
464	lwz		$rounds,240($key)
465	lis		r0,0xfc00
466	mfspr		$vrsave,256
467	li		$idx,15			# 15 is not typo
468	mtspr		256,r0
469
470	lvx		v0,0,$inp
471	neg		r11,$out
472	lvx		v1,$idx,$inp
473	lvsl		v2,0,$inp		# inpperm
474	le?vspltisb	v4,0x0f
475	?lvsl		v3,0,r11		# outperm
476	le?vxor		v2,v2,v4
477	li		$idx,16
478	vperm		v0,v0,v1,v2		# align [and byte swap in LE]
479	lvx		v1,0,$key
480	?lvsl		v5,0,$key		# keyperm
481	srwi		$rounds,$rounds,1
482	lvx		v2,$idx,$key
483	addi		$idx,$idx,16
484	subi		$rounds,$rounds,1
485	?vperm		v1,v1,v2,v5		# align round key
486
487	vxor		v0,v0,v1
488	lvx		v1,$idx,$key
489	addi		$idx,$idx,16
490	mtctr		$rounds
491
492Loop_${dir}c:
493	?vperm		v2,v2,v1,v5
494	v${n}cipher	v0,v0,v2
495	lvx		v2,$idx,$key
496	addi		$idx,$idx,16
497	?vperm		v1,v1,v2,v5
498	v${n}cipher	v0,v0,v1
499	lvx		v1,$idx,$key
500	addi		$idx,$idx,16
501	bdnz		Loop_${dir}c
502
503	?vperm		v2,v2,v1,v5
504	v${n}cipher	v0,v0,v2
505	lvx		v2,$idx,$key
506	?vperm		v1,v1,v2,v5
507	v${n}cipherlast	v0,v0,v1
508
509	vspltisb	v2,-1
510	vxor		v1,v1,v1
511	li		$idx,15			# 15 is not typo
512	?vperm		v2,v1,v2,v3		# outmask
513	le?vxor		v3,v3,v4
514	lvx		v1,0,$out		# outhead
515	vperm		v0,v0,v0,v3		# rotate [and byte swap in LE]
516	vsel		v1,v1,v0,v2
517	lvx		v4,$idx,$out
518	stvx		v1,0,$out
519	vsel		v0,v0,v4,v2
520	stvx		v0,$idx,$out
521
522	mtspr		256,$vrsave
523	blr
524	.long		0
525	.byte		0,12,0x14,0,0,0,3,0
526	.long		0
527.size	.${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
528___
529}
530&gen_block("en");
531&gen_block("de");
532}}}
533#########################################################################
534{{{	# CBC en- and decrypt procedures				#
535my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));
536my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
537my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=
538						map("v$_",(4..10));
539$code.=<<___;
540.globl	.${prefix}_cbc_encrypt
541	${UCMP}i	$len,16
542	bltlr-
543
544	cmpwi		$enc,0			# test direction
545	lis		r0,0xffe0
546	mfspr		$vrsave,256
547	mtspr		256,r0
548
549	li		$idx,15
550	vxor		$rndkey0,$rndkey0,$rndkey0
551	le?vspltisb	$tmp,0x0f
552
553	lvx		$ivec,0,$ivp		# load [unaligned] iv
554	lvsl		$inpperm,0,$ivp
555	lvx		$inptail,$idx,$ivp
556	le?vxor		$inpperm,$inpperm,$tmp
557	vperm		$ivec,$ivec,$inptail,$inpperm
558
559	neg		r11,$inp
560	?lvsl		$keyperm,0,$key		# prepare for unaligned key
561	lwz		$rounds,240($key)
562
563	lvsr		$inpperm,0,r11		# prepare for unaligned load
564	lvx		$inptail,0,$inp
565	addi		$inp,$inp,15		# 15 is not typo
566	le?vxor		$inpperm,$inpperm,$tmp
567
568	?lvsr		$outperm,0,$out		# prepare for unaligned store
569	vspltisb	$outmask,-1
570	lvx		$outhead,0,$out
571	?vperm		$outmask,$rndkey0,$outmask,$outperm
572	le?vxor		$outperm,$outperm,$tmp
573
574	srwi		$rounds,$rounds,1
575	li		$idx,16
576	subi		$rounds,$rounds,1
577	beq		Lcbc_dec
578
579Lcbc_enc:
580	vmr		$inout,$inptail
581	lvx		$inptail,0,$inp
582	addi		$inp,$inp,16
583	mtctr		$rounds
584	subi		$len,$len,16		# len-=16
585
586	lvx		$rndkey0,0,$key
587	 vperm		$inout,$inout,$inptail,$inpperm
588	lvx		$rndkey1,$idx,$key
589	addi		$idx,$idx,16
590	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
591	vxor		$inout,$inout,$rndkey0
592	lvx		$rndkey0,$idx,$key
593	addi		$idx,$idx,16
594	vxor		$inout,$inout,$ivec
595
596Loop_cbc_enc:
597	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
598	vcipher		$inout,$inout,$rndkey1
599	lvx		$rndkey1,$idx,$key
600	addi		$idx,$idx,16
601	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
602	vcipher		$inout,$inout,$rndkey0
603	lvx		$rndkey0,$idx,$key
604	addi		$idx,$idx,16
605	bdnz		Loop_cbc_enc
606
607	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
608	vcipher		$inout,$inout,$rndkey1
609	lvx		$rndkey1,$idx,$key
610	li		$idx,16
611	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
612	vcipherlast	$ivec,$inout,$rndkey0
613	${UCMP}i	$len,16
614
615	vperm		$tmp,$ivec,$ivec,$outperm
616	vsel		$inout,$outhead,$tmp,$outmask
617	vmr		$outhead,$tmp
618	stvx		$inout,0,$out
619	addi		$out,$out,16
620	bge		Lcbc_enc
621
622	b		Lcbc_done
623
624.align	4
625Lcbc_dec:
626	${UCMP}i	$len,128
627	bge		_aesp8_cbc_decrypt8x
628	vmr		$tmp,$inptail
629	lvx		$inptail,0,$inp
630	addi		$inp,$inp,16
631	mtctr		$rounds
632	subi		$len,$len,16		# len-=16
633
634	lvx		$rndkey0,0,$key
635	 vperm		$tmp,$tmp,$inptail,$inpperm
636	lvx		$rndkey1,$idx,$key
637	addi		$idx,$idx,16
638	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
639	vxor		$inout,$tmp,$rndkey0
640	lvx		$rndkey0,$idx,$key
641	addi		$idx,$idx,16
642
643Loop_cbc_dec:
644	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
645	vncipher	$inout,$inout,$rndkey1
646	lvx		$rndkey1,$idx,$key
647	addi		$idx,$idx,16
648	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
649	vncipher	$inout,$inout,$rndkey0
650	lvx		$rndkey0,$idx,$key
651	addi		$idx,$idx,16
652	bdnz		Loop_cbc_dec
653
654	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
655	vncipher	$inout,$inout,$rndkey1
656	lvx		$rndkey1,$idx,$key
657	li		$idx,16
658	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
659	vncipherlast	$inout,$inout,$rndkey0
660	${UCMP}i	$len,16
661
662	vxor		$inout,$inout,$ivec
663	vmr		$ivec,$tmp
664	vperm		$tmp,$inout,$inout,$outperm
665	vsel		$inout,$outhead,$tmp,$outmask
666	vmr		$outhead,$tmp
667	stvx		$inout,0,$out
668	addi		$out,$out,16
669	bge		Lcbc_dec
670
671Lcbc_done:
672	addi		$out,$out,-1
673	lvx		$inout,0,$out		# redundant in aligned case
674	vsel		$inout,$outhead,$inout,$outmask
675	stvx		$inout,0,$out
676
677	neg		$enc,$ivp		# write [unaligned] iv
678	li		$idx,15			# 15 is not typo
679	vxor		$rndkey0,$rndkey0,$rndkey0
680	vspltisb	$outmask,-1
681	le?vspltisb	$tmp,0x0f
682	?lvsl		$outperm,0,$enc
683	?vperm		$outmask,$rndkey0,$outmask,$outperm
684	le?vxor		$outperm,$outperm,$tmp
685	lvx		$outhead,0,$ivp
686	vperm		$ivec,$ivec,$ivec,$outperm
687	vsel		$inout,$outhead,$ivec,$outmask
688	lvx		$inptail,$idx,$ivp
689	stvx		$inout,0,$ivp
690	vsel		$inout,$ivec,$inptail,$outmask
691	stvx		$inout,$idx,$ivp
692
693	mtspr		256,$vrsave
694	blr
695	.long		0
696	.byte		0,12,0x14,0,0,0,6,0
697	.long		0
698___
699#########################################################################
700{{	# Optimized CBC decrypt procedure				#
701my $key_="r11";
702my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
703my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13));
704my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21));
705my $rndkey0="v23";	# v24-v25 rotating buffer for first found keys
706			# v26-v31 last 6 round keys
707my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment
708
709$code.=<<___;
710.align	5
711_aesp8_cbc_decrypt8x:
712	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
713	li		r10,`$FRAME+8*16+15`
714	li		r11,`$FRAME+8*16+31`
715	stvx		v20,r10,$sp		# ABI says so
716	addi		r10,r10,32
717	stvx		v21,r11,$sp
718	addi		r11,r11,32
719	stvx		v22,r10,$sp
720	addi		r10,r10,32
721	stvx		v23,r11,$sp
722	addi		r11,r11,32
723	stvx		v24,r10,$sp
724	addi		r10,r10,32
725	stvx		v25,r11,$sp
726	addi		r11,r11,32
727	stvx		v26,r10,$sp
728	addi		r10,r10,32
729	stvx		v27,r11,$sp
730	addi		r11,r11,32
731	stvx		v28,r10,$sp
732	addi		r10,r10,32
733	stvx		v29,r11,$sp
734	addi		r11,r11,32
735	stvx		v30,r10,$sp
736	stvx		v31,r11,$sp
737	li		r0,-1
738	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
739	li		$x10,0x10
740	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
741	li		$x20,0x20
742	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
743	li		$x30,0x30
744	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
745	li		$x40,0x40
746	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
747	li		$x50,0x50
748	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
749	li		$x60,0x60
750	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
751	li		$x70,0x70
752	mtspr		256,r0
753
754	subi		$rounds,$rounds,3	# -4 in total
755	subi		$len,$len,128		# bias
756
757	lvx		$rndkey0,$x00,$key	# load key schedule
758	lvx		v30,$x10,$key
759	addi		$key,$key,0x20
760	lvx		v31,$x00,$key
761	?vperm		$rndkey0,$rndkey0,v30,$keyperm
762	addi		$key_,$sp,$FRAME+15
763	mtctr		$rounds
764
765Load_cbc_dec_key:
766	?vperm		v24,v30,v31,$keyperm
767	lvx		v30,$x10,$key
768	addi		$key,$key,0x20
769	stvx		v24,$x00,$key_		# off-load round[1]
770	?vperm		v25,v31,v30,$keyperm
771	lvx		v31,$x00,$key
772	stvx		v25,$x10,$key_		# off-load round[2]
773	addi		$key_,$key_,0x20
774	bdnz		Load_cbc_dec_key
775
776	lvx		v26,$x10,$key
777	?vperm		v24,v30,v31,$keyperm
778	lvx		v27,$x20,$key
779	stvx		v24,$x00,$key_		# off-load round[3]
780	?vperm		v25,v31,v26,$keyperm
781	lvx		v28,$x30,$key
782	stvx		v25,$x10,$key_		# off-load round[4]
783	addi		$key_,$sp,$FRAME+15	# rewind $key_
784	?vperm		v26,v26,v27,$keyperm
785	lvx		v29,$x40,$key
786	?vperm		v27,v27,v28,$keyperm
787	lvx		v30,$x50,$key
788	?vperm		v28,v28,v29,$keyperm
789	lvx		v31,$x60,$key
790	?vperm		v29,v29,v30,$keyperm
791	lvx		$out0,$x70,$key		# borrow $out0
792	?vperm		v30,v30,v31,$keyperm
793	lvx		v24,$x00,$key_		# pre-load round[1]
794	?vperm		v31,v31,$out0,$keyperm
795	lvx		v25,$x10,$key_		# pre-load round[2]
796
797	#lvx		$inptail,0,$inp		# "caller" already did this
798	#addi		$inp,$inp,15		# 15 is not typo
799	subi		$inp,$inp,15		# undo "caller"
800
801	 le?li		$idx,8
802	lvx_u		$in0,$x00,$inp		# load first 8 "words"
803	 le?lvsl	$inpperm,0,$idx
804	 le?vspltisb	$tmp,0x0f
805	lvx_u		$in1,$x10,$inp
806	 le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
807	lvx_u		$in2,$x20,$inp
808	 le?vperm	$in0,$in0,$in0,$inpperm
809	lvx_u		$in3,$x30,$inp
810	 le?vperm	$in1,$in1,$in1,$inpperm
811	lvx_u		$in4,$x40,$inp
812	 le?vperm	$in2,$in2,$in2,$inpperm
813	vxor		$out0,$in0,$rndkey0
814	lvx_u		$in5,$x50,$inp
815	 le?vperm	$in3,$in3,$in3,$inpperm
816	vxor		$out1,$in1,$rndkey0
817	lvx_u		$in6,$x60,$inp
818	 le?vperm	$in4,$in4,$in4,$inpperm
819	vxor		$out2,$in2,$rndkey0
820	lvx_u		$in7,$x70,$inp
821	addi		$inp,$inp,0x80
822	 le?vperm	$in5,$in5,$in5,$inpperm
823	vxor		$out3,$in3,$rndkey0
824	 le?vperm	$in6,$in6,$in6,$inpperm
825	vxor		$out4,$in4,$rndkey0
826	 le?vperm	$in7,$in7,$in7,$inpperm
827	vxor		$out5,$in5,$rndkey0
828	vxor		$out6,$in6,$rndkey0
829	vxor		$out7,$in7,$rndkey0
830
831	mtctr		$rounds
832	b		Loop_cbc_dec8x
833.align	5
834Loop_cbc_dec8x:
835	vncipher	$out0,$out0,v24
836	vncipher	$out1,$out1,v24
837	vncipher	$out2,$out2,v24
838	vncipher	$out3,$out3,v24
839	vncipher	$out4,$out4,v24
840	vncipher	$out5,$out5,v24
841	vncipher	$out6,$out6,v24
842	vncipher	$out7,$out7,v24
843	lvx		v24,$x20,$key_		# round[3]
844	addi		$key_,$key_,0x20
845
846	vncipher	$out0,$out0,v25
847	vncipher	$out1,$out1,v25
848	vncipher	$out2,$out2,v25
849	vncipher	$out3,$out3,v25
850	vncipher	$out4,$out4,v25
851	vncipher	$out5,$out5,v25
852	vncipher	$out6,$out6,v25
853	vncipher	$out7,$out7,v25
854	lvx		v25,$x10,$key_		# round[4]
855	bdnz		Loop_cbc_dec8x
856
857	subic		$len,$len,128		# $len-=128
858	vncipher	$out0,$out0,v24
859	vncipher	$out1,$out1,v24
860	vncipher	$out2,$out2,v24
861	vncipher	$out3,$out3,v24
862	vncipher	$out4,$out4,v24
863	vncipher	$out5,$out5,v24
864	vncipher	$out6,$out6,v24
865	vncipher	$out7,$out7,v24
866
867	subfe.		r0,r0,r0		# borrow?-1:0
868	vncipher	$out0,$out0,v25
869	vncipher	$out1,$out1,v25
870	vncipher	$out2,$out2,v25
871	vncipher	$out3,$out3,v25
872	vncipher	$out4,$out4,v25
873	vncipher	$out5,$out5,v25
874	vncipher	$out6,$out6,v25
875	vncipher	$out7,$out7,v25
876
877	and		r0,r0,$len
878	vncipher	$out0,$out0,v26
879	vncipher	$out1,$out1,v26
880	vncipher	$out2,$out2,v26
881	vncipher	$out3,$out3,v26
882	vncipher	$out4,$out4,v26
883	vncipher	$out5,$out5,v26
884	vncipher	$out6,$out6,v26
885	vncipher	$out7,$out7,v26
886
887	add		$inp,$inp,r0		# $inp is adjusted in such
888						# way that at exit from the
889						# loop inX-in7 are loaded
890						# with last "words"
891	vncipher	$out0,$out0,v27
892	vncipher	$out1,$out1,v27
893	vncipher	$out2,$out2,v27
894	vncipher	$out3,$out3,v27
895	vncipher	$out4,$out4,v27
896	vncipher	$out5,$out5,v27
897	vncipher	$out6,$out6,v27
898	vncipher	$out7,$out7,v27
899
900	addi		$key_,$sp,$FRAME+15	# rewind $key_
901	vncipher	$out0,$out0,v28
902	vncipher	$out1,$out1,v28
903	vncipher	$out2,$out2,v28
904	vncipher	$out3,$out3,v28
905	vncipher	$out4,$out4,v28
906	vncipher	$out5,$out5,v28
907	vncipher	$out6,$out6,v28
908	vncipher	$out7,$out7,v28
909	lvx		v24,$x00,$key_		# re-pre-load round[1]
910
911	vncipher	$out0,$out0,v29
912	vncipher	$out1,$out1,v29
913	vncipher	$out2,$out2,v29
914	vncipher	$out3,$out3,v29
915	vncipher	$out4,$out4,v29
916	vncipher	$out5,$out5,v29
917	vncipher	$out6,$out6,v29
918	vncipher	$out7,$out7,v29
919	lvx		v25,$x10,$key_		# re-pre-load round[2]
920
921	vncipher	$out0,$out0,v30
922	 vxor		$ivec,$ivec,v31		# xor with last round key
923	vncipher	$out1,$out1,v30
924	 vxor		$in0,$in0,v31
925	vncipher	$out2,$out2,v30
926	 vxor		$in1,$in1,v31
927	vncipher	$out3,$out3,v30
928	 vxor		$in2,$in2,v31
929	vncipher	$out4,$out4,v30
930	 vxor		$in3,$in3,v31
931	vncipher	$out5,$out5,v30
932	 vxor		$in4,$in4,v31
933	vncipher	$out6,$out6,v30
934	 vxor		$in5,$in5,v31
935	vncipher	$out7,$out7,v30
936	 vxor		$in6,$in6,v31
937
938	vncipherlast	$out0,$out0,$ivec
939	vncipherlast	$out1,$out1,$in0
940	 lvx_u		$in0,$x00,$inp		# load next input block
941	vncipherlast	$out2,$out2,$in1
942	 lvx_u		$in1,$x10,$inp
943	vncipherlast	$out3,$out3,$in2
944	 le?vperm	$in0,$in0,$in0,$inpperm
945	 lvx_u		$in2,$x20,$inp
946	vncipherlast	$out4,$out4,$in3
947	 le?vperm	$in1,$in1,$in1,$inpperm
948	 lvx_u		$in3,$x30,$inp
949	vncipherlast	$out5,$out5,$in4
950	 le?vperm	$in2,$in2,$in2,$inpperm
951	 lvx_u		$in4,$x40,$inp
952	vncipherlast	$out6,$out6,$in5
953	 le?vperm	$in3,$in3,$in3,$inpperm
954	 lvx_u		$in5,$x50,$inp
955	vncipherlast	$out7,$out7,$in6
956	 le?vperm	$in4,$in4,$in4,$inpperm
957	 lvx_u		$in6,$x60,$inp
958	vmr		$ivec,$in7
959	 le?vperm	$in5,$in5,$in5,$inpperm
960	 lvx_u		$in7,$x70,$inp
961	 addi		$inp,$inp,0x80
962
963	le?vperm	$out0,$out0,$out0,$inpperm
964	le?vperm	$out1,$out1,$out1,$inpperm
965	stvx_u		$out0,$x00,$out
966	 le?vperm	$in6,$in6,$in6,$inpperm
967	 vxor		$out0,$in0,$rndkey0
968	le?vperm	$out2,$out2,$out2,$inpperm
969	stvx_u		$out1,$x10,$out
970	 le?vperm	$in7,$in7,$in7,$inpperm
971	 vxor		$out1,$in1,$rndkey0
972	le?vperm	$out3,$out3,$out3,$inpperm
973	stvx_u		$out2,$x20,$out
974	 vxor		$out2,$in2,$rndkey0
975	le?vperm	$out4,$out4,$out4,$inpperm
976	stvx_u		$out3,$x30,$out
977	 vxor		$out3,$in3,$rndkey0
978	le?vperm	$out5,$out5,$out5,$inpperm
979	stvx_u		$out4,$x40,$out
980	 vxor		$out4,$in4,$rndkey0
981	le?vperm	$out6,$out6,$out6,$inpperm
982	stvx_u		$out5,$x50,$out
983	 vxor		$out5,$in5,$rndkey0
984	le?vperm	$out7,$out7,$out7,$inpperm
985	stvx_u		$out6,$x60,$out
986	 vxor		$out6,$in6,$rndkey0
987	stvx_u		$out7,$x70,$out
988	addi		$out,$out,0x80
989	 vxor		$out7,$in7,$rndkey0
990
991	mtctr		$rounds
992	beq		Loop_cbc_dec8x		# did $len-=128 borrow?
993
994	addic.		$len,$len,128
995	beq		Lcbc_dec8x_done
996	nop
997	nop
998
999Loop_cbc_dec8x_tail:				# up to 7 "words" tail...
1000	vncipher	$out1,$out1,v24
1001	vncipher	$out2,$out2,v24
1002	vncipher	$out3,$out3,v24
1003	vncipher	$out4,$out4,v24
1004	vncipher	$out5,$out5,v24
1005	vncipher	$out6,$out6,v24
1006	vncipher	$out7,$out7,v24
1007	lvx		v24,$x20,$key_		# round[3]
1008	addi		$key_,$key_,0x20
1009
1010	vncipher	$out1,$out1,v25
1011	vncipher	$out2,$out2,v25
1012	vncipher	$out3,$out3,v25
1013	vncipher	$out4,$out4,v25
1014	vncipher	$out5,$out5,v25
1015	vncipher	$out6,$out6,v25
1016	vncipher	$out7,$out7,v25
1017	lvx		v25,$x10,$key_		# round[4]
1018	bdnz		Loop_cbc_dec8x_tail
1019
1020	vncipher	$out1,$out1,v24
1021	vncipher	$out2,$out2,v24
1022	vncipher	$out3,$out3,v24
1023	vncipher	$out4,$out4,v24
1024	vncipher	$out5,$out5,v24
1025	vncipher	$out6,$out6,v24
1026	vncipher	$out7,$out7,v24
1027
1028	vncipher	$out1,$out1,v25
1029	vncipher	$out2,$out2,v25
1030	vncipher	$out3,$out3,v25
1031	vncipher	$out4,$out4,v25
1032	vncipher	$out5,$out5,v25
1033	vncipher	$out6,$out6,v25
1034	vncipher	$out7,$out7,v25
1035
1036	vncipher	$out1,$out1,v26
1037	vncipher	$out2,$out2,v26
1038	vncipher	$out3,$out3,v26
1039	vncipher	$out4,$out4,v26
1040	vncipher	$out5,$out5,v26
1041	vncipher	$out6,$out6,v26
1042	vncipher	$out7,$out7,v26
1043
1044	vncipher	$out1,$out1,v27
1045	vncipher	$out2,$out2,v27
1046	vncipher	$out3,$out3,v27
1047	vncipher	$out4,$out4,v27
1048	vncipher	$out5,$out5,v27
1049	vncipher	$out6,$out6,v27
1050	vncipher	$out7,$out7,v27
1051
1052	vncipher	$out1,$out1,v28
1053	vncipher	$out2,$out2,v28
1054	vncipher	$out3,$out3,v28
1055	vncipher	$out4,$out4,v28
1056	vncipher	$out5,$out5,v28
1057	vncipher	$out6,$out6,v28
1058	vncipher	$out7,$out7,v28
1059
1060	vncipher	$out1,$out1,v29
1061	vncipher	$out2,$out2,v29
1062	vncipher	$out3,$out3,v29
1063	vncipher	$out4,$out4,v29
1064	vncipher	$out5,$out5,v29
1065	vncipher	$out6,$out6,v29
1066	vncipher	$out7,$out7,v29
1067
1068	vncipher	$out1,$out1,v30
1069	 vxor		$ivec,$ivec,v31		# last round key
1070	vncipher	$out2,$out2,v30
1071	 vxor		$in1,$in1,v31
1072	vncipher	$out3,$out3,v30
1073	 vxor		$in2,$in2,v31
1074	vncipher	$out4,$out4,v30
1075	 vxor		$in3,$in3,v31
1076	vncipher	$out5,$out5,v30
1077	 vxor		$in4,$in4,v31
1078	vncipher	$out6,$out6,v30
1079	 vxor		$in5,$in5,v31
1080	vncipher	$out7,$out7,v30
1081	 vxor		$in6,$in6,v31
1082
1083	cmplwi		$len,32			# switch($len)
1084	blt		Lcbc_dec8x_one
1085	nop
1086	beq		Lcbc_dec8x_two
1087	cmplwi		$len,64
1088	blt		Lcbc_dec8x_three
1089	nop
1090	beq		Lcbc_dec8x_four
1091	cmplwi		$len,96
1092	blt		Lcbc_dec8x_five
1093	nop
1094	beq		Lcbc_dec8x_six
1095
1096Lcbc_dec8x_seven:
1097	vncipherlast	$out1,$out1,$ivec
1098	vncipherlast	$out2,$out2,$in1
1099	vncipherlast	$out3,$out3,$in2
1100	vncipherlast	$out4,$out4,$in3
1101	vncipherlast	$out5,$out5,$in4
1102	vncipherlast	$out6,$out6,$in5
1103	vncipherlast	$out7,$out7,$in6
1104	vmr		$ivec,$in7
1105
1106	le?vperm	$out1,$out1,$out1,$inpperm
1107	le?vperm	$out2,$out2,$out2,$inpperm
1108	stvx_u		$out1,$x00,$out
1109	le?vperm	$out3,$out3,$out3,$inpperm
1110	stvx_u		$out2,$x10,$out
1111	le?vperm	$out4,$out4,$out4,$inpperm
1112	stvx_u		$out3,$x20,$out
1113	le?vperm	$out5,$out5,$out5,$inpperm
1114	stvx_u		$out4,$x30,$out
1115	le?vperm	$out6,$out6,$out6,$inpperm
1116	stvx_u		$out5,$x40,$out
1117	le?vperm	$out7,$out7,$out7,$inpperm
1118	stvx_u		$out6,$x50,$out
1119	stvx_u		$out7,$x60,$out
1120	addi		$out,$out,0x70
1121	b		Lcbc_dec8x_done
1122
1123.align	5
1124Lcbc_dec8x_six:
1125	vncipherlast	$out2,$out2,$ivec
1126	vncipherlast	$out3,$out3,$in2
1127	vncipherlast	$out4,$out4,$in3
1128	vncipherlast	$out5,$out5,$in4
1129	vncipherlast	$out6,$out6,$in5
1130	vncipherlast	$out7,$out7,$in6
1131	vmr		$ivec,$in7
1132
1133	le?vperm	$out2,$out2,$out2,$inpperm
1134	le?vperm	$out3,$out3,$out3,$inpperm
1135	stvx_u		$out2,$x00,$out
1136	le?vperm	$out4,$out4,$out4,$inpperm
1137	stvx_u		$out3,$x10,$out
1138	le?vperm	$out5,$out5,$out5,$inpperm
1139	stvx_u		$out4,$x20,$out
1140	le?vperm	$out6,$out6,$out6,$inpperm
1141	stvx_u		$out5,$x30,$out
1142	le?vperm	$out7,$out7,$out7,$inpperm
1143	stvx_u		$out6,$x40,$out
1144	stvx_u		$out7,$x50,$out
1145	addi		$out,$out,0x60
1146	b		Lcbc_dec8x_done
1147
1148.align	5
1149Lcbc_dec8x_five:
1150	vncipherlast	$out3,$out3,$ivec
1151	vncipherlast	$out4,$out4,$in3
1152	vncipherlast	$out5,$out5,$in4
1153	vncipherlast	$out6,$out6,$in5
1154	vncipherlast	$out7,$out7,$in6
1155	vmr		$ivec,$in7
1156
1157	le?vperm	$out3,$out3,$out3,$inpperm
1158	le?vperm	$out4,$out4,$out4,$inpperm
1159	stvx_u		$out3,$x00,$out
1160	le?vperm	$out5,$out5,$out5,$inpperm
1161	stvx_u		$out4,$x10,$out
1162	le?vperm	$out6,$out6,$out6,$inpperm
1163	stvx_u		$out5,$x20,$out
1164	le?vperm	$out7,$out7,$out7,$inpperm
1165	stvx_u		$out6,$x30,$out
1166	stvx_u		$out7,$x40,$out
1167	addi		$out,$out,0x50
1168	b		Lcbc_dec8x_done
1169
1170.align	5
1171Lcbc_dec8x_four:
1172	vncipherlast	$out4,$out4,$ivec
1173	vncipherlast	$out5,$out5,$in4
1174	vncipherlast	$out6,$out6,$in5
1175	vncipherlast	$out7,$out7,$in6
1176	vmr		$ivec,$in7
1177
1178	le?vperm	$out4,$out4,$out4,$inpperm
1179	le?vperm	$out5,$out5,$out5,$inpperm
1180	stvx_u		$out4,$x00,$out
1181	le?vperm	$out6,$out6,$out6,$inpperm
1182	stvx_u		$out5,$x10,$out
1183	le?vperm	$out7,$out7,$out7,$inpperm
1184	stvx_u		$out6,$x20,$out
1185	stvx_u		$out7,$x30,$out
1186	addi		$out,$out,0x40
1187	b		Lcbc_dec8x_done
1188
1189.align	5
1190Lcbc_dec8x_three:
1191	vncipherlast	$out5,$out5,$ivec
1192	vncipherlast	$out6,$out6,$in5
1193	vncipherlast	$out7,$out7,$in6
1194	vmr		$ivec,$in7
1195
1196	le?vperm	$out5,$out5,$out5,$inpperm
1197	le?vperm	$out6,$out6,$out6,$inpperm
1198	stvx_u		$out5,$x00,$out
1199	le?vperm	$out7,$out7,$out7,$inpperm
1200	stvx_u		$out6,$x10,$out
1201	stvx_u		$out7,$x20,$out
1202	addi		$out,$out,0x30
1203	b		Lcbc_dec8x_done
1204
1205.align	5
1206Lcbc_dec8x_two:
1207	vncipherlast	$out6,$out6,$ivec
1208	vncipherlast	$out7,$out7,$in6
1209	vmr		$ivec,$in7
1210
1211	le?vperm	$out6,$out6,$out6,$inpperm
1212	le?vperm	$out7,$out7,$out7,$inpperm
1213	stvx_u		$out6,$x00,$out
1214	stvx_u		$out7,$x10,$out
1215	addi		$out,$out,0x20
1216	b		Lcbc_dec8x_done
1217
1218.align	5
1219Lcbc_dec8x_one:
1220	vncipherlast	$out7,$out7,$ivec
1221	vmr		$ivec,$in7
1222
1223	le?vperm	$out7,$out7,$out7,$inpperm
1224	stvx_u		$out7,0,$out
1225	addi		$out,$out,0x10
1226
1227Lcbc_dec8x_done:
1228	le?vperm	$ivec,$ivec,$ivec,$inpperm
1229	stvx_u		$ivec,0,$ivp		# write [unaligned] iv
1230
1231	li		r10,`$FRAME+15`
1232	li		r11,`$FRAME+31`
1233	stvx		$inpperm,r10,$sp	# wipe copies of round keys
1234	addi		r10,r10,32
1235	stvx		$inpperm,r11,$sp
1236	addi		r11,r11,32
1237	stvx		$inpperm,r10,$sp
1238	addi		r10,r10,32
1239	stvx		$inpperm,r11,$sp
1240	addi		r11,r11,32
1241	stvx		$inpperm,r10,$sp
1242	addi		r10,r10,32
1243	stvx		$inpperm,r11,$sp
1244	addi		r11,r11,32
1245	stvx		$inpperm,r10,$sp
1246	addi		r10,r10,32
1247	stvx		$inpperm,r11,$sp
1248	addi		r11,r11,32
1249
1250	mtspr		256,$vrsave
1251	lvx		v20,r10,$sp		# ABI says so
1252	addi		r10,r10,32
1253	lvx		v21,r11,$sp
1254	addi		r11,r11,32
1255	lvx		v22,r10,$sp
1256	addi		r10,r10,32
1257	lvx		v23,r11,$sp
1258	addi		r11,r11,32
1259	lvx		v24,r10,$sp
1260	addi		r10,r10,32
1261	lvx		v25,r11,$sp
1262	addi		r11,r11,32
1263	lvx		v26,r10,$sp
1264	addi		r10,r10,32
1265	lvx		v27,r11,$sp
1266	addi		r11,r11,32
1267	lvx		v28,r10,$sp
1268	addi		r10,r10,32
1269	lvx		v29,r11,$sp
1270	addi		r11,r11,32
1271	lvx		v30,r10,$sp
1272	lvx		v31,r11,$sp
1273	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1274	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1275	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1276	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1277	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1278	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1279	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
1280	blr
1281	.long		0
1282	.byte		0,12,0x14,0,0x80,6,6,0
1283	.long		0
1284.size	.${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
1285___
1286}}	}}}
1287
1288#########################################################################
1289{{{	# CTR procedure[s]						#
1290
1291####################### WARNING: Here be dragons! #######################
1292#
1293# This code is written as 'ctr32', based on a 32-bit counter used
1294# upstream. The kernel does *not* use a 32-bit counter. The kernel uses
1295# a 128-bit counter.
1296#
1297# This leads to subtle changes from the upstream code: the counter
1298# is incremented with vaddu_q_m rather than vaddu_w_m. This occurs in
1299# both the bulk (8 blocks at a time) path, and in the individual block
1300# path. Be aware of this when doing updates.
1301#
1302# See:
1303# 1d4aa0b4c181 ("crypto: vmx - Fixing AES-CTR counter bug")
1304# 009b30ac7444 ("crypto: vmx - CTR: always increment IV as quadword")
1305# https://github.com/openssl/openssl/pull/8942
1306#
1307#########################################################################
1308my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));
1309my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
1310my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
1311						map("v$_",(4..11));
1312my $dat=$tmp;
1313
1314$code.=<<___;
1315.globl	.${prefix}_ctr32_encrypt_blocks
1316	${UCMP}i	$len,1
1317	bltlr-
1318
1319	lis		r0,0xfff0
1320	mfspr		$vrsave,256
1321	mtspr		256,r0
1322
1323	li		$idx,15
1324	vxor		$rndkey0,$rndkey0,$rndkey0
1325	le?vspltisb	$tmp,0x0f
1326
1327	lvx		$ivec,0,$ivp		# load [unaligned] iv
1328	lvsl		$inpperm,0,$ivp
1329	lvx		$inptail,$idx,$ivp
1330	 vspltisb	$one,1
1331	le?vxor		$inpperm,$inpperm,$tmp
1332	vperm		$ivec,$ivec,$inptail,$inpperm
1333	 vsldoi		$one,$rndkey0,$one,1
1334
1335	neg		r11,$inp
1336	?lvsl		$keyperm,0,$key		# prepare for unaligned key
1337	lwz		$rounds,240($key)
1338
1339	lvsr		$inpperm,0,r11		# prepare for unaligned load
1340	lvx		$inptail,0,$inp
1341	addi		$inp,$inp,15		# 15 is not typo
1342	le?vxor		$inpperm,$inpperm,$tmp
1343
1344	srwi		$rounds,$rounds,1
1345	li		$idx,16
1346	subi		$rounds,$rounds,1
1347
1348	${UCMP}i	$len,8
1349	bge		_aesp8_ctr32_encrypt8x
1350
1351	?lvsr		$outperm,0,$out		# prepare for unaligned store
1352	vspltisb	$outmask,-1
1353	lvx		$outhead,0,$out
1354	?vperm		$outmask,$rndkey0,$outmask,$outperm
1355	le?vxor		$outperm,$outperm,$tmp
1356
1357	lvx		$rndkey0,0,$key
1358	mtctr		$rounds
1359	lvx		$rndkey1,$idx,$key
1360	addi		$idx,$idx,16
1361	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1362	vxor		$inout,$ivec,$rndkey0
1363	lvx		$rndkey0,$idx,$key
1364	addi		$idx,$idx,16
1365	b		Loop_ctr32_enc
1366
1367.align	5
1368Loop_ctr32_enc:
1369	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
1370	vcipher		$inout,$inout,$rndkey1
1371	lvx		$rndkey1,$idx,$key
1372	addi		$idx,$idx,16
1373	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1374	vcipher		$inout,$inout,$rndkey0
1375	lvx		$rndkey0,$idx,$key
1376	addi		$idx,$idx,16
1377	bdnz		Loop_ctr32_enc
1378
1379	vadduqm		$ivec,$ivec,$one	# Kernel change for 128-bit
1380	 vmr		$dat,$inptail
1381	 lvx		$inptail,0,$inp
1382	 addi		$inp,$inp,16
1383	 subic.		$len,$len,1		# blocks--
1384
1385	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
1386	vcipher		$inout,$inout,$rndkey1
1387	lvx		$rndkey1,$idx,$key
1388	 vperm		$dat,$dat,$inptail,$inpperm
1389	 li		$idx,16
1390	?vperm		$rndkey1,$rndkey0,$rndkey1,$keyperm
1391	 lvx		$rndkey0,0,$key
1392	vxor		$dat,$dat,$rndkey1	# last round key
1393	vcipherlast	$inout,$inout,$dat
1394
1395	 lvx		$rndkey1,$idx,$key
1396	 addi		$idx,$idx,16
1397	vperm		$inout,$inout,$inout,$outperm
1398	vsel		$dat,$outhead,$inout,$outmask
1399	 mtctr		$rounds
1400	 ?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1401	vmr		$outhead,$inout
1402	 vxor		$inout,$ivec,$rndkey0
1403	 lvx		$rndkey0,$idx,$key
1404	 addi		$idx,$idx,16
1405	stvx		$dat,0,$out
1406	addi		$out,$out,16
1407	bne		Loop_ctr32_enc
1408
1409	addi		$out,$out,-1
1410	lvx		$inout,0,$out		# redundant in aligned case
1411	vsel		$inout,$outhead,$inout,$outmask
1412	stvx		$inout,0,$out
1413
1414	mtspr		256,$vrsave
1415	blr
1416	.long		0
1417	.byte		0,12,0x14,0,0,0,6,0
1418	.long		0
1419___
1420#########################################################################
1421{{	# Optimized CTR procedure					#
1422my $key_="r11";
1423my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
1424my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14));
1425my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22));
1426my $rndkey0="v23";	# v24-v25 rotating buffer for first found keys
1427			# v26-v31 last 6 round keys
1428my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment
1429my ($two,$three,$four)=($outhead,$outperm,$outmask);
1430
1431$code.=<<___;
1432.align	5
1433_aesp8_ctr32_encrypt8x:
1434	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
1435	li		r10,`$FRAME+8*16+15`
1436	li		r11,`$FRAME+8*16+31`
1437	stvx		v20,r10,$sp		# ABI says so
1438	addi		r10,r10,32
1439	stvx		v21,r11,$sp
1440	addi		r11,r11,32
1441	stvx		v22,r10,$sp
1442	addi		r10,r10,32
1443	stvx		v23,r11,$sp
1444	addi		r11,r11,32
1445	stvx		v24,r10,$sp
1446	addi		r10,r10,32
1447	stvx		v25,r11,$sp
1448	addi		r11,r11,32
1449	stvx		v26,r10,$sp
1450	addi		r10,r10,32
1451	stvx		v27,r11,$sp
1452	addi		r11,r11,32
1453	stvx		v28,r10,$sp
1454	addi		r10,r10,32
1455	stvx		v29,r11,$sp
1456	addi		r11,r11,32
1457	stvx		v30,r10,$sp
1458	stvx		v31,r11,$sp
1459	li		r0,-1
1460	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
1461	li		$x10,0x10
1462	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1463	li		$x20,0x20
1464	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1465	li		$x30,0x30
1466	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1467	li		$x40,0x40
1468	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1469	li		$x50,0x50
1470	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1471	li		$x60,0x60
1472	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1473	li		$x70,0x70
1474	mtspr		256,r0
1475
1476	subi		$rounds,$rounds,3	# -4 in total
1477
1478	lvx		$rndkey0,$x00,$key	# load key schedule
1479	lvx		v30,$x10,$key
1480	addi		$key,$key,0x20
1481	lvx		v31,$x00,$key
1482	?vperm		$rndkey0,$rndkey0,v30,$keyperm
1483	addi		$key_,$sp,$FRAME+15
1484	mtctr		$rounds
1485
1486Load_ctr32_enc_key:
1487	?vperm		v24,v30,v31,$keyperm
1488	lvx		v30,$x10,$key
1489	addi		$key,$key,0x20
1490	stvx		v24,$x00,$key_		# off-load round[1]
1491	?vperm		v25,v31,v30,$keyperm
1492	lvx		v31,$x00,$key
1493	stvx		v25,$x10,$key_		# off-load round[2]
1494	addi		$key_,$key_,0x20
1495	bdnz		Load_ctr32_enc_key
1496
1497	lvx		v26,$x10,$key
1498	?vperm		v24,v30,v31,$keyperm
1499	lvx		v27,$x20,$key
1500	stvx		v24,$x00,$key_		# off-load round[3]
1501	?vperm		v25,v31,v26,$keyperm
1502	lvx		v28,$x30,$key
1503	stvx		v25,$x10,$key_		# off-load round[4]
1504	addi		$key_,$sp,$FRAME+15	# rewind $key_
1505	?vperm		v26,v26,v27,$keyperm
1506	lvx		v29,$x40,$key
1507	?vperm		v27,v27,v28,$keyperm
1508	lvx		v30,$x50,$key
1509	?vperm		v28,v28,v29,$keyperm
1510	lvx		v31,$x60,$key
1511	?vperm		v29,v29,v30,$keyperm
1512	lvx		$out0,$x70,$key		# borrow $out0
1513	?vperm		v30,v30,v31,$keyperm
1514	lvx		v24,$x00,$key_		# pre-load round[1]
1515	?vperm		v31,v31,$out0,$keyperm
1516	lvx		v25,$x10,$key_		# pre-load round[2]
1517
1518	vadduqm		$two,$one,$one
1519	subi		$inp,$inp,15		# undo "caller"
1520	$SHL		$len,$len,4
1521
1522	vadduqm		$out1,$ivec,$one	# counter values ...
1523	vadduqm		$out2,$ivec,$two	# (do all ctr adds as 128-bit)
1524	vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
1525	 le?li		$idx,8
1526	vadduqm		$out3,$out1,$two
1527	vxor		$out1,$out1,$rndkey0
1528	 le?lvsl	$inpperm,0,$idx
1529	vadduqm		$out4,$out2,$two
1530	vxor		$out2,$out2,$rndkey0
1531	 le?vspltisb	$tmp,0x0f
1532	vadduqm		$out5,$out3,$two
1533	vxor		$out3,$out3,$rndkey0
1534	 le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
1535	vadduqm		$out6,$out4,$two
1536	vxor		$out4,$out4,$rndkey0
1537	vadduqm		$out7,$out5,$two
1538	vxor		$out5,$out5,$rndkey0
1539	vadduqm		$ivec,$out6,$two	# next counter value
1540	vxor		$out6,$out6,$rndkey0
1541	vxor		$out7,$out7,$rndkey0
1542
1543	mtctr		$rounds
1544	b		Loop_ctr32_enc8x
1545.align	5
1546Loop_ctr32_enc8x:
1547	vcipher 	$out0,$out0,v24
1548	vcipher 	$out1,$out1,v24
1549	vcipher 	$out2,$out2,v24
1550	vcipher 	$out3,$out3,v24
1551	vcipher 	$out4,$out4,v24
1552	vcipher 	$out5,$out5,v24
1553	vcipher 	$out6,$out6,v24
1554	vcipher 	$out7,$out7,v24
1555Loop_ctr32_enc8x_middle:
1556	lvx		v24,$x20,$key_		# round[3]
1557	addi		$key_,$key_,0x20
1558
1559	vcipher 	$out0,$out0,v25
1560	vcipher 	$out1,$out1,v25
1561	vcipher 	$out2,$out2,v25
1562	vcipher 	$out3,$out3,v25
1563	vcipher 	$out4,$out4,v25
1564	vcipher 	$out5,$out5,v25
1565	vcipher 	$out6,$out6,v25
1566	vcipher 	$out7,$out7,v25
1567	lvx		v25,$x10,$key_		# round[4]
1568	bdnz		Loop_ctr32_enc8x
1569
1570	subic		r11,$len,256		# $len-256, borrow $key_
1571	vcipher 	$out0,$out0,v24
1572	vcipher 	$out1,$out1,v24
1573	vcipher 	$out2,$out2,v24
1574	vcipher 	$out3,$out3,v24
1575	vcipher 	$out4,$out4,v24
1576	vcipher 	$out5,$out5,v24
1577	vcipher 	$out6,$out6,v24
1578	vcipher 	$out7,$out7,v24
1579
1580	subfe		r0,r0,r0		# borrow?-1:0
1581	vcipher 	$out0,$out0,v25
1582	vcipher 	$out1,$out1,v25
1583	vcipher 	$out2,$out2,v25
1584	vcipher 	$out3,$out3,v25
1585	vcipher 	$out4,$out4,v25
1586	vcipher		$out5,$out5,v25
1587	vcipher		$out6,$out6,v25
1588	vcipher		$out7,$out7,v25
1589
1590	and		r0,r0,r11
1591	addi		$key_,$sp,$FRAME+15	# rewind $key_
1592	vcipher		$out0,$out0,v26
1593	vcipher		$out1,$out1,v26
1594	vcipher		$out2,$out2,v26
1595	vcipher		$out3,$out3,v26
1596	vcipher		$out4,$out4,v26
1597	vcipher		$out5,$out5,v26
1598	vcipher		$out6,$out6,v26
1599	vcipher		$out7,$out7,v26
1600	lvx		v24,$x00,$key_		# re-pre-load round[1]
1601
1602	subic		$len,$len,129		# $len-=129
1603	vcipher		$out0,$out0,v27
1604	addi		$len,$len,1		# $len-=128 really
1605	vcipher		$out1,$out1,v27
1606	vcipher		$out2,$out2,v27
1607	vcipher		$out3,$out3,v27
1608	vcipher		$out4,$out4,v27
1609	vcipher		$out5,$out5,v27
1610	vcipher		$out6,$out6,v27
1611	vcipher		$out7,$out7,v27
1612	lvx		v25,$x10,$key_		# re-pre-load round[2]
1613
1614	vcipher		$out0,$out0,v28
1615	 lvx_u		$in0,$x00,$inp		# load input
1616	vcipher		$out1,$out1,v28
1617	 lvx_u		$in1,$x10,$inp
1618	vcipher		$out2,$out2,v28
1619	 lvx_u		$in2,$x20,$inp
1620	vcipher		$out3,$out3,v28
1621	 lvx_u		$in3,$x30,$inp
1622	vcipher		$out4,$out4,v28
1623	 lvx_u		$in4,$x40,$inp
1624	vcipher		$out5,$out5,v28
1625	 lvx_u		$in5,$x50,$inp
1626	vcipher		$out6,$out6,v28
1627	 lvx_u		$in6,$x60,$inp
1628	vcipher		$out7,$out7,v28
1629	 lvx_u		$in7,$x70,$inp
1630	 addi		$inp,$inp,0x80
1631
1632	vcipher		$out0,$out0,v29
1633	 le?vperm	$in0,$in0,$in0,$inpperm
1634	vcipher		$out1,$out1,v29
1635	 le?vperm	$in1,$in1,$in1,$inpperm
1636	vcipher		$out2,$out2,v29
1637	 le?vperm	$in2,$in2,$in2,$inpperm
1638	vcipher		$out3,$out3,v29
1639	 le?vperm	$in3,$in3,$in3,$inpperm
1640	vcipher		$out4,$out4,v29
1641	 le?vperm	$in4,$in4,$in4,$inpperm
1642	vcipher		$out5,$out5,v29
1643	 le?vperm	$in5,$in5,$in5,$inpperm
1644	vcipher		$out6,$out6,v29
1645	 le?vperm	$in6,$in6,$in6,$inpperm
1646	vcipher		$out7,$out7,v29
1647	 le?vperm	$in7,$in7,$in7,$inpperm
1648
1649	add		$inp,$inp,r0		# $inp is adjusted in such
1650						# way that at exit from the
1651						# loop inX-in7 are loaded
1652						# with last "words"
1653	subfe.		r0,r0,r0		# borrow?-1:0
1654	vcipher		$out0,$out0,v30
1655	 vxor		$in0,$in0,v31		# xor with last round key
1656	vcipher		$out1,$out1,v30
1657	 vxor		$in1,$in1,v31
1658	vcipher		$out2,$out2,v30
1659	 vxor		$in2,$in2,v31
1660	vcipher		$out3,$out3,v30
1661	 vxor		$in3,$in3,v31
1662	vcipher		$out4,$out4,v30
1663	 vxor		$in4,$in4,v31
1664	vcipher		$out5,$out5,v30
1665	 vxor		$in5,$in5,v31
1666	vcipher		$out6,$out6,v30
1667	 vxor		$in6,$in6,v31
1668	vcipher		$out7,$out7,v30
1669	 vxor		$in7,$in7,v31
1670
1671	bne		Lctr32_enc8x_break	# did $len-129 borrow?
1672
1673	vcipherlast	$in0,$out0,$in0
1674	vcipherlast	$in1,$out1,$in1
1675	 vadduqm	$out1,$ivec,$one	# counter values ...
1676	vcipherlast	$in2,$out2,$in2
1677	 vadduqm	$out2,$ivec,$two
1678	 vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
1679	vcipherlast	$in3,$out3,$in3
1680	 vadduqm	$out3,$out1,$two
1681	 vxor		$out1,$out1,$rndkey0
1682	vcipherlast	$in4,$out4,$in4
1683	 vadduqm	$out4,$out2,$two
1684	 vxor		$out2,$out2,$rndkey0
1685	vcipherlast	$in5,$out5,$in5
1686	 vadduqm	$out5,$out3,$two
1687	 vxor		$out3,$out3,$rndkey0
1688	vcipherlast	$in6,$out6,$in6
1689	 vadduqm	$out6,$out4,$two
1690	 vxor		$out4,$out4,$rndkey0
1691	vcipherlast	$in7,$out7,$in7
1692	 vadduqm	$out7,$out5,$two
1693	 vxor		$out5,$out5,$rndkey0
1694	le?vperm	$in0,$in0,$in0,$inpperm
1695	 vadduqm	$ivec,$out6,$two	# next counter value
1696	 vxor		$out6,$out6,$rndkey0
1697	le?vperm	$in1,$in1,$in1,$inpperm
1698	 vxor		$out7,$out7,$rndkey0
1699	mtctr		$rounds
1700
1701	 vcipher	$out0,$out0,v24
1702	stvx_u		$in0,$x00,$out
1703	le?vperm	$in2,$in2,$in2,$inpperm
1704	 vcipher	$out1,$out1,v24
1705	stvx_u		$in1,$x10,$out
1706	le?vperm	$in3,$in3,$in3,$inpperm
1707	 vcipher	$out2,$out2,v24
1708	stvx_u		$in2,$x20,$out
1709	le?vperm	$in4,$in4,$in4,$inpperm
1710	 vcipher	$out3,$out3,v24
1711	stvx_u		$in3,$x30,$out
1712	le?vperm	$in5,$in5,$in5,$inpperm
1713	 vcipher	$out4,$out4,v24
1714	stvx_u		$in4,$x40,$out
1715	le?vperm	$in6,$in6,$in6,$inpperm
1716	 vcipher	$out5,$out5,v24
1717	stvx_u		$in5,$x50,$out
1718	le?vperm	$in7,$in7,$in7,$inpperm
1719	 vcipher	$out6,$out6,v24
1720	stvx_u		$in6,$x60,$out
1721	 vcipher	$out7,$out7,v24
1722	stvx_u		$in7,$x70,$out
1723	addi		$out,$out,0x80
1724
1725	b		Loop_ctr32_enc8x_middle
1726
1727.align	5
1728Lctr32_enc8x_break:
1729	cmpwi		$len,-0x60
1730	blt		Lctr32_enc8x_one
1731	nop
1732	beq		Lctr32_enc8x_two
1733	cmpwi		$len,-0x40
1734	blt		Lctr32_enc8x_three
1735	nop
1736	beq		Lctr32_enc8x_four
1737	cmpwi		$len,-0x20
1738	blt		Lctr32_enc8x_five
1739	nop
1740	beq		Lctr32_enc8x_six
1741	cmpwi		$len,0x00
1742	blt		Lctr32_enc8x_seven
1743
1744Lctr32_enc8x_eight:
1745	vcipherlast	$out0,$out0,$in0
1746	vcipherlast	$out1,$out1,$in1
1747	vcipherlast	$out2,$out2,$in2
1748	vcipherlast	$out3,$out3,$in3
1749	vcipherlast	$out4,$out4,$in4
1750	vcipherlast	$out5,$out5,$in5
1751	vcipherlast	$out6,$out6,$in6
1752	vcipherlast	$out7,$out7,$in7
1753
1754	le?vperm	$out0,$out0,$out0,$inpperm
1755	le?vperm	$out1,$out1,$out1,$inpperm
1756	stvx_u		$out0,$x00,$out
1757	le?vperm	$out2,$out2,$out2,$inpperm
1758	stvx_u		$out1,$x10,$out
1759	le?vperm	$out3,$out3,$out3,$inpperm
1760	stvx_u		$out2,$x20,$out
1761	le?vperm	$out4,$out4,$out4,$inpperm
1762	stvx_u		$out3,$x30,$out
1763	le?vperm	$out5,$out5,$out5,$inpperm
1764	stvx_u		$out4,$x40,$out
1765	le?vperm	$out6,$out6,$out6,$inpperm
1766	stvx_u		$out5,$x50,$out
1767	le?vperm	$out7,$out7,$out7,$inpperm
1768	stvx_u		$out6,$x60,$out
1769	stvx_u		$out7,$x70,$out
1770	addi		$out,$out,0x80
1771	b		Lctr32_enc8x_done
1772
1773.align	5
1774Lctr32_enc8x_seven:
1775	vcipherlast	$out0,$out0,$in1
1776	vcipherlast	$out1,$out1,$in2
1777	vcipherlast	$out2,$out2,$in3
1778	vcipherlast	$out3,$out3,$in4
1779	vcipherlast	$out4,$out4,$in5
1780	vcipherlast	$out5,$out5,$in6
1781	vcipherlast	$out6,$out6,$in7
1782
1783	le?vperm	$out0,$out0,$out0,$inpperm
1784	le?vperm	$out1,$out1,$out1,$inpperm
1785	stvx_u		$out0,$x00,$out
1786	le?vperm	$out2,$out2,$out2,$inpperm
1787	stvx_u		$out1,$x10,$out
1788	le?vperm	$out3,$out3,$out3,$inpperm
1789	stvx_u		$out2,$x20,$out
1790	le?vperm	$out4,$out4,$out4,$inpperm
1791	stvx_u		$out3,$x30,$out
1792	le?vperm	$out5,$out5,$out5,$inpperm
1793	stvx_u		$out4,$x40,$out
1794	le?vperm	$out6,$out6,$out6,$inpperm
1795	stvx_u		$out5,$x50,$out
1796	stvx_u		$out6,$x60,$out
1797	addi		$out,$out,0x70
1798	b		Lctr32_enc8x_done
1799
1800.align	5
1801Lctr32_enc8x_six:
1802	vcipherlast	$out0,$out0,$in2
1803	vcipherlast	$out1,$out1,$in3
1804	vcipherlast	$out2,$out2,$in4
1805	vcipherlast	$out3,$out3,$in5
1806	vcipherlast	$out4,$out4,$in6
1807	vcipherlast	$out5,$out5,$in7
1808
1809	le?vperm	$out0,$out0,$out0,$inpperm
1810	le?vperm	$out1,$out1,$out1,$inpperm
1811	stvx_u		$out0,$x00,$out
1812	le?vperm	$out2,$out2,$out2,$inpperm
1813	stvx_u		$out1,$x10,$out
1814	le?vperm	$out3,$out3,$out3,$inpperm
1815	stvx_u		$out2,$x20,$out
1816	le?vperm	$out4,$out4,$out4,$inpperm
1817	stvx_u		$out3,$x30,$out
1818	le?vperm	$out5,$out5,$out5,$inpperm
1819	stvx_u		$out4,$x40,$out
1820	stvx_u		$out5,$x50,$out
1821	addi		$out,$out,0x60
1822	b		Lctr32_enc8x_done
1823
1824.align	5
1825Lctr32_enc8x_five:
1826	vcipherlast	$out0,$out0,$in3
1827	vcipherlast	$out1,$out1,$in4
1828	vcipherlast	$out2,$out2,$in5
1829	vcipherlast	$out3,$out3,$in6
1830	vcipherlast	$out4,$out4,$in7
1831
1832	le?vperm	$out0,$out0,$out0,$inpperm
1833	le?vperm	$out1,$out1,$out1,$inpperm
1834	stvx_u		$out0,$x00,$out
1835	le?vperm	$out2,$out2,$out2,$inpperm
1836	stvx_u		$out1,$x10,$out
1837	le?vperm	$out3,$out3,$out3,$inpperm
1838	stvx_u		$out2,$x20,$out
1839	le?vperm	$out4,$out4,$out4,$inpperm
1840	stvx_u		$out3,$x30,$out
1841	stvx_u		$out4,$x40,$out
1842	addi		$out,$out,0x50
1843	b		Lctr32_enc8x_done
1844
1845.align	5
1846Lctr32_enc8x_four:
1847	vcipherlast	$out0,$out0,$in4
1848	vcipherlast	$out1,$out1,$in5
1849	vcipherlast	$out2,$out2,$in6
1850	vcipherlast	$out3,$out3,$in7
1851
1852	le?vperm	$out0,$out0,$out0,$inpperm
1853	le?vperm	$out1,$out1,$out1,$inpperm
1854	stvx_u		$out0,$x00,$out
1855	le?vperm	$out2,$out2,$out2,$inpperm
1856	stvx_u		$out1,$x10,$out
1857	le?vperm	$out3,$out3,$out3,$inpperm
1858	stvx_u		$out2,$x20,$out
1859	stvx_u		$out3,$x30,$out
1860	addi		$out,$out,0x40
1861	b		Lctr32_enc8x_done
1862
1863.align	5
1864Lctr32_enc8x_three:
1865	vcipherlast	$out0,$out0,$in5
1866	vcipherlast	$out1,$out1,$in6
1867	vcipherlast	$out2,$out2,$in7
1868
1869	le?vperm	$out0,$out0,$out0,$inpperm
1870	le?vperm	$out1,$out1,$out1,$inpperm
1871	stvx_u		$out0,$x00,$out
1872	le?vperm	$out2,$out2,$out2,$inpperm
1873	stvx_u		$out1,$x10,$out
1874	stvx_u		$out2,$x20,$out
1875	addi		$out,$out,0x30
1876	b		Lctr32_enc8x_done
1877
1878.align	5
1879Lctr32_enc8x_two:
1880	vcipherlast	$out0,$out0,$in6
1881	vcipherlast	$out1,$out1,$in7
1882
1883	le?vperm	$out0,$out0,$out0,$inpperm
1884	le?vperm	$out1,$out1,$out1,$inpperm
1885	stvx_u		$out0,$x00,$out
1886	stvx_u		$out1,$x10,$out
1887	addi		$out,$out,0x20
1888	b		Lctr32_enc8x_done
1889
1890.align	5
1891Lctr32_enc8x_one:
1892	vcipherlast	$out0,$out0,$in7
1893
1894	le?vperm	$out0,$out0,$out0,$inpperm
1895	stvx_u		$out0,0,$out
1896	addi		$out,$out,0x10
1897
1898Lctr32_enc8x_done:
1899	li		r10,`$FRAME+15`
1900	li		r11,`$FRAME+31`
1901	stvx		$inpperm,r10,$sp	# wipe copies of round keys
1902	addi		r10,r10,32
1903	stvx		$inpperm,r11,$sp
1904	addi		r11,r11,32
1905	stvx		$inpperm,r10,$sp
1906	addi		r10,r10,32
1907	stvx		$inpperm,r11,$sp
1908	addi		r11,r11,32
1909	stvx		$inpperm,r10,$sp
1910	addi		r10,r10,32
1911	stvx		$inpperm,r11,$sp
1912	addi		r11,r11,32
1913	stvx		$inpperm,r10,$sp
1914	addi		r10,r10,32
1915	stvx		$inpperm,r11,$sp
1916	addi		r11,r11,32
1917
1918	mtspr		256,$vrsave
1919	lvx		v20,r10,$sp		# ABI says so
1920	addi		r10,r10,32
1921	lvx		v21,r11,$sp
1922	addi		r11,r11,32
1923	lvx		v22,r10,$sp
1924	addi		r10,r10,32
1925	lvx		v23,r11,$sp
1926	addi		r11,r11,32
1927	lvx		v24,r10,$sp
1928	addi		r10,r10,32
1929	lvx		v25,r11,$sp
1930	addi		r11,r11,32
1931	lvx		v26,r10,$sp
1932	addi		r10,r10,32
1933	lvx		v27,r11,$sp
1934	addi		r11,r11,32
1935	lvx		v28,r10,$sp
1936	addi		r10,r10,32
1937	lvx		v29,r11,$sp
1938	addi		r11,r11,32
1939	lvx		v30,r10,$sp
1940	lvx		v31,r11,$sp
1941	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1942	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1943	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1944	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1945	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1946	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1947	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
1948	blr
1949	.long		0
1950	.byte		0,12,0x14,0,0x80,6,6,0
1951	.long		0
1952.size	.${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks
1953___
1954}}	}}}
1955
1956#########################################################################
1957{{{	# XTS procedures						#
1958# int aes_p8_xts_[en|de]crypt(const char *inp, char *out, size_t len,	#
1959#                             const AES_KEY *key1, const AES_KEY *key2,	#
1960#                             [const] unsigned char iv[16]);		#
1961# If $key2 is NULL, then a "tweak chaining" mode is engaged, in which	#
1962# input tweak value is assumed to be encrypted already, and last tweak	#
1963# value, one suitable for consecutive call on same chunk of data, is	#
1964# written back to original buffer. In addition, in "tweak chaining"	#
1965# mode only complete input blocks are processed.			#
1966
1967my ($inp,$out,$len,$key1,$key2,$ivp,$rounds,$idx) =	map("r$_",(3..10));
1968my ($rndkey0,$rndkey1,$inout) =				map("v$_",(0..2));
1969my ($output,$inptail,$inpperm,$leperm,$keyperm) =	map("v$_",(3..7));
1970my ($tweak,$seven,$eighty7,$tmp,$tweak1) =		map("v$_",(8..12));
1971my $taillen = $key2;
1972
1973   ($inp,$idx) = ($idx,$inp);				# reassign
1974
1975$code.=<<___;
1976.globl	.${prefix}_xts_encrypt
1977	mr		$inp,r3				# reassign
1978	li		r3,-1
1979	${UCMP}i	$len,16
1980	bltlr-
1981
1982	lis		r0,0xfff0
1983	mfspr		r12,256				# save vrsave
1984	li		r11,0
1985	mtspr		256,r0
1986
1987	vspltisb	$seven,0x07			# 0x070707..07
1988	le?lvsl		$leperm,r11,r11
1989	le?vspltisb	$tmp,0x0f
1990	le?vxor		$leperm,$leperm,$seven
1991
1992	li		$idx,15
1993	lvx		$tweak,0,$ivp			# load [unaligned] iv
1994	lvsl		$inpperm,0,$ivp
1995	lvx		$inptail,$idx,$ivp
1996	le?vxor		$inpperm,$inpperm,$tmp
1997	vperm		$tweak,$tweak,$inptail,$inpperm
1998
1999	neg		r11,$inp
2000	lvsr		$inpperm,0,r11			# prepare for unaligned load
2001	lvx		$inout,0,$inp
2002	addi		$inp,$inp,15			# 15 is not typo
2003	le?vxor		$inpperm,$inpperm,$tmp
2004
2005	${UCMP}i	$key2,0				# key2==NULL?
2006	beq		Lxts_enc_no_key2
2007
2008	?lvsl		$keyperm,0,$key2		# prepare for unaligned key
2009	lwz		$rounds,240($key2)
2010	srwi		$rounds,$rounds,1
2011	subi		$rounds,$rounds,1
2012	li		$idx,16
2013
2014	lvx		$rndkey0,0,$key2
2015	lvx		$rndkey1,$idx,$key2
2016	addi		$idx,$idx,16
2017	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2018	vxor		$tweak,$tweak,$rndkey0
2019	lvx		$rndkey0,$idx,$key2
2020	addi		$idx,$idx,16
2021	mtctr		$rounds
2022
2023Ltweak_xts_enc:
2024	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2025	vcipher		$tweak,$tweak,$rndkey1
2026	lvx		$rndkey1,$idx,$key2
2027	addi		$idx,$idx,16
2028	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2029	vcipher		$tweak,$tweak,$rndkey0
2030	lvx		$rndkey0,$idx,$key2
2031	addi		$idx,$idx,16
2032	bdnz		Ltweak_xts_enc
2033
2034	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2035	vcipher		$tweak,$tweak,$rndkey1
2036	lvx		$rndkey1,$idx,$key2
2037	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2038	vcipherlast	$tweak,$tweak,$rndkey0
2039
2040	li		$ivp,0				# don't chain the tweak
2041	b		Lxts_enc
2042
2043Lxts_enc_no_key2:
2044	li		$idx,-16
2045	and		$len,$len,$idx			# in "tweak chaining"
2046							# mode only complete
2047							# blocks are processed
2048Lxts_enc:
2049	lvx		$inptail,0,$inp
2050	addi		$inp,$inp,16
2051
2052	?lvsl		$keyperm,0,$key1		# prepare for unaligned key
2053	lwz		$rounds,240($key1)
2054	srwi		$rounds,$rounds,1
2055	subi		$rounds,$rounds,1
2056	li		$idx,16
2057
2058	vslb		$eighty7,$seven,$seven		# 0x808080..80
2059	vor		$eighty7,$eighty7,$seven	# 0x878787..87
2060	vspltisb	$tmp,1				# 0x010101..01
2061	vsldoi		$eighty7,$eighty7,$tmp,15	# 0x870101..01
2062
2063	${UCMP}i	$len,96
2064	bge		_aesp8_xts_encrypt6x
2065
2066	andi.		$taillen,$len,15
2067	subic		r0,$len,32
2068	subi		$taillen,$taillen,16
2069	subfe		r0,r0,r0
2070	and		r0,r0,$taillen
2071	add		$inp,$inp,r0
2072
2073	lvx		$rndkey0,0,$key1
2074	lvx		$rndkey1,$idx,$key1
2075	addi		$idx,$idx,16
2076	vperm		$inout,$inout,$inptail,$inpperm
2077	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2078	vxor		$inout,$inout,$tweak
2079	vxor		$inout,$inout,$rndkey0
2080	lvx		$rndkey0,$idx,$key1
2081	addi		$idx,$idx,16
2082	mtctr		$rounds
2083	b		Loop_xts_enc
2084
2085.align	5
2086Loop_xts_enc:
2087	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2088	vcipher		$inout,$inout,$rndkey1
2089	lvx		$rndkey1,$idx,$key1
2090	addi		$idx,$idx,16
2091	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2092	vcipher		$inout,$inout,$rndkey0
2093	lvx		$rndkey0,$idx,$key1
2094	addi		$idx,$idx,16
2095	bdnz		Loop_xts_enc
2096
2097	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2098	vcipher		$inout,$inout,$rndkey1
2099	lvx		$rndkey1,$idx,$key1
2100	li		$idx,16
2101	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2102	vxor		$rndkey0,$rndkey0,$tweak
2103	vcipherlast	$output,$inout,$rndkey0
2104
2105	le?vperm	$tmp,$output,$output,$leperm
2106	be?nop
2107	le?stvx_u	$tmp,0,$out
2108	be?stvx_u	$output,0,$out
2109	addi		$out,$out,16
2110
2111	subic.		$len,$len,16
2112	beq		Lxts_enc_done
2113
2114	vmr		$inout,$inptail
2115	lvx		$inptail,0,$inp
2116	addi		$inp,$inp,16
2117	lvx		$rndkey0,0,$key1
2118	lvx		$rndkey1,$idx,$key1
2119	addi		$idx,$idx,16
2120
2121	subic		r0,$len,32
2122	subfe		r0,r0,r0
2123	and		r0,r0,$taillen
2124	add		$inp,$inp,r0
2125
2126	vsrab		$tmp,$tweak,$seven		# next tweak value
2127	vaddubm		$tweak,$tweak,$tweak
2128	vsldoi		$tmp,$tmp,$tmp,15
2129	vand		$tmp,$tmp,$eighty7
2130	vxor		$tweak,$tweak,$tmp
2131
2132	vperm		$inout,$inout,$inptail,$inpperm
2133	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2134	vxor		$inout,$inout,$tweak
2135	vxor		$output,$output,$rndkey0	# just in case $len<16
2136	vxor		$inout,$inout,$rndkey0
2137	lvx		$rndkey0,$idx,$key1
2138	addi		$idx,$idx,16
2139
2140	mtctr		$rounds
2141	${UCMP}i	$len,16
2142	bge		Loop_xts_enc
2143
2144	vxor		$output,$output,$tweak
2145	lvsr		$inpperm,0,$len			# $inpperm is no longer needed
2146	vxor		$inptail,$inptail,$inptail	# $inptail is no longer needed
2147	vspltisb	$tmp,-1
2148	vperm		$inptail,$inptail,$tmp,$inpperm
2149	vsel		$inout,$inout,$output,$inptail
2150
2151	subi		r11,$out,17
2152	subi		$out,$out,16
2153	mtctr		$len
2154	li		$len,16
2155Loop_xts_enc_steal:
2156	lbzu		r0,1(r11)
2157	stb		r0,16(r11)
2158	bdnz		Loop_xts_enc_steal
2159
2160	mtctr		$rounds
2161	b		Loop_xts_enc			# one more time...
2162
2163Lxts_enc_done:
2164	${UCMP}i	$ivp,0
2165	beq		Lxts_enc_ret
2166
2167	vsrab		$tmp,$tweak,$seven		# next tweak value
2168	vaddubm		$tweak,$tweak,$tweak
2169	vsldoi		$tmp,$tmp,$tmp,15
2170	vand		$tmp,$tmp,$eighty7
2171	vxor		$tweak,$tweak,$tmp
2172
2173	le?vperm	$tweak,$tweak,$tweak,$leperm
2174	stvx_u		$tweak,0,$ivp
2175
2176Lxts_enc_ret:
2177	mtspr		256,r12				# restore vrsave
2178	li		r3,0
2179	blr
2180	.long		0
2181	.byte		0,12,0x04,0,0x80,6,6,0
2182	.long		0
2183.size	.${prefix}_xts_encrypt,.-.${prefix}_xts_encrypt
2184
2185.globl	.${prefix}_xts_decrypt
2186	mr		$inp,r3				# reassign
2187	li		r3,-1
2188	${UCMP}i	$len,16
2189	bltlr-
2190
2191	lis		r0,0xfff8
2192	mfspr		r12,256				# save vrsave
2193	li		r11,0
2194	mtspr		256,r0
2195
2196	andi.		r0,$len,15
2197	neg		r0,r0
2198	andi.		r0,r0,16
2199	sub		$len,$len,r0
2200
2201	vspltisb	$seven,0x07			# 0x070707..07
2202	le?lvsl		$leperm,r11,r11
2203	le?vspltisb	$tmp,0x0f
2204	le?vxor		$leperm,$leperm,$seven
2205
2206	li		$idx,15
2207	lvx		$tweak,0,$ivp			# load [unaligned] iv
2208	lvsl		$inpperm,0,$ivp
2209	lvx		$inptail,$idx,$ivp
2210	le?vxor		$inpperm,$inpperm,$tmp
2211	vperm		$tweak,$tweak,$inptail,$inpperm
2212
2213	neg		r11,$inp
2214	lvsr		$inpperm,0,r11			# prepare for unaligned load
2215	lvx		$inout,0,$inp
2216	addi		$inp,$inp,15			# 15 is not typo
2217	le?vxor		$inpperm,$inpperm,$tmp
2218
2219	${UCMP}i	$key2,0				# key2==NULL?
2220	beq		Lxts_dec_no_key2
2221
2222	?lvsl		$keyperm,0,$key2		# prepare for unaligned key
2223	lwz		$rounds,240($key2)
2224	srwi		$rounds,$rounds,1
2225	subi		$rounds,$rounds,1
2226	li		$idx,16
2227
2228	lvx		$rndkey0,0,$key2
2229	lvx		$rndkey1,$idx,$key2
2230	addi		$idx,$idx,16
2231	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2232	vxor		$tweak,$tweak,$rndkey0
2233	lvx		$rndkey0,$idx,$key2
2234	addi		$idx,$idx,16
2235	mtctr		$rounds
2236
2237Ltweak_xts_dec:
2238	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2239	vcipher		$tweak,$tweak,$rndkey1
2240	lvx		$rndkey1,$idx,$key2
2241	addi		$idx,$idx,16
2242	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2243	vcipher		$tweak,$tweak,$rndkey0
2244	lvx		$rndkey0,$idx,$key2
2245	addi		$idx,$idx,16
2246	bdnz		Ltweak_xts_dec
2247
2248	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2249	vcipher		$tweak,$tweak,$rndkey1
2250	lvx		$rndkey1,$idx,$key2
2251	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2252	vcipherlast	$tweak,$tweak,$rndkey0
2253
2254	li		$ivp,0				# don't chain the tweak
2255	b		Lxts_dec
2256
2257Lxts_dec_no_key2:
2258	neg		$idx,$len
2259	andi.		$idx,$idx,15
2260	add		$len,$len,$idx			# in "tweak chaining"
2261							# mode only complete
2262							# blocks are processed
2263Lxts_dec:
2264	lvx		$inptail,0,$inp
2265	addi		$inp,$inp,16
2266
2267	?lvsl		$keyperm,0,$key1		# prepare for unaligned key
2268	lwz		$rounds,240($key1)
2269	srwi		$rounds,$rounds,1
2270	subi		$rounds,$rounds,1
2271	li		$idx,16
2272
2273	vslb		$eighty7,$seven,$seven		# 0x808080..80
2274	vor		$eighty7,$eighty7,$seven	# 0x878787..87
2275	vspltisb	$tmp,1				# 0x010101..01
2276	vsldoi		$eighty7,$eighty7,$tmp,15	# 0x870101..01
2277
2278	${UCMP}i	$len,96
2279	bge		_aesp8_xts_decrypt6x
2280
2281	lvx		$rndkey0,0,$key1
2282	lvx		$rndkey1,$idx,$key1
2283	addi		$idx,$idx,16
2284	vperm		$inout,$inout,$inptail,$inpperm
2285	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2286	vxor		$inout,$inout,$tweak
2287	vxor		$inout,$inout,$rndkey0
2288	lvx		$rndkey0,$idx,$key1
2289	addi		$idx,$idx,16
2290	mtctr		$rounds
2291
2292	${UCMP}i	$len,16
2293	blt		Ltail_xts_dec
2294	be?b		Loop_xts_dec
2295
2296.align	5
2297Loop_xts_dec:
2298	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2299	vncipher	$inout,$inout,$rndkey1
2300	lvx		$rndkey1,$idx,$key1
2301	addi		$idx,$idx,16
2302	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2303	vncipher	$inout,$inout,$rndkey0
2304	lvx		$rndkey0,$idx,$key1
2305	addi		$idx,$idx,16
2306	bdnz		Loop_xts_dec
2307
2308	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2309	vncipher	$inout,$inout,$rndkey1
2310	lvx		$rndkey1,$idx,$key1
2311	li		$idx,16
2312	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2313	vxor		$rndkey0,$rndkey0,$tweak
2314	vncipherlast	$output,$inout,$rndkey0
2315
2316	le?vperm	$tmp,$output,$output,$leperm
2317	be?nop
2318	le?stvx_u	$tmp,0,$out
2319	be?stvx_u	$output,0,$out
2320	addi		$out,$out,16
2321
2322	subic.		$len,$len,16
2323	beq		Lxts_dec_done
2324
2325	vmr		$inout,$inptail
2326	lvx		$inptail,0,$inp
2327	addi		$inp,$inp,16
2328	lvx		$rndkey0,0,$key1
2329	lvx		$rndkey1,$idx,$key1
2330	addi		$idx,$idx,16
2331
2332	vsrab		$tmp,$tweak,$seven		# next tweak value
2333	vaddubm		$tweak,$tweak,$tweak
2334	vsldoi		$tmp,$tmp,$tmp,15
2335	vand		$tmp,$tmp,$eighty7
2336	vxor		$tweak,$tweak,$tmp
2337
2338	vperm		$inout,$inout,$inptail,$inpperm
2339	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2340	vxor		$inout,$inout,$tweak
2341	vxor		$inout,$inout,$rndkey0
2342	lvx		$rndkey0,$idx,$key1
2343	addi		$idx,$idx,16
2344
2345	mtctr		$rounds
2346	${UCMP}i	$len,16
2347	bge		Loop_xts_dec
2348
2349Ltail_xts_dec:
2350	vsrab		$tmp,$tweak,$seven		# next tweak value
2351	vaddubm		$tweak1,$tweak,$tweak
2352	vsldoi		$tmp,$tmp,$tmp,15
2353	vand		$tmp,$tmp,$eighty7
2354	vxor		$tweak1,$tweak1,$tmp
2355
2356	subi		$inp,$inp,16
2357	add		$inp,$inp,$len
2358
2359	vxor		$inout,$inout,$tweak		# :-(
2360	vxor		$inout,$inout,$tweak1		# :-)
2361
2362Loop_xts_dec_short:
2363	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2364	vncipher	$inout,$inout,$rndkey1
2365	lvx		$rndkey1,$idx,$key1
2366	addi		$idx,$idx,16
2367	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2368	vncipher	$inout,$inout,$rndkey0
2369	lvx		$rndkey0,$idx,$key1
2370	addi		$idx,$idx,16
2371	bdnz		Loop_xts_dec_short
2372
2373	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2374	vncipher	$inout,$inout,$rndkey1
2375	lvx		$rndkey1,$idx,$key1
2376	li		$idx,16
2377	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2378	vxor		$rndkey0,$rndkey0,$tweak1
2379	vncipherlast	$output,$inout,$rndkey0
2380
2381	le?vperm	$tmp,$output,$output,$leperm
2382	be?nop
2383	le?stvx_u	$tmp,0,$out
2384	be?stvx_u	$output,0,$out
2385
2386	vmr		$inout,$inptail
2387	lvx		$inptail,0,$inp
2388	#addi		$inp,$inp,16
2389	lvx		$rndkey0,0,$key1
2390	lvx		$rndkey1,$idx,$key1
2391	addi		$idx,$idx,16
2392	vperm		$inout,$inout,$inptail,$inpperm
2393	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2394
2395	lvsr		$inpperm,0,$len			# $inpperm is no longer needed
2396	vxor		$inptail,$inptail,$inptail	# $inptail is no longer needed
2397	vspltisb	$tmp,-1
2398	vperm		$inptail,$inptail,$tmp,$inpperm
2399	vsel		$inout,$inout,$output,$inptail
2400
2401	vxor		$rndkey0,$rndkey0,$tweak
2402	vxor		$inout,$inout,$rndkey0
2403	lvx		$rndkey0,$idx,$key1
2404	addi		$idx,$idx,16
2405
2406	subi		r11,$out,1
2407	mtctr		$len
2408	li		$len,16
2409Loop_xts_dec_steal:
2410	lbzu		r0,1(r11)
2411	stb		r0,16(r11)
2412	bdnz		Loop_xts_dec_steal
2413
2414	mtctr		$rounds
2415	b		Loop_xts_dec			# one more time...
2416
2417Lxts_dec_done:
2418	${UCMP}i	$ivp,0
2419	beq		Lxts_dec_ret
2420
2421	vsrab		$tmp,$tweak,$seven		# next tweak value
2422	vaddubm		$tweak,$tweak,$tweak
2423	vsldoi		$tmp,$tmp,$tmp,15
2424	vand		$tmp,$tmp,$eighty7
2425	vxor		$tweak,$tweak,$tmp
2426
2427	le?vperm	$tweak,$tweak,$tweak,$leperm
2428	stvx_u		$tweak,0,$ivp
2429
2430Lxts_dec_ret:
2431	mtspr		256,r12				# restore vrsave
2432	li		r3,0
2433	blr
2434	.long		0
2435	.byte		0,12,0x04,0,0x80,6,6,0
2436	.long		0
2437.size	.${prefix}_xts_decrypt,.-.${prefix}_xts_decrypt
2438___
2439#########################################################################
2440{{	# Optimized XTS procedures					#
2441my $key_=$key2;
2442my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,3,26..31));
2443    $x00=0 if ($flavour =~ /osx/);
2444my ($in0,  $in1,  $in2,  $in3,  $in4,  $in5 )=map("v$_",(0..5));
2445my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16));
2446my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22));
2447my $rndkey0="v23";	# v24-v25 rotating buffer for first found keys
2448			# v26-v31 last 6 round keys
2449my ($keyperm)=($out0);	# aliases with "caller", redundant assignment
2450my $taillen=$x70;
2451
2452$code.=<<___;
2453.align	5
2454_aesp8_xts_encrypt6x:
2455	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
2456	mflr		r11
2457	li		r7,`$FRAME+8*16+15`
2458	li		r3,`$FRAME+8*16+31`
2459	$PUSH		r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
2460	stvx		v20,r7,$sp		# ABI says so
2461	addi		r7,r7,32
2462	stvx		v21,r3,$sp
2463	addi		r3,r3,32
2464	stvx		v22,r7,$sp
2465	addi		r7,r7,32
2466	stvx		v23,r3,$sp
2467	addi		r3,r3,32
2468	stvx		v24,r7,$sp
2469	addi		r7,r7,32
2470	stvx		v25,r3,$sp
2471	addi		r3,r3,32
2472	stvx		v26,r7,$sp
2473	addi		r7,r7,32
2474	stvx		v27,r3,$sp
2475	addi		r3,r3,32
2476	stvx		v28,r7,$sp
2477	addi		r7,r7,32
2478	stvx		v29,r3,$sp
2479	addi		r3,r3,32
2480	stvx		v30,r7,$sp
2481	stvx		v31,r3,$sp
2482	li		r0,-1
2483	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
2484	li		$x10,0x10
2485	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
2486	li		$x20,0x20
2487	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
2488	li		$x30,0x30
2489	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
2490	li		$x40,0x40
2491	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
2492	li		$x50,0x50
2493	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
2494	li		$x60,0x60
2495	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
2496	li		$x70,0x70
2497	mtspr		256,r0
2498
2499	xxlor		2, 32+$eighty7, 32+$eighty7
2500	vsldoi		$eighty7,$tmp,$eighty7,1        # 0x010101..87
2501	xxlor		1, 32+$eighty7, 32+$eighty7
2502
2503	# Load XOR Lconsts.
2504	mr		$x70, r6
2505	bl		Lconsts
2506	lxvw4x		0, $x40, r6		# load XOR contents
2507	mr		r6, $x70
2508	li		$x70,0x70
2509
2510	subi		$rounds,$rounds,3	# -4 in total
2511
2512	lvx		$rndkey0,$x00,$key1	# load key schedule
2513	lvx		v30,$x10,$key1
2514	addi		$key1,$key1,0x20
2515	lvx		v31,$x00,$key1
2516	?vperm		$rndkey0,$rndkey0,v30,$keyperm
2517	addi		$key_,$sp,$FRAME+15
2518	mtctr		$rounds
2519
2520Load_xts_enc_key:
2521	?vperm		v24,v30,v31,$keyperm
2522	lvx		v30,$x10,$key1
2523	addi		$key1,$key1,0x20
2524	stvx		v24,$x00,$key_		# off-load round[1]
2525	?vperm		v25,v31,v30,$keyperm
2526	lvx		v31,$x00,$key1
2527	stvx		v25,$x10,$key_		# off-load round[2]
2528	addi		$key_,$key_,0x20
2529	bdnz		Load_xts_enc_key
2530
2531	lvx		v26,$x10,$key1
2532	?vperm		v24,v30,v31,$keyperm
2533	lvx		v27,$x20,$key1
2534	stvx		v24,$x00,$key_		# off-load round[3]
2535	?vperm		v25,v31,v26,$keyperm
2536	lvx		v28,$x30,$key1
2537	stvx		v25,$x10,$key_		# off-load round[4]
2538	addi		$key_,$sp,$FRAME+15	# rewind $key_
2539	?vperm		v26,v26,v27,$keyperm
2540	lvx		v29,$x40,$key1
2541	?vperm		v27,v27,v28,$keyperm
2542	lvx		v30,$x50,$key1
2543	?vperm		v28,v28,v29,$keyperm
2544	lvx		v31,$x60,$key1
2545	?vperm		v29,v29,v30,$keyperm
2546	lvx		$twk5,$x70,$key1	# borrow $twk5
2547	?vperm		v30,v30,v31,$keyperm
2548	lvx		v24,$x00,$key_		# pre-load round[1]
2549	?vperm		v31,v31,$twk5,$keyperm
2550	lvx		v25,$x10,$key_		# pre-load round[2]
2551
2552	# Switch to use the following codes with 0x010101..87 to generate tweak.
2553	#     eighty7 = 0x010101..87
2554	# vsrab         tmp, tweak, seven       # next tweak value, right shift 7 bits
2555	# vand          tmp, tmp, eighty7       # last byte with carry
2556	# vaddubm       tweak, tweak, tweak     # left shift 1 bit (x2)
2557	# xxlor         vsx, 0, 0
2558	# vpermxor      tweak, tweak, tmp, vsx
2559
2560	 vperm		$in0,$inout,$inptail,$inpperm
2561	 subi		$inp,$inp,31		# undo "caller"
2562	vxor		$twk0,$tweak,$rndkey0
2563	vsrab		$tmp,$tweak,$seven	# next tweak value
2564	vaddubm		$tweak,$tweak,$tweak
2565	vand		$tmp,$tmp,$eighty7
2566	 vxor		$out0,$in0,$twk0
2567	xxlor		32+$in1, 0, 0
2568	vpermxor	$tweak, $tweak, $tmp, $in1
2569
2570	 lvx_u		$in1,$x10,$inp
2571	vxor		$twk1,$tweak,$rndkey0
2572	vsrab		$tmp,$tweak,$seven	# next tweak value
2573	vaddubm		$tweak,$tweak,$tweak
2574	 le?vperm	$in1,$in1,$in1,$leperm
2575	vand		$tmp,$tmp,$eighty7
2576	 vxor		$out1,$in1,$twk1
2577	xxlor		32+$in2, 0, 0
2578	vpermxor	$tweak, $tweak, $tmp, $in2
2579
2580	 lvx_u		$in2,$x20,$inp
2581	 andi.		$taillen,$len,15
2582	vxor		$twk2,$tweak,$rndkey0
2583	vsrab		$tmp,$tweak,$seven	# next tweak value
2584	vaddubm		$tweak,$tweak,$tweak
2585	 le?vperm	$in2,$in2,$in2,$leperm
2586	vand		$tmp,$tmp,$eighty7
2587	 vxor		$out2,$in2,$twk2
2588	xxlor		32+$in3, 0, 0
2589	vpermxor	$tweak, $tweak, $tmp, $in3
2590
2591	 lvx_u		$in3,$x30,$inp
2592	 sub		$len,$len,$taillen
2593	vxor		$twk3,$tweak,$rndkey0
2594	vsrab		$tmp,$tweak,$seven	# next tweak value
2595	vaddubm		$tweak,$tweak,$tweak
2596	 le?vperm	$in3,$in3,$in3,$leperm
2597	vand		$tmp,$tmp,$eighty7
2598	 vxor		$out3,$in3,$twk3
2599	xxlor		32+$in4, 0, 0
2600	vpermxor	$tweak, $tweak, $tmp, $in4
2601
2602	 lvx_u		$in4,$x40,$inp
2603	 subi		$len,$len,0x60
2604	vxor		$twk4,$tweak,$rndkey0
2605	vsrab		$tmp,$tweak,$seven	# next tweak value
2606	vaddubm		$tweak,$tweak,$tweak
2607	 le?vperm	$in4,$in4,$in4,$leperm
2608	vand		$tmp,$tmp,$eighty7
2609	 vxor		$out4,$in4,$twk4
2610	xxlor		32+$in5, 0, 0
2611	vpermxor	$tweak, $tweak, $tmp, $in5
2612
2613	 lvx_u		$in5,$x50,$inp
2614	 addi		$inp,$inp,0x60
2615	vxor		$twk5,$tweak,$rndkey0
2616	vsrab		$tmp,$tweak,$seven	# next tweak value
2617	vaddubm		$tweak,$tweak,$tweak
2618	 le?vperm	$in5,$in5,$in5,$leperm
2619	vand		$tmp,$tmp,$eighty7
2620	 vxor		$out5,$in5,$twk5
2621	xxlor		32+$in0, 0, 0
2622	vpermxor	$tweak, $tweak, $tmp, $in0
2623
2624	vxor		v31,v31,$rndkey0
2625	mtctr		$rounds
2626	b		Loop_xts_enc6x
2627
2628.align	5
2629Loop_xts_enc6x:
2630	vcipher		$out0,$out0,v24
2631	vcipher		$out1,$out1,v24
2632	vcipher		$out2,$out2,v24
2633	vcipher		$out3,$out3,v24
2634	vcipher		$out4,$out4,v24
2635	vcipher		$out5,$out5,v24
2636	lvx		v24,$x20,$key_		# round[3]
2637	addi		$key_,$key_,0x20
2638
2639	vcipher		$out0,$out0,v25
2640	vcipher		$out1,$out1,v25
2641	vcipher		$out2,$out2,v25
2642	vcipher		$out3,$out3,v25
2643	vcipher		$out4,$out4,v25
2644	vcipher		$out5,$out5,v25
2645	lvx		v25,$x10,$key_		# round[4]
2646	bdnz		Loop_xts_enc6x
2647
2648	xxlor		32+$eighty7, 1, 1	# 0x010101..87
2649
2650	subic		$len,$len,96		# $len-=96
2651	 vxor		$in0,$twk0,v31		# xor with last round key
2652	vcipher		$out0,$out0,v24
2653	vcipher		$out1,$out1,v24
2654	 vsrab		$tmp,$tweak,$seven	# next tweak value
2655	 vxor		$twk0,$tweak,$rndkey0
2656	 vaddubm	$tweak,$tweak,$tweak
2657	vcipher		$out2,$out2,v24
2658	vcipher		$out3,$out3,v24
2659	vcipher		$out4,$out4,v24
2660	vcipher		$out5,$out5,v24
2661
2662	subfe.		r0,r0,r0		# borrow?-1:0
2663	 vand		$tmp,$tmp,$eighty7
2664	vcipher		$out0,$out0,v25
2665	vcipher		$out1,$out1,v25
2666	 xxlor		32+$in1, 0, 0
2667	 vpermxor	$tweak, $tweak, $tmp, $in1
2668	vcipher		$out2,$out2,v25
2669	vcipher		$out3,$out3,v25
2670	 vxor		$in1,$twk1,v31
2671	 vsrab		$tmp,$tweak,$seven	# next tweak value
2672	 vxor		$twk1,$tweak,$rndkey0
2673	vcipher		$out4,$out4,v25
2674	vcipher		$out5,$out5,v25
2675
2676	and		r0,r0,$len
2677	 vaddubm	$tweak,$tweak,$tweak
2678	vcipher		$out0,$out0,v26
2679	vcipher		$out1,$out1,v26
2680	 vand		$tmp,$tmp,$eighty7
2681	vcipher		$out2,$out2,v26
2682	vcipher		$out3,$out3,v26
2683	 xxlor		32+$in2, 0, 0
2684	 vpermxor	$tweak, $tweak, $tmp, $in2
2685	vcipher		$out4,$out4,v26
2686	vcipher		$out5,$out5,v26
2687
2688	add		$inp,$inp,r0		# $inp is adjusted in such
2689						# way that at exit from the
2690						# loop inX-in5 are loaded
2691						# with last "words"
2692	 vxor		$in2,$twk2,v31
2693	 vsrab		$tmp,$tweak,$seven	# next tweak value
2694	 vxor		$twk2,$tweak,$rndkey0
2695	 vaddubm	$tweak,$tweak,$tweak
2696	vcipher		$out0,$out0,v27
2697	vcipher		$out1,$out1,v27
2698	vcipher		$out2,$out2,v27
2699	vcipher		$out3,$out3,v27
2700	 vand		$tmp,$tmp,$eighty7
2701	vcipher		$out4,$out4,v27
2702	vcipher		$out5,$out5,v27
2703
2704	addi		$key_,$sp,$FRAME+15	# rewind $key_
2705	 xxlor		32+$in3, 0, 0
2706	 vpermxor	$tweak, $tweak, $tmp, $in3
2707	vcipher		$out0,$out0,v28
2708	vcipher		$out1,$out1,v28
2709	 vxor		$in3,$twk3,v31
2710	 vsrab		$tmp,$tweak,$seven	# next tweak value
2711	 vxor		$twk3,$tweak,$rndkey0
2712	vcipher		$out2,$out2,v28
2713	vcipher		$out3,$out3,v28
2714	 vaddubm	$tweak,$tweak,$tweak
2715	vcipher		$out4,$out4,v28
2716	vcipher		$out5,$out5,v28
2717	lvx		v24,$x00,$key_		# re-pre-load round[1]
2718	 vand		$tmp,$tmp,$eighty7
2719
2720	vcipher		$out0,$out0,v29
2721	vcipher		$out1,$out1,v29
2722	 xxlor		32+$in4, 0, 0
2723	 vpermxor	$tweak, $tweak, $tmp, $in4
2724	vcipher		$out2,$out2,v29
2725	vcipher		$out3,$out3,v29
2726	 vxor		$in4,$twk4,v31
2727	 vsrab		$tmp,$tweak,$seven	# next tweak value
2728	 vxor		$twk4,$tweak,$rndkey0
2729	vcipher		$out4,$out4,v29
2730	vcipher		$out5,$out5,v29
2731	lvx		v25,$x10,$key_		# re-pre-load round[2]
2732	 vaddubm	$tweak,$tweak,$tweak
2733
2734	vcipher		$out0,$out0,v30
2735	vcipher		$out1,$out1,v30
2736	 vand		$tmp,$tmp,$eighty7
2737	vcipher		$out2,$out2,v30
2738	vcipher		$out3,$out3,v30
2739	 xxlor		32+$in5, 0, 0
2740	 vpermxor	$tweak, $tweak, $tmp, $in5
2741	vcipher		$out4,$out4,v30
2742	vcipher		$out5,$out5,v30
2743	 vxor		$in5,$twk5,v31
2744	 vsrab		$tmp,$tweak,$seven	# next tweak value
2745	 vxor		$twk5,$tweak,$rndkey0
2746
2747	vcipherlast	$out0,$out0,$in0
2748	 lvx_u		$in0,$x00,$inp		# load next input block
2749	 vaddubm	$tweak,$tweak,$tweak
2750	vcipherlast	$out1,$out1,$in1
2751	 lvx_u		$in1,$x10,$inp
2752	vcipherlast	$out2,$out2,$in2
2753	 le?vperm	$in0,$in0,$in0,$leperm
2754	 lvx_u		$in2,$x20,$inp
2755	 vand		$tmp,$tmp,$eighty7
2756	vcipherlast	$out3,$out3,$in3
2757	 le?vperm	$in1,$in1,$in1,$leperm
2758	 lvx_u		$in3,$x30,$inp
2759	vcipherlast	$out4,$out4,$in4
2760	 le?vperm	$in2,$in2,$in2,$leperm
2761	 lvx_u		$in4,$x40,$inp
2762	 xxlor		10, 32+$in0, 32+$in0
2763	 xxlor		32+$in0, 0, 0
2764	 vpermxor	$tweak, $tweak, $tmp, $in0
2765	 xxlor		32+$in0, 10, 10
2766	vcipherlast	$tmp,$out5,$in5		# last block might be needed
2767						# in stealing mode
2768	 le?vperm	$in3,$in3,$in3,$leperm
2769	 lvx_u		$in5,$x50,$inp
2770	 addi		$inp,$inp,0x60
2771	 le?vperm	$in4,$in4,$in4,$leperm
2772	 le?vperm	$in5,$in5,$in5,$leperm
2773
2774	le?vperm	$out0,$out0,$out0,$leperm
2775	le?vperm	$out1,$out1,$out1,$leperm
2776	stvx_u		$out0,$x00,$out		# store output
2777	 vxor		$out0,$in0,$twk0
2778	le?vperm	$out2,$out2,$out2,$leperm
2779	stvx_u		$out1,$x10,$out
2780	 vxor		$out1,$in1,$twk1
2781	le?vperm	$out3,$out3,$out3,$leperm
2782	stvx_u		$out2,$x20,$out
2783	 vxor		$out2,$in2,$twk2
2784	le?vperm	$out4,$out4,$out4,$leperm
2785	stvx_u		$out3,$x30,$out
2786	 vxor		$out3,$in3,$twk3
2787	le?vperm	$out5,$tmp,$tmp,$leperm
2788	stvx_u		$out4,$x40,$out
2789	 vxor		$out4,$in4,$twk4
2790	le?stvx_u	$out5,$x50,$out
2791	be?stvx_u	$tmp, $x50,$out
2792	 vxor		$out5,$in5,$twk5
2793	addi		$out,$out,0x60
2794
2795	mtctr		$rounds
2796	beq		Loop_xts_enc6x		# did $len-=96 borrow?
2797
2798	xxlor		32+$eighty7, 2, 2	# 0x010101..87
2799
2800	addic.		$len,$len,0x60
2801	beq		Lxts_enc6x_zero
2802	cmpwi		$len,0x20
2803	blt		Lxts_enc6x_one
2804	nop
2805	beq		Lxts_enc6x_two
2806	cmpwi		$len,0x40
2807	blt		Lxts_enc6x_three
2808	nop
2809	beq		Lxts_enc6x_four
2810
2811Lxts_enc6x_five:
2812	vxor		$out0,$in1,$twk0
2813	vxor		$out1,$in2,$twk1
2814	vxor		$out2,$in3,$twk2
2815	vxor		$out3,$in4,$twk3
2816	vxor		$out4,$in5,$twk4
2817
2818	bl		_aesp8_xts_enc5x
2819
2820	le?vperm	$out0,$out0,$out0,$leperm
2821	vmr		$twk0,$twk5		# unused tweak
2822	le?vperm	$out1,$out1,$out1,$leperm
2823	stvx_u		$out0,$x00,$out		# store output
2824	le?vperm	$out2,$out2,$out2,$leperm
2825	stvx_u		$out1,$x10,$out
2826	le?vperm	$out3,$out3,$out3,$leperm
2827	stvx_u		$out2,$x20,$out
2828	vxor		$tmp,$out4,$twk5	# last block prep for stealing
2829	le?vperm	$out4,$out4,$out4,$leperm
2830	stvx_u		$out3,$x30,$out
2831	stvx_u		$out4,$x40,$out
2832	addi		$out,$out,0x50
2833	bne		Lxts_enc6x_steal
2834	b		Lxts_enc6x_done
2835
2836.align	4
2837Lxts_enc6x_four:
2838	vxor		$out0,$in2,$twk0
2839	vxor		$out1,$in3,$twk1
2840	vxor		$out2,$in4,$twk2
2841	vxor		$out3,$in5,$twk3
2842	vxor		$out4,$out4,$out4
2843
2844	bl		_aesp8_xts_enc5x
2845
2846	le?vperm	$out0,$out0,$out0,$leperm
2847	vmr		$twk0,$twk4		# unused tweak
2848	le?vperm	$out1,$out1,$out1,$leperm
2849	stvx_u		$out0,$x00,$out		# store output
2850	le?vperm	$out2,$out2,$out2,$leperm
2851	stvx_u		$out1,$x10,$out
2852	vxor		$tmp,$out3,$twk4	# last block prep for stealing
2853	le?vperm	$out3,$out3,$out3,$leperm
2854	stvx_u		$out2,$x20,$out
2855	stvx_u		$out3,$x30,$out
2856	addi		$out,$out,0x40
2857	bne		Lxts_enc6x_steal
2858	b		Lxts_enc6x_done
2859
2860.align	4
2861Lxts_enc6x_three:
2862	vxor		$out0,$in3,$twk0
2863	vxor		$out1,$in4,$twk1
2864	vxor		$out2,$in5,$twk2
2865	vxor		$out3,$out3,$out3
2866	vxor		$out4,$out4,$out4
2867
2868	bl		_aesp8_xts_enc5x
2869
2870	le?vperm	$out0,$out0,$out0,$leperm
2871	vmr		$twk0,$twk3		# unused tweak
2872	le?vperm	$out1,$out1,$out1,$leperm
2873	stvx_u		$out0,$x00,$out		# store output
2874	vxor		$tmp,$out2,$twk3	# last block prep for stealing
2875	le?vperm	$out2,$out2,$out2,$leperm
2876	stvx_u		$out1,$x10,$out
2877	stvx_u		$out2,$x20,$out
2878	addi		$out,$out,0x30
2879	bne		Lxts_enc6x_steal
2880	b		Lxts_enc6x_done
2881
2882.align	4
2883Lxts_enc6x_two:
2884	vxor		$out0,$in4,$twk0
2885	vxor		$out1,$in5,$twk1
2886	vxor		$out2,$out2,$out2
2887	vxor		$out3,$out3,$out3
2888	vxor		$out4,$out4,$out4
2889
2890	bl		_aesp8_xts_enc5x
2891
2892	le?vperm	$out0,$out0,$out0,$leperm
2893	vmr		$twk0,$twk2		# unused tweak
2894	vxor		$tmp,$out1,$twk2	# last block prep for stealing
2895	le?vperm	$out1,$out1,$out1,$leperm
2896	stvx_u		$out0,$x00,$out		# store output
2897	stvx_u		$out1,$x10,$out
2898	addi		$out,$out,0x20
2899	bne		Lxts_enc6x_steal
2900	b		Lxts_enc6x_done
2901
2902.align	4
2903Lxts_enc6x_one:
2904	vxor		$out0,$in5,$twk0
2905	nop
2906Loop_xts_enc1x:
2907	vcipher		$out0,$out0,v24
2908	lvx		v24,$x20,$key_		# round[3]
2909	addi		$key_,$key_,0x20
2910
2911	vcipher		$out0,$out0,v25
2912	lvx		v25,$x10,$key_		# round[4]
2913	bdnz		Loop_xts_enc1x
2914
2915	add		$inp,$inp,$taillen
2916	cmpwi		$taillen,0
2917	vcipher		$out0,$out0,v24
2918
2919	subi		$inp,$inp,16
2920	vcipher		$out0,$out0,v25
2921
2922	lvsr		$inpperm,0,$taillen
2923	vcipher		$out0,$out0,v26
2924
2925	lvx_u		$in0,0,$inp
2926	vcipher		$out0,$out0,v27
2927
2928	addi		$key_,$sp,$FRAME+15	# rewind $key_
2929	vcipher		$out0,$out0,v28
2930	lvx		v24,$x00,$key_		# re-pre-load round[1]
2931
2932	vcipher		$out0,$out0,v29
2933	lvx		v25,$x10,$key_		# re-pre-load round[2]
2934	 vxor		$twk0,$twk0,v31
2935
2936	le?vperm	$in0,$in0,$in0,$leperm
2937	vcipher		$out0,$out0,v30
2938
2939	vperm		$in0,$in0,$in0,$inpperm
2940	vcipherlast	$out0,$out0,$twk0
2941
2942	vmr		$twk0,$twk1		# unused tweak
2943	vxor		$tmp,$out0,$twk1	# last block prep for stealing
2944	le?vperm	$out0,$out0,$out0,$leperm
2945	stvx_u		$out0,$x00,$out		# store output
2946	addi		$out,$out,0x10
2947	bne		Lxts_enc6x_steal
2948	b		Lxts_enc6x_done
2949
2950.align	4
2951Lxts_enc6x_zero:
2952	cmpwi		$taillen,0
2953	beq		Lxts_enc6x_done
2954
2955	add		$inp,$inp,$taillen
2956	subi		$inp,$inp,16
2957	lvx_u		$in0,0,$inp
2958	lvsr		$inpperm,0,$taillen	# $in5 is no more
2959	le?vperm	$in0,$in0,$in0,$leperm
2960	vperm		$in0,$in0,$in0,$inpperm
2961	vxor		$tmp,$tmp,$twk0
2962Lxts_enc6x_steal:
2963	vxor		$in0,$in0,$twk0
2964	vxor		$out0,$out0,$out0
2965	vspltisb	$out1,-1
2966	vperm		$out0,$out0,$out1,$inpperm
2967	vsel		$out0,$in0,$tmp,$out0	# $tmp is last block, remember?
2968
2969	subi		r30,$out,17
2970	subi		$out,$out,16
2971	mtctr		$taillen
2972Loop_xts_enc6x_steal:
2973	lbzu		r0,1(r30)
2974	stb		r0,16(r30)
2975	bdnz		Loop_xts_enc6x_steal
2976
2977	li		$taillen,0
2978	mtctr		$rounds
2979	b		Loop_xts_enc1x		# one more time...
2980
2981.align	4
2982Lxts_enc6x_done:
2983	${UCMP}i	$ivp,0
2984	beq		Lxts_enc6x_ret
2985
2986	vxor		$tweak,$twk0,$rndkey0
2987	le?vperm	$tweak,$tweak,$tweak,$leperm
2988	stvx_u		$tweak,0,$ivp
2989
2990Lxts_enc6x_ret:
2991	mtlr		r11
2992	li		r10,`$FRAME+15`
2993	li		r11,`$FRAME+31`
2994	stvx		$seven,r10,$sp		# wipe copies of round keys
2995	addi		r10,r10,32
2996	stvx		$seven,r11,$sp
2997	addi		r11,r11,32
2998	stvx		$seven,r10,$sp
2999	addi		r10,r10,32
3000	stvx		$seven,r11,$sp
3001	addi		r11,r11,32
3002	stvx		$seven,r10,$sp
3003	addi		r10,r10,32
3004	stvx		$seven,r11,$sp
3005	addi		r11,r11,32
3006	stvx		$seven,r10,$sp
3007	addi		r10,r10,32
3008	stvx		$seven,r11,$sp
3009	addi		r11,r11,32
3010
3011	mtspr		256,$vrsave
3012	lvx		v20,r10,$sp		# ABI says so
3013	addi		r10,r10,32
3014	lvx		v21,r11,$sp
3015	addi		r11,r11,32
3016	lvx		v22,r10,$sp
3017	addi		r10,r10,32
3018	lvx		v23,r11,$sp
3019	addi		r11,r11,32
3020	lvx		v24,r10,$sp
3021	addi		r10,r10,32
3022	lvx		v25,r11,$sp
3023	addi		r11,r11,32
3024	lvx		v26,r10,$sp
3025	addi		r10,r10,32
3026	lvx		v27,r11,$sp
3027	addi		r11,r11,32
3028	lvx		v28,r10,$sp
3029	addi		r10,r10,32
3030	lvx		v29,r11,$sp
3031	addi		r11,r11,32
3032	lvx		v30,r10,$sp
3033	lvx		v31,r11,$sp
3034	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
3035	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
3036	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
3037	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
3038	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
3039	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
3040	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
3041	blr
3042	.long		0
3043	.byte		0,12,0x04,1,0x80,6,6,0
3044	.long		0
3045
3046.align	5
3047_aesp8_xts_enc5x:
3048	vcipher		$out0,$out0,v24
3049	vcipher		$out1,$out1,v24
3050	vcipher		$out2,$out2,v24
3051	vcipher		$out3,$out3,v24
3052	vcipher		$out4,$out4,v24
3053	lvx		v24,$x20,$key_		# round[3]
3054	addi		$key_,$key_,0x20
3055
3056	vcipher		$out0,$out0,v25
3057	vcipher		$out1,$out1,v25
3058	vcipher		$out2,$out2,v25
3059	vcipher		$out3,$out3,v25
3060	vcipher		$out4,$out4,v25
3061	lvx		v25,$x10,$key_		# round[4]
3062	bdnz		_aesp8_xts_enc5x
3063
3064	add		$inp,$inp,$taillen
3065	cmpwi		$taillen,0
3066	vcipher		$out0,$out0,v24
3067	vcipher		$out1,$out1,v24
3068	vcipher		$out2,$out2,v24
3069	vcipher		$out3,$out3,v24
3070	vcipher		$out4,$out4,v24
3071
3072	subi		$inp,$inp,16
3073	vcipher		$out0,$out0,v25
3074	vcipher		$out1,$out1,v25
3075	vcipher		$out2,$out2,v25
3076	vcipher		$out3,$out3,v25
3077	vcipher		$out4,$out4,v25
3078	 vxor		$twk0,$twk0,v31
3079
3080	vcipher		$out0,$out0,v26
3081	lvsr		$inpperm,r0,$taillen	# $in5 is no more
3082	vcipher		$out1,$out1,v26
3083	vcipher		$out2,$out2,v26
3084	vcipher		$out3,$out3,v26
3085	vcipher		$out4,$out4,v26
3086	 vxor		$in1,$twk1,v31
3087
3088	vcipher		$out0,$out0,v27
3089	lvx_u		$in0,0,$inp
3090	vcipher		$out1,$out1,v27
3091	vcipher		$out2,$out2,v27
3092	vcipher		$out3,$out3,v27
3093	vcipher		$out4,$out4,v27
3094	 vxor		$in2,$twk2,v31
3095
3096	addi		$key_,$sp,$FRAME+15	# rewind $key_
3097	vcipher		$out0,$out0,v28
3098	vcipher		$out1,$out1,v28
3099	vcipher		$out2,$out2,v28
3100	vcipher		$out3,$out3,v28
3101	vcipher		$out4,$out4,v28
3102	lvx		v24,$x00,$key_		# re-pre-load round[1]
3103	 vxor		$in3,$twk3,v31
3104
3105	vcipher		$out0,$out0,v29
3106	le?vperm	$in0,$in0,$in0,$leperm
3107	vcipher		$out1,$out1,v29
3108	vcipher		$out2,$out2,v29
3109	vcipher		$out3,$out3,v29
3110	vcipher		$out4,$out4,v29
3111	lvx		v25,$x10,$key_		# re-pre-load round[2]
3112	 vxor		$in4,$twk4,v31
3113
3114	vcipher		$out0,$out0,v30
3115	vperm		$in0,$in0,$in0,$inpperm
3116	vcipher		$out1,$out1,v30
3117	vcipher		$out2,$out2,v30
3118	vcipher		$out3,$out3,v30
3119	vcipher		$out4,$out4,v30
3120
3121	vcipherlast	$out0,$out0,$twk0
3122	vcipherlast	$out1,$out1,$in1
3123	vcipherlast	$out2,$out2,$in2
3124	vcipherlast	$out3,$out3,$in3
3125	vcipherlast	$out4,$out4,$in4
3126	blr
3127        .long   	0
3128        .byte   	0,12,0x14,0,0,0,0,0
3129
3130.align	5
3131_aesp8_xts_decrypt6x:
3132	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
3133	mflr		r11
3134	li		r7,`$FRAME+8*16+15`
3135	li		r3,`$FRAME+8*16+31`
3136	$PUSH		r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
3137	stvx		v20,r7,$sp		# ABI says so
3138	addi		r7,r7,32
3139	stvx		v21,r3,$sp
3140	addi		r3,r3,32
3141	stvx		v22,r7,$sp
3142	addi		r7,r7,32
3143	stvx		v23,r3,$sp
3144	addi		r3,r3,32
3145	stvx		v24,r7,$sp
3146	addi		r7,r7,32
3147	stvx		v25,r3,$sp
3148	addi		r3,r3,32
3149	stvx		v26,r7,$sp
3150	addi		r7,r7,32
3151	stvx		v27,r3,$sp
3152	addi		r3,r3,32
3153	stvx		v28,r7,$sp
3154	addi		r7,r7,32
3155	stvx		v29,r3,$sp
3156	addi		r3,r3,32
3157	stvx		v30,r7,$sp
3158	stvx		v31,r3,$sp
3159	li		r0,-1
3160	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
3161	li		$x10,0x10
3162	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
3163	li		$x20,0x20
3164	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
3165	li		$x30,0x30
3166	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
3167	li		$x40,0x40
3168	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
3169	li		$x50,0x50
3170	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
3171	li		$x60,0x60
3172	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
3173	li		$x70,0x70
3174	mtspr		256,r0
3175
3176	xxlor		2, 32+$eighty7, 32+$eighty7
3177	vsldoi		$eighty7,$tmp,$eighty7,1        # 0x010101..87
3178	xxlor		1, 32+$eighty7, 32+$eighty7
3179
3180	# Load XOR Lconsts.
3181	mr		$x70, r6
3182	bl		Lconsts
3183	lxvw4x		0, $x40, r6		# load XOR contents
3184	mr		r6, $x70
3185	li		$x70,0x70
3186
3187	subi		$rounds,$rounds,3	# -4 in total
3188
3189	lvx		$rndkey0,$x00,$key1	# load key schedule
3190	lvx		v30,$x10,$key1
3191	addi		$key1,$key1,0x20
3192	lvx		v31,$x00,$key1
3193	?vperm		$rndkey0,$rndkey0,v30,$keyperm
3194	addi		$key_,$sp,$FRAME+15
3195	mtctr		$rounds
3196
3197Load_xts_dec_key:
3198	?vperm		v24,v30,v31,$keyperm
3199	lvx		v30,$x10,$key1
3200	addi		$key1,$key1,0x20
3201	stvx		v24,$x00,$key_		# off-load round[1]
3202	?vperm		v25,v31,v30,$keyperm
3203	lvx		v31,$x00,$key1
3204	stvx		v25,$x10,$key_		# off-load round[2]
3205	addi		$key_,$key_,0x20
3206	bdnz		Load_xts_dec_key
3207
3208	lvx		v26,$x10,$key1
3209	?vperm		v24,v30,v31,$keyperm
3210	lvx		v27,$x20,$key1
3211	stvx		v24,$x00,$key_		# off-load round[3]
3212	?vperm		v25,v31,v26,$keyperm
3213	lvx		v28,$x30,$key1
3214	stvx		v25,$x10,$key_		# off-load round[4]
3215	addi		$key_,$sp,$FRAME+15	# rewind $key_
3216	?vperm		v26,v26,v27,$keyperm
3217	lvx		v29,$x40,$key1
3218	?vperm		v27,v27,v28,$keyperm
3219	lvx		v30,$x50,$key1
3220	?vperm		v28,v28,v29,$keyperm
3221	lvx		v31,$x60,$key1
3222	?vperm		v29,v29,v30,$keyperm
3223	lvx		$twk5,$x70,$key1	# borrow $twk5
3224	?vperm		v30,v30,v31,$keyperm
3225	lvx		v24,$x00,$key_		# pre-load round[1]
3226	?vperm		v31,v31,$twk5,$keyperm
3227	lvx		v25,$x10,$key_		# pre-load round[2]
3228
3229	 vperm		$in0,$inout,$inptail,$inpperm
3230	 subi		$inp,$inp,31		# undo "caller"
3231	vxor		$twk0,$tweak,$rndkey0
3232	vsrab		$tmp,$tweak,$seven	# next tweak value
3233	vaddubm		$tweak,$tweak,$tweak
3234	vand		$tmp,$tmp,$eighty7
3235	 vxor		$out0,$in0,$twk0
3236	xxlor		32+$in1, 0, 0
3237	vpermxor	$tweak, $tweak, $tmp, $in1
3238
3239	 lvx_u		$in1,$x10,$inp
3240	vxor		$twk1,$tweak,$rndkey0
3241	vsrab		$tmp,$tweak,$seven	# next tweak value
3242	vaddubm		$tweak,$tweak,$tweak
3243	 le?vperm	$in1,$in1,$in1,$leperm
3244	vand		$tmp,$tmp,$eighty7
3245	 vxor		$out1,$in1,$twk1
3246	xxlor		32+$in2, 0, 0
3247	vpermxor	$tweak, $tweak, $tmp, $in2
3248
3249	 lvx_u		$in2,$x20,$inp
3250	 andi.		$taillen,$len,15
3251	vxor		$twk2,$tweak,$rndkey0
3252	vsrab		$tmp,$tweak,$seven	# next tweak value
3253	vaddubm		$tweak,$tweak,$tweak
3254	 le?vperm	$in2,$in2,$in2,$leperm
3255	vand		$tmp,$tmp,$eighty7
3256	 vxor		$out2,$in2,$twk2
3257	xxlor		32+$in3, 0, 0
3258	vpermxor	$tweak, $tweak, $tmp, $in3
3259
3260	 lvx_u		$in3,$x30,$inp
3261	 sub		$len,$len,$taillen
3262	vxor		$twk3,$tweak,$rndkey0
3263	vsrab		$tmp,$tweak,$seven	# next tweak value
3264	vaddubm		$tweak,$tweak,$tweak
3265	 le?vperm	$in3,$in3,$in3,$leperm
3266	vand		$tmp,$tmp,$eighty7
3267	 vxor		$out3,$in3,$twk3
3268	xxlor		32+$in4, 0, 0
3269	vpermxor	$tweak, $tweak, $tmp, $in4
3270
3271	 lvx_u		$in4,$x40,$inp
3272	 subi		$len,$len,0x60
3273	vxor		$twk4,$tweak,$rndkey0
3274	vsrab		$tmp,$tweak,$seven	# next tweak value
3275	vaddubm		$tweak,$tweak,$tweak
3276	 le?vperm	$in4,$in4,$in4,$leperm
3277	vand		$tmp,$tmp,$eighty7
3278	 vxor		$out4,$in4,$twk4
3279	xxlor		32+$in5, 0, 0
3280	vpermxor	$tweak, $tweak, $tmp, $in5
3281
3282	 lvx_u		$in5,$x50,$inp
3283	 addi		$inp,$inp,0x60
3284	vxor		$twk5,$tweak,$rndkey0
3285	vsrab		$tmp,$tweak,$seven	# next tweak value
3286	vaddubm		$tweak,$tweak,$tweak
3287	 le?vperm	$in5,$in5,$in5,$leperm
3288	vand		$tmp,$tmp,$eighty7
3289	 vxor		$out5,$in5,$twk5
3290	xxlor		32+$in0, 0, 0
3291	vpermxor	$tweak, $tweak, $tmp, $in0
3292
3293	vxor		v31,v31,$rndkey0
3294	mtctr		$rounds
3295	b		Loop_xts_dec6x
3296
3297.align	5
3298Loop_xts_dec6x:
3299	vncipher	$out0,$out0,v24
3300	vncipher	$out1,$out1,v24
3301	vncipher	$out2,$out2,v24
3302	vncipher	$out3,$out3,v24
3303	vncipher	$out4,$out4,v24
3304	vncipher	$out5,$out5,v24
3305	lvx		v24,$x20,$key_		# round[3]
3306	addi		$key_,$key_,0x20
3307
3308	vncipher	$out0,$out0,v25
3309	vncipher	$out1,$out1,v25
3310	vncipher	$out2,$out2,v25
3311	vncipher	$out3,$out3,v25
3312	vncipher	$out4,$out4,v25
3313	vncipher	$out5,$out5,v25
3314	lvx		v25,$x10,$key_		# round[4]
3315	bdnz		Loop_xts_dec6x
3316
3317	xxlor		32+$eighty7, 1, 1	# 0x010101..87
3318
3319	subic		$len,$len,96		# $len-=96
3320	 vxor		$in0,$twk0,v31		# xor with last round key
3321	vncipher	$out0,$out0,v24
3322	vncipher	$out1,$out1,v24
3323	 vsrab		$tmp,$tweak,$seven	# next tweak value
3324	 vxor		$twk0,$tweak,$rndkey0
3325	 vaddubm	$tweak,$tweak,$tweak
3326	vncipher	$out2,$out2,v24
3327	vncipher	$out3,$out3,v24
3328	vncipher	$out4,$out4,v24
3329	vncipher	$out5,$out5,v24
3330
3331	subfe.		r0,r0,r0		# borrow?-1:0
3332	 vand		$tmp,$tmp,$eighty7
3333	vncipher	$out0,$out0,v25
3334	vncipher	$out1,$out1,v25
3335	 xxlor		32+$in1, 0, 0
3336	 vpermxor	$tweak, $tweak, $tmp, $in1
3337	vncipher	$out2,$out2,v25
3338	vncipher	$out3,$out3,v25
3339	 vxor		$in1,$twk1,v31
3340	 vsrab		$tmp,$tweak,$seven	# next tweak value
3341	 vxor		$twk1,$tweak,$rndkey0
3342	vncipher	$out4,$out4,v25
3343	vncipher	$out5,$out5,v25
3344
3345	and		r0,r0,$len
3346	 vaddubm	$tweak,$tweak,$tweak
3347	vncipher	$out0,$out0,v26
3348	vncipher	$out1,$out1,v26
3349	 vand		$tmp,$tmp,$eighty7
3350	vncipher	$out2,$out2,v26
3351	vncipher	$out3,$out3,v26
3352	 xxlor		32+$in2, 0, 0
3353	 vpermxor	$tweak, $tweak, $tmp, $in2
3354	vncipher	$out4,$out4,v26
3355	vncipher	$out5,$out5,v26
3356
3357	add		$inp,$inp,r0		# $inp is adjusted in such
3358						# way that at exit from the
3359						# loop inX-in5 are loaded
3360						# with last "words"
3361	 vxor		$in2,$twk2,v31
3362	 vsrab		$tmp,$tweak,$seven	# next tweak value
3363	 vxor		$twk2,$tweak,$rndkey0
3364	 vaddubm	$tweak,$tweak,$tweak
3365	vncipher	$out0,$out0,v27
3366	vncipher	$out1,$out1,v27
3367	vncipher	$out2,$out2,v27
3368	vncipher	$out3,$out3,v27
3369	 vand		$tmp,$tmp,$eighty7
3370	vncipher	$out4,$out4,v27
3371	vncipher	$out5,$out5,v27
3372
3373	addi		$key_,$sp,$FRAME+15	# rewind $key_
3374	 xxlor		32+$in3, 0, 0
3375	 vpermxor	$tweak, $tweak, $tmp, $in3
3376	vncipher	$out0,$out0,v28
3377	vncipher	$out1,$out1,v28
3378	 vxor		$in3,$twk3,v31
3379	 vsrab		$tmp,$tweak,$seven	# next tweak value
3380	 vxor		$twk3,$tweak,$rndkey0
3381	vncipher	$out2,$out2,v28
3382	vncipher	$out3,$out3,v28
3383	 vaddubm	$tweak,$tweak,$tweak
3384	vncipher	$out4,$out4,v28
3385	vncipher	$out5,$out5,v28
3386	lvx		v24,$x00,$key_		# re-pre-load round[1]
3387	 vand		$tmp,$tmp,$eighty7
3388
3389	vncipher	$out0,$out0,v29
3390	vncipher	$out1,$out1,v29
3391	 xxlor		32+$in4, 0, 0
3392	 vpermxor	$tweak, $tweak, $tmp, $in4
3393	vncipher	$out2,$out2,v29
3394	vncipher	$out3,$out3,v29
3395	 vxor		$in4,$twk4,v31
3396	 vsrab		$tmp,$tweak,$seven	# next tweak value
3397	 vxor		$twk4,$tweak,$rndkey0
3398	vncipher	$out4,$out4,v29
3399	vncipher	$out5,$out5,v29
3400	lvx		v25,$x10,$key_		# re-pre-load round[2]
3401	 vaddubm	$tweak,$tweak,$tweak
3402
3403	vncipher	$out0,$out0,v30
3404	vncipher	$out1,$out1,v30
3405	 vand		$tmp,$tmp,$eighty7
3406	vncipher	$out2,$out2,v30
3407	vncipher	$out3,$out3,v30
3408	 xxlor		32+$in5, 0, 0
3409	 vpermxor	$tweak, $tweak, $tmp, $in5
3410	vncipher	$out4,$out4,v30
3411	vncipher	$out5,$out5,v30
3412	 vxor		$in5,$twk5,v31
3413	 vsrab		$tmp,$tweak,$seven	# next tweak value
3414	 vxor		$twk5,$tweak,$rndkey0
3415
3416	vncipherlast	$out0,$out0,$in0
3417	 lvx_u		$in0,$x00,$inp		# load next input block
3418	 vaddubm	$tweak,$tweak,$tweak
3419	vncipherlast	$out1,$out1,$in1
3420	 lvx_u		$in1,$x10,$inp
3421	vncipherlast	$out2,$out2,$in2
3422	 le?vperm	$in0,$in0,$in0,$leperm
3423	 lvx_u		$in2,$x20,$inp
3424	 vand		$tmp,$tmp,$eighty7
3425	vncipherlast	$out3,$out3,$in3
3426	 le?vperm	$in1,$in1,$in1,$leperm
3427	 lvx_u		$in3,$x30,$inp
3428	vncipherlast	$out4,$out4,$in4
3429	 le?vperm	$in2,$in2,$in2,$leperm
3430	 lvx_u		$in4,$x40,$inp
3431	 xxlor		10, 32+$in0, 32+$in0
3432	 xxlor		32+$in0, 0, 0
3433	 vpermxor	$tweak, $tweak, $tmp, $in0
3434	 xxlor		32+$in0, 10, 10
3435	vncipherlast	$out5,$out5,$in5
3436	 le?vperm	$in3,$in3,$in3,$leperm
3437	 lvx_u		$in5,$x50,$inp
3438	 addi		$inp,$inp,0x60
3439	 le?vperm	$in4,$in4,$in4,$leperm
3440	 le?vperm	$in5,$in5,$in5,$leperm
3441
3442	le?vperm	$out0,$out0,$out0,$leperm
3443	le?vperm	$out1,$out1,$out1,$leperm
3444	stvx_u		$out0,$x00,$out		# store output
3445	 vxor		$out0,$in0,$twk0
3446	le?vperm	$out2,$out2,$out2,$leperm
3447	stvx_u		$out1,$x10,$out
3448	 vxor		$out1,$in1,$twk1
3449	le?vperm	$out3,$out3,$out3,$leperm
3450	stvx_u		$out2,$x20,$out
3451	 vxor		$out2,$in2,$twk2
3452	le?vperm	$out4,$out4,$out4,$leperm
3453	stvx_u		$out3,$x30,$out
3454	 vxor		$out3,$in3,$twk3
3455	le?vperm	$out5,$out5,$out5,$leperm
3456	stvx_u		$out4,$x40,$out
3457	 vxor		$out4,$in4,$twk4
3458	stvx_u		$out5,$x50,$out
3459	 vxor		$out5,$in5,$twk5
3460	addi		$out,$out,0x60
3461
3462	mtctr		$rounds
3463	beq		Loop_xts_dec6x		# did $len-=96 borrow?
3464
3465	xxlor		32+$eighty7, 2, 2	# 0x010101..87
3466
3467	addic.		$len,$len,0x60
3468	beq		Lxts_dec6x_zero
3469	cmpwi		$len,0x20
3470	blt		Lxts_dec6x_one
3471	nop
3472	beq		Lxts_dec6x_two
3473	cmpwi		$len,0x40
3474	blt		Lxts_dec6x_three
3475	nop
3476	beq		Lxts_dec6x_four
3477
3478Lxts_dec6x_five:
3479	vxor		$out0,$in1,$twk0
3480	vxor		$out1,$in2,$twk1
3481	vxor		$out2,$in3,$twk2
3482	vxor		$out3,$in4,$twk3
3483	vxor		$out4,$in5,$twk4
3484
3485	bl		_aesp8_xts_dec5x
3486
3487	le?vperm	$out0,$out0,$out0,$leperm
3488	vmr		$twk0,$twk5		# unused tweak
3489	vxor		$twk1,$tweak,$rndkey0
3490	le?vperm	$out1,$out1,$out1,$leperm
3491	stvx_u		$out0,$x00,$out		# store output
3492	vxor		$out0,$in0,$twk1
3493	le?vperm	$out2,$out2,$out2,$leperm
3494	stvx_u		$out1,$x10,$out
3495	le?vperm	$out3,$out3,$out3,$leperm
3496	stvx_u		$out2,$x20,$out
3497	le?vperm	$out4,$out4,$out4,$leperm
3498	stvx_u		$out3,$x30,$out
3499	stvx_u		$out4,$x40,$out
3500	addi		$out,$out,0x50
3501	bne		Lxts_dec6x_steal
3502	b		Lxts_dec6x_done
3503
3504.align	4
3505Lxts_dec6x_four:
3506	vxor		$out0,$in2,$twk0
3507	vxor		$out1,$in3,$twk1
3508	vxor		$out2,$in4,$twk2
3509	vxor		$out3,$in5,$twk3
3510	vxor		$out4,$out4,$out4
3511
3512	bl		_aesp8_xts_dec5x
3513
3514	le?vperm	$out0,$out0,$out0,$leperm
3515	vmr		$twk0,$twk4		# unused tweak
3516	vmr		$twk1,$twk5
3517	le?vperm	$out1,$out1,$out1,$leperm
3518	stvx_u		$out0,$x00,$out		# store output
3519	vxor		$out0,$in0,$twk5
3520	le?vperm	$out2,$out2,$out2,$leperm
3521	stvx_u		$out1,$x10,$out
3522	le?vperm	$out3,$out3,$out3,$leperm
3523	stvx_u		$out2,$x20,$out
3524	stvx_u		$out3,$x30,$out
3525	addi		$out,$out,0x40
3526	bne		Lxts_dec6x_steal
3527	b		Lxts_dec6x_done
3528
3529.align	4
3530Lxts_dec6x_three:
3531	vxor		$out0,$in3,$twk0
3532	vxor		$out1,$in4,$twk1
3533	vxor		$out2,$in5,$twk2
3534	vxor		$out3,$out3,$out3
3535	vxor		$out4,$out4,$out4
3536
3537	bl		_aesp8_xts_dec5x
3538
3539	le?vperm	$out0,$out0,$out0,$leperm
3540	vmr		$twk0,$twk3		# unused tweak
3541	vmr		$twk1,$twk4
3542	le?vperm	$out1,$out1,$out1,$leperm
3543	stvx_u		$out0,$x00,$out		# store output
3544	vxor		$out0,$in0,$twk4
3545	le?vperm	$out2,$out2,$out2,$leperm
3546	stvx_u		$out1,$x10,$out
3547	stvx_u		$out2,$x20,$out
3548	addi		$out,$out,0x30
3549	bne		Lxts_dec6x_steal
3550	b		Lxts_dec6x_done
3551
3552.align	4
3553Lxts_dec6x_two:
3554	vxor		$out0,$in4,$twk0
3555	vxor		$out1,$in5,$twk1
3556	vxor		$out2,$out2,$out2
3557	vxor		$out3,$out3,$out3
3558	vxor		$out4,$out4,$out4
3559
3560	bl		_aesp8_xts_dec5x
3561
3562	le?vperm	$out0,$out0,$out0,$leperm
3563	vmr		$twk0,$twk2		# unused tweak
3564	vmr		$twk1,$twk3
3565	le?vperm	$out1,$out1,$out1,$leperm
3566	stvx_u		$out0,$x00,$out		# store output
3567	vxor		$out0,$in0,$twk3
3568	stvx_u		$out1,$x10,$out
3569	addi		$out,$out,0x20
3570	bne		Lxts_dec6x_steal
3571	b		Lxts_dec6x_done
3572
3573.align	4
3574Lxts_dec6x_one:
3575	vxor		$out0,$in5,$twk0
3576	nop
3577Loop_xts_dec1x:
3578	vncipher	$out0,$out0,v24
3579	lvx		v24,$x20,$key_		# round[3]
3580	addi		$key_,$key_,0x20
3581
3582	vncipher	$out0,$out0,v25
3583	lvx		v25,$x10,$key_		# round[4]
3584	bdnz		Loop_xts_dec1x
3585
3586	subi		r0,$taillen,1
3587	vncipher	$out0,$out0,v24
3588
3589	andi.		r0,r0,16
3590	cmpwi		$taillen,0
3591	vncipher	$out0,$out0,v25
3592
3593	sub		$inp,$inp,r0
3594	vncipher	$out0,$out0,v26
3595
3596	lvx_u		$in0,0,$inp
3597	vncipher	$out0,$out0,v27
3598
3599	addi		$key_,$sp,$FRAME+15	# rewind $key_
3600	vncipher	$out0,$out0,v28
3601	lvx		v24,$x00,$key_		# re-pre-load round[1]
3602
3603	vncipher	$out0,$out0,v29
3604	lvx		v25,$x10,$key_		# re-pre-load round[2]
3605	 vxor		$twk0,$twk0,v31
3606
3607	le?vperm	$in0,$in0,$in0,$leperm
3608	vncipher	$out0,$out0,v30
3609
3610	mtctr		$rounds
3611	vncipherlast	$out0,$out0,$twk0
3612
3613	vmr		$twk0,$twk1		# unused tweak
3614	vmr		$twk1,$twk2
3615	le?vperm	$out0,$out0,$out0,$leperm
3616	stvx_u		$out0,$x00,$out		# store output
3617	addi		$out,$out,0x10
3618	vxor		$out0,$in0,$twk2
3619	bne		Lxts_dec6x_steal
3620	b		Lxts_dec6x_done
3621
3622.align	4
3623Lxts_dec6x_zero:
3624	cmpwi		$taillen,0
3625	beq		Lxts_dec6x_done
3626
3627	lvx_u		$in0,0,$inp
3628	le?vperm	$in0,$in0,$in0,$leperm
3629	vxor		$out0,$in0,$twk1
3630Lxts_dec6x_steal:
3631	vncipher	$out0,$out0,v24
3632	lvx		v24,$x20,$key_		# round[3]
3633	addi		$key_,$key_,0x20
3634
3635	vncipher	$out0,$out0,v25
3636	lvx		v25,$x10,$key_		# round[4]
3637	bdnz		Lxts_dec6x_steal
3638
3639	add		$inp,$inp,$taillen
3640	vncipher	$out0,$out0,v24
3641
3642	cmpwi		$taillen,0
3643	vncipher	$out0,$out0,v25
3644
3645	lvx_u		$in0,0,$inp
3646	vncipher	$out0,$out0,v26
3647
3648	lvsr		$inpperm,0,$taillen	# $in5 is no more
3649	vncipher	$out0,$out0,v27
3650
3651	addi		$key_,$sp,$FRAME+15	# rewind $key_
3652	vncipher	$out0,$out0,v28
3653	lvx		v24,$x00,$key_		# re-pre-load round[1]
3654
3655	vncipher	$out0,$out0,v29
3656	lvx		v25,$x10,$key_		# re-pre-load round[2]
3657	 vxor		$twk1,$twk1,v31
3658
3659	le?vperm	$in0,$in0,$in0,$leperm
3660	vncipher	$out0,$out0,v30
3661
3662	vperm		$in0,$in0,$in0,$inpperm
3663	vncipherlast	$tmp,$out0,$twk1
3664
3665	le?vperm	$out0,$tmp,$tmp,$leperm
3666	le?stvx_u	$out0,0,$out
3667	be?stvx_u	$tmp,0,$out
3668
3669	vxor		$out0,$out0,$out0
3670	vspltisb	$out1,-1
3671	vperm		$out0,$out0,$out1,$inpperm
3672	vsel		$out0,$in0,$tmp,$out0
3673	vxor		$out0,$out0,$twk0
3674
3675	subi		r30,$out,1
3676	mtctr		$taillen
3677Loop_xts_dec6x_steal:
3678	lbzu		r0,1(r30)
3679	stb		r0,16(r30)
3680	bdnz		Loop_xts_dec6x_steal
3681
3682	li		$taillen,0
3683	mtctr		$rounds
3684	b		Loop_xts_dec1x		# one more time...
3685
3686.align	4
3687Lxts_dec6x_done:
3688	${UCMP}i	$ivp,0
3689	beq		Lxts_dec6x_ret
3690
3691	vxor		$tweak,$twk0,$rndkey0
3692	le?vperm	$tweak,$tweak,$tweak,$leperm
3693	stvx_u		$tweak,0,$ivp
3694
3695Lxts_dec6x_ret:
3696	mtlr		r11
3697	li		r10,`$FRAME+15`
3698	li		r11,`$FRAME+31`
3699	stvx		$seven,r10,$sp		# wipe copies of round keys
3700	addi		r10,r10,32
3701	stvx		$seven,r11,$sp
3702	addi		r11,r11,32
3703	stvx		$seven,r10,$sp
3704	addi		r10,r10,32
3705	stvx		$seven,r11,$sp
3706	addi		r11,r11,32
3707	stvx		$seven,r10,$sp
3708	addi		r10,r10,32
3709	stvx		$seven,r11,$sp
3710	addi		r11,r11,32
3711	stvx		$seven,r10,$sp
3712	addi		r10,r10,32
3713	stvx		$seven,r11,$sp
3714	addi		r11,r11,32
3715
3716	mtspr		256,$vrsave
3717	lvx		v20,r10,$sp		# ABI says so
3718	addi		r10,r10,32
3719	lvx		v21,r11,$sp
3720	addi		r11,r11,32
3721	lvx		v22,r10,$sp
3722	addi		r10,r10,32
3723	lvx		v23,r11,$sp
3724	addi		r11,r11,32
3725	lvx		v24,r10,$sp
3726	addi		r10,r10,32
3727	lvx		v25,r11,$sp
3728	addi		r11,r11,32
3729	lvx		v26,r10,$sp
3730	addi		r10,r10,32
3731	lvx		v27,r11,$sp
3732	addi		r11,r11,32
3733	lvx		v28,r10,$sp
3734	addi		r10,r10,32
3735	lvx		v29,r11,$sp
3736	addi		r11,r11,32
3737	lvx		v30,r10,$sp
3738	lvx		v31,r11,$sp
3739	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
3740	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
3741	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
3742	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
3743	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
3744	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
3745	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
3746	blr
3747	.long		0
3748	.byte		0,12,0x04,1,0x80,6,6,0
3749	.long		0
3750
3751.align	5
3752_aesp8_xts_dec5x:
3753	vncipher	$out0,$out0,v24
3754	vncipher	$out1,$out1,v24
3755	vncipher	$out2,$out2,v24
3756	vncipher	$out3,$out3,v24
3757	vncipher	$out4,$out4,v24
3758	lvx		v24,$x20,$key_		# round[3]
3759	addi		$key_,$key_,0x20
3760
3761	vncipher	$out0,$out0,v25
3762	vncipher	$out1,$out1,v25
3763	vncipher	$out2,$out2,v25
3764	vncipher	$out3,$out3,v25
3765	vncipher	$out4,$out4,v25
3766	lvx		v25,$x10,$key_		# round[4]
3767	bdnz		_aesp8_xts_dec5x
3768
3769	subi		r0,$taillen,1
3770	vncipher	$out0,$out0,v24
3771	vncipher	$out1,$out1,v24
3772	vncipher	$out2,$out2,v24
3773	vncipher	$out3,$out3,v24
3774	vncipher	$out4,$out4,v24
3775
3776	andi.		r0,r0,16
3777	cmpwi		$taillen,0
3778	vncipher	$out0,$out0,v25
3779	vncipher	$out1,$out1,v25
3780	vncipher	$out2,$out2,v25
3781	vncipher	$out3,$out3,v25
3782	vncipher	$out4,$out4,v25
3783	 vxor		$twk0,$twk0,v31
3784
3785	sub		$inp,$inp,r0
3786	vncipher	$out0,$out0,v26
3787	vncipher	$out1,$out1,v26
3788	vncipher	$out2,$out2,v26
3789	vncipher	$out3,$out3,v26
3790	vncipher	$out4,$out4,v26
3791	 vxor		$in1,$twk1,v31
3792
3793	vncipher	$out0,$out0,v27
3794	lvx_u		$in0,0,$inp
3795	vncipher	$out1,$out1,v27
3796	vncipher	$out2,$out2,v27
3797	vncipher	$out3,$out3,v27
3798	vncipher	$out4,$out4,v27
3799	 vxor		$in2,$twk2,v31
3800
3801	addi		$key_,$sp,$FRAME+15	# rewind $key_
3802	vncipher	$out0,$out0,v28
3803	vncipher	$out1,$out1,v28
3804	vncipher	$out2,$out2,v28
3805	vncipher	$out3,$out3,v28
3806	vncipher	$out4,$out4,v28
3807	lvx		v24,$x00,$key_		# re-pre-load round[1]
3808	 vxor		$in3,$twk3,v31
3809
3810	vncipher	$out0,$out0,v29
3811	le?vperm	$in0,$in0,$in0,$leperm
3812	vncipher	$out1,$out1,v29
3813	vncipher	$out2,$out2,v29
3814	vncipher	$out3,$out3,v29
3815	vncipher	$out4,$out4,v29
3816	lvx		v25,$x10,$key_		# re-pre-load round[2]
3817	 vxor		$in4,$twk4,v31
3818
3819	vncipher	$out0,$out0,v30
3820	vncipher	$out1,$out1,v30
3821	vncipher	$out2,$out2,v30
3822	vncipher	$out3,$out3,v30
3823	vncipher	$out4,$out4,v30
3824
3825	vncipherlast	$out0,$out0,$twk0
3826	vncipherlast	$out1,$out1,$in1
3827	vncipherlast	$out2,$out2,$in2
3828	vncipherlast	$out3,$out3,$in3
3829	vncipherlast	$out4,$out4,$in4
3830	mtctr		$rounds
3831	blr
3832        .long   	0
3833        .byte   	0,12,0x14,0,0,0,0,0
3834___
3835}}	}}}
3836
3837my $consts=1;
3838foreach(split("\n",$code)) {
3839        s/\`([^\`]*)\`/eval($1)/geo;
3840
3841	# constants table endian-specific conversion
3842	if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) {
3843	    my $conv=$3;
3844	    my @bytes=();
3845
3846	    # convert to endian-agnostic format
3847	    if ($1 eq "long") {
3848	      foreach (split(/,\s*/,$2)) {
3849		my $l = /^0/?oct:int;
3850		push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
3851	      }
3852	    } else {
3853		@bytes = map(/^0/?oct:int,split(/,\s*/,$2));
3854	    }
3855
3856	    # little-endian conversion
3857	    if ($flavour =~ /le$/o) {
3858		SWITCH: for($conv)  {
3859		    /\?inv/ && do   { @bytes=map($_^0xf,@bytes); last; };
3860		    /\?rev/ && do   { @bytes=reverse(@bytes);    last; };
3861		}
3862	    }
3863
3864	    #emit
3865	    print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
3866	    next;
3867	}
3868	$consts=0 if (m/Lconsts:/o);	# end of table
3869
3870	# instructions prefixed with '?' are endian-specific and need
3871	# to be adjusted accordingly...
3872	if ($flavour =~ /le$/o) {	# little-endian
3873	    s/le\?//o		or
3874	    s/be\?/#be#/o	or
3875	    s/\?lvsr/lvsl/o	or
3876	    s/\?lvsl/lvsr/o	or
3877	    s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
3878	    s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
3879	    s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
3880	} else {			# big-endian
3881	    s/le\?/#le#/o	or
3882	    s/be\?//o		or
3883	    s/\?([a-z]+)/$1/o;
3884	}
3885
3886        print $_,"\n";
3887}
3888
3889close STDOUT;
3890