aes-s390x.pl revision 312826
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# AES for s390x.
11
12# April 2007.
13#
14# Software performance improvement over gcc-generated code is ~70% and
15# in absolute terms is ~73 cycles per byte processed with 128-bit key.
16# You're likely to exclaim "why so slow?" Keep in mind that z-CPUs are
17# *strictly* in-order execution and issued instruction [in this case
18# load value from memory is critical] has to complete before execution
19# flow proceeds. S-boxes are compressed to 2KB[+256B].
20#
21# As for hardware acceleration support. It's basically a "teaser," as
22# it can and should be improved in several ways. Most notably support
23# for CBC is not utilized, nor multiple blocks are ever processed.
24# Then software key schedule can be postponed till hardware support
25# detection... Performance improvement over assembler is reportedly
26# ~2.5x, but can reach >8x [naturally on larger chunks] if proper
27# support is implemented.
28
29# May 2007.
30#
31# Implement AES_set_[en|de]crypt_key. Key schedule setup is avoided
32# for 128-bit keys, if hardware support is detected.
33
# January 2009.
35#
36# Add support for hardware AES192/256 and reschedule instructions to
37# minimize/avoid Address Generation Interlock hazard and to favour
38# dual-issue z10 pipeline. This gave ~25% improvement on z10 and
39# almost 50% on z9. The gain is smaller on z10, because being dual-
# issue z10 makes it impossible to eliminate the interlock condition:
# critical path is not long enough. Yet it spends ~24 cycles per byte
42# processed with 128-bit key.
43#
44# Unlike previous version hardware support detection takes place only
45# at the moment of key schedule setup, which is denoted in key->rounds.
46# This is done, because deferred key setup can't be made MT-safe, not
47# for keys longer than 128 bits.
48#
49# Add AES_cbc_encrypt, which gives incredible performance improvement,
50# it was measured to be ~6.6x. It's less than previously mentioned 8x,
51# because software implementation was optimized.
52
53# May 2010.
54#
55# Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x
56# performance improvement over "generic" counter mode routine relying
57# on single-block, also hardware-assisted, AES_encrypt. "Up to" refers
58# to the fact that exact throughput value depends on current stack
59# frame alignment within 4KB page. In worst case you get ~75% of the
60# maximum, but *on average* it would be as much as ~98%. Meaning that
# worst case is unlikely, it's like hitting a ravine on a plateau.
62
63# November 2010.
64#
65# Adapt for -m31 build. If kernel supports what's called "highgprs"
66# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
67# instructions and achieve "64-bit" performance even in 31-bit legacy
68# application context. The feature is not specific to any particular
69# processor, as long as it's "z-CPU". Latter implies that the code
70# remains z/Architecture specific. On z990 it was measured to perform
71# 2x better than code generated by gcc 4.3.
72
73# December 2010.
74#
75# Add support for z196 "cipher message with counter" instruction.
76# Note however that it's disengaged, because it was measured to
77# perform ~12% worse than vanilla km-based code...
78
79# February 2011.
80#
81# Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes
82# instructions, which deliver ~70% improvement at 8KB block size over
83# vanilla km-based code, 37% - at most like 512-bytes block size.
84
$flavour = shift;

# Derive ABI parameters from the build flavour: 31/32-bit builds use a
# 4-byte size_t and the plain instruction mnemonics, while 64-bit builds
# use an 8-byte size_t and the "g" (64-bit operand) mnemonic suffix.
($SIZE_T, $g) = ($flavour =~ /3[12]/) ? (4, "") : (8, "g");
94
# Scan the remaining arguments for the first one that looks like an
# output file name (e.g. "aes-s390x.S") and redirect STDOUT to it; if
# none matches, keep writing the generated code to the existing STDOUT.
# Use 3-arg open (immune to mode injection via the file name) and fail
# loudly instead of silently discarding all output on open failure.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
if (defined $output) {
	open STDOUT,">",$output or die "can't open $output: $!";
}
97
$softonly=0;	# allow hardware support (set to 1 to force the software path)

# Register allocation.  Aliases deliberately share registers: the
# temporaries overlap argument registers whose incoming values are
# consumed early or spilled to the stack before the temporaries are used.
$t0="%r0";	$mask="%r0";	# scratch / table-index byte mask
$t1="%r1";
$t2="%r2";	$inp="%r2";	# scratch / arg: input block pointer
$t3="%r3";	$out="%r3";	$bits="%r3";	# scratch / arg: output ptr or key bits
$key="%r4";	# arg: AES_KEY pointer (round count lives at 240($key))
$i1="%r5";	# table-index temporaries
$i2="%r6";
$i3="%r7";
$s0="%r8";	# the four 32-bit AES state words
$s1="%r9";
$s2="%r10";
$s3="%r11";
$tbl="%r12";	# lookup-table base (AES_Te or AES_Td)
$rounds="%r13";	# round counter
$ra="%r14";	# return address, also reused as extra scratch in the loops
$sp="%r15";	# stack pointer

# Standard frame: 16 pointer-sized save slots plus 4 x 8-byte FPR slots
# (160 bytes on 64-bit — the s390x ABI register save area layout).
$stdframe=16*$SIZE_T+4*8;
118
# Append one table entry per argument to $code: each 32-bit value is
# emitted twice (".long v,v") so that the 8-byte-stride lookups in the
# AES round code can read byte-rotated views of the same word through
# small displacements (0..3) off a single shifted index.
# The original empty prototype "()" did nothing useful — every call
# site uses the prototype-bypassing &_data_word(...) form — so it has
# been dropped (prototypes are not argument checks in Perl).
sub _data_word
{ my $i;
    while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
}
123
124$code=<<___;
125.text
126
127.type	AES_Te,\@object
128.align	256
129AES_Te:
130___
131&_data_word(
132	0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
133	0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
134	0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
135	0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
136	0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
137	0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
138	0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
139	0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
140	0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
141	0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
142	0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
143	0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
144	0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
145	0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
146	0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
147	0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
148	0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
149	0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
150	0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
151	0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
152	0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
153	0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
154	0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
155	0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
156	0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
157	0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
158	0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
159	0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
160	0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
161	0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
162	0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
163	0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
164	0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
165	0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
166	0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
167	0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
168	0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
169	0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
170	0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
171	0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
172	0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
173	0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
174	0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
175	0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
176	0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
177	0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
178	0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
179	0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
180	0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
181	0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
182	0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
183	0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
184	0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
185	0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
186	0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
187	0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
188	0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
189	0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
190	0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
191	0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
192	0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
193	0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
194	0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
195	0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
196$code.=<<___;
197# Te4[256]
198.byte	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
199.byte	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
200.byte	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
201.byte	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
202.byte	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
203.byte	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
204.byte	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
205.byte	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
206.byte	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
207.byte	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
208.byte	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
209.byte	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
210.byte	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
211.byte	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
212.byte	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
213.byte	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
214.byte	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
215.byte	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
216.byte	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
217.byte	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
218.byte	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
219.byte	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
220.byte	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
221.byte	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
222.byte	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
223.byte	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
224.byte	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
225.byte	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
226.byte	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
227.byte	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
228.byte	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
229.byte	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
230# rcon[]
231.long	0x01000000, 0x02000000, 0x04000000, 0x08000000
232.long	0x10000000, 0x20000000, 0x40000000, 0x80000000
233.long	0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
234.align	256
235.size	AES_Te,.-AES_Te
236
237# void AES_encrypt(const unsigned char *inp, unsigned char *out,
238# 		 const AES_KEY *key) {
239.globl	AES_encrypt
240.type	AES_encrypt,\@function
241AES_encrypt:
242___
243$code.=<<___ if (!$softonly);
244	l	%r0,240($key)
245	lhi	%r1,16
246	clr	%r0,%r1
247	jl	.Lesoft
248
249	la	%r1,0($key)
250	#la	%r2,0($inp)
251	la	%r4,0($out)
252	lghi	%r3,16		# single block length
253	.long	0xb92e0042	# km %r4,%r2
254	brc	1,.-4		# can this happen?
255	br	%r14
256.align	64
257.Lesoft:
258___
259$code.=<<___;
260	stm${g}	%r3,$ra,3*$SIZE_T($sp)
261
262	llgf	$s0,0($inp)
263	llgf	$s1,4($inp)
264	llgf	$s2,8($inp)
265	llgf	$s3,12($inp)
266
267	larl	$tbl,AES_Te
268	bras	$ra,_s390x_AES_encrypt
269
270	l${g}	$out,3*$SIZE_T($sp)
271	st	$s0,0($out)
272	st	$s1,4($out)
273	st	$s2,8($out)
274	st	$s3,12($out)
275
276	lm${g}	%r6,$ra,6*$SIZE_T($sp)
277	br	$ra
278.size	AES_encrypt,.-AES_encrypt
279
280.type   _s390x_AES_encrypt,\@function
281.align	16
282_s390x_AES_encrypt:
283	st${g}	$ra,15*$SIZE_T($sp)
284	x	$s0,0($key)
285	x	$s1,4($key)
286	x	$s2,8($key)
287	x	$s3,12($key)
288	l	$rounds,240($key)
289	llill	$mask,`0xff<<3`
290	aghi	$rounds,-1
291	j	.Lenc_loop
292.align	16
293.Lenc_loop:
294	sllg	$t1,$s0,`0+3`
295	srlg	$t2,$s0,`8-3`
296	srlg	$t3,$s0,`16-3`
297	srl	$s0,`24-3`
298	nr	$s0,$mask
299	ngr	$t1,$mask
300	nr	$t2,$mask
301	nr	$t3,$mask
302
303	srlg	$i1,$s1,`16-3`	# i0
304	sllg	$i2,$s1,`0+3`
305	srlg	$i3,$s1,`8-3`
306	srl	$s1,`24-3`
307	nr	$i1,$mask
308	nr	$s1,$mask
309	ngr	$i2,$mask
310	nr	$i3,$mask
311
312	l	$s0,0($s0,$tbl)	# Te0[s0>>24]
313	l	$t1,1($t1,$tbl)	# Te3[s0>>0]
314	l	$t2,2($t2,$tbl) # Te2[s0>>8]
315	l	$t3,3($t3,$tbl)	# Te1[s0>>16]
316
317	x	$s0,3($i1,$tbl)	# Te1[s1>>16]
318	l	$s1,0($s1,$tbl)	# Te0[s1>>24]
319	x	$t2,1($i2,$tbl)	# Te3[s1>>0]
320	x	$t3,2($i3,$tbl)	# Te2[s1>>8]
321
322	srlg	$i1,$s2,`8-3`	# i0
323	srlg	$i2,$s2,`16-3`	# i1
324	nr	$i1,$mask
325	nr	$i2,$mask
326	sllg	$i3,$s2,`0+3`
327	srl	$s2,`24-3`
328	nr	$s2,$mask
329	ngr	$i3,$mask
330
331	xr	$s1,$t1
332	srlg	$ra,$s3,`8-3`	# i1
333	sllg	$t1,$s3,`0+3`	# i0
334	nr	$ra,$mask
335	la	$key,16($key)
336	ngr	$t1,$mask
337
338	x	$s0,2($i1,$tbl)	# Te2[s2>>8]
339	x	$s1,3($i2,$tbl)	# Te1[s2>>16]
340	l	$s2,0($s2,$tbl)	# Te0[s2>>24]
341	x	$t3,1($i3,$tbl)	# Te3[s2>>0]
342
343	srlg	$i3,$s3,`16-3`	# i2
344	xr	$s2,$t2
345	srl	$s3,`24-3`
346	nr	$i3,$mask
347	nr	$s3,$mask
348
349	x	$s0,0($key)
350	x	$s1,4($key)
351	x	$s2,8($key)
352	x	$t3,12($key)
353
354	x	$s0,1($t1,$tbl)	# Te3[s3>>0]
355	x	$s1,2($ra,$tbl)	# Te2[s3>>8]
356	x	$s2,3($i3,$tbl)	# Te1[s3>>16]
357	l	$s3,0($s3,$tbl)	# Te0[s3>>24]
358	xr	$s3,$t3
359
360	brct	$rounds,.Lenc_loop
361	.align	16
362
363	sllg	$t1,$s0,`0+3`
364	srlg	$t2,$s0,`8-3`
365	ngr	$t1,$mask
366	srlg	$t3,$s0,`16-3`
367	srl	$s0,`24-3`
368	nr	$s0,$mask
369	nr	$t2,$mask
370	nr	$t3,$mask
371
372	srlg	$i1,$s1,`16-3`	# i0
373	sllg	$i2,$s1,`0+3`
374	ngr	$i2,$mask
375	srlg	$i3,$s1,`8-3`
376	srl	$s1,`24-3`
377	nr	$i1,$mask
378	nr	$s1,$mask
379	nr	$i3,$mask
380
381	llgc	$s0,2($s0,$tbl)	# Te4[s0>>24]
382	llgc	$t1,2($t1,$tbl)	# Te4[s0>>0]
383	sll	$s0,24
384	llgc	$t2,2($t2,$tbl)	# Te4[s0>>8]
385	llgc	$t3,2($t3,$tbl)	# Te4[s0>>16]
386	sll	$t2,8
387	sll	$t3,16
388
389	llgc	$i1,2($i1,$tbl)	# Te4[s1>>16]
390	llgc	$s1,2($s1,$tbl)	# Te4[s1>>24]
391	llgc	$i2,2($i2,$tbl)	# Te4[s1>>0]
392	llgc	$i3,2($i3,$tbl)	# Te4[s1>>8]
393	sll	$i1,16
394	sll	$s1,24
395	sll	$i3,8
396	or	$s0,$i1
397	or	$s1,$t1
398	or	$t2,$i2
399	or	$t3,$i3
400
401	srlg	$i1,$s2,`8-3`	# i0
402	srlg	$i2,$s2,`16-3`	# i1
403	nr	$i1,$mask
404	nr	$i2,$mask
405	sllg	$i3,$s2,`0+3`
406	srl	$s2,`24-3`
407	ngr	$i3,$mask
408	nr	$s2,$mask
409
410	sllg	$t1,$s3,`0+3`	# i0
411	srlg	$ra,$s3,`8-3`	# i1
412	ngr	$t1,$mask
413
414	llgc	$i1,2($i1,$tbl)	# Te4[s2>>8]
415	llgc	$i2,2($i2,$tbl)	# Te4[s2>>16]
416	sll	$i1,8
417	llgc	$s2,2($s2,$tbl)	# Te4[s2>>24]
418	llgc	$i3,2($i3,$tbl)	# Te4[s2>>0]
419	sll	$i2,16
420	nr	$ra,$mask
421	sll	$s2,24
422	or	$s0,$i1
423	or	$s1,$i2
424	or	$s2,$t2
425	or	$t3,$i3
426
427	srlg	$i3,$s3,`16-3`	# i2
428	srl	$s3,`24-3`
429	nr	$i3,$mask
430	nr	$s3,$mask
431
432	l	$t0,16($key)
433	l	$t2,20($key)
434
435	llgc	$i1,2($t1,$tbl)	# Te4[s3>>0]
436	llgc	$i2,2($ra,$tbl)	# Te4[s3>>8]
437	llgc	$i3,2($i3,$tbl)	# Te4[s3>>16]
438	llgc	$s3,2($s3,$tbl)	# Te4[s3>>24]
439	sll	$i2,8
440	sll	$i3,16
441	sll	$s3,24
442	or	$s0,$i1
443	or	$s1,$i2
444	or	$s2,$i3
445	or	$s3,$t3
446
447	l${g}	$ra,15*$SIZE_T($sp)
448	xr	$s0,$t0
449	xr	$s1,$t2
450	x	$s2,24($key)
451	x	$s3,28($key)
452
453	br	$ra
454.size	_s390x_AES_encrypt,.-_s390x_AES_encrypt
455___
456
457$code.=<<___;
458.type	AES_Td,\@object
459.align	256
460AES_Td:
461___
462&_data_word(
463	0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
464	0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
465	0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
466	0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
467	0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
468	0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
469	0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
470	0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
471	0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
472	0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
473	0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
474	0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
475	0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
476	0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
477	0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
478	0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
479	0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
480	0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
481	0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
482	0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
483	0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
484	0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
485	0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
486	0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
487	0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
488	0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
489	0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
490	0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
491	0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
492	0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
493	0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
494	0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
495	0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
496	0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
497	0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
498	0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
499	0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
500	0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
501	0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
502	0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
503	0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
504	0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
505	0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
506	0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
507	0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
508	0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
509	0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
510	0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
511	0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
512	0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
513	0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
514	0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
515	0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
516	0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
517	0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
518	0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
519	0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
520	0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
521	0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
522	0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
523	0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
524	0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
525	0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
526	0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
527$code.=<<___;
528# Td4[256]
529.byte	0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
530.byte	0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
531.byte	0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
532.byte	0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
533.byte	0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
534.byte	0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
535.byte	0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
536.byte	0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
537.byte	0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
538.byte	0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
539.byte	0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
540.byte	0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
541.byte	0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
542.byte	0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
543.byte	0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
544.byte	0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
545.byte	0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
546.byte	0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
547.byte	0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
548.byte	0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
549.byte	0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
550.byte	0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
551.byte	0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
552.byte	0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
553.byte	0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
554.byte	0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
555.byte	0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
556.byte	0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
557.byte	0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
558.byte	0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
559.byte	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
560.byte	0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
561.size	AES_Td,.-AES_Td
562
563# void AES_decrypt(const unsigned char *inp, unsigned char *out,
564# 		 const AES_KEY *key) {
565.globl	AES_decrypt
566.type	AES_decrypt,\@function
567AES_decrypt:
568___
569$code.=<<___ if (!$softonly);
570	l	%r0,240($key)
571	lhi	%r1,16
572	clr	%r0,%r1
573	jl	.Ldsoft
574
575	la	%r1,0($key)
576	#la	%r2,0($inp)
577	la	%r4,0($out)
578	lghi	%r3,16		# single block length
579	.long	0xb92e0042	# km %r4,%r2
580	brc	1,.-4		# can this happen?
581	br	%r14
582.align	64
583.Ldsoft:
584___
585$code.=<<___;
586	stm${g}	%r3,$ra,3*$SIZE_T($sp)
587
588	llgf	$s0,0($inp)
589	llgf	$s1,4($inp)
590	llgf	$s2,8($inp)
591	llgf	$s3,12($inp)
592
593	larl	$tbl,AES_Td
594	bras	$ra,_s390x_AES_decrypt
595
596	l${g}	$out,3*$SIZE_T($sp)
597	st	$s0,0($out)
598	st	$s1,4($out)
599	st	$s2,8($out)
600	st	$s3,12($out)
601
602	lm${g}	%r6,$ra,6*$SIZE_T($sp)
603	br	$ra
604.size	AES_decrypt,.-AES_decrypt
605
606.type   _s390x_AES_decrypt,\@function
607.align	16
608_s390x_AES_decrypt:
609	st${g}	$ra,15*$SIZE_T($sp)
610	x	$s0,0($key)
611	x	$s1,4($key)
612	x	$s2,8($key)
613	x	$s3,12($key)
614	l	$rounds,240($key)
615	llill	$mask,`0xff<<3`
616	aghi	$rounds,-1
617	j	.Ldec_loop
618.align	16
619.Ldec_loop:
620	srlg	$t1,$s0,`16-3`
621	srlg	$t2,$s0,`8-3`
622	sllg	$t3,$s0,`0+3`
623	srl	$s0,`24-3`
624	nr	$s0,$mask
625	nr	$t1,$mask
626	nr	$t2,$mask
627	ngr	$t3,$mask
628
629	sllg	$i1,$s1,`0+3`	# i0
630	srlg	$i2,$s1,`16-3`
631	srlg	$i3,$s1,`8-3`
632	srl	$s1,`24-3`
633	ngr	$i1,$mask
634	nr	$s1,$mask
635	nr	$i2,$mask
636	nr	$i3,$mask
637
638	l	$s0,0($s0,$tbl)	# Td0[s0>>24]
639	l	$t1,3($t1,$tbl)	# Td1[s0>>16]
640	l	$t2,2($t2,$tbl)	# Td2[s0>>8]
641	l	$t3,1($t3,$tbl)	# Td3[s0>>0]
642
643	x	$s0,1($i1,$tbl)	# Td3[s1>>0]
644	l	$s1,0($s1,$tbl)	# Td0[s1>>24]
645	x	$t2,3($i2,$tbl)	# Td1[s1>>16]
646	x	$t3,2($i3,$tbl)	# Td2[s1>>8]
647
648	srlg	$i1,$s2,`8-3`	# i0
649	sllg	$i2,$s2,`0+3`	# i1
650	srlg	$i3,$s2,`16-3`
651	srl	$s2,`24-3`
652	nr	$i1,$mask
653	ngr	$i2,$mask
654	nr	$s2,$mask
655	nr	$i3,$mask
656
657	xr	$s1,$t1
658	srlg	$ra,$s3,`8-3`	# i1
659	srlg	$t1,$s3,`16-3`	# i0
660	nr	$ra,$mask
661	la	$key,16($key)
662	nr	$t1,$mask
663
664	x	$s0,2($i1,$tbl)	# Td2[s2>>8]
665	x	$s1,1($i2,$tbl)	# Td3[s2>>0]
666	l	$s2,0($s2,$tbl)	# Td0[s2>>24]
667	x	$t3,3($i3,$tbl)	# Td1[s2>>16]
668
669	sllg	$i3,$s3,`0+3`	# i2
670	srl	$s3,`24-3`
671	ngr	$i3,$mask
672	nr	$s3,$mask
673
674	xr	$s2,$t2
675	x	$s0,0($key)
676	x	$s1,4($key)
677	x	$s2,8($key)
678	x	$t3,12($key)
679
680	x	$s0,3($t1,$tbl)	# Td1[s3>>16]
681	x	$s1,2($ra,$tbl)	# Td2[s3>>8]
682	x	$s2,1($i3,$tbl)	# Td3[s3>>0]
683	l	$s3,0($s3,$tbl)	# Td0[s3>>24]
684	xr	$s3,$t3
685
686	brct	$rounds,.Ldec_loop
687	.align	16
688
689	l	$t1,`2048+0`($tbl)	# prefetch Td4
690	l	$t2,`2048+64`($tbl)
691	l	$t3,`2048+128`($tbl)
692	l	$i1,`2048+192`($tbl)
693	llill	$mask,0xff
694
695	srlg	$i3,$s0,24	# i0
696	srlg	$t1,$s0,16
697	srlg	$t2,$s0,8
698	nr	$s0,$mask	# i3
699	nr	$t1,$mask
700
701	srlg	$i1,$s1,24
702	nr	$t2,$mask
703	srlg	$i2,$s1,16
704	srlg	$ra,$s1,8
705	nr	$s1,$mask	# i0
706	nr	$i2,$mask
707	nr	$ra,$mask
708
709	llgc	$i3,2048($i3,$tbl)	# Td4[s0>>24]
710	llgc	$t1,2048($t1,$tbl)	# Td4[s0>>16]
711	llgc	$t2,2048($t2,$tbl)	# Td4[s0>>8]
712	sll	$t1,16
713	llgc	$t3,2048($s0,$tbl)	# Td4[s0>>0]
714	sllg	$s0,$i3,24
715	sll	$t2,8
716
717	llgc	$s1,2048($s1,$tbl)	# Td4[s1>>0]
718	llgc	$i1,2048($i1,$tbl)	# Td4[s1>>24]
719	llgc	$i2,2048($i2,$tbl)	# Td4[s1>>16]
720	sll	$i1,24
721	llgc	$i3,2048($ra,$tbl)	# Td4[s1>>8]
722	sll	$i2,16
723	sll	$i3,8
724	or	$s0,$s1
725	or	$t1,$i1
726	or	$t2,$i2
727	or	$t3,$i3
728
729	srlg	$i1,$s2,8	# i0
730	srlg	$i2,$s2,24
731	srlg	$i3,$s2,16
732	nr	$s2,$mask	# i1
733	nr	$i1,$mask
734	nr	$i3,$mask
735	llgc	$i1,2048($i1,$tbl)	# Td4[s2>>8]
736	llgc	$s1,2048($s2,$tbl)	# Td4[s2>>0]
737	llgc	$i2,2048($i2,$tbl)	# Td4[s2>>24]
738	llgc	$i3,2048($i3,$tbl)	# Td4[s2>>16]
739	sll	$i1,8
740	sll	$i2,24
741	or	$s0,$i1
742	sll	$i3,16
743	or	$t2,$i2
744	or	$t3,$i3
745
746	srlg	$i1,$s3,16	# i0
747	srlg	$i2,$s3,8	# i1
748	srlg	$i3,$s3,24
749	nr	$s3,$mask	# i2
750	nr	$i1,$mask
751	nr	$i2,$mask
752
753	l${g}	$ra,15*$SIZE_T($sp)
754	or	$s1,$t1
755	l	$t0,16($key)
756	l	$t1,20($key)
757
758	llgc	$i1,2048($i1,$tbl)	# Td4[s3>>16]
759	llgc	$i2,2048($i2,$tbl)	# Td4[s3>>8]
760	sll	$i1,16
761	llgc	$s2,2048($s3,$tbl)	# Td4[s3>>0]
762	llgc	$s3,2048($i3,$tbl)	# Td4[s3>>24]
763	sll	$i2,8
764	sll	$s3,24
765	or	$s0,$i1
766	or	$s1,$i2
767	or	$s2,$t2
768	or	$s3,$t3
769
770	xr	$s0,$t0
771	xr	$s1,$t1
772	x	$s2,24($key)
773	x	$s3,28($key)
774
775	br	$ra
776.size	_s390x_AES_decrypt,.-_s390x_AES_decrypt
777___
778
779$code.=<<___;
780# void AES_set_encrypt_key(const unsigned char *in, int bits,
781# 		 AES_KEY *key) {
782.globl	private_AES_set_encrypt_key
783.type	private_AES_set_encrypt_key,\@function
784.align	16
785private_AES_set_encrypt_key:
786_s390x_AES_set_encrypt_key:
787	lghi	$t0,0
788	cl${g}r	$inp,$t0
789	je	.Lminus1
790	cl${g}r	$key,$t0
791	je	.Lminus1
792
793	lghi	$t0,128
794	clr	$bits,$t0
795	je	.Lproceed
796	lghi	$t0,192
797	clr	$bits,$t0
798	je	.Lproceed
799	lghi	$t0,256
800	clr	$bits,$t0
801	je	.Lproceed
802	lghi	%r2,-2
803	br	%r14
804
805.align	16
806.Lproceed:
807___
808$code.=<<___ if (!$softonly);
809	# convert bits to km code, [128,192,256]->[18,19,20]
810	lhi	%r5,-128
811	lhi	%r0,18
812	ar	%r5,$bits
813	srl	%r5,6
814	ar	%r5,%r0
815
816	larl	%r1,OPENSSL_s390xcap_P
817	lg	%r0,0(%r1)
818	tmhl	%r0,0x4000	# check for message-security assist
819	jz	.Lekey_internal
820
821	llihh	%r0,0x8000
822	srlg	%r0,%r0,0(%r5)
823	ng	%r0,48(%r1)	# check kmc capability vector
824	jz	.Lekey_internal
825
826	lmg	%r0,%r1,0($inp)	# just copy 128 bits...
827	stmg	%r0,%r1,0($key)
828	lhi	%r0,192
829	cr	$bits,%r0
830	jl	1f
831	lg	%r1,16($inp)
832	stg	%r1,16($key)
833	je	1f
834	lg	%r1,24($inp)
835	stg	%r1,24($key)
8361:	st	$bits,236($key)	# save bits [for debugging purposes]
837	lgr	$t0,%r5
838	st	%r5,240($key)	# save km code
839	lghi	%r2,0
840	br	%r14
841___
842$code.=<<___;
843.align	16
844.Lekey_internal:
845	stm${g}	%r4,%r13,4*$SIZE_T($sp)	# all non-volatile regs and $key
846
847	larl	$tbl,AES_Te+2048
848
849	llgf	$s0,0($inp)
850	llgf	$s1,4($inp)
851	llgf	$s2,8($inp)
852	llgf	$s3,12($inp)
853	st	$s0,0($key)
854	st	$s1,4($key)
855	st	$s2,8($key)
856	st	$s3,12($key)
857	lghi	$t0,128
858	cr	$bits,$t0
859	jne	.Lnot128
860
861	llill	$mask,0xff
862	lghi	$t3,0			# i=0
863	lghi	$rounds,10
864	st	$rounds,240($key)
865
866	llgfr	$t2,$s3			# temp=rk[3]
867	srlg	$i1,$s3,8
868	srlg	$i2,$s3,16
869	srlg	$i3,$s3,24
870	nr	$t2,$mask
871	nr	$i1,$mask
872	nr	$i2,$mask
873
874.align	16
875.L128_loop:
876	la	$t2,0($t2,$tbl)
877	la	$i1,0($i1,$tbl)
878	la	$i2,0($i2,$tbl)
879	la	$i3,0($i3,$tbl)
880	icm	$t2,2,0($t2)		# Te4[rk[3]>>0]<<8
881	icm	$t2,4,0($i1)		# Te4[rk[3]>>8]<<16
882	icm	$t2,8,0($i2)		# Te4[rk[3]>>16]<<24
883	icm	$t2,1,0($i3)		# Te4[rk[3]>>24]
884	x	$t2,256($t3,$tbl)	# rcon[i]
885	xr	$s0,$t2			# rk[4]=rk[0]^...
886	xr	$s1,$s0			# rk[5]=rk[1]^rk[4]
887	xr	$s2,$s1			# rk[6]=rk[2]^rk[5]
888	xr	$s3,$s2			# rk[7]=rk[3]^rk[6]
889
890	llgfr	$t2,$s3			# temp=rk[3]
891	srlg	$i1,$s3,8
892	srlg	$i2,$s3,16
893	nr	$t2,$mask
894	nr	$i1,$mask
895	srlg	$i3,$s3,24
896	nr	$i2,$mask
897
898	st	$s0,16($key)
899	st	$s1,20($key)
900	st	$s2,24($key)
901	st	$s3,28($key)
902	la	$key,16($key)		# key+=4
903	la	$t3,4($t3)		# i++
904	brct	$rounds,.L128_loop
905	lghi	$t0,10
906	lghi	%r2,0
907	lm${g}	%r4,%r13,4*$SIZE_T($sp)
908	br	$ra
909
910.align	16
911.Lnot128:
912	llgf	$t0,16($inp)
913	llgf	$t1,20($inp)
914	st	$t0,16($key)
915	st	$t1,20($key)
916	lghi	$t0,192
917	cr	$bits,$t0
918	jne	.Lnot192
919
920	llill	$mask,0xff
921	lghi	$t3,0			# i=0
922	lghi	$rounds,12
923	st	$rounds,240($key)
924	lghi	$rounds,8
925
926	srlg	$i1,$t1,8
927	srlg	$i2,$t1,16
928	srlg	$i3,$t1,24
929	nr	$t1,$mask
930	nr	$i1,$mask
931	nr	$i2,$mask
932
933.align	16
934.L192_loop:
935	la	$t1,0($t1,$tbl)
936	la	$i1,0($i1,$tbl)
937	la	$i2,0($i2,$tbl)
938	la	$i3,0($i3,$tbl)
939	icm	$t1,2,0($t1)		# Te4[rk[5]>>0]<<8
940	icm	$t1,4,0($i1)		# Te4[rk[5]>>8]<<16
941	icm	$t1,8,0($i2)		# Te4[rk[5]>>16]<<24
942	icm	$t1,1,0($i3)		# Te4[rk[5]>>24]
943	x	$t1,256($t3,$tbl)	# rcon[i]
944	xr	$s0,$t1			# rk[6]=rk[0]^...
945	xr	$s1,$s0			# rk[7]=rk[1]^rk[6]
946	xr	$s2,$s1			# rk[8]=rk[2]^rk[7]
947	xr	$s3,$s2			# rk[9]=rk[3]^rk[8]
948
949	st	$s0,24($key)
950	st	$s1,28($key)
951	st	$s2,32($key)
952	st	$s3,36($key)
953	brct	$rounds,.L192_continue
954	lghi	$t0,12
955	lghi	%r2,0
956	lm${g}	%r4,%r13,4*$SIZE_T($sp)
957	br	$ra
958
959.align	16
960.L192_continue:
961	lgr	$t1,$s3
962	x	$t1,16($key)		# rk[10]=rk[4]^rk[9]
963	st	$t1,40($key)
964	x	$t1,20($key)		# rk[11]=rk[5]^rk[10]
965	st	$t1,44($key)
966
967	srlg	$i1,$t1,8
968	srlg	$i2,$t1,16
969	srlg	$i3,$t1,24
970	nr	$t1,$mask
971	nr	$i1,$mask
972	nr	$i2,$mask
973
974	la	$key,24($key)		# key+=6
975	la	$t3,4($t3)		# i++
976	j	.L192_loop
977
978.align	16
979.Lnot192:
980	llgf	$t0,24($inp)
981	llgf	$t1,28($inp)
982	st	$t0,24($key)
983	st	$t1,28($key)
984	llill	$mask,0xff
985	lghi	$t3,0			# i=0
986	lghi	$rounds,14
987	st	$rounds,240($key)
988	lghi	$rounds,7
989
990	srlg	$i1,$t1,8
991	srlg	$i2,$t1,16
992	srlg	$i3,$t1,24
993	nr	$t1,$mask
994	nr	$i1,$mask
995	nr	$i2,$mask
996
997.align	16
998.L256_loop:
999	la	$t1,0($t1,$tbl)
1000	la	$i1,0($i1,$tbl)
1001	la	$i2,0($i2,$tbl)
1002	la	$i3,0($i3,$tbl)
1003	icm	$t1,2,0($t1)		# Te4[rk[7]>>0]<<8
1004	icm	$t1,4,0($i1)		# Te4[rk[7]>>8]<<16
1005	icm	$t1,8,0($i2)		# Te4[rk[7]>>16]<<24
1006	icm	$t1,1,0($i3)		# Te4[rk[7]>>24]
1007	x	$t1,256($t3,$tbl)	# rcon[i]
1008	xr	$s0,$t1			# rk[8]=rk[0]^...
1009	xr	$s1,$s0			# rk[9]=rk[1]^rk[8]
1010	xr	$s2,$s1			# rk[10]=rk[2]^rk[9]
1011	xr	$s3,$s2			# rk[11]=rk[3]^rk[10]
1012	st	$s0,32($key)
1013	st	$s1,36($key)
1014	st	$s2,40($key)
1015	st	$s3,44($key)
1016	brct	$rounds,.L256_continue
1017	lghi	$t0,14
1018	lghi	%r2,0
1019	lm${g}	%r4,%r13,4*$SIZE_T($sp)
1020	br	$ra
1021
1022.align	16
1023.L256_continue:
1024	lgr	$t1,$s3			# temp=rk[11]
1025	srlg	$i1,$s3,8
1026	srlg	$i2,$s3,16
1027	srlg	$i3,$s3,24
1028	nr	$t1,$mask
1029	nr	$i1,$mask
1030	nr	$i2,$mask
1031	la	$t1,0($t1,$tbl)
1032	la	$i1,0($i1,$tbl)
1033	la	$i2,0($i2,$tbl)
1034	la	$i3,0($i3,$tbl)
1035	llgc	$t1,0($t1)		# Te4[rk[11]>>0]
1036	icm	$t1,2,0($i1)		# Te4[rk[11]>>8]<<8
1037	icm	$t1,4,0($i2)		# Te4[rk[11]>>16]<<16
1038	icm	$t1,8,0($i3)		# Te4[rk[11]>>24]<<24
1039	x	$t1,16($key)		# rk[12]=rk[4]^...
1040	st	$t1,48($key)
1041	x	$t1,20($key)		# rk[13]=rk[5]^rk[12]
1042	st	$t1,52($key)
1043	x	$t1,24($key)		# rk[14]=rk[6]^rk[13]
1044	st	$t1,56($key)
1045	x	$t1,28($key)		# rk[15]=rk[7]^rk[14]
1046	st	$t1,60($key)
1047
1048	srlg	$i1,$t1,8
1049	srlg	$i2,$t1,16
1050	srlg	$i3,$t1,24
1051	nr	$t1,$mask
1052	nr	$i1,$mask
1053	nr	$i2,$mask
1054
1055	la	$key,32($key)		# key+=8
1056	la	$t3,4($t3)		# i++
1057	j	.L256_loop
1058
1059.Lminus1:
1060	lghi	%r2,-1
1061	br	$ra
1062.size	private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
1063
1064# void AES_set_decrypt_key(const unsigned char *in, int bits,
1065# 		 AES_KEY *key) {
1066.globl	private_AES_set_decrypt_key
1067.type	private_AES_set_decrypt_key,\@function
1068.align	16
1069private_AES_set_decrypt_key:
1070	#st${g}	$key,4*$SIZE_T($sp)	# I rely on AES_set_encrypt_key to
1071	st${g}	$ra,14*$SIZE_T($sp)	# save non-volatile registers and $key!
1072	bras	$ra,_s390x_AES_set_encrypt_key
1073	#l${g}	$key,4*$SIZE_T($sp)
1074	l${g}	$ra,14*$SIZE_T($sp)
1075	ltgr	%r2,%r2
1076	bnzr	$ra
1077___
1078$code.=<<___ if (!$softonly);
1079	#l	$t0,240($key)
1080	lhi	$t1,16
1081	cr	$t0,$t1
1082	jl	.Lgo
1083	oill	$t0,0x80	# set "decrypt" bit
1084	st	$t0,240($key)
1085	br	$ra
1086___
1087$code.=<<___;
1088.align	16
1089.Lgo:	lgr	$rounds,$t0	#llgf	$rounds,240($key)
1090	la	$i1,0($key)
1091	sllg	$i2,$rounds,4
1092	la	$i2,0($i2,$key)
1093	srl	$rounds,1
1094	lghi	$t1,-16
1095
1096.align	16
1097.Linv:	lmg	$s0,$s1,0($i1)
1098	lmg	$s2,$s3,0($i2)
1099	stmg	$s0,$s1,0($i2)
1100	stmg	$s2,$s3,0($i1)
1101	la	$i1,16($i1)
1102	la	$i2,0($t1,$i2)
1103	brct	$rounds,.Linv
1104___
# Reuse the index scratch registers as replicated-byte mask constants
# for the .Lmix loop below (tp2/tp4/tp8 doubling in GF(2^8)); the
# actual values are loaded via llilh/oill in the following heredoc.
$mask80=$i1;	# 0x80808080: high bit of every byte
$mask1b=$i2;	# 0x1b1b1b1b: AES reduction polynomial byte
$maskfe=$i3;	# 0xfefefefe: keeps the <<1 from crossing byte lanes
1108$code.=<<___;
1109	llgf	$rounds,240($key)
1110	aghi	$rounds,-1
1111	sll	$rounds,2	# (rounds-1)*4
1112	llilh	$mask80,0x8080
1113	llilh	$mask1b,0x1b1b
1114	llilh	$maskfe,0xfefe
1115	oill	$mask80,0x8080
1116	oill	$mask1b,0x1b1b
1117	oill	$maskfe,0xfefe
1118
1119.align	16
1120.Lmix:	l	$s0,16($key)	# tp1
1121	lr	$s1,$s0
1122	ngr	$s1,$mask80
1123	srlg	$t1,$s1,7
1124	slr	$s1,$t1
1125	nr	$s1,$mask1b
1126	sllg	$t1,$s0,1
1127	nr	$t1,$maskfe
1128	xr	$s1,$t1		# tp2
1129
1130	lr	$s2,$s1
1131	ngr	$s2,$mask80
1132	srlg	$t1,$s2,7
1133	slr	$s2,$t1
1134	nr	$s2,$mask1b
1135	sllg	$t1,$s1,1
1136	nr	$t1,$maskfe
1137	xr	$s2,$t1		# tp4
1138
1139	lr	$s3,$s2
1140	ngr	$s3,$mask80
1141	srlg	$t1,$s3,7
1142	slr	$s3,$t1
1143	nr	$s3,$mask1b
1144	sllg	$t1,$s2,1
1145	nr	$t1,$maskfe
1146	xr	$s3,$t1		# tp8
1147
1148	xr	$s1,$s0		# tp2^tp1
1149	xr	$s2,$s0		# tp4^tp1
1150	rll	$s0,$s0,24	# = ROTATE(tp1,8)
1151	xr	$s2,$s3		# ^=tp8
1152	xr	$s0,$s1		# ^=tp2^tp1
1153	xr	$s1,$s3		# tp2^tp1^tp8
1154	xr	$s0,$s2		# ^=tp4^tp1^tp8
1155	rll	$s1,$s1,8
1156	rll	$s2,$s2,16
1157	xr	$s0,$s1		# ^= ROTATE(tp8^tp2^tp1,24)
1158	rll	$s3,$s3,24
1159	xr	$s0,$s2    	# ^= ROTATE(tp8^tp4^tp1,16)
1160	xr	$s0,$s3		# ^= ROTATE(tp8,8)
1161
1162	st	$s0,16($key)
1163	la	$key,4($key)
1164	brct	$rounds,.Lmix
1165
1166	lm${g}	%r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key!
1167	lghi	%r2,0
1168	br	$ra
1169.size	private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
1170___
1171
1172########################################################################
1173# void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
1174#                     size_t length, const AES_KEY *key,
1175#                     unsigned char *ivec, const int enc)
1176{
1177my $inp="%r2";
1178my $out="%r4";	# length and out are swapped
1179my $len="%r3";
1180my $key="%r5";
1181my $ivp="%r6";
1182
1183$code.=<<___;
1184.globl	AES_cbc_encrypt
1185.type	AES_cbc_encrypt,\@function
1186.align	16
1187AES_cbc_encrypt:
1188	xgr	%r3,%r4		# flip %r3 and %r4, out and len
1189	xgr	%r4,%r3
1190	xgr	%r3,%r4
1191___
1192$code.=<<___ if (!$softonly);
1193	lhi	%r0,16
1194	cl	%r0,240($key)
1195	jh	.Lcbc_software
1196
1197	lg	%r0,0($ivp)	# copy ivec
1198	lg	%r1,8($ivp)
1199	stmg	%r0,%r1,16($sp)
1200	lmg	%r0,%r1,0($key)	# copy key, cover 256 bit
1201	stmg	%r0,%r1,32($sp)
1202	lmg	%r0,%r1,16($key)
1203	stmg	%r0,%r1,48($sp)
1204	l	%r0,240($key)	# load kmc code
1205	lghi	$key,15		# res=len%16, len-=res;
1206	ngr	$key,$len
1207	sl${g}r	$len,$key
1208	la	%r1,16($sp)	# parameter block - ivec || key
1209	jz	.Lkmc_truncated
1210	.long	0xb92f0042	# kmc %r4,%r2
1211	brc	1,.-4		# pay attention to "partial completion"
1212	ltr	$key,$key
1213	jnz	.Lkmc_truncated
1214.Lkmc_done:
1215	lmg	%r0,%r1,16($sp)	# copy ivec to caller
1216	stg	%r0,0($ivp)
1217	stg	%r1,8($ivp)
1218	br	$ra
1219.align	16
1220.Lkmc_truncated:
1221	ahi	$key,-1		# it's the way it's encoded in mvc
1222	tmll	%r0,0x80
1223	jnz	.Lkmc_truncated_dec
1224	lghi	%r1,0
1225	stg	%r1,16*$SIZE_T($sp)
1226	stg	%r1,16*$SIZE_T+8($sp)
1227	bras	%r1,1f
1228	mvc	16*$SIZE_T(1,$sp),0($inp)
12291:	ex	$key,0(%r1)
1230	la	%r1,16($sp)	# restore parameter block
1231	la	$inp,16*$SIZE_T($sp)
1232	lghi	$len,16
1233	.long	0xb92f0042	# kmc %r4,%r2
1234	j	.Lkmc_done
1235.align	16
1236.Lkmc_truncated_dec:
1237	st${g}	$out,4*$SIZE_T($sp)
1238	la	$out,16*$SIZE_T($sp)
1239	lghi	$len,16
1240	.long	0xb92f0042	# kmc %r4,%r2
1241	l${g}	$out,4*$SIZE_T($sp)
1242	bras	%r1,2f
1243	mvc	0(1,$out),16*$SIZE_T($sp)
12442:	ex	$key,0(%r1)
1245	j	.Lkmc_done
1246.align	16
1247.Lcbc_software:
1248___
1249$code.=<<___;
1250	stm${g}	$key,$ra,5*$SIZE_T($sp)
1251	lhi	%r0,0
1252	cl	%r0,`$stdframe+$SIZE_T-4`($sp)
1253	je	.Lcbc_decrypt
1254
1255	larl	$tbl,AES_Te
1256
1257	llgf	$s0,0($ivp)
1258	llgf	$s1,4($ivp)
1259	llgf	$s2,8($ivp)
1260	llgf	$s3,12($ivp)
1261
1262	lghi	$t0,16
1263	sl${g}r	$len,$t0
1264	brc	4,.Lcbc_enc_tail	# if borrow
1265.Lcbc_enc_loop:
1266	stm${g}	$inp,$out,2*$SIZE_T($sp)
1267	x	$s0,0($inp)
1268	x	$s1,4($inp)
1269	x	$s2,8($inp)
1270	x	$s3,12($inp)
1271	lgr	%r4,$key
1272
1273	bras	$ra,_s390x_AES_encrypt
1274
1275	lm${g}	$inp,$key,2*$SIZE_T($sp)
1276	st	$s0,0($out)
1277	st	$s1,4($out)
1278	st	$s2,8($out)
1279	st	$s3,12($out)
1280
1281	la	$inp,16($inp)
1282	la	$out,16($out)
1283	lghi	$t0,16
1284	lt${g}r	$len,$len
1285	jz	.Lcbc_enc_done
1286	sl${g}r	$len,$t0
1287	brc	4,.Lcbc_enc_tail	# if borrow
1288	j	.Lcbc_enc_loop
1289.align	16
1290.Lcbc_enc_done:
1291	l${g}	$ivp,6*$SIZE_T($sp)
1292	st	$s0,0($ivp)
1293	st	$s1,4($ivp)
1294	st	$s2,8($ivp)
1295	st	$s3,12($ivp)
1296
1297	lm${g}	%r7,$ra,7*$SIZE_T($sp)
1298	br	$ra
1299
1300.align	16
1301.Lcbc_enc_tail:
1302	aghi	$len,15
1303	lghi	$t0,0
1304	stg	$t0,16*$SIZE_T($sp)
1305	stg	$t0,16*$SIZE_T+8($sp)
1306	bras	$t1,3f
1307	mvc	16*$SIZE_T(1,$sp),0($inp)
13083:	ex	$len,0($t1)
1309	lghi	$len,0
1310	la	$inp,16*$SIZE_T($sp)
1311	j	.Lcbc_enc_loop
1312
1313.align	16
1314.Lcbc_decrypt:
1315	larl	$tbl,AES_Td
1316
1317	lg	$t0,0($ivp)
1318	lg	$t1,8($ivp)
1319	stmg	$t0,$t1,16*$SIZE_T($sp)
1320
1321.Lcbc_dec_loop:
1322	stm${g}	$inp,$out,2*$SIZE_T($sp)
1323	llgf	$s0,0($inp)
1324	llgf	$s1,4($inp)
1325	llgf	$s2,8($inp)
1326	llgf	$s3,12($inp)
1327	lgr	%r4,$key
1328
1329	bras	$ra,_s390x_AES_decrypt
1330
1331	lm${g}	$inp,$key,2*$SIZE_T($sp)
1332	sllg	$s0,$s0,32
1333	sllg	$s2,$s2,32
1334	lr	$s0,$s1
1335	lr	$s2,$s3
1336
1337	lg	$t0,0($inp)
1338	lg	$t1,8($inp)
1339	xg	$s0,16*$SIZE_T($sp)
1340	xg	$s2,16*$SIZE_T+8($sp)
1341	lghi	$s1,16
1342	sl${g}r	$len,$s1
1343	brc	4,.Lcbc_dec_tail	# if borrow
1344	brc	2,.Lcbc_dec_done	# if zero
1345	stg	$s0,0($out)
1346	stg	$s2,8($out)
1347	stmg	$t0,$t1,16*$SIZE_T($sp)
1348
1349	la	$inp,16($inp)
1350	la	$out,16($out)
1351	j	.Lcbc_dec_loop
1352
1353.Lcbc_dec_done:
1354	stg	$s0,0($out)
1355	stg	$s2,8($out)
1356.Lcbc_dec_exit:
1357	lm${g}	%r6,$ra,6*$SIZE_T($sp)
1358	stmg	$t0,$t1,0($ivp)
1359
1360	br	$ra
1361
1362.align	16
1363.Lcbc_dec_tail:
1364	aghi	$len,15
1365	stg	$s0,16*$SIZE_T($sp)
1366	stg	$s2,16*$SIZE_T+8($sp)
1367	bras	$s1,4f
1368	mvc	0(1,$out),16*$SIZE_T($sp)
13694:	ex	$len,0($s1)
1370	j	.Lcbc_dec_exit
1371.size	AES_cbc_encrypt,.-AES_cbc_encrypt
1372___
1373}
1374########################################################################
1375# void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
1376#                     size_t blocks, const AES_KEY *key,
1377#                     const unsigned char *ivec)
1378{
1379my $inp="%r2";
1380my $out="%r4";	# blocks and out are swapped
1381my $len="%r3";
1382my $key="%r5";	my $iv0="%r5";
1383my $ivp="%r6";
1384my $fp ="%r7";
1385
1386$code.=<<___;
1387.globl	AES_ctr32_encrypt
1388.type	AES_ctr32_encrypt,\@function
1389.align	16
1390AES_ctr32_encrypt:
1391	xgr	%r3,%r4		# flip %r3 and %r4, $out and $len
1392	xgr	%r4,%r3
1393	xgr	%r3,%r4
1394	llgfr	$len,$len	# safe in ctr32 subroutine even in 64-bit case
1395___
1396$code.=<<___ if (!$softonly);
1397	l	%r0,240($key)
1398	lhi	%r1,16
1399	clr	%r0,%r1
1400	jl	.Lctr32_software
1401
1402	stm${g}	%r6,$s3,6*$SIZE_T($sp)
1403
1404	slgr	$out,$inp
1405	la	%r1,0($key)	# %r1 is permanent copy of $key
1406	lg	$iv0,0($ivp)	# load ivec
1407	lg	$ivp,8($ivp)
1408
1409	# prepare and allocate stack frame at the top of 4K page
1410	# with 1K reserved for eventual signal handling
1411	lghi	$s0,-1024-256-16# guarantee at least 256-bytes buffer
1412	lghi	$s1,-4096
1413	algr	$s0,$sp
1414	lgr	$fp,$sp
1415	ngr	$s0,$s1		# align at page boundary
1416	slgr	$fp,$s0		# total buffer size
1417	lgr	$s2,$sp
1418	lghi	$s1,1024+16	# sl[g]fi is extended-immediate facility
1419	slgr	$fp,$s1		# deduct reservation to get usable buffer size
1420	# buffer size is at lest 256 and at most 3072+256-16
1421
1422	la	$sp,1024($s0)	# alloca
1423	srlg	$fp,$fp,4	# convert bytes to blocks, minimum 16
1424	st${g}	$s2,0($sp)	# back-chain
1425	st${g}	$fp,$SIZE_T($sp)
1426
1427	slgr	$len,$fp
1428	brc	1,.Lctr32_hw_switch	# not zero, no borrow
1429	algr	$fp,$len	# input is shorter than allocated buffer
1430	lghi	$len,0
1431	st${g}	$fp,$SIZE_T($sp)
1432
1433.Lctr32_hw_switch:
1434___
1435$code.=<<___ if (0);	######### kmctr code was measured to be ~12% slower
1436	larl	$s0,OPENSSL_s390xcap_P
1437	lg	$s0,8($s0)
1438	tmhh	$s0,0x0004	# check for message_security-assist-4
1439	jz	.Lctr32_km_loop
1440
1441	llgfr	$s0,%r0
1442	lgr	$s1,%r1
1443	larl	%r1,OPENSSL_s390xcap_P
1444	llihh	%r0,0x8000	# check if kmctr supports the function code
1445	srlg	%r0,%r0,0($s0)
1446	ng	%r0,64(%r1)	# check kmctr capability vector
1447	lgr	%r0,$s0
1448	lgr	%r1,$s1
1449	jz	.Lctr32_km_loop
1450
1451####### kmctr code
1452	algr	$out,$inp	# restore $out
1453	lgr	$s1,$len	# $s1 undertakes $len
1454	j	.Lctr32_kmctr_loop
1455.align	16
1456.Lctr32_kmctr_loop:
1457	la	$s2,16($sp)
1458	lgr	$s3,$fp
1459.Lctr32_kmctr_prepare:
1460	stg	$iv0,0($s2)
1461	stg	$ivp,8($s2)
1462	la	$s2,16($s2)
1463	ahi	$ivp,1		# 32-bit increment, preserves upper half
1464	brct	$s3,.Lctr32_kmctr_prepare
1465
1466	#la	$inp,0($inp)	# inp
1467	sllg	$len,$fp,4	# len
1468	#la	$out,0($out)	# out
1469	la	$s2,16($sp)	# iv
1470	.long	0xb92da042	# kmctr $out,$s2,$inp
1471	brc	1,.-4		# pay attention to "partial completion"
1472
1473	slgr	$s1,$fp
1474	brc	1,.Lctr32_kmctr_loop	# not zero, no borrow
1475	algr	$fp,$s1
1476	lghi	$s1,0
1477	brc	4+1,.Lctr32_kmctr_loop	# not zero
1478
1479	l${g}	$sp,0($sp)
1480	lm${g}	%r6,$s3,6*$SIZE_T($sp)
1481	br	$ra
1482.align	16
1483___
1484$code.=<<___;
1485.Lctr32_km_loop:
1486	la	$s2,16($sp)
1487	lgr	$s3,$fp
1488.Lctr32_km_prepare:
1489	stg	$iv0,0($s2)
1490	stg	$ivp,8($s2)
1491	la	$s2,16($s2)
1492	ahi	$ivp,1		# 32-bit increment, preserves upper half
1493	brct	$s3,.Lctr32_km_prepare
1494
1495	la	$s0,16($sp)	# inp
1496	sllg	$s1,$fp,4	# len
1497	la	$s2,16($sp)	# out
1498	.long	0xb92e00a8	# km %r10,%r8
1499	brc	1,.-4		# pay attention to "partial completion"
1500
1501	la	$s2,16($sp)
1502	lgr	$s3,$fp
1503	slgr	$s2,$inp
1504.Lctr32_km_xor:
1505	lg	$s0,0($inp)
1506	lg	$s1,8($inp)
1507	xg	$s0,0($s2,$inp)
1508	xg	$s1,8($s2,$inp)
1509	stg	$s0,0($out,$inp)
1510	stg	$s1,8($out,$inp)
1511	la	$inp,16($inp)
1512	brct	$s3,.Lctr32_km_xor
1513
1514	slgr	$len,$fp
1515	brc	1,.Lctr32_km_loop	# not zero, no borrow
1516	algr	$fp,$len
1517	lghi	$len,0
1518	brc	4+1,.Lctr32_km_loop	# not zero
1519
1520	l${g}	$s0,0($sp)
1521	l${g}	$s1,$SIZE_T($sp)
1522	la	$s2,16($sp)
1523.Lctr32_km_zap:
1524	stg	$s0,0($s2)
1525	stg	$s0,8($s2)
1526	la	$s2,16($s2)
1527	brct	$s1,.Lctr32_km_zap
1528
1529	la	$sp,0($s0)
1530	lm${g}	%r6,$s3,6*$SIZE_T($sp)
1531	br	$ra
1532.align	16
1533.Lctr32_software:
1534___
1535$code.=<<___;
1536	stm${g}	$key,$ra,5*$SIZE_T($sp)
1537	sl${g}r	$inp,$out
1538	larl	$tbl,AES_Te
1539	llgf	$t1,12($ivp)
1540
1541.Lctr32_loop:
1542	stm${g}	$inp,$out,2*$SIZE_T($sp)
1543	llgf	$s0,0($ivp)
1544	llgf	$s1,4($ivp)
1545	llgf	$s2,8($ivp)
1546	lgr	$s3,$t1
1547	st	$t1,16*$SIZE_T($sp)
1548	lgr	%r4,$key
1549
1550	bras	$ra,_s390x_AES_encrypt
1551
1552	lm${g}	$inp,$ivp,2*$SIZE_T($sp)
1553	llgf	$t1,16*$SIZE_T($sp)
1554	x	$s0,0($inp,$out)
1555	x	$s1,4($inp,$out)
1556	x	$s2,8($inp,$out)
1557	x	$s3,12($inp,$out)
1558	stm	$s0,$s3,0($out)
1559
1560	la	$out,16($out)
1561	ahi	$t1,1		# 32-bit increment
1562	brct	$len,.Lctr32_loop
1563
1564	lm${g}	%r6,$ra,6*$SIZE_T($sp)
1565	br	$ra
1566.size	AES_ctr32_encrypt,.-AES_ctr32_encrypt
1567___
1568}
1569
1570########################################################################
1571# void AES_xts_encrypt(const unsigned char *inp, unsigned char *out,
1572#	size_t len, const AES_KEY *key1, const AES_KEY *key2,
1573#	const unsigned char iv[16]);
1574#
1575{
1576my $inp="%r2";
1577my $out="%r4";	# len and out are swapped
1578my $len="%r3";
1579my $key1="%r5";	# $i1
1580my $key2="%r6";	# $i2
1581my $fp="%r7";	# $i3
1582my $tweak=16*$SIZE_T+16;	# or $stdframe-16, bottom of the frame...
1583
1584$code.=<<___;
1585.type	_s390x_xts_km,\@function
1586.align	16
1587_s390x_xts_km:
1588___
1589$code.=<<___ if(1);
1590	llgfr	$s0,%r0			# put aside the function code
1591	lghi	$s1,0x7f
1592	nr	$s1,%r0
1593	larl	%r1,OPENSSL_s390xcap_P
1594	llihh	%r0,0x8000
1595	srlg	%r0,%r0,32($s1)		# check for 32+function code
1596	ng	%r0,32(%r1)		# check km capability vector
1597	lgr	%r0,$s0			# restore the function code
1598	la	%r1,0($key1)		# restore $key1
1599	jz	.Lxts_km_vanilla
1600
1601	lmg	$i2,$i3,$tweak($sp)	# put aside the tweak value
1602	algr	$out,$inp
1603
1604	oill	%r0,32			# switch to xts function code
1605	aghi	$s1,-18			#
1606	sllg	$s1,$s1,3		# (function code - 18)*8, 0 or 16
1607	la	%r1,$tweak-16($sp)
1608	slgr	%r1,$s1			# parameter block position
1609	lmg	$s0,$s3,0($key1)	# load 256 bits of key material,
1610	stmg	$s0,$s3,0(%r1)		# and copy it to parameter block.
1611					# yes, it contains junk and overlaps
1612					# with the tweak in 128-bit case.
1613					# it's done to avoid conditional
1614					# branch.
1615	stmg	$i2,$i3,$tweak($sp)	# "re-seat" the tweak value
1616
1617	.long	0xb92e0042		# km %r4,%r2
1618	brc	1,.-4			# pay attention to "partial completion"
1619
1620	lrvg	$s0,$tweak+0($sp)	# load the last tweak
1621	lrvg	$s1,$tweak+8($sp)
1622	stmg	%r0,%r3,$tweak-32($sp)	# wipe copy of the key
1623
1624	nill	%r0,0xffdf		# switch back to original function code
1625	la	%r1,0($key1)		# restore pointer to $key1
1626	slgr	$out,$inp
1627
1628	llgc	$len,2*$SIZE_T-1($sp)
1629	nill	$len,0x0f		# $len%=16
1630	br	$ra
1631
1632.align	16
1633.Lxts_km_vanilla:
1634___
1635$code.=<<___;
1636	# prepare and allocate stack frame at the top of 4K page
1637	# with 1K reserved for eventual signal handling
1638	lghi	$s0,-1024-256-16# guarantee at least 256-bytes buffer
1639	lghi	$s1,-4096
1640	algr	$s0,$sp
1641	lgr	$fp,$sp
1642	ngr	$s0,$s1		# align at page boundary
1643	slgr	$fp,$s0		# total buffer size
1644	lgr	$s2,$sp
1645	lghi	$s1,1024+16	# sl[g]fi is extended-immediate facility
1646	slgr	$fp,$s1		# deduct reservation to get usable buffer size
1647	# buffer size is at lest 256 and at most 3072+256-16
1648
1649	la	$sp,1024($s0)	# alloca
1650	nill	$fp,0xfff0	# round to 16*n
1651	st${g}	$s2,0($sp)	# back-chain
1652	nill	$len,0xfff0	# redundant
1653	st${g}	$fp,$SIZE_T($sp)
1654
1655	slgr	$len,$fp
1656	brc	1,.Lxts_km_go	# not zero, no borrow
1657	algr	$fp,$len	# input is shorter than allocated buffer
1658	lghi	$len,0
1659	st${g}	$fp,$SIZE_T($sp)
1660
1661.Lxts_km_go:
1662	lrvg	$s0,$tweak+0($s2)	# load the tweak value in little-endian
1663	lrvg	$s1,$tweak+8($s2)
1664
1665	la	$s2,16($sp)		# vector of ascending tweak values
1666	slgr	$s2,$inp
1667	srlg	$s3,$fp,4
1668	j	.Lxts_km_start
1669
1670.Lxts_km_loop:
1671	la	$s2,16($sp)
1672	slgr	$s2,$inp
1673	srlg	$s3,$fp,4
1674.Lxts_km_prepare:
1675	lghi	$i1,0x87
1676	srag	$i2,$s1,63		# broadcast upper bit
1677	ngr	$i1,$i2			# rem
1678	algr	$s0,$s0
1679	alcgr	$s1,$s1
1680	xgr	$s0,$i1
1681.Lxts_km_start:
1682	lrvgr	$i1,$s0			# flip byte order
1683	lrvgr	$i2,$s1
1684	stg	$i1,0($s2,$inp)
1685	stg	$i2,8($s2,$inp)
1686	xg	$i1,0($inp)
1687	xg	$i2,8($inp)
1688	stg	$i1,0($out,$inp)
1689	stg	$i2,8($out,$inp)
1690	la	$inp,16($inp)
1691	brct	$s3,.Lxts_km_prepare
1692
1693	slgr	$inp,$fp		# rewind $inp
1694	la	$s2,0($out,$inp)
1695	lgr	$s3,$fp
1696	.long	0xb92e00aa		# km $s2,$s2
1697	brc	1,.-4			# pay attention to "partial completion"
1698
1699	la	$s2,16($sp)
1700	slgr	$s2,$inp
1701	srlg	$s3,$fp,4
1702.Lxts_km_xor:
1703	lg	$i1,0($out,$inp)
1704	lg	$i2,8($out,$inp)
1705	xg	$i1,0($s2,$inp)
1706	xg	$i2,8($s2,$inp)
1707	stg	$i1,0($out,$inp)
1708	stg	$i2,8($out,$inp)
1709	la	$inp,16($inp)
1710	brct	$s3,.Lxts_km_xor
1711
1712	slgr	$len,$fp
1713	brc	1,.Lxts_km_loop		# not zero, no borrow
1714	algr	$fp,$len
1715	lghi	$len,0
1716	brc	4+1,.Lxts_km_loop	# not zero
1717
1718	l${g}	$i1,0($sp)		# back-chain
1719	llgf	$fp,`2*$SIZE_T-4`($sp)	# bytes used
1720	la	$i2,16($sp)
1721	srlg	$fp,$fp,4
1722.Lxts_km_zap:
1723	stg	$i1,0($i2)
1724	stg	$i1,8($i2)
1725	la	$i2,16($i2)
1726	brct	$fp,.Lxts_km_zap
1727
1728	la	$sp,0($i1)
1729	llgc	$len,2*$SIZE_T-1($i1)
1730	nill	$len,0x0f		# $len%=16
1731	bzr	$ra
1732
1733	# generate one more tweak...
1734	lghi	$i1,0x87
1735	srag	$i2,$s1,63		# broadcast upper bit
1736	ngr	$i1,$i2			# rem
1737	algr	$s0,$s0
1738	alcgr	$s1,$s1
1739	xgr	$s0,$i1
1740
1741	ltr	$len,$len		# clear zero flag
1742	br	$ra
1743.size	_s390x_xts_km,.-_s390x_xts_km
1744
1745.globl	AES_xts_encrypt
1746.type	AES_xts_encrypt,\@function
1747.align	16
1748AES_xts_encrypt:
1749	xgr	%r3,%r4			# flip %r3 and %r4, $out and $len
1750	xgr	%r4,%r3
1751	xgr	%r3,%r4
1752___
1753$code.=<<___ if ($SIZE_T==4);
1754	llgfr	$len,$len
1755___
1756$code.=<<___;
1757	st${g}	$len,1*$SIZE_T($sp)	# save copy of $len
1758	srag	$len,$len,4		# formally wrong, because it expands
1759					# sign byte, but who can afford asking
1760					# to process more than 2^63-1 bytes?
1761					# I use it, because it sets condition
1762					# code...
1763	bcr	8,$ra			# abort if zero (i.e. less than 16)
1764___
1765$code.=<<___ if (!$softonly);
1766	llgf	%r0,240($key2)
1767	lhi	%r1,16
1768	clr	%r0,%r1
1769	jl	.Lxts_enc_software
1770
1771	st${g}	$ra,5*$SIZE_T($sp)
1772	stm${g}	%r6,$s3,6*$SIZE_T($sp)
1773
1774	sllg	$len,$len,4		# $len&=~15
1775	slgr	$out,$inp
1776
1777	# generate the tweak value
1778	l${g}	$s3,$stdframe($sp)	# pointer to iv
1779	la	$s2,$tweak($sp)
1780	lmg	$s0,$s1,0($s3)
1781	lghi	$s3,16
1782	stmg	$s0,$s1,0($s2)
1783	la	%r1,0($key2)		# $key2 is not needed anymore
1784	.long	0xb92e00aa		# km $s2,$s2, generate the tweak
1785	brc	1,.-4			# can this happen?
1786
1787	l	%r0,240($key1)
1788	la	%r1,0($key1)		# $key1 is not needed anymore
1789	bras	$ra,_s390x_xts_km
1790	jz	.Lxts_enc_km_done
1791
1792	aghi	$inp,-16		# take one step back
1793	la	$i3,0($out,$inp)	# put aside real $out
1794.Lxts_enc_km_steal:
1795	llgc	$i1,16($inp)
1796	llgc	$i2,0($out,$inp)
1797	stc	$i1,0($out,$inp)
1798	stc	$i2,16($out,$inp)
1799	la	$inp,1($inp)
1800	brct	$len,.Lxts_enc_km_steal
1801
1802	la	$s2,0($i3)
1803	lghi	$s3,16
1804	lrvgr	$i1,$s0			# flip byte order
1805	lrvgr	$i2,$s1
1806	xg	$i1,0($s2)
1807	xg	$i2,8($s2)
1808	stg	$i1,0($s2)
1809	stg	$i2,8($s2)
1810	.long	0xb92e00aa		# km $s2,$s2
1811	brc	1,.-4			# can this happen?
1812	lrvgr	$i1,$s0			# flip byte order
1813	lrvgr	$i2,$s1
1814	xg	$i1,0($i3)
1815	xg	$i2,8($i3)
1816	stg	$i1,0($i3)
1817	stg	$i2,8($i3)
1818
1819.Lxts_enc_km_done:
1820	stg	$sp,$tweak+0($sp)	# wipe tweak
1821	stg	$sp,$tweak+8($sp)
1822	l${g}	$ra,5*$SIZE_T($sp)
1823	lm${g}	%r6,$s3,6*$SIZE_T($sp)
1824	br	$ra
1825.align	16
1826.Lxts_enc_software:
1827___
# Software XTS encrypt: derive the tweak by encrypting the IV with
# $key2, then per 16-byte block XOR tweak, encrypt with $key1, XOR
# tweak again; the tweak is advanced by GF(2^128) doubling (poly 0x87)
# in little-endian form.  A ragged tail is handled with ciphertext
# stealing (.Lxts_enc_steal) plus one extra tweaked encryption.
#
# FIX: the final wipe used "$twesk" (a typo for "$tweak").  Being
# undeclared, it interpolated as the empty string, so the emitted
# instruction was "stg $sp,+8($sp)" - the upper 8 bytes of the secret
# tweak were left on the stack and an unrelated stack slot was
# clobbered instead.
$code.=<<___;
	stm${g}	%r6,$ra,6*$SIZE_T($sp)

	slgr	$out,$inp

	l${g}	$s3,$stdframe($sp)	# ivp
	llgf	$s0,0($s3)		# load iv
	llgf	$s1,4($s3)
	llgf	$s2,8($s3)
	llgf	$s3,12($s3)
	stm${g}	%r2,%r5,2*$SIZE_T($sp)
	la	$key,0($key2)
	larl	$tbl,AES_Te
	bras	$ra,_s390x_AES_encrypt	# generate the tweak
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	stm	$s0,$s3,$tweak($sp)	# save the tweak
	j	.Lxts_enc_enter

.align	16
.Lxts_enc_loop:
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	algr	$s1,$s1
	alcgr	$s3,$s3
	xgr	$s1,%r1
	lrvgr	$s1,$s1			# flip byte order
	lrvgr	$s3,$s3
	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits
	stg	$s1,$tweak+0($sp)	# save the tweak
	llgfr	$s1,$s1
	srlg	$s2,$s3,32
	stg	$s3,$tweak+8($sp)
	llgfr	$s3,$s3
	la	$inp,16($inp)		# $inp+=16
.Lxts_enc_enter:
	x	$s0,0($inp)		# ^=*($inp)
	x	$s1,4($inp)
	x	$s2,8($inp)
	x	$s3,12($inp)
	stm${g}	%r2,%r3,2*$SIZE_T($sp)	# only two registers are changing
	la	$key,0($key1)
	bras	$ra,_s390x_AES_encrypt
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	x	$s0,$tweak+0($sp)	# ^=tweak
	x	$s1,$tweak+4($sp)
	x	$s2,$tweak+8($sp)
	x	$s3,$tweak+12($sp)
	st	$s0,0($out,$inp)
	st	$s1,4($out,$inp)
	st	$s2,8($out,$inp)
	st	$s3,12($out,$inp)
	brct${g}	$len,.Lxts_enc_loop

	llgc	$len,`2*$SIZE_T-1`($sp)
	nill	$len,0x0f		# $len%16
	jz	.Lxts_enc_done

	la	$i3,0($inp,$out)	# put aside real $out
.Lxts_enc_steal:
	llgc	%r0,16($inp)
	llgc	%r1,0($out,$inp)
	stc	%r0,0($out,$inp)
	stc	%r1,16($out,$inp)
	la	$inp,1($inp)
	brct	$len,.Lxts_enc_steal
	la	$out,0($i3)		# restore real $out

	# generate last tweak...
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	algr	$s1,$s1
	alcgr	$s3,$s3
	xgr	$s1,%r1
	lrvgr	$s1,$s1			# flip byte order
	lrvgr	$s3,$s3
	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits
	stg	$s1,$tweak+0($sp)	# save the tweak
	llgfr	$s1,$s1
	srlg	$s2,$s3,32
	stg	$s3,$tweak+8($sp)
	llgfr	$s3,$s3

	x	$s0,0($out)		# ^=*(inp)|stolen cipther-text
	x	$s1,4($out)
	x	$s2,8($out)
	x	$s3,12($out)
	st${g}	$out,4*$SIZE_T($sp)
	la	$key,0($key1)
	bras	$ra,_s390x_AES_encrypt
	l${g}	$out,4*$SIZE_T($sp)
	x	$s0,`$tweak+0`($sp)	# ^=tweak
	x	$s1,`$tweak+4`($sp)
	x	$s2,`$tweak+8`($sp)
	x	$s3,`$tweak+12`($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)

.Lxts_enc_done:
	stg	$sp,$tweak+0($sp)	# wipe tweak
	stg	$sp,$tweak+8($sp)
	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	br	$ra
.size	AES_xts_encrypt,.-AES_xts_encrypt
___
# void AES_xts_decrypt(const unsigned char *inp, unsigned char *out,
#	size_t len, const AES_KEY *key1, const AES_KEY *key2,
#	const unsigned char iv[16]);
#
# Entry: same %r3/%r4 swap as the encrypt side.
$code.=<<___;
.globl	AES_xts_decrypt
.type	AES_xts_decrypt,\@function
.align	16
AES_xts_decrypt:
	xgr	%r3,%r4			# flip %r3 and %r4, $out and $len
	xgr	%r4,%r3
	xgr	%r3,%r4
___
# On 31-bit builds the length must be explicitly zero-extended.
$code.=<<___ if ($SIZE_T==4);
	llgfr	$len,$len
___
# Decrypt-side length fixup: if the length is a whole number of blocks
# the last block needs no stealing treatment, so the reserved 16 bytes
# are added back; otherwise the final partial block is split off.
$code.=<<___;
	st${g}	$len,1*$SIZE_T($sp)	# save copy of $len
	aghi	$len,-16
	bcr	4,$ra			# abort if less than zero. formally
					# wrong, because $len is unsigned,
					# but who can afford asking to
					# process more than 2^63-1 bytes?
	tmll	$len,0x0f
	jnz	.Lxts_dec_proceed
	aghi	$len,16
.Lxts_dec_proceed:
___
# Hardware XTS decrypt: tweak from KM-encrypting the IV with $key2,
# bulk via _s390x_xts_km, then the swapped-tweak-order dance required
# by ciphertext stealing on the decrypt side (last full block uses the
# *second* tweak, the stolen tail block the first).
$code.=<<___ if (!$softonly);
	llgf	%r0,240($key2)
	lhi	%r1,16
	clr	%r0,%r1
	jl	.Lxts_dec_software

	st${g}	$ra,5*$SIZE_T($sp)
	stm${g}	%r6,$s3,6*$SIZE_T($sp)

	nill	$len,0xfff0		# $len&=~15
	slgr	$out,$inp

	# generate the tweak value
	l${g}	$s3,$stdframe($sp)	# pointer to iv
	la	$s2,$tweak($sp)
	lmg	$s0,$s1,0($s3)
	lghi	$s3,16
	stmg	$s0,$s1,0($s2)
	la	%r1,0($key2)		# $key2 is not needed past this point
	.long	0xb92e00aa		# km $s2,$s2, generate the tweak
	brc	1,.-4			# can this happen?

	l	%r0,240($key1)
	la	%r1,0($key1)		# $key1 is not needed anymore

	ltgr	$len,$len
	jz	.Lxts_dec_km_short
	bras	$ra,_s390x_xts_km
	jz	.Lxts_dec_km_done

	lrvgr	$s2,$s0			# make copy in reverse byte order
	lrvgr	$s3,$s1
	j	.Lxts_dec_km_2ndtweak

.Lxts_dec_km_short:
	llgc	$len,`2*$SIZE_T-1`($sp)
	nill	$len,0x0f		# $len%=16
	lrvg	$s0,$tweak+0($sp)	# load the tweak
	lrvg	$s1,$tweak+8($sp)
	lrvgr	$s2,$s0			# make copy in reverse byte order
	lrvgr	$s3,$s1

.Lxts_dec_km_2ndtweak:
	lghi	$i1,0x87
	srag	$i2,$s1,63		# broadcast upper bit
	ngr	$i1,$i2			# rem
	algr	$s0,$s0
	alcgr	$s1,$s1
	xgr	$s0,$i1
	lrvgr	$i1,$s0			# flip byte order
	lrvgr	$i2,$s1

	xg	$i1,0($inp)
	xg	$i2,8($inp)
	stg	$i1,0($out,$inp)
	stg	$i2,8($out,$inp)
	la	$i2,0($out,$inp)
	lghi	$i3,16
	.long	0xb92e0066		# km $i2,$i2
	brc	1,.-4			# can this happen?
	lrvgr	$i1,$s0
	lrvgr	$i2,$s1
	xg	$i1,0($out,$inp)
	xg	$i2,8($out,$inp)
	stg	$i1,0($out,$inp)
	stg	$i2,8($out,$inp)

	la	$i3,0($out,$inp)	# put aside real $out
.Lxts_dec_km_steal:
	llgc	$i1,16($inp)
	llgc	$i2,0($out,$inp)
	stc	$i1,0($out,$inp)
	stc	$i2,16($out,$inp)
	la	$inp,1($inp)
	brct	$len,.Lxts_dec_km_steal

	lgr	$s0,$s2
	lgr	$s1,$s3
	xg	$s0,0($i3)
	xg	$s1,8($i3)
	stg	$s0,0($i3)
	stg	$s1,8($i3)
	la	$s0,0($i3)
	lghi	$s1,16
	.long	0xb92e0088		# km $s0,$s0
	brc	1,.-4			# can this happen?
	xg	$s2,0($i3)
	xg	$s3,8($i3)
	stg	$s2,0($i3)
	stg	$s3,8($i3)
.Lxts_dec_km_done:
	stg	$sp,$tweak+0($sp)	# wipe tweak
	stg	$sp,$tweak+8($sp)
	l${g}	$ra,5*$SIZE_T($sp)
	lm${g}	%r6,$s3,6*$SIZE_T($sp)
	br	$ra
.align	16
.Lxts_dec_software:
___
# Software XTS decrypt: mirror of the encrypt path using AES_Td and
# _s390x_AES_decrypt, with the decrypt-side ciphertext-stealing order:
# the last full block is processed with the 2nd tweak (kept at
# $tweak-16), the stolen final partial block with the 1st tweak.
#
# FIX: the final wipe used "$twesk" (a typo for "$tweak"), which
# interpolated as the empty string and emitted "stg $sp,+8($sp)" -
# leaving the upper half of the secret tweak un-wiped on the stack and
# clobbering an unrelated stack slot.
$code.=<<___;
	stm${g}	%r6,$ra,6*$SIZE_T($sp)

	srlg	$len,$len,4
	slgr	$out,$inp

	l${g}	$s3,$stdframe($sp)	# ivp
	llgf	$s0,0($s3)		# load iv
	llgf	$s1,4($s3)
	llgf	$s2,8($s3)
	llgf	$s3,12($s3)
	stm${g}	%r2,%r5,2*$SIZE_T($sp)
	la	$key,0($key2)
	larl	$tbl,AES_Te
	bras	$ra,_s390x_AES_encrypt	# generate the tweak
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	larl	$tbl,AES_Td
	lt${g}r	$len,$len
	stm	$s0,$s3,$tweak($sp)	# save the tweak
	jz	.Lxts_dec_short
	j	.Lxts_dec_enter

.align	16
.Lxts_dec_loop:
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	algr	$s1,$s1
	alcgr	$s3,$s3
	xgr	$s1,%r1
	lrvgr	$s1,$s1			# flip byte order
	lrvgr	$s3,$s3
	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits
	stg	$s1,$tweak+0($sp)	# save the tweak
	llgfr	$s1,$s1
	srlg	$s2,$s3,32
	stg	$s3,$tweak+8($sp)
	llgfr	$s3,$s3
.Lxts_dec_enter:
	x	$s0,0($inp)		# tweak^=*(inp)
	x	$s1,4($inp)
	x	$s2,8($inp)
	x	$s3,12($inp)
	stm${g}	%r2,%r3,2*$SIZE_T($sp)	# only two registers are changing
	la	$key,0($key1)
	bras	$ra,_s390x_AES_decrypt
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	x	$s0,$tweak+0($sp)	# ^=tweak
	x	$s1,$tweak+4($sp)
	x	$s2,$tweak+8($sp)
	x	$s3,$tweak+12($sp)
	st	$s0,0($out,$inp)
	st	$s1,4($out,$inp)
	st	$s2,8($out,$inp)
	st	$s3,12($out,$inp)
	la	$inp,16($inp)
	brct${g}	$len,.Lxts_dec_loop

	llgc	$len,`2*$SIZE_T-1`($sp)
	nill	$len,0x0f		# $len%16
	jz	.Lxts_dec_done

	# generate pair of tweaks...
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	algr	$s1,$s1
	alcgr	$s3,$s3
	xgr	$s1,%r1
	lrvgr	$i2,$s1			# flip byte order
	lrvgr	$i3,$s3
	stmg	$i2,$i3,$tweak($sp)	# save the 1st tweak
	j	.Lxts_dec_2ndtweak

.align	16
.Lxts_dec_short:
	llgc	$len,`2*$SIZE_T-1`($sp)
	nill	$len,0x0f		# $len%16
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
.Lxts_dec_2ndtweak:
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	algr	$s1,$s1
	alcgr	$s3,$s3
	xgr	$s1,%r1
	lrvgr	$s1,$s1			# flip byte order
	lrvgr	$s3,$s3
	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits
	stg	$s1,$tweak-16+0($sp)	# save the 2nd tweak
	llgfr	$s1,$s1
	srlg	$s2,$s3,32
	stg	$s3,$tweak-16+8($sp)
	llgfr	$s3,$s3

	x	$s0,0($inp)		# tweak_the_2nd^=*(inp)
	x	$s1,4($inp)
	x	$s2,8($inp)
	x	$s3,12($inp)
	stm${g}	%r2,%r3,2*$SIZE_T($sp)
	la	$key,0($key1)
	bras	$ra,_s390x_AES_decrypt
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	x	$s0,$tweak-16+0($sp)	# ^=tweak_the_2nd
	x	$s1,$tweak-16+4($sp)
	x	$s2,$tweak-16+8($sp)
	x	$s3,$tweak-16+12($sp)
	st	$s0,0($out,$inp)
	st	$s1,4($out,$inp)
	st	$s2,8($out,$inp)
	st	$s3,12($out,$inp)

	la	$i3,0($out,$inp)	# put aside real $out
.Lxts_dec_steal:
	llgc	%r0,16($inp)
	llgc	%r1,0($out,$inp)
	stc	%r0,0($out,$inp)
	stc	%r1,16($out,$inp)
	la	$inp,1($inp)
	brct	$len,.Lxts_dec_steal
	la	$out,0($i3)		# restore real $out

	lm	$s0,$s3,$tweak($sp)	# load the 1st tweak
	x	$s0,0($out)		# tweak^=*(inp)|stolen cipher-text
	x	$s1,4($out)
	x	$s2,8($out)
	x	$s3,12($out)
	st${g}	$out,4*$SIZE_T($sp)
	la	$key,0($key1)
	bras	$ra,_s390x_AES_decrypt
	l${g}	$out,4*$SIZE_T($sp)
	x	$s0,$tweak+0($sp)	# ^=tweak
	x	$s1,$tweak+4($sp)
	x	$s2,$tweak+8($sp)
	x	$s3,$tweak+12($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)
	stg	$sp,$tweak-16+0($sp)	# wipe 2nd tweak
	stg	$sp,$tweak-16+8($sp)
.Lxts_dec_done:
	stg	$sp,$tweak+0($sp)	# wipe tweak
	stg	$sp,$tweak+8($sp)
	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	br	$ra
.size	AES_xts_decrypt,.-AES_xts_decrypt
___
}
# Trailer: identification string plus the capability vector that the
# hardware paths consult at run time.
$code.=<<___;
.string	"AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
.comm	OPENSSL_s390xcap_P,80,8
___

# Expand backtick-quoted expressions (e.g. `$tweak+4`) that heredoc
# interpolation could not evaluate, then emit the finished assembler.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# Check close(): buffered write errors (full disk, broken pipe) only
# surface here, and an unchecked close would silently truncate the
# generated assembler.
close STDOUT or die "error closing STDOUT: $!";	# force flush
2229