1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# AES for s390x.
11
12# April 2007.
13#
14# Software performance improvement over gcc-generated code is ~70% and
15# in absolute terms is ~73 cycles per byte processed with 128-bit key.
16# You're likely to exclaim "why so slow?" Keep in mind that z-CPUs are
17# *strictly* in-order execution and issued instruction [in this case
18# load value from memory is critical] has to complete before execution
19# flow proceeds. S-boxes are compressed to 2KB[+256B].
20#
21# As for hardware acceleration support. It's basically a "teaser," as
22# it can and should be improved in several ways. Most notably support
23# for CBC is not utilized, nor multiple blocks are ever processed.
24# Then software key schedule can be postponed till hardware support
25# detection... Performance improvement over assembler is reportedly
26# ~2.5x, but can reach >8x [naturally on larger chunks] if proper
27# support is implemented.
28
29# May 2007.
30#
31# Implement AES_set_[en|de]crypt_key. Key schedule setup is avoided
32# for 128-bit keys, if hardware support is detected.
33
# January 2009.
35#
36# Add support for hardware AES192/256 and reschedule instructions to
37# minimize/avoid Address Generation Interlock hazard and to favour
38# dual-issue z10 pipeline. This gave ~25% improvement on z10 and
39# almost 50% on z9. The gain is smaller on z10, because being dual-
# issue z10 makes it impossible to eliminate the interlock condition:
# critical path is not long enough. Yet it spends ~24 cycles per byte
42# processed with 128-bit key.
43#
44# Unlike previous version hardware support detection takes place only
45# at the moment of key schedule setup, which is denoted in key->rounds.
46# This is done, because deferred key setup can't be made MT-safe, not
47# for keys longer than 128 bits.
48#
49# Add AES_cbc_encrypt, which gives incredible performance improvement,
50# it was measured to be ~6.6x. It's less than previously mentioned 8x,
51# because software implementation was optimized.
52
53# May 2010.
54#
55# Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x
56# performance improvement over "generic" counter mode routine relying
57# on single-block, also hardware-assisted, AES_encrypt. "Up to" refers
58# to the fact that exact throughput value depends on current stack
59# frame alignment within 4KB page. In worst case you get ~75% of the
60# maximum, but *on average* it would be as much as ~98%. Meaning that
# worst case is unlikely, it's like hitting a ravine on a plateau.
62
63# November 2010.
64#
65# Adapt for -m31 build. If kernel supports what's called "highgprs"
66# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
67# instructions and achieve "64-bit" performance even in 31-bit legacy
68# application context. The feature is not specific to any particular
69# processor, as long as it's "z-CPU". Latter implies that the code
70# remains z/Architecture specific. On z990 it was measured to perform
71# 2x better than code generated by gcc 4.3.
72
73# December 2010.
74#
75# Add support for z196 "cipher message with counter" instruction.
76# Note however that it's disengaged, because it was measured to
77# perform ~12% worse than vanilla km-based code...
78
79# February 2011.
80#
81# Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes
82# instructions, which deliver ~70% improvement at 8KB block size over
83# vanilla km-based code, 37% - at most like 512-bytes block size.
84
# Command-line handling: the first argument selects the ABI "flavour";
# the remaining arguments are scanned for something that looks like an
# output file name.
$flavour = shift;

# 31-bit builds use 4-byte pointers and the plain (non-"g") forms of
# the load/store-multiple instructions; 64-bit builds use 8-byte
# pointers and the "g" instruction suffix (substituted as ${g} in the
# assembler templates below).
if ($flavour =~ /3[12]/) {
	$SIZE_T=4;
	$g="";
} else {
	$SIZE_T=8;
	$g="g";
}

# Skip option-like arguments until one matching a file name (word
# characters/dashes plus an extension) is found, then redirect STDOUT
# to it.  Three-arg open with an explicit error check: silently
# continuing with a failed STDOUT redirection would discard all
# generated assembler.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT, '>', $output or die "can't open $output: $!";
97
$softonly=0;	# allow hardware support

# Register allocation for the generated code.  Note the deliberate
# aliasing (e.g. $t0/$mask both name %r0): the aliased roles are live
# at different times — $inp/$out/$bits carry function arguments on
# entry, while $t0..$t3 serve as scratch inside the round loops.
$t0="%r0";	$mask="%r0";
$t1="%r1";
$t2="%r2";	$inp="%r2";
$t3="%r3";	$out="%r3";	$bits="%r3";
$key="%r4";
$i1="%r5";
$i2="%r6";
$i3="%r7";
$s0="%r8";
$s1="%r9";
$s2="%r10";
$s3="%r11";
$tbl="%r12";
$rounds="%r13";
$ra="%r14";
$sp="%r15";

# Standard stack frame: 16 pointer-sized register save slots plus four
# 8-byte slots (presumably per the s390 ELF ABI layout — TODO confirm).
$stdframe=16*$SIZE_T+4*8;
118
# Append each 32-bit argument to $code as a ".long" directive with the
# value emitted twice.  The AES_Te/AES_Td tables built this way hold
# every entry duplicated so the round loops can load the same word at
# byte offsets 0..3 (see the `l ...,1(...)` / `x ...,3(...)` accesses
# below).  The original declared an empty prototype `()` even though
# the sub consumes @_ — it only worked because call sites use `&`,
# which bypasses prototypes; the prototype is dropped here.
sub _data_word
{ my $i;
    while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
}
123
# Begin the assembler output.  AES_Te is the encryption lookup table:
# 256 doubled T-table words (filled in by &_data_word below), followed
# by the plain S-box Te4 and the rcon round constants.  Aligned to a
# 256-byte boundary.
$code=<<___;
.text

.type	AES_Te,\@object
.align	256
AES_Te:
___
# Te0 table contents (e.g. entry 0, 0xc66363a5, is S[0]=0x63
# premultiplied by the MixColumns column {02,01,01,03}).  Each word is
# emitted twice by _data_word so byte offsets 0..3 address shifted
# copies of the same entry.
&_data_word(
	0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
	0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
	0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
	0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
	0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
	0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
	0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
	0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
	0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
	0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
	0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
	0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
	0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
	0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
	0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
	0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
	0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
	0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
	0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
	0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
	0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
	0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
	0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
	0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
	0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
	0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
	0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
	0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
	0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
	0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
	0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
	0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
	0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
	0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
	0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
	0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
	0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
	0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
	0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
	0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
	0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
	0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
	0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
	0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
	0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
	0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
	0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
	0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
	0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
	0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
	0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
	0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
	0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
	0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
	0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
	0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
	0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
	0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
	0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
	0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
	0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
	0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
	0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
	0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
# Te4 (the plain 256-byte S-box, used for the last round) and the rcon
# round constants complete AES_Te; it is then closed with .size.
# AES_encrypt's public entry point follows — its body is emitted by
# the subsequent conditional and unconditional chunks.
$code.=<<___;
# Te4[256]
.byte	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
.byte	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
.byte	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
.byte	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
.byte	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
.byte	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
.byte	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
.byte	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
.byte	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
.byte	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
.byte	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
.byte	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
.byte	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
.byte	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
.byte	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
.byte	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
.byte	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
.byte	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
.byte	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
.byte	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
.byte	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
.byte	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
.byte	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
.byte	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
.byte	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
.byte	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
.byte	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
.byte	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
.byte	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
.byte	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
.byte	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
.byte	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
# rcon[]
.long	0x01000000, 0x02000000, 0x04000000, 0x08000000
.long	0x10000000, 0x20000000, 0x40000000, 0x80000000
.long	0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
.align	256
.size	AES_Te,.-AES_Te

# void AES_encrypt(const unsigned char *inp, unsigned char *out,
# 		 const AES_KEY *key) {
.globl	AES_encrypt
.type	AES_encrypt,\@function
AES_encrypt:
___
# Hardware fast path for AES_encrypt, emitted unless $softonly.  A
# value below 16 at key+240 means set_encrypt_key stored a km function
# code there (18/19/20, see below) rather than a round count, so a
# single "km" instruction (emitted as raw opcode 0xb92e0042) encrypts
# the block; "brc 1,.-4" re-executes km on partial completion.
$code.=<<___ if (!$softonly);
	l	%r0,240($key)
	lhi	%r1,16
	clr	%r0,%r1
	jl	.Lesoft

	la	%r1,0($key)
	#la	%r2,0($inp)
	la	%r4,0($out)
	lghi	%r3,16		# single block length
	.long	0xb92e0042	# km %r4,%r2
	brc	1,.-4		# can this happen?
	br	%r14
.align	64
.Lesoft:
___
# Software AES_encrypt: the wrapper loads the four big-endian input
# words (llgf), calls the inner routine, and stores the result.
# _s390x_AES_encrypt performs the T-table rounds; $mask is 0xff<<3
# because table indices are pre-scaled by 3 bits (each doubled entry
# occupies 8 bytes), and the final round uses the Te4 byte table at
# offset 2 within each doubled word.
$code.=<<___;
	stm${g}	%r3,$ra,3*$SIZE_T($sp)

	llgf	$s0,0($inp)
	llgf	$s1,4($inp)
	llgf	$s2,8($inp)
	llgf	$s3,12($inp)

	larl	$tbl,AES_Te
	bras	$ra,_s390x_AES_encrypt

	l${g}	$out,3*$SIZE_T($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)

	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	br	$ra
.size	AES_encrypt,.-AES_encrypt

.type   _s390x_AES_encrypt,\@function
.align	16
_s390x_AES_encrypt:
	st${g}	$ra,15*$SIZE_T($sp)
	x	$s0,0($key)
	x	$s1,4($key)
	x	$s2,8($key)
	x	$s3,12($key)
	l	$rounds,240($key)
	llill	$mask,`0xff<<3`
	aghi	$rounds,-1
	j	.Lenc_loop
.align	16
.Lenc_loop:
	sllg	$t1,$s0,`0+3`
	srlg	$t2,$s0,`8-3`
	srlg	$t3,$s0,`16-3`
	srl	$s0,`24-3`
	nr	$s0,$mask
	ngr	$t1,$mask
	nr	$t2,$mask
	nr	$t3,$mask

	srlg	$i1,$s1,`16-3`	# i0
	sllg	$i2,$s1,`0+3`
	srlg	$i3,$s1,`8-3`
	srl	$s1,`24-3`
	nr	$i1,$mask
	nr	$s1,$mask
	ngr	$i2,$mask
	nr	$i3,$mask

	l	$s0,0($s0,$tbl)	# Te0[s0>>24]
	l	$t1,1($t1,$tbl)	# Te3[s0>>0]
	l	$t2,2($t2,$tbl) # Te2[s0>>8]
	l	$t3,3($t3,$tbl)	# Te1[s0>>16]

	x	$s0,3($i1,$tbl)	# Te1[s1>>16]
	l	$s1,0($s1,$tbl)	# Te0[s1>>24]
	x	$t2,1($i2,$tbl)	# Te3[s1>>0]
	x	$t3,2($i3,$tbl)	# Te2[s1>>8]

	srlg	$i1,$s2,`8-3`	# i0
	srlg	$i2,$s2,`16-3`	# i1
	nr	$i1,$mask
	nr	$i2,$mask
	sllg	$i3,$s2,`0+3`
	srl	$s2,`24-3`
	nr	$s2,$mask
	ngr	$i3,$mask

	xr	$s1,$t1
	srlg	$ra,$s3,`8-3`	# i1
	sllg	$t1,$s3,`0+3`	# i0
	nr	$ra,$mask
	la	$key,16($key)
	ngr	$t1,$mask

	x	$s0,2($i1,$tbl)	# Te2[s2>>8]
	x	$s1,3($i2,$tbl)	# Te1[s2>>16]
	l	$s2,0($s2,$tbl)	# Te0[s2>>24]
	x	$t3,1($i3,$tbl)	# Te3[s2>>0]

	srlg	$i3,$s3,`16-3`	# i2
	xr	$s2,$t2
	srl	$s3,`24-3`
	nr	$i3,$mask
	nr	$s3,$mask

	x	$s0,0($key)
	x	$s1,4($key)
	x	$s2,8($key)
	x	$t3,12($key)

	x	$s0,1($t1,$tbl)	# Te3[s3>>0]
	x	$s1,2($ra,$tbl)	# Te2[s3>>8]
	x	$s2,3($i3,$tbl)	# Te1[s3>>16]
	l	$s3,0($s3,$tbl)	# Te0[s3>>24]
	xr	$s3,$t3

	brct	$rounds,.Lenc_loop
	.align	16

	sllg	$t1,$s0,`0+3`
	srlg	$t2,$s0,`8-3`
	ngr	$t1,$mask
	srlg	$t3,$s0,`16-3`
	srl	$s0,`24-3`
	nr	$s0,$mask
	nr	$t2,$mask
	nr	$t3,$mask

	srlg	$i1,$s1,`16-3`	# i0
	sllg	$i2,$s1,`0+3`
	ngr	$i2,$mask
	srlg	$i3,$s1,`8-3`
	srl	$s1,`24-3`
	nr	$i1,$mask
	nr	$s1,$mask
	nr	$i3,$mask

	llgc	$s0,2($s0,$tbl)	# Te4[s0>>24]
	llgc	$t1,2($t1,$tbl)	# Te4[s0>>0]
	sll	$s0,24
	llgc	$t2,2($t2,$tbl)	# Te4[s0>>8]
	llgc	$t3,2($t3,$tbl)	# Te4[s0>>16]
	sll	$t2,8
	sll	$t3,16

	llgc	$i1,2($i1,$tbl)	# Te4[s1>>16]
	llgc	$s1,2($s1,$tbl)	# Te4[s1>>24]
	llgc	$i2,2($i2,$tbl)	# Te4[s1>>0]
	llgc	$i3,2($i3,$tbl)	# Te4[s1>>8]
	sll	$i1,16
	sll	$s1,24
	sll	$i3,8
	or	$s0,$i1
	or	$s1,$t1
	or	$t2,$i2
	or	$t3,$i3

	srlg	$i1,$s2,`8-3`	# i0
	srlg	$i2,$s2,`16-3`	# i1
	nr	$i1,$mask
	nr	$i2,$mask
	sllg	$i3,$s2,`0+3`
	srl	$s2,`24-3`
	ngr	$i3,$mask
	nr	$s2,$mask

	sllg	$t1,$s3,`0+3`	# i0
	srlg	$ra,$s3,`8-3`	# i1
	ngr	$t1,$mask

	llgc	$i1,2($i1,$tbl)	# Te4[s2>>8]
	llgc	$i2,2($i2,$tbl)	# Te4[s2>>16]
	sll	$i1,8
	llgc	$s2,2($s2,$tbl)	# Te4[s2>>24]
	llgc	$i3,2($i3,$tbl)	# Te4[s2>>0]
	sll	$i2,16
	nr	$ra,$mask
	sll	$s2,24
	or	$s0,$i1
	or	$s1,$i2
	or	$s2,$t2
	or	$t3,$i3

	srlg	$i3,$s3,`16-3`	# i2
	srl	$s3,`24-3`
	nr	$i3,$mask
	nr	$s3,$mask

	l	$t0,16($key)
	l	$t2,20($key)

	llgc	$i1,2($t1,$tbl)	# Te4[s3>>0]
	llgc	$i2,2($ra,$tbl)	# Te4[s3>>8]
	llgc	$i3,2($i3,$tbl)	# Te4[s3>>16]
	llgc	$s3,2($s3,$tbl)	# Te4[s3>>24]
	sll	$i2,8
	sll	$i3,16
	sll	$s3,24
	or	$s0,$i1
	or	$s1,$i2
	or	$s2,$i3
	or	$s3,$t3

	l${g}	$ra,15*$SIZE_T($sp)
	xr	$s0,$t0
	xr	$s1,$t2
	x	$s2,24($key)
	x	$s3,28($key)

	br	$ra
.size	_s390x_AES_encrypt,.-_s390x_AES_encrypt
___
456
# Decryption lookup table AES_Td; layout mirrors AES_Te (doubled Td0
# words filled in by &_data_word below, then the Td4 byte table).
$code.=<<___;
.type	AES_Td,\@object
.align	256
AES_Td:
___
# Td0 table contents, each word duplicated by _data_word so that byte
# offsets 0..3 select shifted copies (same trick as AES_Te).
&_data_word(
	0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
	0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
	0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
	0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
	0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
	0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
	0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
	0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
	0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
	0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
	0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
	0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
	0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
	0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
	0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
	0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
	0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
	0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
	0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
	0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
	0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
	0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
	0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
	0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
	0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
	0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
	0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
	0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
	0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
	0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
	0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
	0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
	0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
	0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
	0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
	0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
	0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
	0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
	0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
	0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
	0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
	0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
	0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
	0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
	0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
	0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
	0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
	0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
	0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
	0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
	0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
	0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
	0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
	0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
	0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
	0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
	0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
	0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
	0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
	0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
	0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
	0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
	0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
	0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
# Td4 (the inverse S-box, used for the last decryption round) closes
# AES_Td; AES_decrypt's public entry point follows, with its body
# emitted by the subsequent chunks.
$code.=<<___;
# Td4[256]
.byte	0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
.byte	0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
.byte	0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
.byte	0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
.byte	0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
.byte	0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
.byte	0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
.byte	0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
.byte	0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
.byte	0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
.byte	0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
.byte	0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
.byte	0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
.byte	0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
.byte	0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
.byte	0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
.byte	0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
.byte	0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
.byte	0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
.byte	0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
.byte	0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
.byte	0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
.byte	0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
.byte	0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
.byte	0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
.byte	0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
.byte	0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
.byte	0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
.byte	0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
.byte	0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
.byte	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
.byte	0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
.size	AES_Td,.-AES_Td

# void AES_decrypt(const unsigned char *inp, unsigned char *out,
# 		 const AES_KEY *key) {
.globl	AES_decrypt
.type	AES_decrypt,\@function
AES_decrypt:
___
# Hardware fast path for AES_decrypt, analogous to the encrypt side: a
# km function code stored at key+240 (with the "decrypt" bit 0x80 set
# by set_decrypt_key) drives the raw "km" opcode; values >= 16 mean a
# software round count, so fall through to .Ldsoft.
$code.=<<___ if (!$softonly);
	l	%r0,240($key)
	lhi	%r1,16
	clr	%r0,%r1
	jl	.Ldsoft

	la	%r1,0($key)
	#la	%r2,0($inp)
	la	%r4,0($out)
	lghi	%r3,16		# single block length
	.long	0xb92e0042	# km %r4,%r2
	brc	1,.-4		# can this happen?
	br	%r14
.align	64
.Ldsoft:
___
# Software AES_decrypt wrapper plus the inner _s390x_AES_decrypt
# routine.  Structure mirrors the encrypt path: Td0-based main rounds
# with pre-scaled indices ($mask = 0xff<<3), then a final round using
# the Td4 byte table located at offset 2048 from AES_Td.
$code.=<<___;
	stm${g}	%r3,$ra,3*$SIZE_T($sp)

	llgf	$s0,0($inp)
	llgf	$s1,4($inp)
	llgf	$s2,8($inp)
	llgf	$s3,12($inp)

	larl	$tbl,AES_Td
	bras	$ra,_s390x_AES_decrypt

	l${g}	$out,3*$SIZE_T($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)

	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	br	$ra
.size	AES_decrypt,.-AES_decrypt

.type   _s390x_AES_decrypt,\@function
.align	16
_s390x_AES_decrypt:
	st${g}	$ra,15*$SIZE_T($sp)
	x	$s0,0($key)
	x	$s1,4($key)
	x	$s2,8($key)
	x	$s3,12($key)
	l	$rounds,240($key)
	llill	$mask,`0xff<<3`
	aghi	$rounds,-1
	j	.Ldec_loop
.align	16
.Ldec_loop:
	srlg	$t1,$s0,`16-3`
	srlg	$t2,$s0,`8-3`
	sllg	$t3,$s0,`0+3`
	srl	$s0,`24-3`
	nr	$s0,$mask
	nr	$t1,$mask
	nr	$t2,$mask
	ngr	$t3,$mask

	sllg	$i1,$s1,`0+3`	# i0
	srlg	$i2,$s1,`16-3`
	srlg	$i3,$s1,`8-3`
	srl	$s1,`24-3`
	ngr	$i1,$mask
	nr	$s1,$mask
	nr	$i2,$mask
	nr	$i3,$mask

	l	$s0,0($s0,$tbl)	# Td0[s0>>24]
	l	$t1,3($t1,$tbl)	# Td1[s0>>16]
	l	$t2,2($t2,$tbl)	# Td2[s0>>8]
	l	$t3,1($t3,$tbl)	# Td3[s0>>0]

	x	$s0,1($i1,$tbl)	# Td3[s1>>0]
	l	$s1,0($s1,$tbl)	# Td0[s1>>24]
	x	$t2,3($i2,$tbl)	# Td1[s1>>16]
	x	$t3,2($i3,$tbl)	# Td2[s1>>8]

	srlg	$i1,$s2,`8-3`	# i0
	sllg	$i2,$s2,`0+3`	# i1
	srlg	$i3,$s2,`16-3`
	srl	$s2,`24-3`
	nr	$i1,$mask
	ngr	$i2,$mask
	nr	$s2,$mask
	nr	$i3,$mask

	xr	$s1,$t1
	srlg	$ra,$s3,`8-3`	# i1
	srlg	$t1,$s3,`16-3`	# i0
	nr	$ra,$mask
	la	$key,16($key)
	nr	$t1,$mask

	x	$s0,2($i1,$tbl)	# Td2[s2>>8]
	x	$s1,1($i2,$tbl)	# Td3[s2>>0]
	l	$s2,0($s2,$tbl)	# Td0[s2>>24]
	x	$t3,3($i3,$tbl)	# Td1[s2>>16]

	sllg	$i3,$s3,`0+3`	# i2
	srl	$s3,`24-3`
	ngr	$i3,$mask
	nr	$s3,$mask

	xr	$s2,$t2
	x	$s0,0($key)
	x	$s1,4($key)
	x	$s2,8($key)
	x	$t3,12($key)

	x	$s0,3($t1,$tbl)	# Td1[s3>>16]
	x	$s1,2($ra,$tbl)	# Td2[s3>>8]
	x	$s2,1($i3,$tbl)	# Td3[s3>>0]
	l	$s3,0($s3,$tbl)	# Td0[s3>>24]
	xr	$s3,$t3

	brct	$rounds,.Ldec_loop
	.align	16

	l	$t1,`2048+0`($tbl)	# prefetch Td4
	l	$t2,`2048+64`($tbl)
	l	$t3,`2048+128`($tbl)
	l	$i1,`2048+192`($tbl)
	llill	$mask,0xff

	srlg	$i3,$s0,24	# i0
	srlg	$t1,$s0,16
	srlg	$t2,$s0,8
	nr	$s0,$mask	# i3
	nr	$t1,$mask

	srlg	$i1,$s1,24
	nr	$t2,$mask
	srlg	$i2,$s1,16
	srlg	$ra,$s1,8
	nr	$s1,$mask	# i0
	nr	$i2,$mask
	nr	$ra,$mask

	llgc	$i3,2048($i3,$tbl)	# Td4[s0>>24]
	llgc	$t1,2048($t1,$tbl)	# Td4[s0>>16]
	llgc	$t2,2048($t2,$tbl)	# Td4[s0>>8]
	sll	$t1,16
	llgc	$t3,2048($s0,$tbl)	# Td4[s0>>0]
	sllg	$s0,$i3,24
	sll	$t2,8

	llgc	$s1,2048($s1,$tbl)	# Td4[s1>>0]
	llgc	$i1,2048($i1,$tbl)	# Td4[s1>>24]
	llgc	$i2,2048($i2,$tbl)	# Td4[s1>>16]
	sll	$i1,24
	llgc	$i3,2048($ra,$tbl)	# Td4[s1>>8]
	sll	$i2,16
	sll	$i3,8
	or	$s0,$s1
	or	$t1,$i1
	or	$t2,$i2
	or	$t3,$i3

	srlg	$i1,$s2,8	# i0
	srlg	$i2,$s2,24
	srlg	$i3,$s2,16
	nr	$s2,$mask	# i1
	nr	$i1,$mask
	nr	$i3,$mask
	llgc	$i1,2048($i1,$tbl)	# Td4[s2>>8]
	llgc	$s1,2048($s2,$tbl)	# Td4[s2>>0]
	llgc	$i2,2048($i2,$tbl)	# Td4[s2>>24]
	llgc	$i3,2048($i3,$tbl)	# Td4[s2>>16]
	sll	$i1,8
	sll	$i2,24
	or	$s0,$i1
	sll	$i3,16
	or	$t2,$i2
	or	$t3,$i3

	srlg	$i1,$s3,16	# i0
	srlg	$i2,$s3,8	# i1
	srlg	$i3,$s3,24
	nr	$s3,$mask	# i2
	nr	$i1,$mask
	nr	$i2,$mask

	l${g}	$ra,15*$SIZE_T($sp)
	or	$s1,$t1
	l	$t0,16($key)
	l	$t1,20($key)

	llgc	$i1,2048($i1,$tbl)	# Td4[s3>>16]
	llgc	$i2,2048($i2,$tbl)	# Td4[s3>>8]
	sll	$i1,16
	llgc	$s2,2048($s3,$tbl)	# Td4[s3>>0]
	llgc	$s3,2048($i3,$tbl)	# Td4[s3>>24]
	sll	$i2,8
	sll	$s3,24
	or	$s0,$i1
	or	$s1,$i2
	or	$s2,$t2
	or	$s3,$t3

	xr	$s0,$t0
	xr	$s1,$t1
	x	$s2,24($key)
	x	$s3,28($key)

	br	$ra
.size	_s390x_AES_decrypt,.-_s390x_AES_decrypt
___
778
# private_AES_set_encrypt_key entry: argument validation.  NULL input
# or key pointer returns -1 (.Lminus1); a bit length other than
# 128/192/256 returns -2; otherwise fall through to .Lproceed.  The
# _s390x_AES_set_encrypt_key alias is the internal entry used by
# private_AES_set_decrypt_key below.
$code.=<<___;
# void AES_set_encrypt_key(const unsigned char *in, int bits,
# 		 AES_KEY *key) {
.globl	private_AES_set_encrypt_key
.type	private_AES_set_encrypt_key,\@function
.align	16
private_AES_set_encrypt_key:
_s390x_AES_set_encrypt_key:
	lghi	$t0,0
	cl${g}r	$inp,$t0
	je	.Lminus1
	cl${g}r	$key,$t0
	je	.Lminus1

	lghi	$t0,128
	clr	$bits,$t0
	je	.Lproceed
	lghi	$t0,192
	clr	$bits,$t0
	je	.Lproceed
	lghi	$t0,256
	clr	$bits,$t0
	je	.Lproceed
	lghi	%r2,-2
	br	%r14

.align	16
.Lproceed:
___
# Hardware capability probe: map bits to a km function code
# (128/192/256 -> 18/19/20), check OPENSSL_s390xcap_P for the
# message-security assist, then issue the km query function (code 0,
# raw opcode 0xb92e... emitted below) and test the capability bit for
# the needed AES variant.  If available, the raw key bytes are simply
# copied into the schedule and the km code is stored at key+240 in
# place of a round count; $t0 is left holding that code for the
# set_decrypt_key caller.  Otherwise fall through to the software
# schedule at .Lekey_internal.
$code.=<<___ if (!$softonly);
	# convert bits to km code, [128,192,256]->[18,19,20]
	lhi	%r5,-128
	lhi	%r0,18
	ar	%r5,$bits
	srl	%r5,6
	ar	%r5,%r0

	larl	%r1,OPENSSL_s390xcap_P
	lg	%r0,0(%r1)
	tmhl	%r0,0x4000	# check for message-security assist
	jz	.Lekey_internal

	lghi	%r0,0		# query capability vector
	la	%r1,16($sp)
	.long	0xb92f0042	# kmc %r4,%r2

	llihh	%r1,0x8000
	srlg	%r1,%r1,0(%r5)
	ng	%r1,16($sp)
	jz	.Lekey_internal

	lmg	%r0,%r1,0($inp)	# just copy 128 bits...
	stmg	%r0,%r1,0($key)
	lhi	%r0,192
	cr	$bits,%r0
	jl	1f
	lg	%r1,16($inp)
	stg	%r1,16($key)
	je	1f
	lg	%r1,24($inp)
	stg	%r1,24($key)
1:	st	$bits,236($key)	# save bits [for debugging purposes]
	lgr	$t0,%r5
	st	%r5,240($key)	# save km code
	lghi	%r2,0
	br	%r14
___
# Software key schedule (.Lekey_internal): standard AES key expansion
# driven by the Te4 byte table at AES_Te+2048 and the rcon constants
# at offset 256 beyond it.  Separate unrolled loops handle 128-, 192-
# and 256-bit keys; on success %r2 is 0, the round count (10/12/14) is
# stored at key+240 and left in $t0 for the set_decrypt_key caller.
# .Lminus1 is the shared NULL-pointer error return (-1).
$code.=<<___;
.align	16
.Lekey_internal:
	stm${g}	%r4,%r13,4*$SIZE_T($sp)	# all non-volatile regs and $key

	larl	$tbl,AES_Te+2048

	llgf	$s0,0($inp)
	llgf	$s1,4($inp)
	llgf	$s2,8($inp)
	llgf	$s3,12($inp)
	st	$s0,0($key)
	st	$s1,4($key)
	st	$s2,8($key)
	st	$s3,12($key)
	lghi	$t0,128
	cr	$bits,$t0
	jne	.Lnot128

	llill	$mask,0xff
	lghi	$t3,0			# i=0
	lghi	$rounds,10
	st	$rounds,240($key)

	llgfr	$t2,$s3			# temp=rk[3]
	srlg	$i1,$s3,8
	srlg	$i2,$s3,16
	srlg	$i3,$s3,24
	nr	$t2,$mask
	nr	$i1,$mask
	nr	$i2,$mask

.align	16
.L128_loop:
	la	$t2,0($t2,$tbl)
	la	$i1,0($i1,$tbl)
	la	$i2,0($i2,$tbl)
	la	$i3,0($i3,$tbl)
	icm	$t2,2,0($t2)		# Te4[rk[3]>>0]<<8
	icm	$t2,4,0($i1)		# Te4[rk[3]>>8]<<16
	icm	$t2,8,0($i2)		# Te4[rk[3]>>16]<<24
	icm	$t2,1,0($i3)		# Te4[rk[3]>>24]
	x	$t2,256($t3,$tbl)	# rcon[i]
	xr	$s0,$t2			# rk[4]=rk[0]^...
	xr	$s1,$s0			# rk[5]=rk[1]^rk[4]
	xr	$s2,$s1			# rk[6]=rk[2]^rk[5]
	xr	$s3,$s2			# rk[7]=rk[3]^rk[6]

	llgfr	$t2,$s3			# temp=rk[3]
	srlg	$i1,$s3,8
	srlg	$i2,$s3,16
	nr	$t2,$mask
	nr	$i1,$mask
	srlg	$i3,$s3,24
	nr	$i2,$mask

	st	$s0,16($key)
	st	$s1,20($key)
	st	$s2,24($key)
	st	$s3,28($key)
	la	$key,16($key)		# key+=4
	la	$t3,4($t3)		# i++
	brct	$rounds,.L128_loop
	lghi	$t0,10
	lghi	%r2,0
	lm${g}	%r4,%r13,4*$SIZE_T($sp)
	br	$ra

.align	16
.Lnot128:
	llgf	$t0,16($inp)
	llgf	$t1,20($inp)
	st	$t0,16($key)
	st	$t1,20($key)
	lghi	$t0,192
	cr	$bits,$t0
	jne	.Lnot192

	llill	$mask,0xff
	lghi	$t3,0			# i=0
	lghi	$rounds,12
	st	$rounds,240($key)
	lghi	$rounds,8

	srlg	$i1,$t1,8
	srlg	$i2,$t1,16
	srlg	$i3,$t1,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask

.align	16
.L192_loop:
	la	$t1,0($t1,$tbl)
	la	$i1,0($i1,$tbl)
	la	$i2,0($i2,$tbl)
	la	$i3,0($i3,$tbl)
	icm	$t1,2,0($t1)		# Te4[rk[5]>>0]<<8
	icm	$t1,4,0($i1)		# Te4[rk[5]>>8]<<16
	icm	$t1,8,0($i2)		# Te4[rk[5]>>16]<<24
	icm	$t1,1,0($i3)		# Te4[rk[5]>>24]
	x	$t1,256($t3,$tbl)	# rcon[i]
	xr	$s0,$t1			# rk[6]=rk[0]^...
	xr	$s1,$s0			# rk[7]=rk[1]^rk[6]
	xr	$s2,$s1			# rk[8]=rk[2]^rk[7]
	xr	$s3,$s2			# rk[9]=rk[3]^rk[8]

	st	$s0,24($key)
	st	$s1,28($key)
	st	$s2,32($key)
	st	$s3,36($key)
	brct	$rounds,.L192_continue
	lghi	$t0,12
	lghi	%r2,0
	lm${g}	%r4,%r13,4*$SIZE_T($sp)
	br	$ra

.align	16
.L192_continue:
	lgr	$t1,$s3
	x	$t1,16($key)		# rk[10]=rk[4]^rk[9]
	st	$t1,40($key)
	x	$t1,20($key)		# rk[11]=rk[5]^rk[10]
	st	$t1,44($key)

	srlg	$i1,$t1,8
	srlg	$i2,$t1,16
	srlg	$i3,$t1,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask

	la	$key,24($key)		# key+=6
	la	$t3,4($t3)		# i++
	j	.L192_loop

.align	16
.Lnot192:
	llgf	$t0,24($inp)
	llgf	$t1,28($inp)
	st	$t0,24($key)
	st	$t1,28($key)
	llill	$mask,0xff
	lghi	$t3,0			# i=0
	lghi	$rounds,14
	st	$rounds,240($key)
	lghi	$rounds,7

	srlg	$i1,$t1,8
	srlg	$i2,$t1,16
	srlg	$i3,$t1,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask

.align	16
.L256_loop:
	la	$t1,0($t1,$tbl)
	la	$i1,0($i1,$tbl)
	la	$i2,0($i2,$tbl)
	la	$i3,0($i3,$tbl)
	icm	$t1,2,0($t1)		# Te4[rk[7]>>0]<<8
	icm	$t1,4,0($i1)		# Te4[rk[7]>>8]<<16
	icm	$t1,8,0($i2)		# Te4[rk[7]>>16]<<24
	icm	$t1,1,0($i3)		# Te4[rk[7]>>24]
	x	$t1,256($t3,$tbl)	# rcon[i]
	xr	$s0,$t1			# rk[8]=rk[0]^...
	xr	$s1,$s0			# rk[9]=rk[1]^rk[8]
	xr	$s2,$s1			# rk[10]=rk[2]^rk[9]
	xr	$s3,$s2			# rk[11]=rk[3]^rk[10]
	st	$s0,32($key)
	st	$s1,36($key)
	st	$s2,40($key)
	st	$s3,44($key)
	brct	$rounds,.L256_continue
	lghi	$t0,14
	lghi	%r2,0
	lm${g}	%r4,%r13,4*$SIZE_T($sp)
	br	$ra

.align	16
.L256_continue:
	lgr	$t1,$s3			# temp=rk[11]
	srlg	$i1,$s3,8
	srlg	$i2,$s3,16
	srlg	$i3,$s3,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask
	la	$t1,0($t1,$tbl)
	la	$i1,0($i1,$tbl)
	la	$i2,0($i2,$tbl)
	la	$i3,0($i3,$tbl)
	llgc	$t1,0($t1)		# Te4[rk[11]>>0]
	icm	$t1,2,0($i1)		# Te4[rk[11]>>8]<<8
	icm	$t1,4,0($i2)		# Te4[rk[11]>>16]<<16
	icm	$t1,8,0($i3)		# Te4[rk[11]>>24]<<24
	x	$t1,16($key)		# rk[12]=rk[4]^...
	st	$t1,48($key)
	x	$t1,20($key)		# rk[13]=rk[5]^rk[12]
	st	$t1,52($key)
	x	$t1,24($key)		# rk[14]=rk[6]^rk[13]
	st	$t1,56($key)
	x	$t1,28($key)		# rk[15]=rk[7]^rk[14]
	st	$t1,60($key)

	srlg	$i1,$t1,8
	srlg	$i2,$t1,16
	srlg	$i3,$t1,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask

	la	$key,32($key)		# key+=8
	la	$t3,4($t3)		# i++
	j	.L256_loop

.Lminus1:
	lghi	%r2,-1
	br	$ra
.size	private_AES_set_encrypt_key,.-private_AES_set_encrypt_key

# void AES_set_decrypt_key(const unsigned char *in, int bits,
# 		 AES_KEY *key) {
.globl	private_AES_set_decrypt_key
.type	private_AES_set_decrypt_key,\@function
.align	16
private_AES_set_decrypt_key:
	#st${g}	$key,4*$SIZE_T($sp)	# I rely on AES_set_encrypt_key to
	st${g}	$ra,14*$SIZE_T($sp)	# save non-volatile registers and $key!
	bras	$ra,_s390x_AES_set_encrypt_key
	#l${g}	$key,4*$SIZE_T($sp)
	l${g}	$ra,14*$SIZE_T($sp)
	ltgr	%r2,%r2
	bnzr	$ra
___
# Decrypt-key hardware shortcut: $t0 still holds whatever
# _s390x_AES_set_encrypt_key left there — a km function code (<16,
# hardware path) or a round count 10/12/14 (software path).  A round
# count branches to the software fix-up at .Lgo; a km code gets the
# "decrypt" bit 0x80 OR-ed in and is stored back at key+240.
$code.=<<___ if (!$softonly);
	#l	$t0,240($key)
	lhi	$t1,16
	cr	$t0,$t1
	jl	.Lgo
	oill	$t0,0x80	# set "decrypt" bit
	st	$t0,240($key)
	br	$ra
___
# Software decrypt schedule, step 1 (.Lgo/.Linv): reverse the order of
# the round keys by swapping 16-byte blocks from the two ends of the
# schedule toward the middle ($i1 walks forward, $i2 backward).
$code.=<<___;
.align	16
.Lgo:	lgr	$rounds,$t0	#llgf	$rounds,240($key)
	la	$i1,0($key)
	sllg	$i2,$rounds,4
	la	$i2,0($i2,$key)
	srl	$rounds,1
	lghi	$t1,-16

.align	16
.Linv:	lmg	$s0,$s1,0($i1)
	lmg	$s2,$s3,0($i2)
	stmg	$s0,$s1,0($i2)
	stmg	$s2,$s3,0($i1)
	la	$i1,16($i1)
	la	$i2,0($t1,$i2)
	brct	$rounds,.Linv
___
# $i1/$i2/$i3 are free after .Linv; reuse them as the bit masks for
# the GF(2^8) doubling (xtime) steps in .Lmix below.
$mask80=$i1;
$mask1b=$i2;
$maskfe=$i3;
# Software decrypt schedule, step 2 (.Lmix): apply InvMixColumns to
# every round key word except those of the first and last round
# (hence the 16($key) offset and (rounds-1)*4 iterations).  Each word
# is doubled three times in GF(2^8) via the masked-xtime idiom
# (0x80.. selects high bits, 0x1b.. is the reduction polynomial,
# 0xfe.. masks the shifted value) to get tp2/tp4/tp8, which are then
# combined with rotates per the standard four-table decomposition.
$code.=<<___;
	llgf	$rounds,240($key)
	aghi	$rounds,-1
	sll	$rounds,2	# (rounds-1)*4
	llilh	$mask80,0x8080
	llilh	$mask1b,0x1b1b
	llilh	$maskfe,0xfefe
	oill	$mask80,0x8080
	oill	$mask1b,0x1b1b
	oill	$maskfe,0xfefe

.align	16
.Lmix:	l	$s0,16($key)	# tp1
	lr	$s1,$s0
	ngr	$s1,$mask80
	srlg	$t1,$s1,7
	slr	$s1,$t1
	nr	$s1,$mask1b
	sllg	$t1,$s0,1
	nr	$t1,$maskfe
	xr	$s1,$t1		# tp2

	lr	$s2,$s1
	ngr	$s2,$mask80
	srlg	$t1,$s2,7
	slr	$s2,$t1
	nr	$s2,$mask1b
	sllg	$t1,$s1,1
	nr	$t1,$maskfe
	xr	$s2,$t1		# tp4

	lr	$s3,$s2
	ngr	$s3,$mask80
	srlg	$t1,$s3,7
	slr	$s3,$t1
	nr	$s3,$mask1b
	sllg	$t1,$s2,1
	nr	$t1,$maskfe
	xr	$s3,$t1		# tp8

	xr	$s1,$s0		# tp2^tp1
	xr	$s2,$s0		# tp4^tp1
	rll	$s0,$s0,24	# = ROTATE(tp1,8)
	xr	$s2,$s3		# ^=tp8
	xr	$s0,$s1		# ^=tp2^tp1
	xr	$s1,$s3		# tp2^tp1^tp8
	xr	$s0,$s2		# ^=tp4^tp1^tp8
	rll	$s1,$s1,8
	rll	$s2,$s2,16
	xr	$s0,$s1		# ^= ROTATE(tp8^tp2^tp1,24)
	rll	$s3,$s3,24
	xr	$s0,$s2    	# ^= ROTATE(tp8^tp4^tp1,16)
	xr	$s0,$s3		# ^= ROTATE(tp8,8)

	st	$s0,16($key)
	la	$key,4($key)
	brct	$rounds,.Lmix

	lm${g}	%r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key!
	lghi	%r2,0
	br	$ra
.size	private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
___
1175
1176########################################################################
1177# void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
1178#                     size_t length, const AES_KEY *key,
1179#                     unsigned char *ivec, const int enc)
1180{
# Register assignment for AES_cbc_encrypt.  Note that out and length come
# in swapped relative to the C prototype registers; the entry code below
# exchanges %r3 and %r4 so the aliases hold.
my $inp="%r2";
my $out="%r4";	# length and out are swapped
my $len="%r3";
my $key="%r5";
my $ivp="%r6";

$code.=<<___;
.globl	AES_cbc_encrypt
.type	AES_cbc_encrypt,\@function
.align	16
AES_cbc_encrypt:
	xgr	%r3,%r4		# flip %r3 and %r4, out and len
	xgr	%r4,%r3
	xgr	%r3,%r4
___
# Hardware CBC path: use the KMC (cipher message with chaining) instruction.
# The parameter block at 16($sp) is ivec||key; a trailing partial block
# (len%16 != 0) is handled by the .Lkmc_truncated code via an ex-patched mvc.
$code.=<<___ if (!$softonly);
	lhi	%r0,16
	cl	%r0,240($key)
	jh	.Lcbc_software

	lg	%r0,0($ivp)	# copy ivec
	lg	%r1,8($ivp)
	stmg	%r0,%r1,16($sp)
	lmg	%r0,%r1,0($key)	# copy key, cover 256 bit
	stmg	%r0,%r1,32($sp)
	lmg	%r0,%r1,16($key)
	stmg	%r0,%r1,48($sp)
	l	%r0,240($key)	# load kmc code
	lghi	$key,15		# res=len%16, len-=res;
	ngr	$key,$len
	sl${g}r	$len,$key
	la	%r1,16($sp)	# parameter block - ivec || key
	jz	.Lkmc_truncated
	.long	0xb92f0042	# kmc %r4,%r2
	brc	1,.-4		# pay attention to "partial completion"
	ltr	$key,$key
	jnz	.Lkmc_truncated
.Lkmc_done:
	lmg	%r0,%r1,16($sp)	# copy ivec to caller
	stg	%r0,0($ivp)
	stg	%r1,8($ivp)
	br	$ra
.align	16
.Lkmc_truncated:
	ahi	$key,-1		# it's the way it's encoded in mvc
	tmll	%r0,0x80
	jnz	.Lkmc_truncated_dec
	lghi	%r1,0
	stg	%r1,16*$SIZE_T($sp)
	stg	%r1,16*$SIZE_T+8($sp)
	bras	%r1,1f
	mvc	16*$SIZE_T(1,$sp),0($inp)
1:	ex	$key,0(%r1)
	la	%r1,16($sp)	# restore parameter block
	la	$inp,16*$SIZE_T($sp)
	lghi	$len,16
	.long	0xb92f0042	# kmc %r4,%r2
	j	.Lkmc_done
.align	16
.Lkmc_truncated_dec:
	st${g}	$out,4*$SIZE_T($sp)
	la	$out,16*$SIZE_T($sp)
	lghi	$len,16
	.long	0xb92f0042	# kmc %r4,%r2
	l${g}	$out,4*$SIZE_T($sp)
	bras	%r1,2f
	mvc	0(1,$out),16*$SIZE_T($sp)
2:	ex	$key,0(%r1)
	j	.Lkmc_done
.align	16
.Lcbc_software:
___
# Software CBC path, one 16-byte block at a time through the table-driven
# _s390x_AES_encrypt/_s390x_AES_decrypt subroutines.  A short final block on
# encrypt is zero-padded in the stack buffer (.Lcbc_enc_tail); on decrypt
# the partial output is copied out with an ex-patched mvc (.Lcbc_dec_tail).
$code.=<<___;
	stm${g}	$key,$ra,5*$SIZE_T($sp)
	lhi	%r0,0
	cl	%r0,`$stdframe+$SIZE_T-4`($sp)
	je	.Lcbc_decrypt

	larl	$tbl,AES_Te

	llgf	$s0,0($ivp)
	llgf	$s1,4($ivp)
	llgf	$s2,8($ivp)
	llgf	$s3,12($ivp)

	lghi	$t0,16
	sl${g}r	$len,$t0
	brc	4,.Lcbc_enc_tail	# if borrow
.Lcbc_enc_loop:
	stm${g}	$inp,$out,2*$SIZE_T($sp)
	x	$s0,0($inp)
	x	$s1,4($inp)
	x	$s2,8($inp)
	x	$s3,12($inp)
	lgr	%r4,$key

	bras	$ra,_s390x_AES_encrypt

	lm${g}	$inp,$key,2*$SIZE_T($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)

	la	$inp,16($inp)
	la	$out,16($out)
	lghi	$t0,16
	lt${g}r	$len,$len
	jz	.Lcbc_enc_done
	sl${g}r	$len,$t0
	brc	4,.Lcbc_enc_tail	# if borrow
	j	.Lcbc_enc_loop
.align	16
.Lcbc_enc_done:
	l${g}	$ivp,6*$SIZE_T($sp)
	st	$s0,0($ivp)
	st	$s1,4($ivp)
	st	$s2,8($ivp)
	st	$s3,12($ivp)

	lm${g}	%r7,$ra,7*$SIZE_T($sp)
	br	$ra

.align	16
.Lcbc_enc_tail:
	aghi	$len,15
	lghi	$t0,0
	stg	$t0,16*$SIZE_T($sp)
	stg	$t0,16*$SIZE_T+8($sp)
	bras	$t1,3f
	mvc	16*$SIZE_T(1,$sp),0($inp)
3:	ex	$len,0($t1)
	lghi	$len,0
	la	$inp,16*$SIZE_T($sp)
	j	.Lcbc_enc_loop

.align	16
.Lcbc_decrypt:
	larl	$tbl,AES_Td

	lg	$t0,0($ivp)
	lg	$t1,8($ivp)
	stmg	$t0,$t1,16*$SIZE_T($sp)

.Lcbc_dec_loop:
	stm${g}	$inp,$out,2*$SIZE_T($sp)
	llgf	$s0,0($inp)
	llgf	$s1,4($inp)
	llgf	$s2,8($inp)
	llgf	$s3,12($inp)
	lgr	%r4,$key

	bras	$ra,_s390x_AES_decrypt

	lm${g}	$inp,$key,2*$SIZE_T($sp)
	sllg	$s0,$s0,32
	sllg	$s2,$s2,32
	lr	$s0,$s1
	lr	$s2,$s3

	lg	$t0,0($inp)
	lg	$t1,8($inp)
	xg	$s0,16*$SIZE_T($sp)
	xg	$s2,16*$SIZE_T+8($sp)
	lghi	$s1,16
	sl${g}r	$len,$s1
	brc	4,.Lcbc_dec_tail	# if borrow
	brc	2,.Lcbc_dec_done	# if zero
	stg	$s0,0($out)
	stg	$s2,8($out)
	stmg	$t0,$t1,16*$SIZE_T($sp)

	la	$inp,16($inp)
	la	$out,16($out)
	j	.Lcbc_dec_loop

.Lcbc_dec_done:
	stg	$s0,0($out)
	stg	$s2,8($out)
.Lcbc_dec_exit:
	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	stmg	$t0,$t1,0($ivp)

	br	$ra

.align	16
.Lcbc_dec_tail:
	aghi	$len,15
	stg	$s0,16*$SIZE_T($sp)
	stg	$s2,16*$SIZE_T+8($sp)
	bras	$s1,4f
	mvc	0(1,$out),16*$SIZE_T($sp)
4:	ex	$len,0($s1)
	j	.Lcbc_dec_exit
.size	AES_cbc_encrypt,.-AES_cbc_encrypt
___
1377}
1378########################################################################
1379# void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
1380#                     size_t blocks, const AES_KEY *key,
1381#                     const unsigned char *ivec)
1382{
# Register assignment for AES_ctr32_encrypt.  blocks and out arrive swapped
# relative to the aliases; entry code exchanges %r3 and %r4.  $key and $iv0
# deliberately share %r5: once the key pointer has been copied to %r1 the
# register is reused for the upper half of the counter block.
my $inp="%r2";
my $out="%r4";	# blocks and out are swapped
my $len="%r3";
my $key="%r5";	my $iv0="%r5";
my $ivp="%r6";
my $fp ="%r7";

$code.=<<___;
.globl	AES_ctr32_encrypt
.type	AES_ctr32_encrypt,\@function
.align	16
AES_ctr32_encrypt:
	xgr	%r3,%r4		# flip %r3 and %r4, $out and $len
	xgr	%r4,%r3
	xgr	%r3,%r4
	llgfr	$len,$len	# safe in ctr32 subroutine even in 64-bit case
___
# Hardware CTR path setup: carve a scratch buffer out of the current 4K
# page (keeping 1K reserved below $sp for signal handlers) to hold a batch
# of counter blocks, then clamp the first batch to the input length.
$code.=<<___ if (!$softonly);
	l	%r0,240($key)
	lhi	%r1,16
	clr	%r0,%r1
	jl	.Lctr32_software

	stm${g}	%r6,$s3,6*$SIZE_T($sp)

	slgr	$out,$inp
	la	%r1,0($key)	# %r1 is permanent copy of $key
	lg	$iv0,0($ivp)	# load ivec
	lg	$ivp,8($ivp)

	# prepare and allocate stack frame at the top of 4K page
	# with 1K reserved for eventual signal handling
	lghi	$s0,-1024-256-16# guarantee at least 256-bytes buffer
	lghi	$s1,-4096
	algr	$s0,$sp
	lgr	$fp,$sp
	ngr	$s0,$s1		# align at page boundary
	slgr	$fp,$s0		# total buffer size
	lgr	$s2,$sp
	lghi	$s1,1024+16	# sl[g]fi is extended-immediate facility
	slgr	$fp,$s1		# deduct reservation to get usable buffer size
	# buffer size is at least 256 and at most 3072+256-16

	la	$sp,1024($s0)	# alloca
	srlg	$fp,$fp,4	# convert bytes to blocks, minimum 16
	st${g}	$s2,0($sp)	# back-chain
	st${g}	$fp,$SIZE_T($sp)

	slgr	$len,$fp
	brc	1,.Lctr32_hw_switch	# not zero, no borrow
	algr	$fp,$len	# input is shorter than allocated buffer
	lghi	$len,0
	st${g}	$fp,$SIZE_T($sp)

.Lctr32_hw_switch:
___
# NOTE: this whole section is compiled out (`if (0)`) — the dedicated KMCTR
# instruction was measured slower than the KM-based loop below.  Kept for
# reference; it probes the MSA-4 capability vector before use.
$code.=<<___ if (0);	######### kmctr code was measured to be ~12% slower
	larl	$s0,OPENSSL_s390xcap_P
	lg	$s0,8($s0)
	tmhh	$s0,0x0004	# check for message_security-assist-4
	jz	.Lctr32_km_loop

	llgfr	$s0,%r0
	lgr	$s1,%r1
	lghi	%r0,0
	la	%r1,16($sp)
	.long	0xb92d2042	# kmctr %r4,%r2,%r2

	llihh	%r0,0x8000	# check if kmctr supports the function code
	srlg	%r0,%r0,0($s0)
	ng	%r0,16($sp)
	lgr	%r0,$s0
	lgr	%r1,$s1
	jz	.Lctr32_km_loop

####### kmctr code
	algr	$out,$inp	# restore $out
	lgr	$s1,$len	# $s1 undertakes $len
	j	.Lctr32_kmctr_loop
.align	16
.Lctr32_kmctr_loop:
	la	$s2,16($sp)
	lgr	$s3,$fp
.Lctr32_kmctr_prepare:
	stg	$iv0,0($s2)
	stg	$ivp,8($s2)
	la	$s2,16($s2)
	ahi	$ivp,1		# 32-bit increment, preserves upper half
	brct	$s3,.Lctr32_kmctr_prepare

	#la	$inp,0($inp)	# inp
	sllg	$len,$fp,4	# len
	#la	$out,0($out)	# out
	la	$s2,16($sp)	# iv
	.long	0xb92da042	# kmctr $out,$s2,$inp
	brc	1,.-4		# pay attention to "partial completion"

	slgr	$s1,$fp
	brc	1,.Lctr32_kmctr_loop	# not zero, no borrow
	algr	$fp,$s1
	lghi	$s1,0
	brc	4+1,.Lctr32_kmctr_loop	# not zero

	l${g}	$sp,0($sp)
	lm${g}	%r6,$s3,6*$SIZE_T($sp)
	br	$ra
.align	16
___
# CTR via KM: materialize a batch of counter blocks in the stack buffer,
# ECB-encrypt them in place with KM, then XOR the keystream against the
# input.  The buffer is zapped (.Lctr32_km_zap) before the frame is freed.
$code.=<<___;
.Lctr32_km_loop:
	la	$s2,16($sp)
	lgr	$s3,$fp
.Lctr32_km_prepare:
	stg	$iv0,0($s2)
	stg	$ivp,8($s2)
	la	$s2,16($s2)
	ahi	$ivp,1		# 32-bit increment, preserves upper half
	brct	$s3,.Lctr32_km_prepare

	la	$s0,16($sp)	# inp
	sllg	$s1,$fp,4	# len
	la	$s2,16($sp)	# out
	.long	0xb92e00a8	# km %r10,%r8
	brc	1,.-4		# pay attention to "partial completion"

	la	$s2,16($sp)
	lgr	$s3,$fp
	slgr	$s2,$inp
.Lctr32_km_xor:
	lg	$s0,0($inp)
	lg	$s1,8($inp)
	xg	$s0,0($s2,$inp)
	xg	$s1,8($s2,$inp)
	stg	$s0,0($out,$inp)
	stg	$s1,8($out,$inp)
	la	$inp,16($inp)
	brct	$s3,.Lctr32_km_xor

	slgr	$len,$fp
	brc	1,.Lctr32_km_loop	# not zero, no borrow
	algr	$fp,$len
	lghi	$len,0
	brc	4+1,.Lctr32_km_loop	# not zero

	l${g}	$s0,0($sp)
	l${g}	$s1,$SIZE_T($sp)
	la	$s2,16($sp)
.Lctr32_km_zap:
	stg	$s0,0($s2)
	stg	$s0,8($s2)
	la	$s2,16($s2)
	brct	$s1,.Lctr32_km_zap

	la	$sp,0($s0)
	lm${g}	%r6,$s3,6*$SIZE_T($sp)
	br	$ra
.align	16
.Lctr32_software:
___
# Software CTR path: encrypt ivec||counter per block with the table-driven
# routine and XOR against input; only the low 32-bit counter word ($t1) is
# incremented, per the ctr32 contract.
$code.=<<___;
	stm${g}	$key,$ra,5*$SIZE_T($sp)
	sl${g}r	$inp,$out
	larl	$tbl,AES_Te
	llgf	$t1,12($ivp)

.Lctr32_loop:
	stm${g}	$inp,$out,2*$SIZE_T($sp)
	llgf	$s0,0($ivp)
	llgf	$s1,4($ivp)
	llgf	$s2,8($ivp)
	lgr	$s3,$t1
	st	$t1,16*$SIZE_T($sp)
	lgr	%r4,$key

	bras	$ra,_s390x_AES_encrypt

	lm${g}	$inp,$ivp,2*$SIZE_T($sp)
	llgf	$t1,16*$SIZE_T($sp)
	x	$s0,0($inp,$out)
	x	$s1,4($inp,$out)
	x	$s2,8($inp,$out)
	x	$s3,12($inp,$out)
	stm	$s0,$s3,0($out)

	la	$out,16($out)
	ahi	$t1,1		# 32-bit increment
	brct	$len,.Lctr32_loop

	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	br	$ra
.size	AES_ctr32_encrypt,.-AES_ctr32_encrypt
___
1575}
1576
1577########################################################################
1578# void AES_xts_encrypt(const char *inp,char *out,size_t len,
1579#	const AES_KEY *key1, const AES_KEY *key2,
1580#	const unsigned char iv[16]);
1581#
1582{
# Register assignment for the XTS routines.  $tweak is the stack offset of
# the 16-byte tweak slot at the bottom of the caller-saved area.
my $inp="%r2";
my $out="%r4";	# len and out are swapped
my $len="%r3";
my $key1="%r5";	# $i1
my $key2="%r6";	# $i2
my $fp="%r7";	# $i3
my $tweak=16*$SIZE_T+16;	# or $stdframe-16, bottom of the frame...

# _s390x_xts_km: shared KM-based bulk XTS worker used by both the encrypt
# and decrypt entry points (function code in %r0 selects direction).
$code.=<<___;
.type	_s390x_xts_km,\@function
.align	16
_s390x_xts_km:
___
# Probe the KM capability vector for the dedicated XTS function codes
# (function code + 32).  If available, run the whole job with a single
# KM-XTS invocation; otherwise fall through to the vanilla KM code.
$code.=<<___ if(1);
	llgfr	$s0,%r0			# put aside the function code
	lghi	$s1,0x7f
	nr	$s1,%r0
	lghi	%r0,0			# query capability vector
	la	%r1,$tweak-16($sp)
	.long	0xb92e0042		# km %r4,%r2
	llihh	%r1,0x8000
	srlg	%r1,%r1,32($s1)		# check for 32+function code
	ng	%r1,$tweak-16($sp)
	lgr	%r0,$s0			# restore the function code
	la	%r1,0($key1)		# restore $key1
	jz	.Lxts_km_vanilla

	lmg	$i2,$i3,$tweak($sp)	# put aside the tweak value
	algr	$out,$inp

	oill	%r0,32			# switch to xts function code
	aghi	$s1,-18			#
	sllg	$s1,$s1,3		# (function code - 18)*8, 0 or 16
	la	%r1,$tweak-16($sp)
	slgr	%r1,$s1			# parameter block position
	lmg	$s0,$s3,0($key1)	# load 256 bits of key material,
	stmg	$s0,$s3,0(%r1)		# and copy it to parameter block.
					# yes, it contains junk and overlaps
					# with the tweak in 128-bit case.
					# it's done to avoid conditional
					# branch.
	stmg	$i2,$i3,$tweak($sp)	# "re-seat" the tweak value

	.long	0xb92e0042		# km %r4,%r2
	brc	1,.-4			# pay attention to "partial completion"

	lrvg	$s0,$tweak+0($sp)	# load the last tweak
	lrvg	$s1,$tweak+8($sp)
	stmg	%r0,%r3,$tweak-32($sp)	# wipe copy of the key

	nill	%r0,0xffdf		# switch back to original function code
	la	%r1,0($key1)		# restore pointer to $key1
	slgr	$out,$inp

	llgc	$len,2*$SIZE_T-1($sp)
	nill	$len,0x0f		# $len%=16
	br	$ra

.align	16
.Lxts_km_vanilla:
___
# Vanilla KM fallback: batch tweak generation (GF(2^128) doubling with the
# x^128+x^7+x^2+x+1 reduction constant 0x87) into the page-local buffer,
# XOR/KM/XOR per batch, then wipe and release the buffer.  Also emits the
# AES_xts_encrypt entry point.
$code.=<<___;
	# prepare and allocate stack frame at the top of 4K page
	# with 1K reserved for eventual signal handling
	lghi	$s0,-1024-256-16# guarantee at least 256-bytes buffer
	lghi	$s1,-4096
	algr	$s0,$sp
	lgr	$fp,$sp
	ngr	$s0,$s1		# align at page boundary
	slgr	$fp,$s0		# total buffer size
	lgr	$s2,$sp
	lghi	$s1,1024+16	# sl[g]fi is extended-immediate facility
	slgr	$fp,$s1		# deduct reservation to get usable buffer size
	# buffer size is at least 256 and at most 3072+256-16

	la	$sp,1024($s0)	# alloca
	nill	$fp,0xfff0	# round to 16*n
	st${g}	$s2,0($sp)	# back-chain
	nill	$len,0xfff0	# redundant
	st${g}	$fp,$SIZE_T($sp)

	slgr	$len,$fp
	brc	1,.Lxts_km_go	# not zero, no borrow
	algr	$fp,$len	# input is shorter than allocated buffer
	lghi	$len,0
	st${g}	$fp,$SIZE_T($sp)

.Lxts_km_go:
	lrvg	$s0,$tweak+0($s2)	# load the tweak value in little-endian
	lrvg	$s1,$tweak+8($s2)

	la	$s2,16($sp)		# vector of ascending tweak values
	slgr	$s2,$inp
	srlg	$s3,$fp,4
	j	.Lxts_km_start

.Lxts_km_loop:
	la	$s2,16($sp)
	slgr	$s2,$inp
	srlg	$s3,$fp,4
.Lxts_km_prepare:
	lghi	$i1,0x87
	srag	$i2,$s1,63		# broadcast upper bit
	ngr	$i1,$i2			# rem
	algr	$s0,$s0
	alcgr	$s1,$s1
	xgr	$s0,$i1
.Lxts_km_start:
	lrvgr	$i1,$s0			# flip byte order
	lrvgr	$i2,$s1
	stg	$i1,0($s2,$inp)
	stg	$i2,8($s2,$inp)
	xg	$i1,0($inp)
	xg	$i2,8($inp)
	stg	$i1,0($out,$inp)
	stg	$i2,8($out,$inp)
	la	$inp,16($inp)
	brct	$s3,.Lxts_km_prepare

	slgr	$inp,$fp		# rewind $inp
	la	$s2,0($out,$inp)
	lgr	$s3,$fp
	.long	0xb92e00aa		# km $s2,$s2
	brc	1,.-4			# pay attention to "partial completion"

	la	$s2,16($sp)
	slgr	$s2,$inp
	srlg	$s3,$fp,4
.Lxts_km_xor:
	lg	$i1,0($out,$inp)
	lg	$i2,8($out,$inp)
	xg	$i1,0($s2,$inp)
	xg	$i2,8($s2,$inp)
	stg	$i1,0($out,$inp)
	stg	$i2,8($out,$inp)
	la	$inp,16($inp)
	brct	$s3,.Lxts_km_xor

	slgr	$len,$fp
	brc	1,.Lxts_km_loop		# not zero, no borrow
	algr	$fp,$len
	lghi	$len,0
	brc	4+1,.Lxts_km_loop	# not zero

	l${g}	$i1,0($sp)		# back-chain
	llgf	$fp,`2*$SIZE_T-4`($sp)	# bytes used
	la	$i2,16($sp)
	srlg	$fp,$fp,4
.Lxts_km_zap:
	stg	$i1,0($i2)
	stg	$i1,8($i2)
	la	$i2,16($i2)
	brct	$fp,.Lxts_km_zap

	la	$sp,0($i1)
	llgc	$len,2*$SIZE_T-1($i1)
	nill	$len,0x0f		# $len%=16
	bzr	$ra

	# generate one more tweak...
	lghi	$i1,0x87
	srag	$i2,$s1,63		# broadcast upper bit
	ngr	$i1,$i2			# rem
	algr	$s0,$s0
	alcgr	$s1,$s1
	xgr	$s0,$i1

	ltr	$len,$len		# clear zero flag
	br	$ra
.size	_s390x_xts_km,.-_s390x_xts_km

.globl	AES_xts_encrypt
.type	AES_xts_encrypt,\@function
.align	16
AES_xts_encrypt:
	xgr	%r3,%r4			# flip %r3 and %r4, $out and $len
	xgr	%r4,%r3
	xgr	%r3,%r4
___
# 31-bit build: zero-extend $len before use in 64-bit arithmetic.
$code.=<<___ if ($SIZE_T==4);
	llgfr	$len,$len
___
# Save the byte length and convert to a block count; bail out early for
# inputs shorter than one block.
$code.=<<___;
	st${g}	$len,1*$SIZE_T($sp)	# save copy of $len
	srag	$len,$len,4		# formally wrong, because it expands
					# sign byte, but who can afford asking
					# to process more than 2^63-1 bytes?
					# I use it, because it sets condition
					# code...
	bcr	8,$ra			# abort if zero (i.e. less than 16)
___
# Hardware XTS-encrypt path: derive the tweak by KM-encrypting the iv under
# $key2, run the bulk via _s390x_xts_km, then handle ciphertext stealing
# (.Lxts_enc_km_steal) for a trailing partial block.
$code.=<<___ if (!$softonly);
	llgf	%r0,240($key2)
	lhi	%r1,16
	clr	%r0,%r1
	jl	.Lxts_enc_software

	st${g}	$ra,5*$SIZE_T($sp)
	stm${g}	%r6,$s3,6*$SIZE_T($sp)

	sllg	$len,$len,4		# $len&=~15
	slgr	$out,$inp

	# generate the tweak value
	l${g}	$s3,$stdframe($sp)	# pointer to iv
	la	$s2,$tweak($sp)
	lmg	$s0,$s1,0($s3)
	lghi	$s3,16
	stmg	$s0,$s1,0($s2)
	la	%r1,0($key2)		# $key2 is not needed anymore
	.long	0xb92e00aa		# km $s2,$s2, generate the tweak
	brc	1,.-4			# can this happen?

	l	%r0,240($key1)
	la	%r1,0($key1)		# $key1 is not needed anymore
	bras	$ra,_s390x_xts_km
	jz	.Lxts_enc_km_done

	aghi	$inp,-16		# take one step back
	la	$i3,0($out,$inp)	# put aside real $out
.Lxts_enc_km_steal:
	llgc	$i1,16($inp)
	llgc	$i2,0($out,$inp)
	stc	$i1,0($out,$inp)
	stc	$i2,16($out,$inp)
	la	$inp,1($inp)
	brct	$len,.Lxts_enc_km_steal

	la	$s2,0($i3)
	lghi	$s3,16
	lrvgr	$i1,$s0			# flip byte order
	lrvgr	$i2,$s1
	xg	$i1,0($s2)
	xg	$i2,8($s2)
	stg	$i1,0($s2)
	stg	$i2,8($s2)
	.long	0xb92e00aa		# km $s2,$s2
	brc	1,.-4			# can this happen?
	lrvgr	$i1,$s0			# flip byte order
	lrvgr	$i2,$s1
	xg	$i1,0($i3)
	xg	$i2,8($i3)
	stg	$i1,0($i3)
	stg	$i2,8($i3)

.Lxts_enc_km_done:
	stg	$sp,$tweak+0($sp)	# wipe tweak
	stg	$sp,$tweak+8($sp)
	l${g}	$ra,5*$SIZE_T($sp)
	lm${g}	%r6,$s3,6*$SIZE_T($sp)
	br	$ra
.align	16
.Lxts_enc_software:
___
# Software XTS-encrypt path: tweak = E_{key2}(iv), then per block
# C = E_{key1}(P ^ T) ^ T with T doubled in GF(2^128) between blocks;
# ends with ciphertext stealing for a ragged tail.
# BUGFIX: the tweak-wiping code below used the misspelled (and undeclared)
# variable "$twesk", which interpolated as an empty string and emitted
# "stg %r15,+8(%r15)" — wiping the wrong stack slot and leaving the upper
# half of the tweak behind on the stack.  Corrected to "$tweak".
$code.=<<___;
	stm${g}	%r6,$ra,6*$SIZE_T($sp)

	slgr	$out,$inp

	l${g}	$s3,$stdframe($sp)	# ivp
	llgf	$s0,0($s3)		# load iv
	llgf	$s1,4($s3)
	llgf	$s2,8($s3)
	llgf	$s3,12($s3)
	stm${g}	%r2,%r5,2*$SIZE_T($sp)
	la	$key,0($key2)
	larl	$tbl,AES_Te
	bras	$ra,_s390x_AES_encrypt	# generate the tweak
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	stm	$s0,$s3,$tweak($sp)	# save the tweak
	j	.Lxts_enc_enter

.align	16
.Lxts_enc_loop:
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	algr	$s1,$s1
	alcgr	$s3,$s3
	xgr	$s1,%r1
	lrvgr	$s1,$s1			# flip byte order
	lrvgr	$s3,$s3
	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits
	stg	$s1,$tweak+0($sp)	# save the tweak
	llgfr	$s1,$s1
	srlg	$s2,$s3,32
	stg	$s3,$tweak+8($sp)
	llgfr	$s3,$s3
	la	$inp,16($inp)		# $inp+=16
.Lxts_enc_enter:
	x	$s0,0($inp)		# ^=*($inp)
	x	$s1,4($inp)
	x	$s2,8($inp)
	x	$s3,12($inp)
	stm${g}	%r2,%r3,2*$SIZE_T($sp)	# only two registers are changing
	la	$key,0($key1)
	bras	$ra,_s390x_AES_encrypt
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	x	$s0,$tweak+0($sp)	# ^=tweak
	x	$s1,$tweak+4($sp)
	x	$s2,$tweak+8($sp)
	x	$s3,$tweak+12($sp)
	st	$s0,0($out,$inp)
	st	$s1,4($out,$inp)
	st	$s2,8($out,$inp)
	st	$s3,12($out,$inp)
	brct${g}	$len,.Lxts_enc_loop

	llgc	$len,`2*$SIZE_T-1`($sp)
	nill	$len,0x0f		# $len%16
	jz	.Lxts_enc_done

	la	$i3,0($inp,$out)	# put aside real $out
.Lxts_enc_steal:
	llgc	%r0,16($inp)
	llgc	%r1,0($out,$inp)
	stc	%r0,0($out,$inp)
	stc	%r1,16($out,$inp)
	la	$inp,1($inp)
	brct	$len,.Lxts_enc_steal
	la	$out,0($i3)		# restore real $out

	# generate last tweak...
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	algr	$s1,$s1
	alcgr	$s3,$s3
	xgr	$s1,%r1
	lrvgr	$s1,$s1			# flip byte order
	lrvgr	$s3,$s3
	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits
	stg	$s1,$tweak+0($sp)	# save the tweak
	llgfr	$s1,$s1
	srlg	$s2,$s3,32
	stg	$s3,$tweak+8($sp)
	llgfr	$s3,$s3

	x	$s0,0($out)		# ^=*(inp)|stolen cipher-text
	x	$s1,4($out)
	x	$s2,8($out)
	x	$s3,12($out)
	st${g}	$out,4*$SIZE_T($sp)
	la	$key,0($key1)
	bras	$ra,_s390x_AES_encrypt
	l${g}	$out,4*$SIZE_T($sp)
	x	$s0,`$tweak+0`($sp)	# ^=tweak
	x	$s1,`$tweak+4`($sp)
	x	$s2,`$tweak+8`($sp)
	x	$s3,`$tweak+12`($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)

.Lxts_enc_done:
	stg	$sp,$tweak+0($sp)	# wipe tweak
	stg	$sp,$tweak+8($sp)
	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	br	$ra
.size	AES_xts_encrypt,.-AES_xts_encrypt
___
1949# void AES_xts_decrypt(const char *inp,char *out,size_t len,
1950#	const AES_KEY *key1, const AES_KEY *key2,
1951#	const unsigned char iv[16]);
1952#
# AES_xts_decrypt entry: same register flip as the encrypt side.
$code.=<<___;
.globl	AES_xts_decrypt
.type	AES_xts_decrypt,\@function
.align	16
AES_xts_decrypt:
	xgr	%r3,%r4			# flip %r3 and %r4, $out and $len
	xgr	%r4,%r3
	xgr	%r3,%r4
___
# 31-bit build: zero-extend $len.
$code.=<<___ if ($SIZE_T==4);
	llgfr	$len,$len
___
# Save $len, abort on <16 bytes; when the length is a multiple of 16 the
# reserved last block is given back (no ciphertext stealing needed).
$code.=<<___;
	st${g}	$len,1*$SIZE_T($sp)	# save copy of $len
	aghi	$len,-16
	bcr	4,$ra			# abort if less than zero. formally
					# wrong, because $len is unsigned,
					# but who can afford asking to
					# process more than 2^63-1 bytes?
	tmll	$len,0x0f
	jnz	.Lxts_dec_proceed
	aghi	$len,16
.Lxts_dec_proceed:
___
# Hardware XTS-decrypt path: bulk via _s390x_xts_km, then the reversed-order
# tail for ciphertext stealing — the second-to-last tweak is applied to the
# last full block and the final tweak to the stolen block.
$code.=<<___ if (!$softonly);
	llgf	%r0,240($key2)
	lhi	%r1,16
	clr	%r0,%r1
	jl	.Lxts_dec_software

	st${g}	$ra,5*$SIZE_T($sp)
	stm${g}	%r6,$s3,6*$SIZE_T($sp)

	nill	$len,0xfff0		# $len&=~15
	slgr	$out,$inp

	# generate the tweak value
	l${g}	$s3,$stdframe($sp)	# pointer to iv
	la	$s2,$tweak($sp)
	lmg	$s0,$s1,0($s3)
	lghi	$s3,16
	stmg	$s0,$s1,0($s2)
	la	%r1,0($key2)		# $key2 is not needed past this point
	.long	0xb92e00aa		# km $s2,$s2, generate the tweak
	brc	1,.-4			# can this happen?

	l	%r0,240($key1)
	la	%r1,0($key1)		# $key1 is not needed anymore

	ltgr	$len,$len
	jz	.Lxts_dec_km_short
	bras	$ra,_s390x_xts_km
	jz	.Lxts_dec_km_done

	lrvgr	$s2,$s0			# make copy in reverse byte order
	lrvgr	$s3,$s1
	j	.Lxts_dec_km_2ndtweak

.Lxts_dec_km_short:
	llgc	$len,`2*$SIZE_T-1`($sp)
	nill	$len,0x0f		# $len%=16
	lrvg	$s0,$tweak+0($sp)	# load the tweak
	lrvg	$s1,$tweak+8($sp)
	lrvgr	$s2,$s0			# make copy in reverse byte order
	lrvgr	$s3,$s1

.Lxts_dec_km_2ndtweak:
	lghi	$i1,0x87
	srag	$i2,$s1,63		# broadcast upper bit
	ngr	$i1,$i2			# rem
	algr	$s0,$s0
	alcgr	$s1,$s1
	xgr	$s0,$i1
	lrvgr	$i1,$s0			# flip byte order
	lrvgr	$i2,$s1

	xg	$i1,0($inp)
	xg	$i2,8($inp)
	stg	$i1,0($out,$inp)
	stg	$i2,8($out,$inp)
	la	$i2,0($out,$inp)
	lghi	$i3,16
	.long	0xb92e0066		# km $i2,$i2
	brc	1,.-4			# can this happen?
	lrvgr	$i1,$s0
	lrvgr	$i2,$s1
	xg	$i1,0($out,$inp)
	xg	$i2,8($out,$inp)
	stg	$i1,0($out,$inp)
	stg	$i2,8($out,$inp)

	la	$i3,0($out,$inp)	# put aside real $out
.Lxts_dec_km_steal:
	llgc	$i1,16($inp)
	llgc	$i2,0($out,$inp)
	stc	$i1,0($out,$inp)
	stc	$i2,16($out,$inp)
	la	$inp,1($inp)
	brct	$len,.Lxts_dec_km_steal

	lgr	$s0,$s2
	lgr	$s1,$s3
	xg	$s0,0($i3)
	xg	$s1,8($i3)
	stg	$s0,0($i3)
	stg	$s1,8($i3)
	la	$s0,0($i3)
	lghi	$s1,16
	.long	0xb92e0088		# km $s0,$s0
	brc	1,.-4			# can this happen?
	xg	$s2,0($i3)
	xg	$s3,8($i3)
	stg	$s2,0($i3)
	stg	$s3,8($i3)
.Lxts_dec_km_done:
	stg	$sp,$tweak+0($sp)	# wipe tweak
	stg	$sp,$tweak+8($sp)
	l${g}	$ra,5*$SIZE_T($sp)
	lm${g}	%r6,$s3,6*$SIZE_T($sp)
	br	$ra
.align	16
.Lxts_dec_software:
___
# Software XTS-decrypt path: P = D_{key1}(C ^ T) ^ T per block, with the
# stealing tail decrypting the last full block under the *second* tweak and
# the stolen block under the first.
# BUGFIX: the final tweak wipe used the misspelled, undeclared "$twesk",
# which interpolated to nothing and emitted "stg %r15,+8(%r15)" — clobbering
# offset 8 of the frame instead of erasing the tweak's upper half.
# Corrected to "$tweak" (mirrors the same fix in AES_xts_encrypt).
$code.=<<___;
	stm${g}	%r6,$ra,6*$SIZE_T($sp)

	srlg	$len,$len,4
	slgr	$out,$inp

	l${g}	$s3,$stdframe($sp)	# ivp
	llgf	$s0,0($s3)		# load iv
	llgf	$s1,4($s3)
	llgf	$s2,8($s3)
	llgf	$s3,12($s3)
	stm${g}	%r2,%r5,2*$SIZE_T($sp)
	la	$key,0($key2)
	larl	$tbl,AES_Te
	bras	$ra,_s390x_AES_encrypt	# generate the tweak
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	larl	$tbl,AES_Td
	lt${g}r	$len,$len
	stm	$s0,$s3,$tweak($sp)	# save the tweak
	jz	.Lxts_dec_short
	j	.Lxts_dec_enter

.align	16
.Lxts_dec_loop:
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	algr	$s1,$s1
	alcgr	$s3,$s3
	xgr	$s1,%r1
	lrvgr	$s1,$s1			# flip byte order
	lrvgr	$s3,$s3
	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits
	stg	$s1,$tweak+0($sp)	# save the tweak
	llgfr	$s1,$s1
	srlg	$s2,$s3,32
	stg	$s3,$tweak+8($sp)
	llgfr	$s3,$s3
.Lxts_dec_enter:
	x	$s0,0($inp)		# tweak^=*(inp)
	x	$s1,4($inp)
	x	$s2,8($inp)
	x	$s3,12($inp)
	stm${g}	%r2,%r3,2*$SIZE_T($sp)	# only two registers are changing
	la	$key,0($key1)
	bras	$ra,_s390x_AES_decrypt
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	x	$s0,$tweak+0($sp)	# ^=tweak
	x	$s1,$tweak+4($sp)
	x	$s2,$tweak+8($sp)
	x	$s3,$tweak+12($sp)
	st	$s0,0($out,$inp)
	st	$s1,4($out,$inp)
	st	$s2,8($out,$inp)
	st	$s3,12($out,$inp)
	la	$inp,16($inp)
	brct${g}	$len,.Lxts_dec_loop

	llgc	$len,`2*$SIZE_T-1`($sp)
	nill	$len,0x0f		# $len%16
	jz	.Lxts_dec_done

	# generate pair of tweaks...
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	algr	$s1,$s1
	alcgr	$s3,$s3
	xgr	$s1,%r1
	lrvgr	$i2,$s1			# flip byte order
	lrvgr	$i3,$s3
	stmg	$i2,$i3,$tweak($sp)	# save the 1st tweak
	j	.Lxts_dec_2ndtweak

.align	16
.Lxts_dec_short:
	llgc	$len,`2*$SIZE_T-1`($sp)
	nill	$len,0x0f		# $len%16
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
.Lxts_dec_2ndtweak:
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	algr	$s1,$s1
	alcgr	$s3,$s3
	xgr	$s1,%r1
	lrvgr	$s1,$s1			# flip byte order
	lrvgr	$s3,$s3
	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits
	stg	$s1,$tweak-16+0($sp)	# save the 2nd tweak
	llgfr	$s1,$s1
	srlg	$s2,$s3,32
	stg	$s3,$tweak-16+8($sp)
	llgfr	$s3,$s3

	x	$s0,0($inp)		# tweak_the_2nd^=*(inp)
	x	$s1,4($inp)
	x	$s2,8($inp)
	x	$s3,12($inp)
	stm${g}	%r2,%r3,2*$SIZE_T($sp)
	la	$key,0($key1)
	bras	$ra,_s390x_AES_decrypt
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	x	$s0,$tweak-16+0($sp)	# ^=tweak_the_2nd
	x	$s1,$tweak-16+4($sp)
	x	$s2,$tweak-16+8($sp)
	x	$s3,$tweak-16+12($sp)
	st	$s0,0($out,$inp)
	st	$s1,4($out,$inp)
	st	$s2,8($out,$inp)
	st	$s3,12($out,$inp)

	la	$i3,0($out,$inp)	# put aside real $out
.Lxts_dec_steal:
	llgc	%r0,16($inp)
	llgc	%r1,0($out,$inp)
	stc	%r0,0($out,$inp)
	stc	%r1,16($out,$inp)
	la	$inp,1($inp)
	brct	$len,.Lxts_dec_steal
	la	$out,0($i3)		# restore real $out

	lm	$s0,$s3,$tweak($sp)	# load the 1st tweak
	x	$s0,0($out)		# tweak^=*(inp)|stolen cipher-text
	x	$s1,4($out)
	x	$s2,8($out)
	x	$s3,12($out)
	st${g}	$out,4*$SIZE_T($sp)
	la	$key,0($key1)
	bras	$ra,_s390x_AES_decrypt
	l${g}	$out,4*$SIZE_T($sp)
	x	$s0,$tweak+0($sp)	# ^=tweak
	x	$s1,$tweak+4($sp)
	x	$s2,$tweak+8($sp)
	x	$s3,$tweak+12($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)
	stg	$sp,$tweak-16+0($sp)	# wipe 2nd tweak
	stg	$sp,$tweak-16+8($sp)
.Lxts_dec_done:
	stg	$sp,$tweak+0($sp)	# wipe tweak
	stg	$sp,$tweak+8($sp)
	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	br	$ra
.size	AES_xts_decrypt,.-AES_xts_decrypt
___
2229}
# Trailer: identification string and the capability vector common symbol.
$code.=<<___;
.string	"AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
.comm	OPENSSL_s390xcap_P,16,8
___

# Expand `...` constructs (compile-time arithmetic) and emit the assembly.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# Checking close() catches buffered-write failures (e.g. full disk when the
# output is redirected); a bare "close STDOUT" would silently discard them
# and the build could proceed with a truncated .s file.
close STDOUT or die "error closing STDOUT: $!";