#!/usr/bin/env perl

# Mode-specific implementations for SPARC Architecture 2011. There is
# a T4 dependency though, an ASI value that is not specified in the
# Architecture Manual. But as the SPARC universe is rather
# monocultural, we assume that a processor capable of executing the
# crypto instructions can handle the ASI in question as well. This
# means that we ought to keep our eyes open when new processors
# emerge...
#
# As for the above-mentioned ASI: it's the so-called "block
# initializing store", which cancels the "read" in "read-update-write"
# on cache lines. This is a "cooperative" optimization, in that it
# reduces overall pressure on the memory interface. The benefit can't
# be observed/quantified with the usual benchmarks; on the contrary,
# you may notice that single-thread performance for parallelizable
# modes is ~1.5% worse for the largest block sizes [though a few
# percent better for shorter ones]. All this is based on suggestions
# from David Miller.
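# This file is not a standalone program; it is require-d by the
# algorithm-specific T4 driver scripts. A minimal usage sketch (the
# driver shown here is illustrative, not part of this module):
#
#	require "sparcv9_modes.pl";
#
#	&asm_init(@ARGV);	# select ABI-dependent constants
#	$::evp = 1;		# emit EVP-flavour IV handling
#	&alg_cbc_encrypt_implement("aes", 128);
#	&alg_cbc_decrypt_implement("aes", 128);
#	&emit_assembler();	# post-process and print $::code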
sub asm_init {		# to be called with @ARGV as argument
    for (@_)		{ $::abibits=64 if (/\-m64/ || /\-xarch\=v9/); }
    if ($::abibits==64)	{ $::bias=2047; $::frame=192; $::size_t_cc="%xcc"; }
    else		{ $::bias=0;    $::frame=112; $::size_t_cc="%icc"; }
}

# unified interface
my ($inp,$out,$len,$key,$ivec)=map("%i$_",(0..5));
# local variables
my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init)=map("%l$_",(0..7));
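# The %i registers above mirror the C-level argument order of the
# generated entry points, roughly (the exact key type being the
# caller's business):
#
#	void ${alg}${bits}_t4_cbc_encrypt(const unsigned char *inp,
#			unsigned char *out, size_t len,
#			const void *key, unsigned char ivec[16]);
#
# alg_xts_implement below remaps them to (inp,out,len,key1,key2,ivec).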
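# Generate ${alg}${bits}_t4_cbc_encrypt. The block-initializing-store
# fast path is taken only when the output is 8-byte aligned, there are
# at least 128 bytes to process and $inp!=$out; everything else goes
# through the generic loop with its alignaddr/faligndata fixups and
# partial stores.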
sub alg_cbc_encrypt_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl	${alg}${bits}_t4_cbc_encrypt
.align	32
${alg}${bits}_t4_cbc_encrypt:
	save		%sp, -$::frame, %sp
	cmp		$len, 0
	be,pn		$::size_t_cc, .L${bits}_cbc_enc_abort
	srln		$len, 0, $len		! needed on v8+, "nop" on v9
	sub		$inp, $out, $blk_init	! $inp!=$out
___
$::code.=<<___ if (!$::evp);
	andcc		$ivec, 7, $ivoff
	alignaddr	$ivec, %g0, $ivec

	ldd		[$ivec + 0], %f0	! load ivec
	bz,pt		%icc, 1f
	ldd		[$ivec + 8], %f2
	ldd		[$ivec + 16], %f4
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
1:
___
$::code.=<<___ if ($::evp);
	ld		[$ivec + 0], %f0
	ld		[$ivec + 4], %f1
	ld		[$ivec + 8], %f2
	ld		[$ivec + 12], %f3
___
$::code.=<<___;
	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_enckey
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 127
	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!	$len<128 ||
	brnz,pn		$blk_init, .L${bits}cbc_enc_blk	!	$inp==$out)
	srl		$omask, $ooff, $omask

	alignaddrl	$out, %g0, $out
	srlx		$len, 4, $len
	prefetch	[$out], 22

.L${bits}_cbc_enc_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f12
	movxtod		%o1, %f14

	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_encrypt_1x
	add		$inp, 16, $inp

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_cbc_enc_loop
	add		$out, 16, $out
___
$::code.=<<___ if ($::evp);
	st		%f0, [$ivec + 0]
	st		%f1, [$ivec + 4]
	st		%f2, [$ivec + 8]
	st		%f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, 3f
	nop

	std		%f0, [$ivec + 0]	! write out ivec
	std		%f2, [$ivec + 8]
___
$::code.=<<___;
.L${bits}_cbc_enc_abort:
	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8

	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_cbc_enc_loop+4
	orn		%g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
	st		%f0, [$ivec + 0]
	st		%f1, [$ivec + 4]
	st		%f2, [$ivec + 8]
	st		%f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, 3f
	nop

	std		%f0, [$ivec + 0]	! write out ivec
	std		%f2, [$ivec + 8]
	ret
	restore

.align	16
3:	alignaddrl	$ivec, $ivoff, %g0	! handle unaligned ivec
	mov		0xff, $omask
	srl		$omask, $ivoff, $omask
	faligndata	%f0, %f0, %f4
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda		%f4, [$ivec + $omask]0xc0
	std		%f6, [$ivec + 8]
	add		$ivec, 16, $ivec
	orn		%g0, $omask, $omask
	stda		%f8, [$ivec + $omask]0xc0
___
$::code.=<<___;
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}cbc_enc_blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init

.L${bits}_cbc_enc_blk_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 5f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
5:
	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f12
	movxtod		%o1, %f14

	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_encrypt_1x
	add		$inp, 16, $inp
	sub		$len, 1, $len

	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	brnz,pt		$len, .L${bits}_cbc_enc_blk_loop
	add		$out, 8, $out

	membar		#StoreLoad|#StoreStore
	brnz,pt		$blk_init, .L${bits}_cbc_enc_loop
	mov		$blk_init, $len
___
$::code.=<<___ if ($::evp);
	st		%f0, [$ivec + 0]
	st		%f1, [$ivec + 4]
	st		%f2, [$ivec + 8]
	st		%f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, 3b
	nop

	std		%f0, [$ivec + 0]	! write out ivec
	std		%f2, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore
.type	${alg}${bits}_t4_cbc_encrypt,#function
.size	${alg}${bits}_t4_cbc_encrypt,.-${alg}${bits}_t4_cbc_encrypt
___
}

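# Generate ${alg}${bits}_t4_cbc_decrypt. Unlike encryption, CBC
# decryption carries no block-to-block dependency, so the main loop
# processes two blocks per iteration and the block-store threshold
# rises to 256 bytes.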
sub alg_cbc_decrypt_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl	${alg}${bits}_t4_cbc_decrypt
.align	32
${alg}${bits}_t4_cbc_decrypt:
	save		%sp, -$::frame, %sp
	cmp		$len, 0
	be,pn		$::size_t_cc, .L${bits}_cbc_dec_abort
	srln		$len, 0, $len		! needed on v8+, "nop" on v9
	sub		$inp, $out, $blk_init	! $inp!=$out
___
$::code.=<<___ if (!$::evp);
	andcc		$ivec, 7, $ivoff
	alignaddr	$ivec, %g0, $ivec

	ldd		[$ivec + 0], %f12	! load ivec
	bz,pt		%icc, 1f
	ldd		[$ivec + 8], %f14
	ldd		[$ivec + 16], %f0
	faligndata	%f12, %f14, %f12
	faligndata	%f14, %f0, %f14
1:
___
$::code.=<<___ if ($::evp);
	ld		[$ivec + 0], %f12	! load ivec
	ld		[$ivec + 4], %f13
	ld		[$ivec + 8], %f14
	ld		[$ivec + 12], %f15
___
$::code.=<<___;
	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_deckey
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 255
	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!	$len<256 ||
	brnz,pn		$blk_init, .L${bits}cbc_dec_blk	!	$inp==$out)
	srl		$omask, $ooff, $omask

	andcc		$len, 16, %g0		! is number of blocks even?
	srlx		$len, 4, $len
	alignaddrl	$out, %g0, $out
	bz		%icc, .L${bits}_cbc_dec_loop2x
	prefetch	[$out], 22
.L${bits}_cbc_dec_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	xor		%g4, %o0, %o2		! ^= rk[0]
	xor		%g5, %o1, %o3
	movxtod		%o2, %f0
	movxtod		%o3, %f2

	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_decrypt_1x
	add		$inp, 16, $inp

	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	movxtod		%o0, %f12
	movxtod		%o1, %f14

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_cbc_dec_loop2x
	add		$out, 16, $out
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
.L${bits}_cbc_dec_abort:
	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8

	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_cbc_dec_loop2x+4
	orn		%g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_cbc_dec_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 4f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
4:
	xor		%g4, %o0, %o4		! ^= rk[0]
	xor		%g5, %o1, %o5
	movxtod		%o4, %f0
	movxtod		%o5, %f2
	xor		%g4, %o2, %o4
	xor		%g5, %o3, %o5
	movxtod		%o4, %f4
	movxtod		%o5, %f6

	prefetch	[$out + 63], 22
	prefetch	[$inp + 32+63], 20
	call		_${alg}${bits}_decrypt_2x
	add		$inp, 32, $inp

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	movxtod		%o2, %f12
	movxtod		%o3, %f14
	fxor		%f8, %f4, %f4
	fxor		%f10, %f6, %f6

	brnz,pn		$ooff, 2f
	sub		$len, 2, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	std		%f4, [$out + 16]
	std		%f6, [$out + 24]
	brnz,pt		$len, .L${bits}_cbc_dec_loop2x
	add		$out, 32, $out
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f8		! handle unaligned output
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
	faligndata	%f4, %f6, %f4
	faligndata	%f6, %f6, %f6
	stda		%f8, [$out + $omask]0xc0	! partial store
	std		%f0, [$out + 8]
	std		%f2, [$out + 16]
	std		%f4, [$out + 24]
	add		$out, 32, $out
	orn		%g0, $omask, $omask
	stda		%f6, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_cbc_dec_loop2x+4
	orn		%g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
	ret
	restore

.align	16
.L${bits}_cbc_dec_unaligned_ivec:
	alignaddrl	$ivec, $ivoff, %g0	! handle unaligned ivec
	mov		0xff, $omask
	srl		$omask, $ivoff, $omask
	faligndata	%f12, %f12, %f0
	faligndata	%f12, %f14, %f2
	faligndata	%f14, %f14, %f4
	stda		%f0, [$ivec + $omask]0xc0
	std		%f2, [$ivec + 8]
	add		$ivec, 16, $ivec
	orn		%g0, $omask, $omask
	stda		%f4, [$ivec + $omask]0xc0
___
$::code.=<<___;
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}cbc_dec_blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init
	sub	$len, 1, $len
	add	$blk_init, 1, $blk_init

.L${bits}_cbc_dec_blk_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 5f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
5:
	xor		%g4, %o0, %o4		! ^= rk[0]
	xor		%g5, %o1, %o5
	movxtod		%o4, %f0
	movxtod		%o5, %f2
	xor		%g4, %o2, %o4
	xor		%g5, %o3, %o5
	movxtod		%o4, %f4
	movxtod		%o5, %f6

	prefetch	[$inp + 32+63], 20
	call		_${alg}${bits}_decrypt_2x
	add		$inp, 32, $inp
	subcc		$len, 2, $len

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	movxtod		%o2, %f12
	movxtod		%o3, %f14
	fxor		%f8, %f4, %f4
	fxor		%f10, %f6, %f6

	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		$::size_t_cc, .L${bits}_cbc_dec_blk_loop2x
	add		$out, 8, $out

	add		$blk_init, $len, $len
	andcc		$len, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L${bits}_cbc_dec_loop
	srl		$len, 0, $len
	brnz,pn		$len, .L${bits}_cbc_dec_loop2x
	nop
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]	! write out ivec
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore
.type	${alg}${bits}_t4_cbc_decrypt,#function
.size	${alg}${bits}_t4_cbc_decrypt,.-${alg}${bits}_t4_cbc_decrypt
___
}

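# Generate ${alg}${bits}_t4_ctr32_encrypt. The counter is the 32-bit
# big-endian word at ivec[12] and wraps modulo 2^32 ("clruw" below).
# The first cipher round is issued inline with the freshly incremented
# counter, which is why the calls land at _${alg}${bits}_encrypt_1x+8
# and _${alg}${bits}_encrypt_2x+16.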
sub alg_ctr32_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl	${alg}${bits}_t4_ctr32_encrypt
.align	32
${alg}${bits}_t4_ctr32_encrypt:
	save		%sp, -$::frame, %sp
	srln		$len, 0, $len		! needed on v8+, "nop" on v9

	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_enckey
	sllx		$len, 4, $len

	ld		[$ivec + 0], %l4	! counter
	ld		[$ivec + 4], %l5
	ld		[$ivec + 8], %l6
	ld		[$ivec + 12], %l7

	sllx		%l4, 32, %o5
	or		%l5, %o5, %o5
	sllx		%l6, 32, %g1
	xor		%o5, %g4, %g4		! ^= rk[0]
	xor		%g1, %g5, %g5
	movxtod		%g4, %f14		! most significant 64 bits

	sub		$inp, $out, $blk_init	! $inp!=$out
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 255
	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!	$len<256 ||
	brnz,pn		$blk_init, .L${bits}_ctr32_blk	!	$inp==$out)
	srl		$omask, $ooff, $omask

	andcc		$len, 16, %g0		! is number of blocks even?
	alignaddrl	$out, %g0, $out
	bz		%icc, .L${bits}_ctr32_loop2x
	srlx		$len, 4, $len
.L${bits}_ctr32_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	movxtod		%g1, %f2
	srl		%l7, 0, %l7		! clruw
	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
___
$::code.=<<___ if ($alg eq "aes");
	aes_eround01	%f16, %f14, %f2, %f4
	aes_eround23	%f18, %f14, %f2, %f2
___
$::code.=<<___ if ($alg eq "cmll");
	camellia_f	%f16, %f2, %f14, %f2
	camellia_f	%f18, %f14, %f2, %f0
___
$::code.=<<___;
	call		_${alg}${bits}_encrypt_1x+8
	add		$inp, 16, $inp

	movxtod		%o0, %f10
	movxtod		%o1, %f12
	fxor		%f10, %f0, %f0		! ^= inp
	fxor		%f12, %f2, %f2

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_ctr32_loop2x
	add		$out, 16, $out

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_ctr32_loop2x+4
	orn		%g0, $omask, $omask

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_ctr32_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 4f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
4:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	movxtod		%g1, %f2
	srl		%l7, 0, %l7		! clruw
	xor		%g5, %l7, %g1
	add		%l7, 1, %l7
	movxtod		%g1, %f6
	srl		%l7, 0, %l7		! clruw
	prefetch	[$out + 63], 22
	prefetch	[$inp + 32+63], 20
___
$::code.=<<___ if ($alg eq "aes");
	aes_eround01	%f16, %f14, %f2, %f8
	aes_eround23	%f18, %f14, %f2, %f2
	aes_eround01	%f16, %f14, %f6, %f10
	aes_eround23	%f18, %f14, %f6, %f6
___
$::code.=<<___ if ($alg eq "cmll");
	camellia_f	%f16, %f2, %f14, %f2
	camellia_f	%f16, %f6, %f14, %f6
	camellia_f	%f18, %f14, %f2, %f0
	camellia_f	%f18, %f14, %f6, %f4
___
$::code.=<<___;
	call		_${alg}${bits}_encrypt_2x+16
	add		$inp, 32, $inp

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	movxtod		%o2, %f12
	fxor		%f8, %f0, %f0		! ^= inp
	movxtod		%o3, %f8
	fxor		%f10, %f2, %f2
	fxor		%f12, %f4, %f4
	fxor		%f8, %f6, %f6

	brnz,pn		$ooff, 2f
	sub		$len, 2, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	std		%f4, [$out + 16]
	std		%f6, [$out + 24]
	brnz,pt		$len, .L${bits}_ctr32_loop2x
	add		$out, 32, $out

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f8		! handle unaligned output
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
	faligndata	%f4, %f6, %f4
	faligndata	%f6, %f6, %f6

	stda		%f8, [$out + $omask]0xc0	! partial store
	std		%f0, [$out + 8]
	std		%f2, [$out + 16]
	std		%f4, [$out + 24]
	add		$out, 32, $out
	orn		%g0, $omask, $omask
	stda		%f6, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_ctr32_loop2x+4
	orn		%g0, $omask, $omask

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_ctr32_blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init
	sub	$len, 1, $len
	add	$blk_init, 1, $blk_init

.L${bits}_ctr32_blk_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 5f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
5:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	movxtod		%g1, %f2
	srl		%l7, 0, %l7		! clruw
	xor		%g5, %l7, %g1
	add		%l7, 1, %l7
	movxtod		%g1, %f6
	srl		%l7, 0, %l7		! clruw
	prefetch	[$inp + 32+63], 20
___
$::code.=<<___ if ($alg eq "aes");
	aes_eround01	%f16, %f14, %f2, %f8
	aes_eround23	%f18, %f14, %f2, %f2
	aes_eround01	%f16, %f14, %f6, %f10
	aes_eround23	%f18, %f14, %f6, %f6
___
$::code.=<<___ if ($alg eq "cmll");
	camellia_f	%f16, %f2, %f14, %f2
	camellia_f	%f16, %f6, %f14, %f6
	camellia_f	%f18, %f14, %f2, %f0
	camellia_f	%f18, %f14, %f6, %f4
___
$::code.=<<___;
	call		_${alg}${bits}_encrypt_2x+16
	add		$inp, 32, $inp
	subcc		$len, 2, $len

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	movxtod		%o2, %f12
	fxor		%f8, %f0, %f0		! ^= inp
	movxtod		%o3, %f8
	fxor		%f10, %f2, %f2
	fxor		%f12, %f4, %f4
	fxor		%f8, %f6, %f6

	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		$::size_t_cc, .L${bits}_ctr32_blk_loop2x
	add		$out, 8, $out

	add		$blk_init, $len, $len
	andcc		$len, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L${bits}_ctr32_loop
	srl		$len, 0, $len
	brnz,pn		$len, .L${bits}_ctr32_loop2x
	nop

	ret
	restore
.type	${alg}${bits}_t4_ctr32_encrypt,#function
.size	${alg}${bits}_t4_ctr32_encrypt,.-${alg}${bits}_t4_ctr32_encrypt
___
}

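# Generate ${alg}${bits}_t4_xts_${dir}crypt. The initial tweak is
# produced by encrypting ivec with key2; it is then multiplied by x in
# GF(2^128) for every block. Partial trailing blocks are handled by
# ciphertext stealing at .L${bits}_xts_${dir}steal.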
sub alg_xts_implement {
my ($alg,$bits,$dir) = @_;
my ($inp,$out,$len,$key1,$key2,$ivec)=map("%i$_",(0..5));
my $rem=$ivec;

$::code.=<<___;
.globl	${alg}${bits}_t4_xts_${dir}crypt
.align	32
${alg}${bits}_t4_xts_${dir}crypt:
	save		%sp, -$::frame-16, %sp
	srln		$len, 0, $len		! needed on v8+, "nop" on v9

	mov		$ivec, %o0
	add		%fp, $::bias-16, %o1
	call		${alg}_t4_encrypt
	mov		$key2, %o2

	add		%fp, $::bias-16, %l7
	ldxa		[%l7]0x88, %g2
	add		%fp, $::bias-8, %l7
	ldxa		[%l7]0x88, %g3		! %g3:%g2 is tweak

	sethi		%hi(0x76543210), %l7
	or		%l7, %lo(0x76543210), %l7
	bmask		%l7, %g0, %g0		! byte swap mask

	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_${dir}ckey
	and		$len, 15,  $rem
	and		$len, -16, $len
___
$::code.=<<___ if ($dir eq "de");
	mov		0, %l7
	movrnz		$rem, 16,  %l7
	sub		$len, %l7, $len
___
$::code.=<<___;

	sub		$inp, $out, $blk_init	! $inp!=$out
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 255
	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!	$len<256 ||
	brnz,pn		$blk_init, .L${bits}_xts_${dir}blk !	$inp==$out)
	srl		$omask, $ooff, $omask

	andcc		$len, 16, %g0		! is number of blocks even?
___
$::code.=<<___ if ($dir eq "de");
	brz,pn		$len, .L${bits}_xts_${dir}steal
___
$::code.=<<___;
	alignaddrl	$out, %g0, $out
	bz		%icc, .L${bits}_xts_${dir}loop2x
	srlx		$len, 4, $len
.L${bits}_xts_${dir}loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	movxtod		%g2, %f12
	movxtod		%g3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f0
	movxtod		%o1, %f2

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2

	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_${dir}crypt_1x
	add		$inp, 16, $inp

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2
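						! i.e. multiply the
						! 128-bit tweak by x in
						! GF(2^128) modulo
						! x^128+x^7+x^2+x+1,
						! hence the 0x87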

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_xts_${dir}loop2x
	add		$out, 16, $out

	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_xts_${dir}loop2x+4
	orn		%g0, $omask, $omask

	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_xts_${dir}loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 4f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
4:
	movxtod		%g2, %f12
	movxtod		%g3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	movxtod		%g2, %f8
	movxtod		%g3, %f10
	bshuffle	%f8,  %f8,  %f8
	bshuffle	%f10, %f10, %f10

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	xor		%g4, %o2, %o2		! ^= rk[0]
	xor		%g5, %o3, %o3
	movxtod		%o0, %f0
	movxtod		%o1, %f2
	movxtod		%o2, %f4
	movxtod		%o3, %f6

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2
	fxor		%f8,  %f4, %f4		! ^= tweak[0]
	fxor		%f10, %f6, %f6

	prefetch	[$out + 63], 22
	prefetch	[$inp + 32+63], 20
	call		_${alg}${bits}_${dir}crypt_2x
	add		$inp, 32, $inp

	movxtod		%g2, %f8
	movxtod		%g3, %f10

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	bshuffle	%f8,  %f8,  %f8
	bshuffle	%f10, %f10, %f10

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2
	fxor		%f8,  %f4, %f4
	fxor		%f10, %f6, %f6

	brnz,pn		$ooff, 2f
	sub		$len, 2, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	std		%f4, [$out + 16]
	std		%f6, [$out + 24]
	brnz,pt		$len, .L${bits}_xts_${dir}loop2x
	add		$out, 32, $out

	fsrc2		%f4, %f0
	fsrc2		%f6, %f2
	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f8		! handle unaligned output
	faligndata	%f0, %f2, %f10
	faligndata	%f2, %f4, %f12
	faligndata	%f4, %f6, %f14
	faligndata	%f6, %f6, %f0

	stda		%f8, [$out + $omask]0xc0	! partial store
	std		%f10, [$out + 8]
	std		%f12, [$out + 16]
	std		%f14, [$out + 24]
	add		$out, 32, $out
	orn		%g0, $omask, $omask
	stda		%f0, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_xts_${dir}loop2x+4
	orn		%g0, $omask, $omask

	fsrc2		%f4, %f0
	fsrc2		%f6, %f2
	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_xts_${dir}blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init
	sub	$len, 1, $len
	add	$blk_init, 1, $blk_init

.L${bits}_xts_${dir}blk2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 5f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
5:
	movxtod		%g2, %f12
	movxtod		%g3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	movxtod		%g2, %f8
	movxtod		%g3, %f10
	bshuffle	%f8,  %f8,  %f8
	bshuffle	%f10, %f10, %f10

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	xor		%g4, %o2, %o2		! ^= rk[0]
	xor		%g5, %o3, %o3
	movxtod		%o0, %f0
	movxtod		%o1, %f2
	movxtod		%o2, %f4
	movxtod		%o3, %f6

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2
	fxor		%f8,  %f4, %f4		! ^= tweak[0]
	fxor		%f10, %f6, %f6

	prefetch	[$inp + 32+63], 20
	call		_${alg}${bits}_${dir}crypt_2x
	add		$inp, 32, $inp

	movxtod		%g2, %f8
	movxtod		%g3, %f10

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	bshuffle	%f8,  %f8,  %f8
	bshuffle	%f10, %f10, %f10

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2
	fxor		%f8,  %f4, %f4
	fxor		%f10, %f6, %f6

	subcc		$len, 2, $len
	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		$::size_t_cc, .L${bits}_xts_${dir}blk2x
	add		$out, 8, $out

	add		$blk_init, $len, $len
	andcc		$len, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L${bits}_xts_${dir}loop
	srl		$len, 0, $len
	brnz,pn		$len, .L${bits}_xts_${dir}loop2x
	nop

	fsrc2		%f4, %f0
	fsrc2		%f6, %f2
	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
___
$::code.=<<___ if ($dir eq "en");
.align	32
.L${bits}_xts_${dir}steal:
	std		%f0, [%fp + $::bias-16]	! copy of output
	std		%f2, [%fp + $::bias-8]

	srl		$ileft, 3, $ileft
	add		%fp, $::bias-16, %l7
	add		$inp, $ileft, $inp	! original $inp+$len&-15
	add		$out, $ooff, $out	! original $out+$len&-15
	mov		0, $ileft
	nop					! align

.L${bits}_xts_${dir}stealing:
	ldub		[$inp + $ileft], %o0
	ldub		[%l7  + $ileft], %o1
	dec		$rem
	stb		%o0, [%l7  + $ileft]
	stb		%o1, [$out + $ileft]
	brnz		$rem, .L${bits}_xts_${dir}stealing
	inc		$ileft

	mov		%l7, $inp
	sub		$out, 16, $out
	mov		0, $ileft
	sub		$out, $ooff, $out
	ba		.L${bits}_xts_${dir}loop	! one more time
	mov		1, $len				! $rem is 0
___
$::code.=<<___ if ($dir eq "de");
.align	32
.L${bits}_xts_${dir}steal:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 8f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
8:
	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %o2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %o3
	xor		%l7, %o2, %o2

	movxtod		%o2, %f12
	movxtod		%o3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f0
	movxtod		%o1, %f2

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2

	call		_${alg}${bits}_${dir}crypt_1x
	add		$inp, 16, $inp

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2

	std		%f0, [%fp + $::bias-16]
	std		%f2, [%fp + $::bias-8]

	srl		$ileft, 3, $ileft
	add		%fp, $::bias-16, %l7
	add		$inp, $ileft, $inp	! original $inp+$len&-15
	add		$out, $ooff, $out	! original $out+$len&-15
	mov		0, $ileft
	add		$out, 16, $out
	nop					! align

.L${bits}_xts_${dir}stealing:
	ldub		[$inp + $ileft], %o0
	ldub		[%l7  + $ileft], %o1
	dec		$rem
	stb		%o0, [%l7  + $ileft]
	stb		%o1, [$out + $ileft]
	brnz		$rem, .L${bits}_xts_${dir}stealing
	inc		$ileft

	mov		%l7, $inp
	sub		$out, 16, $out
	mov		0, $ileft
	sub		$out, $ooff, $out
	ba		.L${bits}_xts_${dir}loop	! one more time
	mov		1, $len				! $rem is 0
___
$::code.=<<___;
	ret
	restore
.type	${alg}${bits}_t4_xts_${dir}crypt,#function
.size	${alg}${bits}_t4_xts_${dir}crypt,.-${alg}${bits}_t4_xts_${dir}crypt
___
}

# The purpose of these subroutines is to explicitly encode VIS
# instructions, so that one can compile the module without having to
# specify VIS extensions on the compiler command line, e.g. -xarch=v9
# vs. -xarch=v9a. The idea is to preserve the option of producing a
# "universal" binary and letting the programmer detect at run-time
# whether the current CPU is VIS-capable.
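#
# For instance, "faligndata %f0,%f2,%f4" (opf 0x048, rs1=0, rs2=2,
# rd=4) is emitted by unvis below as
#
#	.word	0x89b00902 !faligndata	%f0,%f2,%f4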
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %visopf = (	"faligndata"	=> 0x048,
		"bshuffle"	=> 0x04c,
		"fnot2"		=> 0x066,
		"fxor"		=> 0x06c,
		"fsrc2"		=> 0x078	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
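		# (bit 5 of the register number moves to bit 0, which
		# is how SPARC V9 encodes %f32..%f62)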
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = (	"addxc"		=> 0x011,
		"addxccc"	=> 0x013,
		"umulxhi"	=> 0x016,
		"alignaddr"	=> 0x018,
		"bmask"		=> 0x019,
		"alignaddrl"	=> 0x01a	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%([goli])([0-9])/);
	    $_=$bias{$1}+$2;
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub unaes_round {	# 4-argument instructions
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my ($ref,$opf);
my %aesopf = (	"aes_eround01"	=> 0,
		"aes_eround23"	=> 1,
		"aes_dround01"	=> 2,
		"aes_dround23"	=> 3,
		"aes_eround01_l"=> 4,
		"aes_eround23_l"=> 5,
		"aes_dround01_l"=> 6,
		"aes_dround23_l"=> 7,
		"aes_kexpand1"	=> 8	);

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
	$rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub unaes_kexpand {	# 3-argument instructions
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %aesopf = (	"aes_kexpand0"	=> 0x130,
		"aes_kexpand2"	=> 0x131	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub uncamellia_f {	# 4-argument instructions
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my ($ref,$opf);

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    if (1) {
	$rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|0xc<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub uncamellia3 {	# 3-argument instructions
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %cmllopf = (	"camellia_fl"	=> 0x13c,
		"camellia_fli"	=> 0x13d	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if (defined($opf=$cmllopf{$mnemonic})) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub unmovxtox {		# 2-argument instructions
my ($mnemonic,$rs,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24, "f" => 0 );
my ($ref,$opf);
my %movxopf = (	"movdtox"	=> 0x110,
		"movstouw"	=> 0x111,
		"movstosw"	=> 0x113,
		"movxtod"	=> 0x118,
		"movwtos"	=> 0x119	);

    $ref = "$mnemonic\t$rs,$rd";

    if (defined($opf=$movxopf{$mnemonic})) {
	foreach ($rs,$rd) {
	    return $ref if (!/%([fgoli])([0-9]{1,2})/);
	    $_=$bias{$1}+$2;
	    if ($2>=32) {
		return $ref if ($2&1);
		# re-encode for upper double register addressing
		$_=($2|$2>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x36<<19|$opf<<5|$rs,
			$ref;
    } else {
	return $ref;
    }
}

sub undes {
my ($mnemonic)=shift;
my @args=@_;
my ($ref,$opf);
my %desopf = (	"des_round"	=> 0b1001,
		"des_ip"	=> 0b100110100,
		"des_iip"	=> 0b100110101,
		"des_kexpand"	=> 0b100110110	);

    $ref = "$mnemonic\t".join(",",@_);

    if (defined($opf=$desopf{$mnemonic})) {	# 4-arg
	if ($mnemonic eq "des_round") {
	    foreach (@args[0..3]) {
		return $ref if (!/%f([0-9]{1,2})/);
		$_=$1;
		if ($1>=32) {
		    return $ref if ($1&1);
		    # re-encode for upper double register addressing
		    $_=($1|$1>>5)&31;
		}
	    }
	    return  sprintf ".word\t0x%08x !%s",
			    2<<30|0b011001<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<9|$args[3]<<25,
			    $ref;
	} elsif ($mnemonic eq "des_kexpand") {	# 3-arg
	    foreach (@args[0..2]) {
		return $ref if (!/(%f)?([0-9]{1,2})/);
		$_=$2;
		if ($2>=32) {
		    return $ref if ($2&1);
		    # re-encode for upper double register addressing
		    $_=($2|$2>>5)&31;
		}
	    }
	    return  sprintf ".word\t0x%08x !%s",
			    2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<25,
			    $ref;
	} else {				# 2-arg
	    foreach (@args[0..1]) {
		return $ref if (!/%f([0-9]{1,2})/);
		$_=$1;
		if ($1>=32) {
		    return $ref if ($1&1);
		    # re-encode for upper double register addressing
		    $_=($1|$1>>5)&31;
		}
	    }
	    return  sprintf ".word\t0x%08x !%s",
			    2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]<<25,
			    $ref;
	}
    } else {
	return $ref;
    }
}

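# emit_assembler post-processes $::code: it expands `...` constructs,
# rewrites two-operand f*2* moves into the three-operand form, and
# replaces the crypto and VIS mnemonics with explicit .word encodings
# via the subroutines above, so that the output assembles even with a
# toolchain that knows nothing about these extensions.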
sub emit_assembler {
    foreach (split("\n",$::code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/go;

	s/\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
		&unaes_round($1,$2,$3,$4,$5)
	 /geo or
	s/\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unaes_kexpand($1,$2,$3,$4)
	 /geo or
	s/\b(camellia_f)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
		&uncamellia_f($1,$2,$3,$4,$5)
	 /geo or
	s/\b(camellia_[^s]+)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&uncamellia3($1,$2,$3,$4)
	 /geo or
	s/\b(des_\w+)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+)(?:,\s*(%f[0-9]{1,2})(?:,\s*(%f[0-9]{1,2}))?)?/
		&undes($1,$2,$3,$4,$5)
	 /geo or
	s/\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])/
		&unmovxtox($1,$2,$3)
	 /geo or
	s/\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})/
		&unmovxtox($1,$2,$3)
	 /geo or
	s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unvis($1,$2,$3,$4)
	 /geo or
	s/\b(umulxhi|bmask|addxc[c]{0,2}|alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unvis3($1,$2,$3,$4)
	 /geo;

	print $_,"\n";
    }
}

1;