1#! /usr/bin/env perl
2# Copyright 2011-2023 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# September 2011
18#
# Assembler helpers for Padlock engine. See also e_padlock-x86.pl for
# details.
21
# Command line: [flavour] [output-file].
# The trailing argument is taken as the output file when it ends in an
# extension; the leading argument is taken as the flavour when it contains
# no dot.  Either may be absent, in which case the variable stays undef.
$output  = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop   : undef;
$flavour = (@ARGV && $ARGV[0]  !~ m|\.|)     ? shift : undef;

# The Win64 register order is selected by a flavour naming nasm, masm or
# mingw64, or by an output file called *.asm.
$win64 = ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/) ? 1 : 0;

# Look for the perlasm translator next to this script first, then under
# ../../crypto/perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = $1;
$xlate = "${dir}x86_64-xlate.pl";
$xlate = "${dir}../../crypto/perlasm/x86_64-xlate.pl" unless -f $xlate;
-f $xlate or die "can't locate x86_64-xlate.pl";

# From here on everything written to STDOUT is piped through the translator.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
     or die "can't call $xlate: $!";
*STDOUT=*OUT;
37
# Generated assembly accumulates in $code and is printed at the very end.
$code = ".text\n";

# Per-mode prefetch distance in bytes (prefetch errata).  generate_mode()
# emits page-boundary checks only for the modes listed here.
%PADLOCK_PREFETCH = (
	ecb   => 128,
	cbc   => 64,
	ctr32 => 32,
);

# Upper bound on the bytes processed per iteration of the unaligned loop.
# Must be a power of 2 between 32 and 2^20.
$PADLOCK_CHUNK = 512;

# Registers used by the generated padlock_*_encrypt routines.
($out, $inp, $ctx, $len, $chunk) = ("%rdi", "%rsi", "%rdx", "%rcx", "%rbx");

# Registers carrying the first four integer arguments, as referenced by the
# abi-omnipotent routines through $arg1..$arg4.
if ($win64) {
	($arg1, $arg2, $arg3, $arg4) = ("%rcx", "%rdx", "%r8",  "%r9");	# Win64 order
} else {
	($arg1, $arg2, $arg3, $arg4) = ("%rdi", "%rsi", "%rdx", "%rcx");	# Unix order
}
51
# Fixed (mode-independent) helper routines.  The heredoc text is appended to
# $code as is; back-quoted spans are replaced by the result of eval() just
# before $code is printed.
#
# padlock_capability
#	Runs CPUID leaf 0 and compares EBX/EDX/ECX against the vendor strings
#	"CentaurHauls" and "  Shanghai  ".  On a match it queries leaf
#	0xC0000000 and, if that reports at least 0xC0000001, returns EDX of
#	leaf 0xC0000001 with bit 4 forced on.  In every other case it
#	returns 0.  %rbx is parked in %r8 around the CPUID calls.
#
# padlock_key_bswap
#	Loads the 32-bit value at byte offset 240 of its first argument
#	(presumably the AES round count -- confirm against the C caller) and
#	byte-swaps (value+1)*4 consecutive 32-bit words in place, starting
#	at the address in the first argument.
#
# padlock_verify_context
#	Copies its first argument to %rdx, pushes the flags register and
#	calls _padlock_verify_ctx with %rax pointing at
#	.Lpadlock_saved_context; the pushed flags are discarded afterwards.
#
# _padlock_verify_ctx (not exported)
#	Expects the context in %rdx, the address of the saved-context slot
#	in %rax and the caller's pushed flags directly above the return
#	address.  If bit 30 of those flags is set and the slot holds a
#	different context it executes pushf/popf (the same sequence as
#	padlock_reload_key); in all cases it stores %rdx into the slot.
#
# padlock_reload_key
#	Executes pushf/popf and returns.
#
# padlock_aes_block
#	Sets the count register to 1, points %rbx at ctx+32 (key) and %rdx
#	at ctx+16 (control word) and issues "rep xcryptecb".  %rbx is saved
#	in %r8 and restored.
#
# padlock_xstore
#	Copies %esi to %edx and issues "xstore".
#
# padlock_sha1_oneshot, padlock_sha1_blocks,
# padlock_sha256_oneshot, padlock_sha256_blocks, padlock_sha512_blocks
#	Move the third argument into %rcx, copy the state addressed by the
#	first argument (20, 32 and 64 bytes respectively) into a scratch
#	area on the stack, point %rdi at that copy, issue the matching
#	"rep xsha*" opcode and copy the state back.  The *_oneshot variants
#	clear %rax first, the sha1/sha256 *_blocks variants load it with -1,
#	and padlock_sha512_blocks does not write %rax.
$code.=<<___;
.globl	padlock_capability
.type	padlock_capability,\@abi-omnipotent
.align	16
padlock_capability:
	mov	%rbx,%r8
	xor	%eax,%eax
	cpuid
	xor	%eax,%eax
	cmp	\$`"0x".unpack("H*",'tneC')`,%ebx
	jne	.Lzhaoxin
	cmp	\$`"0x".unpack("H*",'Hrua')`,%edx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'slua')`,%ecx
	jne	.Lnoluck
	jmp	.LzhaoxinEnd
.Lzhaoxin:
	cmp	\$`"0x".unpack("H*",'hS  ')`,%ebx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'hgna')`,%edx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'  ia')`,%ecx
	jne	.Lnoluck
.LzhaoxinEnd:
	mov	\$0xC0000000,%eax
	cpuid
	mov	%eax,%edx
	xor	%eax,%eax
	cmp	\$0xC0000001,%edx
	jb	.Lnoluck
	mov	\$0xC0000001,%eax
	cpuid
	mov	%edx,%eax
	and	\$0xffffffef,%eax
	or	\$0x10,%eax		# set Nano bit#4
.Lnoluck:
	mov	%r8,%rbx
	ret
.size	padlock_capability,.-padlock_capability

.globl	padlock_key_bswap
.type	padlock_key_bswap,\@abi-omnipotent,0
.align	16
padlock_key_bswap:
	mov	240($arg1),%edx
	inc	%edx
	shl	\$2,%edx
.Lbswap_loop:
	mov	($arg1),%eax
	bswap	%eax
	mov	%eax,($arg1)
	lea	4($arg1),$arg1
	sub	\$1,%edx
	jnz	.Lbswap_loop
	ret
.size	padlock_key_bswap,.-padlock_key_bswap

.globl	padlock_verify_context
.type	padlock_verify_context,\@abi-omnipotent
.align	16
padlock_verify_context:
	mov	$arg1,$ctx
	pushf
	lea	.Lpadlock_saved_context(%rip),%rax
	call	_padlock_verify_ctx
	lea	8(%rsp),%rsp
	ret
.size	padlock_verify_context,.-padlock_verify_context

.type	_padlock_verify_ctx,\@abi-omnipotent
.align	16
_padlock_verify_ctx:
	mov	8(%rsp),%r8
	bt	\$30,%r8
	jnc	.Lverified
	cmp	(%rax),$ctx
	je	.Lverified
	pushf
	popf
.Lverified:
	mov	$ctx,(%rax)
	ret
.size	_padlock_verify_ctx,.-_padlock_verify_ctx

.globl	padlock_reload_key
.type	padlock_reload_key,\@abi-omnipotent
.align	16
padlock_reload_key:
	pushf
	popf
	ret
.size	padlock_reload_key,.-padlock_reload_key

.globl	padlock_aes_block
.type	padlock_aes_block,\@function,3
.align	16
padlock_aes_block:
	mov	%rbx,%r8
	mov	\$1,$len
	lea	32($ctx),%rbx		# key
	lea	16($ctx),$ctx		# control word
	.byte	0xf3,0x0f,0xa7,0xc8	# rep xcryptecb
	mov	%r8,%rbx
	ret
.size	padlock_aes_block,.-padlock_aes_block

.globl	padlock_xstore
.type	padlock_xstore,\@function,2
.align	16
padlock_xstore:
	mov	%esi,%edx
	.byte	0x0f,0xa7,0xc0		# xstore
	ret
.size	padlock_xstore,.-padlock_xstore

.globl	padlock_sha1_oneshot
.type	padlock_sha1_oneshot,\@function,3
.align	16
padlock_sha1_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	xor	%rax,%rax
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_oneshot,.-padlock_sha1_oneshot

.globl	padlock_sha1_blocks
.type	padlock_sha1_blocks,\@function,3
.align	16
padlock_sha1_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	mov	\$-1,%rax
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_blocks,.-padlock_sha1_blocks

.globl	padlock_sha256_oneshot
.type	padlock_sha256_oneshot,\@function,3
.align	16
padlock_sha256_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	xor	%rax,%rax
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_oneshot,.-padlock_sha256_oneshot

.globl	padlock_sha256_blocks
.type	padlock_sha256_blocks,\@function,3
.align	16
padlock_sha256_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	mov	\$-1,%rax
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_blocks,.-padlock_sha256_blocks

.globl	padlock_sha512_blocks
.type	padlock_sha512_blocks,\@function,3
.align	16
padlock_sha512_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movups	32(%rdi),%xmm2
	movups	48(%rdi),%xmm3
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	movaps	%xmm2,32(%rsp)
	movaps	%xmm3,48(%rsp)
	.byte	0xf3,0x0f,0xa6,0xe0	# rep xha512
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	movaps	32(%rsp),%xmm2
	movaps	48(%rsp),%xmm3
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	movups	%xmm2,32(%rdx)
	movups	%xmm3,48(%rdx)
	ret
.size	padlock_sha512_blocks,.-padlock_sha512_blocks
___
284
# generate_mode($mode, $opcode)
#
# Appends the assembly for padlock_${mode}_encrypt to $code.
#   $mode   - name fragment used in the exported symbol and in every local
#             label.  It also selects the mode-specific pieces below: the
#             "ctr32" counter handling, the page-boundary checks for modes
#             present in %PADLOCK_PREFETCH, and the IV copy-back for modes
#             that do not match /ecb|ctr/.
#   $opcode - last byte of the "rep xcrypt*" instruction encoding.
# The only effect is growing $code; the return value is not meaningful.
#
# Outline of the emitted routine:
#   * It returns 0 when ctx is not 16-byte aligned or len is not a multiple
#     of 16, and 1 otherwise.
#   * It calls _padlock_verify_ctx, then advances $ctx by 16 to the control
#     word.
#   * If bit 5 of the control word is set, or both out and inp are 16-byte
#     aligned, it takes the .L${mode}_aligned path and works directly on
#     the caller's buffers.
#   * Otherwise it takes the .L${mode}_loop path and works in chunks of at
#     most $PADLOCK_CHUNK bytes.  Misaligned output goes through a stack
#     buffer, which is zeroed before returning.  Misaligned input is first
#     copied to the destination (that stack buffer, or the output buffer
#     itself when the output is aligned) and processed there.
sub generate_mode {
my ($mode,$opcode) = @_;
# int padlock_$mode_encrypt(void *out, const void *inp,
#		struct padlock_cipher_data *ctx, size_t len);
$code.=<<___;
.globl	padlock_${mode}_encrypt
.type	padlock_${mode}_encrypt,\@function,4
.align	16
padlock_${mode}_encrypt:
	push	%rbp
	push	%rbx

	xor	%eax,%eax
	test	\$15,$ctx
	jnz	.L${mode}_abort
	test	\$15,$len
	jnz	.L${mode}_abort
	lea	.Lpadlock_saved_context(%rip),%rax
	pushf
	cld
	call	_padlock_verify_ctx
	lea	16($ctx),$ctx		# control word
	xor	%eax,%eax
	xor	%ebx,%ebx
	testl	\$`1<<5`,($ctx)		# align bit in control word
	jnz	.L${mode}_aligned
	test	\$0x0f,$out
	setz	%al			# !out_misaligned
	test	\$0x0f,$inp
	setz	%bl			# !inp_misaligned
	test	%ebx,%eax
	jnz	.L${mode}_aligned
	neg	%rax
	mov	\$$PADLOCK_CHUNK,$chunk
	not	%rax			# out_misaligned?-1:0
	lea	(%rsp),%rbp
	cmp	$chunk,$len
	cmovc	$len,$chunk		# chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
	and	$chunk,%rax		# out_misaligned?chunk:0
	mov	$len,$chunk
	neg	%rax
	and	\$$PADLOCK_CHUNK-1,$chunk	# chunk%=PADLOCK_CHUNK
	lea	(%rax,%rbp),%rsp
	mov	\$$PADLOCK_CHUNK,%rax
	cmovz	%rax,$chunk			# chunk=chunk?:PADLOCK_CHUNK
___
# ctr32 only: size the first chunk from the current counter value so that
# the counter does not cross a PADLOCK_CHUNK boundary within one chunk.
$code.=<<___				if ($mode eq "ctr32");
.L${mode}_reenter:
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$`$PADLOCK_CHUNK/16-1`,%eax
	mov	\$$PADLOCK_CHUNK,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross PADLOCK_CHUNK
	cmovbe	$len,$chunk
___
# Modes with a prefetch entry: when the remaining data fits in one chunk and
# ends within the prefetch distance of a page boundary, round the chunk down
# to a multiple of that distance; a chunk rounded down to zero goes straight
# to the tail copy.
$code.=<<___				if ($PADLOCK_PREFETCH{$mode});
	cmp	$chunk,$len
	ja	.L${mode}_loop
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax		# mask=distance<prefetch?-prefetch:-1
	and	%rax,$chunk
	jz	.L${mode}_unaligned_tail
___
# Main loop of the unaligned path: save the parameters, stage misaligned
# input, then run the xcrypt instruction on one chunk.
$code.=<<___;
	jmp	.L${mode}_loop
.align	16
.L${mode}_loop:
	cmp	$len,$chunk		# ctr32 artefact
	cmova	$len,$chunk		# ctr32 artefact
	mov	$out,%r8		# save parameters
	mov	$inp,%r9
	mov	$len,%r10
	mov	$chunk,$len
	mov	$chunk,%r11
	test	\$0x0f,$out		# out_misaligned
	cmovnz	%rsp,$out
	test	\$0x0f,$inp		# inp_misaligned
	jz	.L${mode}_inp_aligned
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
	mov	$chunk,$len
	mov	$out,$inp
.L${mode}_inp_aligned:
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
# Modes other than ecb/ctr: copy the IV addressed by %rax back into the
# context.
$code.=<<___				if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
# ctr32 only: if masking the raw counter word with 0xffff0000 leaves no bits
# set, add 0x10000 to its byte-swapped value and store it back.
$code.=<<___				if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	test	\$0xffff0000,%eax
	jnz	.L${mode}_no_carry
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)
.L${mode}_no_carry:
___
# Copy the chunk out of the stack buffer if the output was misaligned, then
# advance out/inp and reduce len by the chunk just processed.
$code.=<<___;
	mov	%r8,$out		# restore parameters
	mov	%r11,$chunk
	test	\$0x0f,$out
	jz	.L${mode}_out_aligned
	mov	$chunk,$len
	lea	(%rsp),$inp
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
.L${mode}_out_aligned:
	mov	%r9,$inp
	mov	%r10,$len
	add	$chunk,$out
	add	$chunk,$inp
	sub	$chunk,$len
	mov	\$$PADLOCK_CHUNK,$chunk
___
# Loop control.  Modes without a prefetch entry simply loop until len
# reaches zero.  The others leave the loop on zero, keep looping while at
# least PADLOCK_CHUNK bytes remain, and otherwise fall into the tail
# handling, which copies the remaining input to the stack before
# re-entering the loop.
					if (!$PADLOCK_PREFETCH{$mode}) {
$code.=<<___;
	jnz	.L${mode}_loop
___
					} else {
$code.=<<___;
	jz	.L${mode}_break
	cmp	$chunk,$len
	jae	.L${mode}_loop
___
$code.=<<___				if ($mode eq "ctr32");
	mov	$len,$chunk
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax
	and	%rax,$chunk
	jnz	.L${mode}_loop
___
$code.=<<___;
.L${mode}_unaligned_tail:
	xor	%eax,%eax
	cmp	%rsp,%rbp
	cmove	$len,%rax
	mov	$out,%r8		# save parameters
	mov	$len,$chunk
	sub	%rax,%rsp		# alloca
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	mov	%rsp,$inp
	mov	%r8, $out		# restore parameters
	mov	$chunk,$len
	jmp	.L${mode}_loop
.align	16
.L${mode}_break:
___
					}
# End of the unaligned path: zero the stack buffer if one was allocated
# (%rsp differs from %rbp) and restore %rsp.  The aligned path starts after
# that.
$code.=<<___;
	cmp	%rbp,%rsp
	je	.L${mode}_done

	pxor	%xmm0,%xmm0
	lea	(%rsp),%rax
.L${mode}_bzero:
	movaps	%xmm0,(%rax)
	lea	16(%rax),%rax
	cmp	%rax,%rbp
	ja	.L${mode}_bzero

.L${mode}_done:
	lea	(%rbp),%rsp
	jmp	.L${mode}_exit

.align	16
.L${mode}_aligned:
___
# ctr32 only, aligned path: process the data in pieces sized so that the
# counter does not cross 2^16 within one xcrypt, adding 0x10000 to the
# byte-swapped counter after each piece.
$code.=<<___				if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$0xffff,%eax
	mov	\$`16*0x10000`,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross 2^16
	cmovbe	$len,$chunk
	jbe	.L${mode}_aligned_skip

.L${mode}_aligned_loop:
	mov	$len,%r10		# save parameters
	mov	$chunk,$len
	mov	$chunk,%r11

	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*

	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)

	mov	%r10,$len		# restore parameters
	sub	%r11,$len
	mov	\$`16*0x10000`,$chunk
	jz	.L${mode}_exit
	cmp	$chunk,$len
	jae	.L${mode}_aligned_loop

.L${mode}_aligned_skip:
___
# Modes with a prefetch entry, aligned path: if the input ends within the
# prefetch distance of a page boundary, split off a remainder (len masked
# with prefetch-1) into %rbp to be handled separately.
$code.=<<___				if ($PADLOCK_PREFETCH{$mode});
	lea	($inp,$len),%rbp
	neg	%rbp
	and	\$0xfff,%rbp		# distance to page boundary
	xor	%eax,%eax
	cmp	\$$PADLOCK_PREFETCH{$mode},%rbp
	mov	\$$PADLOCK_PREFETCH{$mode}-1,%rbp
	cmovae	%rax,%rbp
	and	$len,%rbp		# remainder
	sub	%rbp,$len
	jz	.L${mode}_aligned_tail
___
# Aligned path proper: one xcrypt over the caller's buffers.
$code.=<<___;
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___				if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
# Modes with a prefetch entry: copy the remainder split off above onto the
# stack and run it through the unaligned loop.
$code.=<<___				if ($PADLOCK_PREFETCH{$mode});
	test	%rbp,%rbp		# check remainder
	jz	.L${mode}_exit

.L${mode}_aligned_tail:
	mov	$out,%r8
	mov	%rbp,$chunk
	mov	%rbp,$len
	lea	(%rsp),%rbp
	sub	$len,%rsp
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	lea	(%r8),$out
	lea	(%rsp),$inp
	mov	$chunk,$len
	jmp	.L${mode}_loop
___
# Common epilogue: .L${mode}_exit returns 1 after dropping the flags pushed
# before _padlock_verify_ctx; .L${mode}_abort returns with %eax still 0.
$code.=<<___;
.L${mode}_exit:
	mov	\$1,%eax
	lea	8(%rsp),%rsp
.L${mode}_abort:
	pop	%rbx
	pop	%rbp
	ret
.size	padlock_${mode}_encrypt,.-padlock_${mode}_encrypt
___
}
568
# Emit one padlock_<mode>_encrypt routine per supported cipher mode.  The
# second element of each pair is the last byte of that mode's "rep xcrypt*"
# opcode.
foreach my $spec (
	[ "ecb",   0xc8 ],
	[ "cbc",   0xd0 ],
	[ "cfb",   0xe0 ],
	[ "ofb",   0xe8 ],
	[ "ctr32", 0xd8 ],	# all 64-bit CPUs have working CTR...
) {
	generate_mode(@$spec);
}
574
# Trailing identification string, followed by the writable data section that
# holds .Lpadlock_saved_context, the slot in which _padlock_verify_ctx
# records the context it was last called with.
$code.=<<___;
.asciz	"VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
.data
.align	8
.Lpadlock_saved_context:
	.quad	0
___

# Replace every back-quoted span in the accumulated text with the result of
# evaluating it as Perl (for example `1<<5` becomes 32).
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

# STDOUT is a pipe to the translator.  close() flushes the buffered output,
# waits for the child and returns false if the write failed ($! is set) or
# the translator exited with a non-zero status ($? is set).  Report that
# instead of exiting successfully after a failed translation.
close STDOUT or die "error closing STDOUT: $! (translator status $?)";
588