1238384Sjkim#!/usr/bin/env perl
2238384Sjkim#
3238384Sjkim# ====================================================================
4238384Sjkim# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5238384Sjkim# project. The module is, however, dual licensed under OpenSSL and
6238384Sjkim# CRYPTOGAMS licenses depending on where you obtain it. For further
7238384Sjkim# details see http://www.openssl.org/~appro/cryptogams/.
8238384Sjkim# ====================================================================
9238384Sjkim
10238384Sjkim# June 2011
11238384Sjkim#
12238384Sjkim# This is RC4+MD5 "stitch" implementation. The idea, as spelled in
13238384Sjkim# http://download.intel.com/design/intarch/papers/323686.pdf, is that
14238384Sjkim# since both algorithms exhibit instruction-level parallelism, ILP,
15238384Sjkim# below theoretical maximum, interleaving them would allow to utilize
16238384Sjkim# processor resources better and achieve better performance. RC4
17238384Sjkim# instruction sequence is virtually identical to rc4-x86_64.pl, which
18238384Sjkim# is heavily based on submission by Maxim Perminov, Maxim Locktyukhin
19238384Sjkim# and Jim Guilford of Intel. MD5 is fresh implementation aiming to
20238384Sjkim# minimize register usage, which was used as "main thread" with RC4
21238384Sjkim# weaved into it, one RC4 round per one MD5 round. In addition to the
# stitched subroutine the script can generate standalone replacement
# md5_block_asm_data_order and RC4. Below are performance numbers in
# cycles per processed byte, less is better, for the standalone
# subroutines, sum of them, and stitched one:
26238384Sjkim#
27238384Sjkim#		RC4	MD5	RC4+MD5	stitch	gain
28238384Sjkim# Opteron	6.5(*)	5.4	11.9	7.0	+70%(*)
29238384Sjkim# Core2		6.5	5.8	12.3	7.7	+60%
30238384Sjkim# Westmere	4.3	5.2	9.5	7.0	+36%
31238384Sjkim# Sandy Bridge	4.2	5.5	9.7	6.8	+43%
32238384Sjkim# Atom		9.3	6.5	15.8	11.1	+42%
33238384Sjkim#
34238384Sjkim# (*)	rc4-x86_64.pl delivers 5.3 on Opteron, so real improvement
35238384Sjkim#	is +53%...
36238384Sjkim
# Build-time selection of what the generated routine covers:
#   (1,1) stitched rc4_md5_enc, (1,0) standalone RC4,
#   (0,1) standalone md5_block_asm_data_order.
my ($rc4,$md5)=(1,1);	# what to generate?

# $D is "#" only when MD5 is switched off. In that mode MD5 is still
# stitched into RC4(), but its result is discarded. Idea here is to be
# able to use 'openssl speed rc4' for benchmarking the stitched
# subroutine...
# Assigned unconditionally: "my $x = ... if COND" has undefined behaviour
# in Perl, so the declaration must not carry a statement modifier.
my $D = $md5 ? "" : "#";

# Command line is "[flavour] [output]"; a single argument containing a dot
# is taken to be the output file name.
my $flavour = shift;
my $output  = shift;
if (defined($flavour) && $flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 specifics (SEH tables) are generated for nasm/masm/mingw64 flavours
# or when the output file is named *.asm.
my $win64 = 0;
$win64 = 1 if ((defined($flavour) && $flavour =~ /[nm]asm|mingw64/) ||
	       (defined($output)  && $output  =~ /\.asm$/));

# Look for the perlasm translator next to this script, then in
# ../../perlasm. Capture the directory directly instead of reading $1,
# which is not reset when the match fails (script run without a path).
my ($dir) = $0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = "" unless defined($dir);
my $xlate;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on goes through the translator.
# Paths are quoted so that directories with spaces survive the shell, and
# absent arguments are left out exactly as before.
my $xlate_cmd = "\"$^X\" \"$xlate\"";
$xlate_cmd .= " $flavour"	if (defined($flavour) && length($flavour));
$xlate_cmd .= " \"$output\""	if (defined($output)  && length($output));
open OUT,"| $xlate_cmd" or die "can't call $xlate: $!";
*STDOUT=*OUT;
56238384Sjkim
# Names of the registers carrying the incoming arguments, plus the symbol
# name and argument count of the routine being generated.
my ($dat,$in0,$out,$ctx,$inp,$len, $func,$nargs);

if (($rc4 && $md5) || (!$rc4 && !$md5)) {
  # Stitched routine:
  # void rc4_md5_enc(
  #		RC4_KEY *key,		#
  #		const void *in0,	# RC4 input
  #		void *out,		# RC4 output
  #		MD5_CTX *ctx,		#
  #		const void *inp,	# MD5 input
  #		size_t len);		# number of 64-byte blocks
  $func  = "rc4_md5_enc";
  $nargs = 6;
  ($dat,$in0,$out,$ctx,$inp,$len) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
}
elsif ($rc4) {
  # Standalone RC4: key, length, input, output.
  $func  = "RC4";
  $nargs = 4;
  ($dat,$len,$in0,$out) = ("%rdi","%rsi","%rdx","%rcx");
}
else {
  # Standalone MD5 block function: context, input, length.
  $func  = "md5_block_asm_data_order";
  $nargs = 3;
  ($ctx,$inp,$len) = ("%rdi","%rsi","%rdx");
}
76238384Sjkim
# MD5 additive constants, one per step: R0..R3 emit "add \$$K[$i],$a" for
# step $i. The values match the T[1..64] table of RFC 1321.
my @K=(	0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee,
	0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501,
	0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be,
	0x6b901122,0xfd987193,0xa679438e,0x49b40821,

	0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa,
	0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8,
	0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed,
	0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a,

	0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c,
	0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70,
	0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05,
	0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665,

	0xf4292244,0x432aff97,0xab9423a7,0xfc93a039,
	0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1,
	0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1,
	0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391	);

# Registers holding the four MD5 state words. The round loops rotate this
# list after every step, so each step receives them as ($a,$b,$c,$d).
my @V=("%r8d","%r9d","%r10d","%r11d");	# MD5 registers
my $tmp="%r12d";			# scratch for the MD5 round function

# $XX[0] holds the RC4 x index; $XX[1] is loaded with the address
# ($dat,$XX[0],4) and serves as base for constant-offset table accesses.
my @XX=("%rbp","%rsi");			# RC4 registers
# Table values in flight; the round loops exchange the two entries after
# every step.
my @TX=("%rax","%rbx");
my $YY="%rcx";				# RC4 y index
my $TY="%rdx";				# table value read at index $YY

# Number of steps emitted with constant offsets from $XX[1] before $XX[0]
# is advanced by $MOD and $XX[1] is recomputed.
my $MOD=32;				# 16, 32 or 64
106238384Sjkim
# Function prologue: return immediately on zero length, save the registers
# the body uses and reserve 40 bytes of scratch. Stack slots as used by the
# code generated in this file:
#    0..15(%rsp)  MD5 hash value put aside at the top of .Loop
#   16(%rsp)      pointer to the end of input
#   24(%rsp)      pointer to MD5_CTX
#   32(%rsp)      original $len (RC4-only build)
#   40..87(%rsp)  saved %r15,%r14,%r13,%r12,%rbp,%rbx
#
# Lines tagged #md5# / #rc4# become live only when the respective algorithm
# is enabled (the tag is stripped at the end of the script); otherwise the
# tag leaves them as assembler comments.
$code.=<<___;
.text
.align 16

.globl	$func
.type	$func,\@function,$nargs
$func:
	cmp	\$0,$len
	je	.Labort
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	sub	\$40,%rsp
.Lbody:
___
if ($rc4) {
# Copy the arguments into %r11..%r15. With $D set to "#" the two MD5
# argument moves get an extra leading "#" and therefore stay commented out
# even though MD5 is enabled later for the dummy stitch.
$code.=<<___;
$D#md5#	mov	$ctx,%r11		# reassign arguments
	mov	$len,%r12
	mov	$in0,%r13
	mov	$out,%r14
$D#md5#	mov	$inp,%r15
___
    # From here on the Perl variables name the new homes of the arguments.
    $ctx="%r11"	if ($md5);		# reassign arguments
    $len="%r12";
    $in0="%r13";
    $out="%r14";
    $inp="%r15"	if ($md5);
    $inp=$in0	if (!$md5);
# Load the x and y indices stored in the 8 bytes ahead of the table,
# advance $dat to the table itself, pre-increment x, turn $out into an
# offset relative to $in0 and fetch the first table entry.
$code.=<<___;
	xor	$XX[0],$XX[0]
	xor	$YY,$YY

	lea	8($dat),$dat
	mov	-8($dat),$XX[0]#b
	mov	-4($dat),$YY#b

	inc	$XX[0]#b
	sub	$in0,$out
	movl	($dat,$XX[0],4),$TX[0]#d
___
# RC4-only build: $len is in bytes. Inputs below 128 bytes go straight to
# the byte-at-a-time .Loop1. Otherwise a warm-up loop processes
# (-$XX[0]) & ($MOD-1) bytes one at a time before the block loop starts;
# the original length is saved and $len becomes a count of 64-byte blocks.
$code.=<<___ if (!$md5);
	xor	$TX[1],$TX[1]
	test	\$-128,$len
	jz	.Loop1
	sub	$XX[0],$TX[1]
	and	\$`$MOD-1`,$TX[1]
	jz	.Loop${MOD}_is_hot
	sub	$TX[1],$len
.Loop${MOD}_warmup:
	add	$TX[0]#b,$YY#b
	movl	($dat,$YY,4),$TY#d
	movl	$TX[0]#d,($dat,$YY,4)
	movl	$TY#d,($dat,$XX[0],4)
	add	$TY#b,$TX[0]#b
	inc	$XX[0]#b
	movl	($dat,$TX[0],4),$TY#d
	movl	($dat,$XX[0],4),$TX[0]#d
	xorb	($in0),$TY#b
	movb	$TY#b,($out,$in0)
	lea	1($in0),$in0
	dec	$TX[1]
	jnz	.Loop${MOD}_warmup

	mov	$YY,$TX[1]
	xor	$YY,$YY
	mov	$TX[1]#b,$YY#b

.Loop${MOD}_is_hot:
	mov	$len,32(%rsp)		# save original $len
	shr	\$6,$len		# number of 64-byte blocks
___
  # Benchmark mode: enable MD5 generation after all, with the stack scratch
  # area standing in for MD5_CTX and the RC4 input doubling as MD5 input.
  if ($D && !$md5) {			# stitch in dummy MD5
    $md5=1;
    $ctx="%r11";
    $inp="%r15";
    $code.=<<___;
	mov	%rsp,$ctx
	mov	$in0,$inp
___
  }
}
# Common loop set-up: turn the block count into an end-of-input pointer,
# load the current hash value and enter .Loop, whose first instructions put
# the hash value aside and preload $tmp for the first MD5 step.
$code.=<<___;
#rc4#	add	$TX[0]#b,$YY#b
#rc4#	lea	($dat,$XX[0],4),$XX[1]
	shl	\$6,$len
	add	$inp,$len		# pointer to the end of input
	mov	$len,16(%rsp)

#md5#	mov	$ctx,24(%rsp)		# save pointer to MD5_CTX
#md5#	mov	0*4($ctx),$V[0]		# load current hash value from MD5_CTX
#md5#	mov	1*4($ctx),$V[1]
#md5#	mov	2*4($ctx),$V[2]
#md5#	mov	3*4($ctx),$V[3]
	jmp	.Loop

.align	16
.Loop:
#md5#	mov	$V[0],0*4(%rsp)		# put aside current hash value
#md5#	mov	$V[1],1*4(%rsp)
#md5#	mov	$V[2],2*4(%rsp)
#md5#	mov	$V[3],$tmp		# forward reference
#md5#	mov	$V[3],3*4(%rsp)
___
214238384Sjkim
# R0($i,$a,$b,$c,$d) appends one stitched step to $code: MD5 step $i of the
# first group of 16 (rotations 7,12,17,22, message word $j) interleaved
# with one RC4 step.
#
# MD5 lines: $tmp enters holding $d (the previous "forward reference" move)
# and becomes (($d^$c)&$b)^$d, which is added to $a together with the
# message word and $K[$i]; $a is then rotated and $b added. The final move
# preloads $tmp for the next step ($b on the last step of the group).
# RC4 lines: exchange the table entries at the x and y indices and insert
# the word found at index $TX[0] (byte sum of the two entries) into %xmm0
# (even $j) or %xmm1 (odd $j) at word position $j>>1.
sub R0 {
  my ($i,$a,$b,$c,$d)=@_;
  my @rot0=(7,12,17,22);
  my $j=$i%16;		# step within this group of 16
  my $k=$i%$MOD;	# step within the current $XX[1] window
  my $xmm="%xmm".($j&1);
    # last step of the group: fetch input bytes 0..15 of the block
    $code.="	movdqu	($in0),%xmm2\n"		if ($rc4 && $j==15);
    # last step of the window: advance the x index by $MOD
    $code.="	add	\$$MOD,$XX[0]#b\n"	if ($rc4 && $j==15 && $k==$MOD-1);
    # first two steps of the group: clear the accumulators
    $code.="	pxor	$xmm,$xmm\n"		if ($rc4 && $j<=1);
    $code.=<<___;
#rc4#	movl	($dat,$YY,4),$TY#d
#md5#	xor	$c,$tmp
#rc4#	movl	$TX[0]#d,($dat,$YY,4)
#md5#	and	$b,$tmp
#md5#	add	4*`$j`($inp),$a
#rc4#	add	$TY#b,$TX[0]#b
#rc4#	movl	`4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
#md5#	add	\$$K[$i],$a
#md5#	xor	$d,$tmp
#rc4#	movz	$TX[0]#b,$TX[0]#d
#rc4#	movl	$TY#d,4*$k($XX[1])
#md5#	add	$tmp,$a
#rc4#	add	$TX[1]#b,$YY#b
#md5#	rol	\$$rot0[$j%4],$a
#md5#	mov	`$j==15?"$b":"$c"`,$tmp		# forward reference
#rc4#	pinsrw	\$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
#md5#	add	$b,$a
___
    # end of window: reduce the y index to its low byte and re-base $XX[1]
    $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
	mov	$YY,$XX[1]
	xor	$YY,$YY				# keyword to partial register
	mov	$XX[1]#b,$YY#b
	lea	($dat,$XX[0],4),$XX[1]
___
    # end of group: xor the input in %xmm2 with %xmm0 and with %xmm1
    # shifted left by 8 bits
    $code.=<<___ if ($rc4 && $j==15);
	psllq	\$8,%xmm1
	pxor	%xmm0,%xmm2
	pxor	%xmm1,%xmm2
___
}
# R1($i,$a,$b,$c,$d) appends one stitched step to $code: MD5 step $i of the
# second group of 16 (rotations 5,9,14,20, message word (1+5*$j)%16)
# interleaved with one RC4 step.
#
# MD5 lines: $tmp enters holding $c and becomes (($c^$b)&$d)^$c, which is
# added to $a together with the message word and $K[$i]; $a is then rotated
# and $b added. The final move preloads $tmp for the next step ($c on the
# last step of the group).
# RC4 lines: same exchange-and-insert sequence as in R0; the group's input
# bytes 16..31 are loaded into %xmm3 and combined there.
sub R1 {
  my ($i,$a,$b,$c,$d)=@_;
  my @rot1=(5,9,14,20);
  my $j=$i%16;		# step within this group of 16
  my $k=$i%$MOD;	# step within the current $XX[1] window
  my $xmm="%xmm".($j&1);
    # last step of the group: fetch input bytes 16..31 of the block
    $code.="	movdqu	16($in0),%xmm3\n"	if ($rc4 && $j==15);
    # last step of the window: advance the x index by $MOD
    $code.="	add	\$$MOD,$XX[0]#b\n"	if ($rc4 && $j==15 && $k==$MOD-1);
    # first two steps of the group: clear the accumulators
    $code.="	pxor	$xmm,$xmm\n"		if ($rc4 && $j<=1);
    $code.=<<___;
#rc4#	movl	($dat,$YY,4),$TY#d
#md5#	xor	$b,$tmp
#rc4#	movl	$TX[0]#d,($dat,$YY,4)
#md5#	and	$d,$tmp
#md5#	add	4*`((1+5*$j)%16)`($inp),$a
#rc4#	add	$TY#b,$TX[0]#b
#rc4#	movl	`4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
#md5#	add	\$$K[$i],$a
#md5#	xor	$c,$tmp
#rc4#	movz	$TX[0]#b,$TX[0]#d
#rc4#	movl	$TY#d,4*$k($XX[1])
#md5#	add	$tmp,$a
#rc4#	add	$TX[1]#b,$YY#b
#md5#	rol	\$$rot1[$j%4],$a
#md5#	mov	`$j==15?"$c":"$b"`,$tmp		# forward reference
#rc4#	pinsrw	\$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
#md5#	add	$b,$a
___
    # end of window: reduce the y index to its low byte and re-base $XX[1]
    $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
	mov	$YY,$XX[1]
	xor	$YY,$YY				# keyword to partial register
	mov	$XX[1]#b,$YY#b
	lea	($dat,$XX[0],4),$XX[1]
___
    # end of group: xor the input in %xmm3 with %xmm0 and with %xmm1
    # shifted left by 8 bits
    $code.=<<___ if ($rc4 && $j==15);
	psllq	\$8,%xmm1
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3
___
}
# R2($i,$a,$b,$c,$d) appends one stitched step to $code: MD5 step $i of the
# third group of 16 (rotations 4,11,16,23, message word (5+3*$j)%16)
# interleaved with one RC4 step.
#
# MD5 lines: $tmp enters holding $d and becomes $d^$c^$b, which is added to
# $a together with the message word and $K[$i]; $a is then rotated and $b
# added. The final move preloads $tmp for the next step (the constant -1 on
# the last step of the group).
# RC4 lines: same exchange-and-insert sequence as in R0; the group's input
# bytes 32..47 are loaded into %xmm4 and combined there.
sub R2 {
  my ($i,$a,$b,$c,$d)=@_;
  my @rot2=(4,11,16,23);
  my $j=$i%16;		# step within this group of 16
  my $k=$i%$MOD;	# step within the current $XX[1] window
  my $xmm="%xmm".($j&1);
    # last step of the group: fetch input bytes 32..47 of the block
    $code.="	movdqu	32($in0),%xmm4\n"	if ($rc4 && $j==15);
    # last step of the window: advance the x index by $MOD
    $code.="	add	\$$MOD,$XX[0]#b\n"	if ($rc4 && $j==15 && $k==$MOD-1);
    # first two steps of the group: clear the accumulators
    $code.="	pxor	$xmm,$xmm\n"		if ($rc4 && $j<=1);
    $code.=<<___;
#rc4#	movl	($dat,$YY,4),$TY#d
#md5#	xor	$c,$tmp
#rc4#	movl	$TX[0]#d,($dat,$YY,4)
#md5#	xor	$b,$tmp
#md5#	add	4*`((5+3*$j)%16)`($inp),$a
#rc4#	add	$TY#b,$TX[0]#b
#rc4#	movl	`4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
#md5#	add	\$$K[$i],$a
#rc4#	movz	$TX[0]#b,$TX[0]#d
#md5#	add	$tmp,$a
#rc4#	movl	$TY#d,4*$k($XX[1])
#rc4#	add	$TX[1]#b,$YY#b
#md5#	rol	\$$rot2[$j%4],$a
#md5#	mov	`$j==15?"\\\$-1":"$c"`,$tmp	# forward reference
#rc4#	pinsrw	\$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
#md5#	add	$b,$a
___
    # end of window: reduce the y index to its low byte and re-base $XX[1]
    $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
	mov	$YY,$XX[1]
	xor	$YY,$YY				# keyword to partial register
	mov	$XX[1]#b,$YY#b
	lea	($dat,$XX[0],4),$XX[1]
___
    # end of group: xor the input in %xmm4 with %xmm0 and with %xmm1
    # shifted left by 8 bits
    $code.=<<___ if ($rc4 && $j==15);
	psllq	\$8,%xmm1
	pxor	%xmm0,%xmm4
	pxor	%xmm1,%xmm4
___
}
# R3($i,$a,$b,$c,$d) appends one stitched step to $code: MD5 step $i of the
# fourth group of 16 (rotations 6,10,15,21, message word (7*$j)%16)
# interleaved with one RC4 step.
#
# MD5 lines: $tmp enters holding -1 and becomes ((~$d)|$b)^$c, which is
# added to $a together with the message word and $K[$i]; $a is then rotated
# and $b added. $tmp is reset to -1 for the next step.
# RC4 lines: same exchange-and-insert sequence as in R0; the group's input
# bytes 48..63 are loaded into %xmm5 and combined there. Unlike R0..R2, the
# end-of-group fix-up is emitted regardless of $k and also reduces the x
# index to its low byte.
sub R3 {
  my ($i,$a,$b,$c,$d)=@_;
  my @rot3=(6,10,15,21);
  my $j=$i%16;		# step within this group of 16
  my $k=$i%$MOD;	# step within the current $XX[1] window
  my $xmm="%xmm".($j&1);
    # last step of the group: fetch input bytes 48..63 of the block
    $code.="	movdqu	48($in0),%xmm5\n"	if ($rc4 && $j==15);
    # last step of the window: advance the x index by $MOD
    $code.="	add	\$$MOD,$XX[0]#b\n"	if ($rc4 && $j==15 && $k==$MOD-1);
    # first two steps of the group: clear the accumulators
    $code.="	pxor	$xmm,$xmm\n"		if ($rc4 && $j<=1);
    $code.=<<___;
#rc4#	movl	($dat,$YY,4),$TY#d
#md5#	xor	$d,$tmp
#rc4#	movl	$TX[0]#d,($dat,$YY,4)
#md5#	or	$b,$tmp
#md5#	add	4*`((7*$j)%16)`($inp),$a
#rc4#	add	$TY#b,$TX[0]#b
#rc4#	movl	`4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
#md5#	add	\$$K[$i],$a
#rc4#	movz	$TX[0]#b,$TX[0]#d
#md5#	xor	$c,$tmp
#rc4#	movl	$TY#d,4*$k($XX[1])
#md5#	add	$tmp,$a
#rc4#	add	$TX[1]#b,$YY#b
#md5#	rol	\$$rot3[$j%4],$a
#md5#	mov	\$-1,$tmp			# forward reference
#rc4#	pinsrw	\$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
#md5#	add	$b,$a
___
    # end of the 64-step block: reduce both indices to their low bytes,
    # re-base $XX[1], then xor the input in %xmm5 with %xmm0 and with %xmm1
    # shifted left by 8 bits
    $code.=<<___ if ($rc4 && $j==15);
	mov	$XX[0],$XX[1]
	xor	$XX[0],$XX[0]			# keyword to partial register
	mov	$XX[1]#b,$XX[0]#b
	mov	$YY,$XX[1]
	xor	$YY,$YY				# keyword to partial register
	mov	$XX[1]#b,$YY#b
	lea	($dat,$XX[0],4),$XX[1]
	psllq	\$8,%xmm1
	pxor	%xmm0,%xmm5
	pxor	%xmm1,%xmm5
___
}
375238384Sjkim
# Emit the 64 stitched steps, 16 from each generator in turn. After every
# step the MD5 register list is rotated right by one and the two RC4 value
# registers trade places, so each generator always sees its operands in
# the same positions. $i ends up at 64.
my $i=0;
foreach my $emit_step (\&R0,\&R1,\&R2,\&R3) {
    for (1..16) {
	$emit_step->($i,@V);
	@V  = ($V[$#V],@V[0..$#V-1]);	# last register moves to the front
	@TX = (@TX[1..$#TX],$TX[0]);	# first register moves to the back
	$i++;
    }
}
381238384Sjkim
# Tail of .Loop: fold the hash value put aside on the stack back into the
# working registers, store the four 16-byte RC4 result vectors, advance the
# input pointers and loop until the end-of-input pointer saved at 16(%rsp)
# is reached. After the loop the hash value is written back through the
# MD5_CTX pointer saved at 24(%rsp), and the add of $TX[0] into $YY that
# was issued ahead of the step using it is undone.
$code.=<<___;
#md5#	add	0*4(%rsp),$V[0]		# accumulate hash value
#md5#	add	1*4(%rsp),$V[1]
#md5#	add	2*4(%rsp),$V[2]
#md5#	add	3*4(%rsp),$V[3]

#rc4#	movdqu	%xmm2,($out,$in0)	# write RC4 output
#rc4#	movdqu	%xmm3,16($out,$in0)
#rc4#	movdqu	%xmm4,32($out,$in0)
#rc4#	movdqu	%xmm5,48($out,$in0)
#md5#	lea	64($inp),$inp
#rc4#	lea	64($in0),$in0
	cmp	16(%rsp),$inp		# are we done?
	jb	.Loop

#md5#	mov	24(%rsp),$len		# restore pointer to MD5_CTX
#rc4#	sub	$TX[0]#b,$YY#b		# correct $YY
#md5#	mov	$V[0],0*4($len)		# write MD5_CTX
#md5#	mov	$V[1],1*4($len)
#md5#	mov	$V[2],2*4($len)
#md5#	mov	$V[3],3*4($len)
___
# RC4-only build (with or without the dummy MD5 stitch): process the
# remaining (original length & 63) bytes one at a time. .Loop1 is also the
# target for short inputs that skip the block loop altogether.
$code.=<<___ if ($rc4 && (!$md5 || $D));
	mov	32(%rsp),$len		# restore original $len
	and	\$63,$len		# remaining bytes
	jnz	.Loop1
	jmp	.Ldone

.align	16
.Loop1:
	add	$TX[0]#b,$YY#b
	movl	($dat,$YY,4),$TY#d
	movl	$TX[0]#d,($dat,$YY,4)
	movl	$TY#d,($dat,$XX[0],4)
	add	$TY#b,$TX[0]#b
	inc	$XX[0]#b
	movl	($dat,$TX[0],4),$TY#d
	movl	($dat,$XX[0],4),$TX[0]#d
	xorb	($in0),$TY#b
	movb	$TY#b,($out,$in0)
	lea	1($in0),$in0
	dec	$len
	jnz	.Loop1

.Ldone:
___
# Function epilogue: undo the pre-increment of the x index, store both
# indices back in front of the table, restore the saved registers from
# their slots above the 40-byte scratch area and return. .Labort is the
# early exit taken on zero length, before anything was pushed.
$code.=<<___;
#rc4#	sub	\$1,$XX[0]#b
#rc4#	movl	$XX[0]#d,-8($dat)
#rc4#	movl	$YY#d,-4($dat)

	mov	40(%rsp),%r15
	mov	48(%rsp),%r14
	mov	56(%rsp),%r13
	mov	64(%rsp),%r12
	mov	72(%rsp),%rbp
	mov	80(%rsp),%rbx
	lea	88(%rsp),%rsp
.Lepilogue:
.Labort:
	ret
.size $func,.-$func
___
445238384Sjkim
# Only emitted when RC4 is generated with $D set (MD5 switched off).
if ($rc4 && $D) {	# sole purpose of this section is to provide
			# option to use the generated module as drop-in
			# replacement for rc4-x86_64.pl for debugging
			# and testing purposes...
# Registers local to RC4_set_key; these shadow the file-level $dat/$len/$inp
# for the heredoc below only.
my ($idx,$ido)=("%r8","%r9");
my ($dat,$len,$inp)=("%rdi","%rsi","%rdx");

# RC4_set_key: .Lw1stloop fills the table, stored as 32-bit entries starting
# 8 bytes into the structure, with the values 0..255. .Lw2ndloop then walks
# $ido over all 256 entries, adds the current key byte and entry to the low
# byte of $idx and exchanges entries $ido and $idx; the key is addressed
# with a negative offset that counts up and is reset from %rcx when it
# reaches zero. Finally both indices in front of the table are zeroed.
# RC4_options returns the address of a fixed description string.
$code.=<<___;
.globl	RC4_set_key
.type	RC4_set_key,\@function,3
.align	16
RC4_set_key:
	lea	8($dat),$dat
	lea	($inp,$len),$inp
	neg	$len
	mov	$len,%rcx
	xor	%eax,%eax
	xor	$ido,$ido
	xor	%r10,%r10
	xor	%r11,%r11
	jmp	.Lw1stloop

.align	16
.Lw1stloop:
	mov	%eax,($dat,%rax,4)
	add	\$1,%al
	jnc	.Lw1stloop

	xor	$ido,$ido
	xor	$idx,$idx
.align	16
.Lw2ndloop:
	mov	($dat,$ido,4),%r10d
	add	($inp,$len,1),$idx#b
	add	%r10b,$idx#b
	add	\$1,$len
	mov	($dat,$idx,4),%r11d
	cmovz	%rcx,$len
	mov	%r10d,($dat,$idx,4)
	mov	%r11d,($dat,$ido,4)
	add	\$1,$ido#b
	jnc	.Lw2ndloop

	xor	%eax,%eax
	mov	%eax,-8($dat)
	mov	%eax,-4($dat)
	ret
.size	RC4_set_key,.-RC4_set_key

.globl	RC4_options
.type	RC4_options,\@abi-omnipotent
.align	16
RC4_options:
	lea	.Lopts(%rip),%rax
	ret
.align	64
.Lopts:
.asciz	"rc4(64x,int)"
.align	64
.size	RC4_options,.-RC4_options
___
}
# Win64 structured exception handling support, emitted only for Win64
# targets: an unwind handler for $func plus its .pdata/.xdata entries.
#
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# The handler compares the faulting Rip against .Lbody and .Lepilogue.
# Inside that range the callee-saved registers are still in the stack
# slots written by the prologue (40..80 bytes above Rsp), so they are
# copied into the CONTEXT record and Rsp is moved past the frame. In every
# case Rsi/Rdi are then recovered from 8 and 16 bytes above the resulting
# Rsp, the CONTEXT is copied to disp->ContextRecord, RtlVirtualUnwind is
# called and ExceptionContinueSearch is returned.
if ($win64) {
my $rec="%rcx";
my $frame="%rdx";
my $context="%r8";
my $disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lbody(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lbody
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	40(%rax),%r15
	mov	48(%rax),%r14
	mov	56(%rax),%r13
	mov	64(%rax),%r12
	mov	72(%rax),%rbp
	mov	80(%rax),%rbx
	lea	88(%rax),%rax

	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_$func
	.rva	.LSEH_end_$func
	.rva	.LSEH_info_$func

.section	.xdata
.align	8
.LSEH_info_$func:
	.byte	9,0,0,0
	.rva	se_handler
___
}
613238384Sjkim
# reg_part($reg,$conv) returns the name of the sub-register of $reg selected
# by $conv: "b" (byte), "w" (word) or "d" (dword). Numbered registers
# (%r8..%r15) simply get $conv appended; for the legacy registers the name
# is rewritten (%rax -> %al/%ax/%eax, %rsi -> %sil/%si/%esi, ...). Any other
# $conv leaves a legacy register name untouched.
sub reg_part {
    my ($name,$width)=@_;

    # numbered registers: the width letter is just a suffix
    return $name.$width if ($name =~ /%r[0-9]+/);

    if ($width eq "b") {
	# drop the e/r prefix and a trailing "x", then append "l"
	$name =~ s/%[er]([^x]+)x?/%$1l/;
    }
    elsif ($width eq "w") {
	# drop the e/r prefix
	$name =~ s/%[er](.+)/%$1/;
    }
    elsif ($width eq "d") {
	# force the "e" prefix
	$name =~ s/%[er](.+)/%e$1/;
    }

    return $name;
}
622238384Sjkim
# Post-process the accumulated template and hand it to the translator:
#  1. "%reg#b", "%reg#w", "%reg#d" become the sized sub-register name;
#  2. text between backticks is evaluated as a Perl expression;
#  3. "pinsrw $0,..." is replaced by "movd ...";
#  4. the #md5# / #rc4# tags are stripped for the enabled algorithms, which
#     turns the tagged lines into live code (lines of a disabled algorithm
#     keep their tag and so remain assembler comments).
$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/pinsrw\s+\$0,/movd	/gm;

$code =~ s/#md5#//gm	if ($md5);
$code =~ s/#rc4#//gm	if ($rc4);

print $code;

# STDOUT is a pipe to the translator: close() waits for it and returns
# false if writing failed or the translator exited with an error, so the
# result must be checked or a broken build would go unnoticed.
close STDOUT
    or die "error closing STDOUT: ".($! ? $! : "translator exit status $?");
633