1160814Ssimon#!/usr/bin/perl -w
2160814Ssimon#
3160814Ssimon# MD5 optimized for AMD64.
4160814Ssimon#
5160814Ssimon# Author: Marc Bevand <bevand_m (at) epita.fr>
6160814Ssimon# Licence: I hereby disclaim the copyright on this code and place it
7160814Ssimon# in the public domain.
8160814Ssimon#
9160814Ssimon
10160814Ssimonuse strict;
11160814Ssimon
12160814Ssimonmy $code;
13160814Ssimon
14160814Ssimon# round1_step() does:
15160814Ssimon#   dst = x + ((dst + F(x,y,z) + X[k] + T_i) <<< s)
16160814Ssimon#   %r10d = X[k_next]
17160814Ssimon#   %r11d = z' (copy of z for the next step)
18160814Ssimon# Each round1_step() takes about 5.71 clocks (9 instructions, 1.58 IPC)
sub round1_step
{
    my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
    # The constant is interpolated as a lea displacement below, which must
    # be spelled as a 32-bit signed decimal value.
    $T_i = unpack("l",pack("l", hex($T_i))); # convert to 32-bit signed decimal
    # $pos == -1 marks the first step of the round: prime %r10d with X[0]
    # and %r11d with a copy of z (%edx) before the pipeline starts.
    $code .= " mov	0*4(%rsi),	%r10d		/* (NEXT STEP) X[0] */\n" if ($pos == -1);
    $code .= " mov	%edx,		%r11d		/* (NEXT STEP) z' = %edx */\n" if ($pos == -1);
    # F(x,y,z) = (x & y) | (~x & z), computed branch-free as
    # z ^ ((y ^ z) & x); %r11d holds a copy of z on entry.  Each step also
    # preloads %r10d = X[$k_next] and %r11d = z' for the NEXT step.
    $code .= <<EOF;
	xor	$y,		%r11d		/* y ^ ... */
	lea	$T_i($dst,%r10d),$dst		/* Const + dst + ... */
	and	$x,		%r11d		/* x & ... */
	xor	$z,		%r11d		/* z ^ ... */
	mov	$k_next*4(%rsi),%r10d		/* (NEXT STEP) X[$k_next] */
	add	%r11d,		$dst		/* dst += ... */
	rol	\$$s,		$dst		/* dst <<< s */
	mov	$y,		%r11d		/* (NEXT STEP) z' = $y */
	add	$x,		$dst		/* dst += x */
EOF
}
37160814Ssimon
38160814Ssimon# round2_step() does:
39160814Ssimon#   dst = x + ((dst + G(x,y,z) + X[k] + T_i) <<< s)
40160814Ssimon#   %r10d = X[k_next]
41160814Ssimon#   %r11d = y' (copy of y for the next step)
42160814Ssimon# Each round2_step() takes about 6.22 clocks (9 instructions, 1.45 IPC)
sub round2_step
{
    my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
    # The constant is interpolated as a lea displacement below, which must
    # be spelled as a 32-bit signed decimal value.
    $T_i = unpack("l",pack("l", hex($T_i))); # convert to 32-bit signed decimal
    # $pos == -1 marks the first step of the round: prime %r10d with X[1]
    # and %r11d with a copy of y (%ecx).
    $code .= " mov	1*4(%rsi),	%r10d		/* (NEXT STEP) X[1] */\n" if ($pos == -1);
    $code .= " mov	%ecx,		%r11d		/* (NEXT STEP) y' = %ecx */\n" if ($pos == -1);
    # G(x,y,z) = (x & z) | (y & ~z), computed branch-free as
    # y ^ ((x ^ y) & z); %r11d holds a copy of y on entry.  Each step also
    # preloads %r10d = X[$k_next] and %r11d = y' for the NEXT step.
    $code .= <<EOF;
	xor	$x,		%r11d		/* x ^ ... */
	lea	$T_i($dst,%r10d),$dst		/* Const + dst + ... */
	and	$z,		%r11d		/* z & ... */
	xor	$y,		%r11d		/* y ^ ... */
	mov	$k_next*4(%rsi),%r10d		/* (NEXT STEP) X[$k_next] */
	add	%r11d,		$dst		/* dst += ... */
	rol	\$$s,		$dst		/* dst <<< s */
	mov	$x,		%r11d		/* (NEXT STEP) y' = $x */
	add	$x,		$dst		/* dst += x */
EOF
}
61160814Ssimon
62160814Ssimon# round3_step() does:
63160814Ssimon#   dst = x + ((dst + H(x,y,z) + X[k] + T_i) <<< s)
64160814Ssimon#   %r10d = X[k_next]
65160814Ssimon#   %r11d = y' (copy of y for the next step)
66160814Ssimon# Each round3_step() takes about 4.26 clocks (8 instructions, 1.88 IPC)
sub round3_step
{
    my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
    # The constant is interpolated as a lea displacement below, which must
    # be spelled as a 32-bit signed decimal value.
    $T_i = unpack("l",pack("l", hex($T_i))); # convert to 32-bit signed decimal
    # $pos == -1 marks the first step of the round: prime %r10d with X[5]
    # and %r11d with a copy of y (%ecx).
    $code .= " mov	5*4(%rsi),	%r10d		/* (NEXT STEP) X[5] */\n" if ($pos == -1);
    $code .= " mov	%ecx,		%r11d		/* (NEXT STEP) y' = %ecx */\n" if ($pos == -1);
    # H(x,y,z) = x ^ y ^ z; %r11d holds a copy of y on entry, so two xors
    # finish it.  Each step also preloads %r10d = X[$k_next] and
    # %r11d = y' for the NEXT step (one fewer instruction than the other
    # rounds, hence the better clocks-per-step figure noted above).
    $code .= <<EOF;
	lea	$T_i($dst,%r10d),$dst		/* Const + dst + ... */
	mov	$k_next*4(%rsi),%r10d		/* (NEXT STEP) X[$k_next] */
	xor	$z,		%r11d		/* z ^ ... */
	xor	$x,		%r11d		/* x ^ ... */
	add	%r11d,		$dst		/* dst += ... */
	rol	\$$s,		$dst		/* dst <<< s */
	mov	$x,		%r11d		/* (NEXT STEP) y' = $x */
	add	$x,		$dst		/* dst += x */
EOF
}
84160814Ssimon
85160814Ssimon# round4_step() does:
86160814Ssimon#   dst = x + ((dst + I(x,y,z) + X[k] + T_i) <<< s)
87160814Ssimon#   %r10d = X[k_next]
88160814Ssimon#   %r11d = not z' (copy of not z for the next step)
89160814Ssimon# Each round4_step() takes about 5.27 clocks (9 instructions, 1.71 IPC)
sub round4_step
{
    my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
    # The constant is interpolated as a lea displacement below, which must
    # be spelled as a 32-bit signed decimal value.
    $T_i = unpack("l",pack("l", hex($T_i))); # convert to 32-bit signed decimal
    # $pos == -1 marks the first step of the round: prime %r10d with X[0]
    # and %r11d with ~z (built as 0xffffffff ^ %edx; there is no 32-bit
    # "not reg into other reg", so the xor-with-all-ones idiom is used).
    $code .= " mov	0*4(%rsi),	%r10d		/* (NEXT STEP) X[0] */\n" if ($pos == -1);
    $code .= " mov	\$0xffffffff,	%r11d\n" if ($pos == -1);
    $code .= " xor	%edx,		%r11d		/* (NEXT STEP) not z' = not %edx*/\n"
    if ($pos == -1);
    # I(x,y,z) = y ^ (x | ~z); %r11d holds ~z on entry.  Each step also
    # preloads %r10d = X[$k_next] and rebuilds %r11d = ~z' for the NEXT
    # step via the same 0xffffffff-xor idiom.
    $code .= <<EOF;
	lea	$T_i($dst,%r10d),$dst		/* Const + dst + ... */
	or	$x,		%r11d		/* x | ... */
	xor	$y,		%r11d		/* y ^ ... */
	add	%r11d,		$dst		/* dst += ... */
	mov	$k_next*4(%rsi),%r10d		/* (NEXT STEP) X[$k_next] */
	mov	\$0xffffffff,	%r11d
	rol	\$$s,		$dst		/* dst <<< s */
	xor	$y,		%r11d		/* (NEXT STEP) not z' = not $y */
	add	$x,		$dst		/* dst += x */
EOF
}
110160814Ssimon
# First command-line argument selects the output flavour/file; it is passed
# straight through to the perlasm translator.
my $output = shift;
# Route our STDOUT through the x86_64 perlasm translator, which turns the
# \@function/.size-style directives below into real assembler for the
# target toolchain.  Quote $^X so an interpreter path containing spaces
# still works, and die if the pipe cannot be set up at all (the original
# unchecked open would silently emit nothing).
open STDOUT, "| \"$^X\" ../perlasm/x86_64-xlate.pl $output"
    or die "can't call ../perlasm/x86_64-xlate.pl: $!";
113160814Ssimon
# Function prologue, emitted verbatim (SysV AMD64 ABI: rdi=ctx, rsi=data,
# rdx=block count).  Saves the callee-saved registers it clobbers, turns
# the block count into an end pointer, loads the chaining state A..D, and
# falls into the per-block loop (or straight to .Lend for zero blocks).
$code .= <<EOF;
.text
.align 16

.globl md5_block_asm_data_order
.type md5_block_asm_data_order,\@function,3
md5_block_asm_data_order:
	push	%rbp
	push	%rbx
	push	%r14
	push	%r15

	# rdi = arg #1 (ctx, MD5_CTX pointer)
	# rsi = arg #2 (ptr, data pointer)
	# rdx = arg #3 (nbr, number of 16-word blocks to process)
	mov	%rdi,		%rbp	# rbp = ctx
	shl	\$6,		%rdx	# rdx = nbr in bytes
	lea	(%rsi,%rdx),	%rdi	# rdi = end
	mov	0*4(%rbp),	%eax	# eax = ctx->A
	mov	1*4(%rbp),	%ebx	# ebx = ctx->B
	mov	2*4(%rbp),	%ecx	# ecx = ctx->C
	mov	3*4(%rbp),	%edx	# edx = ctx->D
	# end is 'rdi'
	# ptr is 'rsi'
	# A is 'eax'
	# B is 'ebx'
	# C is 'ecx'
	# D is 'edx'

	cmp	%rdi,		%rsi		# cmp end with ptr
	je	.Lend				# jmp if ptr == end

	# BEGIN of loop over 16-word blocks
.Lloop:	# save old values of A, B, C, D
	mov	%eax,		%r8d
	mov	%ebx,		%r9d
	mov	%ecx,		%r14d
	mov	%edx,		%r15d
EOF
# The 64 MD5 steps.  Per round: registers rotate through dst so that each
# step's x becomes the next step's copy register; arguments are
# (pos, dst, x, y, z, k_next, T_i, s) where k_next is the message-word
# index preloaded for the FOLLOWING step, T_i is the additive constant
# (the floor(2^32*abs(sin(i))) table of RFC 1321) and s the left-rotate.
# pos = -1 / 0 / 1 marks the first / middle / last step of a round.

# Round 1: F function, message words in order 0..15, shifts 7,12,17,22.
round1_step(-1,'%eax','%ebx','%ecx','%edx', '1','0xd76aa478', '7');
round1_step( 0,'%edx','%eax','%ebx','%ecx', '2','0xe8c7b756','12');
round1_step( 0,'%ecx','%edx','%eax','%ebx', '3','0x242070db','17');
round1_step( 0,'%ebx','%ecx','%edx','%eax', '4','0xc1bdceee','22');
round1_step( 0,'%eax','%ebx','%ecx','%edx', '5','0xf57c0faf', '7');
round1_step( 0,'%edx','%eax','%ebx','%ecx', '6','0x4787c62a','12');
round1_step( 0,'%ecx','%edx','%eax','%ebx', '7','0xa8304613','17');
round1_step( 0,'%ebx','%ecx','%edx','%eax', '8','0xfd469501','22');
round1_step( 0,'%eax','%ebx','%ecx','%edx', '9','0x698098d8', '7');
round1_step( 0,'%edx','%eax','%ebx','%ecx','10','0x8b44f7af','12');
round1_step( 0,'%ecx','%edx','%eax','%ebx','11','0xffff5bb1','17');
round1_step( 0,'%ebx','%ecx','%edx','%eax','12','0x895cd7be','22');
round1_step( 0,'%eax','%ebx','%ecx','%edx','13','0x6b901122', '7');
round1_step( 0,'%edx','%eax','%ebx','%ecx','14','0xfd987193','12');
round1_step( 0,'%ecx','%edx','%eax','%ebx','15','0xa679438e','17');
round1_step( 1,'%ebx','%ecx','%edx','%eax', '0','0x49b40821','22');

# Round 2: G function, word order (5i+1) mod 16, shifts 5,9,14,20.
round2_step(-1,'%eax','%ebx','%ecx','%edx', '6','0xf61e2562', '5');
round2_step( 0,'%edx','%eax','%ebx','%ecx','11','0xc040b340', '9');
round2_step( 0,'%ecx','%edx','%eax','%ebx', '0','0x265e5a51','14');
round2_step( 0,'%ebx','%ecx','%edx','%eax', '5','0xe9b6c7aa','20');
round2_step( 0,'%eax','%ebx','%ecx','%edx','10','0xd62f105d', '5');
round2_step( 0,'%edx','%eax','%ebx','%ecx','15', '0x2441453', '9');
round2_step( 0,'%ecx','%edx','%eax','%ebx', '4','0xd8a1e681','14');
round2_step( 0,'%ebx','%ecx','%edx','%eax', '9','0xe7d3fbc8','20');
round2_step( 0,'%eax','%ebx','%ecx','%edx','14','0x21e1cde6', '5');
round2_step( 0,'%edx','%eax','%ebx','%ecx', '3','0xc33707d6', '9');
round2_step( 0,'%ecx','%edx','%eax','%ebx', '8','0xf4d50d87','14');
round2_step( 0,'%ebx','%ecx','%edx','%eax','13','0x455a14ed','20');
round2_step( 0,'%eax','%ebx','%ecx','%edx', '2','0xa9e3e905', '5');
round2_step( 0,'%edx','%eax','%ebx','%ecx', '7','0xfcefa3f8', '9');
round2_step( 0,'%ecx','%edx','%eax','%ebx','12','0x676f02d9','14');
round2_step( 1,'%ebx','%ecx','%edx','%eax', '0','0x8d2a4c8a','20');

# Round 3: H function, word order (3i+5) mod 16, shifts 4,11,16,23.
round3_step(-1,'%eax','%ebx','%ecx','%edx', '8','0xfffa3942', '4');
round3_step( 0,'%edx','%eax','%ebx','%ecx','11','0x8771f681','11');
round3_step( 0,'%ecx','%edx','%eax','%ebx','14','0x6d9d6122','16');
round3_step( 0,'%ebx','%ecx','%edx','%eax', '1','0xfde5380c','23');
round3_step( 0,'%eax','%ebx','%ecx','%edx', '4','0xa4beea44', '4');
round3_step( 0,'%edx','%eax','%ebx','%ecx', '7','0x4bdecfa9','11');
round3_step( 0,'%ecx','%edx','%eax','%ebx','10','0xf6bb4b60','16');
round3_step( 0,'%ebx','%ecx','%edx','%eax','13','0xbebfbc70','23');
round3_step( 0,'%eax','%ebx','%ecx','%edx', '0','0x289b7ec6', '4');
round3_step( 0,'%edx','%eax','%ebx','%ecx', '3','0xeaa127fa','11');
round3_step( 0,'%ecx','%edx','%eax','%ebx', '6','0xd4ef3085','16');
round3_step( 0,'%ebx','%ecx','%edx','%eax', '9', '0x4881d05','23');
round3_step( 0,'%eax','%ebx','%ecx','%edx','12','0xd9d4d039', '4');
round3_step( 0,'%edx','%eax','%ebx','%ecx','15','0xe6db99e5','11');
round3_step( 0,'%ecx','%edx','%eax','%ebx', '2','0x1fa27cf8','16');
round3_step( 1,'%ebx','%ecx','%edx','%eax', '0','0xc4ac5665','23');

# Round 4: I function, word order (7i) mod 16, shifts 6,10,15,21.
round4_step(-1,'%eax','%ebx','%ecx','%edx', '7','0xf4292244', '6');
round4_step( 0,'%edx','%eax','%ebx','%ecx','14','0x432aff97','10');
round4_step( 0,'%ecx','%edx','%eax','%ebx', '5','0xab9423a7','15');
round4_step( 0,'%ebx','%ecx','%edx','%eax','12','0xfc93a039','21');
round4_step( 0,'%eax','%ebx','%ecx','%edx', '3','0x655b59c3', '6');
round4_step( 0,'%edx','%eax','%ebx','%ecx','10','0x8f0ccc92','10');
round4_step( 0,'%ecx','%edx','%eax','%ebx', '1','0xffeff47d','15');
round4_step( 0,'%ebx','%ecx','%edx','%eax', '8','0x85845dd1','21');
round4_step( 0,'%eax','%ebx','%ecx','%edx','15','0x6fa87e4f', '6');
round4_step( 0,'%edx','%eax','%ebx','%ecx', '6','0xfe2ce6e0','10');
round4_step( 0,'%ecx','%edx','%eax','%ebx','13','0xa3014314','15');
round4_step( 0,'%ebx','%ecx','%edx','%eax', '4','0x4e0811a1','21');
round4_step( 0,'%eax','%ebx','%ecx','%edx','11','0xf7537e82', '6');
round4_step( 0,'%edx','%eax','%ebx','%ecx', '2','0xbd3af235','10');
round4_step( 0,'%ecx','%edx','%eax','%ebx', '9','0x2ad7d2bb','15');
round4_step( 1,'%ebx','%ecx','%edx','%eax', '0','0xeb86d391','21');
# Function epilogue, emitted verbatim: fold this block's result into the
# saved chaining values, advance the data pointer and loop while blocks
# remain, then store A..D back into the context and restore the
# callee-saved registers pushed in the prologue.
$code .= <<EOF;
	# add old values of A, B, C, D
	add	%r8d,	%eax
	add	%r9d,	%ebx
	add	%r14d,	%ecx
	add	%r15d,	%edx

	# loop control
	add	\$64,		%rsi		# ptr += 64
	cmp	%rdi,		%rsi		# cmp end with ptr
	jb	.Lloop				# jmp if ptr < end
	# END of loop over 16-word blocks

.Lend:
	mov	%eax,		0*4(%rbp)	# ctx->A = A
	mov	%ebx,		1*4(%rbp)	# ctx->B = B
	mov	%ecx,		2*4(%rbp)	# ctx->C = C
	mov	%edx,		3*4(%rbp)	# ctx->D = D

	pop	%r15
	pop	%r14
	pop	%rbx
	pop	%rbp
	ret
.size md5_block_asm_data_order,.-md5_block_asm_data_order
EOF
246160814Ssimon
# Hand the accumulated code to the translator (STDOUT is the pipe opened
# above).
print $code;

# STDOUT is a piped, buffered write handle: a failing or crashed translator
# only surfaces when the buffer is flushed and the child is reaped at
# close time, so an unchecked close could leave a truncated or missing
# assembly file while this script still exits 0.
close STDOUT or die "error closing STDOUT: $!";
250