/***************************************************************************
*   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
*                                                                         *
*   This program is free software; you can redistribute it and/or modify  *
*   it under the terms of the GNU General Public License as published by  *
*   the Free Software Foundation; either version 2 of the License, or     *
*   (at your option) any later version.                                   *
*                                                                         *
*   This program is distributed in the hope that it will be useful,       *
*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*   GNU General Public License for more details.                          *
*                                                                         *
*   You should have received a copy of the GNU General Public License     *
*   along with this program; if not, write to the                         *
*   Free Software Foundation, Inc.,                                       *
*   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
***************************************************************************/

/*
 * Twofish block cipher, 32-bit x86 implementation.
 * Syntax: GNU as, AT&T operand order (source first, destination last).
 * The file is run through the C preprocessor (#include / #define below).
 */
.file "twofish-i586-asm.S"
.text

/* NOTE(review): presumably supplies crypto_tfm_ctx_offset - confirm */
#include <asm/asm-offsets.h>

/*
 * Stack argument offsets, relative to %esp at function entry
 * (return address at 0).  The entry points push four registers first and
 * therefore add 16 to these offsets when loading the arguments.
 */

#define in_blk    12  /* input byte array address parameter*/
#define out_blk   8  /* output byte array address parameter*/
#define tfm       4  /* Twofish context structure */

/* byte offsets of the four 32-bit words a, b, c, d inside a block */
#define a_offset	0
#define b_offset	4
#define c_offset	8
#define d_offset	12

/*
 * Structure of the crypto context struct: byte offsets from the context
 * base.  The four tables are 1024 bytes apart (256 words of 4 bytes).
 */

#define s0	0	/* S0 Array 256 Words each */
#define s1	1024	/* S1 Array */
#define s2	2048	/* S2 Array */
#define s3	3072	/* S3 Array */
#define w	4096	/* 8 whitening keys (word) */
#define k	4128	/* key 1-32 ( word ) */

/*
 * define a few register aliases to allow macro substitution
 *
 * The round macros receive a register family name (R0..R3) and paste a
 * suffix onto it with ##:
 *   D = full 32-bit register, B = low byte, H = second byte (bits 8-15)
 */

#define R0D    %eax
#define R0B    %al
#define R0H    %ah

#define R1D    %ebx
#define R1B    %bl
#define R1H    %bh

#define R2D    %ecx
#define R2B    %cl
#define R2H    %ch

#define R3D    %edx
#define R3B    %dl
#define R3H    %dh


/*
 * performs input whitening
 * src      32-bit register, xored in place with the whitening key word
 * context  register holding the context base address
 * offset   byte offset of the word inside the first four whitening keys
 */
#define input_whitening(src,context,offset)\
	xor	w+offset(context),	src;

/*
 * performs output whitening
 * src      32-bit register, xored in place with the whitening key word
 * context  register holding the context base address
 * offset   byte offset of the word inside the second group of four
 *          whitening keys (they start 16 bytes after w)
 */
#define output_whitening(src,context,offset)\
	xor	w+16+offset(context),	src;

/*
 * One encryption round.
 *
 * a input register containing a (rotated 16)
 * b input register containing b
 * c input register containing c
 * d input register containing d (already rol $1)
 * round byte offset of this round's two key words inside k
 * operations on a and b are interleaved to increase performance
 *
 * a..d are register family names (R0..R3).  %ebp must hold the context
 * base.  %esi and %edi are clobbered and the incoming d is parked on the
 * stack for the duration of the macro.
 *
 * %esi collects the four table lookups for the bytes of a (call it A),
 * the d register collects the four lookups for the bytes of b (B). Then:
 *   c = rol(c ^ (A + B + k[round]), 15)
 *   d = (A + 2*B + k[round + 4]) ^ incoming d
 *
 * Net rotation left on the inputs: a ror 32 in total with the rotation
 * it arrived with (i.e. unrotated), b ror 31 (= rol 1).  The registers
 * are therefore in the form the next round expects when it is invoked
 * with the pairs swapped, (c,d,a,b).
 */
#define encrypt_round(a,b,c,d,round)\
	push	d ## D;\
	movzx	b ## B,		%edi;\
	mov	s1(%ebp,%edi,4),d ## D;\
	movzx	a ## B,		%edi;\
	mov	s2(%ebp,%edi,4),%esi;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%ebp,%edi,4),d ## D;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%ebp,%edi,4),%esi;\
	movzx	b ## B,		%edi;\
	xor	s3(%ebp,%edi,4),d ## D;\
	movzx	a ## B,		%edi;\
	xor	s0(%ebp,%edi,4),%esi;\
	movzx	b ## H,		%edi;\
	ror	$15,		b ## D;\
	xor	s0(%ebp,%edi,4),d ## D;\
	movzx	a ## H,		%edi;\
	xor	s1(%ebp,%edi,4),%esi;\
	pop	%edi;\
	add	d ## D,		%esi;\
	add	%esi,		d ## D;\
	add	k+round(%ebp),	%esi;\
	xor	%esi,		c ## D;\
	rol	$15,		c ## D;\
	add	k+4+round(%ebp),d ## D;\
	xor	%edi,		d ## D;

/*
 * Final encryption round.
 *
 * a input register containing a (rotated 16)
 * b input register containing b
 * c input register containing c
 * d input register containing d (already rol $1)
 * round byte offset of this round's two key words inside k
 * operations on a and b are interleaved to increase performance
 * last round has different rotations for the output preparation
 *
 * Same lookups and key additions as encrypt_round; only the rotations
 * differ: b is rotated by 16 twice (left unrotated) and c is finished
 * with ror $1 instead of rol $15, so no register is left pre-rotated
 * for a following round.
 *
 * %ebp must hold the context base.  %esi and %edi are clobbered and the
 * incoming d is parked on the stack for the duration of the macro.
 */
#define encrypt_last_round(a,b,c,d,round)\
	push	d ## D;\
	movzx	b ## B,		%edi;\
	mov	s1(%ebp,%edi,4),d ## D;\
	movzx	a ## B,		%edi;\
	mov	s2(%ebp,%edi,4),%esi;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%ebp,%edi,4),d ## D;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%ebp,%edi,4),%esi;\
	movzx	b ## B,		%edi;\
	xor	s3(%ebp,%edi,4),d ## D;\
	movzx	a ## B,		%edi;\
	xor	s0(%ebp,%edi,4),%esi;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s0(%ebp,%edi,4),d ## D;\
	movzx	a ## H,		%edi;\
	xor	s1(%ebp,%edi,4),%esi;\
	pop	%edi;\
	add	d ## D,		%esi;\
	add	%esi,		d ## D;\
	add	k+round(%ebp),	%esi;\
	xor	%esi,		c ## D;\
	ror	$1,		c ## D;\
	add	k+4+round(%ebp),d ## D;\
	xor	%edi,		d ## D;

/*
 * One decryption round.
 *
 * a input register containing a
 * b input register containing b (rotated 16)
 * c input register containing c (already rol $1)
 * d input register containing d
 * round byte offset of this round's two key words inside k
 * operations on a and b are interleaved to increase performance
 *
 * a..d are register family names (R0..R3).  %ebp must hold the context
 * base.  %esi and %edi are clobbered and the incoming c is parked on the
 * stack for the duration of the macro.
 *
 * The c register collects the four table lookups for the bytes of a
 * (call it A), %esi collects the four lookups for the bytes of b (B).
 * Then:
 *   c = (A + B + k[round]) ^ incoming c
 *   d = rol(d ^ (A + 2*B + k[round + 4]), 15)
 *
 * Net rotation left on the inputs: a ror 31 (= rol 1), b ror 32 in total
 * with the rotation it arrived with (i.e. unrotated).  The registers are
 * therefore in the form the next round expects when it is invoked with
 * the pairs swapped, (c,d,a,b).
 */
#define decrypt_round(a,b,c,d,round)\
	push	c ## D;\
	movzx	a ## B,		%edi;\
	mov	s0(%ebp,%edi,4),c ## D;\
	movzx	b ## B,		%edi;\
	mov	s3(%ebp,%edi,4),%esi;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s1(%ebp,%edi,4),c ## D;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s0(%ebp,%edi,4),%esi;\
	movzx	a ## B,		%edi;\
	xor	s2(%ebp,%edi,4),c ## D;\
	movzx	b ## B,		%edi;\
	xor	s1(%ebp,%edi,4),%esi;\
	movzx	a ## H,		%edi;\
	ror	$15,		a ## D;\
	xor	s3(%ebp,%edi,4),c ## D;\
	movzx	b ## H,		%edi;\
	xor	s2(%ebp,%edi,4),%esi;\
	pop	%edi;\
	add	%esi,		c ## D;\
	add	c ## D,		%esi;\
	add	k+round(%ebp),	c ## D;\
	xor	%edi,		c ## D;\
	add	k+4+round(%ebp),%esi;\
	xor	%esi,		d ## D;\
	rol	$15,		d ## D;

/*
 * Final decryption round.
 *
 * a input register containing a
 * b input register containing b (rotated 16)
 * c input register containing c (already rol $1)
 * d input register containing d
 * round byte offset of this round's two key words inside k
 * operations on a and b are interleaved to increase performance
 * last round has different rotations for the output preparation
 *
 * Same lookups and key additions as decrypt_round; only the rotations
 * differ: a is rotated by 16 twice (left unrotated) and d is finished
 * with ror $1 instead of rol $15, so no register is left pre-rotated
 * for a following round.
 *
 * %ebp must hold the context base.  %esi and %edi are clobbered and the
 * incoming c is parked on the stack for the duration of the macro.
 */
#define decrypt_last_round(a,b,c,d,round)\
	push	c ## D;\
	movzx	a ## B,		%edi;\
	mov	s0(%ebp,%edi,4),c ## D;\
	movzx	b ## B,		%edi;\
	mov	s3(%ebp,%edi,4),%esi;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s1(%ebp,%edi,4),c ## D;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s0(%ebp,%edi,4),%esi;\
	movzx	a ## B,		%edi;\
	xor	s2(%ebp,%edi,4),c ## D;\
	movzx	b ## B,		%edi;\
	xor	s1(%ebp,%edi,4),%esi;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%ebp,%edi,4),c ## D;\
	movzx	b ## H,		%edi;\
	xor	s2(%ebp,%edi,4),%esi;\
	pop	%edi;\
	add	%esi,		c ## D;\
	add	c ## D,		%esi;\
	add	k+round(%ebp),	c ## D;\
	xor	%edi,		c ## D;\
	add	k+4+round(%ebp),%esi;\
	xor	%esi,		d ## D;\
	ror	$1,		d ## D;

.align 4
.global twofish_enc_blk
.global twofish_dec_blk
.type	twofish_enc_blk,@function

/*
 * twofish_enc_blk - encrypt one 16-byte block
 *
 * Stack arguments at entry (32-bit, return address at 0(%esp)):
 *    4(%esp)  tfm      pointer to the crypto tfm; the cipher context
 *                      lives crypto_tfm_ctx_offset bytes into it
 *    8(%esp)  out_blk  address the 16 output bytes are written to
 *   12(%esp)  in_blk   address the 16 input bytes are read from
 * NOTE(review): presumably declared in C as
 *   (struct crypto_tfm *tfm, u8 *dst, const u8 *src) - confirm.
 *
 * Returns 1 in %eax.  %ebp, %ebx, %esi and %edi are saved and restored;
 * %ecx and %edx are clobbered.
 */
twofish_enc_blk:
	push	%ebp			/* save registers according to calling convention*/
	push    %ebx
	push    %esi
	push    %edi

	/* four registers were pushed, so the arguments moved up by 16 bytes */
	mov	tfm + 16(%esp),	%ebp	/* abuse the base pointer: set new base pointer to the crypto tfm */
	add	$crypto_tfm_ctx_offset, %ebp	/* ctx address */
	mov     in_blk+16(%esp),%edi	/* input address in edi */

	mov	(%edi),		%eax
	mov	b_offset(%edi),	%ebx
	mov	c_offset(%edi),	%ecx
	mov	d_offset(%edi),	%edx
	input_whitening(%eax,%ebp,a_offset)
	ror	$16,	%eax		/* round macro expects a rotated by 16 */
	input_whitening(%ebx,%ebp,b_offset)
	input_whitening(%ecx,%ebp,c_offset)
	input_whitening(%edx,%ebp,d_offset)
	rol	$1,	%edx		/* round macro expects d already rol $1 */

	/* 16 rounds; the (a,b) and (c,d) register pairs swap roles each round */
	encrypt_round(R0,R1,R2,R3,0);
	encrypt_round(R2,R3,R0,R1,8);
	encrypt_round(R0,R1,R2,R3,2*8);
	encrypt_round(R2,R3,R0,R1,3*8);
	encrypt_round(R0,R1,R2,R3,4*8);
	encrypt_round(R2,R3,R0,R1,5*8);
	encrypt_round(R0,R1,R2,R3,6*8);
	encrypt_round(R2,R3,R0,R1,7*8);
	encrypt_round(R0,R1,R2,R3,8*8);
	encrypt_round(R2,R3,R0,R1,9*8);
	encrypt_round(R0,R1,R2,R3,10*8);
	encrypt_round(R2,R3,R0,R1,11*8);
	encrypt_round(R0,R1,R2,R3,12*8);
	encrypt_round(R2,R3,R0,R1,13*8);
	encrypt_round(R0,R1,R2,R3,14*8);
	encrypt_last_round(R2,R3,R0,R1,15*8);

	/* %eax/%ebx are whitened and stored as words c/d, %ecx/%edx as a/b */
	output_whitening(%eax,%ebp,c_offset)
	output_whitening(%ebx,%ebp,d_offset)
	output_whitening(%ecx,%ebp,a_offset)
	output_whitening(%edx,%ebp,b_offset)
	mov	out_blk+16(%esp),%edi;
	mov	%eax,		c_offset(%edi)
	mov	%ebx,		d_offset(%edi)
	mov	%ecx,		(%edi)
	mov	%edx,		b_offset(%edi)

	pop	%edi
	pop	%esi
	pop	%ebx
	pop	%ebp
	mov	$1,	%eax
	ret
.size	twofish_enc_blk,.-twofish_enc_blk

.align 4
.type	twofish_dec_blk,@function

/*
 * twofish_dec_blk - decrypt one 16-byte block
 *
 * Stack arguments at entry (32-bit, return address at 0(%esp)):
 *    4(%esp)  tfm      pointer to the crypto tfm; the cipher context
 *                      lives crypto_tfm_ctx_offset bytes into it
 *    8(%esp)  out_blk  address the 16 output bytes are written to
 *   12(%esp)  in_blk   address the 16 input bytes are read from
 * NOTE(review): presumably declared in C as
 *   (struct crypto_tfm *tfm, u8 *dst, const u8 *src) - confirm.
 *
 * Returns 1 in %eax.  %ebp, %ebx, %esi and %edi are saved and restored;
 * %ecx and %edx are clobbered.
 */
twofish_dec_blk:
	push	%ebp			/* save registers according to calling convention*/
	push    %ebx
	push    %esi
	push    %edi


	/* four registers were pushed, so the arguments moved up by 16 bytes */
	mov	tfm + 16(%esp),	%ebp	/* abuse the base pointer: set new base pointer to the crypto tfm */
	add	$crypto_tfm_ctx_offset, %ebp	/* ctx address */
	mov     in_blk+16(%esp),%edi	/* input address in edi */

	mov	(%edi),		%eax
	mov	b_offset(%edi),	%ebx
	mov	c_offset(%edi),	%ecx
	mov	d_offset(%edi),	%edx
	/* decryption starts by removing the output whitening */
	output_whitening(%eax,%ebp,a_offset)
	output_whitening(%ebx,%ebp,b_offset)
	ror	$16,	%ebx		/* round macro expects b rotated by 16 */
	output_whitening(%ecx,%ebp,c_offset)
	output_whitening(%edx,%ebp,d_offset)
	rol	$1,	%ecx		/* round macro expects c already rol $1 */

	/* 16 rounds with the key offsets in descending order */
	decrypt_round(R0,R1,R2,R3,15*8);
	decrypt_round(R2,R3,R0,R1,14*8);
	decrypt_round(R0,R1,R2,R3,13*8);
	decrypt_round(R2,R3,R0,R1,12*8);
	decrypt_round(R0,R1,R2,R3,11*8);
	decrypt_round(R2,R3,R0,R1,10*8);
	decrypt_round(R0,R1,R2,R3,9*8);
	decrypt_round(R2,R3,R0,R1,8*8);
	decrypt_round(R0,R1,R2,R3,7*8);
	decrypt_round(R2,R3,R0,R1,6*8);
	decrypt_round(R0,R1,R2,R3,5*8);
	decrypt_round(R2,R3,R0,R1,4*8);
	decrypt_round(R0,R1,R2,R3,3*8);
	decrypt_round(R2,R3,R0,R1,2*8);
	decrypt_round(R0,R1,R2,R3,1*8);
	decrypt_last_round(R2,R3,R0,R1,0);

	/* %eax/%ebx are whitened and stored as words c/d, %ecx/%edx as a/b */
	input_whitening(%eax,%ebp,c_offset)
	input_whitening(%ebx,%ebp,d_offset)
	input_whitening(%ecx,%ebp,a_offset)
	input_whitening(%edx,%ebp,b_offset)
	mov	out_blk+16(%esp),%edi;
	mov	%eax,		c_offset(%edi)
	mov	%ebx,		d_offset(%edi)
	mov	%ecx,		(%edi)
	mov	%edx,		b_offset(%edi)

	pop	%edi
	pop	%esi
	pop	%ebx
	pop	%ebp
	mov	$1,	%eax
	ret
.size	twofish_dec_blk,.-twofish_dec_blk
