#!/usr/bin/env perl

# ====================================================================
# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
#
# This module may be used under the terms of either the GNU General
# Public License version 2 or later, the GNU Lesser General Public
# License version 2.1 or later, the Mozilla Public License version
# 1.1 or the BSD License. The exact terms of either license are
# distributed along with this module. For further details see
# http://www.openssl.org/~appro/camellia/.
# ====================================================================

# Performance in cycles per processed byte (less is better) in
# 'openssl speed ...' benchmark:
#
#			AMD K8	Core2	PIII	P4
# -evp camellia-128-ecb	21.5	22.8	27.0	28.9
# + over gcc 3.4.6	+90/11%	+70/10%	+53/4%	+160/64%
# + over icc 8.0	+48/19%	+21/15%	+21/17%	+55/37%
#
# camellia-128-cbc	17.3	21.1	23.9	25.9
#
# 128-bit key setup	196	280	256	240	cycles/key
# + over gcc 3.4.6	+30/0%	+17/11%	+11/0%	+63/40%
# + over icc 8.0	+18/3%	+10/0%	+10/3%	+21/10%
#
# Pairs of numbers in "+" rows represent performance improvement over
# compiler generated position-independent code, PIC, and non-PIC
# respectively. PIC results are of greater relevance, as this module
# is position-independent, i.e. suitable for a shared library or PIE.
# Position independence "costs" one register, which is why compilers
# are so close with non-PIC results, they have an extra register to
# spare. CBC results are better than ECB ones thanks to "zero-copy"
# private _x86_* interface, and are ~30-40% better than with compiler
# generated cmll_cbc.o, and reach ~80-90% of x86_64 performance on
# same CPU (where applicable).

# Locate the perlasm helpers relative to this script's own directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# When set, the OpenSSL-flavoured entry points guarded by
# "if ($OPENSSL)" further down are generated as well.
$OPENSSL=1;

# First argument selects the assembler flavour; a trailing "386"
# argument is passed on to asm_init as a boolean flag.
&asm_init($ARGV[0],"cmll-586.pl",$ARGV[$#ARGV] eq "386");

# Register allocation shared by all routines in this file.
@T=("eax","ebx","ecx","edx");
$idx="esi";
$key="edi";
$Tbl="ebp";

# stack frame layout in _x86_Camellia_* routines, frame is allocated
# by caller
$__ra=&DWP(0,"esp");	# return address
$__s0=&DWP(4,"esp");	# s0 backing store
$__s1=&DWP(8,"esp");	# s1 backing store
$__s2=&DWP(12,"esp");	# s2 backing store
$__s3=&DWP(16,"esp");	# s3 backing store
$__end=&DWP(20,"esp");	# pointer to end/start of key schedule

# stack frame layout in Camellia_[en|de]crypt routines, which differs from
# above by 4 and overlaps by pointer to end/start of key schedule
$_end=&DWP(16,"esp");
$_esp=&DWP(20,"esp");

# const unsigned int Camellia_SBOX[4][256];
# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
# and [2][] - with [3][]. This is done to optimize code size.
# Byte offsets of the four (pairwise interleaved) lookup tables relative
# to the Camellia_SBOX label; entries are addressed with a scale of 8.
$SBOX1_1110=0;		# Camellia_SBOX[0]
$SBOX4_4404=4;		# Camellia_SBOX[1]
$SBOX2_0222=2048;	# Camellia_SBOX[2]
$SBOX3_3033=2052;	# Camellia_SBOX[3]
&static_label("Camellia_SIGMA");
&static_label("Camellia_SBOX");

# Camellia_Feistel($i[,$seed[,$frame]]) emits the code for one Feistel
# round.
#
#   $i     - round index; its parity selects which register pair of @T
#            is the round input, and it scales the round-key offset.
#   $seed  - byte offset of round key 0 relative to $key (default 0).
#            A negative seed makes the key offsets step downwards
#            (-8 per round instead of +8), as used by the decrypt loop.
#   $frame - byte offset from %esp of the s[0-3] backing store
#            (default 0).
#
# On entry $idx must hold the first word of the round key; on exit it
# holds the prefetched first word of the next round key.
sub Camellia_Feistel {
my ($i,$seed,$frame)=@_;
$seed=0  unless defined($seed);
$frame=0 unless defined($frame);
my $scale=$seed<0?-8:8;
my $j=($i&1)*2;
# All four temporaries are lexical; a chained "my $t0=..,$t1=.." would
# declare only the first one and leak the rest as package globals.
my ($t0,$t1,$t2,$t3)=map { $T[($j+$_)%4] } (0..3);

	&xor	($t0,$idx);				# t0^=key[0]
	&xor	($t1,&DWP($seed+$i*$scale+4,$key));	# t1^=key[1]
	&movz	($idx,&HB($t0));			# (t0>>8)&0xff
	&mov	($t3,&DWP($SBOX3_3033,$Tbl,$idx,8));	# t3=SBOX3_3033[0]
	&movz	($idx,&LB($t0));			# (t0>>0)&0xff
	&xor	($t3,&DWP($SBOX4_4404,$Tbl,$idx,8));	# t3^=SBOX4_4404[0]
	&shr	($t0,16);
	&movz	($idx,&LB($t1));			# (t1>>0)&0xff
	&mov	($t2,&DWP($SBOX1_1110,$Tbl,$idx,8));	# t2=SBOX1_1110[1]
	&movz	($idx,&HB($t0));			# (t0>>24)&0xff
	&xor	($t3,&DWP($SBOX1_1110,$Tbl,$idx,8));	# t3^=SBOX1_1110[0]
	&movz	($idx,&HB($t1));			# (t1>>8)&0xff
	&xor	($t2,&DWP($SBOX4_4404,$Tbl,$idx,8));	# t2^=SBOX4_4404[1]
	&shr	($t1,16);
	&movz	($t0,&LB($t0));				# (t0>>16)&0xff
	&xor	($t3,&DWP($SBOX2_0222,$Tbl,$t0,8));	# t3^=SBOX2_0222[0]
	&movz	($idx,&HB($t1));			# (t1>>24)&0xff
	&mov	($t0,&DWP($frame+4*(($j+3)%4),"esp"));	# prefetch "s3"
	&xor	($t2,$t3);				# t2^=t3
	&rotr	($t3,8);				# t3=RightRotate(t3,8)
	&xor	($t2,&DWP($SBOX2_0222,$Tbl,$idx,8));	# t2^=SBOX2_0222[1]
	&movz	($idx,&LB($t1));			# (t1>>16)&0xff
	&mov	($t1,&DWP($frame+4*(($j+2)%4),"esp"));	# prefetch "s2"
	&xor	($t3,$t0);				# t3^=s3
	&xor	($t2,&DWP($SBOX3_3033,$Tbl,$idx,8));	# t2^=SBOX3_3033[1]
	&mov	($idx,&DWP($seed+($i+1)*$scale,$key));	# prefetch key[i+1]
	&xor	($t3,$t2);				# t3^=t2
	&mov	(&DWP($frame+4*(($j+3)%4),"esp"),$t3);	# s3=t3
	&xor	($t2,$t1);				# t2^=s2
	&mov	(&DWP($frame+4*(($j+2)%4),"esp"),$t2);	# s2=t2
}

# void Camellia_EncryptBlock_Rounds(
#		int grandRounds,
#		const Byte plaintext[],
#		const KEY_TABLE_TYPE keyTable,
#		Byte ciphertext[])
&function_begin("Camellia_EncryptBlock_Rounds");
	&mov	("eax",&wparam(0));	# load grandRounds
	&mov	($idx,&wparam(1));	# load plaintext pointer
	&mov	($key,&wparam(2));	# load key schedule pointer

	&mov	("ebx","esp");
	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
	&and	("esp",-64);

	# place stack frame just "above mod 1024" the key schedule
	# this ensures that cache associativity of 2 suffices
	&lea	("ecx",&DWP(-64-63,$key));
	&sub	("ecx","esp");
	&neg	("ecx");
	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
	&sub	("esp","ecx");
	&add	("esp",4);	# 4 is reserved for callee's return address

	&shl	("eax",6);		# grandRounds*64 bytes of key schedule
	&lea	("eax",&DWP(0,$key,"eax"));
	&mov	($_esp,"ebx");	# save %esp
	&mov	($_end,"eax");	# save keyEnd

	&call	(&label("pic_point"));
	&set_label("pic_point");
	&blindpop($Tbl);
	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

	&mov	(@T[0],&DWP(0,$idx));	# load plaintext
	&mov	(@T[1],&DWP(4,$idx));
	&mov	(@T[2],&DWP(8,$idx));
	&bswap	(@T[0]);
	&mov	(@T[3],&DWP(12,$idx));
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);

	&call	("_x86_Camellia_encrypt");

	&mov	("esp",$_esp);
	&bswap	(@T[0]);
	&mov	($idx,&wparam(3));	# load ciphertext pointer
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);
	&mov	(&DWP(0,$idx),@T[0]);	# write ciphertext
	&mov	(&DWP(4,$idx),@T[1]);
	&mov	(&DWP(8,$idx),@T[2]);
	&mov	(&DWP(12,$idx),@T[3]);
&function_end("Camellia_EncryptBlock_Rounds");
# V1.x API
&function_begin_B("Camellia_EncryptBlock");
	&mov	("eax",128);
	&sub	("eax",&wparam(0));	# load keyBitLength
	&mov	("eax",3);		# mov leaves the borrow from sub intact
	&adc	("eax",0);		# keyBitLength==128?3:4
	&mov	(&wparam(0),"eax");
	&jmp	(&label("Camellia_EncryptBlock_Rounds"));
&function_end_B("Camellia_EncryptBlock");

if ($OPENSSL) {
# void Camellia_encrypt(
#		const unsigned char *in,
#		unsigned char *out,
#		const CAMELLIA_KEY *key)
&function_begin("Camellia_encrypt");
	&mov	($idx,&wparam(0));	# load plaintext pointer
	&mov	($key,&wparam(2));	# load key schedule pointer

	&mov	("ebx","esp");
	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
	&and	("esp",-64);
	&mov	("eax",&DWP(272,$key));	# load grandRounds counter

	# place stack frame just "above mod 1024" the key schedule
	# this ensures that cache associativity of 2 suffices
	&lea	("ecx",&DWP(-64-63,$key));
	&sub	("ecx","esp");
	&neg	("ecx");
	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
	&sub	("esp","ecx");
	&add	("esp",4);	# 4 is reserved for callee's return address

	&shl	("eax",6);
	&lea	("eax",&DWP(0,$key,"eax"));
	&mov	($_esp,"ebx");	# save %esp
	&mov	($_end,"eax");	# save keyEnd

	&call	(&label("pic_point"));
	&set_label("pic_point");
	&blindpop($Tbl);
	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

	&mov	(@T[0],&DWP(0,$idx));	# load plaintext
	&mov	(@T[1],&DWP(4,$idx));
	&mov	(@T[2],&DWP(8,$idx));
	&bswap	(@T[0]);
	&mov	(@T[3],&DWP(12,$idx));
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);

	&call	("_x86_Camellia_encrypt");

	&mov	("esp",$_esp);
	&bswap	(@T[0]);
	&mov	($idx,&wparam(1));	# load ciphertext pointer
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);
	&mov	(&DWP(0,$idx),@T[0]);	# write ciphertext
	&mov	(&DWP(4,$idx),@T[1]);
	&mov	(&DWP(8,$idx),@T[2]);
	&mov	(&DWP(12,$idx),@T[3]);
&function_end("Camellia_encrypt");
}

# Private encryption core: block in @T[0-3], key schedule in $key, table
# base in $Tbl, frame (s[0-3] and end-of-schedule pointer) set up by the
# caller. Result is returned in @T[0-3].
&function_begin_B("_x86_Camellia_encrypt");
	&xor	(@T[0],&DWP(0,$key));	# ^=key[0-3]
	&xor	(@T[1],&DWP(4,$key));
	&xor	(@T[2],&DWP(8,$key));
	&xor	(@T[3],&DWP(12,$key));
	&mov	($idx,&DWP(16,$key));	# prefetch key[4]

	&mov	($__s0,@T[0]);		# save s[0-3]
	&mov	($__s1,@T[1]);
	&mov	($__s2,@T[2]);
	&mov	($__s3,@T[3]);

&set_label("loop",16);
	for ($i=0;$i<6;$i++)	{ Camellia_Feistel($i,16,4); }

	&add	($key,16*4);
	&cmp	($key,$__end);
	&je	(&label("done"));

	# @T[0-1] are preloaded, $idx is preloaded with key[0]
	&and	($idx,@T[0]);
	 &mov	(@T[3],$__s3);
	&rotl	($idx,1);
	 &mov	(@T[2],@T[3]);
	&xor	(@T[1],$idx);
	 &or	(@T[2],&DWP(12,$key));
	&mov	($__s1,@T[1]);		# s1^=LeftRotate(s0&key[0],1);
	 &xor	(@T[2],$__s2);

	&mov	($idx,&DWP(4,$key));
	 &mov	($__s2,@T[2]);		# s2^=s3|key[3];
	&or	($idx,@T[1]);
	 &and	(@T[2],&DWP(8,$key));
	&xor	(@T[0],$idx);
	 &rotl	(@T[2],1);
	&mov	($__s0,@T[0]);		# s0^=s1|key[1];
	 &xor	(@T[3],@T[2]);
	&mov	($idx,&DWP(16,$key));	# prefetch key[4]
	 &mov	($__s3,@T[3]);		# s3^=LeftRotate(s2&key[2],1);
	&jmp	(&label("loop"));

&set_label("done",8);
	&mov	(@T[2],@T[0]);		# SwapHalf
	&mov	(@T[3],@T[1]);
	&mov	(@T[0],$__s2);
	&mov	(@T[1],$__s3);
	&xor	(@T[0],$idx);		# $idx is preloaded with key[0]
	&xor	(@T[1],&DWP(4,$key));
	&xor	(@T[2],&DWP(8,$key));
	&xor	(@T[3],&DWP(12,$key));
	&ret	();
&function_end_B("_x86_Camellia_encrypt");

# void Camellia_DecryptBlock_Rounds(
#		int grandRounds,
#		const Byte ciphertext[],
#		const KEY_TABLE_TYPE keyTable,
#		Byte plaintext[])
&function_begin("Camellia_DecryptBlock_Rounds");
	&mov	("eax",&wparam(0));	# load grandRounds
	&mov	($idx,&wparam(1));	# load ciphertext pointer
	&mov	($key,&wparam(2));	# load key schedule pointer

	&mov	("ebx","esp");
	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
	&and	("esp",-64);

	# place stack frame just "above mod 1024" the key schedule
	# this ensures that cache associativity of 2 suffices
	&lea	("ecx",&DWP(-64-63,$key));
	&sub	("ecx","esp");
	&neg	("ecx");
	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
	&sub	("esp","ecx");
	&add	("esp",4);	# 4 is reserved for callee's return address

	&shl	("eax",6);
	&mov	(&DWP(4*4,"esp"),$key);	# save keyStart
	&lea	($key,&DWP(0,$key,"eax"));	# decryption starts at the far end
	&mov	(&DWP(5*4,"esp"),"ebx");# save %esp

	&call	(&label("pic_point"));
	&set_label("pic_point");
	&blindpop($Tbl);
	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

	&mov	(@T[0],&DWP(0,$idx));	# load ciphertext
	&mov	(@T[1],&DWP(4,$idx));
	&mov	(@T[2],&DWP(8,$idx));
	&bswap	(@T[0]);
	&mov	(@T[3],&DWP(12,$idx));
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);

	&call	("_x86_Camellia_decrypt");

	&mov	("esp",&DWP(5*4,"esp"));
	&bswap	(@T[0]);
	&mov	($idx,&wparam(3));	# load plaintext pointer
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);
	&mov	(&DWP(0,$idx),@T[0]);	# write plaintext
	&mov	(&DWP(4,$idx),@T[1]);
	&mov	(&DWP(8,$idx),@T[2]);
	&mov	(&DWP(12,$idx),@T[3]);
&function_end("Camellia_DecryptBlock_Rounds");
# V1.x API
&function_begin_B("Camellia_DecryptBlock");
	&mov	("eax",128);
	&sub	("eax",&wparam(0));	# load keyBitLength
	&mov	("eax",3);		# mov leaves the borrow from sub intact
	&adc	("eax",0);		# keyBitLength==128?3:4
	&mov	(&wparam(0),"eax");
	&jmp	(&label("Camellia_DecryptBlock_Rounds"));
&function_end_B("Camellia_DecryptBlock");

if ($OPENSSL) {
# void Camellia_decrypt(
#		const unsigned char *in,
#		unsigned char *out,
#		const CAMELLIA_KEY *key)
&function_begin("Camellia_decrypt");
	&mov	($idx,&wparam(0));	# load ciphertext pointer
	&mov	($key,&wparam(2));	# load key schedule pointer

	&mov	("ebx","esp");
	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
	&and	("esp",-64);
	&mov	("eax",&DWP(272,$key));	# load grandRounds counter

	# place stack frame just "above mod 1024" the key schedule
	# this ensures that cache associativity of 2 suffices
	&lea	("ecx",&DWP(-64-63,$key));
	&sub	("ecx","esp");
	&neg	("ecx");
	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
	&sub	("esp","ecx");
	&add	("esp",4);	# 4 is reserved for callee's return address

	&shl	("eax",6);
	&mov	(&DWP(4*4,"esp"),$key);	# save keyStart
	&lea	($key,&DWP(0,$key,"eax"));
	&mov	(&DWP(5*4,"esp"),"ebx");# save %esp

	&call	(&label("pic_point"));
	&set_label("pic_point");
	&blindpop($Tbl);
	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

	&mov	(@T[0],&DWP(0,$idx));	# load ciphertext
	&mov	(@T[1],&DWP(4,$idx));
	&mov	(@T[2],&DWP(8,$idx));
	&bswap	(@T[0]);
	&mov	(@T[3],&DWP(12,$idx));
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);

	&call	("_x86_Camellia_decrypt");

	&mov	("esp",&DWP(5*4,"esp"));
	&bswap	(@T[0]);
	&mov	($idx,&wparam(1));	# load plaintext pointer
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);
	&mov	(&DWP(0,$idx),@T[0]);	# write plaintext
	&mov	(&DWP(4,$idx),@T[1]);
	&mov	(&DWP(8,$idx),@T[2]);
	&mov	(&DWP(12,$idx),@T[3]);
&function_end("Camellia_decrypt");
}

# Private decryption core: same register/frame contract as
# _x86_Camellia_encrypt, but $key starts at the end of the schedule and
# is walked downwards until it reaches the saved start pointer.
&function_begin_B("_x86_Camellia_decrypt");
	&xor	(@T[0],&DWP(0,$key));	# ^=key[0-3]
	&xor	(@T[1],&DWP(4,$key));
	&xor	(@T[2],&DWP(8,$key));
	&xor	(@T[3],&DWP(12,$key));
	&mov	($idx,&DWP(-8,$key));	# prefetch key[-2]

	&mov	($__s0,@T[0]);		# save s[0-3]
	&mov	($__s1,@T[1]);
	&mov	($__s2,@T[2]);
	&mov	($__s3,@T[3]);

&set_label("loop",16);
	for ($i=0;$i<6;$i++)	{ Camellia_Feistel($i,-8,4); }

	&sub	($key,16*4);
	&cmp	($key,$__end);
	&je	(&label("done"));

	# @T[0-1] are preloaded, $idx is preloaded with key[2]
	&and	($idx,@T[0]);
	 &mov	(@T[3],$__s3);
	&rotl	($idx,1);
	 &mov	(@T[2],@T[3]);
	&xor	(@T[1],$idx);
	 &or	(@T[2],&DWP(4,$key));
	&mov	($__s1,@T[1]);		# s1^=LeftRotate(s0&key[2],1);
	 &xor	(@T[2],$__s2);

	&mov	($idx,&DWP(12,$key));
	 &mov	($__s2,@T[2]);		# s2^=s3|key[1];
	&or	($idx,@T[1]);
	 &and	(@T[2],&DWP(0,$key));
	&xor	(@T[0],$idx);
	 &rotl	(@T[2],1);
	&mov	($__s0,@T[0]);		# s0^=s1|key[3];
	 &xor	(@T[3],@T[2]);
	&mov	($idx,&DWP(-8,$key));	# prefetch key[-2]
	 &mov	($__s3,@T[3]);		# s3^=LeftRotate(s2&key[0],1);
	&jmp	(&label("loop"));

&set_label("done",8);
	&mov	(@T[2],@T[0]);		# SwapHalf
	&mov	(@T[3],@T[1]);
	&mov	(@T[0],$__s2);
	&mov	(@T[1],$__s3);
	&xor	(@T[2],$idx);		# $idx is preloaded with key[2]
	&xor	(@T[3],&DWP(12,$key));
	&xor	(@T[0],&DWP(0,$key));
	&xor	(@T[1],&DWP(4,$key));
	&ret	();
&function_end_B("_x86_Camellia_decrypt");

# shld is very slow on Intel P4 family. Even on AMD it limits
# instruction decode rate [because it's VectorPath] and consequently
# performance. PIII, PM and Core[2] seem to be the only ones which
# execute this code ~7% faster...
#
# Same calling convention as _rotl128 below; uses $idx as scratch.
# NOTE(review): no caller of this variant is visible in this part of the
# file - it appears to be kept for reference only.
sub __rotl128 {
  my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;

    $rnd *= 2;
    if ($rot) {
	&mov	($idx,$i0);
	&shld	($i0,$i1,$rot);
	&shld	($i1,$i2,$rot);
	&shld	($i2,$i3,$rot);
	&shld	($i3,$idx,$rot);
    }
    &mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i0 eq @T[0]);
    &mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i1 eq @T[0]);
    &mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i2 eq @T[0]);
    &mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i3 eq @T[0]);
}

# ... Implementing 128-bit rotate without shld gives >3x performance
# improvement on P4, only ~7% degradation on other Intel CPUs and
# not worse performance on AMD. This is therefore preferred.
#
# _rotl128($i0,$i1,$i2,$i3,$rot,$rnd,@store)
#   $i0..$i3 - registers holding the 128-bit value, most significant
#              word first; rotated left in place by $rot bits
#   $rot     - rotate count (0 skips the rotate and only stores)
#   $rnd     - index of the 8-byte key-schedule slot; words are written
#              consecutively starting at -128+8*$rnd from $key
#   @store   - registers to store, in order; a register is stored only
#              when it is next in this list, so passing a subset
#              stores just that part of the rotated value
# $idx and $Tbl are used as scratch when $rot is non-zero.
sub _rotl128 {
  my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;

    $rnd *= 2;
    if ($rot) {
	&mov	($Tbl,$i0);
	&shl	($i0,$rot);
	&mov	($idx,$i1);
	&shr	($idx,32-$rot);
	&shl	($i1,$rot);
	&or	($i0,$idx);
	&mov	($idx,$i2);
	&shl	($i2,$rot);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i0 eq @T[0]);
	&shr	($idx,32-$rot);
	&or	($i1,$idx);
	&shr	($Tbl,32-$rot);
	&mov	($idx,$i3);
	&shr	($idx,32-$rot);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i1 eq @T[0]);
	&shl	($i3,$rot);
	&or	($i2,$idx);
	&or	($i3,$Tbl);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i2 eq @T[0]);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i3 eq @T[0]);
    } else {
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i0 eq @T[0]);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i1 eq @T[0]);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i2 eq @T[0]);
	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i3 eq @T[0]);
    }
}

# _saveround($rnd,$key[,$bias],@regs) stores up to four registers to
# consecutive words starting at $bias+8*$rnd from $key. The optional
# $bias is recognised by being a non-zero number in the first list
# position (register names evaluate to 0 under int()).
sub _saveround {
my ($rnd,$key,@T)=@_;
my $bias=int(@T[0])?shift(@T):0;

	&mov	(&DWP($bias+$rnd*8+0,$key),@T[0]);
	&mov	(&DWP($bias+$rnd*8+4,$key),@T[1])	if ($#T>=1);
	&mov	(&DWP($bias+$rnd*8+8,$key),@T[2])	if ($#T>=2);
	&mov	(&DWP($bias+$rnd*8+12,$key),@T[3])	if ($#T>=3);
}

# _loadround($rnd,$key[,$bias],@regs) loads up to four registers from
# consecutive words starting at $bias+8*$rnd from $key. The optional
# $bias is recognised by being a non-zero number in the first list
# position (register names evaluate to 0 under int()).
sub _loadround {
my ($rnd,$key,@T)=@_;
my $bias=int(@T[0])?shift(@T):0;

	&mov	(@T[0],&DWP($bias+$rnd*8+0,$key));
	&mov	(@T[1],&DWP($bias+$rnd*8+4,$key))	if ($#T>=1);
	&mov	(@T[2],&DWP($bias+$rnd*8+8,$key))	if ($#T>=2);
	&mov	(@T[3],&DWP($bias+$rnd*8+12,$key))	if ($#T>=3);
}

# void Camellia_Ekeygen(
#		const int keyBitLength,
#		const Byte *rawKey,
#		KEY_TABLE_TYPE keyTable)
#
# Besides filling keyTable, the generated routine leaves the number of
# grand rounds (3 or 4) in eax and a pointer to keyTable+272 in edx.
&function_begin("Camellia_Ekeygen");
{ my $step=0;

	&stack_push(4);				# place for s[0-3]

	&mov	($Tbl,&wparam(0));		# load arguments
	&mov	($idx,&wparam(1));
	&mov	($key,&wparam(2));

	&mov	(@T[0],&DWP(0,$idx));		# load 0-127 bits
	&mov	(@T[1],&DWP(4,$idx));
	&mov	(@T[2],&DWP(8,$idx));
	&mov	(@T[3],&DWP(12,$idx));

	&bswap	(@T[0]);
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);

	&_saveround	(0,$key,@T);		# KL<<<0

	&cmp	($Tbl,128);
	&je	(&label("1st128"));

	&mov	(@T[0],&DWP(16,$idx));		# load 128-191 bits
	&mov	(@T[1],&DWP(20,$idx));
	&cmp	($Tbl,192);
	&je	(&label("1st192"));
	&mov	(@T[2],&DWP(24,$idx));		# load 192-255 bits
	&mov	(@T[3],&DWP(28,$idx));
	&jmp	(&label("1st256"));
&set_label("1st192",4);
	&mov	(@T[2],@T[0]);			# 192-bit key: low half of KR
	&mov	(@T[3],@T[1]);			# is the complement of the high half
	&not	(@T[2]);
	&not	(@T[3]);
&set_label("1st256",4);
	&bswap	(@T[0]);
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);

	&_saveround	(4,$key,@T);		# temporary storage for KR!

	&xor	(@T[0],&DWP(0*8+0,$key));	# KR^KL
	&xor	(@T[1],&DWP(0*8+4,$key));
	&xor	(@T[2],&DWP(1*8+0,$key));
	&xor	(@T[3],&DWP(1*8+4,$key));

&set_label("1st128",4);
	&call	(&label("pic_point"));
	&set_label("pic_point");
	&blindpop($Tbl);
	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
	&lea	($key,&DWP(&label("Camellia_SIGMA")."-".&label("Camellia_SBOX"),$Tbl));

	&mov	($idx,&DWP($step*8,$key));	# prefetch SIGMA[0]
	&mov	(&swtmp(0),@T[0]);		# save s[0-3]
	&mov	(&swtmp(1),@T[1]);
	&mov	(&swtmp(2),@T[2]);
	&mov	(&swtmp(3),@T[3]);
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);
	&mov	(@T[2],&swtmp(2));
	&mov	(@T[3],&swtmp(3));

	&mov	($idx,&wparam(2));
	&xor	(@T[0],&DWP(0*8+0,$idx));	# ^KL
	&xor	(@T[1],&DWP(0*8+4,$idx));
	&xor	(@T[2],&DWP(1*8+0,$idx));
	&xor	(@T[3],&DWP(1*8+4,$idx));

	&mov	($idx,&DWP($step*8,$key));	# prefetch SIGMA[4]
	&mov	(&swtmp(0),@T[0]);		# save s[0-3]
	&mov	(&swtmp(1),@T[1]);
	&mov	(&swtmp(2),@T[2]);
	&mov	(&swtmp(3),@T[3]);
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);
	&mov	(@T[2],&swtmp(2));
	&mov	(@T[3],&swtmp(3));

	&mov	($idx,&wparam(0));
	&cmp	($idx,128);
	&jne	(&label("2nd256"));

	&mov	($key,&wparam(2));
	&lea	($key,&DWP(128,$key));		# size optimization

	####### process KA
	&_saveround	(2,$key,-128,@T);	# KA<<<0
	&_rotl128	(@T,15,6,@T);		# KA<<<15
	&_rotl128	(@T,15,8,@T);		# KA<<<(15+15=30)
	&_rotl128	(@T,15,12,@T[0],@T[1]);	# KA<<<(30+15=45)
	&_rotl128	(@T,15,14,@T);		# KA<<<(45+15=60)
	push		(@T,shift(@T));		# rotl128(@T,32);
	&_rotl128	(@T,2,20,@T);		# KA<<<(60+32+2=94)
	&_rotl128	(@T,17,24,@T);		# KA<<<(94+17=111)

	####### process KL
	&_loadround	(0,$key,-128,@T);	# load KL
	&_rotl128	(@T,15,4,@T);		# KL<<<15
	&_rotl128	(@T,30,10,@T);		# KL<<<(15+30=45)
	&_rotl128	(@T,15,13,@T[2],@T[3]);	# KL<<<(45+15=60)
	&_rotl128	(@T,17,16,@T);		# KL<<<(60+17=77)
	&_rotl128	(@T,17,18,@T);		# KL<<<(77+17=94)
	&_rotl128	(@T,17,22,@T);		# KL<<<(94+17=111)

	while (@T[0] ne "eax")			# restore order
	{   unshift	(@T,pop(@T));   }

	&mov	("eax",3);			# 3 grandRounds
	&jmp	(&label("done"));

&set_label("2nd256",16);
	&mov	($idx,&wparam(2));
	&_saveround	(6,$idx,@T);		# temporary storage for KA!

	&xor	(@T[0],&DWP(4*8+0,$idx));	# KA^KR
	&xor	(@T[1],&DWP(4*8+4,$idx));
	&xor	(@T[2],&DWP(5*8+0,$idx));
	&xor	(@T[3],&DWP(5*8+4,$idx));

	&mov	($idx,&DWP($step*8,$key));	# prefetch SIGMA[8]
	&mov	(&swtmp(0),@T[0]);		# save s[0-3]
	&mov	(&swtmp(1),@T[1]);
	&mov	(&swtmp(2),@T[2]);
	&mov	(&swtmp(3),@T[3]);
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);
	&mov	(@T[2],&swtmp(2));
	&mov	(@T[3],&swtmp(3));

	&mov	($key,&wparam(2));
	&lea	($key,&DWP(128,$key));		# size optimization

	####### process KB
	&_saveround	(2,$key,-128,@T);	# KB<<<0
	&_rotl128	(@T,30,10,@T);		# KB<<<30
	&_rotl128	(@T,30,20,@T);		# KB<<<(30+30=60)
	push		(@T,shift(@T));		# rotl128(@T,32);
	&_rotl128	(@T,19,32,@T);		# KB<<<(60+32+19=111)

	####### process KR
	&_loadround	(4,$key,-128,@T);	# load KR
	&_rotl128	(@T,15,4,@T);		# KR<<<15
	&_rotl128	(@T,15,8,@T);		# KR<<<(15+15=30)
	&_rotl128	(@T,30,18,@T);		# KR<<<(30+30=60)
	push		(@T,shift(@T));		# rotl128(@T,32);
	&_rotl128	(@T,2,26,@T);		# KR<<<(60+32+2=94)

	####### process KA
	&_loadround	(6,$key,-128,@T);	# load KA
	&_rotl128	(@T,15,6,@T);		# KA<<<15
	&_rotl128	(@T,30,14,@T);		# KA<<<(15+30=45)
	push		(@T,shift(@T));		# rotl128(@T,32);
	&_rotl128	(@T,0,24,@T);		# KA<<<(45+32+0=77)
	&_rotl128	(@T,17,28,@T);		# KA<<<(77+17=94)

	####### process KL
	&_loadround	(0,$key,-128,@T);	# load KL
	push		(@T,shift(@T));		# rotl128(@T,32);
	&_rotl128	(@T,13,12,@T);		# KL<<<(32+13=45)
	&_rotl128	(@T,15,16,@T);		# KL<<<(45+15=60)
	&_rotl128	(@T,17,22,@T);		# KL<<<(60+17=77)
	push		(@T,shift(@T));		# rotl128(@T,32);
	&_rotl128	(@T,2,30,@T);		# KL<<<(77+32+2=111)

	while (@T[0] ne "eax")			# restore order
	{   unshift	(@T,pop(@T));   }

	&mov	("eax",4);			# 4 grandRounds
&set_label("done");
	&lea	("edx",&DWP(272-128,$key));	# end of key schedule
	&stack_pop(4);
}
&function_end("Camellia_Ekeygen");

if ($OPENSSL) {
# int private_Camellia_set_key (
#		const unsigned char *userKey,
#		int bits,
#		CAMELLIA_KEY *key)
#
# Returns -1 for a NULL userKey or key, -2 for an unsupported key
# length, 0 on success.
&function_begin_B("private_Camellia_set_key");
	&push	("ebx");
	&mov	("ecx",&wparam(0));	# pull arguments
	&mov	("ebx",&wparam(1));
	&mov	("edx",&wparam(2));

	&mov	("eax",-1);
	&test	("ecx","ecx");
	&jz	(&label("done"));	# userKey==NULL?
	&test	("edx","edx");
	&jz	(&label("done"));	# key==NULL?

	&mov	("eax",-2);
	&cmp	("ebx",256);
	&je	(&label("arg_ok"));	# bits==256?
	&cmp	("ebx",192);
	&je	(&label("arg_ok"));	# bits==192?
	&cmp	("ebx",128);
	&jne	(&label("done"));	# bits!=128?
&set_label("arg_ok",4);

	&push	("edx");		# push arguments
	&push	("ecx");
	&push	("ebx");
	&call	("Camellia_Ekeygen");
	&stack_pop(3);

	# eax holds grandRounds and edx points at where to put it
	&mov	(&DWP(0,"edx"),"eax");
	&xor	("eax","eax");
&set_label("done",4);
	&pop	("ebx");
	&ret	();
&function_end_B("private_Camellia_set_key");
}

# Base 8-bit substitution table from which the four 32-bit lookup
# tables below are derived at generation time.
@SBOX=(
112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
 16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);

# Each helper maps a byte index to one 32-bit table entry. The digits in
# the name give, per output byte (most significant first), which variant
# of the S-box value is placed there; 0 means the byte is zero.
# S1110: s=SBOX[i] in the top three bytes.
sub S1110 { my $i=shift; $i=$SBOX[$i]; return $i<<24|$i<<16|$i<<8; }
# S4404: s=SBOX[i rotated left by 1] in bytes 3, 2 and 0.
sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=$SBOX[$i]; return $i<<24|$i<<16|$i; }
# S0222: s=SBOX[i] rotated left by 1, in the low three bytes.
sub S0222 { my $i=shift; $i=$SBOX[$i]; $i=($i<<1|$i>>7)&0xff; return $i<<16|$i<<8|$i; }
# S3033: s=SBOX[i] rotated right by 1, in bytes 3, 1 and 0.
sub S3033 { my $i=shift; $i=$SBOX[$i]; $i=($i>>1|$i<<7)&0xff; return $i<<24|$i<<8|$i; }

&set_label("Camellia_SIGMA",64);
&data_word(
    0xa09e667f, 0x3bcc908b, 0xb67ae858, 0x4caa73b2,
    0xc6ef372f, 0xe94f82be, 0x54ff53a5, 0xf1d36f1c,
    0x10e527fa, 0xde682d1d, 0xb05688c2, 0xb3e6c1fd,
    0,          0,          0,          0);
&set_label("Camellia_SBOX",64);
# tables are interleaved, remember?
for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); }
for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); }

# void Camellia_cbc_encrypt (const void char *inp, unsigned char *out,
#			size_t length, const CAMELLIA_KEY *key,
#			unsigned char *ivp,const int enc);
{
# stack frame layout
#             -4(%esp)		# return address	 0(%esp)
#              0(%esp)		# s0			 4(%esp)
#              4(%esp)		# s1			 8(%esp)
#              8(%esp)		# s2			12(%esp)
#             12(%esp)		# s3			16(%esp)
#             16(%esp)		# end of key schedule	20(%esp)
#             20(%esp)		# %esp backup
my $_inp=&DWP(24,"esp");	#copy of wparam(0)
my $_out=&DWP(28,"esp");	#copy of wparam(1)
my $_len=&DWP(32,"esp");	#copy of wparam(2)
my $_key=&DWP(36,"esp");	#copy of wparam(3)
my $_ivp=&DWP(40,"esp");	#copy of wparam(4)
my $ivec=&DWP(44,"esp");	#ivec[16]
my $_tmp=&DWP(44,"esp");	#volatile variable [yes, aliases with ivec]
my ($s0,$s1,$s2,$s3) = @T;

&function_begin("Camellia_cbc_encrypt");
	&mov	($s2 eq "ecx"?
$s2 : "",&wparam(2)); # load len 823238384Sjkim &cmp ($s2,0); 824238384Sjkim &je (&label("enc_out")); 825238384Sjkim 826238384Sjkim &pushf (); 827238384Sjkim &cld (); 828238384Sjkim 829238384Sjkim &mov ($s0,&wparam(0)); # load inp 830238384Sjkim &mov ($s1,&wparam(1)); # load out 831238384Sjkim #&mov ($s2,&wparam(2)); # load len 832238384Sjkim &mov ($s3,&wparam(3)); # load key 833238384Sjkim &mov ($Tbl,&wparam(4)); # load ivp 834238384Sjkim 835238384Sjkim # allocate aligned stack frame... 836238384Sjkim &lea ($idx,&DWP(-64,"esp")); 837238384Sjkim &and ($idx,-64); 838238384Sjkim 839238384Sjkim # place stack frame just "above mod 1024" the key schedule 840238384Sjkim # this ensures that cache associativity of 2 suffices 841238384Sjkim &lea ($key,&DWP(-64-63,$s3)); 842238384Sjkim &sub ($key,$idx); 843238384Sjkim &neg ($key); 844238384Sjkim &and ($key,0x3C0); # modulo 1024, but aligned to cache-line 845238384Sjkim &sub ($idx,$key); 846238384Sjkim 847238384Sjkim &mov ($key,&wparam(5)); # load enc 848238384Sjkim 849238384Sjkim &exch ("esp",$idx); 850238384Sjkim &add ("esp",4); # reserve for return address! 851238384Sjkim &mov ($_esp,$idx); # save %esp 852238384Sjkim 853238384Sjkim &mov ($_inp,$s0); # save copy of inp 854238384Sjkim &mov ($_out,$s1); # save copy of out 855238384Sjkim &mov ($_len,$s2); # save copy of len 856238384Sjkim &mov ($_key,$s3); # save copy of key 857238384Sjkim &mov ($_ivp,$Tbl); # save copy of ivp 858238384Sjkim 859238384Sjkim &call (&label("pic_point")); # make it PIC! 
860238384Sjkim &set_label("pic_point"); 861238384Sjkim &blindpop($Tbl); 862238384Sjkim &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl)); 863238384Sjkim 864238384Sjkim &mov ($idx,32); 865238384Sjkim &set_label("prefetch_sbox",4); 866238384Sjkim &mov ($s0,&DWP(0,$Tbl)); 867238384Sjkim &mov ($s1,&DWP(32,$Tbl)); 868238384Sjkim &mov ($s2,&DWP(64,$Tbl)); 869238384Sjkim &mov ($s3,&DWP(96,$Tbl)); 870238384Sjkim &lea ($Tbl,&DWP(128,$Tbl)); 871238384Sjkim &dec ($idx); 872238384Sjkim &jnz (&label("prefetch_sbox")); 873238384Sjkim &mov ($s0,$_key); 874238384Sjkim &sub ($Tbl,4096); 875238384Sjkim &mov ($idx,$_inp); 876238384Sjkim &mov ($s3,&DWP(272,$s0)); # load grandRounds 877238384Sjkim 878238384Sjkim &cmp ($key,0); 879238384Sjkim &je (&label("DECRYPT")); 880238384Sjkim 881238384Sjkim &mov ($s2,$_len); 882238384Sjkim &mov ($key,$_ivp); 883238384Sjkim &shl ($s3,6); 884238384Sjkim &lea ($s3,&DWP(0,$s0,$s3)); 885238384Sjkim &mov ($_end,$s3); 886238384Sjkim 887238384Sjkim &test ($s2,0xFFFFFFF0); 888238384Sjkim &jz (&label("enc_tail")); # short input... 
889238384Sjkim 890238384Sjkim &mov ($s0,&DWP(0,$key)); # load iv 891238384Sjkim &mov ($s1,&DWP(4,$key)); 892238384Sjkim 893238384Sjkim &set_label("enc_loop",4); 894238384Sjkim &mov ($s2,&DWP(8,$key)); 895238384Sjkim &mov ($s3,&DWP(12,$key)); 896238384Sjkim 897238384Sjkim &xor ($s0,&DWP(0,$idx)); # xor input data 898238384Sjkim &xor ($s1,&DWP(4,$idx)); 899238384Sjkim &xor ($s2,&DWP(8,$idx)); 900238384Sjkim &bswap ($s0); 901238384Sjkim &xor ($s3,&DWP(12,$idx)); 902238384Sjkim &bswap ($s1); 903238384Sjkim &mov ($key,$_key); # load key 904238384Sjkim &bswap ($s2); 905238384Sjkim &bswap ($s3); 906238384Sjkim 907238384Sjkim &call ("_x86_Camellia_encrypt"); 908238384Sjkim 909238384Sjkim &mov ($idx,$_inp); # load inp 910238384Sjkim &mov ($key,$_out); # load out 911238384Sjkim 912238384Sjkim &bswap ($s0); 913238384Sjkim &bswap ($s1); 914238384Sjkim &bswap ($s2); 915238384Sjkim &mov (&DWP(0,$key),$s0); # save output data 916238384Sjkim &bswap ($s3); 917238384Sjkim &mov (&DWP(4,$key),$s1); 918238384Sjkim &mov (&DWP(8,$key),$s2); 919238384Sjkim &mov (&DWP(12,$key),$s3); 920238384Sjkim 921238384Sjkim &mov ($s2,$_len); # load len 922238384Sjkim 923238384Sjkim &lea ($idx,&DWP(16,$idx)); 924238384Sjkim &mov ($_inp,$idx); # save inp 925238384Sjkim 926238384Sjkim &lea ($s3,&DWP(16,$key)); 927238384Sjkim &mov ($_out,$s3); # save out 928238384Sjkim 929238384Sjkim &sub ($s2,16); 930238384Sjkim &test ($s2,0xFFFFFFF0); 931238384Sjkim &mov ($_len,$s2); # save len 932238384Sjkim &jnz (&label("enc_loop")); 933238384Sjkim &test ($s2,15); 934238384Sjkim &jnz (&label("enc_tail")); 935238384Sjkim &mov ($idx,$_ivp); # load ivp 936238384Sjkim &mov ($s2,&DWP(8,$key)); # restore last dwords 937238384Sjkim &mov ($s3,&DWP(12,$key)); 938238384Sjkim &mov (&DWP(0,$idx),$s0); # save ivec 939238384Sjkim &mov (&DWP(4,$idx),$s1); 940238384Sjkim &mov (&DWP(8,$idx),$s2); 941238384Sjkim &mov (&DWP(12,$idx),$s3); 942238384Sjkim 943238384Sjkim &mov ("esp",$_esp); 944238384Sjkim &popf (); 945238384Sjkim 
&set_label("enc_out"); 946238384Sjkim &function_end_A(); 947238384Sjkim &pushf (); # kludge, never executed 948238384Sjkim 949238384Sjkim &set_label("enc_tail",4); 950238384Sjkim &mov ($s0,$key eq "edi" ? $key : ""); 951238384Sjkim &mov ($key,$_out); # load out 952238384Sjkim &push ($s0); # push ivp 953238384Sjkim &mov ($s1,16); 954238384Sjkim &sub ($s1,$s2); 955238384Sjkim &cmp ($key,$idx); # compare with inp 956238384Sjkim &je (&label("enc_in_place")); 957238384Sjkim &align (4); 958238384Sjkim &data_word(0xA4F3F689); # rep movsb # copy input 959238384Sjkim &jmp (&label("enc_skip_in_place")); 960238384Sjkim &set_label("enc_in_place"); 961238384Sjkim &lea ($key,&DWP(0,$key,$s2)); 962238384Sjkim &set_label("enc_skip_in_place"); 963238384Sjkim &mov ($s2,$s1); 964238384Sjkim &xor ($s0,$s0); 965238384Sjkim &align (4); 966238384Sjkim &data_word(0xAAF3F689); # rep stosb # zero tail 967238384Sjkim &pop ($key); # pop ivp 968238384Sjkim 969238384Sjkim &mov ($idx,$_out); # output as input 970238384Sjkim &mov ($s0,&DWP(0,$key)); 971238384Sjkim &mov ($s1,&DWP(4,$key)); 972238384Sjkim &mov ($_len,16); # len=16 973238384Sjkim &jmp (&label("enc_loop")); # one more spin... 974238384Sjkim 975238384Sjkim#----------------------------- DECRYPT -----------------------------# 976238384Sjkim&set_label("DECRYPT",16); 977238384Sjkim &shl ($s3,6); 978238384Sjkim &lea ($s3,&DWP(0,$s0,$s3)); 979238384Sjkim &mov ($_end,$s0); 980238384Sjkim &mov ($_key,$s3); 981238384Sjkim 982238384Sjkim &cmp ($idx,$_out); 983238384Sjkim &je (&label("dec_in_place")); # in-place processing... 
984238384Sjkim 985238384Sjkim &mov ($key,$_ivp); # load ivp 986238384Sjkim &mov ($_tmp,$key); 987238384Sjkim 988238384Sjkim &set_label("dec_loop",4); 989238384Sjkim &mov ($s0,&DWP(0,$idx)); # read input 990238384Sjkim &mov ($s1,&DWP(4,$idx)); 991238384Sjkim &mov ($s2,&DWP(8,$idx)); 992238384Sjkim &bswap ($s0); 993238384Sjkim &mov ($s3,&DWP(12,$idx)); 994238384Sjkim &bswap ($s1); 995238384Sjkim &mov ($key,$_key); # load key 996238384Sjkim &bswap ($s2); 997238384Sjkim &bswap ($s3); 998238384Sjkim 999238384Sjkim &call ("_x86_Camellia_decrypt"); 1000238384Sjkim 1001238384Sjkim &mov ($key,$_tmp); # load ivp 1002238384Sjkim &mov ($idx,$_len); # load len 1003238384Sjkim 1004238384Sjkim &bswap ($s0); 1005238384Sjkim &bswap ($s1); 1006238384Sjkim &bswap ($s2); 1007238384Sjkim &xor ($s0,&DWP(0,$key)); # xor iv 1008238384Sjkim &bswap ($s3); 1009238384Sjkim &xor ($s1,&DWP(4,$key)); 1010238384Sjkim &xor ($s2,&DWP(8,$key)); 1011238384Sjkim &xor ($s3,&DWP(12,$key)); 1012238384Sjkim 1013238384Sjkim &sub ($idx,16); 1014238384Sjkim &jc (&label("dec_partial")); 1015238384Sjkim &mov ($_len,$idx); # save len 1016238384Sjkim &mov ($idx,$_inp); # load inp 1017238384Sjkim &mov ($key,$_out); # load out 1018238384Sjkim 1019238384Sjkim &mov (&DWP(0,$key),$s0); # write output 1020238384Sjkim &mov (&DWP(4,$key),$s1); 1021238384Sjkim &mov (&DWP(8,$key),$s2); 1022238384Sjkim &mov (&DWP(12,$key),$s3); 1023238384Sjkim 1024238384Sjkim &mov ($_tmp,$idx); # save ivp 1025238384Sjkim &lea ($idx,&DWP(16,$idx)); 1026238384Sjkim &mov ($_inp,$idx); # save inp 1027238384Sjkim 1028238384Sjkim &lea ($key,&DWP(16,$key)); 1029238384Sjkim &mov ($_out,$key); # save out 1030238384Sjkim 1031238384Sjkim &jnz (&label("dec_loop")); 1032238384Sjkim &mov ($key,$_tmp); # load temp ivp 1033238384Sjkim &set_label("dec_end"); 1034238384Sjkim &mov ($idx,$_ivp); # load user ivp 1035238384Sjkim &mov ($s0,&DWP(0,$key)); # load iv 1036238384Sjkim &mov ($s1,&DWP(4,$key)); 1037238384Sjkim &mov ($s2,&DWP(8,$key)); 1038238384Sjkim 
&mov ($s3,&DWP(12,$key)); 1039238384Sjkim &mov (&DWP(0,$idx),$s0); # copy back to user 1040238384Sjkim &mov (&DWP(4,$idx),$s1); 1041238384Sjkim &mov (&DWP(8,$idx),$s2); 1042238384Sjkim &mov (&DWP(12,$idx),$s3); 1043238384Sjkim &jmp (&label("dec_out")); 1044238384Sjkim 1045238384Sjkim &set_label("dec_partial",4); 1046238384Sjkim &lea ($key,$ivec); 1047238384Sjkim &mov (&DWP(0,$key),$s0); # dump output to stack 1048238384Sjkim &mov (&DWP(4,$key),$s1); 1049238384Sjkim &mov (&DWP(8,$key),$s2); 1050238384Sjkim &mov (&DWP(12,$key),$s3); 1051238384Sjkim &lea ($s2 eq "ecx" ? $s2 : "",&DWP(16,$idx)); 1052238384Sjkim &mov ($idx eq "esi" ? $idx : "",$key); 1053238384Sjkim &mov ($key eq "edi" ? $key : "",$_out); # load out 1054238384Sjkim &data_word(0xA4F3F689); # rep movsb # copy output 1055238384Sjkim &mov ($key,$_inp); # use inp as temp ivp 1056238384Sjkim &jmp (&label("dec_end")); 1057238384Sjkim 1058238384Sjkim &set_label("dec_in_place",4); 1059238384Sjkim &set_label("dec_in_place_loop"); 1060238384Sjkim &lea ($key,$ivec); 1061238384Sjkim &mov ($s0,&DWP(0,$idx)); # read input 1062238384Sjkim &mov ($s1,&DWP(4,$idx)); 1063238384Sjkim &mov ($s2,&DWP(8,$idx)); 1064238384Sjkim &mov ($s3,&DWP(12,$idx)); 1065238384Sjkim 1066238384Sjkim &mov (&DWP(0,$key),$s0); # copy to temp 1067238384Sjkim &mov (&DWP(4,$key),$s1); 1068238384Sjkim &mov (&DWP(8,$key),$s2); 1069238384Sjkim &bswap ($s0); 1070238384Sjkim &mov (&DWP(12,$key),$s3); 1071238384Sjkim &bswap ($s1); 1072238384Sjkim &mov ($key,$_key); # load key 1073238384Sjkim &bswap ($s2); 1074238384Sjkim &bswap ($s3); 1075238384Sjkim 1076238384Sjkim &call ("_x86_Camellia_decrypt"); 1077238384Sjkim 1078238384Sjkim &mov ($key,$_ivp); # load ivp 1079238384Sjkim &mov ($idx,$_out); # load out 1080238384Sjkim 1081238384Sjkim &bswap ($s0); 1082238384Sjkim &bswap ($s1); 1083238384Sjkim &bswap ($s2); 1084238384Sjkim &xor ($s0,&DWP(0,$key)); # xor iv 1085238384Sjkim &bswap ($s3); 1086238384Sjkim &xor ($s1,&DWP(4,$key)); 1087238384Sjkim &xor 
($s2,&DWP(8,$key)); 1088238384Sjkim &xor ($s3,&DWP(12,$key)); 1089238384Sjkim 1090238384Sjkim &mov (&DWP(0,$idx),$s0); # write output 1091238384Sjkim &mov (&DWP(4,$idx),$s1); 1092238384Sjkim &mov (&DWP(8,$idx),$s2); 1093238384Sjkim &mov (&DWP(12,$idx),$s3); 1094238384Sjkim 1095238384Sjkim &lea ($idx,&DWP(16,$idx)); 1096238384Sjkim &mov ($_out,$idx); # save out 1097238384Sjkim 1098238384Sjkim &lea ($idx,$ivec); 1099238384Sjkim &mov ($s0,&DWP(0,$idx)); # read temp 1100238384Sjkim &mov ($s1,&DWP(4,$idx)); 1101238384Sjkim &mov ($s2,&DWP(8,$idx)); 1102238384Sjkim &mov ($s3,&DWP(12,$idx)); 1103238384Sjkim 1104238384Sjkim &mov (&DWP(0,$key),$s0); # copy iv 1105238384Sjkim &mov (&DWP(4,$key),$s1); 1106238384Sjkim &mov (&DWP(8,$key),$s2); 1107238384Sjkim &mov (&DWP(12,$key),$s3); 1108238384Sjkim 1109238384Sjkim &mov ($idx,$_inp); # load inp 1110238384Sjkim 1111238384Sjkim &lea ($idx,&DWP(16,$idx)); 1112238384Sjkim &mov ($_inp,$idx); # save inp 1113238384Sjkim 1114238384Sjkim &mov ($s2,$_len); # load len 1115238384Sjkim &sub ($s2,16); 1116238384Sjkim &jc (&label("dec_in_place_partial")); 1117238384Sjkim &mov ($_len,$s2); # save len 1118238384Sjkim &jnz (&label("dec_in_place_loop")); 1119238384Sjkim &jmp (&label("dec_out")); 1120238384Sjkim 1121238384Sjkim &set_label("dec_in_place_partial",4); 1122238384Sjkim # one can argue if this is actually required... 1123238384Sjkim &mov ($key eq "edi" ? $key : "",$_out); 1124238384Sjkim &lea ($idx eq "esi" ? $idx : "",$ivec); 1125238384Sjkim &lea ($key,&DWP(0,$key,$s2)); 1126238384Sjkim &lea ($idx,&DWP(16,$idx,$s2)); 1127238384Sjkim &neg ($s2 eq "ecx" ? 
$s2 : ""); 1128238384Sjkim &data_word(0xA4F3F689); # rep movsb # restore tail 1129238384Sjkim 1130238384Sjkim &set_label("dec_out",4); 1131238384Sjkim &mov ("esp",$_esp); 1132238384Sjkim &popf (); 1133238384Sjkim&function_end("Camellia_cbc_encrypt"); 1134238384Sjkim} 1135238384Sjkim 1136238384Sjkim&asciz("Camellia for x86 by <appro\@openssl.org>"); 1137238384Sjkim 1138238384Sjkim&asm_finish(); 1139