/*
 * Copyright (c) 2000-2001 Apple Computer, Inc. All Rights Reserved.
 *
 * The contents of this file constitute Original Code as defined in and are
 * subject to the Apple Public Source License Version 1.2 (the 'License').
 * You may not use this file except in compliance with the License. Please obtain
 * a copy of the License at http://www.apple.com/publicsource and read it before
 * using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS
 * OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, INCLUDING WITHOUT
 * LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
 * PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. Please see the License for the
 * specific language governing rights and limitations under the License.
 */


/* rijndael-alg-ref.c   v2.0   August '99
 * Reference ANSI C code
 * authors: Paulo Barreto
 *          Vincent Rijmen
 *
 * PPC and 128-bit block optimization by Doug Mitchell May 2001.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "rijndael-alg-ref.h"
#include <cspdebugging.h>

/* SC selects the row of the shifts[][][] table from the block length:
 * BC=4 -> 0, BC=6 -> 1, BC=8 -> 2 */
#define SC	((BC - 4) >> 1)

#include "boxes-ref.h"

/*
 * ShiftRow rotation amounts in bytes, indexed as shifts[SC][row][d]:
 * SC selects the block size (see SC macro above), row is the state row
 * (row 0 is never shifted), and d is the direction (0 = encrypt,
 * 1 = decrypt; the d=1 entries are the complements of the d=0 entries).
 */
static const word8 shifts[3][4][2] = {
	{ { 0, 0 },
	  { 1, 3 },
	  { 2, 2 },
	  { 3, 1 }
	},
	{ { 0, 0 },
	  { 1, 5 },
	  { 2, 4 },
	  { 3, 3 }
	},
	{ { 0, 0 },
	  { 1, 7 },
	  { 3, 5 },
	  { 4, 4 }
	}
};

#if !GLADMAN_AES_128_ENABLE

/* 128 bit key/word shift table in bits
 * (used by the optimized ShiftRow128 via rotateWordLeft; indexed
 * [row][d] with d = 0 for encrypt, 1 for decrypt) */
static const word8 shifts128[4][2] = {
	{ 0, 0 },
	{ 8, 24 },
	{ 16, 16 },
	{ 24, 8 }
};

#endif	/* GLADMAN_AES_128_ENABLE */

#if !AES_MUL_BY_LOOKUP
/*
 * Profiling measurements showed that the mul routine is where a large proportion of
 * the time is spent.
 * Since the first argument to mul is always one of six
 * constants (2, 3, 0xe, etc.), we implement six 256x256 byte lookup tables to
 * do the multiplies. This eliminates the need for the log/antilog tables, so
 * it's only adding one kilobyte of const data. Throughput improvement for this
 * mod is a factor of 3.3 for encrypt and 4.1 for decrypt in the 128-bit optimized
 * case. Improvement for the general case (with a 256 bit key) is 1.46 for encrypt
 * and 1.88 for decrypt. (Decrypt wins more for this enhancement because the
 * InvMixColumn does four muls, vs. 2 muls for MixColumn). Measurements taken
 * on a 500 MHz G4 with 1 MB of L2 cache.
 */

/*
 * The mod 255 op in mul is really expensive...
 *
 * We know that b <= (254 * 2), so there are only two cases. Either return b,
 * or return b-255.
 *
 * On a G4 this single optimization results in a 24% speedup for encrypt and
 * a 25% speedup for decrypt.
 */
static inline word8 mod255(word32 b)
{
	if(b >= 255) {
		b -= 255;
	}
	return b;
}

word8 mul(word8 a, word8 b) {
	/* multiply two elements of GF(2^m)
	 * needed for MixColumn and InvMixColumn
	 * (Logtable/Alogtable are the log/antilog tables from boxes-ref.h;
	 * either operand being zero yields zero)
	 */
	if (a && b) return Alogtable[mod255(Logtable[a] + Logtable[b])];
	else return 0;
}
#endif	/* !AES_MUL_BY_LOOKUP */

/*
 * XOR one round key into the state.
 * a  : state, 4 rows x BC columns, modified in place
 * rk : round key for this round
 * BC : block length in 32-bit words (4, 6, or 8)
 */
static
void KeyAddition(word8 a[4][MAXBC], word8 rk[4][MAXBC], word8 BC) {
	/* Exor corresponding text input and round key input bytes
	 */
	int i, j;

	for(i = 0; i < 4; i++)
		for(j = 0; j < BC; j++) a[i][j] ^= rk[i][j];
}

static
void ShiftRow(word8 a[4][MAXBC], word8 d, word8 BC) {
	/* Row 0 remains unchanged
	 * The other three rows are shifted a variable amount
	 *
	 * d selects the direction column of shifts[][][]: callers pass
	 * 0 for encryption and 1 for decryption.
	 */
	word8 tmp[MAXBC];
	int i, j;

	for(i = 1; i < 4; i++) {
		for(j = 0; j < BC; j++) tmp[j] = a[i][(j + shifts[SC][i][d]) % BC];
		for(j = 0; j < BC; j++) a[i][j] = tmp[j];
	}
}

static
void Substitution(word8 a[4][MAXBC], const word8 box[256], word8 BC) {
	/* Replace every byte of the input by the byte at that place
	 * in the nonlinear S-box (callers pass S for encrypt, Si for decrypt)
	 */
	int i, j;

	for(i = 0; i < 4; i++)
		for(j = 0; j < BC; j++) a[i][j] = box[a[i][j]] ;
}

static
void MixColumn(word8 a[4][MAXBC], word8 BC) {
	/* Mix the four bytes of every column in a linear way
	 * (result is built in local b[][] and then copied back into a)
	 */
	word8 b[4][MAXBC];
	int i, j;

	for(j = 0; j < BC; j++) {
		for(i = 0; i < 4; i++) {
			#if AES_MUL_BY_LOOKUP
			b[i][j] = mulBy0x02[a[i][j]]
				^ mulBy0x03[a[(i + 1) % 4][j]]
				^ a[(i + 2) % 4][j]
				^ a[(i + 3) % 4][j];
			#else
			b[i][j] = mul(2,a[i][j])
				^ mul(3,a[(i + 1) % 4][j])
				^ a[(i + 2) % 4][j]
				^ a[(i + 3) % 4][j];
			#endif
		}
	}
	for(i = 0; i < 4; i++) {
		for(j = 0; j < BC; j++) a[i][j] = b[i][j];
	}
}

static
void InvMixColumn(word8 a[4][MAXBC], word8 BC) {
	/* Mix the four bytes of every column in a linear way
	 * This is the opposite operation of Mixcolumn
	 * (four muls per byte here vs. two in MixColumn -- see the
	 * performance note at the top of this file)
	 */
	word8 b[4][MAXBC];
	int i, j;

	for(j = 0; j < BC; j++) {
		for(i = 0; i < 4; i++) {
			#if AES_MUL_BY_LOOKUP
			b[i][j] = mulBy0x0e[a[i][j]]
				^ mulBy0x0b[a[(i + 1) % 4][j]]
				^ mulBy0x0d[a[(i + 2) % 4][j]]
				^ mulBy0x09[a[(i + 3) % 4][j]];
			#else
			b[i][j] = mul(0xe,a[i][j])
				^ mul(0xb,a[(i + 1) % 4][j])
				^ mul(0xd,a[(i + 2) % 4][j])
				^ mul(0x9,a[(i + 3) % 4][j]);
			#endif
		}
	}
	for(i = 0; i < 4; i++) {
		for(j = 0; j < BC; j++) a[i][j] = b[i][j];
	}
}

/*
 * Expand a cipher key into the full round key schedule.
 *
 * k         : cipher key, 4 rows x KC columns
 * keyBits   : key length in bits; must be 128, 192, or 256, else -1
 * blockBits : block length in bits; must be 128, 192, or 256, else -2
 * W         : receives the ROUNDS+1 round keys
 *
 * Returns 0 on success, negative on a bad parameter.
 */
int rijndaelKeySched (
	word8 k[4][MAXKC],
	int keyBits,
	int blockBits,
	word8 W[MAXROUNDS+1][4][MAXBC]) {

	/* Calculate the necessary round keys
	 * The number of calculations depends on keyBits and blockBits
	 */
	int KC, BC, ROUNDS;
	int i, j, t, rconpointer = 0;
	word8 tk[4][MAXKC];

	switch (keyBits) {
	case 128: KC = 4; break;
	case 192: KC = 6; break;
	case 256: KC = 8; break;
	default : return (-1);
	}

	switch (blockBits) {
	case 128: BC = 4; break;
	case 192: BC = 6; break;
	case 256: BC = 8; break;
	default : return (-2);
	}

	/* ROUNDS is driven by the larger of key and block size */
	switch (keyBits >= blockBits ? keyBits : blockBits) {
	case 128: ROUNDS = 10; break;
	case 192: ROUNDS = 12; break;
	case 256: ROUNDS = 14; break;
	default : return (-3); /* this cannot happen */
	}


	for(j = 0; j < KC; j++)
		for(i = 0; i < 4; i++)
			tk[i][j] = k[i][j];
	t = 0;
	/* copy values into round key array */
	for(j = 0; (j < KC) && (t < (ROUNDS+1)*BC); j++, t++)
		for(i = 0; i < 4; i++) W[t / BC][i][t % BC] = tk[i][j];

	while (t < (ROUNDS+1)*BC) { /* while not enough round key material calculated */
		/* calculate new values */
		for(i = 0; i < 4; i++)
			tk[i][0] ^= S[tk[(i+1)%4][KC-1]];
		tk[0][0] ^= rcon[rconpointer++];

		if (KC != 8)
			for(j = 1; j < KC; j++)
				for(i = 0; i < 4; i++) tk[i][j] ^= tk[i][j-1];
		else {
			/* 256-bit keys get an extra S-box step in the middle column */
			for(j = 1; j < KC/2; j++)
				for(i = 0; i < 4; i++) tk[i][j] ^= tk[i][j-1];
			for(i = 0; i < 4; i++) tk[i][KC/2] ^= S[tk[i][KC/2 - 1]];
			for(j = KC/2 + 1; j < KC; j++)
				for(i = 0; i < 4; i++) tk[i][j] ^= tk[i][j-1];
		}
		/* copy values into round key array */
		for(j = 0; (j < KC) && (t < (ROUNDS+1)*BC); j++, t++)
			for(i = 0; i < 4; i++) W[t / BC][i][t % BC] = tk[i][j];
	}

	return 0;
}

/*
 * Encrypt one block in place (a), general case for any supported
 * key/block size. rk is the schedule produced by rijndaelKeySched.
 * Returns 0 on success, -2 on a bad blockBits, -3 cannot happen.
 */
int rijndaelEncrypt (
	word8 a[4][MAXBC],
	int keyBits,
	int blockBits,
	word8 rk[MAXROUNDS+1][4][MAXBC])
{
	/* Encryption of one block, general case.
	 */
	int r, BC, ROUNDS;

	switch (blockBits) {
	case 128: BC = 4; break;
	case 192: BC = 6; break;
	case 256: BC = 8; break;
	default : return (-2);
	}

	switch (keyBits >= blockBits ? keyBits : blockBits) {
	case 128: ROUNDS = 10; break;
	case 192: ROUNDS = 12; break;
	case 256: ROUNDS = 14; break;
	default : return (-3); /* this cannot happen */
	}

	/* begin with a key addition
	 */
	KeyAddition(a,rk[0],BC);

	/* ROUNDS-1 ordinary rounds
	 */
	for(r = 1; r < ROUNDS; r++) {
		Substitution(a,S,BC);
		ShiftRow(a,0,BC);
		MixColumn(a,BC);
		KeyAddition(a,rk[r],BC);
	}

	/* Last round is special: there is no MixColumn
	 */
	Substitution(a,S,BC);
	ShiftRow(a,0,BC);
	KeyAddition(a,rk[ROUNDS],BC);

	return 0;
}

/*
 * Decrypt one block in place (a), general case. Same parameter and
 * return conventions as rijndaelEncrypt; uses the same schedule rk,
 * applied in reverse order.
 */
int rijndaelDecrypt (
	word8 a[4][MAXBC],
	int keyBits,
	int blockBits,
	word8 rk[MAXROUNDS+1][4][MAXBC])
{
	int r, BC, ROUNDS;

	switch (blockBits) {
	case 128: BC = 4; break;
	case 192: BC = 6; break;
	case 256: BC = 8; break;
	default : return (-2);
	}

	switch (keyBits >= blockBits ? keyBits : blockBits) {
	case 128: ROUNDS = 10; break;
	case 192: ROUNDS = 12; break;
	case 256: ROUNDS = 14; break;
	default : return (-3); /* this cannot happen */
	}

	/* To decrypt: apply the inverse operations of the encrypt routine,
	 * in opposite order
	 *
	 * (KeyAddition is an involution: it 's equal to its inverse)
	 * (the inverse of Substitution with table S is Substitution with the
	 * inverse table of S)
	 * (the inverse of Shiftrow is Shiftrow over a suitable distance)
	 */

	/* First the special round:
	 *   without InvMixColumn
	 *   with extra KeyAddition
	 */
	KeyAddition(a,rk[ROUNDS],BC);
	Substitution(a,Si,BC);
	ShiftRow(a,1,BC);

	/* ROUNDS-1 ordinary rounds
	 */
	for(r = ROUNDS-1; r > 0; r--) {
		KeyAddition(a,rk[r],BC);
		InvMixColumn(a,BC);
		Substitution(a,Si,BC);
		ShiftRow(a,1,BC);
	}

	/* End with the extra key addition
	 */

	KeyAddition(a,rk[0],BC);

	return 0;
}

#if !GLADMAN_AES_128_ENABLE

/*
 * All of these 128-bit-key-and-block routines require 32-bit word-aligned
 * char array pointers. The key schedule arrays are easy; they come from
 * keyInstance which has a 4-byte-aligned element preceding the key schedule.
 * Others require manual alignment of a local variable by the caller.
 */

/*
 * XOR a 128-bit round key into the 128-bit state, one 32-bit word per row.
 * NOTE(review): the word8* -> word32* casts require the 4-byte alignment
 * described above and technically violate strict aliasing; this relies on
 * the compiler tolerating such access -- confirm with the build settings.
 */
static inline void KeyAddition128(
	word8 a[4][BC_128_OPT],
	word8 rk[4][MAXBC]) {

	/* these casts are endian-independent */
	((word32 *)a)[0] ^= *((word32 *)(&rk[0]));
	((word32 *)a)[1] ^= *((word32 *)(&rk[1]));
	((word32 *)a)[2] ^= *((word32 *)(&rk[2]));
	((word32 *)a)[3] ^= *((word32 *)(&rk[3]));
}

/* S-box substitution over the whole 128-bit state
 * (box is S for encrypt, Si for decrypt). */
static void Substitution128(
	word8 a[4][BC_128_OPT],
	const word8 box[256]) {
	/* Replace every byte of the input by the byte at that place
	 * in the nonlinear S-box
	 */
	int i, j;

	/* still to be optimized - larger S boxes? */
	for(i = 0; i < 4; i++) {
		for(j = 0; j < BC_128_OPT; j++) {
			a[i][j] = box[a[i][j]];
		}
	}
}

#if defined(__ppc__) && defined(__GNUC__)

/* Rotate the 32-bit word at *word left by rotCount bits, using the
 * PPC rlwnm (rotate-left-word-then-AND-with-mask) instruction with a
 * full 0..31 mask, i.e. a plain 32-bit rotate. */
static inline void rotateWordLeft(
	word8 *word,			// known to be word aligned
	unsigned rotCount)		// in bits
{
	word32 lword = *((word32 *)word);
	asm("rlwnm %0,%1,%2,0,31" : "=r"(lword) : "0"(lword), "r"(rotCount));
	*((word32 *)word) = lword;
}

#else

/*
 * Insert your machine/compiler dependent code here,
 * or just use this, which works on any platform and compiler
 * which supports the __attribute__((aligned(4))) directive.
 *
 * Note: this portable version rotates by whole bytes only
 * (rotCount is truncated to a multiple of 8); the only caller,
 * ShiftRow128, always passes multiples of 8 from shifts128[].
 */
static void rotateWordLeft(
	word8 *word,			// known to be word aligned
	unsigned rotCount)		// in bits
{
	word8 tmp[BC_128_OPT] __attribute__((aligned(4)));
	unsigned bytes = rotCount / 8;

	tmp[0] = word[bytes     & (BC_128_OPT-1)];
	tmp[1] = word[(1+bytes) & (BC_128_OPT-1)];
	tmp[2] = word[(2+bytes) & (BC_128_OPT-1)];
	tmp[3] = word[(3+bytes) & (BC_128_OPT-1)];
	*((word32 *)word) = *((word32 *)tmp);
}
#endif

static inline void ShiftRow128(
	word8 a[4][BC_128_OPT],
	word8 d) {
	/* Row 0 remains unchanged
	 * The other three rows are shifted (actually rotated) a variable amount
	 *
	 * d selects the direction column of shifts128[]: callers pass
	 * 0 for encryption, 1 for decryption.
	 */
	int i;

	for(i = 1; i < 4; i++) {
		rotateWordLeft(a[i], shifts128[i][d]);
	}
}

/*
 * The following two routines are where most of the time is spent in this
 * module. Further optimization would have to focus here.
 */
static void MixColumn128(word8 a[4][BC_128_OPT]) {
	/* Mix the four bytes of every column in a linear way
	 * (built in local b, then copied back; b never overlaps a, so the
	 * memmove is effectively a memcpy)
	 */
	word8 b[4][BC_128_OPT];
	int i, j;

	for(j = 0; j < BC_128_OPT; j++) {
		for(i = 0; i < 4; i++) {
			#if AES_MUL_BY_LOOKUP
			b[i][j] = mulBy0x02[a[i][j]]
				^ mulBy0x03[a[(i + 1) % 4][j]]
				^ a[(i + 2) % 4][j]
				^ a[(i + 3) % 4][j];
			#else
			b[i][j] = mul(2,a[i][j])
				^ mul(3,a[(i + 1) % 4][j])
				^ a[(i + 2) % 4][j]
				^ a[(i + 3) % 4][j];
			#endif
		}
	}
	memmove(a, b, 4 * BC_128_OPT);
}

static void InvMixColumn128(word8 a[4][BC_128_OPT]) {
	/* Mix the four bytes of every column in a linear way
	 * This is the opposite operation of Mixcolumn
	 */
	word8 b[4][BC_128_OPT];
	int i, j;

	for(j = 0; j < BC_128_OPT; j++) {
		for(i = 0; i < 4; i++) {
			#if AES_MUL_BY_LOOKUP
			b[i][j] = mulBy0x0e[a[i][j]]
				^ mulBy0x0b[a[(i + 1) % 4][j]]
				^ mulBy0x0d[a[(i + 2) % 4][j]]
				^ mulBy0x09[a[(i + 3) % 4][j]];
			#else
			b[i][j] = mul(0xe,a[i][j])
				^ mul(0xb,a[(i + 1) % 4][j])
				^ mul(0xd,a[(i + 2) % 4][j])
				^ mul(0x9,a[(i + 3) % 4][j]);
			#endif
		}
	}
	memmove(a, b, 4 * BC_128_OPT);
}

/*
 * Key schedule, 128-bit key and block only. Same algorithm as
 * rijndaelKeySched with KC/BC/ROUNDS fixed at the *_128_OPT constants;
 * unlike the general routine there is no KC == 8 special case here.
 * Always returns 0.
 */
int rijndaelKeySched128 (
	word8 k[4][KC_128_OPT],
	word8 W[MAXROUNDS+1][4][MAXBC]) {

	/* Calculate the necessary round keys
	 * The number of calculations depends on keyBits and blockBits
	 */
	int i, j, t, rconpointer = 0;
	word8 tk[4][KC_128_OPT];
	unsigned numSchedRows = (ROUNDS_128_OPT + 1) * BC_128_OPT;

	for(j = 0; j < KC_128_OPT; j++)
		for(i = 0; i < 4; i++)
			tk[i][j] = k[i][j];
	t = 0;
	/* copy values into round key array */
	for(j = 0; (j < KC_128_OPT) && (t < numSchedRows); j++, t++) {
		for(i = 0; i < 4; i++) {
			W[t / BC_128_OPT][i][t % BC_128_OPT] = tk[i][j];
		}
	}

	while (t < numSchedRows) {
		/* while not enough round key material calculated */
		/* calculate new values */
		for(i = 0; i < 4; i++) {
			tk[i][0] ^= S[tk[(i+1)%4][KC_128_OPT-1]];
		}
		tk[0][0] ^= rcon[rconpointer++];

		for(j = 1; j < KC_128_OPT; j++) {
			for(i = 0; i < 4; i++) {
				tk[i][j] ^= tk[i][j-1];
			}
		}

		/* copy values into round key array */
		for(j = 0; (j < KC_128_OPT) && (t < numSchedRows); j++, t++) {
			for(i = 0; i < 4; i++) {
				W[t / BC_128_OPT][i][t % BC_128_OPT] = tk[i][j];
			}
		}
	}

	return 0;
}

/*
 * Encrypt one 128-bit block in place (a) with a 128-bit key schedule rk
 * from rijndaelKeySched128. Caller must honor the alignment requirement
 * described at the top of this section. Always returns 0.
 */
int rijndaelEncrypt128 (
	word8 a[4][BC_128_OPT],
	word8 rk[MAXROUNDS+1][4][MAXBC])
{
	/* Encryption of one block.
	 */
	int r;

	/* begin with a key addition
	 */
	KeyAddition128(a,rk[0]);

	/* ROUNDS-1 ordinary rounds
	 */
	for(r = 1; r < ROUNDS_128_OPT; r++) {
		Substitution128(a,S);
		ShiftRow128(a,0);
		MixColumn128(a);
		KeyAddition128(a,rk[r]);
	}

	/* Last round is special: there is no MixColumn
	 */
	Substitution128(a,S);
	ShiftRow128(a,0);
	KeyAddition128(a,rk[ROUNDS_128_OPT]);

	return 0;
}

/*
 * Decrypt one 128-bit block in place (a); inverse of rijndaelEncrypt128,
 * same schedule rk applied in reverse. Always returns 0.
 */
int rijndaelDecrypt128 (
	word8 a[4][BC_128_OPT],
	word8 rk[MAXROUNDS+1][4][MAXBC])
{
	int r;

	/* To decrypt: apply the inverse operations of the encrypt routine,
	 * in opposite order
	 *
	 * (KeyAddition is an involution: it 's equal to its inverse)
	 * (the inverse of Substitution with table S is Substitution with the
	 * inverse table of S)
	 * (the inverse of Shiftrow is Shiftrow over a suitable distance)
	 */

	/* First the special round:
	 *   without InvMixColumn
	 *   with extra KeyAddition
	 */
	KeyAddition128(a,rk[ROUNDS_128_OPT]);
	Substitution128(a,Si);
	ShiftRow128(a,1);

	/* ROUNDS-1 ordinary rounds
	 */
	for(r = ROUNDS_128_OPT-1; r > 0; r--) {
		KeyAddition128(a,rk[r]);
		InvMixColumn128(a);
		Substitution128(a,Si);
		ShiftRow128(a,1);
	}

	/* End with the extra key addition
	 */

	KeyAddition128(a,rk[0]);

	return 0;
}

#endif	/* !GLADMAN_AES_128_ENABLE */