// $Id: WKdmDecompress.intel.s,v 1.1 2010/01/30 00:39:21 cclee Exp cclee $

// This file contains i386 and x86_64 (no SSE) optimized implementation of WKdm Decompressor.
// The implementation is derived by compiling (gcc -O3) the original C code (WKdmDecompress.c)
// followed by hand tweaking of the compiled assembly code.
// cclee, 1/29/10
//
// Syntax : AT&T (GAS), passed through the C preprocessor. Mach-O symbol naming (leading underscore).
//
// Entry point : _WKdm_decompress
//	1st argument : src_buf,  pointer to the compressed buffer (32-bit words)
//	2nd argument : dest_buf, pointer to the output buffer (32-bit words)
//	No return value is produced. Any further argument is ignored by this code.
//
// Layout of src_buf as consumed by this code (byte offsets) :
//	  4 : word offset (from src_buf) of QPOS_AREA_START
//	  8 : word offset (from src_buf) of QPOS_AREA_END, which is also LOW_BITS_AREA_START
//	 12 : word offset (from src_buf) of LOW_BITS_AREA_END
//	 16 : TAGS_AREA_START, 64 words each holding 16 2-bit tags (1024 tags in total)
//	272 : TAGS_AREA_END, which is also the first full pattern (one word per MISS tag)
//
// One 32-bit word is written to dest_buf for each of the 1024 tags :
//	tag 0 (ZERO)    : 0
//	tag 1 (PARTIAL) : upper 22 bits of dictionary[qpos] | next 10 low bits; dictionary[qpos] is updated
//	tag 2 (MISS)    : next full pattern; it is also stored in the dictionary slot chosen by _hashLookupTable
//	tag 3 (EXACT)   : dictionary[qpos]
//
// The header offsets are used as-is, without any range check against the
// fixed size temporary arrays kept on the stack.

#define	WK_TAG_MASK		0x03030303	/* one 2-bit tag in the low bits of each byte */
#define	WK_QPOS_MASK		0x0f0f0f0f	/* one 4-bit qpos in the low bits of each byte */
#define	WK_LOW_BITS_MASK	0x000003ff	/* lower 10 bits of a word */
#define	WK_HIGH_BITS_MASK	0xfffffc00	/* upper 22 bits of a word */
#define	WK_HASH_INDEX_MASK	0x000000ff	/* 8-bit index into _hashLookupTable */

#if defined __i386__

	#define	WK_FRAME_SIZE	7324		/* locals allocated below the 3 saved registers */

	.text
	.align 4,0x90

	.globl _WKdm_decompress
_WKdm_decompress:

	// save registers, set up base pointer %ebp, and allocate stack memory for local variables

	pushl	%ebp
	movl	%esp, %ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx
	subl	$WK_FRAME_SIZE, %esp

	// PRELOAD_DICTIONARY; the 16 dictionary words live at -88(%ebp) .. -28(%ebp), each set to 1
	movl	$1, -88(%ebp)
	movl	$1, -84(%ebp)
	movl	$1, -80(%ebp)
	movl	$1, -76(%ebp)
	movl	$1, -72(%ebp)
	movl	$1, -68(%ebp)
	movl	$1, -64(%ebp)
	movl	$1, -60(%ebp)
	movl	$1, -56(%ebp)
	movl	$1, -52(%ebp)
	movl	$1, -48(%ebp)
	movl	$1, -44(%ebp)
	movl	$1, -40(%ebp)
	movl	$1, -36(%ebp)
	movl	$1, -32(%ebp)
	movl	$1, -28(%ebp)

	// Stack frame layout. tempQPosArray and tempLowBitsArray name the arrays themselves,
	// the other names are 4-byte slots holding a pointer. The unpacked tags occupy
	// -1288(%ebp) .. -264(%ebp) and tempTagsArray is the slot that holds their address.

	#define	dictionary_addr		-88(%ebp)
	#define	TAGS_AREA_END		-7292(%ebp)
	#define	tempTagsArray		-7300(%ebp)
	#define	tempQPosArray		-2488(%ebp)
	#define	tempLowBitsArray	-7288(%ebp)
	#define	next_low_bits		-7296(%ebp)
	#define	dictionary		-7308(%ebp)
	#define	tag_area_end		-7304(%ebp)
	#define	dict_location		-7324(%ebp)

	// WK_unpack_2bits(TAGS_AREA_START(src_buf), TAGS_AREA_END(src_buf), tempTagsArray);

	movl	8(%ebp), %eax			// src_buf
	addl	$272, %eax			// src_buf + 16 (WKdm Header) + 256 (Tags)
	movl	%eax, TAGS_AREA_END		// TAGS_AREA_END(src_buf)
	movl	8(%ebp), %eax			// src_buf
	movl	%eax, %edi			// %edi = src_buf, still used by the qpos unpacking below
	addl	$16, %eax			// TAGS_AREA_START(src_buf) = src_buf + 16 (WKdm Header)
	leal	-1288(%ebp), %edx		// address of the unpacked tags
	movl	%edx, tempTagsArray		// keep a copy of that address for the main loop
	cmpl	%eax, TAGS_AREA_END		// TAGS_AREA_END vs TAGS_AREA_START
	jbe	1f				// if TAGS_AREA_END<=TAGS_AREA_START, no need for WK_unpack_2bits
	movl	%edx, %ecx			// %ecx -> unpacked tags [0]
	xorl	%esi, %esi			// i=0
	movl	$WK_TAG_MASK, %ebx		// mask to extract 4 2-bit tags, one per byte
	.align 4,0x90
L_WK_unpack_2bits:
	movl	16(%edi,%esi,4), %edx		// w = word i of the tags area, which holds 16 tags
	movl	%edx, %eax			// w
	andl	%ebx, %eax			// 1st 4 tags, each in bytes
	movl	%eax, (%ecx)			// save 1st 4 tags
	movl	%edx, %eax			// w
	shrl	$2, %eax			// w>>2
	andl	%ebx, %eax			// 2nd 4 tags, each in bytes
	movl	%eax, 4(%ecx)			// save 2nd 4 tags
	shrl	$4, %edx			// w>>4
	movl	%edx, %eax			// w>>4
	andl	%ebx, %eax			// 3rd 4 tags
	movl	%eax, 8(%ecx)			// save 3rd 4 tags
	shrl	$2, %edx			// w>>6
	andl	%ebx, %edx			// 4th 4 tags
	movl	%edx, 12(%ecx)			// save 4th 4 tags
	addl	$16, %ecx			// point to the next 16 unpacked tags
	incl	%esi				// i++
	cmpl	$64, %esi			// i vs 64
	jne	L_WK_unpack_2bits		// repeat the loop until i==64
1:

	// WK_unpack_4bits(QPOS_AREA_START(src_buf), QPOS_AREA_END(src_buf), tempQPosArray);

	movl	8(%edi), %eax			// WKdm header qpos end
	leal	(%edi,%eax,4), %esi		// QPOS_AREA_END
	movl	4(%edi), %eax			// WKdm header qpos start
	leal	(%edi,%eax,4), %ecx		// QPOS_AREA_START
	cmpl	%ecx, %esi			// QPOS_AREA_END vs QPOS_AREA_START
	jbe	1f				// if QPOS_AREA_END <= QPOS_AREA_START, skip WK_unpack_4bits
	leal	tempQPosArray, %edi		// tempQPosArray
	movl	$WK_QPOS_MASK, %ebx		// mask to extract 4 4-bit qpos, one per byte
L_WK_unpack_4bits:
	movl	(%ecx), %eax			// w
	movl	%eax, %edx			// w
	andl	%ebx, %edx			// 1st 4 qpos
	movl	%edx, (%edi)			// save 1st 4 qpos
	shrl	$4, %eax			// w>>4
	andl	%ebx, %eax			// 2nd 4 qpos
	movl	%eax, 4(%edi)			// save 2nd 4 qpos
	addl	$4, %ecx			// point to next word w
	addl	$8, %edi			// qpos += 8
	cmpl	%ecx, %esi			// QPOS_AREA_END vs qpos_pointer
	ja	L_WK_unpack_4bits		// repeat until qpos_pointer >= QPOS_AREA_END

	// WK_unpack_3_tenbits(LOW_BITS_AREA_START(src_buf), LOW_BITS_AREA_END(src_buf), tempLowBitsArray);

1:
	movl	8(%ebp), %edx			// src_buf
	movl	12(%edx), %eax			// LOW_BITS_AREA_END offset
	leal	(%edx,%eax,4), %edi		// LOW_BITS_AREA_END
	cmpl	%edi, %esi			// LOW_BITS_AREA_START(=QPOS_AREA_END) vs LOW_BITS_AREA_END
	jae	1f				// if (LOW_BITS_AREA_START>=LOW_BITS_AREA_END) skip unpack_3_tenbits
	leal	tempLowBitsArray, %ecx		// tempLowBitsArray
	movl	$WK_LOW_BITS_MASK, %ebx		// mask to extract the lower 10 bits

	.align 4,0x90
L_WK_unpack_3_tenbits:
	movl	(%esi), %eax			// w = next packed word
	movl	%eax, %edx			// w
	andl	%ebx, %edx			// 1st 10-bit
	movl	%edx, (%ecx)			// save 1st 10-bit
	shrl	$10, %eax			// (w>>10)
	movl	%eax, %edx			// (w>>10)
	andl	%ebx, %edx			// 2nd 10-bit
	movl	%edx, 4(%ecx)			// save 2nd 10-bit
	shrl	$10, %eax			// (w>>20), not masked : the top 2 bits are expected to be zero
	movl	%eax, 8(%ecx)			// save 3rd 10-bit
	addl	$4, %esi			// point to next w
	addl	$12, %ecx			// tempLowBitsArray += 3;
	cmpl	%esi, %edi			// LOW_BITS_AREA_END vs next packed word
	ja	L_WK_unpack_3_tenbits		// repeat until next packed word >= LOW_BITS_AREA_END
1:
	call	Lhash
Lhash:
	popl	%ebx				// %ebx = address of Lhash, base for the position independent hash table lookup

	// Main loop register usage :
	//	%esi = next_tag, %edi = next_qpos, %ecx = dest_buf + 4, %ebx = address of Lhash
	//	%eax and %edx are scratch

	#define	next_tag	%esi
	#define	next_qpos	%edi

	movl	tempTagsArray, next_tag		// next_tag = address of the unpacked tags
	leal	tempQPosArray, next_qpos	// next_qpos = tempQPosArray
	movl	12(%ebp), %ecx			// dest_buf
	addl	$4, %ecx			// keep %ecx one word ahead; every store below uses -4(%ecx)
						// (the original author measured this variant as faster)
	leal	tempLowBitsArray, %eax		// tempLowBitsArray
	movl	%eax, next_low_bits		// next_low_bits = tempLowBitsArray
	leal	-264(%ebp), %edx		// end of the 1024 unpacked tags
	movl	%edx, tag_area_end		// tag_area_end
	leal	dictionary_addr, %eax		// dictionary starting address
	movl	%eax, dictionary		// dictionary
	jmp	L11
	.align 4,0x90
L29:						// reached with the flags of "cmpb $1, tag" and tag != 1
	jle	L_ZERO_TAG			// tag < 1
	cmpb	$2, %al				// MISS_TAG
	je	L_MISS_TAG
L_EXACT_TAG:
	movsbl	(next_qpos),%eax		// qpos = *next_qpos
	incl	next_qpos			// next_qpos++
	movl	dictionary, %edx		// dictionary
	movl	(%edx,%eax,4), %eax		// w = dictionary[qpos]
	movl	%eax, -4(%ecx)			// *dest_buf = w
	.align 4,0x90
L_next:
	incl	next_tag			// next_tag++
	addl	$4, %ecx			// dest_buf++
	cmpl	tag_area_end, next_tag		// next_tag vs tag_area_end
	jae	L_done				// if (next_tag>=tag_area_end)
L11:
	movzbl	(next_tag), %eax		// tag = *next_tag
	cmpb	$1, %al				// Partial match?
	jne	L29
L_PARTIAL_TAG:
	movsbl	(next_qpos),%edx		// qpos = *next_qpos
	movl	dictionary, %eax		// dictionary
	leal	(%eax,%edx,4), %edx		// &dictionary[qpos]
	movl	%edx, dict_location		// spill it to release %edx
	incl	next_qpos			// next_qpos++
	movl	(%edx), %eax			// read dictionary word
	andl	$WK_HIGH_BITS_MASK, %eax	// keep only higher 22-bits
	movl	next_low_bits, %edx		// pointer to the next unpacked low bits
	orl	(%edx), %eax			// construct the new partially matched word
	addl	$4, %edx			//
	movl	%edx, next_low_bits		// next_low_bits++
	movl	dict_location, %edx		// reload &dictionary[qpos]
	movl	%eax, (%edx)			// update the dictionary with the newly constructed word
	movl	%eax, -4(%ecx)			// *dest_buf = the newly constructed word
	incl	next_tag			// next_tag++
	addl	$4, %ecx			// dest_buf++
	cmpl	tag_area_end, next_tag		// next_tag vs tag_area_end
	jb	L11				// if next_tag < tag_area_end, repeat the loop
L_done:

	// release stack memory, restore registers, and return
	addl	$WK_FRAME_SIZE, %esp
	popl	%ebx
	popl	%esi
	popl	%edi
	leave
	ret

	#define	next_full_patt	-7292(%ebp)	/* next_full_patt starts with initial value of TAGS_AREA_END */

	.align 4,0x90
L_MISS_TAG:
	movl	next_full_patt, %edx		// next_full_patt
	movl	(%edx), %eax			// word = *next_full_patt
	addl	$4, %edx			// next_full_patt++
	movl	%edx, next_full_patt		// save next_full_patt
	movl	%eax, %edx			// word
	shrl	$10, %edx			// word>>10
	andl	$WK_HASH_INDEX_MASK, %edx	// 8-bit hash table index
	movsbl	_hashLookupTable-Lhash(%ebx,%edx),%edx	// table entry, used as a byte offset into the dictionary
	movl	%eax, -88(%ebp,%edx)		// dictionary slot = word
	movl	%eax, -4(%ecx)			// *dest_buf = word
	jmp	L_next				// repeat the loop

	.align 4,0x90
L_ZERO_TAG:
	movl	$0, -4(%ecx)			// *dest_buf = 0
	jmp	L_next				// repeat the loop

#endif	// __i386__

#if defined __x86_64__

	// The frame covers every local, down to tempLowBitsArray at -7280(%rbp) :
	// 2 saved registers (16 bytes) + 7264 = 7280. Nothing is stored below %rsp,
	// so this routine does not depend on the 128-byte red zone, and %rsp stays
	// 16-byte aligned.
	#define	WK_FRAME_SIZE	7264

	.text
	.align 4,0x90

	.globl _WKdm_decompress
_WKdm_decompress:

	// save registers, and allocate stack memory for local variables

	pushq	%rbp
	movq	%rsp, %rbp
	pushq	%r12
	pushq	%rbx
	subq	$WK_FRAME_SIZE, %rsp

	movq	%rsi, %r12			// dest_buf

	// PRELOAD_DICTIONARY; the 16 dictionary words live at -80(%rbp) .. -20(%rbp), each set to 1
	movl	$1, -80(%rbp)
	movl	$1, -76(%rbp)
	movl	$1, -72(%rbp)
	movl	$1, -68(%rbp)
	movl	$1, -64(%rbp)
	movl	$1, -60(%rbp)
	movl	$1, -56(%rbp)
	movl	$1, -52(%rbp)
	movl	$1, -48(%rbp)
	movl	$1, -44(%rbp)
	movl	$1, -40(%rbp)
	movl	$1, -36(%rbp)
	movl	$1, -32(%rbp)
	movl	$1, -28(%rbp)
	movl	$1, -24(%rbp)
	movl	$1, -20(%rbp)

	// WK_unpack_2bits(TAGS_AREA_START(src_buf), TAGS_AREA_END(src_buf), tempTagsArray);
	leaq	272(%rdi), %r10			// TAGS_AREA_END, later used as next_full_patt
	leaq	16(%rdi), %rax			// TAGS_AREA_START
	leaq	-1280(%rbp), %rsi		// tempTagsArray
	cmpq	%rax, %r10			// TAGS_AREA_END vs TAGS_AREA_START
	jbe	1f				// if TAGS_AREA_END <= TAGS_AREA_START, skip L_WK_unpack_2bits
	movq	%rsi, %rcx			// %rcx -> tempTagsArray[0]
	xorl	%r8d, %r8d			// i = 0
	.align 4,0x90
L_WK_unpack_2bits:
	movl	16(%rdi,%r8,4), %edx		// w = word i of the tags area, which holds 16 tags
	movl	%edx, %eax			// w
	andl	$WK_TAG_MASK, %eax		// 1st 4 tags
	movl	%eax, (%rcx)			// write 1st 4 tags
	movl	%edx, %eax			// w
	shrl	$2, %eax			// w>>2
	andl	$WK_TAG_MASK, %eax		// 2nd 4 tags
	movl	%eax, 4(%rcx)			// write 2nd 4 tags
	shrl	$4, %edx			// w>>4
	movl	%edx, %eax			// w>>4
	andl	$WK_TAG_MASK, %eax		// 3rd 4 tags
	movl	%eax, 8(%rcx)			// write 3rd 4 tags
	shrl	$2, %edx			// w>>6
	andl	$WK_TAG_MASK, %edx		// 4th 4 tags
	movl	%edx, 12(%rcx)			// write 4th 4 tags
	addq	$16, %rcx			// point to the next 16 unpacked tags
	incq	%r8				// i++
	cmpq	$64, %r8			// i vs 64
	jne	L_WK_unpack_2bits		// repeat loop until i==64
1:

	// WK_unpack_4bits(QPOS_AREA_START(src_buf), QPOS_AREA_END(src_buf), tempQPosArray);

	movl	8(%rdi), %eax			// WKdm header qpos end (zero-extended into %rax)
	leaq	(%rdi,%rax,4), %r9		// QPOS_AREA_END
	movl	4(%rdi), %eax			// WKdm header qpos start (zero-extended into %rax)
	leaq	(%rdi,%rax,4), %r8		// QPOS_AREA_START
	leaq	-2480(%rbp), %rbx		// tempQPosArray
	cmpq	%r8, %r9			// QPOS_AREA_END vs QPOS_AREA_START
	jbe	1f				// if QPOS_AREA_END <= QPOS_AREA_START, skip L_WK_unpack_4bits
	leaq	8(%rbx), %rcx			// write pointer, kept 8 bytes ahead of the stores
L_WK_unpack_4bits:
	movl	(%r8), %eax			// w = *next_word
	movl	%eax, %edx			// w
	andl	$WK_QPOS_MASK, %edx		// 1st 4 qpos
	movl	%edx, -8(%rcx)			// write 1st 4 qpos
	shrl	$4, %eax			// w>>4
	andl	$WK_QPOS_MASK, %eax		// 2nd 4 qpos
	movl	%eax, -4(%rcx)			// write 2nd 4 qpos
	addq	$4, %r8				// next_word++
	addq	$8, %rcx			// write pointer += 8
	cmpq	%r8, %r9			// QPOS_AREA_END vs next_word
	ja	L_WK_unpack_4bits		// repeat loop while QPOS_AREA_END > next_word
1:

	// WK_unpack_3_tenbits(LOW_BITS_AREA_START(src_buf), LOW_BITS_AREA_END(src_buf), tempLowBitsArray);

	movl	12(%rdi), %eax			// LOW_BITS_AREA_END offset (zero-extended into %rax)
	leaq	(%rdi,%rax,4), %rdi		// LOW_BITS_AREA_END, src_buf is not needed any more
	leaq	-7280(%rbp), %r11		// tempLowBitsArray
	cmpq	%rdi, %r9			// LOW_BITS_AREA_START(=QPOS_AREA_END) vs LOW_BITS_AREA_END
	jae	1f				// if START>=END, skip L_WK_unpack_3_tenbits
	leaq	12(%r11), %rcx			// write pointer, kept 12 bytes ahead of the stores
L_WK_unpack_3_tenbits:
	movl	(%r9), %eax			// w = *next_word
	movl	%eax, %edx			// w
	andl	$WK_LOW_BITS_MASK, %edx		// 1st tenbits
	movl	%edx, -12(%rcx)			// write 1st tenbits
	shrl	$10, %eax			// w >> 10
	movl	%eax, %edx			// w >> 10
	andl	$WK_LOW_BITS_MASK, %edx		// 2nd tenbits
	movl	%edx, -8(%rcx)			// write 2nd tenbits
	shrl	$10, %eax			// w >> 20, 3rd tenbits, not masked
	movl	%eax, -4(%rcx)			// write 3rd tenbits
	addq	$4, %r9				// next_word++
	addq	$12, %rcx			// write pointer += 3 words
	cmpq	%r9, %rdi			// LOW_BITS_AREA_END vs next_word
	ja	L_WK_unpack_3_tenbits		// repeat loop if LOW_BITS_AREA_END > next_word
1:

	// Main loop register usage :
	//	%rdi = next_tag		%rsi = tag_area_end	%r8  = next_qpos
	//	%r9  = next_low_bits	%r10 = next_full_patt	%r11 = dictionary
	//	%rbx = _hashLookupTable	%rcx = dest_buf + 4 (every store uses -4(%rcx))
	//	%rax and %rdx are scratch

	movq	%rsi, %rdi			// next_tag
	movq	%rbx, %r8			// next_qpos
	leaq	4(%r12), %rcx			// dest_buf, kept one word ahead
	movq	%r11, %r9			// next_low_bits
	leaq	-80(%rbp), %r11			// dictionary
	leaq	_hashLookupTable(%rip), %rbx	// hash look up table
	leaq	1024(%rsi), %rsi		// tag_area_end

	jmp	L11
	.align 4,0x90
L31:						// reached with the flags of "cmpb $1, tag" and tag != 1
	jle	L_ZERO_TAG			// tag < 1
	cmpb	$2, %al				// MISS_TAG
	je	L_MISS_TAG
L_EXACT_TAG:
	movsbq	(%r8),%rax			// qpos = *next_qpos
	incq	%r8				// next_qpos++
	movl	(%r11,%rax,4), %eax		// w = dictionary[qpos]
	movl	%eax, -4(%rcx)			// *dest_buf = w
	.align 4,0x90
L_next:
	incq	%rdi				// next_tag++
	addq	$4, %rcx			// dest_buf++
	cmpq	%rsi, %rdi			// next_tag vs tag_area_end
	jae	L_done				// if next_tag >= tag_area_end, the loop is finished
L11:
	movzbl	(%rdi), %eax			// tag = *next_tag
	cmpb	$1, %al				// partial match tag ?
	jne	L31
L_PARTIAL_TAG:
	movsbq	(%r8),%rdx			// qpos = *next_qpos
	leaq	(%r11,%rdx,4), %rdx		// %rdx = &dictionary[qpos]
	incq	%r8				// next_qpos++
	movl	(%rdx), %eax			// read dictionary word
	andl	$WK_HIGH_BITS_MASK, %eax	// clear lower 10 bits
	orl	(%r9), %eax			// pad the lower 10-bits from *next_low_bits
	addq	$4, %r9				// next_low_bits++
	movl	%eax, (%rdx)			// dictionary[qpos] = newly formed word
	movl	%eax, -4(%rcx)			// *dest_buf = newly formed word
	incq	%rdi				// next_tag++
	addq	$4, %rcx			// dest_buf++
	cmpq	%rsi, %rdi			// next_tag vs tag_area_end
	jb	L11				// if next_tag < tag_area_end, repeat the loop
L_done:

	// release stack memory, restore registers, and return
	addq	$WK_FRAME_SIZE, %rsp
	popq	%rbx
	popq	%r12
	leave
	ret

	.align 4,0x90
L_MISS_TAG:
	movl	(%r10), %eax			// w = *next_full_patt
	addq	$4, %r10			// next_full_patt++
	movl	%eax, %edx			// w
	shrl	$10, %edx			// w>>10
	movzbl	%dl, %edx			// 8-bit hash table index
	movsbq	(%rbx,%rdx),%rdx		// table entry, used as a byte offset into the dictionary
	movl	%eax, -80(%rbp,%rdx)		// dictionary slot = word
	movl	%eax, -4(%rcx)			// *dest_buf = word
	jmp	L_next				// repeat the loop

	.align 4,0x90
L_ZERO_TAG:
	movl	$0, -4(%rcx)			// *dest_buf = 0
	jmp	L_next				// repeat the loop

#endif	// __x86_64__

// _hashLookupTable : 256 one-byte entries, indexed by bits 10..17 of a full pattern.
// Each entry is a multiple of 4 in the range 0..60 and is used by L_MISS_TAG
// as a byte offset into the 16-word dictionary.

.globl _hashLookupTable
	.const
	.align 5
_hashLookupTable:
	.byte	 0, 52,  8, 56, 16, 12, 28, 20,  4, 36, 48, 24, 44, 40, 32, 60
	.byte	 8, 12, 28, 20,  4, 60, 16, 36, 24, 48, 44, 32, 52, 56, 40, 12
	.byte	 8, 48, 16, 52, 60, 28, 56, 32, 20, 24, 36, 40, 44,  4,  8, 40
	.byte	60, 32, 20, 44,  4, 36, 52, 24, 16, 56, 48, 12, 28, 16,  8, 40
	.byte	36, 28, 32, 12,  4, 44, 52, 20, 24, 48, 60, 56, 40, 48,  8, 32
	.byte	28, 36,  4, 44, 20, 56, 60, 24, 52, 16, 12, 12,  4, 48, 20,  8
	.byte	52, 16, 60, 24, 36, 44, 28, 56, 40, 32, 36, 20, 24, 60, 40, 44
	.byte	52, 16, 32,  4, 48,  8, 28, 56, 12, 28, 32, 40, 52, 36, 16, 20
	.byte	48,  8,  4, 60, 24, 56, 44, 12,  8, 36, 24, 28, 16, 60, 20, 56
	.byte	32, 40, 48, 12,  4, 44, 52, 44, 40, 12, 56,  8, 36, 24, 60, 28
	.byte	48,  4, 32, 20, 16, 52, 60, 12, 24, 36,  8,  4, 16, 56, 48, 44
	.byte	40, 52, 32, 20, 28, 32, 12, 36, 28, 24, 56, 40, 16, 52, 44,  4
	.byte	20, 60,  8, 48, 48, 52, 12, 20, 32, 44, 36, 28,  4, 40, 24,  8
	.byte	56, 60, 16, 36, 32,  8, 40,  4, 52, 24, 44, 20, 12, 28, 48, 56
	.byte	16, 60,  4, 52, 60, 48, 20, 16, 56, 44, 24,  8, 40, 12, 32, 28
	.byte	36, 24, 32, 12,  4, 20, 16, 60, 36, 28,  8, 52, 40, 48, 44, 56