// $Id: WKdmDecompress.intel.s,v 1.1 2010/01/30 00:39:21 cclee Exp cclee $ // This file contains i386 and x86_64 (no SSE) optimized implementation of WKdm Decompressor. // The implementation is derived by compiling (gcc -O3) the original C code (WKdmDecompress.c) // followed by hand tweaking of the compiled assembly code. // cclee, 1/29/10 #if defined __i386__ .text .align 4,0x90 .globl _WKdm_decompress _WKdm_decompress: // save registers, set up base pointer %ebp, and allocate stack memory for local veriables pushl %ebp movl %esp, %ebp pushl %edi pushl %esi pushl %ebx subl $7324, %esp // PRELOAD_DICTIONARY; dictionary starting address : -88(%ebp) movl $1, -88(%ebp) movl $1, -84(%ebp) movl $1, -80(%ebp) movl $1, -76(%ebp) movl $1, -72(%ebp) movl $1, -68(%ebp) movl $1, -64(%ebp) movl $1, -60(%ebp) movl $1, -56(%ebp) movl $1, -52(%ebp) movl $1, -48(%ebp) movl $1, -44(%ebp) movl $1, -40(%ebp) movl $1, -36(%ebp) movl $1, -32(%ebp) movl $1, -28(%ebp) #define dictionary_addr -88(%ebp) #define TAGS_AREA_END -7292(%ebp) #define tempTagsArray -7300(%ebp) #define tempQPosArray -2488(%ebp) #define tempLowBitsArray -7288(%ebp) #define next_low_bits -7296(%ebp) #define dictionary -7308(%ebp) #define tag_area_end -7304(%ebp) // WK_unpack_2bits(TAGS_AREA_START(src_buf), TAGS_AREA_END(src_buf), tempTagsArray); movl 8(%ebp), %eax // src_buf addl $272, %eax // src_buf + 16 (WKdm Header) + 256 (Tags) movl %eax, TAGS_AREA_END // TAGS_AREA_END(src_buf) movl 8(%ebp), %eax // src_buf movl %eax, %edi // src_buf addl $16, %eax // TAGS_AREA_START(src_buf) = src_buf + 16 (WKdm Header) leal -1288(%ebp), %edx // tempTagsArray movl %edx, tempTagsArray // save a copy of tempTagsArray[] at the said location cmpl %eax, TAGS_AREA_END // TAGS_AREA_END vs TAGS_AREA_START jbe 1f // if TAGS_AREA_END<=TAGS_AREA_START, no need for WK_unpack_2bits movl %edx, %ecx // %ecx -> tempTagsArray[0] xorl %esi, %esi // i=0 movl $50529027, %ebx // 0x03030303, mask to extract 4 2-bit tags .align 4,0x90 L_WK_unpack_2bits: movl 16(%edi,%esi,4), %edx // src_buf[i] for 16 tags, 16 (WKdm header) movl %edx, %eax // w = src_buf[i] andl %ebx, %eax // 1st 4 tags, each in bytes movl %eax, (%ecx) // save 1st 4 tags movl %edx, %eax // w = src_buf[i] shrl $2, %eax // shift down 2 bits andl %ebx, %eax // 2nd 4 tags, each in bytes movl %eax, 4(%ecx) // save 2nd 4 tags shrl $4, %edx // shift down w by 4 bits movl %edx, %eax // w>>4 andl %ebx, %eax // 3rd 4 tags movl %eax, 8(%ecx) // save 3rd 4 tags shrl $2, %edx // w>>6 andl %ebx, %edx // 4th 4 tags movl %edx, 12(%ecx) // save 4th 4 tags addl $16, %ecx // point to next tempTagsArray[i*16] incl %esi // i++ cmpl $64, %esi // i vs 64 jne L_WK_unpack_2bits // repeat the loop until i==64 1: // WK_unpack_4bits(QPOS_AREA_START(src_buf), QPOS_AREA_END(src_buf), tempQPosArray); movl 8(%edi), %eax // WKdm header qpos end leal (%edi,%eax,4), %esi // QPOS_AREA_END movl 4(%edi), %eax // WKdm header qpos start leal (%edi,%eax,4), %ecx // QPOS_AREA_START cmpl %ecx, %esi // QPOS_AREA_END vs QPOS_AREA_START jbe 1f // if QPOS_AREA_END <= QPOS_AREA_START, skip WK_unpack_4bits leal tempQPosArray, %edi // tempQPosArray movl $252645135, %ebx // 0x0f0f0f0f : mask to extract 4 4-bit qpos L_WK_unpack_4bits: movl (%ecx), %eax // w movl %eax, %edx // w andl %ebx, %edx // 1st 4 qpos movl %edx, (%edi) // save 1st 4 qpos shrl $4, %eax // w>>4 andl %ebx, %eax // 2nd 4 qpos movl %eax, 4(%edi) // save 2nd 4 qpos addl $4, %ecx // point to next word w addl $8, %edi // qpos += 8 cmpl %ecx, %esi // QPOS_AREA_END vs qpos_pointer ja L_WK_unpack_4bits // repeat until qpos_pointer >= QPOS_AREA_END // WK_unpack_3_tenbits(LOW_BITS_AREA_START(src_buf), LOW_BITS_AREA_END(src_buf), tempLowBitsArray); 1: movl 8(%ebp), %edx // src_buf movl 12(%edx), %eax // LOW_BITS_AREA_END offset leal (%edx,%eax,4), %edi // LOW_BITS_AREA_END cmpl %edi, %esi // LOW_BITS_AREA_START(=QPOS_AREA_END) vs LOW_BITS_AREA_END jae 1f // if (LOW_BITS_AREA_START>=LOW_BITS_AREA_END) skip unpack_3_tenbits leal tempLowBitsArray, %ecx // tempLowBitsArray movl $1023, %ebx // 0x03ff to extact lower 10-bits .align 4,0x90 L_WK_unpack_3_tenbits: movl (%esi), %eax // w = *next_low_bits movl %eax, %edx // w andl %ebx, %edx // 1st 10-bit movl %edx, (%ecx) // save 1st 10-bit shrl $10, %eax // (w>>10) movl %eax, %edx // (w>>10) andl %ebx, %edx // 2nd 10-bit movl %edx, 4(%ecx) // save 2nd 10-bit shrl $10, %eax // (w>>20), no need to and with mask, the top 2 bits should be zero movl %eax, 8(%ecx) // save 3rd 10-bits addl $4, %esi // point to next w addl $12, %ecx // tempLowBitsArray += 3; cmpl %esi, %edi // LOW_BITS_AREA_END vs next_low_bits ja L_WK_unpack_3_tenbits // repeat until next_low_bits>=LOW_BITS_AREA_END 1: call Lhash Lhash: popl %ebx // set up %ebx for use in Hash Table loopup[ #define next_tag %esi #define next_qpos %edi movl tempTagsArray, next_tag // next_tag = tempTagsArray leal tempQPosArray, next_qpos // next_qpos = tempQPosArray movl 12(%ebp), %ecx // dest_buf addl $4, %ecx // for some reason, performance is better if we points to the next one leal tempLowBitsArray, %eax // tempLowBitsArray movl %eax, next_low_bits // next_low_bits = next_low_bits; leal -264(%ebp), %edx movl %edx, tag_area_end // tag_area_end leal dictionary_addr, %eax // dictionary starting address movl %eax, dictionary // dictionary jmp L11 .align 4,0x90 L29: jle L_ZERO_TAG cmpb $2, %al // MISS_TAG je L_MISS_TAG L_EXACT_TAG: movsbl (next_qpos),%eax // qpos = *next_qpos incl next_qpos // next_qpos++ movl dictionary, %edx // dictionary movl (%edx,%eax,4), %eax // w = dictionary[qpos] movl %eax, -4(%ecx) // *dest_buf = w .align 4,0x90 L_next: incl next_tag // next_tag++ addl $4, %ecx // dest_buf++ cmpl tag_area_end, next_tag // next_tag vs tag_area_end jae L_done // if (next_tag>=tag_area_end) L11: movzbl (next_tag), %eax // tag = *next_tag cmpb $1, %al // Partial match? jne L29 L_PARTIAL_TAG: movsbl (next_qpos),%edx // qpos = *next_qpos movl dictionary, %eax // dictionary leal (%eax,%edx,4), %edx // dict_location = &dictionary[qpos] movl %edx, -7324(%ebp) // save dict_location to release %edx incl next_qpos // next_qpos++ movl (%edx), %eax // read dictionary word andl $-1024, %eax // keep only higher 22-bits movl next_low_bits, %edx // low_bits = *next_low_bits orl (%edx), %eax // construct the new partially matched word addl $4, %edx // movl %edx, next_low_bits // next_low_bits++ movl -7324(%ebp), %edx // dict_location movl %eax, (%edx) // update *dict_location with the newly constructed word movl %eax, -4(%ecx) // *dest_buf = the newly constructed word incl next_tag // next_tag++ addl $4, %ecx // dest_buf++ cmpl tag_area_end, next_tag // next_tag vs tag_area_end jb L11 // if next_tag < tag_area_end, repeat the loop L_done: // release stack memory, restore registers, and return addl $7324, %esp popl %ebx popl %esi popl %edi leave ret #define next_full_patt -7292(%ebp) /* next_full_patt starts with initial value of TAGS_AREA_END */ .align 4,0x90 L_MISS_TAG: movl next_full_patt, %edx // next_full_patt movl (%edx), %eax // word = *next_full_patt addl $4, %edx // next_full_patt++ movl %edx, next_full_patt // save next_full_patt movl %eax, %edx // word shrl $10, %edx // word>>10 andl $255, %edx // 8-bit hash table index movsbl _hashLookupTable-Lhash(%ebx,%edx),%edx // qpos movl %eax, -88(%ebp,%edx) // dictionary[qpos] = word movl %eax, -4(%ecx) // *dest_buf = word jmp L_next // repeat the loop .align 4,0x90 L_ZERO_TAG: movl $0, -4(%ecx) // *dest_buf = 0 jmp L_next // repeat the loop #endif // __i386__ #if defined __x86_64__ .text .align 4,0x90 .globl _WKdm_decompress _WKdm_decompress: // save registers, and allocate stack memory for local variables pushq %rbp movq %rsp, %rbp pushq %r12 pushq %rbx subq $7144, %rsp movq %rsi, %r12 // dest_buf // PRELOAD_DICTIONARY; dictionary starting address : starting address -80(%rpb) movl $1, -80(%rbp) movl $1, -76(%rbp) movl $1, -72(%rbp) movl $1, -68(%rbp) movl $1, -64(%rbp) movl $1, -60(%rbp) movl $1, -56(%rbp) movl $1, -52(%rbp) movl $1, -48(%rbp) movl $1, -44(%rbp) movl $1, -40(%rbp) movl $1, -36(%rbp) movl $1, -32(%rbp) movl $1, -28(%rbp) movl $1, -24(%rbp) movl $1, -20(%rbp) // WK_unpack_2bits(TAGS_AREA_START(src_buf), TAGS_AREA_END(src_buf), tempTagsArray); leaq 272(%rdi), %r10 // TAGS_AREA_END leaq 16(%rdi), %rax // TAGS_AREA_START leaq -1280(%rbp), %rsi // tempTagsArray cmpq %rax, %r10 // TAGS_AREA_END vs TAGS_AREA_START jbe 1f // if TAGS_AREA_END <= TAGS_AREA_START, skip L_WK_unpack_2bits movq %rsi, %rcx // next_word xorl %r8d, %r8d // i = 0 .align 4,0x90 L_WK_unpack_2bits: movl 16(%rdi,%r8,4), %edx // w = *next_word movl %edx, %eax // w andl $50529027, %eax // 1st 4 tags movl %eax, (%rcx) // write 1st 4 tags movl %edx, %eax // w shrl $2, %eax // w>>2 andl $50529027, %eax // 2nd 4 tags movl %eax, 4(%rcx) // write 2nd 4 tags shrl $4, %edx // w>>4 movl %edx, %eax // w>>4 andl $50529027, %eax // 3rd 4 tags movl %eax, 8(%rcx) // write 3rd 4 tags shrl $2, %edx // w>>6 andl $50529027, %edx // 4th 4 tags movl %edx, 12(%rcx) // write 4th 4 tags addq $16, %rcx // next_tags += 16 incq %r8 // i++ cmpq $64, %r8 // i vs 64 jne L_WK_unpack_2bits // repeat loop until i==64 1: // WK_unpack_4bits(QPOS_AREA_START(src_buf), QPOS_AREA_END(src_buf), tempQPosArray); mov 8(%rdi), %eax // WKdm header qpos end leaq (%rdi,%rax,4), %r9 // QPOS_AREA_END mov 4(%rdi), %eax // WKdm header qpos start leaq (%rdi,%rax,4), %r8 // QPOS_AREA_START leaq -2480(%rbp), %rbx // tempQPosArray cmpq %r8, %r9 // QPOS_AREA_END vs QPOS_AREA_START jbe 1f // if QPOS_AREA_END <= QPOS_AREA_START, skip L_WK_unpack_4bits leaq 8(%rbx), %rcx // next_qpos L_WK_unpack_4bits: movl (%r8), %eax // w = *next_word movl %eax, %edx // w andl $252645135, %edx // 1st 4 qpos movl %edx, -8(%rcx) // write 1st 4 qpos shrl $4, %eax // w>>4 andl $252645135, %eax // 2nd 4 qpos movl %eax, -4(%rcx) // write 2nd 4 qpos addq $4, %r8 // next_word++ addq $8, %rcx // next_qpos+=8 cmpq %r8, %r9 // QPOS_AREA_END vs QPOS_AREA_START ja L_WK_unpack_4bits // repeat loop until QPOS_AREA_END <= QPOS_AREA_START 1: // WK_unpack_3_tenbits(LOW_BITS_AREA_START(src_buf), LOW_BITS_AREA_END(src_buf), tempLowBitsArray); mov 12(%rdi), %eax // LOW_BITS_AREA_END offset leaq (%rdi,%rax,4), %rdi // LOW_BITS_AREA_END leaq -7280(%rbp), %r11 // tempLowBitsArray cmpq %rdi, %r9 // LOW_BITS_AREA_START vs LOW_BITS_AREA_END jae 1f // if START>=END, skip L_WK_unpack_3_tenbits leaq 12(%r11), %rcx // next_low_bits L_WK_unpack_3_tenbits: movl (%r9), %eax // w = *next_word movl %eax, %edx // w andl $1023, %edx // 1st tenbits movl %edx, -12(%rcx) // write 1st tenbits shrl $10, %eax // w >> 10 movl %eax, %edx // w >> 10 andl $1023, %edx // 2nd tenbits movl %edx, -8(%rcx) // write 2nd tenbits shrl $10, %eax // w >> 20, 3rd tenbits movl %eax, -4(%rcx) // write 3rd tenbits addq $4, %r9 // next_word++ addq $12, %rcx // next_low_bits += 3 cmpq %r9, %rdi // LOW_BITS_AREA_END vs next_word ja L_WK_unpack_3_tenbits // repeat loop if LOW_BITS_AREA_END > next_word 1: movq %rsi, %rdi // next_tag movq %rbx, %r8 // next_qpos leaq 4(%r12), %rcx // dest_buf movq %r11, %r9 // next_low_bits leaq -80(%rbp), %r11 // dictionary leaq _hashLookupTable(%rip), %rbx // hash look up table leaq 1024(%rsi), %rsi // tag_area_end jmp L11 .align 4,0x90 L31: jle L_ZERO_TAG cmpb $2, %al // MISS_TAG je L_MISS_TAG L_EXACT_TAG: movsbq (%r8),%rax // qpos = *next_qpos incq %r8 // next_qpos++ movl (%r11,%rax,4), %eax // w = dictionary[qpos] movl %eax, -4(%rcx) // *dest_buf = w .align 4,0x90 L_next: incq %rdi // next_tag++ addq $4, %rcx // dest_buf++ cmpq %rsi, %rdi // next_tag vs tag_area_end jae L_done // if next_tag >= tag_area_end, we're done L11: movzbl (%rdi), %eax // tag = *next_tag cmpb $1, %al // partial match tag ? jne L31 L_PARTIAL_TAG: movsbq (%r8),%rdx // qpos = *next_qpos leaq (%r11,%rdx,4), %rdx // dict_location = &dictionary[qpos] incq %r8 // next_qpos++ movl (%rdx), %eax // read dictionary word andl $-1024, %eax // clear lower 10 bits orl (%r9), %eax // pad the lower 10-bits from *next_low_bits addq $4, %r9 // next_low_bits++ movl %eax, (%rdx) // *dict_location = newly formed word movl %eax, -4(%rcx) // *dest_buf = newly formed word cmpq %rsi, %rdi // compare next_tag vs tag_area_end jne L_next // repeat loop until next_tag==tag_area_end L_done: // release stack memory, restore registers, and return addq $7144, %rsp popq %rbx popq %r12 leave ret .align 4,0x90 L_MISS_TAG: movl (%r10), %eax // w = *next_full_patt addq $4, %r10 // next_full_patt++ movl %eax, %edx // w shrl $10, %edx // w>>10 movzbl %dl, %edx // 8-bit hash table index movsbq (%rbx,%rdx),%rdx // qpos movl %eax, -80(%rbp,%rdx) // dictionary[qpos] = word movl %eax, -4(%rcx) // *dest_buf = word jmp L_next // repeat the loop .align 4,0x90 L_ZERO_TAG: movl $0, -4(%rcx) // *dest_buf = 0 jmp L_next // repeat the loop #endif // --X86_64__ .globl _hashLookupTable .const .align 5 _hashLookupTable: .byte 0 .byte 52 .byte 8 .byte 56 .byte 16 .byte 12 .byte 28 .byte 20 .byte 4 .byte 36 .byte 48 .byte 24 .byte 44 .byte 40 .byte 32 .byte 60 .byte 8 .byte 12 .byte 28 .byte 20 .byte 4 .byte 60 .byte 16 .byte 36 .byte 24 .byte 48 .byte 44 .byte 32 .byte 52 .byte 56 .byte 40 .byte 12 .byte 8 .byte 48 .byte 16 .byte 52 .byte 60 .byte 28 .byte 56 .byte 32 .byte 20 .byte 24 .byte 36 .byte 40 .byte 44 .byte 4 .byte 8 .byte 40 .byte 60 .byte 32 .byte 20 .byte 44 .byte 4 .byte 36 .byte 52 .byte 24 .byte 16 .byte 56 .byte 48 .byte 12 .byte 28 .byte 16 .byte 8 .byte 40 .byte 36 .byte 28 .byte 32 .byte 12 .byte 4 .byte 44 .byte 52 .byte 20 .byte 24 .byte 48 .byte 60 .byte 56 .byte 40 .byte 48 .byte 8 .byte 32 .byte 28 .byte 36 .byte 4 .byte 44 .byte 20 .byte 56 .byte 60 .byte 24 .byte 52 .byte 16 .byte 12 .byte 12 .byte 4 .byte 48 .byte 20 .byte 8 .byte 52 .byte 16 .byte 60 .byte 24 .byte 36 .byte 44 .byte 28 .byte 56 .byte 40 .byte 32 .byte 36 .byte 20 .byte 24 .byte 60 .byte 40 .byte 44 .byte 52 .byte 16 .byte 32 .byte 4 .byte 48 .byte 8 .byte 28 .byte 56 .byte 12 .byte 28 .byte 32 .byte 40 .byte 52 .byte 36 .byte 16 .byte 20 .byte 48 .byte 8 .byte 4 .byte 60 .byte 24 .byte 56 .byte 44 .byte 12 .byte 8 .byte 36 .byte 24 .byte 28 .byte 16 .byte 60 .byte 20 .byte 56 .byte 32 .byte 40 .byte 48 .byte 12 .byte 4 .byte 44 .byte 52 .byte 44 .byte 40 .byte 12 .byte 56 .byte 8 .byte 36 .byte 24 .byte 60 .byte 28 .byte 48 .byte 4 .byte 32 .byte 20 .byte 16 .byte 52 .byte 60 .byte 12 .byte 24 .byte 36 .byte 8 .byte 4 .byte 16 .byte 56 .byte 48 .byte 44 .byte 40 .byte 52 .byte 32 .byte 20 .byte 28 .byte 32 .byte 12 .byte 36 .byte 28 .byte 24 .byte 56 .byte 40 .byte 16 .byte 52 .byte 44 .byte 4 .byte 20 .byte 60 .byte 8 .byte 48 .byte 48 .byte 52 .byte 12 .byte 20 .byte 32 .byte 44 .byte 36 .byte 28 .byte 4 .byte 40 .byte 24 .byte 8 .byte 56 .byte 60 .byte 16 .byte 36 .byte 32 .byte 8 .byte 40 .byte 4 .byte 52 .byte 24 .byte 44 .byte 20 .byte 12 .byte 28 .byte 48 .byte 56 .byte 16 .byte 60 .byte 4 .byte 52 .byte 60 .byte 48 .byte 20 .byte 16 .byte 56 .byte 44 .byte 24 .byte 8 .byte 40 .byte 12 .byte 32 .byte 28 .byte 36 .byte 24 .byte 32 .byte 12 .byte 4 .byte 20 .byte 16 .byte 60 .byte 36 .byte 28 .byte 8 .byte 52 .byte 40 .byte 48 .byte 44 .byte 56