1/* 2 * Copyright (c) 2000-2013 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28 29/* 30 This file contains x86_64 hand optimized implementation of WKdm memory page decompressor. 31 32 void WKdm_decompress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, __unused__ unsigned int words); 33 34 input : 35 src_buf : address of input compressed data buffer 36 dest_buf : address of output decompressed buffer 37 scratch : a 16-byte aligned 4k bytes scratch memory provided by the caller 38 words : this argument is not used in the implementation 39 40 output : 41 42 the input buffer is decompressed and the dest_buf is written with decompressed data. 43 44 Am algorithm description of the WKdm compress and bit stream format can be found in the WKdm Compress x86_64 assembly code WKdmCompress.s 45 46 The bit stream (*src_buf) consists of 47 a. 12 bytes header 48 b. 256 bytes for 1024 packed tags 49 c. (varying number of) words for new words not matched to dictionary word. 50 d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3) 51 e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1) 52 53 where the header (of 3 words) specifies the ending boundaries (in 32-bit words) from the start of the bit stream of c,d,e, respectively. 54 55 The decompressor 1st unpacking the bit stream component b/d/e into temorary buffers. Then it sequentially decodes the decompressed word as follows 56 57 for (i=0;i<1024;i++) { 58 tag = *next_tag++ 59 switch (tag) { 60 case 0 : *dest_buf++ = 0; break; 61 case 1 : dict_word = dictionary[*dict_index]; dictionary[*dict_index++] = *dest_buf++ = dict_word&0xfffffc00 | *LowBits++; break; 62 case 2 : x = *new_word++; k = (x>>10)&255; k = hashTable[k]; dictionary[k] = *dest_buf++ = x; break; 63 case 3 : *dest_buf++ = dictionary[*dict_index++]; break; 64 } 65 66 cclee, 11/30/12 67*/ 68 69 .text 70 71 .globl _WKdm_decompress_new 72_WKdm_decompress_new: 73 74 // save registers, and allocate stack memory for local variables 75 76 pushq %rbp 77 movq %rsp, %rbp 78 pushq %r12 79 pushq %r13 80 pushq %rbx 81 82 subq $(64+8+16), %rsp 83 84 movq %rsi, %r12 // dest_buf 85 movq %rdx, %r13 // scracht_buf 86 87 // PRELOAD_DICTONARY; dictionary starting address : starting address 0(%rsp) 88#if 1 89 movl $1, 0(%rsp) 90 movl $1, 4(%rsp) 91 movl $1, 8(%rsp) 92 movl $1, 12(%rsp) 93 movl $1, 16(%rsp) 94 movl $1, 20(%rsp) 95 movl $1, 24(%rsp) 96 movl $1, 28(%rsp) 97 movl $1, 32(%rsp) 98 movl $1, 36(%rsp) 99 movl $1, 40(%rsp) 100 movl $1, 44(%rsp) 101 movl $1, 48(%rsp) 102 movl $1, 52(%rsp) 103 movl $1, 56(%rsp) 104 movl $1, 60(%rsp) 105#else 106 mov $0x100000001, %rax 107 mov %rax, (%rsp) 108 mov %rax, 8(%rsp) 109 mov %rax, 16(%rsp) 110 mov %rax, 24(%rsp) 111 mov %rax, 32(%rsp) 112 mov %rax, 40(%rsp) 113 mov %rax, 48(%rsp) 114 mov %rax, 56(%rsp) 115#endif 116 117 // WK_unpack_2bits(TAGS_AREA_START(src_buf), TAGS_AREA_END(src_buf), tempTagsArray); 118 119 leaq 268(%rdi), %r10 // TAGS_AREA_END 120 leaq 12(%rdi), %rax // TAGS_AREA_START 121 movq %r13, %rsi // tempTagsArray 122 cmpq %rax, %r10 // TAGS_AREA_END vs TAGS_AREA_START 123 jbe 1f // if TAGS_AREA_END <= TAGS_AREA_START, skip L_WK_unpack_2bits 124 movq %r13, %rcx // next_word 125 xorl %r8d, %r8d // i = 0 126 mov $(50529027<<32)+50529027, %r9 127L_WK_unpack_2bits: 128 movl 12(%rdi,%r8, 4), %eax 129 movl 12(%rdi,%r8, 4), %edx 130 shrl $2, %eax 131 shlq $32, %rax 132 orq %rdx, %rax 133 movq %rax, %rdx 134 shrq $4, %rax 135 andq %r9, %rdx 136 andq %r9, %rax 137 incq %r8 // i++ 138 movq %rdx, (%rcx) 139 movq %rax, 8(%rcx) 140 addq $16, %rcx // next_tags += 16 141 cmpq $64, %r8 // i vs 64 142 jne L_WK_unpack_2bits // repeat loop until i==64 1431: 144 145 146 // WK_unpack_4bits(QPOS_AREA_START(src_buf), QPOS_AREA_END(src_buf), tempQPosArray); 147 148 mov 4(%rdi), %eax // WKdm header qpos end 149 leaq (%rdi,%rax,4), %r9 // QPOS_AREA_END 150 mov 0(%rdi), %eax // WKdm header qpos start 151 leaq (%rdi,%rax,4), %r8 // QPOS_AREA_START 152 leaq 1024(%r13), %rbx // tempQPosArray 153 cmpq %r8, %r9 // QPOS_AREA_END vs QPOS_AREA_START 154 jbe 1f // if QPOS_AREA_END <= QPOS_AREA_START, skip L_WK_unpack_4bits 155 leaq 8(%rbx), %rcx // next_qpos 156 157 mov $(252645135<<32)+252645135, %r11 158L_WK_unpack_4bits: 159 movl (%r8), %eax // w = *next_word 160 movl %eax, %edx // w 161 shlq $28, %rax 162 orq %rdx, %rax 163 addq $4, %r8 // next_word++ 164 andq %r11, %rax 165 movq %rax, -8(%rcx) 166 addq $8, %rcx // next_qpos+=8 167 cmpq %r8, %r9 // QPOS_AREA_END vs QPOS_AREA_START 168 ja L_WK_unpack_4bits // repeat loop until QPOS_AREA_END <= QPOS_AREA_START 169 170 1711: 172 173 // WK_unpack_3_tenbits(LOW_BITS_AREA_START(src_buf), LOW_BITS_AREA_END(src_buf), tempLowBitsArray); 174 175 movl 8(%rdi), %eax // LOW_BITS_AREA_END offset 176 leaq (%rdi,%rax,4), %rdi // LOW_BITS_AREA_END 177 leaq 2048(%r13), %r11 // tempLowBitsArray 178 leaq 4094(%r13), %r13 // final tenbits addr 179 sub %r9, %rdi // LOW_BITS_AREA_START vs LOW_BITS_AREA_END 180 jle 1f // if START>=END, skip L_WK_unpack_3_tenbits 181 movq %r11, %rcx // next_low_bits 182L_WK_unpack_3_tenbits: 183 movl (%r9), %eax // w = *next_word, 0:c:b:a 184 movl $(1023<<10), %edx 185 movl $(1023<<20), %r8d 186 andl %eax, %edx // b << 10 187 andl %eax, %r8d // c << 20 188 andq $1023, %rax 189 shll $6, %edx 190 shlq $12, %r8 191 orl %edx, %eax 192 orq %r8, %rax 193 cmp %r13, %rcx 194 je 2f 195 mov %rax, (%rcx) 196 jmp 3f 1972: mov %ax, (%rcx) 1983: 199 addq $4, %r9 // next_word++ 200 addq $6, %rcx // next_low_bits += 3 201 sub $4, %rdi 202 jg L_WK_unpack_3_tenbits // repeat loop if LOW_BITS_AREA_END > next_word 2031: 204 205 206 #define next_qpos %rbx 207 #define hash %r8 208 #define tags_counter %edi 209 #define dest_buf %r12 210 #define next_full_patt %r10 211 212 leaq _hashLookupTable_new(%rip), hash // hash look up table 213 movl $1024, tags_counter // tags_counter 214 jmp L_next 215 216 .align 4,0x90 217L_nonpartital: 218 jl L_ZERO_TAG 219 cmpb $2, -1(%rsi) 220 je L_MISS_TAG 221 222L_EXACT_TAG: 223 movzbl (next_qpos), %eax // qpos = *next_qpos 224 incq next_qpos // next_qpos++ 225 decl tags_counter // tags_counter-- 226 movl (%rsp,%rax,4), %eax // w = dictionary[qpos] 227 movl %eax, -4(dest_buf) // *dest_buf = w 228 je L_done 229 230L_next: 231 incq %rsi // next_tag++ 232 addq $4, dest_buf 233 cmpb $1, -1(%rsi) 234 jne L_nonpartital 235 236L_PARTIAL_TAG: 237 movzbl (next_qpos),%edx // qpos = *next_qpos 238 incq next_qpos // next_qpos++ 239 movl (%rsp,%rdx,4), %eax // read dictionary word 240 andl $-1024, %eax // clear lower 10 bits 241 or (%r11), %ax // pad the lower 10-bits from *next_low_bits 242 addq $2, %r11 // next_low_bits++ 243 decl tags_counter // tags_counter-- 244 movl %eax, (%rsp,%rdx,4) // *dict_location = newly formed word 245 movl %eax, -4(dest_buf) // *dest_buf = newly formed word 246 jg L_next // repeat loop until next_tag==tag_area_end 247 248L_done: 249 250 // release stack memory, restore registers, and return 251 252 addq $(64+8+16), %rsp 253 popq %rbx 254 popq %r13 255 popq %r12 256 leave 257 ret 258 259 .align 4,0x90 260L_MISS_TAG: 261 movl (next_full_patt), %edx // w = *next_full_patt 262 movl (next_full_patt), %eax // w = *next_full_patt 263 shrl $10, %edx // w>>10 264 addq $4, next_full_patt // next_full_patt++ 265 movzbl %dl, %edx // 8-bit hash table index 266 movl %eax, -4(dest_buf) // *dest_buf = word 267 movzbl (hash,%rdx),%edx // qpos 268 decl tags_counter // tags_counter-- 269 movl %eax, (%rsp,%rdx) // dictionary[qpos] = word 270 jg L_next // repeat the loop 271 jmp L_done 272 273 .align 4,0x90 274L_ZERO_TAG: 275 decl tags_counter // tags_counter-- 276 movl $0, -4(dest_buf) // *dest_buf = 0 277 jg L_next // repeat the loop 278 jmp L_done 279 280 281