1/* 2 * Copyright (c) 2000-2013 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28 29/* 30 This file contains x86_64 hand optimized implementation of WKdm memory page compressor. 31 32 int WKdm_compress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes_budget); 33 34 input : 35 src_buf : address of input page (length = 1024 words) 36 dest_buf : address of output buffer (may not be 16-byte aligned) 37 scratch : a 16-byte aligned 4k bytes scratch memory provided by the caller, 38 bytes_budget : a given byte target in compression 39 40 output : 41 42 if the input buffer can be compressed within the given byte budget, the dest_buf is written with compressed data and the function returns with number of bytes for the compressed data 43 o.w., the function returns -1 to signal that the input data can not be compressed with the given byte budget. 44 During the scan and tag process, each word that can not be compressed will be written to dest_buf, followed by a 12-bytes header + 256-bytes tag area. 45 When the functions returns -1, dest_buf is filled with all those words that can not be compressed and should be considered undefined. 46 The worst-case scenario is that all words can not be compressed. Hence, the minimum size requirement for dest_buf should be 12+256+4096 = 4364 bytes to prevent from memory fault. 47 48 The 4th argument bytes_budget is the target compress budget in bytes. 49 Should the input page can be compressed within the budget, the compressed data is written to *dest_buf, and the function returns the number of compressed bytes. 50 Otherwise, the function returns -1 (to signal to the caller that the page can not be compressed). 51 52 WKdm Compression algorithm is briefly stated as follows: 53 54 There is a dynamically updated dictionary consisting of 16 words. Each dictionary word is initialized to 1 at the point of entry to the function. 55 For a nonzero input word x, its 8-bits (10-bits scaled up) is used to determine a corresponding word from the dictionary, represented by dict_index (4-bits) and dict_word (32-bits). 56 a. k = (x>>10)&255; // 8-bit hash table index 57 b. dict_index = hashTable[k]; // 4-bit dictionary index, hashTable[] is fixed 58 c. dict_word = dictionary[dict_index]; // 32-bit dictionary word, dictionary[] is dynamically updated 59 60 Each input word x is classified/tagged into 4 classes : 61 0 : x = 0 62 1 : (x>>10) == (dict_word>>10), bits 10:31 of the input word match a dictionary word 63 2 : (x>>10) != (dict_word>>10), the above condition (22 higher bits matched) is not met, meaning a dictionary miss 64 3 : (x == dict_word), the exact input word is in the dictionary 65 66 For each class, different numbers of bits are needed for the decompressor to reproduce the original input word. 67 0 : 2-bits tag (32->2 compression) 68 1 : 2-bits tag + 4-bits dict_index + 10-bits lower bits (32->16 compression) 69 2 : 2-bits tag + 32-bits new word (32->34 expansion) 70 3 : 2-bits tag + 4-bits dict_index (32->6 compression) 71 72 It is obvious now that WKdm compress algorithm works well for pages where there are lots of zero words (32->2) and/or there are freqeunt repeats of some word patterns (32->6). 73 74 the output bit stream (*dest_buf) consists of 75 a. 12 bytes header 76 b. 256 bytes for 1024 packed tags 77 c. (varying number of) words for new words not matched to dictionary word. 78 d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3) 79 e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1) 80 81 the header is actually of 3 words that specify the ending offset (in 32-bit words) from the start of the bit stream of c,d,e, respectively. 82 Note that there might be padding bits in d (if the number of dict_indices does not divide by 8), and there are 2/12/22 padding bits for packing 3/2/1 low 10-bits in a 32-bit word. 83 84 85 The WKdm compress algorithm 1st runs a scan and classification pass, tagging and write unpacked data into temporary buffers. It follows by packing those data into the output buffer. 86 87 The temp buffers are 88 89 uint8_t tempTagsArray[1024]; // temporary saving for tags before final packing 90 uint8_t tempQPosArray[1024]; // temporary saving for dict_indices before final packing 91 uint16_t tempLowBitsArray[1024]; // temporary saving for partially matched lower 10 bits before final packing 92 93 Since the new words (that can not matched fully or partially to the dictionary) are stored right after the header and the tags section and need no packing, we directly write them to 94 the destination buffer. 95 96 uint32_t *new_word = dest_buf+3+64; // 3 words for header, 64 words for tags, new words come right after the tags. 97 98 Now since we are given a byte budget for this compressor, we can monitor the byte usage on the fly in the scanning and tagging pass. 99 100 bytes_budget -= 12 + 256; // header and tags (1024 * 2 /8 = 256 bytes) 101 102 whenever an input word is classified as class 103 104 2 : bytes_budget-=4; if (bytes_budget<=0) exit -1; 105 106 when writing the 8 4-bits/3 10-bits, monitor bytes_budget and exit -1 when byte_budget <=0; 107 108 without showing the bit budget management, the pseudo code is given as follows: 109 110 uint8_t *tags=tempTagsArray; 111 uint8_t *dict=tempQPosArray; 112 uint8_t *partial=tempLowBitsArray; 113 114 for (i=0;i<1024;i++) { 115 x = *src_buf++; 116 if (x == 0) { // zero, 2-bits tag 117 *tags++ = 0; 118 } else { 119 120 // find dict_index and dict_word from x 121 k = (x>>10)&255; 122 dict_index = hashTable[k]; 123 dict_word = dictionary[dict_index]; 124 125 if (dict_word == x) { // exactly match 126 // 2-bits tag + 4-bits table index 127 *tags++ = 3; 128 *dict++ = dict_index; 129 } else if (((x^dict_word)>>10)==0) { // 22 higher bits matched 130 // 2-bits tag + 4-bits table index + 10-bits lower partial 131 *tags++ = 1; 132 *dict++ = dict_index; 133 *partial++ = x &0x3ff; 134 dictionary[dict_index] = x; 135 } else { // not matched 136 // 2-bits tag + 32-bits new word 137 *tags++ = 2; 138 *new_word++ = x; 139 dictionary[dict_index] = x; 140 } 141 } 142 } 143 144 after this classification/tagging pass is completed, the 3 temp buffers are packed into the output *dest_buf: 145 146 1. 1024 tags are packed into 256 bytes right after the 12-bytes header 147 2. dictionary indices (4-bits each) are packed into are right after the new words section 148 3. 3 low 10-bits are packed into a 32-bit word, this is after the dictionary indices section. 149 150 cclee, 11/30/12 151*/ 152 153 .text 154 .align 4,0x90 155 156.globl _WKdm_compress_new 157_WKdm_compress_new: 158 pushq %rbp 159 movq %rsp, %rbp 160 pushq %r15 161 pushq %r14 162 pushq %r13 163 pushq %r12 164 pushq %rbx 165 subq $(24+64), %rsp 166 167 168 #define tempTagsArray 64(%rsp) 169 #define tempLowBitsArray 72(%rsp) 170 #define next_tag %r8 171 #define next_input_word %rdi 172 #define end_of_input %r13 173 #define next_full_patt %rbx 174 #define dict_location %rcx 175 #define next_qp %r10 176 #define dictionary %rsp 177 #define scratch %r11 178 #define dest_buf %r12 179 #define hashTable %r14 180 #define tempQPosArray %r15 181 #define next_low_bits %rsi 182 #define byte_count %r9d 183 184 movq %rsi, %r12 // dest_buf 185 movq %rdx, scratch // scratch = dictionary 186 187 movq %rdx, tempTagsArray // &tempTagsArray[0] 188 movq %rdx, next_tag // next_tag always points to the one following the current tag 189 190 leaq 1024(%rdx), tempQPosArray // &tempQPosArray[0] 191 movq tempQPosArray, next_qp // next_qp 192 193 leaq 4096(%rdi), end_of_input // end_of_input = src_buf + num_input_words 194 leaq 268(%rsi), %rbx // dest_buf + [TAGS_AREA_OFFSET + (num_input_words / 16)]*4 195 196 movl %ecx, byte_count 197 subl $(12+256), byte_count // header + tags 198 jle L_budgetExhausted 199 200 // PRELOAD_DICTIONARY; 201 movl $1, 0(dictionary) 202 movl $1, 4(dictionary) 203 movl $1, 8(dictionary) 204 movl $1, 12(dictionary) 205 movl $1, 16(dictionary) 206 movl $1, 20(dictionary) 207 movl $1, 24(dictionary) 208 movl $1, 28(dictionary) 209 movl $1, 32(dictionary) 210 movl $1, 36(dictionary) 211 movl $1, 40(dictionary) 212 movl $1, 44(dictionary) 213 movl $1, 48(dictionary) 214 movl $1, 52(dictionary) 215 movl $1, 56(dictionary) 216 movl $1, 60(dictionary) 217 218 leaq 2048(%rdx), %rax // &tempLowBitsArray[0] 219 movq %rax, tempLowBitsArray // save for later reference 220 movq %rax, next_low_bits // next_low_bits 221 222 leaq _hashLookupTable_new(%rip), hashTable // hash look up table 223 jmp L_scan_loop 224 225 .align 4,0x90 226L_RECORD_ZERO: 227 movb $0, -1(next_tag) // *next_tag = ZERO; 228 addq $4, next_input_word // next_input_word++; 229 cmpq next_input_word, end_of_input // end_of_input vs next_input_word 230 jbe L_done_search 231L_scan_loop: 232 movl (next_input_word), %edx 233 incq next_tag // next_tag++ 234 testl %edx, %edx 235 je L_RECORD_ZERO // if (input_word==0) RECORD_ZERO 236 movl %edx, %eax // a copy of input_word 237 shrl $10, %eax // input_high_bits = HIGH_BITS(input_word); 238 movzbl %al, %eax // 8-bit index to the Hash Table 239 movsbq (hashTable,%rax),%rax // HASH_TO_DICT_BYTE_OFFSET(input_word) 240 leaq (dictionary, %rax), dict_location // ((char*) dictionary) + HASH_TO_DICT_BYTE_OFFSET(input_word)); 241 movl (dict_location), %eax // dict_word = *dict_location; 242 addq $4, next_input_word // next_input_word++ 243 cmpl %eax, %edx // dict_word vs input_word 244 je L_RECORD_EXACT // if identical, RECORD_EXACT 245 xorl %edx, %eax 246 shrl $10, %eax // HIGH_BITS(dict_word) 247 je L_RECORD_PARTIAL // if identical, RECORD_PARTIAL 248 249L_RECORD_MISS: 250 movl %edx, (next_full_patt) // *next_full_patt = input_word; 251 addq $4, next_full_patt // next_full_patt++ 252 movl %edx, (dict_location) // *dict_location = input_word 253 movb $2, -1(next_tag) // *next_tag = 2 for miss 254 subl $4, byte_count // fill in a new 4-bytes word 255 jle L_budgetExhausted 256 cmpq next_input_word, end_of_input // end_of_input vs next_input_word 257 ja L_scan_loop 258 259L_done_search: 260 261 // SET_QPOS_AREA_START(dest_buf,next_full_patt); 262 movq next_full_patt, %rax // next_full_patt 263 subq dest_buf, %rax // next_full_patt - dest_buf 264 sarq $2, %rax // offset in 4-bytes 265 movl %eax, %r13d // r13d = (next_full_patt - dest_buf) 266 movl %eax, 0(dest_buf) // dest_buf[0] = next_full_patt - dest_buf 267 decq next_tag 268 cmpq next_tag, tempTagsArray // &tempTagsArray[0] vs next_tag 269 jae L13 // if (&tempTagsArray[0] >= next_tag), skip the following 270 271 // boundary_tmp = WK_pack_2bits(tempTagsArray, (WK_word *) next_tag, dest_buf + HEADER_SIZE_IN_WORDS); 272 273 movq dest_buf, %rdi // dest_buf 274 movq tempTagsArray, %rcx // &tempTagsArray[0] 275 276 .align 4,0x90 277L_pack_2bits: 278 movq 8(%rcx), %rax // w3 279 addq $16, %rcx // tempTagsArray += 16; 280 shlq $4, %rax 281 addq $4, %rdi // dest_buf += 4; 282 orq -16(%rcx), %rax // w3 283 movq %rax, %rdx 284 shrq $30, %rax 285 orl %edx, %eax 286 cmpq %rcx, next_tag // cmp next_tag vs dest_buf 287 movl %eax, 8(%rdi) // save at *(dest_buf + HEADER_SIZE_IN_WORDS) 288 ja L_pack_2bits // if (next_tag > dest_buf) repeat L_pack_2bits 289 290 /* Pack the queue positions into the area just after the full words. */ 291 292L13: 293 mov next_qp, %rax // next_qp 294 sub tempQPosArray, %rax // num_bytes_to_pack = next_qp - (char *) tempQPosArray; 295 addl $7, %eax // num_bytes_to_pack+7 296 shrl $3, %eax // num_packed_words = (num_bytes_to_pack + 7) >> 3 297 298 shll $2, %eax // turn into bytes 299 subl %eax, byte_count // 300 jl L_budgetExhausted 301 shrl $1, %eax // num_source_words = num_packed_words * 2; 302 303 leaq (tempQPosArray,%rax,4), %rcx // endQPosArray = tempQPosArray + num_source_words 304 cmpq %rcx, next_qp // next_qp vs endQPosArray 305 jae L16 // if (next_qp >= endQPosArray) skip the following zero paddings 306 movq %rcx, %rax 307 subq next_qp, %rax 308 subl $4, %eax 309 jl 1f 310 .align 4,0x90 3110: movl $0, (next_qp) 312 addq $4, next_qp 313 subl $4, %eax 314 jge 0b 3151: testl $2, %eax 316 je 1f 317 movw $0, (next_qp) 318 addq $2, next_qp 3191: testl $1, %eax 320 je 1f 321 movb $0, (next_qp) 322 addq $1, next_qp 3231: 324L16: 325 movq next_full_patt, %rdi // next_full_patt 326 cmpq tempQPosArray, %rcx // endQPosArray vs tempQPosArray 327 jbe L20 // if (endQPosArray <= tempQPosArray) skip the following 328 movq tempQPosArray, %rdx // tempQPosArray 329 330 /* byte_count -= (rcx - tempQPosArray)/2 */ 331 332 .align 4,0x90 333L_pack_4bits: 334 movl 4(%rdx), %eax // src_next[1] 335 addq $8, %rdx // src_next += 2; 336 sall $4, %eax // (src_next[1] << 4) 337 addq $4, %rdi // dest_next++; 338 orl -8(%rdx), %eax // temp = src_next[0] | (src_next[1] << 4) 339 cmpq %rdx, %rcx // source_end vs src_next 340 movl %eax, -4(%rdi) // dest_next[0] = temp; 341 ja L_pack_4bits // while (src_next < source_end) repeat the loop 342 343 // SET_LOW_BITS_AREA_START(dest_buf,boundary_tmp); 344 movq %rdi, %rax // boundary_tmp 345 subq dest_buf, %rax // boundary_tmp - dest_buf 346 movq %rax, %r13 // boundary_tmp - dest_buf 347 shrq $2, %r13 // boundary_tmp - dest_buf in words 348L20: 349 movl %r13d, 4(dest_buf) // dest_buf[1] = boundary_tmp - dest_buf 350 351 movq tempLowBitsArray, %rcx // tempLowBitsArray 352 movq next_low_bits, %rbx // next_low_bits 353 subq %rcx, %rbx // next_low_bits - tempLowBitsArray (in bytes) 354 sarq $1, %rbx // num_tenbits_to_pack (in half-words) 355 356 #define size %ebx 357 358 subl $3, size // pre-decrement num_tenbits_to_pack by 3 359 jl 1f // if num_tenbits_to_pack < 3, skip the following loop 360 361 .align 4,0x90 3620: 363 movzwl 4(%rcx), %eax // w2 364 addq $6, %rcx // next w0/w1/w2 triplet 365 sall $10, %eax // w1 << 10 366 or -4(%rcx), %ax // w1 367 addq $4, %rdi // dest_buf++ 368 sall $10, %eax // w1 << 10 369 or -6(%rcx), %ax // (w0) | (w1<<10) | (w2<<20) 370 subl $4, byte_count // fill in a new 4-bytes word 371 jle L_budgetExhausted 372 subl $3, size // num_tenbits_to_pack-=3 373 movl %eax, -4(%rdi) // pack w0,w1,w2 into 1 dest_buf word 374 jge 0b // if no less than 3 elements, back to loop head 375 3761: addl $3, size // post-increment num_tenbits_to_pack by 3 377 je 3f // if num_tenbits_to_pack is a multiple of 3, skip the following 378 movzwl (%rcx), %eax // w0 379 subl $1, size // num_tenbits_to_pack-- 380 je 2f // 381 movzwl 2(%rcx), %edx // w1 382 sall $10, %edx // w1 << 10 383 orl %edx, %eax // w0 | (w1<<10) 3842: 385 subl $4, byte_count // fill in a new 4-bytes word 386 jle L_budgetExhausted 387 movl %eax, (%rdi) // write the final dest_buf word 388 addq $4, %rdi // dest_buf++ 389 3903: movq %rdi, %rax // boundary_tmp 391 subq dest_buf, %rax // boundary_tmp - dest_buf 392 shrq $2, %rax // boundary_tmp - dest_buf in terms of words 393 movl %eax, 8(dest_buf) // SET_LOW_BITS_AREA_END(dest_buf,boundary_tmp) 394 shlq $2, %rax // boundary_tmp - dest_buf in terms of bytes 395 396L_done: 397 // restore registers and return 398 addq $(24+64), %rsp 399 popq %rbx 400 popq %r12 401 popq %r13 402 popq %r14 403 popq %r15 404 leave 405 ret 406 407 .align 4 408L_budgetExhausted: 409 mov $-1, %rax 410 jmp L_done 411 412 413 .align 4,0x90 414L_RECORD_EXACT: 415 subq dictionary, %rcx // dict_location - dictionary 416 sarq $2, %rcx // divide by 4 for word offset 417 movb $3, -1(next_tag) // *next_tag = 3 for exact 418 movb %cl, (next_qp) // *next_qp = word offset (4-bit) 419 incq next_qp // next_qp++ 420 cmpq next_input_word, end_of_input // end_of_input vs next_input_word 421 ja L_scan_loop 422 jmp L_done_search 423 424 .align 4,0x90 425L_RECORD_PARTIAL: 426 movq %rcx, %rax // dict_location 427 movb $1, -1(next_tag) // *next_tag = 1 for partial matched 428 subq dictionary, %rax // dict_location - dictionary 429 movl %edx, (%rcx) // *dict_location = input_word; 430 sarq $2, %rax // offset in 32-bit word 431 movb %al, (next_qp) // update *next_qp 432 andl $1023, %edx // lower 10 bits 433 incq next_qp // next_qp++ 434 mov %dx, (next_low_bits) // save next_low_bits 435 addq $2, next_low_bits // next_low_bits++ 436 cmpq next_input_word, end_of_input // end_of_input vs next_input_word 437 ja L_scan_loop 438 jmp L_done_search 439 440