/*
 * Speed-optimized CRC64 using slicing-by-four algorithm
 *
 * This uses only i386 instructions, but it is optimized for i686 and later
 * (including e.g. Pentium II/III/IV, Athlon XP, and Core 2).
 *
 * Authors: Igor Pavlov (original CRC32 assembly code)
 *          Lasse Collin (CRC64 adaptation of the modified CRC32 code)
 *
 * This file has been put into the public domain.
 * You can do whatever you want with this file.
 *
 * This code needs lzma_crc64_table, which can be created using the
 * following C code:

#include <stddef.h>
#include <stdint.h>

uint64_t lzma_crc64_table[4][256];

void
init_table(void)
{
	// ECMA-182
	static const uint64_t poly64 = UINT64_C(0xC96C5795D7870F42);

	for (size_t s = 0; s < 4; ++s) {
		for (size_t b = 0; b < 256; ++b) {
			uint64_t r = s == 0 ? b : lzma_crc64_table[s - 1][b];

			for (size_t i = 0; i < 8; ++i) {
				if (r & 1)
					r = (r >> 1) ^ poly64;
				else
					r >>= 1;
			}

			lzma_crc64_table[s][b] = r;
		}
	}
}

 * The prototype of the CRC64 function:
 * extern uint64_t lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc);
 */
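
/*
 * For illustration, here is a minimal sketch of how lzma_crc64 might be
 * called from C. The checksum_example function is made up for this
 * example; it assumes lzma_crc64_table has already been initialized as
 * shown above. Because the function complements the CRC value both on
 * entry and on return, a new CRC starts from 0, and a previous return
 * value can be passed back in to continue the CRC over multiple buffers:

#include <stddef.h>
#include <stdint.h>

extern uint64_t lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc);

uint64_t
checksum_example(void)
{
	const uint8_t *msg = (const uint8_t *)"123456789";

	// One call over the whole buffer...
	uint64_t crc = lzma_crc64(msg, 9, 0);

	// ...matches two chained calls over its halves.
	uint64_t crc2 = lzma_crc64(msg, 4, 0);
	crc2 = lzma_crc64(msg + 4, 5, crc2);

	// Both should equal 0x995DC9BBDF1939FA, the CRC-64/XZ
	// check value for "123456789".
	return crc == crc2 ? crc : 0;
}

 */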

/*
 * On some systems, the functions need to be prefixed. The prefix is
 * usually an underscore.
 */
#ifndef __USER_LABEL_PREFIX__
#	define __USER_LABEL_PREFIX__
#endif
#define MAKE_SYM_CAT(prefix, sym) prefix ## sym
#define MAKE_SYM(prefix, sym) MAKE_SYM_CAT(prefix, sym)
#define LZMA_CRC64 MAKE_SYM(__USER_LABEL_PREFIX__, lzma_crc64)
#define LZMA_CRC64_TABLE MAKE_SYM(__USER_LABEL_PREFIX__, lzma_crc64_table)

/*
 * Solaris assembler doesn't have .p2align, and Darwin uses .align
 * differently than GNU/Linux and Solaris.
 */
#if defined(__APPLE__) || defined(__MSDOS__)
#	define ALIGN(pow2, abs) .align pow2
#else
#	define ALIGN(pow2, abs) .align abs
#endif

	.text
	.globl	LZMA_CRC64

#if !defined(__APPLE__) && !defined(_WIN32) && !defined(__CYGWIN__) \
		&& !defined(__MSDOS__)
	.type	LZMA_CRC64, @function
#endif

	ALIGN(4, 16)
LZMA_CRC64:
	/*
	 * Register usage:
	 * %eax	crc LSB
	 * %edx	crc MSB
	 * %esi	buf
	 * %edi	size or buf + size
	 * %ebx	lzma_crc64_table
	 * %ebp	Table index
	 * %ecx	Temporary
	 */
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	pushl	%ebp
	movl	0x14(%esp), %esi	/* buf */
	movl	0x18(%esp), %edi	/* size */
	movl	0x1C(%esp), %eax	/* crc LSB */
	movl	0x20(%esp), %edx	/* crc MSB */

	/*
	 * Store the address of lzma_crc64_table to %ebx. This is needed to
	 * get position-independent code (PIC).
	 *
	 * The PIC macro is defined by libtool, while __PIC__ is defined
	 * by GCC but only on some systems. Testing for both makes it simpler
	 * to test this code without libtool, and keeps the code working also
	 * when built with libtool but using something else than GCC.
	 *
	 * I understood that libtool may define PIC on Windows even though
	 * the code in Windows DLLs is not PIC in the sense that it is in ELF
	 * binaries, so we need a separate check to always use the non-PIC
	 * code on Windows.
	 */
#if (!defined(PIC) && !defined(__PIC__)) \
		|| (defined(_WIN32) || defined(__CYGWIN__))
	/* Not PIC */
	movl	$ LZMA_CRC64_TABLE, %ebx
#elif defined(__APPLE__)
	/* Mach-O */
	call	.L_get_pc
.L_pic:
	leal	.L_lzma_crc64_table$non_lazy_ptr-.L_pic(%ebx), %ebx
	movl	(%ebx), %ebx
#else
	/* ELF */
	call	.L_get_pc
	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
	movl	LZMA_CRC64_TABLE@GOT(%ebx), %ebx
#endif

	/* Complement the initial value. */
	notl	%eax
	notl	%edx

.L_align:
	/*
	 * Check if there is enough input to use slicing-by-four.
	 * We need eight bytes, because the loop pre-reads four bytes.
	 */
	cmpl	$8, %edi
	jb	.L_rest

	/* Check if we have reached alignment of four bytes. */
	testl	$3, %esi
	jz	.L_slice

	/* Calculate CRC of the next input byte. */
	movzbl	(%esi), %ebp
	incl	%esi
	movzbl	%al, %ecx
	xorl	%ecx, %ebp
	shrdl	$8, %edx, %eax
	xorl	(%ebx, %ebp, 8), %eax
	shrl	$8, %edx
	xorl	4(%ebx, %ebp, 8), %edx
	decl	%edi
	jmp	.L_align

.L_slice:
	/*
	 * If we get here, there's at least eight bytes of aligned input
	 * available. Make %edi a multiple of four bytes. Store the possible
	 * remainder over the "size" variable in the argument stack.
	 */
	movl	%edi, 0x18(%esp)
	andl	$-4, %edi
	subl	%edi, 0x18(%esp)

	/*
	 * Let %edi be buf + size - 4 while running the main loop. This way
	 * we can compare for equality to determine when to exit the loop.
	 */
	addl	%esi, %edi
	subl	$4, %edi

	/* Read in the first four aligned bytes. */
	movl	(%esi), %ecx
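
	/*
	 * For clarity, one iteration of .L_loop below is roughly
	 * equivalent to this C sketch (a hypothetical helper, not part
	 * of this file; it needs <stdint.h> and <string.h>, and assumes
	 * a little-endian host, which x86 guarantees):
	 *
	 *	static uint64_t
	 *	crc64_slice4_step(uint64_t crc, const uint8_t *buf)
	 *	{
	 *		uint32_t w;
	 *		memcpy(&w, buf, 4);	// the pre-read movl
	 *		w ^= (uint32_t)crc;	// xorl %eax, %ecx
	 *		return (crc >> 32)
	 *		    ^ lzma_crc64_table[3][w & 0xFF]
	 *		    ^ lzma_crc64_table[2][(w >> 8) & 0xFF]
	 *		    ^ lzma_crc64_table[1][(w >> 16) & 0xFF]
	 *		    ^ lzma_crc64_table[0][w >> 24];
	 *	}
	 *
	 * %eax holds the low and %edx the high 32 bits of the CRC. The
	 * offsets 0x1800, 0x1000, and 0x0800 select lzma_crc64_table[3],
	 * [2], and [1]; each sub-table is 256 * 8 = 0x800 bytes.
	 */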

.L_loop:
	xorl	%eax, %ecx
	movzbl	%cl, %ebp
	movl	0x1800(%ebx, %ebp, 8), %eax
	xorl	%edx, %eax
	movl	0x1804(%ebx, %ebp, 8), %edx
	movzbl	%ch, %ebp
	xorl	0x1000(%ebx, %ebp, 8), %eax
	xorl	0x1004(%ebx, %ebp, 8), %edx
	shrl	$16, %ecx
	movzbl	%cl, %ebp
	xorl	0x0800(%ebx, %ebp, 8), %eax
	xorl	0x0804(%ebx, %ebp, 8), %edx
	movzbl	%ch, %ebp
	addl	$4, %esi
	xorl	(%ebx, %ebp, 8), %eax
	xorl	4(%ebx, %ebp, 8), %edx

	/* Check for end of aligned input. */
	cmpl	%edi, %esi

	/*
	 * Copy the next four input bytes to %ecx. It is slightly faster
	 * to read them here than at the top of the loop.
	 */
	movl	(%esi), %ecx
	jb	.L_loop

	/*
	 * Process the remaining four bytes, which we have already
	 * copied to %ecx.
	 */
	xorl	%eax, %ecx
	movzbl	%cl, %ebp
	movl	0x1800(%ebx, %ebp, 8), %eax
	xorl	%edx, %eax
	movl	0x1804(%ebx, %ebp, 8), %edx
	movzbl	%ch, %ebp
	xorl	0x1000(%ebx, %ebp, 8), %eax
	xorl	0x1004(%ebx, %ebp, 8), %edx
	shrl	$16, %ecx
	movzbl	%cl, %ebp
	xorl	0x0800(%ebx, %ebp, 8), %eax
	xorl	0x0804(%ebx, %ebp, 8), %edx
	movzbl	%ch, %ebp
	addl	$4, %esi
	xorl	(%ebx, %ebp, 8), %eax
	xorl	4(%ebx, %ebp, 8), %edx

	/* Copy the number of remaining bytes to %edi. */
	movl	0x18(%esp), %edi

.L_rest:
	/* Check for end of input. */
	testl	%edi, %edi
	jz	.L_return

	/* Calculate CRC of the next input byte. */
	movzbl	(%esi), %ebp
	incl	%esi
	movzbl	%al, %ecx
	shrdl	$8, %edx, %eax
	xorl	%ecx, %ebp
	xorl	(%ebx, %ebp, 8), %eax
	shrl	$8, %edx
	xorl	4(%ebx, %ebp, 8), %edx
	decl	%edi
	jmp	.L_rest

.L_return:
	/* Complement the final value. */
	notl	%eax
	notl	%edx

	popl	%ebp
	popl	%edi
	popl	%esi
	popl	%ebx
	ret

#if defined(PIC) || defined(__PIC__)
	ALIGN(4, 16)
.L_get_pc:
	movl	(%esp), %ebx
	ret
#endif

#if defined(__APPLE__) && (defined(PIC) || defined(__PIC__))
	/* Mach-O PIC */
	.section __IMPORT,__pointers,non_lazy_symbol_pointers
.L_lzma_crc64_table$non_lazy_ptr:
	.indirect_symbol LZMA_CRC64_TABLE
	.long 0

#elif defined(_WIN32) || defined(__CYGWIN__)
#	ifdef DLL_EXPORT
	/* This is the equivalent of __declspec(dllexport). */
	.section .drectve
	.ascii " -export:lzma_crc64"
#	endif

#elif !defined(__MSDOS__)
	/* ELF */
	.size	LZMA_CRC64, .-LZMA_CRC64
#endif

/*
 * This is needed to support non-executable stack. It's ugly to
 * use __linux__ here, but I don't know a way to detect when
 * we are using the GNU assembler.
 */
#if defined(__ELF__) && (defined(__FreeBSD__) || defined(__linux__))
	.section	.note.GNU-stack,"",@progbits
#endif
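
/*
 * For reference, the byte-at-a-time update used in the .L_align and
 * .L_rest loops above corresponds to this one-line C sketch (using the
 * same lzma_crc64_table described at the top of this file):
 *
 *	crc = lzma_crc64_table[0][(crc ^ *buf++) & 0xFF] ^ (crc >> 8);
 *
 * The shrdl/shrl pair implements the 64-bit right shift of the
 * %edx:%eax register pair.
 */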