/*
 * Speed-optimized CRC32 using slicing-by-eight algorithm
 *
 * This uses only i386 instructions, but it is optimized for i686 and later
 * (including e.g. Pentium II/III/IV, Athlon XP, and Core 2). For i586
 * (e.g. Pentium), slicing-by-four would be better, and even the C version
 * of slicing-by-eight built with gcc -march=i586 tends to be a little bit
 * better than this. Very few probably run this code on i586 or older x86
 * so this shouldn't be a problem in practice.
 *
 * Authors: Igor Pavlov (original version)
 *          Lasse Collin (AT&T syntax, PIC support, better portability)
 *
 * This file has been put into the public domain.
 * You can do whatever you want with this file.
 *
 * This code needs lzma_crc32_table, which can be created using the
 * following C code:

uint32_t lzma_crc32_table[8][256];

void
init_table(void)
{
	// IEEE-802.3
	static const uint32_t poly32 = UINT32_C(0xEDB88320);

	// Castagnoli
	// static const uint32_t poly32 = UINT32_C(0x82F63B78);

	// Koopman
	// static const uint32_t poly32 = UINT32_C(0xEB31D82E);

	for (size_t s = 0; s < 8; ++s) {
		for (size_t b = 0; b < 256; ++b) {
			uint32_t r = s == 0 ? b : lzma_crc32_table[s - 1][b];

			for (size_t i = 0; i < 8; ++i) {
				if (r & 1)
					r = (r >> 1) ^ poly32;
				else
					r >>= 1;
			}

			lzma_crc32_table[s][b] = r;
		}
	}
}

 * The prototype of the CRC32 function:
 * extern uint32_t lzma_crc32(const uint8_t *buf, size_t size, uint32_t crc);
 */

/*
 * On some systems, the functions need to be prefixed. The prefix is
 * usually an underscore.
 *
 * MAKE_SYM goes through MAKE_SYM_CAT so that __USER_LABEL_PREFIX__ is
 * macro-expanded before it is pasted onto the symbol name.
 */
#ifndef __USER_LABEL_PREFIX__
#	define __USER_LABEL_PREFIX__
#endif
#define MAKE_SYM_CAT(prefix, sym) prefix ## sym
#define MAKE_SYM(prefix, sym) MAKE_SYM_CAT(prefix, sym)
#define LZMA_CRC32 MAKE_SYM(__USER_LABEL_PREFIX__, lzma_crc32)
#define LZMA_CRC32_TABLE MAKE_SYM(__USER_LABEL_PREFIX__, lzma_crc32_table)

/*
 * Solaris assembler doesn't have .p2align, and Darwin uses .align
 * differently than GNU/Linux and Solaris.
 *
 * ALIGN takes the same alignment twice: as a power-of-two exponent (pow2)
 * and as a byte count (abs). Each platform picks the form its .align
 * directive expects.
 */
#if defined(__APPLE__) || defined(__MSDOS__)
#	define ALIGN(pow2, abs) .align pow2
#else
#	define ALIGN(pow2, abs) .align abs
#endif

	.text
	.globl	LZMA_CRC32

#if !defined(__APPLE__) && !defined(_WIN32) && !defined(__CYGWIN__) \
		&& !defined(__MSDOS__)
	.type	LZMA_CRC32, @function
#endif

	ALIGN(4, 16)
LZMA_CRC32:
	/*
	 * uint32_t lzma_crc32(const uint8_t *buf, size_t size, uint32_t crc)
	 *
	 * All three arguments are read from the stack and the result is
	 * left in %eax. The work is done in three phases:
	 *   1. .L_align:          one byte at a time until buf is aligned
	 *                         to eight bytes
	 *   2. .L_slice/.L_loop:  eight bytes per iteration
	 *   3. .L_rest:           one byte at a time for what is left
	 * If fewer than 16 bytes remain at any point during phase 1, the
	 * code jumps directly to phase 3.
	 *
	 * Register usage:
	 * %eax crc
	 * %esi buf
	 * %edi size or buf + size
	 * %ebx lzma_crc32_table
	 * %ebp Table index
	 * %ecx Temporary
	 * %edx Temporary
	 */
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	pushl	%ebp

	/*
	 * Four registers (16 bytes) were pushed on top of the return
	 * address, so the first argument is at 0x14(%esp).
	 */
	movl	0x14(%esp), %esi /* buf */
	movl	0x18(%esp), %edi /* size */
	movl	0x1C(%esp), %eax /* crc */

	/*
	 * Store the address of lzma_crc32_table to %ebx. This is needed to
	 * get position-independent code (PIC).
	 *
	 * The PIC macro is defined by libtool, while __PIC__ is defined
	 * by GCC but only on some systems. Testing for both makes it simpler
	 * to test this code without libtool, and keeps the code working also
	 * when built with libtool but using something else than GCC.
	 *
	 * I understood that libtool may define PIC on Windows even though
	 * the code in Windows DLLs is not PIC in sense that it is in ELF
	 * binaries, so we need a separate check to always use the non-PIC
	 * code on Windows.
	 */
#if (!defined(PIC) && !defined(__PIC__)) \
		|| (defined(_WIN32) || defined(__CYGWIN__))
	/* Not PIC */
	movl	$ LZMA_CRC32_TABLE, %ebx
#elif defined(__APPLE__)
	/* Mach-O */
	call	.L_get_pc
.L_pic:
	leal	.L_lzma_crc32_table$non_lazy_ptr-.L_pic(%ebx), %ebx
	movl	(%ebx), %ebx
#else
	/* ELF */
	call	.L_get_pc
	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
	movl	LZMA_CRC32_TABLE@GOT(%ebx), %ebx
#endif

	/* Complement the initial value. */
	notl	%eax

	ALIGN(4, 16)
.L_align:
	/*
	 * Check if there is enough input to use slicing-by-eight.
	 * We need 16 bytes, because the loop pre-reads eight bytes.
	 */
	cmpl	$16, %edi
	jb	.L_rest

	/* Check if we have reached alignment of eight bytes. */
	testl	$7, %esi
	jz	.L_slice

	/* Calculate CRC of the next input byte. */
	movzbl	(%esi), %ebp
	incl	%esi
	movzbl	%al, %ecx
	xorl	%ecx, %ebp
	shrl	$8, %eax
	xorl	(%ebx, %ebp, 4), %eax
	decl	%edi
	jmp	.L_align

	ALIGN(2, 4)
.L_slice:
	/*
	 * If we get here, there's at least 16 bytes of aligned input
	 * available. Make %edi multiple of eight bytes. Store the possible
	 * remainder over the "size" variable in the argument stack.
	 */
	movl	%edi, 0x18(%esp)
	andl	$-8, %edi
	subl	%edi, 0x18(%esp)

	/*
	 * Let %edi be buf + size - 8 while running the main loop. This way
	 * we can compare for equality to determine when to exit the loop.
	 */
	addl	%esi, %edi
	subl	$8, %edi

	/*
	 * Read in the first eight aligned bytes. The first four are XORed
	 * straight into the CRC in %eax, the next four are kept in %ecx.
	 */
	xorl	(%esi), %eax
	movl	4(%esi), %ecx
	movzbl	%cl, %ebp

.L_loop:
	/*
	 * Each of the eight input bytes indexes its own 256-entry table
	 * (0x400 bytes apart). The bytes in %ecx use tables 3..0 and the
	 * bytes in %eax use tables 7..4. %edx accumulates the lookups
	 * together with the next four input bytes.
	 */
	movl	0x0C00(%ebx, %ebp, 4), %edx
	movzbl	%ch, %ebp
	xorl	0x0800(%ebx, %ebp, 4), %edx
	shrl	$16, %ecx
	xorl	8(%esi), %edx
	movzbl	%cl, %ebp
	xorl	0x0400(%ebx, %ebp, 4), %edx
	movzbl	%ch, %ebp
	xorl	(%ebx, %ebp, 4), %edx
	movzbl	%al, %ebp

	/*
	 * Read the next four bytes, for which the CRC is calculated
	 * on the next iteration of the loop.
	 */
	movl	12(%esi), %ecx

	xorl	0x1C00(%ebx, %ebp, 4), %edx
	movzbl	%ah, %ebp
	shrl	$16, %eax
	xorl	0x1800(%ebx, %ebp, 4), %edx
	movzbl	%ah, %ebp
	movzbl	%al, %eax
	movl	0x1400(%ebx, %eax, 4), %eax
	addl	$8, %esi
	xorl	%edx, %eax
	xorl	0x1000(%ebx, %ebp, 4), %eax

	/* Check for end of aligned input. */
	cmpl	%edi, %esi
	movzbl	%cl, %ebp
	jne	.L_loop

	/*
	 * Process the remaining eight bytes, which the last loop iteration
	 * already loaded: the first four are XORed into the CRC in %eax
	 * and the last four are in %ecx. This is the loop body without
	 * the pre-read of further input.
	 */
	movl	0x0C00(%ebx, %ebp, 4), %edx
	movzbl	%ch, %ebp
	xorl	0x0800(%ebx, %ebp, 4), %edx
	shrl	$16, %ecx
	movzbl	%cl, %ebp
	xorl	0x0400(%ebx, %ebp, 4), %edx
	movzbl	%ch, %ebp
	xorl	(%ebx, %ebp, 4), %edx
	movzbl	%al, %ebp

	xorl	0x1C00(%ebx, %ebp, 4), %edx
	movzbl	%ah, %ebp
	shrl	$16, %eax
	xorl	0x1800(%ebx, %ebp, 4), %edx
	movzbl	%ah, %ebp
	movzbl	%al, %eax
	movl	0x1400(%ebx, %eax, 4), %eax
	addl	$8, %esi
	xorl	%edx, %eax
	xorl	0x1000(%ebx, %ebp, 4), %eax

	/* Copy the number of remaining bytes to %edi. */
	movl	0x18(%esp), %edi

.L_rest:
	/* Check for end of input. */
	testl	%edi, %edi
	jz	.L_return

	/* Calculate CRC of the next input byte. */
	movzbl	(%esi), %ebp
	incl	%esi
	movzbl	%al, %ecx
	xorl	%ecx, %ebp
	shrl	$8, %eax
	xorl	(%ebx, %ebp, 4), %eax
	decl	%edi
	jmp	.L_rest

.L_return:
	/* Complement the final value. */
	notl	%eax

	/* Restore the saved registers in reverse order of the pushes. */
	popl	%ebp
	popl	%edi
	popl	%esi
	popl	%ebx
	ret

#if defined(PIC) || defined(__PIC__)
	/*
	 * Helper for the PIC paths above: copies its own return address,
	 * i.e. the address of the instruction following the call, to %ebx.
	 */
	ALIGN(4, 16)
.L_get_pc:
	movl	(%esp), %ebx
	ret
#endif

#if defined(__APPLE__) && (defined(PIC) || defined(__PIC__))
	/* Mach-O PIC */
	.section __IMPORT,__pointers,non_lazy_symbol_pointers
.L_lzma_crc32_table$non_lazy_ptr:
	.indirect_symbol LZMA_CRC32_TABLE
	.long 0

#elif defined(_WIN32) || defined(__CYGWIN__)
#	ifdef DLL_EXPORT
	/* This is equivalent of __declspec(dllexport). */
	.section .drectve
	.ascii " -export:lzma_crc32"
#	endif

#elif !defined(__MSDOS__)
	/* ELF */
	.size	LZMA_CRC32, .-LZMA_CRC32
#endif

/*
 * This is needed to support non-executable stack. It's ugly to
 * use __FreeBSD__ and __linux__ here, but I don't know a way to detect
 * when we are using GNU assembler.
 */
#if defined(__ELF__) && (defined(__FreeBSD__) || defined(__linux__))
	.section	.note.GNU-stack,"",@progbits
#endif