1/* crc32.c -- compute the CRC-32 of a data stream
2 * Copyright (C) 1995-2022 Mark Adler
3 * For conditions of distribution and use, see copyright notice in zlib.h
4 *
5 * This interleaved implementation of a CRC makes use of pipelined multiple
6 * arithmetic-logic units, commonly found in modern CPU cores. It is due to
7 * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
8 */
9
10/* @(#) Id */
11
12/*
13  Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
14  protection on the static variables used to control the first-use generation
15  of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
16  first call get_crc_table() to initialize the tables before allowing more than
17  one thread to use crc32().
18
19  MAKECRCH can be #defined to write out crc32.h. A main() routine is also
20  produced, so that this one source file can be compiled to an executable.
21 */
22
23#ifdef MAKECRCH
24#  include <stdio.h>
25#  ifndef DYNAMIC_CRC_TABLE
26#    define DYNAMIC_CRC_TABLE
27#  endif /* !DYNAMIC_CRC_TABLE */
28#endif /* MAKECRCH */
29
30#include "zutil.h"      /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
31
32 /*
33  A CRC of a message is computed on N braids of words in the message, where
34  each word consists of W bytes (4 or 8). If N is 3, for example, then three
35  running sparse CRCs are calculated respectively on each braid, at these
36  indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
37  This is done starting at a word boundary, and continues until as many blocks
38  of N * W bytes as are available have been processed. The results are combined
39  into a single CRC at the end. For this code, N must be in the range 1..6 and
40  W must be 4 or 8. The upper limit on N can be increased if desired by adding
41  more #if blocks, extending the patterns apparent in the code. In addition,
42  crc32.h would need to be regenerated, if the maximum N value is increased.
43
44  N and W are chosen empirically by benchmarking the execution time on a given
45  processor. The choices for N and W below were based on testing on Intel Kaby
46  Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
47  Octeon II processors. The Intel, AMD, and ARM processors were all fastest
48  with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
49  They were all tested with either gcc or clang, all using the -O3 optimization
50  level. Your mileage may vary.
51 */
52
53/* Define N */
54#ifdef Z_TESTN
55#  define N Z_TESTN
56#else
57#  define N 5
58#endif
59#if N < 1 || N > 6
60#  error N must be in 1..6
61#endif
62
63/*
64  z_crc_t must be at least 32 bits. z_word_t must be at least as long as
65  z_crc_t. It is assumed here that z_word_t is either 32 bits or 64 bits, and
66  that bytes are eight bits.
67 */
68
69/*
70  Define W and the associated z_word_t type. If W is not defined, then a
71  braided calculation is not used, and the associated tables and code are not
72  compiled.
73 */
74#ifdef Z_TESTW
75#  if Z_TESTW-1 != -1
76#    define W Z_TESTW
77#  endif
78#else
79#  ifdef MAKECRCH
80#    define W 8         /* required for MAKECRCH */
81#  else
82#    if defined(__x86_64__) || defined(__aarch64__)
83#      define W 8
84#    else
85#      define W 4
86#    endif
87#  endif
88#endif
89#ifdef W
90#  if W == 8 && defined(Z_U8)
91     typedef Z_U8 z_word_t;
92#  elif defined(Z_U4)
93#    undef W
94#    define W 4
95     typedef Z_U4 z_word_t;
96#  else
97#    undef W
98#  endif
99#endif
100
101/* Local functions. */
102local z_crc_t multmodp OF((z_crc_t a, z_crc_t b));
103local z_crc_t x2nmodp OF((z_off64_t n, unsigned k));
104
105/* If available, use the ARM processor CRC32 instruction. */
106#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8
107#  define ARMCRC32
108#endif
109
110#if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
111/*
112  Swap the bytes in a z_word_t to convert between little and big endian. Any
113  self-respecting compiler will optimize this to a single machine byte-swap
114  instruction, if one is available. This assumes that word_t is either 32 bits
115  or 64 bits.
116 */
117local z_word_t byte_swap(word)
118    z_word_t word;
119{
120#  if W == 8
121    return
122        (word & 0xff00000000000000) >> 56 |
123        (word & 0xff000000000000) >> 40 |
124        (word & 0xff0000000000) >> 24 |
125        (word & 0xff00000000) >> 8 |
126        (word & 0xff000000) << 8 |
127        (word & 0xff0000) << 24 |
128        (word & 0xff00) << 40 |
129        (word & 0xff) << 56;
130#  else   /* W == 4 */
131    return
132        (word & 0xff000000) >> 24 |
133        (word & 0xff0000) >> 8 |
134        (word & 0xff00) << 8 |
135        (word & 0xff) << 24;
136#  endif
137}
138#endif
139
140/* CRC polynomial. */
141#define POLY 0xedb88320         /* p(x) reflected, with x^32 implied */
142
143#ifdef DYNAMIC_CRC_TABLE
144
145local z_crc_t FAR crc_table[256];
146local z_crc_t FAR x2n_table[32];
147local void make_crc_table OF((void));
148#ifdef W
149   local z_word_t FAR crc_big_table[256];
150   local z_crc_t FAR crc_braid_table[W][256];
151   local z_word_t FAR crc_braid_big_table[W][256];
152   local void braid OF((z_crc_t [][256], z_word_t [][256], int, int));
153#endif
154#ifdef MAKECRCH
155   local void write_table OF((FILE *, const z_crc_t FAR *, int));
156   local void write_table32hi OF((FILE *, const z_word_t FAR *, int));
157   local void write_table64 OF((FILE *, const z_word_t FAR *, int));
158#endif /* MAKECRCH */
159
160/*
161  Define a once() function depending on the availability of atomics. If this is
162  compiled with DYNAMIC_CRC_TABLE defined, and if CRCs will be computed in
163  multiple threads, and if atomics are not available, then get_crc_table() must
164  be called to initialize the tables and must return before any threads are
165  allowed to compute or combine CRCs.
166 */
167
168/* Definition of once functionality. */
169typedef struct once_s once_t;
170local void once OF((once_t *, void (*)(void)));
171
172/* Check for the availability of atomics. */
173#if defined(__STDC__) && __STDC_VERSION__ >= 201112L && \
174    !defined(__STDC_NO_ATOMICS__)
175
176#include <stdatomic.h>
177
178/* Structure for once(), which must be initialized with ONCE_INIT. */
179struct once_s {
180    atomic_flag begun;
181    atomic_int done;
182};
183#define ONCE_INIT {ATOMIC_FLAG_INIT, 0}
184
185/*
186  Run the provided init() function exactly once, even if multiple threads
187  invoke once() at the same time. The state must be a once_t initialized with
188  ONCE_INIT.
189 */
190local void once(state, init)
191    once_t *state;
192    void (*init)(void);
193{
194    if (!atomic_load(&state->done)) {
195        if (atomic_flag_test_and_set(&state->begun))
196            while (!atomic_load(&state->done))
197                ;
198        else {
199            init();
200            atomic_store(&state->done, 1);
201        }
202    }
203}
204
205#else   /* no atomics */
206
207/* Structure for once(), which must be initialized with ONCE_INIT. */
208struct once_s {
209    volatile int begun;
210    volatile int done;
211};
212#define ONCE_INIT {0, 0}
213
214/* Test and set. Alas, not atomic, but tries to minimize the period of
215   vulnerability. */
216local int test_and_set OF((int volatile *));
217local int test_and_set(flag)
218    int volatile *flag;
219{
220    int was;
221
222    was = *flag;
223    *flag = 1;
224    return was;
225}
226
227/* Run the provided init() function once. This is not thread-safe. */
228local void once(state, init)
229    once_t *state;
230    void (*init)(void);
231{
232    if (!state->done) {
233        if (test_and_set(&state->begun))
234            while (!state->done)
235                ;
236        else {
237            init();
238            state->done = 1;
239        }
240    }
241}
242
243#endif
244
245/* State for once(). */
246local once_t made = ONCE_INIT;
247
248/*
249  Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
250  x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
251
252  Polynomials over GF(2) are represented in binary, one bit per coefficient,
253  with the lowest powers in the most significant bit. Then adding polynomials
254  is just exclusive-or, and multiplying a polynomial by x is a right shift by
255  one. If we call the above polynomial p, and represent a byte as the
256  polynomial q, also with the lowest power in the most significant bit (so the
257  byte 0xb1 is the polynomial x^7+x^3+x^2+1), then the CRC is (q*x^32) mod p,
258  where a mod b means the remainder after dividing a by b.
259
260  This calculation is done using the shift-register method of multiplying and
261  taking the remainder. The register is initialized to zero, and for each
262  incoming bit, x^32 is added mod p to the register if the bit is a one (where
263  x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by x
264  (which is shifting right by one and adding x^32 mod p if the bit shifted out
265  is a one). We start with the highest power (least significant bit) of q and
266  repeat for all eight bits of q.
267
268  The table is simply the CRC of all possible eight bit values. This is all the
269  information needed to generate CRCs on data a byte at a time for all
270  combinations of CRC register values and incoming bytes.
271 */
272
273local void make_crc_table()
274{
275    unsigned i, j, n;
276    z_crc_t p;
277
278    /* initialize the CRC of bytes tables */
279    for (i = 0; i < 256; i++) {
280        p = i;
281        for (j = 0; j < 8; j++)
282            p = p & 1 ? (p >> 1) ^ POLY : p >> 1;
283        crc_table[i] = p;
284#ifdef W
285        crc_big_table[i] = byte_swap(p);
286#endif
287    }
288
289    /* initialize the x^2^n mod p(x) table */
290    p = (z_crc_t)1 << 30;         /* x^1 */
291    x2n_table[0] = p;
292    for (n = 1; n < 32; n++)
293        x2n_table[n] = p = multmodp(p, p);
294
295#ifdef W
296    /* initialize the braiding tables -- needs x2n_table[] */
297    braid(crc_braid_table, crc_braid_big_table, N, W);
298#endif
299
300#ifdef MAKECRCH
301    {
302        /*
303          The crc32.h header file contains tables for both 32-bit and 64-bit
304          z_word_t's, and so requires a 64-bit type be available. In that case,
305          z_word_t must be defined to be 64-bits. This code then also generates
306          and writes out the tables for the case that z_word_t is 32 bits.
307         */
308#if !defined(W) || W != 8
309#  error Need a 64-bit integer type in order to generate crc32.h.
310#endif
311        FILE *out;
312        int k, n;
313        z_crc_t ltl[8][256];
314        z_word_t big[8][256];
315
316        out = fopen("crc32.h", "w");
317        if (out == NULL) return;
318
319        /* write out little-endian CRC table to crc32.h */
320        fprintf(out,
321            "/* crc32.h -- tables for rapid CRC calculation\n"
322            " * Generated automatically by crc32.c\n */\n"
323            "\n"
324            "local const z_crc_t FAR crc_table[] = {\n"
325            "    ");
326        write_table(out, crc_table, 256);
327        fprintf(out,
328            "};\n");
329
330        /* write out big-endian CRC table for 64-bit z_word_t to crc32.h */
331        fprintf(out,
332            "\n"
333            "#ifdef W\n"
334            "\n"
335            "#if W == 8\n"
336            "\n"
337            "local const z_word_t FAR crc_big_table[] = {\n"
338            "    ");
339        write_table64(out, crc_big_table, 256);
340        fprintf(out,
341            "};\n");
342
343        /* write out big-endian CRC table for 32-bit z_word_t to crc32.h */
344        fprintf(out,
345            "\n"
346            "#else /* W == 4 */\n"
347            "\n"
348            "local const z_word_t FAR crc_big_table[] = {\n"
349            "    ");
350        write_table32hi(out, crc_big_table, 256);
351        fprintf(out,
352            "};\n"
353            "\n"
354            "#endif\n");
355
356        /* write out braid tables for each value of N */
357        for (n = 1; n <= 6; n++) {
358            fprintf(out,
359            "\n"
360            "#if N == %d\n", n);
361
362            /* compute braid tables for this N and 64-bit word_t */
363            braid(ltl, big, n, 8);
364
365            /* write out braid tables for 64-bit z_word_t to crc32.h */
366            fprintf(out,
367            "\n"
368            "#if W == 8\n"
369            "\n"
370            "local const z_crc_t FAR crc_braid_table[][256] = {\n");
371            for (k = 0; k < 8; k++) {
372                fprintf(out, "   {");
373                write_table(out, ltl[k], 256);
374                fprintf(out, "}%s", k < 7 ? ",\n" : "");
375            }
376            fprintf(out,
377            "};\n"
378            "\n"
379            "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
380            for (k = 0; k < 8; k++) {
381                fprintf(out, "   {");
382                write_table64(out, big[k], 256);
383                fprintf(out, "}%s", k < 7 ? ",\n" : "");
384            }
385            fprintf(out,
386            "};\n");
387
388            /* compute braid tables for this N and 32-bit word_t */
389            braid(ltl, big, n, 4);
390
391            /* write out braid tables for 32-bit z_word_t to crc32.h */
392            fprintf(out,
393            "\n"
394            "#else /* W == 4 */\n"
395            "\n"
396            "local const z_crc_t FAR crc_braid_table[][256] = {\n");
397            for (k = 0; k < 4; k++) {
398                fprintf(out, "   {");
399                write_table(out, ltl[k], 256);
400                fprintf(out, "}%s", k < 3 ? ",\n" : "");
401            }
402            fprintf(out,
403            "};\n"
404            "\n"
405            "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
406            for (k = 0; k < 4; k++) {
407                fprintf(out, "   {");
408                write_table32hi(out, big[k], 256);
409                fprintf(out, "}%s", k < 3 ? ",\n" : "");
410            }
411            fprintf(out,
412            "};\n"
413            "\n"
414            "#endif\n"
415            "\n"
416            "#endif\n");
417        }
418        fprintf(out,
419            "\n"
420            "#endif\n");
421
422        /* write out zeros operator table to crc32.h */
423        fprintf(out,
424            "\n"
425            "local const z_crc_t FAR x2n_table[] = {\n"
426            "    ");
427        write_table(out, x2n_table, 32);
428        fprintf(out,
429            "};\n");
430        fclose(out);
431    }
432#endif /* MAKECRCH */
433}
434
435#ifdef MAKECRCH
436
437/*
438   Write the 32-bit values in table[0..k-1] to out, five per line in
439   hexadecimal separated by commas.
440 */
441local void write_table(out, table, k)
442    FILE *out;
443    const z_crc_t FAR *table;
444    int k;
445{
446    int n;
447
448    for (n = 0; n < k; n++)
449        fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
450                (unsigned long)(table[n]),
451                n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
452}
453
454/*
455   Write the high 32-bits of each value in table[0..k-1] to out, five per line
456   in hexadecimal separated by commas.
457 */
458local void write_table32hi(out, table, k)
459FILE *out;
460const z_word_t FAR *table;
461int k;
462{
463    int n;
464
465    for (n = 0; n < k; n++)
466        fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
467                (unsigned long)(table[n] >> 32),
468                n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
469}
470
471/*
472  Write the 64-bit values in table[0..k-1] to out, three per line in
473  hexadecimal separated by commas. This assumes that if there is a 64-bit
474  type, then there is also a long long integer type, and it is at least 64
475  bits. If not, then the type cast and format string can be adjusted
476  accordingly.
477 */
478local void write_table64(out, table, k)
479    FILE *out;
480    const z_word_t FAR *table;
481    int k;
482{
483    int n;
484
485    for (n = 0; n < k; n++)
486        fprintf(out, "%s0x%016llx%s", n == 0 || n % 3 ? "" : "    ",
487                (unsigned long long)(table[n]),
488                n == k - 1 ? "" : (n % 3 == 2 ? ",\n" : ", "));
489}
490
491/* Actually do the deed. */
492int main()
493{
494    make_crc_table();
495    return 0;
496}
497
498#endif /* MAKECRCH */
499
500#ifdef W
501/*
502  Generate the little and big-endian braid tables for the given n and z_word_t
503  size w. Each array must have room for w blocks of 256 elements.
504 */
505local void braid(ltl, big, n, w)
506    z_crc_t ltl[][256];
507    z_word_t big[][256];
508    int n;
509    int w;
510{
511    int k;
512    z_crc_t i, p, q;
513    for (k = 0; k < w; k++) {
514        p = x2nmodp((n * w + 3 - k) << 3, 0);
515        ltl[k][0] = 0;
516        big[w - 1 - k][0] = 0;
517        for (i = 1; i < 256; i++) {
518            ltl[k][i] = q = multmodp(i << 24, p);
519            big[w - 1 - k][i] = byte_swap(q);
520        }
521    }
522}
523#endif
524
525#else /* !DYNAMIC_CRC_TABLE */
526/* ========================================================================
527 * Tables for byte-wise and braided CRC-32 calculations, and a table of powers
528 * of x for combining CRC-32s, all made by make_crc_table().
529 */
530#include "crc32.h"
531#endif /* DYNAMIC_CRC_TABLE */
532
533/* ========================================================================
534 * Routines used for CRC calculation. Some are also required for the table
535 * generation above.
536 */
537
538/*
539  Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
540  reflected. For speed, this requires that a not be zero.
541 */
542local z_crc_t multmodp(a, b)
543    z_crc_t a;
544    z_crc_t b;
545{
546    z_crc_t m, p;
547
548    m = (z_crc_t)1 << 31;
549    p = 0;
550    for (;;) {
551        if (a & m) {
552            p ^= b;
553            if ((a & (m - 1)) == 0)
554                break;
555        }
556        m >>= 1;
557        b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
558    }
559    return p;
560}
561
562/*
563  Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
564  initialized.
565 */
566local z_crc_t x2nmodp(n, k)
567    z_off64_t n;
568    unsigned k;
569{
570    z_crc_t p;
571
572    p = (z_crc_t)1 << 31;           /* x^0 == 1 */
573    while (n) {
574        if (n & 1)
575            p = multmodp(x2n_table[k & 31], p);
576        n >>= 1;
577        k++;
578    }
579    return p;
580}
581
582/* =========================================================================
583 * This function can be used by asm versions of crc32(), and to force the
584 * generation of the CRC tables in a threaded application.
585 */
586const z_crc_t FAR * ZEXPORT get_crc_table()
587{
588#ifdef DYNAMIC_CRC_TABLE
589    once(&made, make_crc_table);
590#endif /* DYNAMIC_CRC_TABLE */
591    return (const z_crc_t FAR *)crc_table;
592}
593
594/* =========================================================================
595 * Use ARM machine instructions if available. This will compute the CRC about
596 * ten times faster than the braided calculation. This code does not check for
597 * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will
598 * only be defined if the compilation specifies an ARM processor architecture
599 * that has the instructions. For example, compiling with -march=armv8.1-a or
600 * -march=armv8-a+crc, or -march=native if the compile machine has the crc32
601 * instructions.
602 */
603#ifdef ARMCRC32
604
605/*
606   Constants empirically determined to maximize speed. These values are from
607   measurements on a Cortex-A57. Your mileage may vary.
608 */
609#define Z_BATCH 3990                /* number of words in a batch */
610#define Z_BATCH_ZEROS 0xa10d3d0c    /* computed from Z_BATCH = 3990 */
611#define Z_BATCH_MIN 800             /* fewest words in a final batch */
612
613unsigned long ZEXPORT crc32_z(crc, buf, len)
614    unsigned long crc;
615    const unsigned char FAR *buf;
616    z_size_t len;
617{
618    z_crc_t val;
619    z_word_t crc1, crc2;
620    const z_word_t *word;
621    z_word_t val0, val1, val2;
622    z_size_t last, last2, i;
623    z_size_t num;
624
625    /* Return initial CRC, if requested. */
626    if (buf == Z_NULL) return 0;
627
628#ifdef DYNAMIC_CRC_TABLE
629    once(&made, make_crc_table);
630#endif /* DYNAMIC_CRC_TABLE */
631
632    /* Pre-condition the CRC */
633    crc ^= 0xffffffff;
634
635    /* Compute the CRC up to a word boundary. */
636    while (len && ((z_size_t)buf & 7) != 0) {
637        len--;
638        val = *buf++;
639        __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
640    }
641
642    /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */
643    word = (z_word_t const *)buf;
644    num = len >> 3;
645    len &= 7;
646
647    /* Do three interleaved CRCs to realize the throughput of one crc32x
648       instruction per cycle. Each CRC is calcuated on Z_BATCH words. The three
649       CRCs are combined into a single CRC after each set of batches. */
650    while (num >= 3 * Z_BATCH) {
651        crc1 = 0;
652        crc2 = 0;
653        for (i = 0; i < Z_BATCH; i++) {
654            val0 = word[i];
655            val1 = word[i + Z_BATCH];
656            val2 = word[i + 2 * Z_BATCH];
657            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
658            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
659            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
660        }
661        word += 3 * Z_BATCH;
662        num -= 3 * Z_BATCH;
663        crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1;
664        crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2;
665    }
666
667    /* Do one last smaller batch with the remaining words, if there are enough
668       to pay for the combination of CRCs. */
669    last = num / 3;
670    if (last >= Z_BATCH_MIN) {
671        last2 = last << 1;
672        crc1 = 0;
673        crc2 = 0;
674        for (i = 0; i < last; i++) {
675            val0 = word[i];
676            val1 = word[i + last];
677            val2 = word[i + last2];
678            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
679            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
680            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
681        }
682        word += 3 * last;
683        num -= 3 * last;
684        val = x2nmodp(last, 6);
685        crc = multmodp(val, crc) ^ crc1;
686        crc = multmodp(val, crc) ^ crc2;
687    }
688
689    /* Compute the CRC on any remaining words. */
690    for (i = 0; i < num; i++) {
691        val0 = word[i];
692        __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
693    }
694    word += num;
695
696    /* Complete the CRC on any remaining bytes. */
697    buf = (const unsigned char FAR *)word;
698    while (len) {
699        len--;
700        val = *buf++;
701        __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
702    }
703
704    /* Return the CRC, post-conditioned. */
705    return crc ^ 0xffffffff;
706}
707
708#else
709
710#ifdef W
711
712/*
713  Return the CRC of the W bytes in the word_t data, taking the
714  least-significant byte of the word as the first byte of data, without any pre
715  or post conditioning. This is used to combine the CRCs of each braid.
716 */
717local z_crc_t crc_word(data)
718    z_word_t data;
719{
720    int k;
721    for (k = 0; k < W; k++)
722        data = (data >> 8) ^ crc_table[data & 0xff];
723    return (z_crc_t)data;
724}
725
726local z_word_t crc_word_big(data)
727    z_word_t data;
728{
729    int k;
730    for (k = 0; k < W; k++)
731        data = (data << 8) ^
732            crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
733    return data;
734}
735
736#endif
737
738/* ========================================================================= */
739unsigned long ZEXPORT crc32_z(crc, buf, len)
740    unsigned long crc;
741    const unsigned char FAR *buf;
742    z_size_t len;
743{
744    /* Return initial CRC, if requested. */
745    if (buf == Z_NULL) return 0;
746
747#ifdef DYNAMIC_CRC_TABLE
748    once(&made, make_crc_table);
749#endif /* DYNAMIC_CRC_TABLE */
750
751    /* Pre-condition the CRC */
752    crc ^= 0xffffffff;
753
754#ifdef W
755
756    /* If provided enough bytes, do a braided CRC calculation. */
757    if (len >= N * W + W - 1) {
758        z_size_t blks;
759        z_word_t const *words;
760        unsigned endian;
761        int k;
762
763        /* Compute the CRC up to a z_word_t boundary. */
764        while (len && ((z_size_t)buf & (W - 1)) != 0) {
765            len--;
766            crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
767        }
768
769        /* Compute the CRC on as many N z_word_t blocks as are available. */
770        blks = len / (N * W);
771        len -= blks * N * W;
772        words = (z_word_t const *)buf;
773
774        /* Do endian check at execution time instead of compile time, since ARM
775           processors can change the endianess at execution time. If the
776           compiler knows what the endianess will be, it can optimize out the
777           check and the unused branch. */
778        endian = 1;
779        if (*(unsigned char *)&endian) {
780            /* Little endian. */
781
782            z_crc_t crc0;
783            z_word_t word0;
784#if N > 1
785            z_crc_t crc1;
786            z_word_t word1;
787#if N > 2
788            z_crc_t crc2;
789            z_word_t word2;
790#if N > 3
791            z_crc_t crc3;
792            z_word_t word3;
793#if N > 4
794            z_crc_t crc4;
795            z_word_t word4;
796#if N > 5
797            z_crc_t crc5;
798            z_word_t word5;
799#endif
800#endif
801#endif
802#endif
803#endif
804
805            /* Initialize the CRC for each braid. */
806            crc0 = crc;
807#if N > 1
808            crc1 = 0;
809#if N > 2
810            crc2 = 0;
811#if N > 3
812            crc3 = 0;
813#if N > 4
814            crc4 = 0;
815#if N > 5
816            crc5 = 0;
817#endif
818#endif
819#endif
820#endif
821#endif
822
823            /*
824              Process the first blks-1 blocks, computing the CRCs on each braid
825              independently.
826             */
827            while (--blks) {
828                /* Load the word for each braid into registers. */
829                word0 = crc0 ^ words[0];
830#if N > 1
831                word1 = crc1 ^ words[1];
832#if N > 2
833                word2 = crc2 ^ words[2];
834#if N > 3
835                word3 = crc3 ^ words[3];
836#if N > 4
837                word4 = crc4 ^ words[4];
838#if N > 5
839                word5 = crc5 ^ words[5];
840#endif
841#endif
842#endif
843#endif
844#endif
845                words += N;
846
847                /* Compute and update the CRC for each word. The loop should
848                   get unrolled. */
849                crc0 = crc_braid_table[0][word0 & 0xff];
850#if N > 1
851                crc1 = crc_braid_table[0][word1 & 0xff];
852#if N > 2
853                crc2 = crc_braid_table[0][word2 & 0xff];
854#if N > 3
855                crc3 = crc_braid_table[0][word3 & 0xff];
856#if N > 4
857                crc4 = crc_braid_table[0][word4 & 0xff];
858#if N > 5
859                crc5 = crc_braid_table[0][word5 & 0xff];
860#endif
861#endif
862#endif
863#endif
864#endif
865                for (k = 1; k < W; k++) {
866                    crc0 ^= crc_braid_table[k][(word0 >> (k << 3)) & 0xff];
867#if N > 1
868                    crc1 ^= crc_braid_table[k][(word1 >> (k << 3)) & 0xff];
869#if N > 2
870                    crc2 ^= crc_braid_table[k][(word2 >> (k << 3)) & 0xff];
871#if N > 3
872                    crc3 ^= crc_braid_table[k][(word3 >> (k << 3)) & 0xff];
873#if N > 4
874                    crc4 ^= crc_braid_table[k][(word4 >> (k << 3)) & 0xff];
875#if N > 5
876                    crc5 ^= crc_braid_table[k][(word5 >> (k << 3)) & 0xff];
877#endif
878#endif
879#endif
880#endif
881#endif
882                }
883            }
884
885            /*
886              Process the last block, combining the CRCs of the N braids at the
887              same time.
888             */
889            crc = crc_word(crc0 ^ words[0]);
890#if N > 1
891            crc = crc_word(crc1 ^ words[1] ^ crc);
892#if N > 2
893            crc = crc_word(crc2 ^ words[2] ^ crc);
894#if N > 3
895            crc = crc_word(crc3 ^ words[3] ^ crc);
896#if N > 4
897            crc = crc_word(crc4 ^ words[4] ^ crc);
898#if N > 5
899            crc = crc_word(crc5 ^ words[5] ^ crc);
900#endif
901#endif
902#endif
903#endif
904#endif
905            words += N;
906        }
907        else {
908            /* Big endian. */
909
910            z_word_t crc0, word0, comb;
911#if N > 1
912            z_word_t crc1, word1;
913#if N > 2
914            z_word_t crc2, word2;
915#if N > 3
916            z_word_t crc3, word3;
917#if N > 4
918            z_word_t crc4, word4;
919#if N > 5
920            z_word_t crc5, word5;
921#endif
922#endif
923#endif
924#endif
925#endif
926
927            /* Initialize the CRC for each braid. */
928            crc0 = byte_swap(crc);
929#if N > 1
930            crc1 = 0;
931#if N > 2
932            crc2 = 0;
933#if N > 3
934            crc3 = 0;
935#if N > 4
936            crc4 = 0;
937#if N > 5
938            crc5 = 0;
939#endif
940#endif
941#endif
942#endif
943#endif
944
945            /*
946              Process the first blks-1 blocks, computing the CRCs on each braid
947              independently.
948             */
949            while (--blks) {
950                /* Load the word for each braid into registers. */
951                word0 = crc0 ^ words[0];
952#if N > 1
953                word1 = crc1 ^ words[1];
954#if N > 2
955                word2 = crc2 ^ words[2];
956#if N > 3
957                word3 = crc3 ^ words[3];
958#if N > 4
959                word4 = crc4 ^ words[4];
960#if N > 5
961                word5 = crc5 ^ words[5];
962#endif
963#endif
964#endif
965#endif
966#endif
967                words += N;
968
969                /* Compute and update the CRC for each word. The loop should
970                   get unrolled. */
971                crc0 = crc_braid_big_table[0][word0 & 0xff];
972#if N > 1
973                crc1 = crc_braid_big_table[0][word1 & 0xff];
974#if N > 2
975                crc2 = crc_braid_big_table[0][word2 & 0xff];
976#if N > 3
977                crc3 = crc_braid_big_table[0][word3 & 0xff];
978#if N > 4
979                crc4 = crc_braid_big_table[0][word4 & 0xff];
980#if N > 5
981                crc5 = crc_braid_big_table[0][word5 & 0xff];
982#endif
983#endif
984#endif
985#endif
986#endif
987                for (k = 1; k < W; k++) {
988                    crc0 ^= crc_braid_big_table[k][(word0 >> (k << 3)) & 0xff];
989#if N > 1
990                    crc1 ^= crc_braid_big_table[k][(word1 >> (k << 3)) & 0xff];
991#if N > 2
992                    crc2 ^= crc_braid_big_table[k][(word2 >> (k << 3)) & 0xff];
993#if N > 3
994                    crc3 ^= crc_braid_big_table[k][(word3 >> (k << 3)) & 0xff];
995#if N > 4
996                    crc4 ^= crc_braid_big_table[k][(word4 >> (k << 3)) & 0xff];
997#if N > 5
998                    crc5 ^= crc_braid_big_table[k][(word5 >> (k << 3)) & 0xff];
999#endif
1000#endif
1001#endif
1002#endif
1003#endif
1004                }
1005            }
1006
1007            /*
1008              Process the last block, combining the CRCs of the N braids at the
1009              same time.
1010             */
1011            comb = crc_word_big(crc0 ^ words[0]);
1012#if N > 1
1013            comb = crc_word_big(crc1 ^ words[1] ^ comb);
1014#if N > 2
1015            comb = crc_word_big(crc2 ^ words[2] ^ comb);
1016#if N > 3
1017            comb = crc_word_big(crc3 ^ words[3] ^ comb);
1018#if N > 4
1019            comb = crc_word_big(crc4 ^ words[4] ^ comb);
1020#if N > 5
1021            comb = crc_word_big(crc5 ^ words[5] ^ comb);
1022#endif
1023#endif
1024#endif
1025#endif
1026#endif
1027            words += N;
1028            crc = byte_swap(comb);
1029        }
1030
1031        /*
1032          Update the pointer to the remaining bytes to process.
1033         */
1034        buf = (unsigned char const *)words;
1035    }
1036
1037#endif /* W */
1038
1039    /* Complete the computation of the CRC on any remaining bytes. */
1040    while (len >= 8) {
1041        len -= 8;
1042        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1043        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1044        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1045        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1046        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1047        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1048        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1049        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1050    }
1051    while (len) {
1052        len--;
1053        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1054    }
1055
1056    /* Return the CRC, post-conditioned. */
1057    return crc ^ 0xffffffff;
1058}
1059
1060#endif
1061
1062/* ========================================================================= */
1063unsigned long ZEXPORT crc32(crc, buf, len)
1064    unsigned long crc;
1065    const unsigned char FAR *buf;
1066    uInt len;
1067{
1068    return crc32_z(crc, buf, len);
1069}
1070
1071/* ========================================================================= */
1072uLong ZEXPORT crc32_combine64(crc1, crc2, len2)
1073    uLong crc1;
1074    uLong crc2;
1075    z_off64_t len2;
1076{
1077#ifdef DYNAMIC_CRC_TABLE
1078    once(&made, make_crc_table);
1079#endif /* DYNAMIC_CRC_TABLE */
1080    return multmodp(x2nmodp(len2, 3), crc1) ^ crc2;
1081}
1082
1083/* ========================================================================= */
1084uLong ZEXPORT crc32_combine(crc1, crc2, len2)
1085    uLong crc1;
1086    uLong crc2;
1087    z_off_t len2;
1088{
1089    return crc32_combine64(crc1, crc2, len2);
1090}
1091
1092/* ========================================================================= */
1093uLong ZEXPORT crc32_combine_gen64(len2)
1094    z_off64_t len2;
1095{
1096#ifdef DYNAMIC_CRC_TABLE
1097    once(&made, make_crc_table);
1098#endif /* DYNAMIC_CRC_TABLE */
1099    return x2nmodp(len2, 3);
1100}
1101
1102/* ========================================================================= */
1103uLong ZEXPORT crc32_combine_gen(len2)
1104    z_off_t len2;
1105{
1106    return crc32_combine_gen64(len2);
1107}
1108
1109/* ========================================================================= */
1110uLong crc32_combine_op(crc1, crc2, op)
1111    uLong crc1;
1112    uLong crc2;
1113    uLong op;
1114{
1115    return multmodp(op, crc1) ^ crc2;
1116}
1117