wp_block.c revision 296341
1/**
2 * The Whirlpool hashing function.
3 *
4 * <P>
5 * <b>References</b>
6 *
7 * <P>
8 * The Whirlpool algorithm was developed by
9 * <a href="mailto:pbarreto@scopus.com.br">Paulo S. L. M. Barreto</a> and
10 * <a href="mailto:vincent.rijmen@cryptomathic.com">Vincent Rijmen</a>.
11 *
12 * See
13 *      P.S.L.M. Barreto, V. Rijmen,
14 *      ``The Whirlpool hashing function,''
15 *      NESSIE submission, 2000 (tweaked version, 2001),
16 *      <https://www.cosic.esat.kuleuven.ac.be/nessie/workshop/submissions/whirlpool.zip>
17 *
18 * Based on "@version 3.0 (2003.03.12)" by Paulo S.L.M. Barreto and
19 * Vincent Rijmen. Lookup "reference implementations" on
20 * <http://planeta.terra.com.br/informatica/paulobarreto/>
21 *
22 * =============================================================================
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
25 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
26 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
28 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
29 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
30 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
31 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
32 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
33 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
34 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 *
36 */
37
38#include "wp_locl.h"
39#include <string.h>
40
typedef unsigned char u8;
/*
 * Select a 64-bit unsigned integer type for this toolchain.
 *
 * BUGFIX: the guard previously tested !defined(__MINGW32) — the MinGW
 * predefined macro is __MINGW32__ (with trailing underscores), so MinGW
 * builds (which define _WIN32) fell into the MSVC branch and failed on
 * the MSVC-only "__int64" keyword.
 */
#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
typedef unsigned __int64 u64;           /* MSVC spelling of a 64-bit type */
#elif defined(__arch64__)
typedef unsigned long u64;              /* 64-bit SPARC gcc: long is 64 bits */
#else
typedef unsigned long long u64;
#endif
49
/* Whirlpool applies 10 rounds of its compression function per block. */
#define ROUNDS  10

/*
 * STRICT_ALIGNMENT stays defined on targets that must not issue
 * misaligned 64-bit loads; it is undefined below for x86/x86_64,
 * which handle unaligned access in hardware.  The setting controls
 * which lookup-table layout is chosen further down.
 */
#define STRICT_ALIGNMENT
#if defined(__i386) || defined(__i386__) || \
    defined(__x86_64) || defined(__x86_64__) || \
    defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64)
/*
 * Well, formally there are a couple of other architectures which permit
 * unaligned loads, specifically those not crossing cache lines, IA-64 and
 * PowerPC...
 */
# undef STRICT_ALIGNMENT
#endif
63
/*
 * SMALL_REGISTER_BANK marks ia32, which has only eight general-purpose
 * registers — too few to keep the whole 8x64-bit state in registers.
 */
#undef SMALL_REGISTER_BANK
#if defined(__i386) || defined(__i386__) || defined(_M_IX86)
# define SMALL_REGISTER_BANK
# if defined(WHIRLPOOL_ASM)
#  ifndef OPENSSL_SMALL_FOOTPRINT
/*
 * it appears that for elder non-MMX
 * CPUs this is actually faster!
 */
#   define OPENSSL_SMALL_FOOTPRINT
#  endif
/*
 * GO_FOR_MMX: dispatch to the hand-written MMX assembler when the
 * capability vector reports MMX (bit 23 of OPENSSL_ia32cap_P[0], the
 * CPUID EDX MMX flag).  Note the control-flow trick: "break" leaves the
 * do/while and falls through to the C code; on the MMX path "return"
 * exits the *calling* function after the assembler has done the work.
 */
#  define GO_FOR_MMX(ctx,inp,num)     do {                    \
        extern unsigned int OPENSSL_ia32cap_P[];                \
        void whirlpool_block_mmx(void *,const void *,size_t);   \
        if (!(OPENSSL_ia32cap_P[0] & (1<<23)))  break;          \
        whirlpool_block_mmx(ctx->H.c,inp,num);  return;         \
                                        } while (0)
# endif
#endif
83
/*
 * ROTATE(a,n): 64-bit rotation via compiler intrinsic or inline
 * assembler where available.  The rotation direction is mirrored
 * between the L_ENDIAN and B_ENDIAN variants because the lookup table
 * below is stored in an endian-neutral byte order; only the
 * combination ROTATE + endianness yields the correct column
 * alignment.
 */
#undef ROTATE
#if defined(_MSC_VER)
# if defined(_WIN64)            /* applies to both IA-64 and AMD64 */
#  pragma intrinsic(_rotl64)
#  define ROTATE(a,n) _rotl64((a),n)
# endif
#elif defined(__GNUC__) && __GNUC__>=2
# if defined(__x86_64) || defined(__x86_64__)
#  if defined(L_ENDIAN)
#   define ROTATE(a,n)       ({ u64 ret; asm ("rolq %1,%0"   \
                                   : "=r"(ret) : "J"(n),"0"(a) : "cc"); ret; })
#  elif defined(B_ENDIAN)
       /*
        * Most will argue that x86_64 is always little-endian. Well, yes, but
        * then we have stratus.com who has modified gcc to "emulate"
        * big-endian on x86. Is there evidence that they [or somebody else]
        * won't do same for x86_64? Naturally no. And this line is waiting
        * ready for that brave soul:-)
        */
#   define ROTATE(a,n)       ({ u64 ret; asm ("rorq %1,%0"   \
                                   : "=r"(ret) : "J"(n),"0"(a) : "cc"); ret; })
#  endif
# elif defined(__ia64) || defined(__ia64__)
#  if defined(L_ENDIAN)
#   define ROTATE(a,n)       ({ u64 ret; asm ("shrp %0=%1,%1,%2"     \
                                   : "=r"(ret) : "r"(a),"M"(64-(n))); ret; })
#  elif defined(B_ENDIAN)
#   define ROTATE(a,n)       ({ u64 ret; asm ("shrp %0=%1,%1,%2"     \
                                   : "=r"(ret) : "r"(a),"M"(n)); ret; })
#  endif
# endif
#endif

/*
 * Generic shift-and-xor fallback, used only in the small-footprint
 * build; defining ROTATE here forces the 2KB table variant below.
 * NOTE(review): (64-n) is unparenthesized, as in upstream — safe for
 * the constant arguments used in this file, but fragile for
 * expression arguments.
 */
#if defined(OPENSSL_SMALL_FOOTPRINT)
# if !defined(ROTATE)
#  if defined(L_ENDIAN)         /* little-endians have to rotate left */
#   define ROTATE(i,n)       ((i)<<(n) ^ (i)>>(64-n))
#  elif defined(B_ENDIAN)       /* big-endians have to rotate right */
#   define ROTATE(i,n)       ((i)>>(n) ^ (i)<<(64-n))
#  endif
# endif
# if defined(ROTATE) && !defined(STRICT_ALIGNMENT)
#  define STRICT_ALIGNMENT      /* ensure smallest table size */
# endif
#endif
129
/*
 * Table size depends on STRICT_ALIGNMENT and whether or not endian-
 * specific ROTATE macro is defined. If STRICT_ALIGNMENT is not
 * defined, which is normally the case on x86[_64] CPUs, the table is
 * 4KB large unconditionally. Otherwise if ROTATE is defined, the
 * table is 2KB large, and otherwise - 16KB. 2KB table requires a
 * whole bunch of additional rotations, but I'm willing to "trade,"
 * because 16KB table certainly trashes L1 cache. I wish all CPUs
 * could handle unaligned load as 4KB table doesn't trash the cache,
 * nor does it require additional rotations.
 */
/*
 * Note that every Cn macro expands as two loads: one byte load and
 * one quadword load. One can argue that that many single-byte loads
 * is too excessive, as one could load a quadword and "milk" it for
 * eight 8-bit values instead. Well, yes, but in order to do so *and*
 * avoid excessive loads you have to accommodate a handful of 64-bit
 * values in the register bank and issue a bunch of shifts and mask.
 * It's a tradeoff: loads vs. shift and mask in big register bank[!].
 * On most CPUs eight single-byte loads are faster and I let other
 * ones to depend on smart compiler to fold byte loads if beneficial.
 * Hand-coded assembler would be another alternative:-)
 */
/*
 * N is the number of 64-bit table entries stored per byte value; LL
 * lays out one table row; Cn(K,i) fetches the column-n entry selected
 * by byte n of 64-bit word i of state K.
 */
#ifdef STRICT_ALIGNMENT
# if defined(ROTATE)
/* 2KB variant: one entry per byte value, rotated into place at runtime. */
#  define N   1
#  define LL(c0,c1,c2,c3,c4,c5,c6,c7) c0,c1,c2,c3,c4,c5,c6,c7
#  define C0(K,i)     (Cx.q[K.c[(i)*8+0]])
#  define C1(K,i)     ROTATE(Cx.q[K.c[(i)*8+1]],8)
#  define C2(K,i)     ROTATE(Cx.q[K.c[(i)*8+2]],16)
#  define C3(K,i)     ROTATE(Cx.q[K.c[(i)*8+3]],24)
#  define C4(K,i)     ROTATE(Cx.q[K.c[(i)*8+4]],32)
#  define C5(K,i)     ROTATE(Cx.q[K.c[(i)*8+5]],40)
#  define C6(K,i)     ROTATE(Cx.q[K.c[(i)*8+6]],48)
#  define C7(K,i)     ROTATE(Cx.q[K.c[(i)*8+7]],56)
# else
/* 16KB variant: all eight byte-rotations of each row are precomputed. */
#  define N   8
#  define LL(c0,c1,c2,c3,c4,c5,c6,c7) c0,c1,c2,c3,c4,c5,c6,c7, \
                                        c7,c0,c1,c2,c3,c4,c5,c6, \
                                        c6,c7,c0,c1,c2,c3,c4,c5, \
                                        c5,c6,c7,c0,c1,c2,c3,c4, \
                                        c4,c5,c6,c7,c0,c1,c2,c3, \
                                        c3,c4,c5,c6,c7,c0,c1,c2, \
                                        c2,c3,c4,c5,c6,c7,c0,c1, \
                                        c1,c2,c3,c4,c5,c6,c7,c0
#  define C0(K,i)     (Cx.q[0+8*K.c[(i)*8+0]])
#  define C1(K,i)     (Cx.q[1+8*K.c[(i)*8+1]])
#  define C2(K,i)     (Cx.q[2+8*K.c[(i)*8+2]])
#  define C3(K,i)     (Cx.q[3+8*K.c[(i)*8+3]])
#  define C4(K,i)     (Cx.q[4+8*K.c[(i)*8+4]])
#  define C5(K,i)     (Cx.q[5+8*K.c[(i)*8+5]])
#  define C6(K,i)     (Cx.q[6+8*K.c[(i)*8+6]])
#  define C7(K,i)     (Cx.q[7+8*K.c[(i)*8+7]])
# endif
#else
/*
 * 4KB variant: each row is stored twice back-to-back, so a rotated
 * view of the row is obtained with a (possibly misaligned) 64-bit
 * load at a byte offset — only usable where unaligned loads are safe.
 */
# define N     2
# define LL(c0,c1,c2,c3,c4,c5,c6,c7)   c0,c1,c2,c3,c4,c5,c6,c7, \
                                        c0,c1,c2,c3,c4,c5,c6,c7
# define C0(K,i)       (((u64*)(Cx.c+0))[2*K.c[(i)*8+0]])
# define C1(K,i)       (((u64*)(Cx.c+7))[2*K.c[(i)*8+1]])
# define C2(K,i)       (((u64*)(Cx.c+6))[2*K.c[(i)*8+2]])
# define C3(K,i)       (((u64*)(Cx.c+5))[2*K.c[(i)*8+3]])
# define C4(K,i)       (((u64*)(Cx.c+4))[2*K.c[(i)*8+4]])
# define C5(K,i)       (((u64*)(Cx.c+3))[2*K.c[(i)*8+5]])
# define C6(K,i)       (((u64*)(Cx.c+2))[2*K.c[(i)*8+6]])
# define C7(K,i)       (((u64*)(Cx.c+1))[2*K.c[(i)*8+7]])
#endif
197
/*
 * Precomputed Whirlpool lookup table plus round constants, in one
 * union so it can be addressed both bytewise (c) and quadword-wise
 * (q).  Each LL(...) row is the S-box output for one byte value
 * multiplied by the circulant MDS row (1,1,4,1,8,5,2,9) over
 * GF(2^8)/0x11d; LL expands to N copies/rotations of the row
 * depending on the table variant selected above.  The rc[] round
 * constants (addressed through RC) follow the 256*N table entries.
 * DO NOT edit the data — every byte is a cryptographic constant.
 */
static const
    union {
    u8 c[(256 * N + ROUNDS) * sizeof(u64)];
    u64 q[(256 * N + ROUNDS)];
} Cx = {
        {
            /* Note endian-neutral representation:-) */
            LL(0x18, 0x18, 0x60, 0x18, 0xc0, 0x78, 0x30, 0xd8),
            LL(0x23, 0x23, 0x8c, 0x23, 0x05, 0xaf, 0x46, 0x26),
            LL(0xc6, 0xc6, 0x3f, 0xc6, 0x7e, 0xf9, 0x91, 0xb8),
            LL(0xe8, 0xe8, 0x87, 0xe8, 0x13, 0x6f, 0xcd, 0xfb),
            LL(0x87, 0x87, 0x26, 0x87, 0x4c, 0xa1, 0x13, 0xcb),
            LL(0xb8, 0xb8, 0xda, 0xb8, 0xa9, 0x62, 0x6d, 0x11),
            LL(0x01, 0x01, 0x04, 0x01, 0x08, 0x05, 0x02, 0x09),
            LL(0x4f, 0x4f, 0x21, 0x4f, 0x42, 0x6e, 0x9e, 0x0d),
            LL(0x36, 0x36, 0xd8, 0x36, 0xad, 0xee, 0x6c, 0x9b),
            LL(0xa6, 0xa6, 0xa2, 0xa6, 0x59, 0x04, 0x51, 0xff),
            LL(0xd2, 0xd2, 0x6f, 0xd2, 0xde, 0xbd, 0xb9, 0x0c),
            LL(0xf5, 0xf5, 0xf3, 0xf5, 0xfb, 0x06, 0xf7, 0x0e),
            LL(0x79, 0x79, 0xf9, 0x79, 0xef, 0x80, 0xf2, 0x96),
            LL(0x6f, 0x6f, 0xa1, 0x6f, 0x5f, 0xce, 0xde, 0x30),
            LL(0x91, 0x91, 0x7e, 0x91, 0xfc, 0xef, 0x3f, 0x6d),
            LL(0x52, 0x52, 0x55, 0x52, 0xaa, 0x07, 0xa4, 0xf8),
            LL(0x60, 0x60, 0x9d, 0x60, 0x27, 0xfd, 0xc0, 0x47),
            LL(0xbc, 0xbc, 0xca, 0xbc, 0x89, 0x76, 0x65, 0x35),
            LL(0x9b, 0x9b, 0x56, 0x9b, 0xac, 0xcd, 0x2b, 0x37),
            LL(0x8e, 0x8e, 0x02, 0x8e, 0x04, 0x8c, 0x01, 0x8a),
            LL(0xa3, 0xa3, 0xb6, 0xa3, 0x71, 0x15, 0x5b, 0xd2),
            LL(0x0c, 0x0c, 0x30, 0x0c, 0x60, 0x3c, 0x18, 0x6c),
            LL(0x7b, 0x7b, 0xf1, 0x7b, 0xff, 0x8a, 0xf6, 0x84),
            LL(0x35, 0x35, 0xd4, 0x35, 0xb5, 0xe1, 0x6a, 0x80),
            LL(0x1d, 0x1d, 0x74, 0x1d, 0xe8, 0x69, 0x3a, 0xf5),
            LL(0xe0, 0xe0, 0xa7, 0xe0, 0x53, 0x47, 0xdd, 0xb3),
            LL(0xd7, 0xd7, 0x7b, 0xd7, 0xf6, 0xac, 0xb3, 0x21),
            LL(0xc2, 0xc2, 0x2f, 0xc2, 0x5e, 0xed, 0x99, 0x9c),
            LL(0x2e, 0x2e, 0xb8, 0x2e, 0x6d, 0x96, 0x5c, 0x43),
            LL(0x4b, 0x4b, 0x31, 0x4b, 0x62, 0x7a, 0x96, 0x29),
            LL(0xfe, 0xfe, 0xdf, 0xfe, 0xa3, 0x21, 0xe1, 0x5d),
            LL(0x57, 0x57, 0x41, 0x57, 0x82, 0x16, 0xae, 0xd5),
            LL(0x15, 0x15, 0x54, 0x15, 0xa8, 0x41, 0x2a, 0xbd),
            LL(0x77, 0x77, 0xc1, 0x77, 0x9f, 0xb6, 0xee, 0xe8),
            LL(0x37, 0x37, 0xdc, 0x37, 0xa5, 0xeb, 0x6e, 0x92),
            LL(0xe5, 0xe5, 0xb3, 0xe5, 0x7b, 0x56, 0xd7, 0x9e),
            LL(0x9f, 0x9f, 0x46, 0x9f, 0x8c, 0xd9, 0x23, 0x13),
            LL(0xf0, 0xf0, 0xe7, 0xf0, 0xd3, 0x17, 0xfd, 0x23),
            LL(0x4a, 0x4a, 0x35, 0x4a, 0x6a, 0x7f, 0x94, 0x20),
            LL(0xda, 0xda, 0x4f, 0xda, 0x9e, 0x95, 0xa9, 0x44),
            LL(0x58, 0x58, 0x7d, 0x58, 0xfa, 0x25, 0xb0, 0xa2),
            LL(0xc9, 0xc9, 0x03, 0xc9, 0x06, 0xca, 0x8f, 0xcf),
            LL(0x29, 0x29, 0xa4, 0x29, 0x55, 0x8d, 0x52, 0x7c),
            LL(0x0a, 0x0a, 0x28, 0x0a, 0x50, 0x22, 0x14, 0x5a),
            LL(0xb1, 0xb1, 0xfe, 0xb1, 0xe1, 0x4f, 0x7f, 0x50),
            LL(0xa0, 0xa0, 0xba, 0xa0, 0x69, 0x1a, 0x5d, 0xc9),
            LL(0x6b, 0x6b, 0xb1, 0x6b, 0x7f, 0xda, 0xd6, 0x14),
            LL(0x85, 0x85, 0x2e, 0x85, 0x5c, 0xab, 0x17, 0xd9),
            LL(0xbd, 0xbd, 0xce, 0xbd, 0x81, 0x73, 0x67, 0x3c),
            LL(0x5d, 0x5d, 0x69, 0x5d, 0xd2, 0x34, 0xba, 0x8f),
            LL(0x10, 0x10, 0x40, 0x10, 0x80, 0x50, 0x20, 0x90),
            LL(0xf4, 0xf4, 0xf7, 0xf4, 0xf3, 0x03, 0xf5, 0x07),
            LL(0xcb, 0xcb, 0x0b, 0xcb, 0x16, 0xc0, 0x8b, 0xdd),
            LL(0x3e, 0x3e, 0xf8, 0x3e, 0xed, 0xc6, 0x7c, 0xd3),
            LL(0x05, 0x05, 0x14, 0x05, 0x28, 0x11, 0x0a, 0x2d),
            LL(0x67, 0x67, 0x81, 0x67, 0x1f, 0xe6, 0xce, 0x78),
            LL(0xe4, 0xe4, 0xb7, 0xe4, 0x73, 0x53, 0xd5, 0x97),
            LL(0x27, 0x27, 0x9c, 0x27, 0x25, 0xbb, 0x4e, 0x02),
            LL(0x41, 0x41, 0x19, 0x41, 0x32, 0x58, 0x82, 0x73),
            LL(0x8b, 0x8b, 0x16, 0x8b, 0x2c, 0x9d, 0x0b, 0xa7),
            LL(0xa7, 0xa7, 0xa6, 0xa7, 0x51, 0x01, 0x53, 0xf6),
            LL(0x7d, 0x7d, 0xe9, 0x7d, 0xcf, 0x94, 0xfa, 0xb2),
            LL(0x95, 0x95, 0x6e, 0x95, 0xdc, 0xfb, 0x37, 0x49),
            LL(0xd8, 0xd8, 0x47, 0xd8, 0x8e, 0x9f, 0xad, 0x56),
            LL(0xfb, 0xfb, 0xcb, 0xfb, 0x8b, 0x30, 0xeb, 0x70),
            LL(0xee, 0xee, 0x9f, 0xee, 0x23, 0x71, 0xc1, 0xcd),
            LL(0x7c, 0x7c, 0xed, 0x7c, 0xc7, 0x91, 0xf8, 0xbb),
            LL(0x66, 0x66, 0x85, 0x66, 0x17, 0xe3, 0xcc, 0x71),
            LL(0xdd, 0xdd, 0x53, 0xdd, 0xa6, 0x8e, 0xa7, 0x7b),
            LL(0x17, 0x17, 0x5c, 0x17, 0xb8, 0x4b, 0x2e, 0xaf),
            LL(0x47, 0x47, 0x01, 0x47, 0x02, 0x46, 0x8e, 0x45),
            LL(0x9e, 0x9e, 0x42, 0x9e, 0x84, 0xdc, 0x21, 0x1a),
            LL(0xca, 0xca, 0x0f, 0xca, 0x1e, 0xc5, 0x89, 0xd4),
            LL(0x2d, 0x2d, 0xb4, 0x2d, 0x75, 0x99, 0x5a, 0x58),
            LL(0xbf, 0xbf, 0xc6, 0xbf, 0x91, 0x79, 0x63, 0x2e),
            LL(0x07, 0x07, 0x1c, 0x07, 0x38, 0x1b, 0x0e, 0x3f),
            LL(0xad, 0xad, 0x8e, 0xad, 0x01, 0x23, 0x47, 0xac),
            LL(0x5a, 0x5a, 0x75, 0x5a, 0xea, 0x2f, 0xb4, 0xb0),
            LL(0x83, 0x83, 0x36, 0x83, 0x6c, 0xb5, 0x1b, 0xef),
            LL(0x33, 0x33, 0xcc, 0x33, 0x85, 0xff, 0x66, 0xb6),
            LL(0x63, 0x63, 0x91, 0x63, 0x3f, 0xf2, 0xc6, 0x5c),
            LL(0x02, 0x02, 0x08, 0x02, 0x10, 0x0a, 0x04, 0x12),
            LL(0xaa, 0xaa, 0x92, 0xaa, 0x39, 0x38, 0x49, 0x93),
            LL(0x71, 0x71, 0xd9, 0x71, 0xaf, 0xa8, 0xe2, 0xde),
            LL(0xc8, 0xc8, 0x07, 0xc8, 0x0e, 0xcf, 0x8d, 0xc6),
            LL(0x19, 0x19, 0x64, 0x19, 0xc8, 0x7d, 0x32, 0xd1),
            LL(0x49, 0x49, 0x39, 0x49, 0x72, 0x70, 0x92, 0x3b),
            LL(0xd9, 0xd9, 0x43, 0xd9, 0x86, 0x9a, 0xaf, 0x5f),
            LL(0xf2, 0xf2, 0xef, 0xf2, 0xc3, 0x1d, 0xf9, 0x31),
            LL(0xe3, 0xe3, 0xab, 0xe3, 0x4b, 0x48, 0xdb, 0xa8),
            LL(0x5b, 0x5b, 0x71, 0x5b, 0xe2, 0x2a, 0xb6, 0xb9),
            LL(0x88, 0x88, 0x1a, 0x88, 0x34, 0x92, 0x0d, 0xbc),
            LL(0x9a, 0x9a, 0x52, 0x9a, 0xa4, 0xc8, 0x29, 0x3e),
            LL(0x26, 0x26, 0x98, 0x26, 0x2d, 0xbe, 0x4c, 0x0b),
            LL(0x32, 0x32, 0xc8, 0x32, 0x8d, 0xfa, 0x64, 0xbf),
            LL(0xb0, 0xb0, 0xfa, 0xb0, 0xe9, 0x4a, 0x7d, 0x59),
            LL(0xe9, 0xe9, 0x83, 0xe9, 0x1b, 0x6a, 0xcf, 0xf2),
            LL(0x0f, 0x0f, 0x3c, 0x0f, 0x78, 0x33, 0x1e, 0x77),
            LL(0xd5, 0xd5, 0x73, 0xd5, 0xe6, 0xa6, 0xb7, 0x33),
            LL(0x80, 0x80, 0x3a, 0x80, 0x74, 0xba, 0x1d, 0xf4),
            LL(0xbe, 0xbe, 0xc2, 0xbe, 0x99, 0x7c, 0x61, 0x27),
            LL(0xcd, 0xcd, 0x13, 0xcd, 0x26, 0xde, 0x87, 0xeb),
            LL(0x34, 0x34, 0xd0, 0x34, 0xbd, 0xe4, 0x68, 0x89),
            LL(0x48, 0x48, 0x3d, 0x48, 0x7a, 0x75, 0x90, 0x32),
            LL(0xff, 0xff, 0xdb, 0xff, 0xab, 0x24, 0xe3, 0x54),
            LL(0x7a, 0x7a, 0xf5, 0x7a, 0xf7, 0x8f, 0xf4, 0x8d),
            LL(0x90, 0x90, 0x7a, 0x90, 0xf4, 0xea, 0x3d, 0x64),
            LL(0x5f, 0x5f, 0x61, 0x5f, 0xc2, 0x3e, 0xbe, 0x9d),
            LL(0x20, 0x20, 0x80, 0x20, 0x1d, 0xa0, 0x40, 0x3d),
            LL(0x68, 0x68, 0xbd, 0x68, 0x67, 0xd5, 0xd0, 0x0f),
            LL(0x1a, 0x1a, 0x68, 0x1a, 0xd0, 0x72, 0x34, 0xca),
            LL(0xae, 0xae, 0x82, 0xae, 0x19, 0x2c, 0x41, 0xb7),
            LL(0xb4, 0xb4, 0xea, 0xb4, 0xc9, 0x5e, 0x75, 0x7d),
            LL(0x54, 0x54, 0x4d, 0x54, 0x9a, 0x19, 0xa8, 0xce),
            LL(0x93, 0x93, 0x76, 0x93, 0xec, 0xe5, 0x3b, 0x7f),
            LL(0x22, 0x22, 0x88, 0x22, 0x0d, 0xaa, 0x44, 0x2f),
            LL(0x64, 0x64, 0x8d, 0x64, 0x07, 0xe9, 0xc8, 0x63),
            LL(0xf1, 0xf1, 0xe3, 0xf1, 0xdb, 0x12, 0xff, 0x2a),
            LL(0x73, 0x73, 0xd1, 0x73, 0xbf, 0xa2, 0xe6, 0xcc),
            LL(0x12, 0x12, 0x48, 0x12, 0x90, 0x5a, 0x24, 0x82),
            LL(0x40, 0x40, 0x1d, 0x40, 0x3a, 0x5d, 0x80, 0x7a),
            LL(0x08, 0x08, 0x20, 0x08, 0x40, 0x28, 0x10, 0x48),
            LL(0xc3, 0xc3, 0x2b, 0xc3, 0x56, 0xe8, 0x9b, 0x95),
            LL(0xec, 0xec, 0x97, 0xec, 0x33, 0x7b, 0xc5, 0xdf),
            LL(0xdb, 0xdb, 0x4b, 0xdb, 0x96, 0x90, 0xab, 0x4d),
            LL(0xa1, 0xa1, 0xbe, 0xa1, 0x61, 0x1f, 0x5f, 0xc0),
            LL(0x8d, 0x8d, 0x0e, 0x8d, 0x1c, 0x83, 0x07, 0x91),
            LL(0x3d, 0x3d, 0xf4, 0x3d, 0xf5, 0xc9, 0x7a, 0xc8),
            LL(0x97, 0x97, 0x66, 0x97, 0xcc, 0xf1, 0x33, 0x5b),
            LL(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
            LL(0xcf, 0xcf, 0x1b, 0xcf, 0x36, 0xd4, 0x83, 0xf9),
            LL(0x2b, 0x2b, 0xac, 0x2b, 0x45, 0x87, 0x56, 0x6e),
            LL(0x76, 0x76, 0xc5, 0x76, 0x97, 0xb3, 0xec, 0xe1),
            LL(0x82, 0x82, 0x32, 0x82, 0x64, 0xb0, 0x19, 0xe6),
            LL(0xd6, 0xd6, 0x7f, 0xd6, 0xfe, 0xa9, 0xb1, 0x28),
            LL(0x1b, 0x1b, 0x6c, 0x1b, 0xd8, 0x77, 0x36, 0xc3),
            LL(0xb5, 0xb5, 0xee, 0xb5, 0xc1, 0x5b, 0x77, 0x74),
            LL(0xaf, 0xaf, 0x86, 0xaf, 0x11, 0x29, 0x43, 0xbe),
            LL(0x6a, 0x6a, 0xb5, 0x6a, 0x77, 0xdf, 0xd4, 0x1d),
            LL(0x50, 0x50, 0x5d, 0x50, 0xba, 0x0d, 0xa0, 0xea),
            LL(0x45, 0x45, 0x09, 0x45, 0x12, 0x4c, 0x8a, 0x57),
            LL(0xf3, 0xf3, 0xeb, 0xf3, 0xcb, 0x18, 0xfb, 0x38),
            LL(0x30, 0x30, 0xc0, 0x30, 0x9d, 0xf0, 0x60, 0xad),
            LL(0xef, 0xef, 0x9b, 0xef, 0x2b, 0x74, 0xc3, 0xc4),
            LL(0x3f, 0x3f, 0xfc, 0x3f, 0xe5, 0xc3, 0x7e, 0xda),
            LL(0x55, 0x55, 0x49, 0x55, 0x92, 0x1c, 0xaa, 0xc7),
            LL(0xa2, 0xa2, 0xb2, 0xa2, 0x79, 0x10, 0x59, 0xdb),
            LL(0xea, 0xea, 0x8f, 0xea, 0x03, 0x65, 0xc9, 0xe9),
            LL(0x65, 0x65, 0x89, 0x65, 0x0f, 0xec, 0xca, 0x6a),
            LL(0xba, 0xba, 0xd2, 0xba, 0xb9, 0x68, 0x69, 0x03),
            LL(0x2f, 0x2f, 0xbc, 0x2f, 0x65, 0x93, 0x5e, 0x4a),
            LL(0xc0, 0xc0, 0x27, 0xc0, 0x4e, 0xe7, 0x9d, 0x8e),
            LL(0xde, 0xde, 0x5f, 0xde, 0xbe, 0x81, 0xa1, 0x60),
            LL(0x1c, 0x1c, 0x70, 0x1c, 0xe0, 0x6c, 0x38, 0xfc),
            LL(0xfd, 0xfd, 0xd3, 0xfd, 0xbb, 0x2e, 0xe7, 0x46),
            LL(0x4d, 0x4d, 0x29, 0x4d, 0x52, 0x64, 0x9a, 0x1f),
            LL(0x92, 0x92, 0x72, 0x92, 0xe4, 0xe0, 0x39, 0x76),
            LL(0x75, 0x75, 0xc9, 0x75, 0x8f, 0xbc, 0xea, 0xfa),
            LL(0x06, 0x06, 0x18, 0x06, 0x30, 0x1e, 0x0c, 0x36),
            LL(0x8a, 0x8a, 0x12, 0x8a, 0x24, 0x98, 0x09, 0xae),
            LL(0xb2, 0xb2, 0xf2, 0xb2, 0xf9, 0x40, 0x79, 0x4b),
            LL(0xe6, 0xe6, 0xbf, 0xe6, 0x63, 0x59, 0xd1, 0x85),
            LL(0x0e, 0x0e, 0x38, 0x0e, 0x70, 0x36, 0x1c, 0x7e),
            LL(0x1f, 0x1f, 0x7c, 0x1f, 0xf8, 0x63, 0x3e, 0xe7),
            LL(0x62, 0x62, 0x95, 0x62, 0x37, 0xf7, 0xc4, 0x55),
            LL(0xd4, 0xd4, 0x77, 0xd4, 0xee, 0xa3, 0xb5, 0x3a),
            LL(0xa8, 0xa8, 0x9a, 0xa8, 0x29, 0x32, 0x4d, 0x81),
            LL(0x96, 0x96, 0x62, 0x96, 0xc4, 0xf4, 0x31, 0x52),
            LL(0xf9, 0xf9, 0xc3, 0xf9, 0x9b, 0x3a, 0xef, 0x62),
            LL(0xc5, 0xc5, 0x33, 0xc5, 0x66, 0xf6, 0x97, 0xa3),
            LL(0x25, 0x25, 0x94, 0x25, 0x35, 0xb1, 0x4a, 0x10),
            LL(0x59, 0x59, 0x79, 0x59, 0xf2, 0x20, 0xb2, 0xab),
            LL(0x84, 0x84, 0x2a, 0x84, 0x54, 0xae, 0x15, 0xd0),
            LL(0x72, 0x72, 0xd5, 0x72, 0xb7, 0xa7, 0xe4, 0xc5),
            LL(0x39, 0x39, 0xe4, 0x39, 0xd5, 0xdd, 0x72, 0xec),
            LL(0x4c, 0x4c, 0x2d, 0x4c, 0x5a, 0x61, 0x98, 0x16),
            LL(0x5e, 0x5e, 0x65, 0x5e, 0xca, 0x3b, 0xbc, 0x94),
            LL(0x78, 0x78, 0xfd, 0x78, 0xe7, 0x85, 0xf0, 0x9f),
            LL(0x38, 0x38, 0xe0, 0x38, 0xdd, 0xd8, 0x70, 0xe5),
            LL(0x8c, 0x8c, 0x0a, 0x8c, 0x14, 0x86, 0x05, 0x98),
            LL(0xd1, 0xd1, 0x63, 0xd1, 0xc6, 0xb2, 0xbf, 0x17),
            LL(0xa5, 0xa5, 0xae, 0xa5, 0x41, 0x0b, 0x57, 0xe4),
            LL(0xe2, 0xe2, 0xaf, 0xe2, 0x43, 0x4d, 0xd9, 0xa1),
            LL(0x61, 0x61, 0x99, 0x61, 0x2f, 0xf8, 0xc2, 0x4e),
            LL(0xb3, 0xb3, 0xf6, 0xb3, 0xf1, 0x45, 0x7b, 0x42),
            LL(0x21, 0x21, 0x84, 0x21, 0x15, 0xa5, 0x42, 0x34),
            LL(0x9c, 0x9c, 0x4a, 0x9c, 0x94, 0xd6, 0x25, 0x08),
            LL(0x1e, 0x1e, 0x78, 0x1e, 0xf0, 0x66, 0x3c, 0xee),
            LL(0x43, 0x43, 0x11, 0x43, 0x22, 0x52, 0x86, 0x61),
            LL(0xc7, 0xc7, 0x3b, 0xc7, 0x76, 0xfc, 0x93, 0xb1),
            LL(0xfc, 0xfc, 0xd7, 0xfc, 0xb3, 0x2b, 0xe5, 0x4f),
            LL(0x04, 0x04, 0x10, 0x04, 0x20, 0x14, 0x08, 0x24),
            LL(0x51, 0x51, 0x59, 0x51, 0xb2, 0x08, 0xa2, 0xe3),
            LL(0x99, 0x99, 0x5e, 0x99, 0xbc, 0xc7, 0x2f, 0x25),
            LL(0x6d, 0x6d, 0xa9, 0x6d, 0x4f, 0xc4, 0xda, 0x22),
            LL(0x0d, 0x0d, 0x34, 0x0d, 0x68, 0x39, 0x1a, 0x65),
            LL(0xfa, 0xfa, 0xcf, 0xfa, 0x83, 0x35, 0xe9, 0x79),
            LL(0xdf, 0xdf, 0x5b, 0xdf, 0xb6, 0x84, 0xa3, 0x69),
            LL(0x7e, 0x7e, 0xe5, 0x7e, 0xd7, 0x9b, 0xfc, 0xa9),
            LL(0x24, 0x24, 0x90, 0x24, 0x3d, 0xb4, 0x48, 0x19),
            LL(0x3b, 0x3b, 0xec, 0x3b, 0xc5, 0xd7, 0x76, 0xfe),
            LL(0xab, 0xab, 0x96, 0xab, 0x31, 0x3d, 0x4b, 0x9a),
            LL(0xce, 0xce, 0x1f, 0xce, 0x3e, 0xd1, 0x81, 0xf0),
            LL(0x11, 0x11, 0x44, 0x11, 0x88, 0x55, 0x22, 0x99),
            LL(0x8f, 0x8f, 0x06, 0x8f, 0x0c, 0x89, 0x03, 0x83),
            LL(0x4e, 0x4e, 0x25, 0x4e, 0x4a, 0x6b, 0x9c, 0x04),
            LL(0xb7, 0xb7, 0xe6, 0xb7, 0xd1, 0x51, 0x73, 0x66),
            LL(0xeb, 0xeb, 0x8b, 0xeb, 0x0b, 0x60, 0xcb, 0xe0),
            LL(0x3c, 0x3c, 0xf0, 0x3c, 0xfd, 0xcc, 0x78, 0xc1),
            LL(0x81, 0x81, 0x3e, 0x81, 0x7c, 0xbf, 0x1f, 0xfd),
            LL(0x94, 0x94, 0x6a, 0x94, 0xd4, 0xfe, 0x35, 0x40),
            LL(0xf7, 0xf7, 0xfb, 0xf7, 0xeb, 0x0c, 0xf3, 0x1c),
            LL(0xb9, 0xb9, 0xde, 0xb9, 0xa1, 0x67, 0x6f, 0x18),
            LL(0x13, 0x13, 0x4c, 0x13, 0x98, 0x5f, 0x26, 0x8b),
            LL(0x2c, 0x2c, 0xb0, 0x2c, 0x7d, 0x9c, 0x58, 0x51),
            LL(0xd3, 0xd3, 0x6b, 0xd3, 0xd6, 0xb8, 0xbb, 0x05),
            LL(0xe7, 0xe7, 0xbb, 0xe7, 0x6b, 0x5c, 0xd3, 0x8c),
            LL(0x6e, 0x6e, 0xa5, 0x6e, 0x57, 0xcb, 0xdc, 0x39),
            LL(0xc4, 0xc4, 0x37, 0xc4, 0x6e, 0xf3, 0x95, 0xaa),
            LL(0x03, 0x03, 0x0c, 0x03, 0x18, 0x0f, 0x06, 0x1b),
            LL(0x56, 0x56, 0x45, 0x56, 0x8a, 0x13, 0xac, 0xdc),
            LL(0x44, 0x44, 0x0d, 0x44, 0x1a, 0x49, 0x88, 0x5e),
            LL(0x7f, 0x7f, 0xe1, 0x7f, 0xdf, 0x9e, 0xfe, 0xa0),
            LL(0xa9, 0xa9, 0x9e, 0xa9, 0x21, 0x37, 0x4f, 0x88),
            LL(0x2a, 0x2a, 0xa8, 0x2a, 0x4d, 0x82, 0x54, 0x67),
            LL(0xbb, 0xbb, 0xd6, 0xbb, 0xb1, 0x6d, 0x6b, 0x0a),
            LL(0xc1, 0xc1, 0x23, 0xc1, 0x46, 0xe2, 0x9f, 0x87),
            LL(0x53, 0x53, 0x51, 0x53, 0xa2, 0x02, 0xa6, 0xf1),
            LL(0xdc, 0xdc, 0x57, 0xdc, 0xae, 0x8b, 0xa5, 0x72),
            LL(0x0b, 0x0b, 0x2c, 0x0b, 0x58, 0x27, 0x16, 0x53),
            LL(0x9d, 0x9d, 0x4e, 0x9d, 0x9c, 0xd3, 0x27, 0x01),
            LL(0x6c, 0x6c, 0xad, 0x6c, 0x47, 0xc1, 0xd8, 0x2b),
            LL(0x31, 0x31, 0xc4, 0x31, 0x95, 0xf5, 0x62, 0xa4),
            LL(0x74, 0x74, 0xcd, 0x74, 0x87, 0xb9, 0xe8, 0xf3),
            LL(0xf6, 0xf6, 0xff, 0xf6, 0xe3, 0x09, 0xf1, 0x15),
            LL(0x46, 0x46, 0x05, 0x46, 0x0a, 0x43, 0x8c, 0x4c),
            LL(0xac, 0xac, 0x8a, 0xac, 0x09, 0x26, 0x45, 0xa5),
            LL(0x89, 0x89, 0x1e, 0x89, 0x3c, 0x97, 0x0f, 0xb5),
            LL(0x14, 0x14, 0x50, 0x14, 0xa0, 0x44, 0x28, 0xb4),
            LL(0xe1, 0xe1, 0xa3, 0xe1, 0x5b, 0x42, 0xdf, 0xba),
            LL(0x16, 0x16, 0x58, 0x16, 0xb0, 0x4e, 0x2c, 0xa6),
            LL(0x3a, 0x3a, 0xe8, 0x3a, 0xcd, 0xd2, 0x74, 0xf7),
            LL(0x69, 0x69, 0xb9, 0x69, 0x6f, 0xd0, 0xd2, 0x06),
            LL(0x09, 0x09, 0x24, 0x09, 0x48, 0x2d, 0x12, 0x41),
            LL(0x70, 0x70, 0xdd, 0x70, 0xa7, 0xad, 0xe0, 0xd7),
            LL(0xb6, 0xb6, 0xe2, 0xb6, 0xd9, 0x54, 0x71, 0x6f),
            LL(0xd0, 0xd0, 0x67, 0xd0, 0xce, 0xb7, 0xbd, 0x1e),
            LL(0xed, 0xed, 0x93, 0xed, 0x3b, 0x7e, 0xc7, 0xd6),
            LL(0xcc, 0xcc, 0x17, 0xcc, 0x2e, 0xdb, 0x85, 0xe2),
            LL(0x42, 0x42, 0x15, 0x42, 0x2a, 0x57, 0x84, 0x68),
            LL(0x98, 0x98, 0x5a, 0x98, 0xb4, 0xc2, 0x2d, 0x2c),
            LL(0xa4, 0xa4, 0xaa, 0xa4, 0x49, 0x0e, 0x55, 0xed),
            LL(0x28, 0x28, 0xa0, 0x28, 0x5d, 0x88, 0x50, 0x75),
            LL(0x5c, 0x5c, 0x6d, 0x5c, 0xda, 0x31, 0xb8, 0x86),
            LL(0xf8, 0xf8, 0xc7, 0xf8, 0x93, 0x3f, 0xed, 0x6b),
            LL(0x86, 0x86, 0x22, 0x86, 0x44, 0xa4, 0x11, 0xc2),
/* RC points past the 256*N table rows at the round-constant bytes below. */
#define RC      (&(Cx.q[256*N]))
            0x18, 0x23, 0xc6, 0xe8, 0x87, 0xb8, 0x01, 0x4f,
            /* rc[ROUNDS] */
            0x36, 0xa6, 0xd2, 0xf5, 0x79, 0x6f, 0x91, 0x52, 0x60, 0xbc, 0x9b,
            0x8e, 0xa3, 0x0c, 0x7b, 0x35, 0x1d, 0xe0, 0xd7, 0xc2, 0x2e, 0x4b,
            0xfe, 0x57, 0x15, 0x77, 0x37, 0xe5, 0x9f, 0xf0, 0x4a, 0xda, 0x58,
            0xc9, 0x29, 0x0a, 0xb1, 0xa0, 0x6b, 0x85, 0xbd, 0x5d, 0x10, 0xf4,
            0xcb, 0x3e, 0x05, 0x67, 0xe4, 0x27, 0x41, 0x8b, 0xa7, 0x7d, 0x95,
            0xd8, 0xfb, 0xee, 0x7c, 0x66, 0xdd, 0x17, 0x47, 0x9e, 0xca, 0x2d,
            0xbf, 0x07, 0xad, 0x5a, 0x83, 0x33
        }
    };
473
474void whirlpool_block(WHIRLPOOL_CTX *ctx, const void *inp, size_t n)
475{
476    int r;
477    const u8 *p = inp;
478    union {
479        u64 q[8];
480        u8 c[64];
481    } S, K, *H = (void *)ctx->H.q;
482
483#ifdef GO_FOR_MMX
484    GO_FOR_MMX(ctx, inp, n);
485#endif
486    do {
487#ifdef OPENSSL_SMALL_FOOTPRINT
488        u64 L[8];
489        int i;
490
491        for (i = 0; i < 64; i++)
492            S.c[i] = (K.c[i] = H->c[i]) ^ p[i];
493        for (r = 0; r < ROUNDS; r++) {
494            for (i = 0; i < 8; i++) {
495                L[i] = i ? 0 : RC[r];
496                L[i] ^= C0(K, i) ^ C1(K, (i - 1) & 7) ^
497                    C2(K, (i - 2) & 7) ^ C3(K, (i - 3) & 7) ^
498                    C4(K, (i - 4) & 7) ^ C5(K, (i - 5) & 7) ^
499                    C6(K, (i - 6) & 7) ^ C7(K, (i - 7) & 7);
500            }
501            memcpy(K.q, L, 64);
502            for (i = 0; i < 8; i++) {
503                L[i] ^= C0(S, i) ^ C1(S, (i - 1) & 7) ^
504                    C2(S, (i - 2) & 7) ^ C3(S, (i - 3) & 7) ^
505                    C4(S, (i - 4) & 7) ^ C5(S, (i - 5) & 7) ^
506                    C6(S, (i - 6) & 7) ^ C7(S, (i - 7) & 7);
507            }
508            memcpy(S.q, L, 64);
509        }
510        for (i = 0; i < 64; i++)
511            H->c[i] ^= S.c[i] ^ p[i];
512#else
513        u64 L0, L1, L2, L3, L4, L5, L6, L7;
514
515# ifdef STRICT_ALIGNMENT
516        if ((size_t)p & 7) {
517            memcpy(S.c, p, 64);
518            S.q[0] ^= (K.q[0] = H->q[0]);
519            S.q[1] ^= (K.q[1] = H->q[1]);
520            S.q[2] ^= (K.q[2] = H->q[2]);
521            S.q[3] ^= (K.q[3] = H->q[3]);
522            S.q[4] ^= (K.q[4] = H->q[4]);
523            S.q[5] ^= (K.q[5] = H->q[5]);
524            S.q[6] ^= (K.q[6] = H->q[6]);
525            S.q[7] ^= (K.q[7] = H->q[7]);
526        } else
527# endif
528        {
529            const u64 *pa = (const u64 *)p;
530            S.q[0] = (K.q[0] = H->q[0]) ^ pa[0];
531            S.q[1] = (K.q[1] = H->q[1]) ^ pa[1];
532            S.q[2] = (K.q[2] = H->q[2]) ^ pa[2];
533            S.q[3] = (K.q[3] = H->q[3]) ^ pa[3];
534            S.q[4] = (K.q[4] = H->q[4]) ^ pa[4];
535            S.q[5] = (K.q[5] = H->q[5]) ^ pa[5];
536            S.q[6] = (K.q[6] = H->q[6]) ^ pa[6];
537            S.q[7] = (K.q[7] = H->q[7]) ^ pa[7];
538        }
539
540        for (r = 0; r < ROUNDS; r++) {
541# ifdef SMALL_REGISTER_BANK
542            L0 = C0(K, 0) ^ C1(K, 7) ^ C2(K, 6) ^ C3(K, 5) ^
543                C4(K, 4) ^ C5(K, 3) ^ C6(K, 2) ^ C7(K, 1) ^ RC[r];
544            L1 = C0(K, 1) ^ C1(K, 0) ^ C2(K, 7) ^ C3(K, 6) ^
545                C4(K, 5) ^ C5(K, 4) ^ C6(K, 3) ^ C7(K, 2);
546            L2 = C0(K, 2) ^ C1(K, 1) ^ C2(K, 0) ^ C3(K, 7) ^
547                C4(K, 6) ^ C5(K, 5) ^ C6(K, 4) ^ C7(K, 3);
548            L3 = C0(K, 3) ^ C1(K, 2) ^ C2(K, 1) ^ C3(K, 0) ^
549                C4(K, 7) ^ C5(K, 6) ^ C6(K, 5) ^ C7(K, 4);
550            L4 = C0(K, 4) ^ C1(K, 3) ^ C2(K, 2) ^ C3(K, 1) ^
551                C4(K, 0) ^ C5(K, 7) ^ C6(K, 6) ^ C7(K, 5);
552            L5 = C0(K, 5) ^ C1(K, 4) ^ C2(K, 3) ^ C3(K, 2) ^
553                C4(K, 1) ^ C5(K, 0) ^ C6(K, 7) ^ C7(K, 6);
554            L6 = C0(K, 6) ^ C1(K, 5) ^ C2(K, 4) ^ C3(K, 3) ^
555                C4(K, 2) ^ C5(K, 1) ^ C6(K, 0) ^ C7(K, 7);
556            L7 = C0(K, 7) ^ C1(K, 6) ^ C2(K, 5) ^ C3(K, 4) ^
557                C4(K, 3) ^ C5(K, 2) ^ C6(K, 1) ^ C7(K, 0);
558
559            K.q[0] = L0;
560            K.q[1] = L1;
561            K.q[2] = L2;
562            K.q[3] = L3;
563            K.q[4] = L4;
564            K.q[5] = L5;
565            K.q[6] = L6;
566            K.q[7] = L7;
567
568            L0 ^= C0(S, 0) ^ C1(S, 7) ^ C2(S, 6) ^ C3(S, 5) ^
569                C4(S, 4) ^ C5(S, 3) ^ C6(S, 2) ^ C7(S, 1);
570            L1 ^= C0(S, 1) ^ C1(S, 0) ^ C2(S, 7) ^ C3(S, 6) ^
571                C4(S, 5) ^ C5(S, 4) ^ C6(S, 3) ^ C7(S, 2);
572            L2 ^= C0(S, 2) ^ C1(S, 1) ^ C2(S, 0) ^ C3(S, 7) ^
573                C4(S, 6) ^ C5(S, 5) ^ C6(S, 4) ^ C7(S, 3);
574            L3 ^= C0(S, 3) ^ C1(S, 2) ^ C2(S, 1) ^ C3(S, 0) ^
575                C4(S, 7) ^ C5(S, 6) ^ C6(S, 5) ^ C7(S, 4);
576            L4 ^= C0(S, 4) ^ C1(S, 3) ^ C2(S, 2) ^ C3(S, 1) ^
577                C4(S, 0) ^ C5(S, 7) ^ C6(S, 6) ^ C7(S, 5);
578            L5 ^= C0(S, 5) ^ C1(S, 4) ^ C2(S, 3) ^ C3(S, 2) ^
579                C4(S, 1) ^ C5(S, 0) ^ C6(S, 7) ^ C7(S, 6);
580            L6 ^= C0(S, 6) ^ C1(S, 5) ^ C2(S, 4) ^ C3(S, 3) ^
581                C4(S, 2) ^ C5(S, 1) ^ C6(S, 0) ^ C7(S, 7);
582            L7 ^= C0(S, 7) ^ C1(S, 6) ^ C2(S, 5) ^ C3(S, 4) ^
583                C4(S, 3) ^ C5(S, 2) ^ C6(S, 1) ^ C7(S, 0);
584
585            S.q[0] = L0;
586            S.q[1] = L1;
587            S.q[2] = L2;
588            S.q[3] = L3;
589            S.q[4] = L4;
590            S.q[5] = L5;
591            S.q[6] = L6;
592            S.q[7] = L7;
593# else
594            L0 = C0(K, 0);
595            L1 = C1(K, 0);
596            L2 = C2(K, 0);
597            L3 = C3(K, 0);
598            L4 = C4(K, 0);
599            L5 = C5(K, 0);
600            L6 = C6(K, 0);
601            L7 = C7(K, 0);
602            L0 ^= RC[r];
603
604            L1 ^= C0(K, 1);
605            L2 ^= C1(K, 1);
606            L3 ^= C2(K, 1);
607            L4 ^= C3(K, 1);
608            L5 ^= C4(K, 1);
609            L6 ^= C5(K, 1);
610            L7 ^= C6(K, 1);
611            L0 ^= C7(K, 1);
612
613            L2 ^= C0(K, 2);
614            L3 ^= C1(K, 2);
615            L4 ^= C2(K, 2);
616            L5 ^= C3(K, 2);
617            L6 ^= C4(K, 2);
618            L7 ^= C5(K, 2);
619            L0 ^= C6(K, 2);
620            L1 ^= C7(K, 2);
621
622            L3 ^= C0(K, 3);
623            L4 ^= C1(K, 3);
624            L5 ^= C2(K, 3);
625            L6 ^= C3(K, 3);
626            L7 ^= C4(K, 3);
627            L0 ^= C5(K, 3);
628            L1 ^= C6(K, 3);
629            L2 ^= C7(K, 3);
630
631            L4 ^= C0(K, 4);
632            L5 ^= C1(K, 4);
633            L6 ^= C2(K, 4);
634            L7 ^= C3(K, 4);
635            L0 ^= C4(K, 4);
636            L1 ^= C5(K, 4);
637            L2 ^= C6(K, 4);
638            L3 ^= C7(K, 4);
639
640            L5 ^= C0(K, 5);
641            L6 ^= C1(K, 5);
642            L7 ^= C2(K, 5);
643            L0 ^= C3(K, 5);
644            L1 ^= C4(K, 5);
645            L2 ^= C5(K, 5);
646            L3 ^= C6(K, 5);
647            L4 ^= C7(K, 5);
648
649            L6 ^= C0(K, 6);
650            L7 ^= C1(K, 6);
651            L0 ^= C2(K, 6);
652            L1 ^= C3(K, 6);
653            L2 ^= C4(K, 6);
654            L3 ^= C5(K, 6);
655            L4 ^= C6(K, 6);
656            L5 ^= C7(K, 6);
657
658            L7 ^= C0(K, 7);
659            L0 ^= C1(K, 7);
660            L1 ^= C2(K, 7);
661            L2 ^= C3(K, 7);
662            L3 ^= C4(K, 7);
663            L4 ^= C5(K, 7);
664            L5 ^= C6(K, 7);
665            L6 ^= C7(K, 7);
666
667            K.q[0] = L0;
668            K.q[1] = L1;
669            K.q[2] = L2;
670            K.q[3] = L3;
671            K.q[4] = L4;
672            K.q[5] = L5;
673            K.q[6] = L6;
674            K.q[7] = L7;
675
676            L0 ^= C0(S, 0);
677            L1 ^= C1(S, 0);
678            L2 ^= C2(S, 0);
679            L3 ^= C3(S, 0);
680            L4 ^= C4(S, 0);
681            L5 ^= C5(S, 0);
682            L6 ^= C6(S, 0);
683            L7 ^= C7(S, 0);
684
685            L1 ^= C0(S, 1);
686            L2 ^= C1(S, 1);
687            L3 ^= C2(S, 1);
688            L4 ^= C3(S, 1);
689            L5 ^= C4(S, 1);
690            L6 ^= C5(S, 1);
691            L7 ^= C6(S, 1);
692            L0 ^= C7(S, 1);
693
694            L2 ^= C0(S, 2);
695            L3 ^= C1(S, 2);
696            L4 ^= C2(S, 2);
697            L5 ^= C3(S, 2);
698            L6 ^= C4(S, 2);
699            L7 ^= C5(S, 2);
700            L0 ^= C6(S, 2);
701            L1 ^= C7(S, 2);
702
703            L3 ^= C0(S, 3);
704            L4 ^= C1(S, 3);
705            L5 ^= C2(S, 3);
706            L6 ^= C3(S, 3);
707            L7 ^= C4(S, 3);
708            L0 ^= C5(S, 3);
709            L1 ^= C6(S, 3);
710            L2 ^= C7(S, 3);
711
712            L4 ^= C0(S, 4);
713            L5 ^= C1(S, 4);
714            L6 ^= C2(S, 4);
715            L7 ^= C3(S, 4);
716            L0 ^= C4(S, 4);
717            L1 ^= C5(S, 4);
718            L2 ^= C6(S, 4);
719            L3 ^= C7(S, 4);
720
721            L5 ^= C0(S, 5);
722            L6 ^= C1(S, 5);
723            L7 ^= C2(S, 5);
724            L0 ^= C3(S, 5);
725            L1 ^= C4(S, 5);
726            L2 ^= C5(S, 5);
727            L3 ^= C6(S, 5);
728            L4 ^= C7(S, 5);
729
730            L6 ^= C0(S, 6);
731            L7 ^= C1(S, 6);
732            L0 ^= C2(S, 6);
733            L1 ^= C3(S, 6);
734            L2 ^= C4(S, 6);
735            L3 ^= C5(S, 6);
736            L4 ^= C6(S, 6);
737            L5 ^= C7(S, 6);
738
739            L7 ^= C0(S, 7);
740            L0 ^= C1(S, 7);
741            L1 ^= C2(S, 7);
742            L2 ^= C3(S, 7);
743            L3 ^= C4(S, 7);
744            L4 ^= C5(S, 7);
745            L5 ^= C6(S, 7);
746            L6 ^= C7(S, 7);
747
748            S.q[0] = L0;
749            S.q[1] = L1;
750            S.q[2] = L2;
751            S.q[3] = L3;
752            S.q[4] = L4;
753            S.q[5] = L5;
754            S.q[6] = L6;
755            S.q[7] = L7;
756# endif
757        }
758
759# ifdef STRICT_ALIGNMENT
760        if ((size_t)p & 7) {
761            int i;
762            for (i = 0; i < 64; i++)
763                H->c[i] ^= S.c[i] ^ p[i];
764        } else
765# endif
766        {
767            const u64 *pa = (const u64 *)p;
768            H->q[0] ^= S.q[0] ^ pa[0];
769            H->q[1] ^= S.q[1] ^ pa[1];
770            H->q[2] ^= S.q[2] ^ pa[2];
771            H->q[3] ^= S.q[3] ^ pa[3];
772            H->q[4] ^= S.q[4] ^ pa[4];
773            H->q[5] ^= S.q[5] ^ pa[5];
774            H->q[6] ^= S.q[6] ^ pa[6];
775            H->q[7] ^= S.q[7] ^ pa[7];
776        }
777#endif
778        p += 64;
779    } while (--n);
780}
781