1/*******************************************************************************
2* Copyright (c) 2013, Intel Corporation
3*
4* All rights reserved.
5*
6* Redistribution and use in source and binary forms, with or without
7* modification, are permitted provided that the following conditions are
8* met:
9*
10* * Redistributions of source code must retain the above copyright
11*   notice, this list of conditions and the following disclaimer.
12*
13* * Redistributions in binary form must reproduce the above copyright
14*   notice, this list of conditions and the following disclaimer in the
15*   documentation and/or other materials provided with the
16*   distribution.
17*
18* * Neither the name of the Intel Corporation nor the names of its
19*   contributors may be used to endorse or promote products derived from
20*   this software without specific prior written permission.
21*
22*
23* THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
24* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
26* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
27* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
31* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34********************************************************************************
35*
36* Intel SHA Extensions optimized implementation of a SHA-1 update function
37*
38* The function takes a pointer to the current hash values, a pointer to the
39* input data, and a number of 64 byte blocks to process.  Once all blocks have
40* been processed, the digest pointer is  updated with the resulting hash value.
41* The function only processes complete blocks, there is no functionality to
42* store partial blocks.  All message padding and hash value initialization must
43* be done outside the update function.
44*
45* The indented lines in the loop are instructions related to rounds processing.
46* The non-indented lines are instructions related to the message schedule.
47*
48* Author: Sean Gulley <sean.m.gulley@intel.com>
49* Date:   July 2013
50*
51********************************************************************************
52*
* Example compiler command line:
54* icc intel_sha_extensions_sha1_intrinsic.c
55* gcc -msha -msse4 intel_sha_extensions_sha1_intrinsic.c
56*
57*******************************************************************************/
58#include <sys/cdefs.h>
59__FBSDID("$FreeBSD$");
60
61#include <sys/types.h>
62#include <crypto/aesni/aesni_os.h>
63#include <crypto/aesni/sha_sse.h>
64
65#include <immintrin.h>
66
67void intel_sha1_step(uint32_t *digest, const char *data, uint32_t num_blks) {
68   __m128i abcd, e0, e1;
69   __m128i abcd_save, e_save;
70   __m128i msg0, msg1, msg2, msg3;
71   __m128i shuf_mask, e_mask;
72
73#if 0
74   e_mask    = _mm_set_epi64x(0xFFFFFFFF00000000ull, 0x0000000000000000ull);
75#else
76   (void)e_mask;
77   e0        = _mm_set_epi64x(0, 0);
78#endif
79   shuf_mask = _mm_set_epi64x(0x0001020304050607ull, 0x08090a0b0c0d0e0full);
80
81   // Load initial hash values
82   abcd      = _mm_loadu_si128((__m128i*) digest);
83   e0        = _mm_insert_epi32(e0, *(digest+4), 3);
84   abcd      = _mm_shuffle_epi32(abcd, 0x1B);
85#if 0
86   e0        = _mm_and_si128(e0, e_mask);
87#endif
88
89   while (num_blks > 0) {
90      // Save hash values for addition after rounds
91      abcd_save = abcd;
92      e_save    = e0;
93
94      // Rounds 0-3
95      msg0 = _mm_loadu_si128((const __m128i*) data);
96      msg0 = _mm_shuffle_epi8(msg0, shuf_mask);
97         e0   = _mm_add_epi32(e0, msg0);
98         e1   = abcd;
99         abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
100
101      // Rounds 4-7
102      msg1 = _mm_loadu_si128((const __m128i*) (data+16));
103      msg1 = _mm_shuffle_epi8(msg1, shuf_mask);
104         e1   = _mm_sha1nexte_epu32(e1, msg1);
105         e0   = abcd;
106         abcd = _mm_sha1rnds4_epu32(abcd, e1, 0);
107      msg0 = _mm_sha1msg1_epu32(msg0, msg1);
108
109      // Rounds 8-11
110      msg2 = _mm_loadu_si128((const __m128i*) (data+32));
111      msg2 = _mm_shuffle_epi8(msg2, shuf_mask);
112         e0   = _mm_sha1nexte_epu32(e0, msg2);
113         e1   = abcd;
114         abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
115      msg1 = _mm_sha1msg1_epu32(msg1, msg2);
116      msg0 = _mm_xor_si128(msg0, msg2);
117
118      // Rounds 12-15
119      msg3 = _mm_loadu_si128((const __m128i*) (data+48));
120      msg3 = _mm_shuffle_epi8(msg3, shuf_mask);
121         e1   = _mm_sha1nexte_epu32(e1, msg3);
122         e0   = abcd;
123      msg0 = _mm_sha1msg2_epu32(msg0, msg3);
124         abcd = _mm_sha1rnds4_epu32(abcd, e1, 0);
125      msg2 = _mm_sha1msg1_epu32(msg2, msg3);
126      msg1 = _mm_xor_si128(msg1, msg3);
127
128      // Rounds 16-19
129         e0   = _mm_sha1nexte_epu32(e0, msg0);
130         e1   = abcd;
131      msg1 = _mm_sha1msg2_epu32(msg1, msg0);
132         abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
133      msg3 = _mm_sha1msg1_epu32(msg3, msg0);
134      msg2 = _mm_xor_si128(msg2, msg0);
135
136      // Rounds 20-23
137         e1   = _mm_sha1nexte_epu32(e1, msg1);
138         e0   = abcd;
139      msg2 = _mm_sha1msg2_epu32(msg2, msg1);
140         abcd = _mm_sha1rnds4_epu32(abcd, e1, 1);
141      msg0 = _mm_sha1msg1_epu32(msg0, msg1);
142      msg3 = _mm_xor_si128(msg3, msg1);
143
144      // Rounds 24-27
145         e0   = _mm_sha1nexte_epu32(e0, msg2);
146         e1   = abcd;
147      msg3 = _mm_sha1msg2_epu32(msg3, msg2);
148         abcd = _mm_sha1rnds4_epu32(abcd, e0, 1);
149      msg1 = _mm_sha1msg1_epu32(msg1, msg2);
150      msg0 = _mm_xor_si128(msg0, msg2);
151
152      // Rounds 28-31
153         e1   = _mm_sha1nexte_epu32(e1, msg3);
154         e0   = abcd;
155      msg0 = _mm_sha1msg2_epu32(msg0, msg3);
156         abcd = _mm_sha1rnds4_epu32(abcd, e1, 1);
157      msg2 = _mm_sha1msg1_epu32(msg2, msg3);
158      msg1 = _mm_xor_si128(msg1, msg3);
159
160      // Rounds 32-35
161         e0   = _mm_sha1nexte_epu32(e0, msg0);
162         e1   = abcd;
163      msg1 = _mm_sha1msg2_epu32(msg1, msg0);
164         abcd = _mm_sha1rnds4_epu32(abcd, e0, 1);
165      msg3 = _mm_sha1msg1_epu32(msg3, msg0);
166      msg2 = _mm_xor_si128(msg2, msg0);
167
168      // Rounds 36-39
169         e1   = _mm_sha1nexte_epu32(e1, msg1);
170         e0   = abcd;
171      msg2 = _mm_sha1msg2_epu32(msg2, msg1);
172         abcd = _mm_sha1rnds4_epu32(abcd, e1, 1);
173      msg0 = _mm_sha1msg1_epu32(msg0, msg1);
174      msg3 = _mm_xor_si128(msg3, msg1);
175
176      // Rounds 40-43
177         e0   = _mm_sha1nexte_epu32(e0, msg2);
178         e1   = abcd;
179      msg3 = _mm_sha1msg2_epu32(msg3, msg2);
180         abcd = _mm_sha1rnds4_epu32(abcd, e0, 2);
181      msg1 = _mm_sha1msg1_epu32(msg1, msg2);
182      msg0 = _mm_xor_si128(msg0, msg2);
183
184      // Rounds 44-47
185         e1   = _mm_sha1nexte_epu32(e1, msg3);
186         e0   = abcd;
187      msg0 = _mm_sha1msg2_epu32(msg0, msg3);
188         abcd = _mm_sha1rnds4_epu32(abcd, e1, 2);
189      msg2 = _mm_sha1msg1_epu32(msg2, msg3);
190      msg1 = _mm_xor_si128(msg1, msg3);
191
192      // Rounds 48-51
193         e0   = _mm_sha1nexte_epu32(e0, msg0);
194         e1   = abcd;
195      msg1 = _mm_sha1msg2_epu32(msg1, msg0);
196         abcd = _mm_sha1rnds4_epu32(abcd, e0, 2);
197      msg3 = _mm_sha1msg1_epu32(msg3, msg0);
198      msg2 = _mm_xor_si128(msg2, msg0);
199
200      // Rounds 52-55
201         e1   = _mm_sha1nexte_epu32(e1, msg1);
202         e0   = abcd;
203      msg2 = _mm_sha1msg2_epu32(msg2, msg1);
204         abcd = _mm_sha1rnds4_epu32(abcd, e1, 2);
205      msg0 = _mm_sha1msg1_epu32(msg0, msg1);
206      msg3 = _mm_xor_si128(msg3, msg1);
207
208      // Rounds 56-59
209         e0   = _mm_sha1nexte_epu32(e0, msg2);
210         e1   = abcd;
211      msg3 = _mm_sha1msg2_epu32(msg3, msg2);
212         abcd = _mm_sha1rnds4_epu32(abcd, e0, 2);
213      msg1 = _mm_sha1msg1_epu32(msg1, msg2);
214      msg0 = _mm_xor_si128(msg0, msg2);
215
216      // Rounds 60-63
217         e1   = _mm_sha1nexte_epu32(e1, msg3);
218         e0   = abcd;
219      msg0 = _mm_sha1msg2_epu32(msg0, msg3);
220         abcd = _mm_sha1rnds4_epu32(abcd, e1, 3);
221      msg2 = _mm_sha1msg1_epu32(msg2, msg3);
222      msg1 = _mm_xor_si128(msg1, msg3);
223
224      // Rounds 64-67
225         e0   = _mm_sha1nexte_epu32(e0, msg0);
226         e1   = abcd;
227      msg1 = _mm_sha1msg2_epu32(msg1, msg0);
228         abcd = _mm_sha1rnds4_epu32(abcd, e0, 3);
229      msg3 = _mm_sha1msg1_epu32(msg3, msg0);
230      msg2 = _mm_xor_si128(msg2, msg0);
231
232      // Rounds 68-71
233         e1   = _mm_sha1nexte_epu32(e1, msg1);
234         e0   = abcd;
235      msg2 = _mm_sha1msg2_epu32(msg2, msg1);
236         abcd = _mm_sha1rnds4_epu32(abcd, e1, 3);
237      msg3 = _mm_xor_si128(msg3, msg1);
238
239      // Rounds 72-75
240         e0   = _mm_sha1nexte_epu32(e0, msg2);
241         e1   = abcd;
242      msg3 = _mm_sha1msg2_epu32(msg3, msg2);
243         abcd = _mm_sha1rnds4_epu32(abcd, e0, 3);
244
245      // Rounds 76-79
246         e1   = _mm_sha1nexte_epu32(e1, msg3);
247         e0   = abcd;
248         abcd = _mm_sha1rnds4_epu32(abcd, e1, 3);
249
250      // Add current hash values with previously saved
251      e0   = _mm_sha1nexte_epu32(e0, e_save);
252      abcd = _mm_add_epi32(abcd, abcd_save);
253
254      data += 64;
255      num_blks--;
256   }
257
258   abcd = _mm_shuffle_epi32(abcd, 0x1B);
259   _mm_store_si128((__m128i*) digest, abcd);
260   *(digest+4) = _mm_extract_epi32(e0, 3);
261}
262
263