/*
   BLAKE2 reference source code package - optimized C implementations

   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>

   To the extent possible under law, the author(s) have dedicated all copyright
   and related and neighboring rights to this software to the public domain
   worldwide. This software is distributed without any warranty.

   You should have received a copy of the CC0 Public Domain Dedication along with
   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#pragma once
#ifndef BLAKE2B_LOAD_SSE41_H
#define BLAKE2B_LOAD_SSE41_H

/*
   LOAD_MSG_<r>_<i>(b0, b1)

   Each macro assigns two values to the lvalues b0 and b1.  Every value is
   built from the identifiers m0 .. m7, which are NOT parameters: they must be
   in scope at the expansion site.  This header includes nothing itself, so
   the including file must also provide the intrinsics used below.

   NOTE(review): <r> looks like the round index (0..11) and <i> the position
   of the load within that round (1..4); m0..m7 presumably hold the sixteen
   64-bit message words in pairs (m<k> = words 2k and 2k+1).  Confirm against
   the code that expands these macros.

   Lane selection performed by the intrinsics as used here, written as
   [low 64 bits, high 64 bits]:

     _mm_unpacklo_epi64(x, y)                    -> [x.lo, y.lo]
     _mm_unpackhi_epi64(x, y)                    -> [x.hi, y.hi]
     _mm_alignr_epi8(x, y, 8)                    -> [y.hi, x.lo]
     _mm_blend_epi16(x, y, 0xF0)                 -> [x.lo, y.hi]
     _mm_shuffle_epi32(x, _MM_SHUFFLE(1,0,3,2))  -> [x.hi, x.lo]

   All macros are wrapped in do { ... } while (0) so that an invocation
   followed by ';' behaves as a single statement.
*/

/* ---- <r> = 0 ---- */

#define LOAD_MSG_0_1(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_unpacklo_epi64(m0, m1);                      \
    (b1) = _mm_unpacklo_epi64(m2, m3);                      \
  } while (0)

#define LOAD_MSG_0_2(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_unpackhi_epi64(m0, m1);                      \
    (b1) = _mm_unpackhi_epi64(m2, m3);                      \
  } while (0)

#define LOAD_MSG_0_3(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_unpacklo_epi64(m4, m5);                      \
    (b1) = _mm_unpacklo_epi64(m6, m7);                      \
  } while (0)

#define LOAD_MSG_0_4(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_unpackhi_epi64(m4, m5);                      \
    (b1) = _mm_unpackhi_epi64(m6, m7);                      \
  } while (0)

/* ---- <r> = 1 ---- */

#define LOAD_MSG_1_1(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_unpacklo_epi64(m7, m2);                      \
    (b1) = _mm_unpackhi_epi64(m4, m6);                      \
  } while (0)

#define LOAD_MSG_1_2(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_unpacklo_epi64(m5, m4);                      \
    (b1) = _mm_alignr_epi8(m3, m7, 8);                      \
  } while (0)

#define LOAD_MSG_1_3(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2));     \
    (b1) = _mm_unpackhi_epi64(m5, m2);                      \
  } while (0)

#define LOAD_MSG_1_4(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_unpacklo_epi64(m6, m1);                      \
    (b1) = _mm_unpackhi_epi64(m3, m1);                      \
  } while (0)

/* ---- <r> = 2 ---- */

#define LOAD_MSG_2_1(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_alignr_epi8(m6, m5, 8);                      \
    (b1) = _mm_unpackhi_epi64(m2, m7);                      \
  } while (0)

#define LOAD_MSG_2_2(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_unpacklo_epi64(m4, m0);                      \
    (b1) = _mm_blend_epi16(m1, m6, 0xF0);                   \
  } while (0)

#define LOAD_MSG_2_3(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_blend_epi16(m5, m1, 0xF0);                   \
    (b1) = _mm_unpackhi_epi64(m3, m4);                      \
  } while (0)

#define LOAD_MSG_2_4(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_unpacklo_epi64(m7, m3);                      \
    (b1) = _mm_alignr_epi8(m2, m0, 8);                      \
  } while (0)

/* ---- <r> = 3 ---- */

#define LOAD_MSG_3_1(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_unpackhi_epi64(m3, m1);                      \
    (b1) = _mm_unpackhi_epi64(m6, m5);                      \
  } while (0)

#define LOAD_MSG_3_2(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_unpackhi_epi64(m4, m0);                      \
    (b1) = _mm_unpacklo_epi64(m6, m7);                      \
  } while (0)

#define LOAD_MSG_3_3(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_blend_epi16(m1, m2, 0xF0);                   \
    (b1) = _mm_blend_epi16(m2, m7, 0xF0);                   \
  } while (0)

#define LOAD_MSG_3_4(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_unpacklo_epi64(m3, m5);                      \
    (b1) = _mm_unpacklo_epi64(m0, m4);                      \
  } while (0)

/* ---- <r> = 4 ---- */

#define LOAD_MSG_4_1(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_unpackhi_epi64(m4, m2);                      \
    (b1) = _mm_unpacklo_epi64(m1, m5);                      \
  } while (0)

#define LOAD_MSG_4_2(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_blend_epi16(m0, m3, 0xF0);                   \
    (b1) = _mm_blend_epi16(m2, m7, 0xF0);                   \
  } while (0)

#define LOAD_MSG_4_3(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_blend_epi16(m7, m5, 0xF0);                   \
    (b1) = _mm_blend_epi16(m3, m1, 0xF0);                   \
  } while (0)

#define LOAD_MSG_4_4(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_alignr_epi8(m6, m0, 8);                      \
    (b1) = _mm_blend_epi16(m4, m6, 0xF0);                   \
  } while (0)

/* ---- <r> = 5 ---- */

#define LOAD_MSG_5_1(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_unpacklo_epi64(m1, m3);                      \
    (b1) = _mm_unpacklo_epi64(m0, m4);                      \
  } while (0)

#define LOAD_MSG_5_2(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_unpacklo_epi64(m6, m5);                      \
    (b1) = _mm_unpackhi_epi64(m5, m1);                      \
  } while (0)

#define LOAD_MSG_5_3(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_blend_epi16(m2, m3, 0xF0);                   \
    (b1) = _mm_unpackhi_epi64(m7, m0);                      \
  } while (0)

#define LOAD_MSG_5_4(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_unpackhi_epi64(m6, m2);                      \
    (b1) = _mm_blend_epi16(m7, m4, 0xF0);                   \
  } while (0)

/* ---- <r> = 6 ---- */

#define LOAD_MSG_6_1(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_blend_epi16(m6, m0, 0xF0);                   \
    (b1) = _mm_unpacklo_epi64(m7, m2);                      \
  } while (0)

#define LOAD_MSG_6_2(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_unpackhi_epi64(m2, m7);                      \
    (b1) = _mm_alignr_epi8(m5, m6, 8);                      \
  } while (0)

#define LOAD_MSG_6_3(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_unpacklo_epi64(m0, m3);                      \
    (b1) = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2));     \
  } while (0)

#define LOAD_MSG_6_4(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_unpackhi_epi64(m3, m1);                      \
    (b1) = _mm_blend_epi16(m1, m5, 0xF0);                   \
  } while (0)

/* ---- <r> = 7 ---- */

#define LOAD_MSG_7_1(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_unpackhi_epi64(m6, m3);                      \
    (b1) = _mm_blend_epi16(m6, m1, 0xF0);                   \
  } while (0)

#define LOAD_MSG_7_2(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_alignr_epi8(m7, m5, 8);                      \
    (b1) = _mm_unpackhi_epi64(m0, m4);                      \
  } while (0)

#define LOAD_MSG_7_3(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_unpackhi_epi64(m2, m7);                      \
    (b1) = _mm_unpacklo_epi64(m4, m1);                      \
  } while (0)

#define LOAD_MSG_7_4(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_unpacklo_epi64(m0, m2);                      \
    (b1) = _mm_unpacklo_epi64(m3, m5);                      \
  } while (0)

/* ---- <r> = 8 ---- */

#define LOAD_MSG_8_1(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_unpacklo_epi64(m3, m7);                      \
    (b1) = _mm_alignr_epi8(m0, m5, 8);                      \
  } while (0)

#define LOAD_MSG_8_2(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_unpackhi_epi64(m7, m4);                      \
    (b1) = _mm_alignr_epi8(m4, m1, 8);                      \
  } while (0)

/* b0 takes m6 as-is: both lanes are already in the wanted order. */
#define LOAD_MSG_8_3(b0, b1)                                \
  do {                                                      \
    (b0) = m6;                                              \
    (b1) = _mm_alignr_epi8(m5, m0, 8);                      \
  } while (0)

/* b1 takes m2 as-is: both lanes are already in the wanted order. */
#define LOAD_MSG_8_4(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_blend_epi16(m1, m3, 0xF0);                   \
    (b1) = m2;                                              \
  } while (0)

/* ---- <r> = 9 ---- */

#define LOAD_MSG_9_1(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_unpacklo_epi64(m5, m4);                      \
    (b1) = _mm_unpackhi_epi64(m3, m0);                      \
  } while (0)

#define LOAD_MSG_9_2(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_unpacklo_epi64(m1, m2);                      \
    (b1) = _mm_blend_epi16(m3, m2, 0xF0);                   \
  } while (0)

#define LOAD_MSG_9_3(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_unpackhi_epi64(m7, m4);                      \
    (b1) = _mm_unpackhi_epi64(m1, m6);                      \
  } while (0)

#define LOAD_MSG_9_4(b0, b1)                                \
  do {                                                      \
    (b0) = _mm_alignr_epi8(m7, m5, 8);                      \
    (b1) = _mm_unpacklo_epi64(m6, m0);                      \
  } while (0)

/*
   <r> = 10 and <r> = 11 select exactly the same lanes as <r> = 0 and <r> = 1,
   so they are defined as aliases instead of being spelled out a second time.
*/

#define LOAD_MSG_10_1(b0, b1) LOAD_MSG_0_1(b0, b1)
#define LOAD_MSG_10_2(b0, b1) LOAD_MSG_0_2(b0, b1)
#define LOAD_MSG_10_3(b0, b1) LOAD_MSG_0_3(b0, b1)
#define LOAD_MSG_10_4(b0, b1) LOAD_MSG_0_4(b0, b1)

#define LOAD_MSG_11_1(b0, b1) LOAD_MSG_1_1(b0, b1)
#define LOAD_MSG_11_2(b0, b1) LOAD_MSG_1_2(b0, b1)
#define LOAD_MSG_11_3(b0, b1) LOAD_MSG_1_3(b0, b1)
#define LOAD_MSG_11_4(b0, b1) LOAD_MSG_1_4(b0, b1)

#endif /* BLAKE2B_LOAD_SSE41_H */