/*
   BLAKE2 reference source code package - optimized C implementations

   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>

   To the extent possible under law, the author(s) have dedicated all copyright
   and related and neighboring rights to this software to the public domain
   worldwide. This software is distributed without any warranty.

   You should have received a copy of the CC0 Public Domain Dedication along with
   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#pragma once
#ifndef BLAKE2S_LOAD_SSE41_H
#define BLAKE2S_LOAD_SSE41_H

/*
   LOAD_MSG_<r>_<i>(buf) assigns to `buf` a vector assembled from m0..m3 with
   SSE shuffle/blend/unpack/byte-shift intrinsics.

   Requirements on the expansion site (none of these are defined here):
     - m0, m1, m2, m3 : source vectors, read only.
     - t0, t1, t2     : scratch vectors; every macro except LOAD_MSG_0_* overwrites
                        one or more of them.
     - TOI(), TOF()   : used only by LOAD_MSG_0_* around _mm_shuffle_ps;
                        presumably integer<->float vector casts - confirm against
                        their definition.
     - the intrinsic declarations, e.g. _mm_blend_epi16, must be visible.

   NOTE(review): <r> (0..9) looks like the round index and <i> (1..4) the load
   step within that round - confirm against the round macro that pastes these
   names together.

   Every expansion is a plain statement sequence that already ends in ';'.
   Writing `LOAD_MSG_0_1(x);` therefore yields an extra empty statement, and the
   multi-statement macros are not safe as the body of an unbraced if/else/loop.
   The trailing ';' is kept so that existing call sites expand unchanged.
*/

#define LOAD_MSG_0_1(buf) \
  buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(2,0,2,0)));

#define LOAD_MSG_0_2(buf) \
  buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(3,1,3,1)));

#define LOAD_MSG_0_3(buf) \
  buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(2,0,2,0)));

#define LOAD_MSG_0_4(buf) \
  buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(3,1,3,1)));

#define LOAD_MSG_1_1(buf) \
  t0 = _mm_blend_epi16(m1, m2, 0x0C); \
  t1 = _mm_slli_si128(m3, 4); \
  t2 = _mm_blend_epi16(t0, t1, 0xF0); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,0,3));

#define LOAD_MSG_1_2(buf) \
  t0 = _mm_shuffle_epi32(m2, _MM_SHUFFLE(0,0,2,0)); \
  t1 = _mm_blend_epi16(m1, m3, 0xC0); \
  t2 = _mm_blend_epi16(t0, t1, 0xF0); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));

#define LOAD_MSG_1_3(buf) \
  t0 = _mm_slli_si128(m1, 4); \
  t1 = _mm_blend_epi16(m2, t0, 0x30); \
  t2 = _mm_blend_epi16(m0, t1, 0xF0); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));

#define LOAD_MSG_1_4(buf) \
  t0 = _mm_unpackhi_epi32(m0, m1); \
  t1 = _mm_slli_si128(m3, 4); \
  t2 = _mm_blend_epi16(t0, t1, 0x0C); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));

#define LOAD_MSG_2_1(buf) \
  t0 = _mm_unpackhi_epi32(m2, m3); \
  t1 = _mm_blend_epi16(m3, m1, 0x0C); \
  t2 = _mm_blend_epi16(t0, t1, 0x0F); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));

#define LOAD_MSG_2_2(buf) \
  t0 = _mm_unpacklo_epi32(m2, m0); \
  t1 = _mm_blend_epi16(t0, m0, 0xF0); \
  t2 = _mm_slli_si128(m3, 8); \
  buf = _mm_blend_epi16(t1, t2, 0xC0);

#define LOAD_MSG_2_3(buf) \
  t0 = _mm_blend_epi16(m0, m2, 0x3C); \
  t1 = _mm_srli_si128(m1, 12); \
  t2 = _mm_blend_epi16(t0, t1, 0x03); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,3,2));

#define LOAD_MSG_2_4(buf) \
  t0 = _mm_slli_si128(m3, 4); \
  t1 = _mm_blend_epi16(m0, m1, 0x33); \
  t2 = _mm_blend_epi16(t1, t0, 0xC0); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,1,2,3));

#define LOAD_MSG_3_1(buf) \
  t0 = _mm_unpackhi_epi32(m0, m1); \
  t1 = _mm_unpackhi_epi32(t0, m2); \
  t2 = _mm_blend_epi16(t1, m3, 0x0C); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));

#define LOAD_MSG_3_2(buf) \
  t0 = _mm_slli_si128(m2, 8); \
  t1 = _mm_blend_epi16(m3, m0, 0x0C); \
  t2 = _mm_blend_epi16(t1, t0, 0xC0); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));

#define LOAD_MSG_3_3(buf) \
  t0 = _mm_blend_epi16(m0, m1, 0x0F); \
  t1 = _mm_blend_epi16(t0, m3, 0xC0); \
  buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));

#define LOAD_MSG_3_4(buf) \
  t0 = _mm_unpacklo_epi32(m0, m2); \
  t1 = _mm_unpackhi_epi32(m1, m2); \
  buf = _mm_unpacklo_epi64(t1, t0);

#define LOAD_MSG_4_1(buf) \
  t0 = _mm_unpacklo_epi64(m1, m2); \
  t1 = _mm_unpackhi_epi64(m0, m2); \
  t2 = _mm_blend_epi16(t0, t1, 0x33); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));

#define LOAD_MSG_4_2(buf) \
  t0 = _mm_unpackhi_epi64(m1, m3); \
  t1 = _mm_unpacklo_epi64(m0, m1); \
  buf = _mm_blend_epi16(t0, t1, 0x33);

#define LOAD_MSG_4_3(buf) \
  t0 = _mm_unpackhi_epi64(m3, m1); \
  t1 = _mm_unpackhi_epi64(m2, m0); \
  buf = _mm_blend_epi16(t1, t0, 0x33);

#define LOAD_MSG_4_4(buf) \
  t0 = _mm_blend_epi16(m0, m2, 0x03); \
  t1 = _mm_slli_si128(t0, 8); \
  t2 = _mm_blend_epi16(t1, m3, 0x0F); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,0,3));

#define LOAD_MSG_5_1(buf) \
  t0 = _mm_unpackhi_epi32(m0, m1); \
  t1 = _mm_unpacklo_epi32(m0, m2); \
  buf = _mm_unpacklo_epi64(t0, t1);

#define LOAD_MSG_5_2(buf) \
  t0 = _mm_srli_si128(m2, 4); \
  t1 = _mm_blend_epi16(m0, m3, 0x03); \
  buf = _mm_blend_epi16(t1, t0, 0x3C);

#define LOAD_MSG_5_3(buf) \
  t0 = _mm_blend_epi16(m1, m0, 0x0C); \
  t1 = _mm_srli_si128(m3, 4); \
  t2 = _mm_blend_epi16(t0, t1, 0x30); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,3,0));

#define LOAD_MSG_5_4(buf) \
  t0 = _mm_unpacklo_epi64(m1, m2); \
  t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(0,2,0,1)); \
  buf = _mm_blend_epi16(t0, t1, 0x33);

#define LOAD_MSG_6_1(buf) \
  t0 = _mm_slli_si128(m1, 12); \
  t1 = _mm_blend_epi16(m0, m3, 0x33); \
  buf = _mm_blend_epi16(t1, t0, 0xC0);

#define LOAD_MSG_6_2(buf) \
  t0 = _mm_blend_epi16(m3, m2, 0x30); \
  t1 = _mm_srli_si128(m1, 4); \
  t2 = _mm_blend_epi16(t0, t1, 0x03); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,3,0));

#define LOAD_MSG_6_3(buf) \
  t0 = _mm_unpacklo_epi64(m0, m2); \
  t1 = _mm_srli_si128(m1, 4); \
  buf = _mm_shuffle_epi32(_mm_blend_epi16(t0, t1, 0x0C), _MM_SHUFFLE(2,3,1,0));

#define LOAD_MSG_6_4(buf) \
  t0 = _mm_unpackhi_epi32(m1, m2); \
  t1 = _mm_unpackhi_epi64(m0, t0); \
  buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));

#define LOAD_MSG_7_1(buf) \
  t0 = _mm_unpackhi_epi32(m0, m1); \
  t1 = _mm_blend_epi16(t0, m3, 0x0F); \
  buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(2,0,3,1));

#define LOAD_MSG_7_2(buf) \
  t0 = _mm_blend_epi16(m2, m3, 0x30); \
  t1 = _mm_srli_si128(m0, 4); \
  t2 = _mm_blend_epi16(t0, t1, 0x03); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,2,3));

#define LOAD_MSG_7_3(buf) \
  t0 = _mm_unpackhi_epi64(m0, m3); \
  t1 = _mm_unpacklo_epi64(m1, m2); \
  t2 = _mm_blend_epi16(t0, t1, 0x3C); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,2,3,1));

#define LOAD_MSG_7_4(buf) \
  t0 = _mm_unpacklo_epi32(m0, m1); \
  t1 = _mm_unpackhi_epi32(m1, m2); \
  buf = _mm_unpacklo_epi64(t0, t1);

/* Unlike its siblings, this one finishes with a 16-bit shuffle of the high half. */
#define LOAD_MSG_8_1(buf) \
  t0 = _mm_unpackhi_epi32(m1, m3); \
  t1 = _mm_unpacklo_epi64(t0, m0); \
  t2 = _mm_blend_epi16(t1, m2, 0xC0); \
  buf = _mm_shufflehi_epi16(t2, _MM_SHUFFLE(1,0,3,2));

#define LOAD_MSG_8_2(buf) \
  t0 = _mm_unpackhi_epi32(m0, m3); \
  t1 = _mm_blend_epi16(m2, t0, 0xF0); \
  buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(0,2,1,3));

#define LOAD_MSG_8_3(buf) \
  t0 = _mm_blend_epi16(m2, m0, 0x0C); \
  t1 = _mm_slli_si128(t0, 4); \
  buf = _mm_blend_epi16(t1, m3, 0x0F);

#define LOAD_MSG_8_4(buf) \
  t0 = _mm_blend_epi16(m1, m0, 0x30); \
  buf = _mm_shuffle_epi32(t0, _MM_SHUFFLE(1,0,3,2));

#define LOAD_MSG_9_1(buf) \
  t0 = _mm_blend_epi16(m0, m2, 0x03); \
  t1 = _mm_blend_epi16(m1, m2, 0x30); \
  t2 = _mm_blend_epi16(t1, t0, 0x0F); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,3,0,2));

#define LOAD_MSG_9_2(buf) \
  t0 = _mm_slli_si128(m0, 4); \
  t1 = _mm_blend_epi16(m1, t0, 0xC0); \
  buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(1,2,0,3));

#define LOAD_MSG_9_3(buf) \
  t0 = _mm_unpackhi_epi32(m0, m3); \
  t1 = _mm_unpacklo_epi32(m2, m3); \
  t2 = _mm_unpackhi_epi64(t0, t1); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,0,2,1));

#define LOAD_MSG_9_4(buf) \
  t0 = _mm_blend_epi16(m3, m2, 0xC0); \
  t1 = _mm_unpacklo_epi32(m0, m3); \
  t2 = _mm_blend_epi16(t0, t1, 0x0F); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,1,2,3));

#endif /* BLAKE2S_LOAD_SSE41_H */