/* * Copyright (c) 2017 Thomas Pornin * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #define BR_ENABLE_INTRINSICS 1 #include "inner.h" #if BR_SSE2 /* * This file contains a ChaCha20 implementation that leverages SSE2 * opcodes for better performance. */ /* see bearssl_block.h */ br_chacha20_run br_chacha20_sse2_get(void) { /* * If using 64-bit mode, then SSE2 opcodes should be automatically * available, since they are part of the ABI. * * In 32-bit mode, we use CPUID to detect the SSE2 feature. */ #if BR_amd64 return &br_chacha20_sse2_run; #else /* * SSE2 support is indicated by bit 26 in EDX. */ if (br_cpuid(0, 0, 0, 0x04000000)) { return &br_chacha20_sse2_run; } else { return 0; } #endif } BR_TARGETS_X86_UP /* see bearssl_block.h */ BR_TARGET("sse2") uint32_t br_chacha20_sse2_run(const void *key, const void *iv, uint32_t cc, void *data, size_t len) { unsigned char *buf; uint32_t ivtmp[4]; __m128i kw0, kw1; __m128i iw, cw; __m128i one; static const uint32_t CW[] = { 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 }; buf = data; kw0 = _mm_loadu_si128(key); kw1 = _mm_loadu_si128((const void *)((const unsigned char *)key + 16)); ivtmp[0] = cc; memcpy(ivtmp + 1, iv, 12); iw = _mm_loadu_si128((const void *)ivtmp); cw = _mm_loadu_si128((const void *)CW); one = _mm_set_epi32(0, 0, 0, 1); while (len > 0) { /* * sj contains state words 4*j to 4*j+3. */ __m128i s0, s1, s2, s3; int i; s0 = cw; s1 = kw0; s2 = kw1; s3 = iw; for (i = 0; i < 10; i ++) { /* * Even round is straightforward application on * the state words. */ s0 = _mm_add_epi32(s0, s1); s3 = _mm_xor_si128(s3, s0); s3 = _mm_or_si128( _mm_slli_epi32(s3, 16), _mm_srli_epi32(s3, 16)); s2 = _mm_add_epi32(s2, s3); s1 = _mm_xor_si128(s1, s2); s1 = _mm_or_si128( _mm_slli_epi32(s1, 12), _mm_srli_epi32(s1, 20)); s0 = _mm_add_epi32(s0, s1); s3 = _mm_xor_si128(s3, s0); s3 = _mm_or_si128( _mm_slli_epi32(s3, 8), _mm_srli_epi32(s3, 24)); s2 = _mm_add_epi32(s2, s3); s1 = _mm_xor_si128(s1, s2); s1 = _mm_or_si128( _mm_slli_epi32(s1, 7), _mm_srli_epi32(s1, 25)); /* * For the odd round, we must rotate some state * words so that the computations apply on the * right combinations of words. */ s1 = _mm_shuffle_epi32(s1, 0x39); s2 = _mm_shuffle_epi32(s2, 0x4E); s3 = _mm_shuffle_epi32(s3, 0x93); s0 = _mm_add_epi32(s0, s1); s3 = _mm_xor_si128(s3, s0); s3 = _mm_or_si128( _mm_slli_epi32(s3, 16), _mm_srli_epi32(s3, 16)); s2 = _mm_add_epi32(s2, s3); s1 = _mm_xor_si128(s1, s2); s1 = _mm_or_si128( _mm_slli_epi32(s1, 12), _mm_srli_epi32(s1, 20)); s0 = _mm_add_epi32(s0, s1); s3 = _mm_xor_si128(s3, s0); s3 = _mm_or_si128( _mm_slli_epi32(s3, 8), _mm_srli_epi32(s3, 24)); s2 = _mm_add_epi32(s2, s3); s1 = _mm_xor_si128(s1, s2); s1 = _mm_or_si128( _mm_slli_epi32(s1, 7), _mm_srli_epi32(s1, 25)); /* * After the odd round, we rotate back the values * to undo the rotate at the start of the odd round. */ s1 = _mm_shuffle_epi32(s1, 0x93); s2 = _mm_shuffle_epi32(s2, 0x4E); s3 = _mm_shuffle_epi32(s3, 0x39); } /* * Addition with the initial state. */ s0 = _mm_add_epi32(s0, cw); s1 = _mm_add_epi32(s1, kw0); s2 = _mm_add_epi32(s2, kw1); s3 = _mm_add_epi32(s3, iw); /* * Increment block counter. */ iw = _mm_add_epi32(iw, one); /* * XOR final state with the data. */ if (len < 64) { unsigned char tmp[64]; size_t u; _mm_storeu_si128((void *)(tmp + 0), s0); _mm_storeu_si128((void *)(tmp + 16), s1); _mm_storeu_si128((void *)(tmp + 32), s2); _mm_storeu_si128((void *)(tmp + 48), s3); for (u = 0; u < len; u ++) { buf[u] ^= tmp[u]; } break; } else { __m128i b0, b1, b2, b3; b0 = _mm_loadu_si128((const void *)(buf + 0)); b1 = _mm_loadu_si128((const void *)(buf + 16)); b2 = _mm_loadu_si128((const void *)(buf + 32)); b3 = _mm_loadu_si128((const void *)(buf + 48)); b0 = _mm_xor_si128(b0, s0); b1 = _mm_xor_si128(b1, s1); b2 = _mm_xor_si128(b2, s2); b3 = _mm_xor_si128(b3, s3); _mm_storeu_si128((void *)(buf + 0), b0); _mm_storeu_si128((void *)(buf + 16), b1); _mm_storeu_si128((void *)(buf + 32), b2); _mm_storeu_si128((void *)(buf + 48), b3); buf += 64; len -= 64; } } /* * _mm_extract_epi32() requires SSE4.1. We prefer to stick to * raw SSE2, thus we use _mm_extract_epi16(). */ return (uint32_t)_mm_extract_epi16(iw, 0) | ((uint32_t)_mm_extract_epi16(iw, 1) << 16); } BR_TARGETS_X86_DOWN #else /* see bearssl_block.h */ br_chacha20_run br_chacha20_sse2_get(void) { return 0; } #endif