/*
 * Copyright 2014, Paweł Dziepak, pdziepak@quarnos.org.
 * Distributed under the terms of the MIT License.
 */


#include <array>

#include <cstddef>
#include <cstdint>

#include <emmintrin.h>


namespace {


// 64-bit integer type used for the word-sized accesses in memcpy_small() and
// memset_small(). The buffers handed to memcpy()/memset() may start at any
// address and may hold objects of any type, so the type is stripped of its
// alignment requirement and is allowed to alias everything. Pointers to it
// are declared explicitly (not with auto) so that the attributes are kept.
typedef uint64_t unaligned_uint64
	__attribute__((__may_alias__, __aligned__(1)));


// __m128i resolves to a type with an attribute, which can't get into the
// template signature, resulting in a warning. Nonetheless the code is what we
// expect, so we silence the warning.
#pragma GCC diagnostic push
#if defined __GNUC__ && __GNUC__ >= 6
#pragma GCC diagnostic ignored "-Wignored-attributes"
#endif


// Compile-time table: a std::array whose element i is Generator<i>::sValue
// for every i in [0, N). Each level of inheritance peels one index off N and
// prepends it to the Index pack.
template<template<size_t N> class Generator, unsigned N, unsigned ...Index>
struct GenerateTable : GenerateTable<Generator, N - 1, N - 1, Index...> {
};

// End of the recursion: Index now holds 0, 1, ..., N - 1 and the array is
// initialized with one entry per index.
template<template<size_t N> class Generator, unsigned ...Index>
struct GenerateTable<Generator, 0, Index...>
	: std::array<decltype(Generator<0>::sValue), sizeof...(Index)> {
	constexpr GenerateTable()
		:
		std::array<decltype(Generator<0>::sValue), sizeof...(Index)> {
			{ Generator<Index>::sValue... }
		}
	{
	}
};


#pragma GCC diagnostic pop


// Copies length bytes from source to destination with one "rep movsb".
static inline void
memcpy_repmovs(uint8_t* destination, const uint8_t* source, size_t length)
{
	// All three operands are read-write ("+"): the instruction advances both
	// pointers and counts the length register down.
	__asm__ __volatile__("rep movsb"
		: "+D" (destination), "+S" (source), "+c" (length)
		:
		: "memory");
}


// Copies exactly N bytes by assigning an N-byte aggregate, which leaves the
// choice of the load/store sequence to the compiler.
template<size_t N>
inline void
copy_small(uint8_t* destination, const uint8_t* source)
{
	struct data {
		uint8_t x[N];
	};
	*reinterpret_cast<data*>(destination)
		= *reinterpret_cast<const data*>(source);
}


// Copying zero bytes is a no-op. The explicit specialization keeps the
// primary template from being instantiated with a zero-length array member,
// which ISO C++ does not allow (it is accepted only as a GNU extension).
template<>
inline void
copy_small<0>(uint8_t*, const uint8_t*)
{
}


// Table entry N is the function copying exactly N bytes.
template<size_t N>
struct SmallGenerator {
	constexpr static void (*sValue)(uint8_t*, const uint8_t*) = copy_small<N>;
};
constexpr static GenerateTable<SmallGenerator, 8> table_small;


// Copies length bytes; length must not exceed 16.
static inline void
memcpy_small(uint8_t* destination, const uint8_t* source, size_t length)
{
	if (length < 8) {
		// One fixed-size copy routine per length in [0, 8).
		table_small[length](destination, source);
		return;
	}

	// 8 <= length <= 16: copy the first eight bytes and the last eight bytes.
	// The two words overlap whenever length < 16, which is harmless.
	const unaligned_uint64* from
		= reinterpret_cast<const unaligned_uint64*>(source);
	unaligned_uint64* to = reinterpret_cast<unaligned_uint64*>(destination);
	*to = *from;

	from = reinterpret_cast<const unaligned_uint64*>(source + length - 8);
	to = reinterpret_cast<unaligned_uint64*>(destination + length - 8);
	*to = *from;
}


// Copies N consecutive 16-byte blocks using unaligned SSE2 loads and stores.
template<size_t N>
inline void
copy_sse(__m128i* destination, const __m128i* source)
{
	auto temp = _mm_loadu_si128(source);
	_mm_storeu_si128(destination, temp);
	copy_sse<N - 1>(destination + 1, source + 1);
}


// Terminates the recursion of copy_sse<N>().
template<>
inline void
copy_sse<0>(__m128i*, const __m128i*)
{
}


// Table entry N is the function copying exactly N 16-byte blocks.
template<size_t N>
struct SSEGenerator {
	constexpr static void (*sValue)(__m128i*, const __m128i*) = copy_sse<N>;
};
constexpr static GenerateTable<SSEGenerator, 4> table_sse;


// Copies length bytes in 16-byte blocks; length must be at least 16, since
// the tail is handled by a block that ends at the last byte.
static inline void
memcpy_sse(uint8_t* destination, const uint8_t* source, size_t length)
{
	auto to = reinterpret_cast<__m128i*>(destination);
	auto from = reinterpret_cast<const __m128i*>(source);

	// Position of the last 16 bytes, computed before length is consumed.
	auto toEnd = reinterpret_cast<__m128i*>(destination + length - 16);
	auto fromEnd = reinterpret_cast<const __m128i*>(source + length - 16);

	// Main loop: four blocks (64 bytes) per iteration.
	while (length >= 64) {
		copy_sse<4>(to, from);
		to += 4;
		from += 4;
		length -= 64;
	}

	// One to three whole blocks are left; dispatch to the matching routine.
	if (length >= 16) {
		table_sse[length / 16](to, from);
		length %= 16;
	}

	// Fewer than 16 bytes remain: copy the final block of the buffer, which
	// overlaps bytes that have already been copied.
	if (length) {
		copy_sse<1>(toEnd, fromEnd);
	}
}


}


// Copies length bytes from source to destination and returns destination.
// Up to 16 bytes are handled by memcpy_small(), fewer than 2048 bytes by
// memcpy_sse() and everything else by "rep movsb".
extern "C" void* memcpy(void* destination, const void* source, size_t length)
{
	auto to = static_cast<uint8_t*>(destination);
	auto from = static_cast<const uint8_t*>(source);

	if (length <= 16) {
		memcpy_small(to, from, length);
		return destination;
	}

	if (length < 2048) {
		memcpy_sse(to, from, length);
		return destination;
	}

	memcpy_repmovs(to, from, length);
	return destination;
}


// Fills length bytes at destination with value using one "rep stosb".
static inline void
memset_repstos(uint8_t* destination, uint8_t value, size_t length)
{
	// Pointer and counter are read-write ("+") as the instruction modifies
	// them; the fill byte is an input only.
	__asm__ __volatile__("rep stosb"
		: "+D" (destination), "+c" (length)
		: "a" (value)
		: "memory");
}


// Fills length bytes at destination with value using 16-byte SSE2 stores;
// length must be at least 16, since the first and the last store each cover
// a full block.
static inline void
memset_sse(uint8_t* destination, uint8_t value, size_t length)
{
	__m128i packed = _mm_set1_epi8(value);

	// Position of the last 16 bytes, computed before length is consumed.
	auto end = reinterpret_cast<__m128i*>(destination + length - 16);

	// If the buffer does not start on a 16-byte boundary, fill its first 16
	// bytes with an unaligned store and continue at the next boundary.
	auto diff = reinterpret_cast<uintptr_t>(destination) % 16;
	if (diff) {
		diff = 16 - diff;
		length -= diff;
		_mm_storeu_si128(reinterpret_cast<__m128i*>(destination), packed);
	}

	// ptr is 16-byte aligned from here on, so aligned stores can be used.
	auto ptr = reinterpret_cast<__m128i*>(destination + diff);
	while (length >= 64) {
		_mm_store_si128(ptr++, packed);
		_mm_store_si128(ptr++, packed);
		_mm_store_si128(ptr++, packed);
		_mm_store_si128(ptr++, packed);
		length -= 64;
	}
	while (length >= 16) {
		_mm_store_si128(ptr++, packed);
		length -= 16;
	}

	// Whatever is left is covered by filling the last block of the buffer,
	// overlapping bytes that have already been written.
	_mm_storeu_si128(end, packed);
}


// Fills length bytes at destination with value, eight bytes at a time when
// possible and byte by byte otherwise.
static inline void
memset_small(uint8_t* destination, uint8_t value, size_t length)
{
	if (length >= 8) {
		// Replicate the byte into all eight bytes of a 64-bit word.
		const uint64_t packed
			= static_cast<uint64_t>(value) * 0x0101010101010101ull;

		unaligned_uint64* ptr
			= reinterpret_cast<unaligned_uint64*>(destination);
		unaligned_uint64* end
			= reinterpret_cast<unaligned_uint64*>(destination + length - 8);
		while (length >= 8) {
			*ptr++ = packed;
			length -= 8;
		}

		// Covers the up to seven trailing bytes by rewriting the last word.
		*end = packed;
	} else {
		while (length--) {
			*destination++ = value;
		}
	}
}


// Fills length bytes at ptr with chr converted to unsigned char and returns
// ptr. Fewer than 32 bytes are handled by memset_small(), fewer than 2048
// bytes by memset_sse() and everything else by "rep stosb".
extern "C" void*
memset(void* ptr, int chr, size_t length)
{
	auto value = static_cast<unsigned char>(chr);
	auto destination = static_cast<uint8_t*>(ptr);

	if (length < 32) {
		memset_small(destination, value, length);
		return ptr;
	}

	if (length < 2048) {
		memset_sse(destination, value, length);
		return ptr;
	}

	memset_repstos(destination, value, length);
	return ptr;
}