1/*
 * Copyright 2014, Paweł Dziepak, pdziepak@quarnos.org.
3 * Distributed under the terms of the MIT License.
4 */
5
6
7#include <array>
8
9#include <cstddef>
10#include <cstdint>
11
12#include <emmintrin.h>
13
14
15namespace {
16
17
// __m128i resolves to a type with an attribute, which can't get into the
// template signature, resulting in a warning. Nonetheless the code is what we
// expect, so we silence the warning.
21#pragma GCC diagnostic push
22#if defined __GNUC__ && __GNUC__ >= 6
23#pragma GCC diagnostic ignored "-Wignored-attributes"
24#endif
25
26
// Compile-time table builder. GenerateTable<Generator, Count> is a std::array
// whose element i is Generator<i>::sValue, for i = 0 ... Count - 1.
//
// The primary template peels off one index per inheritance step: it puts
// Count - 1 at the front of the index pack and derives from the instantiation
// for Count - 1.
template<template<size_t> class Generator, unsigned Count, unsigned ...Indices>
struct GenerateTable
	: GenerateTable<Generator, Count - 1, Count - 1, Indices...> {
};

// End of the recursion: the counter has reached 0 and the pack now holds every
// index in ascending order, so the std::array base can be brace-initialized
// directly from the pack expansion.
template<template<size_t> class Generator, unsigned ...Indices>
struct GenerateTable<Generator, 0, Indices...>
	: std::array<decltype(Generator<0>::sValue), sizeof...(Indices)> {
	constexpr GenerateTable()
		: std::array<decltype(Generator<0>::sValue), sizeof...(Indices)> {
			{ Generator<Indices>::sValue... }
		}
	{
	}
};
42
43
44#pragma GCC diagnostic pop
45
46
// Copies `length` bytes from `source` to `destination` using a single
// "rep movsb" string instruction.
static inline void
memcpy_repmovs(uint8_t* destination, const uint8_t* source, size_t length)
{
	// All three operands are read-write ("+"): the constraints place them in
	// the DI ("D"), SI ("S") and CX ("c") registers that the instruction
	// consumes and updates. The "memory" clobber tells the compiler that
	// memory is accessed behind its back.
	__asm__ __volatile__(
		"rep movsb"
		: [dst] "+D" (destination),
		  [src] "+S" (source),
		  [count] "+c" (length)
		: /* no input-only operands */
		: "memory");
}
55
56
// Copies exactly N bytes from `source` to `destination`. N is a compile-time
// constant, so each instantiation is a fixed-size copy.
template<size_t N>
inline void copy_small(uint8_t* destination, const uint8_t* source)
{
	// Wrapping the bytes in a struct lets a single assignment move all N of
	// them at once.
	struct Chunk {
		uint8_t x[N];
	};

	const Chunk& in = *reinterpret_cast<const Chunk*>(source);
	Chunk& out = *reinterpret_cast<Chunk*>(destination);
	out = in;
}
66
67
68template<size_t N>
69struct SmallGenerator {
70	constexpr static void (*sValue)(uint8_t*, const uint8_t*) = copy_small<N>;
71};
72constexpr static GenerateTable<SmallGenerator, 8> table_small;
73
74
75static inline void memcpy_small(uint8_t* destination, const uint8_t* source,
76	size_t length)
77{
78	if (length < 8) {
79		table_small[length](destination, source);
80	} else {
81		auto to = reinterpret_cast<uint64_t*>(destination);
82		auto from = reinterpret_cast<const uint64_t*>(source);
83		*to = *from;
84		to = reinterpret_cast<uint64_t*>(destination + length - 8);
85		from = reinterpret_cast<const uint64_t*>(source + length - 8);
86		*to = *from;
87	}
88}
89
90
// Copies N consecutive 16-byte vectors from `source` to `destination` using
// unaligned SSE2 loads and stores. The template recursion handles one vector
// per level and then continues with the following vector.
template<size_t N>
inline void copy_sse(__m128i* destination, const __m128i* source)
{
	_mm_storeu_si128(destination, _mm_loadu_si128(source));
	copy_sse<N - 1>(destination + 1, source + 1);
}


// Recursion terminator: zero vectors left, nothing to copy.
template<>
inline void copy_sse<0>(__m128i*, const __m128i*)
{
}
104
105
106template<size_t N>
107struct SSEGenerator {
108	constexpr static void (*sValue)(__m128i*, const __m128i*) = copy_sse<N>;
109};
110constexpr static GenerateTable<SSEGenerator, 4> table_sse;
111
112
113static inline void memcpy_sse(uint8_t* destination, const uint8_t* source, size_t length)
114{
115	auto to = reinterpret_cast<__m128i*>(destination);
116	auto from = reinterpret_cast<const __m128i*>(source);
117	auto toEnd = reinterpret_cast<__m128i*>(destination + length - 16);
118	auto fromEnd = reinterpret_cast<const __m128i*>(source + length - 16);
119	while (length >= 64) {
120		copy_sse<4>(to, from);
121		to += 4;
122		from += 4;
123		length -= 64;
124	}
125	if (length >= 16) {
126		table_sse[length / 16](to, from);
127		length %= 16;
128	}
129	if (length) {
130		copy_sse<1>(toEnd, fromEnd);
131	}
132}
133
134
135}
136
137
138extern "C" void* memcpy(void* destination, const void* source, size_t length)
139{
140	auto to = static_cast<uint8_t*>(destination);
141	auto from = static_cast<const uint8_t*>(source);
142	if (length <= 16) {
143		memcpy_small(to, from, length);
144		return destination;
145	}
146	if (length < 2048) {
147		memcpy_sse(to, from, length);
148		return destination;
149	}
150	memcpy_repmovs(to, from, length);
151	return destination;
152}
153
154
// Fills `length` bytes at `destination` with `value` using a single
// "rep stosb" string instruction.
static inline void
memset_repstos(uint8_t* destination, uint8_t value, size_t length)
{
	// destination ("D") and length ("c") are read-write operands updated by
	// the instruction; the fill byte is an input in the A register ("a").
	// The "memory" clobber tells the compiler that memory is written behind
	// its back.
	__asm__ __volatile__(
		"rep stosb"
		: [dst] "+D" (destination),
		  [count] "+c" (length)
		: [fill] "a" (value)
		: "memory");
}
163
164
// Fills `length` bytes at `destination` with `value` using SSE2 stores. The
// buffer must be at least 16 bytes long: the head and the tail are each
// written with one unaligned 16-byte store, and everything in between with
// aligned stores.
static inline void
memset_sse(uint8_t* destination, uint8_t value, size_t length)
{
	const __m128i pattern = _mm_set1_epi8(value);

	// Address of the final 16 bytes, taken before `length` is consumed.
	const auto tail = reinterpret_cast<__m128i*>(destination + length - 16);

	// Misaligned start: write one unaligned vector, then continue from the
	// next 16-byte boundary.
	uintptr_t skip = reinterpret_cast<uintptr_t>(destination) % 16;
	if (skip != 0) {
		_mm_storeu_si128(reinterpret_cast<__m128i*>(destination), pattern);
		skip = 16 - skip;
		length -= skip;
	}

	auto cursor = reinterpret_cast<__m128i*>(destination + skip);

	// Bulk part: four aligned vectors (64 bytes) per iteration.
	for (; length >= 64; length -= 64, cursor += 4) {
		_mm_store_si128(cursor, pattern);
		_mm_store_si128(cursor + 1, pattern);
		_mm_store_si128(cursor + 2, pattern);
		_mm_store_si128(cursor + 3, pattern);
	}

	// Remaining whole vectors, one at a time.
	for (; length >= 16; length -= 16)
		_mm_store_si128(cursor++, pattern);

	// Whatever is left is covered by one unaligned vector that ends exactly
	// at the end of the buffer.
	_mm_storeu_si128(tail, pattern);
}
190
191
// Fills `length` bytes at `destination` with `value`; intended for short
// buffers.
//
// Fewer than 8 bytes are written one at a time. Otherwise the byte is
// replicated into a 64-bit word which is stored repeatedly from the start,
// and one more word ending exactly at the end of the buffer covers whatever
// the loop left over (overlapping bytes that were already written).
static inline void
memset_small(uint8_t* destination, uint8_t value, size_t length)
{
	// A 64-bit word that may live at any address and may alias any object.
	// Storing through a plain uint64_t* derived from an arbitrary byte pointer
	// is undefined behaviour (misaligned access, strict aliasing violation)
	// and would allow the optimizer to assume 8-byte alignment in the loop
	// below; these attributes make the same stores well defined.
	struct __attribute__((__packed__, __may_alias__)) UnalignedWord {
		uint64_t bits;
	};

	if (length < 8) {
		while (length--)
			*destination++ = value;
		return;
	}

	// Multiplying by 0x0101...01 copies the byte into all eight lanes.
	const uint64_t packed
		= static_cast<uint64_t>(value) * 0x0101010101010101ull;

	auto word = reinterpret_cast<UnalignedWord*>(destination);
	const auto lastWord
		= reinterpret_cast<UnalignedWord*>(destination + length - 8);

	for (; length >= 8; length -= 8)
		(word++)->bits = packed;

	lastWord->bits = packed;
}
210
211
212extern "C" void*
213memset(void* ptr, int chr, size_t length)
214{
215	auto value = static_cast<unsigned char>(chr);
216	auto destination = static_cast<uint8_t*>(ptr);
217	if (length < 32) {
218		memset_small(destination, value, length);
219		return ptr;
220	}
221	if (length < 2048) {
222		memset_sse(destination, value, length);
223		return ptr;
224	}
225	memset_repstos(destination, value, length);
226	return ptr;
227}
228
229