#include <string.h>
#include <stdint.h>
#include <endian.h>
4
/*
 * memcpy - copy n bytes from src to dest.
 *
 * dest: destination buffer, at least n bytes.
 * src:  source buffer, at least n bytes.
 * n:    number of bytes to copy.
 *
 * Returns dest. Both pointers are restrict-qualified, so the two regions
 * must not overlap.
 *
 * When built with a GNU-compatible compiler the copy is done with 32-bit
 * word accesses through a may_alias type:
 *   1. Bytes are copied one at a time until src is 4-byte aligned.
 *   2. If dest is then 4-byte aligned as well, whole words are copied,
 *      16 bytes per iteration, and the tail is finished by testing the
 *      low bits of n.
 *   3. Otherwise, for n >= 32, 1-3 leading bytes are copied so that dest
 *      becomes aligned; each destination word is then assembled from two
 *      consecutive aligned source words with shifts.  The remaining bytes
 *      (and all copies shorter than 32) are done byte by byte.
 * Other compilers get a plain byte loop.
 */
void *memcpy(void *restrict dest, const void *restrict src, size_t n)
{
	unsigned char *d = dest;
	const unsigned char *s = src;

#ifdef __GNUC__

/*
 * LS moves a word's bytes toward lower addresses, RS toward higher
 * addresses; which C shift operator that is depends on byte order.
 */
#if __BYTE_ORDER == __LITTLE_ENDIAN
#define LS >>
#define RS <<
#else
#define LS <<
#define RS >>
#endif

	/* may_alias permits word access to arbitrary objects despite the
	 * strict-aliasing rules. */
	typedef uint32_t __attribute__((__may_alias__)) u32;
	uint32_t w, x;

	/* Bring the source pointer up to a 4-byte boundary. */
	for (; (uintptr_t)s % 4 && n; n--) *d++ = *s++;

	if ((uintptr_t)d % 4 == 0) {
		/* Source and destination are both word aligned. */
		for (; n>=16; s+=16, d+=16, n-=16) {
			*(u32 *)(d+0) = *(const u32 *)(s+0);
			*(u32 *)(d+4) = *(const u32 *)(s+4);
			*(u32 *)(d+8) = *(const u32 *)(s+8);
			*(u32 *)(d+12) = *(const u32 *)(s+12);
		}
		/* Fewer than 16 bytes remain; each set bit of n is one chunk. */
		if (n&8) {
			*(u32 *)(d+0) = *(const u32 *)(s+0);
			*(u32 *)(d+4) = *(const u32 *)(s+4);
			d += 8; s += 8;
		}
		if (n&4) {
			*(u32 *)(d+0) = *(const u32 *)(s+0);
			d += 4; s += 4;
		}
		if (n&2) {
			*d++ = *s++; *d++ = *s++;
		}
		if (n&1) {
			*d = *s;
		}
		return dest;
	}

	/*
	 * The destination is misaligned relative to the (aligned) source.
	 * In each case w is loaded with the first source word, then enough
	 * leading bytes are copied to align d.  Inside the loops s is no
	 * longer aligned, but s+1 / s+2 / s+3 is, so every word load stays
	 * aligned.  The loop bounds (17/18/19) guarantee that the last word
	 * load of an iteration lies within the remaining n bytes.
	 */
	if (n >= 32) switch ((uintptr_t)d % 4) {
	case 1:
		w = *(const u32 *)s;
		*d++ = *s++;
		*d++ = *s++;
		*d++ = *s++;
		n -= 3;
		for (; n>=17; s+=16, d+=16, n-=16) {
			x = *(const u32 *)(s+1);
			*(u32 *)(d+0) = (w LS 24) | (x RS 8);
			w = *(const u32 *)(s+5);
			*(u32 *)(d+4) = (x LS 24) | (w RS 8);
			x = *(const u32 *)(s+9);
			*(u32 *)(d+8) = (w LS 24) | (x RS 8);
			w = *(const u32 *)(s+13);
			*(u32 *)(d+12) = (x LS 24) | (w RS 8);
		}
		break;
	case 2:
		w = *(const u32 *)s;
		*d++ = *s++;
		*d++ = *s++;
		n -= 2;
		for (; n>=18; s+=16, d+=16, n-=16) {
			x = *(const u32 *)(s+2);
			*(u32 *)(d+0) = (w LS 16) | (x RS 16);
			w = *(const u32 *)(s+6);
			*(u32 *)(d+4) = (x LS 16) | (w RS 16);
			x = *(const u32 *)(s+10);
			*(u32 *)(d+8) = (w LS 16) | (x RS 16);
			w = *(const u32 *)(s+14);
			*(u32 *)(d+12) = (x LS 16) | (w RS 16);
		}
		break;
	case 3:
		w = *(const u32 *)s;
		*d++ = *s++;
		n -= 1;
		for (; n>=19; s+=16, d+=16, n-=16) {
			x = *(const u32 *)(s+3);
			*(u32 *)(d+0) = (w LS 8) | (x RS 24);
			w = *(const u32 *)(s+7);
			*(u32 *)(d+4) = (x LS 8) | (w RS 24);
			x = *(const u32 *)(s+11);
			*(u32 *)(d+8) = (w LS 8) | (x RS 24);
			w = *(const u32 *)(s+15);
			*(u32 *)(d+12) = (x LS 8) | (w RS 24);
		}
		break;
	}
	/* At most 31 bytes remain; finish byte-wise, one chunk per set bit of n. */
	if (n&16) {
		*d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
		*d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
		*d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
		*d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
	}
	if (n&8) {
		*d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
		*d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
	}
	if (n&4) {
		*d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
	}
	if (n&2) {
		*d++ = *s++; *d++ = *s++;
	}
	if (n&1) {
		*d = *s;
	}
	return dest;

/* Keep the operator macros from leaking into the rest of the file. */
#undef LS
#undef RS

#else

	/* Portable fallback: plain byte-by-byte copy. */
	for (; n; n--) *d++ = *s++;
	return dest;

#endif
}
125