1/* Copyright 2002 Andi Kleen */
2
3#include <linux/linkage.h>
4#include <asm/dwarf2.h>
5#include <asm/cpufeature.h>
6
/*
 * memcpy - Copy a memory block.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count (in bytes)
 *
 * Output:
 * rax original destination
 *
 * The same register contract applies to both implementations below
 * (memcpy_c and memcpy/__memcpy).
 */
18
19	ALIGN
20memcpy_c:
21	CFI_STARTPROC
22	movq %rdi,%rax
23	movl %edx,%ecx
24	shrl $3,%ecx
25	andl $7,%edx
26	rep movsq
27	movl %edx,%ecx
28	rep movsb
29	ret
30	CFI_ENDPROC
31ENDPROC(memcpy_c)
32
/*
 * memcpy / __memcpy - copy a memory block with plain loads and stores.
 *
 * Input:
 *	rdi	destination
 *	rsi	source
 *	rdx	count (in bytes)
 *
 * Output:
 *	rax	original destination
 *
 * Clobbers: rcx, rsi, rdi, r8-r11, flags. No stack is used: only
 * caller-saved registers are touched, so nothing has to be saved.
 *
 * The copy runs in three stages: 64-byte blocks, then remaining qwords,
 * then remaining single bytes.
 */
ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC
	movq %rdi,%rax		/* return value: original destination */

	/*
	 * Number of 64-byte blocks, computed from the full 64-bit count so
	 * that lengths of 4GB and above are not truncated.
	 */
	movq %rdx,%rcx
	shrq $6,%rcx
	jz .Lhandle_tail

	.p2align 4
.Lloop_64:
	/*
	 * The decrement sets ZF for the jnz at the bottom of the loop;
	 * every instruction in between is a mov or lea, which leave the
	 * flags untouched.
	 */
	decq %rcx

	movq (%rsi),%r11
	movq 8(%rsi),%r8

	movq %r11,(%rdi)
	movq %r8,1*8(%rdi)

	movq 2*8(%rsi),%r9
	movq 3*8(%rsi),%r10

	movq %r9,2*8(%rdi)
	movq %r10,3*8(%rdi)

	movq 4*8(%rsi),%r11
	movq 5*8(%rsi),%r8

	movq %r11,4*8(%rdi)
	movq %r8,5*8(%rdi)

	movq 6*8(%rsi),%r9
	movq 7*8(%rsi),%r10

	movq %r9,6*8(%rdi)
	movq %r10,7*8(%rdi)

	leaq 64(%rsi),%rsi	/* lea, not add: must not disturb ZF */
	leaq 64(%rdi),%rdi
	jnz  .Lloop_64

.Lhandle_tail:
	/* Only bits 3..5 of the count matter here, so 32-bit ops suffice. */
	movl %edx,%ecx
	andl $63,%ecx
	shrl $3,%ecx		/* ecx = remaining whole qwords, 0..7 */
	jz   .Lhandle_7
	.p2align 4
.Lloop_8:
	decl %ecx		/* ZF consumed by the jnz below (mov/lea keep flags) */
	movq (%rsi),%r8
	movq %r8,(%rdi)
	leaq 8(%rdi),%rdi
	leaq 8(%rsi),%rsi
	jnz  .Lloop_8

.Lhandle_7:
	movl %edx,%ecx
	andl $7,%ecx		/* ecx = remaining bytes, 0..7 */
	jz .Lende
	.p2align 4
.Lloop_1:
	movb (%rsi),%r8b
	movb %r8b,(%rdi)
	incq %rdi
	incq %rsi
	decl %ecx
	jnz .Lloop_1

.Lende:
	ret
.Lfinal:			/* end marker used by the alternatives record */
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)
113
114	/* Some CPUs run faster using the string copy instructions.
115	   It is also a lot simpler. Use this when possible */
116
117	.section .altinstr_replacement,"ax"
1181:	.byte 0xeb				/* jmp <disp8> */
119	.byte (memcpy_c - memcpy) - (2f - 1b)	/* offset */
1202:
121	.previous
122	.section .altinstructions,"a"
123	.align 8
124	.quad memcpy
125	.quad 1b
126	.byte X86_FEATURE_REP_GOOD
127	.byte .Lfinal - memcpy
128	.byte 2b - 1b
129	.previous
130