/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>

#include <asm/cpufeature.h>
#include <asm/dwarf2.h>

/*
 * memcpy - Copy a memory block.
 *
 * ABI: kernel / SysV AMD64.  Does not assume any particular alignment
 * of source or destination and may copy in 8-byte quantities, so the
 * regions must not overlap (plain memcpy semantics, not memmove).
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count (bytes)
 *
 * Output:
 * rax original destination
 *
 * Clobbers: rcx, rdx, rsi, rdi, r8-r11, flags (all caller-saved).
 */

/*
 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
 *
 * This gets patched over the unrolled variant (below) via the
 * alternative instructions framework, on CPUs that advertise
 * X86_FEATURE_REP_GOOD (fast string operations):
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c:
	movq %rdi, %rax		/* return value: original destination */

	movl %edx, %ecx
	shrl $3, %ecx		/* ecx = number of whole qwords */
	andl $7, %edx		/* edx = remaining 0..7 tail bytes */
	rep movsq		/* bulk copy, 8 bytes per iteration */
	movl %edx, %ecx
	rep movsb		/* copy the tail byte-by-byte */
	ret
.Lmemcpy_e:
	.previous

ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC

	/*
	 * Put the number of full 64-byte blocks into %ecx.
	 * Tail portion is handled at the end:
	 *
	 * Register roles in the loop below:
	 *   rax       = return value (original destination)
	 *   ecx       = remaining 64-byte blocks
	 *   rsi/rdi   = current source/destination cursors
	 *   edx       = original count (low 32 bits), reread for the tails
	 *   r8-r11    = copy scratch
	 */
	movq %rdi, %rax
	movl %edx, %ecx
	shrl $6, %ecx		/* ecx = count / 64 */
	jz .Lhandle_tail	/* less than one full block */

	.p2align 4
.Lloop_64:
	/*
	 * We decrement the loop index here - and the zero-flag is
	 * checked at the end of the loop (instructions inbetween do
	 * not change the zero flag -- movq preserves flags and the
	 * cursor updates use flag-preserving leaq):
	 */
	decl %ecx

	/*
	 * Move in blocks of 4x16 bytes, interleaving two loads with
	 * two stores to give the CPU independent work:
	 */
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r8
	movq %r11, 0*8(%rdi)
	movq %r8, 1*8(%rdi)

	movq 2*8(%rsi), %r9
	movq 3*8(%rsi), %r10
	movq %r9, 2*8(%rdi)
	movq %r10, 3*8(%rdi)

	movq 4*8(%rsi), %r11
	movq 5*8(%rsi), %r8
	movq %r11, 4*8(%rdi)
	movq %r8, 5*8(%rdi)

	movq 6*8(%rsi), %r9
	movq 7*8(%rsi), %r10
	movq %r9, 6*8(%rdi)
	movq %r10, 7*8(%rdi)

	leaq 64(%rsi), %rsi	/* advance cursors; leaq keeps ZF intact */
	leaq 64(%rdi), %rdi

	jnz .Lloop_64		/* ZF still from the decl above */

.Lhandle_tail:
	/* Copy the remaining (count % 64) bytes, 8 at a time first: */
	movl %edx, %ecx
	andl $63, %ecx		/* ecx = count % 64 */
	shrl $3, %ecx		/* ecx = remaining whole qwords (0..7) */
	jz .Lhandle_7

	.p2align 4
.Lloop_8:
	decl %ecx
	movq (%rsi), %r8
	movq %r8, (%rdi)
	leaq 8(%rdi), %rdi	/* leaq preserves ZF from the decl */
	leaq 8(%rsi), %rsi
	jnz .Lloop_8

.Lhandle_7:
	/* Finally the last (count % 8) bytes, one at a time: */
	movl %edx, %ecx
	andl $7, %ecx		/* ecx = count % 8 */
	jz .Lend

	.p2align 4
.Lloop_1:
	movb (%rsi), %r8b
	movb %r8b, (%rdi)
	incq %rdi
	incq %rsi
	decl %ecx
	jnz .Lloop_1

.Lend:
	ret
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)

	/*
	 * Some CPUs run faster using the string copy instructions.
	 * It is also a lot simpler. Use this when possible:
	 *
	 * This record's field layout must match struct alt_instr
	 * (original insn address, replacement address, CPU feature
	 * flag, original length, replacement length) -- see
	 * asm/alternative.h.  NOTE(review): layout assumed from this
	 * kernel vintage; confirm against the matching headers.
	 */

	.section .altinstructions, "a"
	.align 8
	.quad memcpy			/* instruction to patch */
	.quad .Lmemcpy_c		/* replacement code */
	.word X86_FEATURE_REP_GOOD	/* patch only when feature is set */

	/*
	 * Replace only beginning, memcpy is used to apply alternatives,
	 * so it is silly to overwrite itself with nops - reboot is the
	 * only outcome...
	 *
	 * Hence both lengths are the (short) replacement length: only
	 * the first .Lmemcpy_e - .Lmemcpy_c bytes of memcpy are
	 * patched, and no nop padding is generated.
	 */
	.byte .Lmemcpy_e - .Lmemcpy_c	/* length of original to patch */
	.byte .Lmemcpy_e - .Lmemcpy_c	/* length of replacement */
	.previous