// memcpy_forward_vp4cp4n2.S revision 337136
//===----------------------Hexagon builtin routine ------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// An optimized version of a memcpy which is equivalent to the following loop:
//
//   volatile unsigned *dest;
//   unsigned *src;
//
//   for (i = 0; i < num_words; ++i)
//     *dest++ = *src++;
//
// The corresponding C prototype for this function would be
// void hexagon_memcpy_forward_vp4cp4n2(volatile unsigned *dest,
//                                      const unsigned *src,
//                                      unsigned num_words);
//
// *** Both dest and src must be aligned to 32-bit boundaries. ***
// The code does not perform any runtime checks for this, and will fail
// in bad ways if this requirement is not met.
//
// The "forward" in the name refers to the fact that the function copies
// the words going forward in memory.  It is incorrect to use this function
// for cases where the original code copied words in any other order.
//
// *** This function is only for the use by the compiler. ***
// The only intended use is for the LLVM compiler to generate calls to
// this function, when a mem-copy loop, like the one above, is detected.
//
// Structure of the routine (pages are 4096 bytes = 1024 words):
//   1. Prolog: copy word by word until src reaches a page boundary
//              (or until all words are copied, whichever comes first).
//   2. Main:   copy whole pages, 8 bytes per iteration, prefetching each
//              page before it is copied.
//   3. Epilog: copy the remaining (< 1024) words one at a time.

  .text

// Inputs:
//   r0: dest
//   r1: src
//   r2: num_words
//
// Registers written by this routine:
//   r0, r1      advanced past the copied data
//   r2          running count of words still to copy
//   r3, r4, r5  scratch
//   p0          scratch predicate
//   hardware-loop state for loop0 and loop1
// No stack is used; the routine returns through r31.

  .globl  hexagon_memcpy_forward_vp4cp4n2
  .balign 32
  .type   hexagon_memcpy_forward_vp4cp4n2,@function
hexagon_memcpy_forward_vp4cp4n2:

  // Compute r3 to be the number of words remaining in the current page.
  // At the same time, compute r4 to be the number of 32-byte blocks
  // remaining in the page (for prefetch).
  {
    r3 = sub(##4096, r1)
    r5 = lsr(r2, #3)            // r5 = num_words / 8 = 32-byte blocks in total.
  }
  {
    // The word count before end-of-page is in the 12 lowest bits of r3.
    // (If the address in r1 was already page-aligned, the bits are 0.)
    //
    // Both instructions are in one packet, so both read the value r3 had
    // on entry to the packet (the byte distance to the end of the page):
    //   bits 11:2 -> distance in 4-byte words   (new r3)
    //   bits 11:5 -> distance in 32-byte blocks (r4)
    r3 = extractu(r3, #10, #2)
    r4 = extractu(r3, #7, #5)
  }
  {
    // Never copy or prefetch more than the caller asked for.
    r3 = minu(r2, r3)
    r4 = minu(r5, r4)
  }
  {
    // Merge the block count into the low bits of the prefetch descriptor.
    // NOTE(review): 0x202000 is taken to encode stride = 32 and width = 32
    // bytes, with the low byte being the line count, per the l2fetch(Rs,Rt)
    // operand layout -- confirm against the Hexagon PRM.
    r4 = or(r4, ##2105344)      // 2105344 = 0x202000
    p0 = cmp.eq(r3, #0)
    if (p0.new) jump:nt .Lskipprolog
  }
    l2fetch(r1, r4)
  {
    loop0(.Lprolog, r3)
    r2 = sub(r2, r3)            // r2 = number of words left after the prolog.
  }
  .falign
.Lprolog:
  {
    // One word per iteration; the store consumes the value loaded in the
    // same packet via r4.new.
    r4 = memw(r1++#4)
    memw(r0++#4) = r4.new
  } :endloop0
.Lskipprolog:
  {
    // Let r3 = number of whole pages left (page = 1024 words).
    r3 = lsr(r2, #10)
    if (cmp.eq(r3.new, #0)) jump:nt .Lskipmain
  }
  {
    // loop1 reads the page count from r3 before r3 is overwritten with the
    // prefetch descriptor, because all three execute in the same packet.
    loop1(.Lout, r3)
    r2 = extractu(r2, #10, #0)  // r2 = r2 & 1023
    r3 = ##2105472              // r3 = 0x202080 (prefetch info)
  }
    // Iterate over pages.
    .falign
.Lout:
    // Prefetch each individual page.
    // (0x80 = 128 blocks of 32 bytes = one 4096-byte page.)
    l2fetch(r1, r3)
    // 512 iterations * 8 bytes per iteration = one 4096-byte page.
    loop0(.Lpage, #512)
    .falign
.Lpage:
      // src is page-aligned whenever this loop runs (the prolog ran up to a
      // page boundary), so a 64-bit load is safe here.
      r5:4 = memd(r1++#8)
  {
      // dest is only guaranteed to be word-aligned, so the data is written
      // with two 32-bit stores.  Both stores see the pre-increment r0:
      // r4 goes to [r0], r5 goes to [r0 + 4].
      memw(r0++#8) = r4
      memw(r0+#4) = r5
  } :endloop0:endloop1
.Lskipmain:
  {
    r3 = ##2105344              // r3 = 0x202000 (prefetch info)
    r4 = lsr(r2, #3)            // r4 = number of 32-byte blocks remaining.
    p0 = cmp.eq(r2, #0)
    if (p0.new) jumpr:nt r31    // Nothing left to copy: return.
  }
  {
    r3 = or(r3, r4)
    loop0(.Lepilog, r2)
  }
    l2fetch(r1, r3)
  .falign
.Lepilog:
  {
    // Tail: fewer than 1024 words remain, copied one word per iteration.
    r4 = memw(r1++#4)
    memw(r0++#4) = r4.new
  } :endloop0

    jumpr r31

.size hexagon_memcpy_forward_vp4cp4n2, . - hexagon_memcpy_forward_vp4cp4n2