//===----------------------Hexagon builtin routine ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// An optimized version of a memcpy which is equivalent to the following loop:
//
//   volatile unsigned *dest;
//   unsigned *src;
//
//   for (i = 0; i < num_words; ++i)
//     *dest++ = *src++;
//
// The corresponding C prototype for this function would be
// void hexagon_memcpy_forward_vp4cp4n2(volatile unsigned *dest,
//                                      const unsigned *src,
//                                      unsigned num_words);
//
// *** Both dest and src must be aligned to 32-bit boundaries. ***
// The code does not perform any runtime checks for this, and will fail
// in bad ways if this requirement is not met.
//
// The "forward" in the name refers to the fact that the function copies
// the words going forward in memory.  It is incorrect to use this function
// for cases where the original code copied words in any other order.
//
// *** This function is only for use by the compiler. ***
// The only intended use is for the LLVM compiler to generate calls to
// this function, when a mem-copy loop, like the one above, is detected.
// ---------------------------------------------------------------------------
// hexagon_memcpy_forward_vp4cp4n2
//
// Register interface on entry:
//   r0 = dest       (advanced by 4 for every word stored)
//   r1 = src        (advanced by 4 for every word loaded)
//   r2 = num_words  (count of 32-bit words still to be copied)
//
// Registers written besides r0-r2: r3, r4, r5, p0, and hardware loops 0/1.
// Control returns to the address held in r31.
//
// The copy is carried out in three phases:
//   1. head  - one word per iteration until src reaches the end of its
//              current 4096-byte page (or until the words run out);
//   2. pages - every remaining whole page (1024 words), moved with one
//              64-bit load and two 32-bit stores per iteration;
//   3. tail  - the final (fewer than 1024) words, one word per iteration.
// Each phase issues an l2fetch on src before it starts copying.
// ---------------------------------------------------------------------------

  .text

  .globl  hexagon_memcpy_forward_vp4cp4n2
  .balign 32
  .type   hexagon_memcpy_forward_vp4cp4n2,@function
hexagon_memcpy_forward_vp4cp4n2:

  // ---- Phase 1: head ------------------------------------------------------
  // r3 <- distance in bytes from src to the next page boundary (only the
  //       low 12 bits are meaningful and only those are used below).
  // r5 <- num_words / 8, i.e. the number of 32-byte blocks requested.
  {
    r3 = sub(##0x1000, r1)
    r5 = lsr(r2, #3)
  }
  // Both extracts operate on the r3 computed by the packet above:
  //   bits [11:2] -> words left before the page ends (0 if src is already
  //                  page-aligned),
  //   bits [11:5] -> 32-byte blocks left before the page ends.
  {
    r3 = extractu(r3, #10, #2)
    r4 = extractu(r3, #7, #5)
  }
  // Never go past what the caller asked for.
  {
    r3 = minu(r2, r3)
    r4 = minu(r5, r4)
  }
  // Merge the block count into the prefetch descriptor 0x202000 and bypass
  // the head loop entirely when it would copy zero words.
  {
    r4 = or(r4, ##0x202000)
    p0 = cmp.eq(r3, #0)
    if (p0.new) jump:nt .Lhead_done
  }
    l2fetch(r1, r4)
  {
    loop0(.Lhead, r3)
    r2 = sub(r2, r3)            // words that remain once the head is copied
  }
  .falign
.Lhead:
  {
    r4 = memw(r1++#4)
    memw(r0++#4) = r4.new
  } :endloop0
.Lhead_done:

  // ---- Phase 2: whole pages -----------------------------------------------
  // r3 <- count of complete 1024-word pages still to copy; none -> tail.
  {
    r3 = lsr(r2, #10)
    if (cmp.eq(r3.new, #0)) jump:nt .Lpages_done
  }
  {
    loop1(.Lpage_outer, r3)     // outer loop: one trip per page
    r2 = extractu(r2, #10, #0)  // keep only the sub-page remainder (r2 & 1023)
    r3 = ##0x202080             // prefetch descriptor used for a full page
  }
  .falign
.Lpage_outer:
    // Request the prefetch of the page about to be copied.
    l2fetch(r1, r3)
    loop0(.Lpage_inner, #512)   // 512 iterations x 2 words = 1024 words
  .falign
.Lpage_inner:
    r5:4 = memd(r1++#8)
  {
    memw(r0++#8) = r4           // word 0 goes to the current dest
    memw(r0+#4) = r5            // word 1 goes to the following word slot
  } :endloop0:endloop1
.Lpages_done:

  // ---- Phase 3: tail ------------------------------------------------------
  // Return straight away when no words are left; otherwise build the
  // prefetch descriptor (0x202000 | remaining 32-byte blocks) for the tail.
  {
    r3 = ##0x202000
    r4 = lsr(r2, #3)            // remaining 32-byte blocks
    p0 = cmp.eq(r2, #0)
    if (p0.new) jumpr:nt r31
  }
  {
    r3 = or(r3, r4)
    loop0(.Ltail, r2)
  }
    l2fetch(r1, r3)
  .falign
.Ltail:
  {
    r4 = memw(r1++#4)
    memw(r0++#4) = r4.new
  } :endloop0

    jumpr r31

.size hexagon_memcpy_forward_vp4cp4n2, . - hexagon_memcpy_forward_vp4cp4n2