//===----------------------Hexagon builtin routine ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// An optimized version of a memcpy which is equivalent to the following loop:
//
//   volatile unsigned *dest;
//   unsigned *src;
//
//   for (i = 0; i < num_words; ++i)
//     *dest++ = *src++;
//
// The corresponding C prototype for this function would be
// void hexagon_memcpy_forward_vp4cp4n2(volatile unsigned *dest,
//                                      const unsigned *src,
//                                      unsigned num_words);
//
// *** Both dest and src must be aligned to 32-bit boundaries. ***
// The code does not perform any runtime checks for this, and will fail
// in bad ways if this requirement is not met.
//
// The "forward" in the name refers to the fact that the function copies
// the words going forward in memory.  It is incorrect to use this function
// for cases where the original code copied words in any other order.
//
// *** This function is only for use by the compiler. ***
// The only intended use is for the LLVM compiler to generate calls to
// this function, when a mem-copy loop, like the one above, is detected.

//------------------------------------------------------------------------
// void hexagon_memcpy_forward_vp4cp4n2(volatile unsigned *dest,
//                                      const unsigned *src,
//                                      unsigned num_words);
//
// Copies num_words 32-bit words from src to dest in ascending address
// order.  The copy is split into three phases:
//   1. prolog: one word per iteration until src reaches a 4096-byte page
//              boundary (or until every word has been copied);
//   2. main:   one whole page (1024 words) per outer-loop iteration; the
//              inner loop loads a doubleword and stores it as two words;
//   3. epilog: one word per iteration for the remaining (< 1024) words.
// Each phase issues an l2fetch on src before it starts copying.
//
// Inputs:
//   r0: dest       (post-incremented as words are stored)
//   r1: src        (post-incremented as words are loaded)
//   r2: num_words  (reduced to the number of words still to be copied)
// Scratch: r3, r4, r5, p0, hardware loops 0 and 1.
// The stack is not used; the function returns through r31.
//
// NOTE(review): the l2fetch descriptors 0x202000|n and 0x202080 are read
// here as stride = 0x20 (bits 31:16), width = 0x20 (bits 15:8) and
// height = bits 7:0, i.e. "height" consecutive 32-byte lines -- confirm
// against the l2fetch description in the Hexagon reference manual.
//------------------------------------------------------------------------

  .text

  .globl  hexagon_memcpy_forward_vp4cp4n2
  .balign 32
  .type   hexagon_memcpy_forward_vp4cp4n2,@function
hexagon_memcpy_forward_vp4cp4n2:

    // r3 = distance in bytes from src to the next 4096-byte boundary
    //      (only the low 12 bits are meaningful, see the next packet).
    // r5 = num_words / 8 = number of 32-byte blocks in the whole copy.
  {
    r3 = sub(##4096, r1)
    r5 = lsr(r2, #3)
  }
  {
    // Both extracts read the r3 produced by the previous packet; the new
    // r3 written here is not visible to the other instruction of the
    // same packet.
    // r3 = bits [11:2] = words left before the end of the page
    //      (0 if r1 was already page-aligned).
    // r4 = bits [11:5] = 32-byte blocks left before the end of the page.
    r3 = extractu(r3, #10, #2)
    r4 = extractu(r3, #7, #5)
  }
  {
    // Never copy or prefetch more than the caller asked for.
    r3 = minu(r2, r3)           // r3 = number of words copied by the prolog.
    r4 = minu(r5, r4)           // r4 = number of 32-byte blocks to prefetch.
  }
  {
    r4 = or(r4, ##2105344)      // 2105344 = 0x202000 (prefetch info | blocks)
    p0 = cmp.eq(r3, #0)         // Nothing to do before the page boundary?
    if (p0.new) jump:nt .Lskipprolog
  }
    l2fetch(r1, r4)
  {
    loop0(.Lprolog, r3)
    r2 = sub(r2, r3)            // r2 = number of words left after the prolog.
  }
  .falign
.Lprolog:
  {
    // Copy one word; the store consumes the value loaded in this packet.
    r4 = memw(r1++#4)
    memw(r0++#4) = r4.new
  } :endloop0
.Lskipprolog:
    // From here on either r2 == 0 or r1 is page-aligned: the prolog stops
    // early only when it has consumed every requested word.
  {
    // Let r3 = number of whole pages left (page = 1024 words).
    r3 = lsr(r2, #10)
    if (cmp.eq(r3.new, #0)) jump:nt .Lskipmain
  }
  {
    // loop1 takes the page count from the old r3, while r3 itself is
    // reloaded with the per-page prefetch descriptor in the same packet.
    loop1(.Lout, r3)
    r2 = extractu(r2, #10, #0)  // r2 = r2 & 1023 (words left for the epilog)
    r3 = ##2105472              // r3 = 0x202080 (prefetch info, 128 blocks)
  }
    // Iterate over pages.
  .falign
.Lout:
    // Prefetch each individual page.
    l2fetch(r1, r3)
    loop0(.Lpage, #512)         // 512 doublewords = 4096 bytes = one page.
  .falign
.Lpage:
    // r1 is page-aligned when the main loop is entered and advances by 8,
    // so the doubleword load is naturally aligned.
    r5:4 = memd(r1++#8)
  {
    // The doubleword is written back as two separate word stores.  The
    // second store uses the value of r0 from before the post-increment.
    memw(r0++#8) = r4
    memw(r0+#4) = r5
  } :endloop0:endloop1
.Lskipmain:
  {
    r3 = ##2105344              // r3 = 0x202000 (prefetch info)
    r4 = lsr(r2, #3)            // r4 = number of 32-byte blocks remaining.
    p0 = cmp.eq(r2, #0)         // No words left: return straight away.
    if (p0.new) jumpr:nt r31
  }
  {
    r3 = or(r3, r4)             // r3 = prefetch info | remaining blocks.
    loop0(.Lepilog, r2)
  }
    l2fetch(r1, r3)
  .falign
.Lepilog:
  {
    // Copy one word; the store consumes the value loaded in this packet.
    r4 = memw(r1++#4)
    memw(r0++#4) = r4.new
  } :endloop0

    jumpr r31

.size hexagon_memcpy_forward_vp4cp4n2, . - hexagon_memcpy_forward_vp4cp4n2