1//===----------------------Hexagon builtin routine ------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// An optimized version of a memcpy which is equivalent to the following loop:
10//
11//   volatile unsigned *dest;
12//   unsigned *src;
13//
14//   for (i = 0; i < num_words; ++i)
15//     *dest++ = *src++;
16//
17// The corresponding C prototype for this function would be
18// void hexagon_memcpy_forward_vp4cp4n2(volatile unsigned *dest,
19//                                      const unsigned *src,
20//                                      unsigned num_words);
21//
22// *** Both dest and src must be aligned to 32-bit boundaries. ***
23// The code does not perform any runtime checks for this, and will fail
24// in bad ways if this requirement is not met.
25//
26// The "forward" in the name refers to the fact that the function copies
27// the words going forward in memory.  It is incorrect to use this function
28// for cases where the original code copied words in any other order.
29//
30// *** This function is only for the use by the compiler. ***
31// The only intended use is for the LLVM compiler to generate calls to
32// this function, when a mem-copy loop, like the one above, is detected.
33
34  .text
35
36// Inputs:
37//   r0: dest
38//   r1: src
39//   r2: num_words
//
// Nothing is returned.  r3-r5 and p0 are used as scratch, r0/r1 are advanced
// past the copied data, r2 is modified, and hardware loops 0 and 1 are
// programmed.  The stack is not touched.
//
// The copy runs in three phases, split at source page boundaries
// (page = 4096 bytes = 1024 words):
//   1. prolog: word-by-word copy up to the next source page boundary, or of
//              all num_words if the copy ends before that boundary;
//   2. main:   one outer iteration per whole page, loading 8 bytes at a time;
//   3. epilog: word-by-word copy of the remaining (< 1024) words.
// Before each prolog/epilog run, and before every page of the main loop,
// an l2fetch is issued on the current source address.
//
// NOTE(review): the l2fetch descriptors 0x202000 | n appear to encode
// stride = 32 bytes, width = 32 bytes and n = number of 32-byte blocks
// (the code only ever ORs a block count into the low bits) -- confirm the
// exact field layout against the Hexagon Programmer's Reference Manual.
40
41  .globl  hexagon_memcpy_forward_vp4cp4n2
42  .balign 32
43  .type   hexagon_memcpy_forward_vp4cp4n2,@function
44hexagon_memcpy_forward_vp4cp4n2:
45
46    // Compute r3 to be the number of words remaining in the current page.
47    // At the same time, compute r4 to be the number of 32-byte blocks
48    // remaining in the page (for prefetch).
49  {
50    r3 = sub(##4096, r1)        // low 12 bits = bytes from src to end of page
51    r5 = lsr(r2, #3)            // r5 = num_words / 8 = 32-byte blocks in the whole copy
52  }
53  {
54    // The byte count before end-of-page is in the 12 lowest bits of r3.
55    // (If the address in r1 was already page-aligned, the bits are 0.)
    // Both instructions read the r3 produced by the previous packet, since
    // register reads within a packet see the values from before the packet.
56    r3 = extractu(r3, #10, #2)  // bits 11:2 -> words to end of page
57    r4 = extractu(r3, #7, #5)   // bits 11:5 -> 32-byte blocks to end of page
58  }
59  {
60    r3 = minu(r2, r3)           // r3 = words the prolog will copy
61    r4 = minu(r5, r4)           // r4 = 32-byte blocks to prefetch for the prolog
62  }
    // Finish the prefetch descriptor and skip the prolog entirely when it has
    // nothing to copy (src already page-aligned, or num_words == 0).
63  {
64    r4 = or(r4, ##2105344)      // 2105344 = 0x202000
65    p0 = cmp.eq(r3, #0)
66    if (p0.new) jump:nt .Lskipprolog
67  }
68    l2fetch(r1, r4)             // start the prefetch described by r4 at src
69  {
70    loop0(.Lprolog, r3)         // hardware loop 0: r3 iterations
71    r2 = sub(r2, r3)            // r2 = number of words left after the prolog.
72  }
73  .falign
74.Lprolog:
75  {
76    r4 = memw(r1++#4)           // load one word, src += 4
77    memw(r0++#4) = r4.new       // store the word just loaded, dest += 4
78  } :endloop0
79.Lskipprolog:
    // Invariant here: r2 = words still to copy, and if r2 != 0 then r1 is
    // page-aligned (the prolog either reached the page boundary or consumed
    // every word).
80  {
81    // Let r3 = number of whole pages left (page = 1024 words).
82    r3 = lsr(r2, #10)
83    if (cmp.eq(r3.new, #0)) jump:nt .Lskipmain   // no whole page: r2 < 1024 already
84  }
85  {
86    loop1(.Lout, r3)            // outer loop: one iteration per whole page (reads r3 before it is overwritten below)
87    r2 = extractu(r2, #10, #0)  // r2 = r2 & 1023
88    r3 = ##2105472              // r3 = 0x202080 (prefetch info)
89  }
    // 0x202080 = 0x202000 | 128: 128 blocks * 32 bytes = one 4096-byte page.
90    // Iterate over pages.
91  .falign
92.Lout:
93    // Prefetch each individual page.
94    l2fetch(r1, r3)
95    loop0(.Lpage, #512)         // 512 iterations * 8 bytes = one page
96  .falign
    // The source is read as 8-byte doublewords (r1 is page-aligned when the
    // main loop is entered), while dest is written with two word stores --
    // presumably because dest is only guaranteed to be word-aligned.
97.Lpage:
98    r5:4 = memd(r1++#8)         // load two consecutive words into r4, r5; src += 8
99  {
100    memw(r0++#8) = r4          // first word -> dest, dest += 8
101    memw(r0+#4) = r5           // second word -> dest + 4 (r0 as it was before this packet)
102  } :endloop0:endloop1
103.Lskipmain:
    // Invariant here: r2 < 1024 words remain (possibly 0).
104  {
105    r3 = ##2105344              // r3 = 0x202000 (prefetch info)
106    r4 = lsr(r2, #3)            // r4 = number of 32-byte blocks remaining.
107    p0 = cmp.eq(r2, #0)
108    if (p0.new) jumpr:nt r31    // nothing left to copy: return
109  }
110  {
111    r3 = or(r3, r4)             // r3 = prefetch descriptor for the tail
112    loop0(.Lepilog, r2)         // hardware loop 0: one iteration per remaining word
113  }
114    l2fetch(r1, r3)             // start the prefetch for the tail at src
115  .falign
116.Lepilog:
117  {
118    r4 = memw(r1++#4)           // load one word, src += 4
119    memw(r0++#4) = r4.new       // store the word just loaded, dest += 4
120  } :endloop0
121
122    jumpr r31                   // return to caller
123
124.size hexagon_memcpy_forward_vp4cp4n2, . - hexagon_memcpy_forward_vp4cp4n2
125