// memcpy_forward_vp4cp4n2.S revision 337136
//===----------------------Hexagon builtin routine ------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// An optimized version of a memcpy which is equivalent to the following loop:
//
//   volatile unsigned *dest;
//   unsigned *src;
//
//   for (i = 0; i < num_words; ++i)
//     *dest++ = *src++;
//
// The corresponding C prototype for this function would be
// void hexagon_memcpy_forward_vp4cp4n2(volatile unsigned *dest,
//                                      const unsigned *src,
//                                      unsigned num_words);
//
// *** Both dest and src must be aligned to 32-bit boundaries. ***
// The code does not perform any runtime checks for this, and will fail
// in bad ways if this requirement is not met.
//
// The "forward" in the name refers to the fact that the function copies
// the words going forward in memory.  It is incorrect to use this function
// for cases where the original code copied words in any other order.
//
// *** This function is only for use by the compiler. ***
// The only intended use is for the LLVM compiler to generate calls to
// this function when a mem-copy loop, like the one above, is detected.

//===----------------------------------------------------------------------===//
// void hexagon_memcpy_forward_vp4cp4n2(volatile unsigned *dest,
//                                      const unsigned *src,
//                                      unsigned num_words);
//
// Inputs:
//   r0: dest       (32-bit aligned; advanced as words are stored)
//   r1: src        (32-bit aligned; advanced as words are loaded)
//   r2: num_words  (number of 32-bit words still to be copied)
//
// Registers written: r0-r5, p0, and the loop0/loop1 hardware-loop state.
// Returns through r31; no result value is produced.
//
// The copy runs in three phases, each preceded by an l2fetch of the source
// data it is about to read:
//   prolog - single words, up to the end of the current 4096-byte source page
//            (or num_words, whichever is smaller);
//   main   - one outer-loop iteration per whole page (1024 words), each made
//            of 512 doubleword loads that are stored as two separate words;
//   epilog - single words, for the fewer-than-1024 words that remain.
//
// l2fetch descriptors used below (Rt = stride[31:16] : width[15:8] :
// height[7:0]; see the Hexagon Programmer's Reference Manual):
//   0x202000 (2105344): stride 32 bytes, width 32 bytes, height 0.  The
//                       number of 32-byte blocks is OR-ed into the height.
//   0x202080 (2105472): stride 32 bytes, width 32 bytes, height 128, i.e.
//                       one whole 4096-byte page.
//===----------------------------------------------------------------------===//

  .text

  .globl  hexagon_memcpy_forward_vp4cp4n2
  .balign 32
  .type   hexagon_memcpy_forward_vp4cp4n2,@function
hexagon_memcpy_forward_vp4cp4n2:

    // Compute r3 to be the number of words remaining in the current page.
    // At the same time, compute r4 to be the number of 32-byte blocks
    // remaining in the page (for prefetch), and r5 = num_words / 8, the
    // number of whole 32-byte blocks in the request.
  {
    r3 = sub(##4096, r1)
    r5 = lsr(r2, #3)
  }
  {
    // The low 12 bits of r3 now hold the BYTE count up to the end of the
    // page (0 if the address in r1 was already page-aligned):
    //   bits [11:2] -> words before end-of-page       (new r3)
    //   bits [11:5] -> 32-byte blocks before the end  (r4)
    // Both instructions read the r3 value from before this packet, so r4 is
    // derived from the un-extracted difference.
    r3 = extractu(r3, #10, #2)
    r4 = extractu(r3, #7, #5)
  }
  {
    // Never copy or prefetch more than was requested.
    r3 = minu(r2, r3)
    r4 = minu(r5, r4)
  }
  {
    // r4 (at most 127) becomes the height field of the prefetch descriptor.
    // If there is nothing to copy before the page boundary, skip the prolog;
    // the branch is hinted not-taken.
    r4 = or(r4, ##0x202000)     // stride 32, width 32, height r4
    p0 = cmp.eq(r3, #0)
    if (p0.new) jump:nt .Lskipprolog
  }
    l2fetch(r1, r4)
  {
    loop0(.Lprolog, r3)
    r2 = sub(r2, r3)            // r2 = number of words left after the prolog.
  }
  .falign
.Lprolog:
  {
    // Copy one word per iteration; the store forwards the value loaded in
    // the same packet (r4.new).
    r4 = memw(r1++#4)
    memw(r0++#4) = r4.new
  } :endloop0
.Lskipprolog:
  {
    // Let r3 = number of whole pages left (page = 1024 words).
    // With no whole page left, go straight to the epilog.
    r3 = lsr(r2, #10)
    if (cmp.eq(r3.new, #0)) jump:nt .Lskipmain
  }
  {
    loop1(.Lout, r3)            // Outer loop: one iteration per whole page.
    r2 = extractu(r2, #10, #0)  // r2 = r2 & 1023 (words left for the epilog).
    r3 = ##0x202080             // Prefetch info: 128 blocks of 32 bytes.
  }
    // Iterate over pages.  Whenever this loop is entered r1 is page-aligned:
    // either the prolog advanced it to the page boundary, or it was aligned
    // on entry.  The 64-bit loads below are therefore naturally aligned.
  .falign
.Lout:
    // Prefetch each individual page.
    l2fetch(r1, r3)
    loop0(.Lpage, #512)         // 512 doublewords = 1024 words = one page.
  .falign
.Lpage:
    r5:4 = memd(r1++#8)
  {
    // Store the doubleword as two word stores (presumably because dest is
    // only guaranteed 32-bit alignment).  Both stores use the r0 value from
    // before the packet: low word at r0, high word at r0+4, then r0 += 8.
    memw(r0++#8) = r4
    memw(r0+#4) = r5
  } :endloop0:endloop1
.Lskipmain:
  {
    // r2 < 1024 here, so the block count in r4 is at most 127.
    // Return immediately if no words are left.
    r3 = ##0x202000             // Prefetch info: stride 32, width 32.
    r4 = lsr(r2, #3)            // r4 = number of 32-byte blocks remaining.
    p0 = cmp.eq(r2, #0)
    if (p0.new) jumpr:nt r31
  }
  {
    r3 = or(r3, r4)             // Height field = remaining 32-byte blocks.
    loop0(.Lepilog, r2)
  }
    l2fetch(r1, r3)
  .falign
.Lepilog:
  {
    // Copy the remaining words one at a time.
    r4 = memw(r1++#4)
    memw(r0++#4) = r4.new
  } :endloop0

    jumpr r31

.size hexagon_memcpy_forward_vp4cp4n2, . - hexagon_memcpy_forward_vp4cp4n2