/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
        .align	2

/* *********************
 * * M E M S E T _ G 3 *
 * *********************
 *
 * This is a subroutine called by Libc memset and _memset_pattern for large nonzero
 * operands (zero operands are funneled into bzero.)  This version is for
 * 32-bit processors with a 32-byte cache line and no Altivec.
 *
 * Registers at entry:
 *		r4 = count of bytes to store (must be >= 32)
 *      r8 = ptr to the 1st byte to store (16-byte aligned)
 *      r9 = ptr to 16-byte pattern to store (16-byte aligned)
 * When we return:
 *		r3 = not changed, since memset returns it
 *      r4 = bytes remaining to store (will be <16)
 *      r7 = not changed
 *      r8 = ptr to next byte to store (still 16-byte aligned)
 *     r12 = not changed (holds return value for memset)
 */
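
/* For reference, a rough C sketch of the work this routine performs under the
 * register contract above (illustrative only, not part of the build; "dst",
 * "pattern", and "count" are hypothetical names standing in for r8, r9, and r4):
 *
 *      #include <string.h>
 *
 *      static void memset_g3_sketch(char **dst, const char *pattern, unsigned *count)
 *      {
 *          while (*count >= 16) {              // store the 16-byte pattern repeatedly
 *              memcpy(*dst, pattern, 16);      // the asm below does this with two stfd's
 *              *dst += 16;
 *              *count -= 16;
 *          }                                   // on exit *count is 0..15; the caller
 *      }                                       // stores those final bytes
 *
 * The cache-line alignment and 64-byte unrolled loops below are optimizations
 * of this same store loop.
 */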

        .align	4
memset_g3:
        andi.   r0,r8,16                // cache line aligned?
        lfd     f0,0(r9)                // pick up the pattern in two FPRs
        lfd     f1,8(r9)
        beq     1f                      // skip if already aligned

        // cache line align

        stfd    f0,0(r8)                // no, store another 16 bytes to align
        stfd    f1,8(r8)
        subi    r4,r4,16                // skip past the 16 bytes we just stored
        addi    r8,r8,16

        // Loop over cache lines.  This code uses a private protocol with the kernel:
        // when the kernel emulates an alignment exception on a DCBZ that occurs in the
        // commpage, it zeroes CR7.  We use this to detect the case where we are operating on
        // uncached memory, and do not use DCBZ again in this code. We assume that either
        // all the operand is cacheable or none of it is, so we only check the first DCBZ.
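        //
        // In C-like pseudocode, the check below amounts to (a sketch of the intent,
        // given the kernel behavior described above; illustrative names, not real code):
        //
        //      cr7_eq = true;                  // cmpw  cr7,r0,r0
        //      dcbz(first_line);               // may be emulated if memory is uncached
        //      if (cr7_eq)                     // CR7 untouched: memory is cacheable
        //          run_dcbz_loop();            // hypothetical: the LDcbz loop below
        //      else
        //          run_store_only_loop();      // hypothetical: the LNoDcbz loop below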
1:
        srwi.   r0,r4,6                 // get count of 64-byte chunks
        cmpw    cr7,r0,r0               // set cr7_eq (kernel turns off on alignment exception)
        rlwinm  r4,r4,0,0x3F            // mask down to residual count (0..63)
        beq     Lleftover               // no chunks
        dcbz    0,r8                    // zero first cache line (clearing cr7 if alignment exception)
        mtctr   r0
        li      r6,32                   // get an offset for DCBZ
        beq+    cr7,LDcbzEnter          // enter DCBZ loop (we didn't get an alignment exception)

        // Loop over 64-byte chunks without DCBZ.
LNoDcbz:
        stfd    f0,0(r8)
        stfd    f1,8(r8)
        stfd    f0,16(r8)
        stfd    f1,24(r8)
        stfd    f0,32(r8)
        stfd    f1,40(r8)
        stfd    f0,48(r8)
        stfd    f1,56(r8)
        addi    r8,r8,64
        bdnz    LNoDcbz

        b       Lleftover

        // Loop over 64-byte chunks using DCBZ.
LDcbz:
        dcbz    0,r8
LDcbzEnter:
        dcbz    r6,r8
        stfd    f0,0(r8)
        stfd    f1,8(r8)
        stfd    f0,16(r8)
        stfd    f1,24(r8)
        stfd    f0,32(r8)
        stfd    f1,40(r8)
        stfd    f0,48(r8)
        stfd    f1,56(r8)
        addi    r8,r8,64
        bdnz    LDcbz

        // Handle leftovers (0..63 bytes)
Lleftover:
        srwi.   r0,r4,4                 // get count of 16-byte chunks
        rlwinm  r4,r4,0,0xF             // mask down to residuals (0..15)
        beqlr                           // no 16-byte chunks so done
        mtctr   r0
2:
        stfd    f0,0(r8)
        stfd    f1,8(r8)
        addi    r8,r8,16
        bdnz    2b

        blr

	COMMPAGE_DESCRIPTOR(memset_g3,_COMM_PAGE_MEMSET_PATTERN,kCache32,kHasAltivec, \
				kCommPage32)