/*
 * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <ppc/asm.h>
#include <ppc/exception.h>
#include <assym.s>

        .text
        .align	2
        .globl	_memset
        .globl	_bzero
        .globl	_bzero_nc
        .globl	_bzero_phys
        .globl	_bzero_phys_nc


// *****************************
// * B Z E R O _ P H Y S _ N C *
// *****************************
//
// void bzero_phys_nc(addr64_t phys_addr, uint32_t length);
//
// Takes a phys addr in (r3,r4), and length in r5.  NO CACHING
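//
// The first two instructions below merge the two halves of the 64-bit physical
// address passed in (r3,r4) into a single 64-bit value in r3; roughly the C
// sketch of the effect is:
//		r3 = ((uint64_t)(uint32_t)r3 << 32) | (uint32_t)r4;
// bzero_phys below performs the same merge before calling bzero().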

        .align	5
LEXT(bzero_phys_nc)
        mflr	r12				// save return address
        rlwinm	r3,r3,0,1,0		// coalesce long-long in (r3,r4) into reg64_t in r3
        rlwimi	r3,r4,0,0,31
        mr		r4,r5			// put length where bzero() expects it
        bl		EXT(ml_set_physical_get_ffs)	// turn DR off, SF on, features in cr6, old MSR in r11
        bl		EXT(bzero_nc)		// use the uncached bzero_nc() routine
        mtlr	r12				// restore return
        b		EXT(ml_restore)		// restore MSR, turning DR on and SF off


// ***********************
// * B Z E R O _ P H Y S *
// ***********************
//
// void bzero_phys(addr64_t phys_addr, uint32_t length);
//
// Takes a phys addr in (r3,r4), and length in r5.  We leave cache on.

        .align	5
LEXT(bzero_phys)
        mflr	r12				// save return address
        rlwinm	r3,r3,0,1,0		// coalesce long-long in (r3,r4) into reg64_t in r3
        rlwimi	r3,r4,0,0,31
        mr		r4,r5			// put length where bzero() expects it
        bl		EXT(ml_set_physical_get_ffs)	// turn DR off, SF on, features in cr6, old MSR in r11
        bl		EXT(bzero)		// use normal bzero() routine
        mtlr	r12				// restore return
        b		EXT(ml_restore)		// restore MSR, turning DR on and SF off


// *******************
// * B Z E R O _ N C *
// *******************
//
//	void bzero_nc(char *addr, unsigned int length);
//
// For use with uncached memory.  Doesn't seem to be used at all, so probably not
// performance critical.  NB: we must avoid unaligned stores, because some
// machines (e.g., the 970) take alignment exceptions on _any_ unaligned op to
// uncached memory.  Of course, we must also avoid dcbz.

LEXT(bzero_nc)
        cmplwi	cr1,r4,20		// too short to bother with 16-byte loops?
        cmplwi	cr7,r4,0		// check for (len==0)
        li		r6,0			// get a 0
        bge		cr1,bznc1		// skip if length >=20
        mtctr	r4				// set up byte loop
        beqlr--	cr7				// done if len=0

// Short operands, loop over bytes.

bznc0:
        stb		r6,0(r3)
        addi	r3,r3,1
        bdnz	bznc0
        blr

// Handle operands long enough to do doubleword stores; we must doubleword
// align, to avoid alignment exceptions.
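//
// The byte count needed to reach the next 8-byte boundary is computed below as
// (0 - addr) & 7; roughly the C sketch is:
//		align = (unsigned)(0 - (uintptr_t)addr) & 7;	// 0 if already aligned
// memset() further down uses the same neg/andi. idiom for doubleword alignment.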

bznc1:
        neg		r7,r3			// start to compute #bytes to align
        mfsprg	r10,2			// get feature flags
        andi.	r0,r7,7			// get #bytes to doubleword align
        mr		r5,r3			// make copy of operand ptr as bzero() expects
        mtcrf	0x02,r10		// put pf64Bitb etc in cr6
        beq		bzero_tail		// already doubleword aligned
        sub		r4,r4,r0		// adjust count
        mtctr	r0				// set up loop
bznc2:							// zero bytes until doubleword aligned
        stb		r6,0(r5)
        addi	r5,r5,1
        bdnz	bznc2
        b		bzero_tail		// join bzero, now that r5 is aligned


// *************     ***************
// * B Z E R O * and * M E M S E T *
// *************     ***************
//
// void *   memset(void *b, int c, size_t len);
// void		bzero(void *b, size_t len);
//
// These routines support G3, G4, and the 970, and run in both 32 and
// 64-bit mode.  Lengths (size_t) are always 32 bits.
//
// Register use:
//    r0 = temp
//    r2 = temp
//    r3 = original ptr, not changed since memset returns it
//    r4 = count of bytes to set
//    r5 = working operand ptr ("rp")
//    r6 = value to store (usually 0)
// r7-r9 = temps
//   r10 = feature flags
//   r11 = old MSR (if bzero_phys)
//   r12 = return address (if bzero_phys)
//   cr6 = feature flags (pf64Bit, pf128Byte, and pf32Byte)
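//
// Overall flow, roughly (a C-level sketch of what the code below does):
//		zero the leading bytes needed to reach cache-line alignment;
//		dcbz (32-byte lines) or dcbz128 (128-byte lines) one line per iteration;
//		fall into bzero_tail to store the remaining 0..linesize-1 bytes.
// Operands shorter than a cache line go straight to bzero_tail.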

        .align	5
LEXT(memset)					// void *   memset(void *b, int c, size_t len);
        andi.	r6,r4,0xFF		// copy value to working register, test for 0
        mr		r4,r5			// move length to working register
        bne--	memset1			// skip if nonzero
LEXT(bzero)						// void	bzero(void *b, size_t len);
        dcbtst	0,r3			// touch in 1st cache block
        mfsprg	r10,2			// get features
        li		r6,0			// get a 0
        neg		r7,r3			// start to compute #bytes to align
        andi.	r0,r10,pf128Byte+pf32Byte // get cache line size
        mtcrf	0x02,r10		// put pf128Byte etc in cr6
        cmplw	r4,r0			// operand length >= cache line size?
        mr		r5,r3			// make copy of operand ptr (can't change r3)
        blt		bzero_tail		// too short for dcbz (or dcbz128)
        rlwinm	r0,r7,0,0x1F	// get #bytes to  32-byte align
        rlwinm	r9,r7,0,0x7F	// get #bytes to 128-byte align
        bt++	pf128Byteb,bzero_128 // skip if 128-byte processor

// Operand length >=32 and cache line size is 32.
//		r0 = #bytes to 32-byte align
//		r4 = length
//		r5 = ptr to operand
//		r6 = 0
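//
// After subtracting the alignment bytes, the chunk count and leftover computed
// below are, roughly in C:
//		chunks   = (len - align) >> 5;		// number of 32-byte lines to dcbz
//		leftover = (len - align) & 31;		// bytes handled later by bzero_tail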

        sub		r2,r4,r0		// adjust length
        cmpwi	cr1,r0,0		// already 32-byte aligned?
        srwi.	r8,r2,5			// get #32-byte chunks
        beq		bzero_tail		// not long enough to dcbz
        mtctr	r8				// set up loop count
        rlwinm	r4,r2,0,27,31	// mask down to leftover byte count
        beq		cr1,bz_dcbz32 	// skip if already 32-byte aligned

// 32-byte align.  We just store 32 0s, rather than test and use conditional
// branches.  This is usually faster, because there are no mispredicts.

        stw		r6,0(r5)		// zero next 32 bytes
        stw		r6,4(r5)
        stw		r6,8(r5)
        stw		r6,12(r5)
        stw		r6,16(r5)
        stw		r6,20(r5)
        stw		r6,24(r5)
        stw		r6,28(r5)
        add		r5,r5,r0		// now r5 is 32-byte aligned
        b		bz_dcbz32

// Loop doing 32-byte version of DCBZ instruction.
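// dcbz establishes the addressed 32-byte data cache block as zeros, typically
// without fetching it from memory, so each iteration clears a full cache line.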

        .align	4				// align the inner loop
bz_dcbz32:
        dcbz	0,r5			// zero another 32 bytes
        addi	r5,r5,32
        bdnz	bz_dcbz32

// Store trailing bytes.  This routine is used both by bzero and memset.
//		r4 = #bytes to store (may be large if memset)
//		r5 = address
//		r6 = value to store (in all 8 bytes)
//     cr6 = pf64Bit etc flags
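//
// The tail is stored in descending power-of-two pieces.  "mtcrf 0x01,r4" copies
// the low four bits of the count into cr7, so the bf 28/29/30/31 tests below
// are, roughly in C:
//		while (n >= 16) { store 16 bytes; n -= 16; }
//		if (n & 8) store 8;  if (n & 4) store 4;
//		if (n & 2) store 2;  if (n & 1) store 1;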

bzero_tail:
        srwi.	r0,r4,4			// get #(16-byte-chunks)
        mtcrf	0x01,r4			// remaining byte count to cr7
        beq		bzt3			// no 16-byte chunks
        mtctr	r0				// set up loop count
        bt++	pf64Bitb,bzt2	// skip if 64-bit processor
        b		bzt1
        .align	5
bzt1:							// loop over 16-byte chunks on 32-bit processor
        stw		r6,0(r5)
        stw		r6,4(r5)
        stw		r6,8(r5)
        stw		r6,12(r5)
        addi	r5,r5,16
        bdnz	bzt1
        b		bzt3
        .align	5
bzt2:							// loop over 16-byte chunks on 64-bit processor
        std		r6,0(r5)
        std		r6,8(r5)
        addi	r5,r5,16
        bdnz	bzt2
        bf		28,bzt4			// 8-byte chunk?
        std		r6,0(r5)
        addi	r5,r5,8
        b		bzt4
bzt3:
        bf		28,bzt4			// 8-byte chunk?
        stw		r6,0(r5)
        stw		r6,4(r5)
        addi	r5,r5,8
bzt4:
        bf		29,bzt5			// word?
        stw		r6,0(r5)
        addi	r5,r5,4
bzt5:
        bf		30,bzt6			// halfword?
        sth		r6,0(r5)
        addi	r5,r5,2
bzt6:
        bflr	31				// byte?
        stb		r6,0(r5)
        blr

// Operand length is >=128 and cache line size is 128. We assume that
// because the linesize is 128 bytes, this is a 64-bit processor.
//		r4 = length
//		r5 = ptr to operand
//		r6 = 0
//		r7 = neg(r5)
//		r9 = #bytes to 128-byte align
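//
// The alignment arithmetic below is, roughly in C:
//		to16  = (0 - p) & 15;			// bytes to reach 16-byte alignment
//		to128 = (0 - p) & 127;			// bytes to reach 128-byte alignment
//		lines = (len - to128) >> 7;		// cache lines to dcbz128
//		left  = (len - to128) & 127;	// bytes for bzero_tail afterwards
// 16 bytes are always stored up front; len >= 128 guarantees there is room.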

        .align	5
bzero_128:
        sub		r2,r4,r9		// r2 <- length remaining after cache-line aligning
        rlwinm	r0,r7,0,0xF		// r0 <- #bytes to 16-byte align
        srwi.	r8,r2,7			// r8 <- number of cache lines to 0
        std		r6,0(r5)		// always store 16 bytes to 16-byte align...
        std		r6,8(r5)		// ...even if too short for dcbz128
        add		r5,r5,r0		// 16-byte align ptr
        sub		r4,r4,r0		// adjust count
        beq		bzero_tail		// r8==0, not long enough to dcbz128
        sub.	r7,r9,r0		// get #bytes remaining to 128-byte align
        rlwinm	r4,r2,0,0x7F	// r4 <- length remaining after dcbz128'ing
        mtctr	r8				// set up dcbz128 loop
        beq		bz_dcbz128		// already 128-byte aligned
        b		bz_align		// enter loop over 16-byte chunks

// 128-byte align by looping over 16-byte chunks.

        .align	5
bz_align:						// loop over 16-byte chunks
        subic.	r7,r7,16		// more to go?
        std		r6,0(r5)
        std		r6,8(r5)
        addi	r5,r5,16
        bgt		bz_align

        b		bz_dcbz128		// enter dcbz128 loop

// Loop over 128-byte cache lines.
//		r4 = length remaining after cache lines (0..127)
//		r5 = ptr (128-byte aligned)
//		r6 = 0
//		ctr = count of cache lines to 0

        .align	5
bz_dcbz128:
        dcbz128	0,r5			// zero a 128-byte cache line
        addi	r5,r5,128
        bdnz	bz_dcbz128

        b		bzero_tail		// handle leftovers


// Handle memset() for nonzero values.  This case is relatively infrequent;
// the large majority of memset() calls are for 0.
//		r3 = ptr
//		r4 = count
//		r6 = value in lower byte (nonzero)
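//
// The rlwimi/rlwinm sequence below replicates the fill byte into every byte of
// r6; roughly in C:
//		v = c & 0xFF;
//		v |= v << 8;		// value in low 2 bytes
//		v |= v << 16;		// value in all 4 bytes
//		v |= v << 32;		// all 8 bytes (only meaningful on 64-bit processors)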

memset1:
        cmplwi	r4,16			// too short to bother aligning?
        rlwimi	r6,r6,8,16,23	// replicate value to low 2 bytes
        mr		r5,r3			// make working copy of operand ptr
        rlwimi	r6,r6,16,0,15	// value now in all 4 bytes
        blt		bzero_tail		// length<16, we won't be using "std"
        mfsprg	r10,2			// get feature flags
        neg		r7,r5			// start to compute #bytes to align
        rlwinm	r6,r6,0,1,0		// value now in all 8 bytes (if 64-bit)
        andi.	r0,r7,7			// r0 <- #bytes to doubleword align
        stw		r6,0(r5)		// store 8 bytes to avoid a loop
        stw		r6,4(r5)
        mtcrf	0x02,r10		// get pf64Bit flag etc in cr6
        sub		r4,r4,r0		// adjust count
        add		r5,r5,r0		// doubleword align ptr
        b		bzero_tail