/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 2/20/2003, tuned for G4.  The inner loops use DCBA to avoid
 * reading destination cache lines.  Only the 7450 actually benefits from
 * this, and then only in the cold-cache case.  On 7400s and 7455s, we
 * patch the DCBAs into NOPs.
 *
 * Register usage.  Note we use R2, so this code will not run in a PEF/CFM
 * environment.  Note also the rather delicate way we assign multiple uses
 * to the same register.  Beware.
 *
 *   r0  = "w7" or temp (NB: cannot use r0 for any constant such as "c16")
 *   r2  = "w8" or vrsave ("rv")
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1", "c16", or "cm17"
 *   r7  = "w2", "c32", or "cm33"
 *   r8  = "w3", "c48", or "cm49"
 *   r9  = "w4", or "cm1"
 *   r10 = "w5", "c96", or "cm97"
 *   r11 = "w6", "c128", or "cm129"
 *   r12 = destination ptr ("rd")
 *   v0  = permute vector ("vp")
 * v1-v4 = qw's loaded from source
 * v5-v7 = permuted qw's ("vw", "vx", "vy")
 */
#define rs	r4
#define rd	r12
#define rc	r5
#define	rv	r2

#define w1	r6
#define w2	r7
#define w3	r8
#define	w4	r9
#define	w5	r10
#define	w6	r11
#define	w7	r0
#define	w8	r2

#define c16		r6
#define cm17	r6
#define c32		r7
#define cm33	r7
#define c48		r8
#define cm49	r8
#define cm1		r9
#define c96		r10
#define cm97	r10
#define c128	r11
#define cm129	r11

#define	vp	v0
#define	vw	v5
#define	vx	v6
#define	vy	v7

#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text

#define	kMedium		32				// too long for inline loopless code
#define	kLong		96				// long enough to justify use of Altivec

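// A rough C sketch of the length-based dispatch (illustrative only, not part
// of the build); copy_short/copy_medium/copy_vector are hypothetical names for
// the LShort, LMedium scalar, and vector paths below:
//
//	void *copy(void *dst, const void *src, size_t len) {
//		if (len < kMedium)			// < 32: loopless scalar code
//			copy_short(dst, src, len);
//		else if (len < kLong)		// < 96: not yet worth the Altivec setup
//			copy_medium(dst, src, len);
//		else
//			copy_vector(dst, src, len);
//		return dst;
//	}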

// Main entry points.

        .align 	5
bcopy_g4:							// void bcopy(const void *src, void *dst, size_t len)
        cmplwi	rc,kMedium			// short or long?
        sub		w1,r4,r3			// must move in reverse if (rd-rs)<rc
        cmplw	cr1,w1,rc			// set cr1 blt iff we must move reverse
        mr		rd,r4				// start to move registers to canonic spot
        mr		rs,r3
        blt+	LShort				// handle short operands
        dcbt	0,r3				// touch in destination
        b		LMedium				// join medium/long operand code

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

        .align	5
Lmemcpy_g4:							// void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_g4:						// void* memmove(void *dst, const void *src, size_t len)
        cmplwi	rc,kMedium			// short or long?
        sub		w1,r3,r4			// must move in reverse if (rd-rs)<rc
        dcbt	0,r4				// touch in the first line of source
        cmplw	cr1,w1,rc			// set cr1 blt iff we must move reverse
        mr		rd,r3				// must leave r3 alone, it is return value for memcpy etc
        bge-	LMedium				// handle medium or long operands
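
// The direction test above exploits unsigned wrap-around: a single compare of
// (dst - src) against the length tells whether a forward copy would overwrite
// source bytes before they are read.  Roughly, in C (illustrative only;
// copy_forward/copy_reverse are hypothetical names for the paths below):
//
//	if ((uint32_t)((uintptr_t)dst - (uintptr_t)src) < len)
//		copy_reverse(dst, src, len);	// dst lies inside [src, src+len)
//	else
//		copy_forward(dst, src, len);	// disjoint, or dst below src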

// Handle short operands.

LShort:
        andi.	r0,rc,0x10			// test bit 27 separately (faster on G4)
        mtcrf	0x01,rc				// put length bits 28-31 in cr7
        blt-	cr1,LShortReverse

// Forward short operands.  This is the most frequent case, so it is inline.

        beq		LShort16			// quadword to move?
        lwz		w1,0(rs)
        lwz		w2,4(rs)
        lwz		w3,8(rs)
        lwz		w4,12(rs)
        addi	rs,rs,16
        stw		w1,0(rd)
        stw		w2,4(rd)
        stw		w3,8(rd)
        stw		w4,12(rd)
        addi	rd,rd,16
LShort16:							// join here to xfer 0-15 bytes
        bf		28,2f				// doubleword?
        lwz		w1,0(rs)
        lwz		w2,4(rs)
        addi	rs,rs,8
        stw		w1,0(rd)
        stw		w2,4(rd)
        addi	rd,rd,8
2:
        bf		29,3f				// word?
        lwz		w1,0(rs)
        addi	rs,rs,4
        stw		w1,0(rd)
        addi	rd,rd,4
3:
        bf		30,4f				// halfword to move?
        lhz		w1,0(rs)
        addi	rs,rs,2
        sth		w1,0(rd)
        addi	rd,rd,2
4:
        bflr	31					// skip if no odd byte
        lbz		w1,0(rs)
        stb		w1,0(rd)
        blr
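
// The 0-15 byte tail is handled without a byte loop: bit 27 of the length
// (value 16, tested with andi. above) selects the optional 16-byte block, and
// bits 28-31 (moved into cr7) select the 8/4/2/1-byte moves.  Roughly, in C
// (illustrative only; moveN() is a hypothetical helper that copies N bytes and
// advances both pointers):
//
//	if (len & 16) move16();
//	if (len & 8)  move8();
//	if (len & 4)  move4();
//	if (len & 2)  move2();
//	if (len & 1)  move1();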


// Handle short reverse operands.
//		cr0 = bne if bit 27 of length is set
//		cr7 = bits 28-31 of length

LShortReverse:
        add		rs,rs,rc			// adjust ptrs for reverse move
        add		rd,rd,rc
        beq		LShortReverse16		// quadword to move?
        lwz		w1,-4(rs)
        lwz		w2,-8(rs)
        lwz		w3,-12(rs)
        lwzu	w4,-16(rs)
        stw		w1,-4(rd)
        stw		w2,-8(rd)
        stw		w3,-12(rd)
        stwu	w4,-16(rd)
LShortReverse16:					// join here to xfer 0-15 bytes and return
        bf		28,2f				// doubleword?
        lwz		w1,-4(rs)
        lwzu	w2,-8(rs)
        stw		w1,-4(rd)
        stwu	w2,-8(rd)
2:
        bf		29,3f				// word?
        lwzu	w1,-4(rs)
        stwu	w1,-4(rd)
3:
        bf		30,4f				// halfword to move?
        lhzu	w1,-2(rs)
        sthu	w1,-2(rd)
4:
        bflr	31					// done if no odd byte
        lbz 	w1,-1(rs)			// no update
        stb 	w1,-1(rd)
        blr
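
// Note: for the reverse case the pointers are first advanced to one past the
// end of each operand, and the same 16/8/4/2/1 selection then runs with
// negative displacements and update forms (lwzu/stwu), so no separate index
// register is needed.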


// Medium and long operands.  Use Altivec if long enough, else scalar loops.
//		w1 = (rd-rs), used to check for alignment
//     cr1 = blt iff we must move reverse

        .align	4
LMedium:
        dcbtst	0,rd				// touch in destination
        cmplwi	cr7,rc,kLong		// long enough for vectors?
        neg		w3,rd				// start to compute #bytes to align destination
        rlwinm	r0,w1,0,0x7			// check relative 8-byte alignment
        andi.	w6,w3,7				// w6 <- #bytes to 8-byte align destination
        blt		cr1,LMediumReverse	// handle reverse moves
        rlwinm	w4,w3,0,0x1F		// w4 <- #bytes to 32-byte align destination
        cmpwi	cr6,r0,0			// set cr6 beq if relatively aligned
        bge		cr7,LFwdLong		// long enough for vectors
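
// The alignment bookkeeping above, roughly in C (illustrative only):
//
//	uint32_t to8  = (uint32_t)(-(uintptr_t)dst) & 7;	// w6: bytes to 8-byte align dst
//	uint32_t to32 = (uint32_t)(-(uintptr_t)dst) & 31;	// w4: bytes to 32-byte align dst
//	int rel8 = (((uintptr_t)dst - (uintptr_t)src) & 7) == 0;	// cr6: same alignment mod 8
//
// rel8 decides whether the medium path below can move doublewords with the FPU;
// to8 and to32 feed the destination-alignment code in the medium and vector paths.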

// Medium length: use scalar loops.
//	w6/cr0 = #bytes to 8-byte align destination
//	   cr6 = beq if relatively doubleword aligned

        sub		rc,rc,w6			// decrement length remaining
        beq		1f					// skip if dest already doubleword aligned
        mtxer	w6					// set up count for move
        lswx	w1,0,rs				// move w6 bytes to align destination
        stswx	w1,0,rd
        add		rs,rs,w6			// bump ptrs past
        add		rd,rd,w6
1:
        srwi	r0,rc,4				// get # 16-byte chunks (>=1)
        mtcrf	0x01,rc				// save remaining byte count here for LShort16
        mtctr	r0					// set up 16-byte loop
        bne		cr6,3f				// not relatively doubleword aligned
        b		2f
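
// Note: the mtxer/lswx/stswx sequence above moves the 1-7 alignment bytes in a
// single load/store-string pair (XER holds the byte count), avoiding a byte
// loop.  The 16-byte loops below then use doubleword FP moves (lfd/stfd) when
// source and destination are mutually 8-byte aligned, and word moves otherwise.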

        .align	4
2:									// loop over 16-byte aligned chunks
        lfd		f0,0(rs)
        lfd		f1,8(rs)
        addi	rs,rs,16
        stfd	f0,0(rd)
        stfd	f1,8(rd)
        addi	rd,rd,16
        bdnz	2b

        b		LShort16

        .align	4
3:									// loop over 16-byte unaligned chunks
        lwz		w1,0(rs)
        lwz		w2,4(rs)
        lwz		w3,8(rs)
        lwz		w4,12(rs)
        addi	rs,rs,16
        stw		w1,0(rd)
        stw		w2,4(rd)
        stw		w3,8(rd)
        stw		w4,12(rd)
        addi	rd,rd,16
        bdnz	3b

        b		LShort16


// Vector loops.  First, we must 32-byte align the destination.
//		w1 = (rd-rs), used to check for reverse and alignment
//		w4 = #bytes to 32-byte align destination
//		rc = long enough for at least one vector loop

LFwdLong:
        cmpwi	w4,0				// dest already aligned?
        sub		rc,rc,w4			// adjust length
        mtcrf	0x01,w4				// cr7 <- #bytes to align dest
        rlwinm	w2,w1,0,0xF			// relatively 16-byte aligned?
        mtcrf	0x02,w4				// finish moving #bytes to align to cr6 and cr7
        srwi	r0,rc,6				// get # 64-byte chunks to xfer (>=1)
        cmpwi	cr5,w2,0			// set cr5 beq if relatively 16-byte aligned
        beq		LFwdAligned			// dest is already aligned

// 32-byte align destination.

        bf		31,1f				// byte to move?
        lbz		w1,0(rs)
        addi	rs,rs,1
        stb		w1,0(rd)
        addi	rd,rd,1
1:
        bf		30,2f				// halfword?
        lhz		w1,0(rs)
        addi	rs,rs,2
        sth		w1,0(rd)
        addi	rd,rd,2
2:
        bf		29,3f				// word?
        lwz		w1,0(rs)
        addi	rs,rs,4
        stw		w1,0(rd)
        addi	rd,rd,4
3:
        bf		28,4f				// doubleword?
        lwz		w1,0(rs)
        lwz		w2,4(rs)
        addi	rs,rs,8
        stw		w1,0(rd)
        stw		w2,4(rd)
        addi	rd,rd,8
4:
        bf		27,LFwdAligned		// quadword?
        lwz		w1,0(rs)
        lwz		w2,4(rs)
        lwz		w3,8(rs)
        lwz		w4,12(rs)
        addi	rs,rs,16
        stw		w1,0(rd)
        stw		w2,4(rd)
        stw		w3,8(rd)
        stw		w4,12(rd)
        addi	rd,rd,16


// Destination is 32-byte aligned.
//		r0 = count of 64-byte chunks to move (not 0)
//		rd = 32-byte aligned
//		rc = bytes remaining
//	   cr5 = beq if source is 16-byte aligned
// We set up many registers:
//	   ctr = number of 64-byte chunks to move
//	r0/cr0 = leftover QWs to move
//	   cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//	   cr6 = beq if leftover byte count is 0
//		rv = original value of vrsave
// c16 etc = loaded

LFwdAligned:
        mfspr	rv,vrsave			// get bitmap of live vector registers
        mtcrf	0x01,rc				// move leftover count to cr7 for LShort16
        rlwinm	w3,rc,0,28,31		// move last 0-15 byte count to w3
        mtctr	r0					// set up loop count
        cmpwi	cr6,w3,0			// set cr6 on leftover byte count
        oris	w1,rv,0xFF00		// we use v0-v7
        rlwinm.	r0,rc,28,30,31		// get number of quadword leftovers (0-3) and set cr0
        mtspr	vrsave,w1			// update mask
        li		c16,16				// get constants used in lvx/stvx
        li		c32,32
        li		c48,48
        li		c96,96
        li		c128,128
        bne		cr5,LForwardVecUnal	// handle unaligned operands
        b		1f

        .align	4
1:        							// loop over 64-byte chunks
        dcbt	c96,rs
        dcbt	c128,rs
        lvx		v1,0,rs
        lvx		v2,c16,rs
        lvx		v3,c32,rs
        lvx		v4,c48,rs
        addi	rs,rs,64
        dcba	0,rd				// patched to NOP on some machines
        stvx	v1,0,rd
        stvx	v2,c16,rd
        dcba	c32,rd				// patched to NOP on some machines
        stvx	v3,c32,rd
        stvx	v4,c48,rd
        addi	rd,rd,64
        bdnz	1b

        beq		4f					// no leftover quadwords
        mtctr	r0
3:									// loop over remaining quadwords (1-3)
        lvx		v1,0,rs
        addi	rs,rs,16
        stvx	v1,0,rd
        addi	rd,rd,16
        bdnz	3b
4:
        mtspr	vrsave,rv			// restore bitmap of live vr's
        bne		cr6,LShort16		// handle last 0-15 bytes if any
        blr
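
// How the vector path carves up the length, roughly in C (illustrative only):
//
//	size_t chunks = len >> 6;		// ctr: 64-byte chunks (>= 1 here)
//	size_t qwords = (len >> 4) & 3;	// r0:  leftover 16-byte quadwords
//	size_t tail   = len & 15;		// cr7: final 0-15 bytes, done by LShort16
//
// vrsave is widened to cover v0-v7 for the duration of the loop and restored
// afterwards, so the OS knows which vector registers hold live data.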


// Long, forward, unaligned vector loop.

LForwardVecUnal:
        lvsl	vp,0,rs				// get permute vector to shift left
        lvx		v1,0,rs				// prefetch 1st source quadword
        b		1f

        .align	4					// align inner loops
1:									// loop over 64-byte chunks
        lvx		v2,c16,rs
        dcbt	c96,rs
        lvx		v3,c32,rs
        dcbt	c128,rs
        lvx		v4,c48,rs
        addi	rs,rs,64
        vperm	vw,v1,v2,vp
        lvx		v1,0,rs
        vperm	vx,v2,v3,vp
        dcba	0,rd				// patched to NOP on some machines
        stvx	vw,0,rd
        vperm	vy,v3,v4,vp
        stvx	vx,c16,rd
        vperm	vw,v4,v1,vp
        dcba	c32,rd				// patched to NOP on some machines
        stvx	vy,c32,rd
        stvx	vw,c48,rd
        addi	rd,rd,64
        bdnz	1b

        beq-	4f					// no leftover quadwords
        mtctr	r0
3:									// loop over remaining quadwords
        lvx		v2,c16,rs
        addi	rs,rs,16
        vperm	vx,v1,v2,vp
        vor		v1,v2,v2			// v1 <- v2
        stvx	vx,0,rd
        addi	rd,rd,16
        bdnz	3b
4:
        mtspr	vrsave,rv			// restore bitmap of live vr's
        bne		cr6,LShort16		// handle last 0-15 bytes if any
        blr
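
// The unaligned loop above never issues a misaligned vector load: lvsl builds a
// permute vector from the low four bits of the source address, and each aligned
// 16-byte store is assembled from two adjacent aligned loads with vperm.  A
// rough C model of one vperm step (illustrative only; prev/next are hypothetical
// names for two consecutive aligned source quadwords, prev at the lower address):
//
//	unsigned s = (uintptr_t)src & 15;		// source misalignment
//	uint8_t out[16];						// one aligned store's worth
//	for (int i = 0; i < 16; i++)			// out[i] = concat(prev,next)[s+i]
//		out[i] = (s + i < 16) ? prev[s + i] : next[s + i - 16];
//
// Keeping v1 "looking ahead" means each 64-byte iteration needs only four new
// loads.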


// Medium and long, reverse moves.  We use Altivec if the operands are long enough,
// else a lwz/stw loop.
//		w1 = (rd-rs), used to check for reverse and alignment
//	   cr7 = bge if long

LMediumReverse:
        add		rd,rd,rc			// point to end of operands
        add		rs,rs,rc
        andi.	w4,rd,0x1F			// w4 <- #bytes to 32-byte align destination
        rlwinm	w6,rd,0,0x3			// w6 <- #bytes to 4-byte align destination
        bge		cr7,LLongReverse	// long enough for vectors

// Scalar loop.
//	    w6 = #bytes to 4-byte align destination

        sub		rc,rc,w6			// decrement length remaining
        mtxer	w6					// set up count for move
        sub		rs,rs,w6			// back up ptrs
        sub		rd,rd,w6
        srwi	r0,rc,4				// get # 16-byte chunks (>=1)
        mtcrf	0x01,rc				// set remaining byte count here for LShortReverse16
        lswx	w1,0,rs				// move w6 bytes to align destination
        stswx	w1,0,rd
        mtctr	r0					// set up 16-byte loop
        b		1f

        .align	4
1:									// loop over 16-byte aligned chunks
        lwz		w1,-4(rs)
        lwz		w2,-8(rs)
        lwz		w3,-12(rs)
        lwzu	w4,-16(rs)
        stw		w1,-4(rd)
        stw		w2,-8(rd)
        stw		w3,-12(rd)
        stwu	w4,-16(rd)
        bdnz	1b

        b		LShortReverse16


// Reverse vector loops.  First, we must 32-byte align the destination.
//		w1 = (rd-rs), used to check for reverse and alignment
//	w4/cr0 = #bytes to 32-byte align destination
//		rc = long enough for at least one vector loop

LLongReverse:
        sub		rc,rc,w4			// adjust length
        mtcrf	0x01,w4				// cr7 <- #bytes to align dest
        rlwinm	w2,w1,0,0xF			// relatively 16-byte aligned?
        mtcrf	0x02,w4				// finish moving #bytes to align to cr6 and cr7
        srwi	r0,rc,6				// get # 64-byte chunks to xfer (>=1)
        cmpwi	cr5,w2,0			// set cr5 beq if relatively 16-byte aligned
        beq		LReverseAligned		// dest is already aligned

// 32-byte align destination.

        bf		31,1f				// byte to move?
        lbzu 	w1,-1(rs)
        stbu 	w1,-1(rd)
1:
        bf		30,2f				// halfword?
        lhzu 	w1,-2(rs)
        sthu 	w1,-2(rd)
2:
        bf		29,3f				// word?
        lwzu 	w1,-4(rs)
        stwu 	w1,-4(rd)
3:
        bf		28,4f				// doubleword?
        lwz		w1,-4(rs)
        lwzu	w2,-8(rs)
        stw		w1,-4(rd)
        stwu	w2,-8(rd)
4:
        bf		27,LReverseAligned	// quadword?
        lwz		w1,-4(rs)
        lwz		w2,-8(rs)
        lwz		w3,-12(rs)
        lwzu	w4,-16(rs)
        stw		w1,-4(rd)
        stw		w2,-8(rd)
        stw		w3,-12(rd)
        stwu	w4,-16(rd)

// Destination is 32-byte aligned.
//		r0 = count of 64-byte chunks to move (not 0)
//		rd = 32-byte aligned
//		rc = bytes remaining
//	   cr5 = beq if source is 16-byte aligned
// We set up many registers:
//	   ctr = number of 64-byte chunks to move
//	r0/cr0 = leftover QWs to move
//	   cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//	   cr6 = beq if leftover byte count is 0
//		rv = original value of vrsave
// cm1 etc = loaded

LReverseAligned:
        mfspr	rv,vrsave			// get bitmap of live vector registers
        mtcrf	0x01,rc				// move leftover count to cr7 for LShortReverse16
        rlwinm	w3,rc,0,28,31		// move last 0-15 byte count to w3
        mtctr	r0					// set up loop count
        cmpwi	cr6,w3,0			// set cr6 on leftover byte count
        oris	w1,rv,0xFF00		// we use v0-v7
        rlwinm.	r0,rc,28,30,31		// get number of quadword leftovers (0-3) and set cr0
        mtspr	vrsave,w1			// update mask
        li		cm1,-1				// get constants used in lvx/stvx
        li		cm17,-17
        li		cm33,-33
        li		cm49,-49
        li		cm97,-97
        li		cm129,-129
        bne		cr5,LReverseVecUnal	// handle unaligned operands
        b		1f
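
// Note: lvx/stvx ignore the low four bits of the effective address, so the
// negative constants cm1/cm17/cm33/cm49 address the four quadwords immediately
// below rs/rd; one subi per 64-byte chunk is all the pointer arithmetic the
// downward loop needs.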

        .align	4					// align inner loops
1:        							// loop over 64-byte chunks
        dcbt	cm97,rs
        dcbt	cm129,rs
        lvx		v1,cm1,rs
        lvx		v2,cm17,rs
        lvx		v3,cm33,rs
        lvx		v4,cm49,rs
        subi	rs,rs,64
        stvx	v1,cm1,rd
        stvx	v2,cm17,rd
        stvx	v3,cm33,rd
        stvx	v4,cm49,rd
        subi	rd,rd,64
        bdnz	1b

        beq		4f					// no leftover quadwords
        mtctr	r0
3:									// loop over remaining quadwords (1-3)
        lvx		v1,cm1,rs
        subi	rs,rs,16
        stvx	v1,cm1,rd
        subi	rd,rd,16
        bdnz	3b
4:
        mtspr	vrsave,rv			// restore bitmap of live vr's
        bne		cr6,LShortReverse16	// handle last 0-15 bytes if any
        blr


// Long, reverse, unaligned vector loop.

LReverseVecUnal:
        lvsl	vp,0,rs				// get permute vector to shift left
        lvx		v1,cm1,rs			// v1 always looks ahead
        b		1f
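
// Note: the reverse unaligned loop below uses the same lvsl-derived selector as
// the forward case; only the vperm operand order is swapped, because here the
// look-ahead quadword (v1) is the higher-addressed one of each adjacent pair.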

        .align	4					// align the inner loops
1:									// loop over 64-byte chunks
        lvx		v2,cm17,rs
        dcbt	cm97,rs
        lvx		v3,cm33,rs
        dcbt	cm129,rs
        lvx		v4,cm49,rs
        subi	rs,rs,64
        vperm	vw,v2,v1,vp
        lvx		v1,cm1,rs
        vperm	vx,v3,v2,vp
        stvx	vw,cm1,rd
        vperm	vy,v4,v3,vp
        stvx	vx,cm17,rd
        vperm	vw,v1,v4,vp
        stvx	vy,cm33,rd
        stvx	vw,cm49,rd
        subi	rd,rd,64
        bdnz	1b

        beq		3f					// no leftover quadwords
        mtctr	r0
2:									// loop over 1-3 quadwords
        lvx		v2,cm17,rs
        subi	rs,rs,16
        vperm	vx,v2,v1,vp
        vor		v1,v2,v2			// v1 <- v2
        stvx	vx,cm1,rd
        subi	rd,rd,16
        bdnz	2b
3:
        mtspr	vrsave,rv			// restore bitmap of live vr's
        bne		cr6,LShortReverse16	// handle last 0-15 bytes if any
        blr

	COMMPAGE_DESCRIPTOR(bcopy_g4,_COMM_PAGE_BCOPY,kHasAltivec,k64Bit,kCommPageDCBA+kCommPage32)
