/*
 * Copyright (c) 2002-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
;
;			Copy bytes of data around. Handles overlapped data.
;
;
#include <ppc/asm.h>
#include <ppc/proc_reg.h>
#include <assym.s>

;       These routines use CR5 for certain flags:
;		Use CR5_lt to indicate non-cached (in bcopy and memcpy)
#define noncache 20


;       The bcopy_phys variants use a stack frame so they can call bcopy as a subroutine.
#define BCOPY_SF_SIZE   32      // total size
#define BCOPY_SF_MSR    16      // we save caller's MSR here (possibly minus VEC and FP)


#define kShort  32              // short operands are special cased


; void bcopy_physvir_32(from, to, nbytes)
;
; Attempt to copy physically addressed memory with translation on if conditions are met.
; Otherwise do a normal bcopy_phys.  This routine is used because some 32-bit processors
; are very slow doing real-mode (translation off) copies, so we set up temporary BATs
; for the passed phys addrs and do the copy with translation on.
;
; Rules are: - neither source nor destination can cross a page.
;            - Interrupts must be disabled when this routine is called.
;            - Translation must be on when called.
;
; To do the copy, we build a 128KB DBAT for both the source and sink.  If both are the same, only one
; is loaded.  We do not touch the IBATs, so there is no issue if either physical page
; address is the same as the virtual address of the instructions we are executing.
;
; At the end, we invalidate the used DBATs.
;
; Note that the address parameters are long longs.  We will transform these to 64-bit
; values.  On 32-bit architectures this ignores the high half of the
; passed-in value.  This should be OK, since addresses larger than 32 bits cannot occur
; there anyhow.
;
; Note also that this routine is used only on 32-bit machines. If you're contemplating use
; on a 64-bit processor, use the physical memory window instead; please refer to copypv()
; for an example of how this is done.

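; As a rough C sketch (illustrative only, not part of the build, and the helper
; name is ours), the "neither operand may cross a page" rule enforced below
; amounts to the following, assuming 4KB pages:
;
;     #include <stdint.h>
;     #include <stdbool.h>
;
;     static bool crosses_page(uint32_t addr, uint32_t nbytes) {
;         uint32_t last = addr + (nbytes - 1);             /* last byte touched */
;         return ((addr ^ last) & ~(uint32_t)0xFFF) != 0;  /* page bits differ  */
;     }
;
; The assembly performs the same XOR test on both source and sink and falls
; back to bcopy_phys if either operand crosses a page boundary.
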
			.align	5
			.globl	EXT(bcopy_physvir_32)

LEXT(bcopy_physvir_32)
            mflr    r0                          ; get return address
            rlwinm	r3,r3,0,1,0					; Duplicate high half of long long paddr into top of reg
            mfsprg	r8,2						; get processor feature flags
            stw     r0,8(r1)                    ; save return address
			rlwimi	r3,r4,0,0,31				; Combine bottom of long long to full 64-bits
            stwu    r1,-BCOPY_SF_SIZE(r1)       ; push on a stack frame so we can call bcopy
            mtcrf	0x02,r8						; move pf64Bit to cr6 so we can test
            subi    r0,r7,1                     ; get length - 1
			rlwinm	r4,r5,0,1,0					; Duplicate high half of long long paddr into top of reg
			add		r11,r3,r0					; Point to last byte of sink
			mr		r5,r7						; Get the length into the right register
            rlwimi	r4,r6,0,0,31				; Combine bottom of long long to full 64-bits

; This test for page overflow may not work if the length is negative.  Negative lengths are invalid input
; to bcopy_physvir_32() on 32-bit machines, and will result in a panic.

			add		r12,r4,r0					; Point to last byte of source
			xor		r7,r11,r3					; See if we went to next page
			xor		r8,r12,r4					; See if we went to next page
			or		r0,r7,r8					; Combine wrap

//			li		r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2)	; Set default attributes
			li		r9,((2<<3)|2)				; Set default attributes
			rlwinm.	r0,r0,0,0,19				; Did we overflow a page?
			li		r7,2						; Set validity flags
			li		r8,2						; Set validity flags
			bne-	bcopy_phys1					; Overflowed page, do normal physical copy...

			rlwimi	r11,r9,0,15,31				; Set sink lower DBAT value
			rlwimi	r12,r9,0,15,31				; Set source lower DBAT value
			rlwimi	r7,r11,0,0,14				; Set sink upper DBAT value
			rlwimi	r8,r12,0,0,14				; Set source upper DBAT value
			cmplw	cr1,r11,r12					; See if sink and source are same block

			sync

			mtdbatl	0,r11						; Set sink lower DBAT
			mtdbatu	0,r7						; Set sink upper DBAT

			beq-	cr1,bcpvsame				; Source and sink are in same block

			mtdbatl	1,r12						; Set source lower DBAT
			mtdbatu	1,r8						; Set source upper DBAT

bcpvsame:
            sync                                ; wait for the BATs to stabilize
            isync

            bl      EXT(bcopy)                  ; BATs set up, args in r3-r5, so do the copy with DR on

            li		r0,0						; Get set to invalidate upper half of BATs
			sync								; Make sure all is well
			mtdbatu	0,r0						; Clear sink upper DBAT
			mtdbatu	1,r0						; Clear source upper DBAT
			sync
			isync

            lwz     r0,BCOPY_SF_SIZE+8(r1)      ; get return address
            addi    r1,r1,BCOPY_SF_SIZE         ; pop off stack frame
            mtlr    r0
            blr


; void bcopy_phys(from, to, nbytes)
;
; Turns off data translation before the copy.  This one will not work in user state.
; This routine is used on 32 and 64-bit machines.
;
; Note that the address parameters are long longs.  We will transform these to 64-bit
; values.  On 32-bit architectures this ignores the high half of the
; passed-in value.  This should be OK, since addresses larger than 32 bits cannot occur
; there anyhow.
;
; Also note that you probably will not be happy if either the sink or source spans across the
; boundary between RAM and I/O space.  There is a good chance of hanging the machine, and this
; code does not check for it, so be careful.
;
; NOTE: when called, translation must be on, and we must be in 32-bit mode.
;       Interrupts may or may not be disabled.
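;
; As an illustrative C sketch (not part of the build, helper name ours), the
; long long parameter handling described above simply reassembles each
; (high, low) register pair into one value:
;
;     #include <stdint.h>
;
;     static uint64_t assemble_paddr(uint32_t hi, uint32_t lo) {
;         return ((uint64_t)hi << 32) | lo;    /* the rlwinm/rlwimi pairs below */
;     }
;
; On a 32-bit processor only the low 32 bits of the result are actually used.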

			.align	5
			.globl	EXT(bcopy_phys)

LEXT(bcopy_phys)
            mflr    r0                          ; get return address
            rlwinm	r3,r3,0,1,0					; Duplicate high half of long long paddr into top of reg
            stw     r0,8(r1)                    ; save
            mfsprg	r8,2						; get processor feature flags
            stwu    r1,-BCOPY_SF_SIZE(r1)       ; push on a stack frame so we can call bcopy
			rlwimi	r3,r4,0,0,31				; Combine bottom of long long to full 64-bits
			rlwinm	r4,r5,0,1,0					; Duplicate high half of long long paddr into top of reg
			mtcrf	0x02,r8						; move pf64Bit to cr6 so we can test
			rlwimi	r4,r6,0,0,31				; Combine bottom of long long to full 64-bits
			mr		r5,r7						; Get the length into the right register

bcopy_phys1:									; enter from bcopy_physvir with pf64Bit in cr6 and parms in r3-r5
			mfmsr	r9							; Get the MSR
			lis		r6,hi16(MASK(MSR_VEC))		; Get vector enable
            ori     r6,r6,lo16(MASK(MSR_FP)|MASK(MSR_DR))	; Add in FP and DR
            andc    r9,r9,r6                    ; unconditionally turn DR, VEC, and FP off
            bt++	pf64Bitb,bcopy_phys64		; skip if 64-bit (only they take hint)

; 32-bit CPUs

			mtmsr	r9							; turn DR, FP, and VEC off
			isync								; Wait for it

            bl      EXT(bcopy)                  ; do the copy with translation off and caching on

			mfmsr	r9							; Get the MSR
            ori     r9,r9,lo16(MASK(MSR_DR))    ; turn translation back on (but leave VEC and FP off)
            mtmsr   r9                          ; restore msr
            isync                               ; wait for it to happen
            lwz     r0,BCOPY_SF_SIZE+8(r1)      ; get return address once translation is back on
            mtlr    r0
            addi    r1,r1,BCOPY_SF_SIZE         ; pop off stack frame
            blr


; 64-bit: turn DR off and SF on.

bcopy_phys64:									; r9 = MSR with DR, VEC, and FP off
            ori     r8,r9,lo16(MASK(MSR_DR))    ; make a copy with DR back on... this is what we return to caller
			srdi	r2,r3,31					; Get a 1 if source is in I/O memory
            li		r0,1						; Note - we use this in a couple places below
			srdi	r10,r4,31					; Get a 1 if sink is in I/O memory
            std     r8,BCOPY_SF_MSR(r1)         ; save caller's MSR so we remember whether EE was on
            rldimi	r9,r0,63,MSR_SF_BIT			; set SF on in MSR we will copy with
			cmpldi	cr0,r2,1					; Is source in I/O memory?
			cmpldi	cr7,r10,1					; Is sink in I/O memory?
            mtmsrd	r9							; turn 64-bit addressing on, data translation off
            isync								; wait for it to happen
			cror	cr7_eq,cr0_eq,cr7_eq		; See if either source or sink is in I/O area
            beq--   cr7,io_space_real_mode_copy ; an operand is in I/O space

            bl      EXT(bcopy)                  ; do copy with DR off and SF on, cache enabled

bcopy_phys64x:
			mfmsr	r9							; Get the MSR we used to copy
            rldicl	r9,r9,0,MSR_SF_BIT+1		; clear SF
            ori     r9,r9,lo16(MASK(MSR_DR))    ; turn translation back on
            mtmsrd  r9                          ; turn 64-bit mode off, translation back on
            isync								; wait for it to happen
            lwz     r0,BCOPY_SF_SIZE+8(r1)      ; get return address once translation is back on
            ld      r8,BCOPY_SF_MSR(r1)         ; get caller's MSR once translation is back on
            mtlr    r0
            mtmsrd  r8,1                        ; turn EE back on if necessary
            addi    r1,r1,BCOPY_SF_SIZE         ; pop off stack frame
            blr

;   We need to copy with DR off, but one of the operands is in I/O space.  To avoid wedging U3,
;   which cannot handle a cache burst in I/O space, we must turn caching off for the real memory access.
;   This can only be done by setting bits in HID4.  We cannot lose control and execute random code in
;   this state, so we have to disable interrupts as well.  This is an unpleasant hack.

io_space_real_mode_copy:                        ; r0=1, r9=MSR we want to copy with
			sldi	r11,r0,31-MSR_EE_BIT		; Get a mask for the EE bit
			sldi	r0,r0,32+8					; Get the right bit to turn off caching
			andc	r9,r9,r11					; Turn off EE bit
			mfspr	r2,hid4						; Get HID4
			mtmsrd	r9,1                        ; Force off EE
			or		r2,r2,r0					; Set bit to make real accesses cache-inhibited
			sync								; Sync up
			mtspr	hid4,r2						; Make real accesses cache-inhibited
			isync								; Toss prefetches

			lis		r12,0xE000					; Get the unlikeliest ESID possible
			srdi	r12,r12,1					; Make 0x7FFFFFFFF0000000
			slbie	r12							; Make sure the ERAT is cleared

			sync
			isync

            bl      EXT(bcopy_nc)               ; copy with SF on and EE, DR, VEC, and FP off, cache inhibited

			li		r0,1						; Get a 1
			sldi	r0,r0,32+8					; Get the right bit to turn off caching
			mfspr	r2,hid4						; Get HID4
			andc	r2,r2,r0					; Clear bit so real accesses are no longer cache-inhibited
			sync								; Sync up
			mtspr	hid4,r2						; Make real accesses not cache-inhibited
			isync								; Toss prefetches

			lis		r12,0xE000					; Get the unlikeliest ESID possible
			srdi	r12,r12,1					; Make 0x7FFFFFFFF0000000
			slbie	r12							; Make sure the ERAT is cleared
            b       bcopy_phys64x


;
; shortcopy
;
; Special case short operands (<32 bytes), which are very common.  Note that the check for
; reverse vs normal moves isn't quite correct in 64-bit mode; in rare cases we will move in
; reverse when it wasn't necessary to do so.  This is OK, since performance of the two cases
; is similar.  We do get the direction right when it counts (ie, when the operands overlap.)
; Also note that we use the G3/G4 "backend" code, even on G5.  This is OK too, since G5 has
; plenty of load/store dispatch bandwidth in this case, the extra ops are hidden by latency,
; and using word instead of doubleword moves reduces the possibility of unaligned accesses,
; which cost about 20 cycles if they cross a 32-byte boundary on G5.  Finally, because we
; might do unaligned accesses this code cannot be called from bcopy_nc().
;           r4 = destination
;           r5 = length (<32)
;           r6 = source
;           r12 = (dest - source)
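;
; For reference, the forward-vs-reverse decision used here (and by the full-size
; paths) is, as an illustrative C sketch (not part of the build, helper name ours):
;
;     #include <stdint.h>
;     #include <stddef.h>
;
;     static int must_copy_reverse(uintptr_t dest, uintptr_t source, size_t len) {
;         /* If the destination starts inside the source operand, a forward   */
;         /* copy would overwrite bytes before they are read, so go backwards. */
;         return (uintptr_t)(dest - source) < (uintptr_t)len;
;     }
;
; The unsigned wrap of (dest - source) lets one comparison cover both the
; dest<source and dest>=source cases.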

            .align  5
shortcopy:
            cmplw   r12,r5                      ; must move reverse if (dest-source)<length
            mtcrf   2,r5                        ; move length to cr6 and cr7 one at a time...
            mtcrf   1,r5                        ; ...which is faster on G4 and G5
            bge++   backend                     ; handle forward moves (most common case)
            add     r6,r6,r5                    ; point one past end of operands in reverse moves
            add     r4,r4,r5
            b       bbackend                    ; handle reverse moves

;
; void bcopy(from, to, nbytes)
;
; NOTE: bcopy is called from copyin and copyout etc with the "thread_recover" ptr set.
; This means bcopy must not set up a stack frame or touch non-volatile registers, and also means that it
; cannot rely on turning off interrupts, because we expect to get DSIs and have execution aborted by a "longjmp"
; to the thread_recover routine.  What this means is that it would be hard to use vector or floating point
; registers to accelerate the copy.
;
; NOTE: this code can be called in any of three "modes":
;       - on 32-bit processors (32-byte cache line)
;       - on 64-bit processors running in 32-bit mode (128-byte cache line)
;       - on 64-bit processors running in 64-bit mode (128-byte cache line)

			.align	5
			.globl	EXT(bcopy)
            .globl  EXT(bcopy_nop_if_32bit)

LEXT(bcopy)
			cmplwi	cr1,r5,kShort               ; less than 32 bytes?
            sub.    r12,r4,r3					; test for to==from in mode-independent way, start fwd/rev check
			mr		r6,r3						; Set source (must preserve r3 for memcpy return)
			blt     cr1,shortcopy               ; special case short operands
			crclr	noncache					; Set cached
LEXT(bcopy_nop_if_32bit)
            bne++   copyit64                    ; handle 64-bit processor (patched to NOP if 32-bit processor)
			bne+    copyit32					; handle 32-bit processor
            blr                                 ; to==from so nothing to do

;
; bcopy_nc(from, to, nbytes)
;
; bcopy_nc() operates on non-cached memory so we can not use any kind of cache instructions.
; Furthermore, we must avoid all unaligned accesses on 64-bit machines, since they take
; alignment exceptions.  Thus we cannot use "shortcopy", which could do unaligned lwz/stw.
; Like bcopy(), bcopy_nc() can be called both in 32- and 64-bit mode.

			.align	5
			.globl	EXT(bcopy_nc)
            .globl  EXT(bcopy_nc_nop_if_32bit)

LEXT(bcopy_nc)
			cmpwi	cr1,r5,0					; Check if we have a 0 length
            sub.	r12,r4,r3					; test for to==from in mode-independent way, start fwd/rev check
			mr		r6,r3						; Set source (must preserve r3 for memcpy return)
			crset	noncache					; Set non-cached
			cror    cr0_eq,cr1_eq,cr0_eq        ; set cr0 beq if either length zero or to==from
LEXT(bcopy_nc_nop_if_32bit)
            bne++   copyit64                    ; handle 64-bit processor (patched to NOP if 32-bit processor)
			bne+    copyit32					; handle 32-bit processor
            blr                                 ; either zero length or to==from

;
; void* memcpy(to, from, nbytes)
; void* memmove(to, from, nbytes)
;
; memcpy() and memmove() are only called in 32-bit mode, albeit on both 32- and 64-bit processors.
; However, they would work correctly if called in 64-bit mode.
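;
; For reference, the C-level contract being implemented is roughly the following
; (an illustrative sketch, not part of the build; the _sketch name is ours):
;
;     #include <stddef.h>
;
;     void bcopy(const void *src, void *dst, size_t n);   /* handles overlap */
;
;     void *memmove_sketch(void *dst, const void *src, size_t n) {
;         bcopy(src, dst, n);      /* same body; only the argument order and */
;         return dst;              /* the returned pointer differ            */
;     }
;
; memcpy() shares the memmove() entry point here, so it tolerates overlapping
; operands as well, though callers should not depend on that.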

			.align	5
			.globl	EXT(memcpy)
			.globl	EXT(memmove)
            .globl  EXT(memcpy_nop_if_32bit)

LEXT(memcpy)
LEXT(memmove)
			cmplwi	cr1,r5,kShort               ; less than 32 bytes?
            sub.    r12,r3,r4					; test for to==from in mode-independent way, start fwd/rev check
			mr		r6,r4						; Set source
			mr		r4,r3						; Set the "to" (must preserve r3 for return value)
			blt     cr1,shortcopy               ; special case short operands
			crclr	noncache					; Set cached
LEXT(memcpy_nop_if_32bit)
            bne++   copyit64                    ; handle 64-bit processor (patched to NOP if 32-bit processor)
			beqlr-                              ; exit if to==from


;       Here to copy on 32-bit processors.
;
;			When we move the memory, forward overlays must be handled.  We
;			also cannot use the cache instructions if we came from bcopy_nc.
;			We need to preserve R3 because it needs to be returned for memcpy.
;			We can be interrupted and lose control here.
;
;           When entered:
;               r4 = destination
;               r5 = length (>0)
;               r6 = source
;               r12 = (dest - source)
;               cr5 = noncache flag

copyit32:                                       ; WARNING! can drop down to this label
            cmplw   cr1,r12,r5                  ; must move reverse if (dest-source)<length
            cntlzw  r11,r5                      ; get magnitude of length
            dcbt    0,r6                        ; start to touch in source
            lis     r10,hi16(0x80000000)        ; get 0x80000000
            neg     r9,r4                       ; start to get alignment for destination
            dcbtst  0,r4                        ; start to touch in destination
            sraw    r8,r10,r11                  ; get mask based on operand length, to limit alignment
            blt-    cr1,reverse32bit            ; reverse move required

; Forward moves on 32-bit machines, also word aligned uncached ops on 64-bit machines.
; NOTE: we never do an unaligned access if the source and destination are "relatively"
; word aligned.  We depend on this in the uncached case on 64-bit processors.
;               r4 = destination
;               r5 = length (>0)
;               r6 = source
;               r8 = inverse of largest mask smaller than operand length
;               r9 = neg(dest), used to compute alignment
;               cr5 = noncache flag
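;
; The "r8 = inverse of largest mask smaller than operand length" trick is easier
; to see in C (an illustrative sketch, not part of the build, helper name ours;
; len is assumed > 0 here):
;
;     #include <stdint.h>
;
;     static uint32_t limit_align(uint32_t align_bytes, uint32_t len) {
;         /* keep = all bits below the highest set bit of len, so the alignment */
;         /* phase can never move more bytes than the whole operand contains    */
;         uint32_t keep = (1u << (31 - __builtin_clz(len))) - 1;
;         return align_bytes & keep;
;     }
;
; The assembly builds the same mask with cntlzw and sraw and applies it with andc.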

forward32bit:                                   ; enter from 64-bit CPUs with word aligned uncached operands
			rlwinm	r7,r9,0,0x1F				; get bytes to 32-byte-align destination
			andc.   r0,r7,r8					; limit to the maximum front end move
            mtcrf   0x01,r0                     ; move length to cr6 and cr7 one cr at a time...
			beq		alline						; Already on a line...

			mtcrf	0x02,r0						; ...since moving more than one is slower on G4 and G5
			sub		r5,r5,r0					; Set the length left to move

			bf		31,alhalf					; No single byte to do...
			lbz		r7,0(r6)					; Get the byte
			addi	r6,r6,1						; Point to the next
			stb		r7,0(r4)					; Save the single
			addi	r4,r4,1						; Bump sink

;			Sink is halfword aligned here

alhalf:		bf		30,alword					; No halfword to do...
			lhz		r7,0(r6)					; Get the halfword
			addi	r6,r6,2						; Point to the next
			sth		r7,0(r4)					; Save the halfword
			addi	r4,r4,2						; Bump sink

;			Sink is word aligned here

alword:		bf		29,aldouble					; No word to do...
			lwz		r7,0(r6)					; Get the word
			addi	r6,r6,4						; Point to the next
			stw		r7,0(r4)					; Save the word
			addi	r4,r4,4						; Bump sink

;			Sink is double aligned here

aldouble:	bf		28,alquad					; No double to do...
			lwz		r7,0(r6)					; Get the first word
			lwz		r8,4(r6)					; Get the second word
			addi	r6,r6,8						; Point to the next
			stw		r7,0(r4)					; Save the first word
			stw		r8,4(r4)					; Save the second word
			addi	r4,r4,8						; Bump sink

;			Sink is quadword aligned here

alquad:		bf		27,alline					; No quad to do...
			lwz		r7,0(r6)					; Get the first word
			lwz		r8,4(r6)					; Get the second word
			lwz		r9,8(r6)					; Get the third word
			stw		r7,0(r4)					; Save the first word
			lwz		r11,12(r6)					; Get the fourth word
			addi	r6,r6,16					; Point to the next
			stw		r8,4(r4)					; Save the second word
			stw		r9,8(r4)					; Save the third word
			stw		r11,12(r4)					; Save the fourth word
			addi	r4,r4,16					; Bump sink

;			Sink is line aligned here

alline:		rlwinm.	r0,r5,27,5,31				; Get the number of full lines to move
            mtcrf   0x02,r5                     ; move length to cr6 and cr7 one cr at a time...
			mtcrf	0x01,r5						; ...since moving more than one is slower on G4 and G5
			beq-	backend						; No full lines to move

            mtctr   r0                          ; set up loop count
			li		r0,96						; Stride for touch ahead
            b       nxtline

            .align  4
nxtline:
            lwz		r2,0(r6)					; Get the first word
			lwz		r5,4(r6)					; Get the second word
			lwz		r7,8(r6)					; Get the third word
			lwz		r8,12(r6)					; Get the fourth word
			lwz		r9,16(r6)					; Get the fifth word
			lwz		r10,20(r6)					; Get the sixth word
			lwz		r11,24(r6)					; Get the seventh word
			lwz		r12,28(r6)					; Get the eighth word
			bt-		noncache,skipz				; Skip if we are not cached...
			dcbz	0,r4						; Blow away the whole line because we are replacing it
			dcbt	r6,r0						; Touch ahead a bit
skipz:
			addi	r6,r6,32					; Point to the next
			stw		r2,0(r4)					; Save the first word
			stw		r5,4(r4)					; Save the second word
			stw		r7,8(r4)					; Save the third word
			stw		r8,12(r4)					; Save the fourth word
			stw		r9,16(r4)					; Save the fifth word
			stw		r10,20(r4)					; Save the sixth word
			stw		r11,24(r4)					; Save the seventh word
			stw		r12,28(r4)					; Save the eighth word
			addi	r4,r4,32					; Bump sink
			bdnz+	nxtline						; Do the next line, if any...


;			Move backend quadword

backend:                                        ; Join here from "shortcopy" for forward moves <32 bytes
            bf		27,noquad					; No quad to do...
			lwz		r7,0(r6)					; Get the first word
			lwz		r8,4(r6)					; Get the second word
			lwz		r9,8(r6)					; Get the third word
			lwz		r11,12(r6)					; Get the fourth word
			stw		r7,0(r4)					; Save the first word
			addi	r6,r6,16					; Point to the next
			stw		r8,4(r4)					; Save the second word
			stw		r9,8(r4)					; Save the third word
			stw		r11,12(r4)					; Save the fourth word
			addi	r4,r4,16					; Bump sink

;			Move backend double

noquad:		bf		28,nodouble					; No double to do...
			lwz		r7,0(r6)					; Get the first word
			lwz		r8,4(r6)					; Get the second word
			addi	r6,r6,8						; Point to the next
			stw		r7,0(r4)					; Save the first word
			stw		r8,4(r4)					; Save the second word
			addi	r4,r4,8						; Bump sink

;			Move backend word

nodouble:	bf		29,noword					; No word to do...
			lwz		r7,0(r6)					; Get the word
			addi	r6,r6,4						; Point to the next
			stw		r7,0(r4)					; Save the word
			addi	r4,r4,4						; Bump sink

;			Move backend halfword

noword:		bf		30,nohalf					; No halfword to do...
			lhz		r7,0(r6)					; Get the halfword
			addi	r6,r6,2						; Point to the next
			sth		r7,0(r4)					; Save the halfword
			addi	r4,r4,2						; Bump sink

;			Move backend byte

nohalf:		bflr    31                          ; Leave cuz we are all done...
			lbz		r7,0(r6)					; Get the byte
			stb		r7,0(r4)					; Save the single
            blr


; Reverse moves on 32-bit machines, also reverse word aligned uncached moves on 64-bit machines.
; NOTE: we never do an unaligned access if the source and destination are "relatively"
; word aligned.  We depend on this in the uncached case on 64-bit processors.
; These are slower because we don't bother with dcbz.  Fortunately, reverse moves are uncommon.
;               r4 = destination
;               r5 = length (>0)
;               r6 = source
;               r8 = inverse of largest mask smaller than operand length
;               cr5 = noncache flag (but we don't dcbz anyway)

reverse32bit:									; here from 64-bit code with word aligned uncached operands
            add		r4,r5,r4					; Point past the last sink byte
			add		r6,r5,r6					; Point past the last source byte
			rlwinm	r7,r4,0,0x1F				; Calculate the length to align dest on cache boundary
			li		r12,-1						; Make sure we touch in the actual line
			andc.   r0,r7,r8					; Apply movement limit
			dcbt	r12,r6						; Touch in the last line of source
            mtcrf   0x01,r0                     ; move length to cr6 and cr7 one cr at a time...
			dcbtst	r12,r4						; Touch in the last line of the sink
			mtcrf	0x02,r0						; ...since moving more than one is slower on G4 and G5
			beq-	balline						; Already on cache line boundary (or too short to bother)

			sub		r5,r5,r0					; Precalculate move length left after alignment

			bf		31,balhalf					; No single byte to do...
			lbz		r7,-1(r6)					; Get the byte
			subi	r6,r6,1						; Point to the next
			stb		r7,-1(r4)					; Save the single
			subi	r4,r4,1						; Bump sink

;			Sink is halfword aligned here

balhalf:	bf		30,balword					; No halfword to do...
			lhz		r7,-2(r6)					; Get the halfword
			subi	r6,r6,2						; Point to the next
			sth		r7,-2(r4)					; Save the halfword
			subi	r4,r4,2						; Bump sink

;			Sink is word aligned here

balword:	bf		29,baldouble				; No word to do...
			lwz		r7,-4(r6)					; Get the word
			subi	r6,r6,4						; Point to the next
			stw		r7,-4(r4)					; Save the word
			subi	r4,r4,4						; Bump sink

;			Sink is double aligned here

baldouble:	bf		28,balquad					; No double to do...
			lwz		r7,-8(r6)					; Get the first word
			lwz		r8,-4(r6)					; Get the second word
			subi	r6,r6,8						; Point to the next
			stw		r7,-8(r4)					; Save the first word
			stw		r8,-4(r4)					; Save the second word
			subi	r4,r4,8						; Bump sink

;			Sink is quadword aligned here

balquad:	bf		27,balline					; No quad to do...
			lwz		r7,-16(r6)					; Get the first word
			lwz		r8,-12(r6)					; Get the second word
			lwz		r9,-8(r6)					; Get the third word
			lwz		r11,-4(r6)					; Get the fourth word
			stw		r7,-16(r4)					; Save the first word
			subi	r6,r6,16					; Point to the next
			stw		r8,-12(r4)					; Save the second word
			stw		r9,-8(r4)					; Save the third word
			stw		r11,-4(r4)					; Save the fourth word
			subi	r4,r4,16					; Bump sink

;			Sink is line aligned here

balline:	rlwinm.	r0,r5,27,5,31				; Get the number of full lines to move
            mtcrf   0x02,r5                     ; move length to cr6 and cr7 one cr at a time...
			mtcrf	0x01,r5						; ...since moving more than one is slower on G4 and G5
			beq-	bbackend					; No full lines to move
            mtctr   r0                          ; set up loop count
            b       bnxtline

            .align  4
bnxtline:
			lwz		r7,-32(r6)					; Get the first word
			lwz		r5,-28(r6)					; Get the second word
			lwz		r2,-24(r6)					; Get the third word
			lwz		r12,-20(r6)					; Get the fourth word
			lwz		r11,-16(r6)					; Get the fifth word
			lwz		r10,-12(r6)					; Get the sixth word
			lwz		r9,-8(r6)					; Get the seventh word
			lwz		r8,-4(r6)					; Get the eighth word
			subi	r6,r6,32					; Point to the next

			stw		r7,-32(r4)					; Save the first word
            stw		r5,-28(r4)					; Save the second word
			stw		r2,-24(r4)					; Save the third word
			stw		r12,-20(r4)					; Save the fourth word
			stw		r11,-16(r4)					; Save the fifth word
			stw		r10,-12(r4)					; Save the sixth word
			stw		r9,-8(r4)					; Save the seventh word
			stw		r8,-4(r4)					; Save the eighth word
			subi	r4,r4,32					; Bump sink

			bdnz+	bnxtline					; Do the next line, if any...

;
;			Note: We touched these lines in at the beginning
;

;			Move backend quadword

bbackend:                                       ; Join here from "shortcopy" for reverse moves of <32 bytes
            bf		27,bnoquad					; No quad to do...
			lwz		r7,-16(r6)					; Get the first word
			lwz		r8,-12(r6)					; Get the second word
			lwz		r9,-8(r6)					; Get the third word
			lwz		r11,-4(r6)					; Get the fourth word
			stw		r7,-16(r4)					; Save the first word
			subi	r6,r6,16					; Point to the next
			stw		r8,-12(r4)					; Save the second word
			stw		r9,-8(r4)					; Save the third word
			stw		r11,-4(r4)					; Save the fourth word
			subi	r4,r4,16					; Bump sink

;			Move backend double

bnoquad:	bf		28,bnodouble				; No double to do...
			lwz		r7,-8(r6)					; Get the first word
			lwz		r8,-4(r6)					; Get the second word
			subi	r6,r6,8						; Point to the next
			stw		r7,-8(r4)					; Save the first word
			stw		r8,-4(r4)					; Save the second word
			subi	r4,r4,8						; Bump sink

;			Move backend word

bnodouble:	bf		29,bnoword					; No word to do...
			lwz		r7,-4(r6)					; Get the word
			subi	r6,r6,4						; Point to the next
			stw		r7,-4(r4)					; Save the word
			subi	r4,r4,4						; Bump sink

;			Move backend halfword

bnoword:	bf		30,bnohalf					; No halfword to do...
			lhz		r7,-2(r6)					; Get the halfword
			subi	r6,r6,2						; Point to the next
			sth		r7,-2(r4)					; Save the halfword
			subi	r4,r4,2						; Bump sink

;			Move backend byte

bnohalf:	bflr    31                          ; Leave cuz we are all done...
			lbz		r7,-1(r6)					; Get the byte
			stb		r7,-1(r4)					; Save the single
			blr


// Here on 64-bit processors, which have a 128-byte cache line.  This can be
// called either in 32 or 64-bit mode, which makes the test for reverse moves
// a little tricky.  We've already filtered out the (source==dest) and (len==0)
// special cases.
//
// When entered:
//		r4 = destination (32 or 64-bit ptr)
//		r5 = length (always 32 bits)
//		r6 = source (32 or 64-bit ptr)
//      r12 = (dest - source), reverse move required if (dest-source)<length
//		cr5 = noncache flag
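//
// For reference (an illustrative C sketch, not part of the build, helper name
// ours), the mode-independent direction test done below with subc/addze is
// roughly equivalent to:
//
//      #include <stdint.h>
//
//      static int forward_is_safe(uint64_t dest, uint64_t source, uint32_t len) {
//          // subc sets the carry (no borrow) exactly when (dest-source) >= len;
//          // addze then turns that carry into 0 or 1 for the beq test.
//          return (dest - source) >= (uint64_t)len;
//      }
//
// Doing the subtraction at full register width is what keeps the test correct
// whether the pointers are 32- or 64-bit.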

        .align	5
copyit64:
        rlwinm  r7,r5,0,0,31        // truncate length to 32-bit, in case we're running in 64-bit mode
        cntlzw	r11,r5				// get magnitude of length
        dcbt	0,r6				// touch in 1st block of source
        dcbtst	0,r4				// touch in 1st destination cache block
        subc    r7,r12,r7           // set Carry if (dest-source)>=length, in mode-independent way
        li      r0,0                // get a 0
        lis     r10,hi16(0x80000000)// get 0x80000000
        addze.  r0,r0               // set cr0 on carry bit (beq if reverse move required)
        neg     r9,r4               // start to get alignment for destination
        sraw    r8,r10,r11          // get mask based on operand length, to limit alignment
        bt--	noncache,c64uncached// skip if uncached
        beq--	c64rdouble          // handle cached reverse moves


// Forward, cached or doubleword aligned uncached.  This is the common case.
// NOTE: we never do an unaligned access if the source and destination are "relatively"
// doubleword aligned.  We depend on this in the uncached case.
//      r4 = destination
//      r5 = length (>0)
//      r6 = source
//      r8 = inverse of largest mask smaller than operand length
//      r9 = neg(dest), used to compute alignment
//      cr5 = noncache flag
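//
// As an illustrative C sketch (not part of the build, names ours), the operand
// is carved up below as:
//
//      #include <stdint.h>
//
//      struct carve { uint32_t byte_align, dword_align, chunks, tail_dwords, tail_bytes; };
//
//      static struct carve carve_up(uint32_t align_bytes, uint32_t len) {
//          struct carve c;
//          c.byte_align  = align_bytes & 7;        // bytes to doubleword-align dest
//          c.dword_align = align_bytes >> 3;       // doublewords to 128-byte-align dest
//          c.chunks      = (len - align_bytes) >> 7;        // full 128-byte chunks
//          c.tail_dwords = ((len - align_bytes) >> 3) & 15; // leftover doublewords
//          c.tail_bytes  = (len - align_bytes) & 7;         // leftover bytes
//          return c;
//      }
//
// where align_bytes is (-dest & 127), already limited by the operand length.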

c64double:
        rlwinm  r7,r9,0,0x7F        // get #bytes to 128-byte align destination
        andc    r7,r7,r8            // limit by operand length
        andi.	r8,r7,7				// r8 <- #bytes to doubleword align
        srwi	r9,r7,3				// r9 <- #doublewords to 128-byte align
        sub		r5,r5,r7			// adjust length remaining
        cmpwi	cr1,r9,0			// any doublewords to move to cache align?
        srwi	r10,r5,7			// r10 <- 128-byte chunks to xfer after aligning dest
        cmpwi	cr7,r10,0			// set cr7 on chunk count
        beq		c64double2			// dest already doubleword aligned
        mtctr	r8
        b		c64double1

        .align	5					// align inner loops
c64double1:							// copy bytes until dest is doubleword aligned
        lbz		r0,0(r6)
        addi	r6,r6,1
        stb		r0,0(r4)
        addi	r4,r4,1
        bdnz	c64double1

c64double2:							// r9/cr1=doublewords, r10/cr7=128-byte chunks
        beq		cr1,c64double4		// no doublewords to xfer in order to cache align
        mtctr	r9
        b		c64double3

        .align	5					// align inner loops
c64double3:							// copy doublewords until dest is 128-byte aligned
        ld		r7,0(r6)
        addi	r6,r6,8
        std		r7,0(r4)
        addi	r4,r4,8
        bdnz	c64double3

// Here to xfer 128-byte chunks, if any.  Since we only have 8 GPRs for
// data (64 bytes), we load/store each twice per 128-byte chunk.

c64double4:							// r10/cr7=128-byte chunks
        rlwinm	r0,r5,29,28,31		// r0 <- count of leftover doublewords, after moving chunks
        cmpwi	cr1,r0,0			// set cr1 on leftover doublewords
        beq		cr7,c64double7		// no 128-byte chunks

        ; We must check for (source-dest)<128 in a mode-independent way.  If within 128 bytes,
        ; turn on "noncache" because we cannot use dcbz128 even if operands are cacheable.

        sub		r8,r6,r4			// r8 <- (source - dest)
        rldicr. r0,r8,0,63-7        // zero low 7 bits and check for 0, mode independent
        cror	noncache,cr0_eq,noncache	// turn on "noncache" flag if (source-dest)<128
        mtctr	r10
        b		c64InnerLoop

        .align	5					// align inner loop
c64InnerLoop:						// loop copying 128-byte cache lines to 128-aligned destination
        ld		r0,0(r6)			// start pipe: load 1st half-line
        ld		r2,8(r6)
        ld		r7,16(r6)
        ld		r8,24(r6)
        ld		r9,32(r6)
        ld		r10,40(r6)
        ld		r11,48(r6)
        ld		r12,56(r6)
        bt		noncache,c64InnerLoop1	// skip if uncached or overlap
        dcbz128	0,r4				// avoid prefetch of next cache line
c64InnerLoop1:

        std		r0,0(r4)
        std		r2,8(r4)
        std		r7,16(r4)
        std		r8,24(r4)
        std		r9,32(r4)
        std		r10,40(r4)
        std		r11,48(r4)
        std		r12,56(r4)

        ld		r0,64(r6)			// load 2nd half of chunk
        ld		r2,72(r6)
        ld		r7,80(r6)
        ld		r8,88(r6)
        ld		r9,96(r6)
        ld		r10,104(r6)
        ld		r11,112(r6)
        ld		r12,120(r6)
        addi	r6,r6,128

        std		r0,64(r4)
        std		r2,72(r4)
        std		r7,80(r4)
        std		r8,88(r4)
        std		r9,96(r4)
        std		r10,104(r4)
        std		r11,112(r4)
        std		r12,120(r4)
        addi	r4,r4,128			// advance to next dest chunk

        bdnz	c64InnerLoop		// loop if more chunks


c64double7:         	            // r5 <- leftover bytes, cr1 set on doubleword count
        rlwinm	r0,r5,29,28,31		// r0 <- count of leftover doublewords (0-15)
        andi.	r5,r5,7				// r5/cr0 <- count of leftover bytes (0-7)
        beq		cr1,c64byte			// no leftover doublewords
        mtctr	r0
        b		c64double8

        .align	5					// align inner loop
c64double8:							// loop copying leftover doublewords
        ld		r0,0(r6)
        addi	r6,r6,8
        std		r0,0(r4)
        addi	r4,r4,8
        bdnz	c64double8


// Forward byte loop.

c64byte:							// r5/cr0 <- byte count (can be big if unaligned uncached)
		beqlr                       // done if no leftover bytes
        mtctr	r5
        b		c64byte1

        .align	5					// align inner loop
c64byte1:
        lbz		r0,0(r6)
        addi	r6,r6,1
        stb		r0,0(r4)
        addi	r4,r4,1
        bdnz	c64byte1

        blr


// Uncached copies.  We must avoid unaligned accesses, since they always take alignment
// exceptions on uncached memory on 64-bit processors.  This may mean we copy long operands
// a byte at a time, but that is still much faster than alignment exceptions.
//      r4 = destination
//      r5 = length (>0)
//      r6 = source
//      r8 = inverse of largest mask smaller than operand length
//      r9 = neg(dest), used to compute alignment
//      r12 = (dest-source), used to test relative alignment
//      cr0 = beq if reverse move required
//      cr5 = noncache flag
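//
// An illustrative C sketch (not part of the build, helper name ours) of how the
// uncached cases are dispatched on the *relative* alignment of the operands:
//
//      #include <stdint.h>
//
//      static int uncached_width(uint64_t dest, uint64_t source) {
//          uint64_t delta = dest - source;
//          if ((delta & 7) == 0) return 8;   // doubleword path (ld/std)
//          if ((delta & 3) == 0) return 4;   // word path (the G3/G4 code)
//          return 1;                         // byte-at-a-time path
//      }
//
// Only the relative alignment matters: once the destination has been aligned by
// the front-end loops, a relatively aligned source is aligned as well.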

c64uncached:
        rlwinm	r10,r12,0,29,31		// relatively doubleword aligned?
        rlwinm	r11,r12,0,30,31		// relatively word aligned?
        cmpwi	cr7,r10,0			// set cr7 beq if doubleword aligned
        cmpwi	cr1,r11,0			// set cr1 beq if word aligned
        beq--   c64reverseUncached

        beq		cr7,c64double		// doubleword aligned
        beq		cr1,forward32bit    // word aligned, use G3/G4 code
        cmpwi	r5,0				// set cr0 on byte count
        b		c64byte				// unaligned operands

c64reverseUncached:
        beq		cr7,c64rdouble		// doubleword aligned so can use LD/STD
        beq		cr1,reverse32bit	// word aligned, use G3/G4 code
        add		r6,r6,r5			// point to (end+1) of source and dest
        add		r4,r4,r5
        cmpwi	r5,0				// set cr0 on length
        b		c64rbyte			// copy a byte at a time



// Reverse doubleword copies.  This is used for all cached copies, and doubleword
// aligned uncached copies.
//      r4 = destination
//      r5 = length (>0)
//      r6 = source
//      r8 = inverse of largest mask of low-order 1s smaller than operand length
//      cr5 = noncache flag

c64rdouble:
        add		r6,r6,r5			// point to (end+1) of source and dest
        add		r4,r4,r5
        rlwinm	r7,r4,0,29,31		// r7 <- #bytes to doubleword align dest
        andc.   r7,r7,r8            // limit by operand length
        sub		r5,r5,r7			// adjust length
        srwi	r8,r5,6				// r8 <- 64-byte chunks to xfer
        cmpwi	cr1,r8,0			// any chunks?
        beq		c64rd2				// dest already doubleword aligned
        mtctr	r7

c64rd1:								// copy bytes until dest is doubleword aligned
        lbzu	r0,-1(r6)
        stbu	r0,-1(r4)
        bdnz	c64rd1

c64rd2:								// r8/cr1 <- count of 64-byte chunks
        rlwinm	r0,r5,29,29,31		// r0 <- count of leftover doublewords
        andi.	r5,r5,7				// r5/cr0 <- count of leftover bytes
        cmpwi	cr7,r0,0			// leftover doublewords?
        beq		cr1,c64rd4			// no chunks to xfer
        mtctr	r8
        b		c64rd3

        .align	5					// align inner loop
c64rd3:								// loop copying 64-byte chunks
        ld		r7,-8(r6)
        ld		r8,-16(r6)
        ld		r9,-24(r6)
        ld		r10,-32(r6)
        ld		r11,-40(r6)
        ld		r12,-48(r6)
        std		r7,-8(r4)
        std		r8,-16(r4)
        ld		r7,-56(r6)
        ldu		r8,-64(r6)
        std		r9,-24(r4)
        std		r10,-32(r4)
        std		r11,-40(r4)
        std		r12,-48(r4)
        std		r7,-56(r4)
        stdu	r8,-64(r4)
        bdnz	c64rd3

c64rd4:								// r0/cr7 = leftover doublewords  r5/cr0 = leftover bytes
        beq		cr7,c64rbyte		// no leftover doublewords
        mtctr	r0

c64rd5:								// loop copying leftover doublewords
        ldu		r0,-8(r6)
        stdu	r0,-8(r4)
        bdnz	c64rd5


// Reverse byte loop.

c64rbyte:							// r5/cr0 <- byte count (can be big if unaligned uncached)
        beqlr                       // done if no leftover bytes
        mtctr	r5

c64rbyte1:
        lbzu	r0,-1(r6)
        stbu	r0,-1(r4)
        bdnz	c64rbyte1

        blr
