1/*
2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31#include <debug.h>
32#include <ppc/asm.h>
33#include <ppc/proc_reg.h>
34#include <mach/ppc/vm_param.h>
35#include <assym.s>
36#include <sys/errno.h>
37
38#define INSTRUMENT 0
39
40//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
41/*
 * void pmap_zero_page(ppnum_t pn)
 *
 * Zero a page of physical memory, given its physical page number.  This routine
 * runs in 32- or 64-bit mode, and handles 32- and 128-byte cache lines.
46 */
47
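// Illustrative C-level sketch of the loop below (not part of the build).  It assumes
// "linesize" is the 32- or 128-byte cache line size selected from the feature flags:
//
//      for (off = PPC_PGBYTES - linesize; ; off -= 2 * linesize) {
//          dcbz(page + off);                   // zero the higher line of the pair
//          dcbz(page + off - linesize);        // zero the lower line
//          if (off == linesize)                // lower line was offset 0, page is done
//              break;
//      }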
48
49		.align	5
50		.globl	EXT(pmap_zero_page)
51
52LEXT(pmap_zero_page)
53
54        mflr	r12								// save return address
55        bl		EXT(ml_set_physical_disabled)	// turn DR and EE off, SF on, get features in r10
56        mtlr	r12								// restore return address
57        andi.	r9,r10,pf32Byte+pf128Byte		// r9 <- cache line size
58
59        subfic	r4,r9,PPC_PGBYTES				// r4 <- starting offset in page
60
61		bt++	pf64Bitb,page0S4				// Go do the big guys...
62
63		slwi	r3,r3,12						// get page address from page num
64		b		page_zero_1						// Jump to line aligned loop...
65
66        .align	5
67
68		nop
69		nop
70		nop
71		nop
72		nop
73		nop
74		nop
75
76page0S4:
77		sldi	r3,r3,12						// get page address from page num
78
79page_zero_1:									// loop zeroing cache lines
80        sub.	r5,r4,r9						// more to go?
81        dcbz128	r3,r4							// zero either 32 or 128 bytes
82        sub		r4,r5,r9						// generate next offset
83        dcbz128	r3,r5
84        bne--	page_zero_1
85
86        b		EXT(ml_restore)					// restore MSR and do the isync
87
88
89//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
90/* void
91 * phys_copy(src, dst, bytecount)
92 *      addr64_t 	    src;
93 *      addr64_t 	    dst;
94 *      int             bytecount
95 *
96 * This routine will copy bytecount bytes from physical address src to physical
97 * address dst.  It runs in 64-bit mode if necessary, but does not handle
98 * overlap or make any attempt to be optimal.  Length must be a signed word.
99 * Not performance critical.
100 */
101
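// Roughly equivalent C (an illustrative sketch only; the real code below runs with data
// relocation off, and in 64-bit mode on 64-bit processors, so the full addresses are usable):
//
//      void phys_copy(addr64_t src, addr64_t dst, int count) {
//          while (count >= 4) {                                // word loop
//              *(uint32_t *)(uintptr_t)dst = *(uint32_t *)(uintptr_t)src;
//              src += 4;  dst += 4;  count -= 4;
//          }
//          while (count > 0) {                                 // byte loop for the 1-3 byte tail
//              *(uint8_t *)(uintptr_t)dst = *(uint8_t *)(uintptr_t)src;
//              src += 1;  dst += 1;  count -= 1;
//          }
//      }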
102
103		.align	5
104		.globl	EXT(phys_copy)
105
106LEXT(phys_copy)
107
108		rlwinm	r3,r3,0,1,0					; Duplicate high half of long long paddr into top of reg
109        mflr	r12								// get return address
110		rlwimi	r3,r4,0,0,31				; Combine bottom of long long to full 64-bits
111		rlwinm	r4,r5,0,1,0					; Duplicate high half of long long paddr into top of reg
112        bl		EXT(ml_set_physical_disabled)	// turn DR and EE off, SF on, get features in r10
113		rlwimi	r4,r6,0,0,31				; Combine bottom of long long to full 64-bits
114        mtlr	r12								// restore return address
115        subic.	r5,r7,4							// a word to copy?
116        b		phys_copy_2
117
118		.align	5
119
120phys_copy_1:									// loop copying words
121        subic.	r5,r5,4							// more to go?
122        lwz		r0,0(r3)
123        addi	r3,r3,4
124        stw		r0,0(r4)
125        addi	r4,r4,4
126phys_copy_2:
127        bge		phys_copy_1
128        addic.	r5,r5,4							// restore count
129        ble		phys_copy_4						// no more
130
131        										// Loop is aligned here
132
133phys_copy_3:									// loop copying bytes
134        subic.	r5,r5,1							// more to go?
135        lbz		r0,0(r3)
136        addi	r3,r3,1
137        stb		r0,0(r4)
138        addi	r4,r4,1
139        bgt		phys_copy_3
140phys_copy_4:
141        b		EXT(ml_restore)					// restore MSR and do the isync
142
143
144//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
145/* void
146 * pmap_copy_page(src, dst)
147 *      ppnum_t     src;
148 *      ppnum_t     dst;
149 *
150 * This routine will copy the physical page src to physical page dst
151 *
152 * This routine assumes that the src and dst are page numbers and that the
153 * destination is cached.  It runs on 32 and 64 bit processors, with and
154 * without altivec, and with 32 and 128 byte cache lines.
 * We must also assume that no one will be executing within the destination
 * page, and that this will be used for paging.  Because this is a common
 * routine, we have tuned loops for each processor class.
158 *
159 */
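// All three paths below share the same overall structure (illustrative pseudo-code only):
//
//      for each cache line in the page:
//          dcbz/dcba the destination line      // avoid reading the old destination data
//          copy the line from src to dst       // via FPRs (G3), VRs (G4/970), or GPRs
//          dcbst/dcbf the destination line     // push it out of the data cache
//      sync                                    // wait for the stores to take
//      for each cache line: icbi the destination line     // invalidate stale i-cache lines
//      sync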
160#define	kSFSize	(FM_SIZE+160)
161
162ENTRY(pmap_copy_page, TAG_NO_FRAME_USED)
163
164		lis		r2,hi16(MASK(MSR_VEC))			; Get the vector flag
165        mflr	r0								// get return
166 		ori		r2,r2,lo16(MASK(MSR_FP))		; Add the FP flag
167		stw		r0,8(r1)						// save
168        stwu	r1,-kSFSize(r1)					// set up a stack frame for VRs or FPRs
169        mfmsr	r11								// save MSR at entry
170        mfsprg	r10,2							// get feature flags
171        andc	r11,r11,r2						// Clear out vec and fp
172        ori		r2,r2,lo16(MASK(MSR_EE))		// Get EE on also
173        andc	r2,r11,r2						// Clear out EE as well
174        mtcrf	0x02,r10						// we need to test pf64Bit
175        ori		r2,r2,MASK(MSR_FP)				// must enable FP for G3...
176        mtcrf	0x80,r10						// we need to test pfAltivec too
177        oris	r2,r2,hi16(MASK(MSR_VEC))		// enable altivec for G4 (ignored if G3)
178        mtmsr	r2								// turn EE off, FP and VEC on
179        isync
180        bt++	pf64Bitb,pmap_copy_64			// skip if 64-bit processor (only they take hint)
181 		slwi	r3,r3,12						// get page address from page num
182		slwi	r4,r4,12						// get page address from page num
183        rlwinm	r12,r2,0,MSR_DR_BIT+1,MSR_DR_BIT-1	// get ready to turn off DR
184        bt		pfAltivecb,pmap_copy_g4			// altivec but not 64-bit means G4
185
186
187        // G3 -- copy using FPRs
188
189        stfd	f0,FM_SIZE+0(r1)				// save the 4 FPRs we use to copy
190        stfd	f1,FM_SIZE+8(r1)
191        li		r5,PPC_PGBYTES/32				// count of cache lines in a page
192        stfd	f2,FM_SIZE+16(r1)
193        mtctr	r5
194        stfd	f3,FM_SIZE+24(r1)
195        mtmsr	r12								// turn off DR after saving FPRs on stack
196        isync
197
198pmap_g3_copy_loop:								// loop over 32-byte cache lines
199        dcbz	0,r4							// avoid read of dest line
200        lfd		f0,0(r3)
201        lfd		f1,8(r3)
202        lfd		f2,16(r3)
203        lfd		f3,24(r3)
204        addi	r3,r3,32
205        stfd	f0,0(r4)
206        stfd	f1,8(r4)
207        stfd	f2,16(r4)
208        stfd	f3,24(r4)
209        dcbst	0,r4							// flush dest line to RAM
210        addi	r4,r4,32
211        bdnz	pmap_g3_copy_loop
212
213        sync									// wait for stores to take
        subi	r4,r4,PPC_PGBYTES				// restore ptr to destination page
215        li		r6,PPC_PGBYTES-32				// point to last line in page
216pmap_g3_icache_flush:
217        subic.	r5,r6,32						// more to go?
218        icbi	r4,r6							// flush another line in icache
219        subi	r6,r5,32						// get offset to next line
220        icbi	r4,r5
221        bne		pmap_g3_icache_flush
222
223        sync
224        mtmsr	r2								// turn DR back on
225        isync
226        lfd		f0,FM_SIZE+0(r1)				// restore the FPRs
227        lfd		f1,FM_SIZE+8(r1)
228        lfd		f2,FM_SIZE+16(r1)
229        lfd		f3,FM_SIZE+24(r1)
230
231        b		pmap_g4_restore					// restore MSR and done
232
233
234        // G4 -- copy using VRs
235
236pmap_copy_g4:									// r2=(MSR-EE), r12=(r2-DR), r10=features, r11=old MSR
        la		r9,FM_SIZE+16(r1)				// r9 <- base of VR save area
238        li		r5,16							// load x-form offsets into r5-r9
239        li		r6,32							// another offset
240        stvx	v0,0,r9							// save some VRs so we can use to copy
241        li		r7,48							// another offset
242        stvx	v1,r5,r9
243        li		r0,PPC_PGBYTES/64				// we loop over 64-byte chunks
244        stvx	v2,r6,r9
245        mtctr	r0
246        li		r8,96							// get look-ahead for touch
247        stvx	v3,r7,r9
248        li		r9,128
249        mtmsr	r12								// now we've saved VRs on stack, turn off DR
250        isync									// wait for it to happen
251        b		pmap_g4_copy_loop
252
253        .align	5								// align inner loops
254pmap_g4_copy_loop:								// loop over 64-byte chunks
255        dcbt	r3,r8							// touch 3 lines ahead
256        nop										// avoid a 17-word loop...
257        dcbt	r3,r9							// touch 4 lines ahead
258        nop										// more padding
259        dcba	0,r4							// avoid pre-fetch of 1st dest line
260        lvx		v0,0,r3							// offset 0
261        lvx		v1,r5,r3						// offset 16
262        lvx		v2,r6,r3						// offset 32
263        lvx		v3,r7,r3						// offset 48
264        addi	r3,r3,64
265        dcba	r6,r4							// avoid pre-fetch of 2nd line
266        stvx	v0,0,r4							// offset 0
267        stvx	v1,r5,r4						// offset 16
268        stvx	v2,r6,r4						// offset 32
269        stvx	v3,r7,r4						// offset 48
270        dcbf	0,r4							// push line 1
271        dcbf	r6,r4							// and line 2
272        addi	r4,r4,64
273        bdnz	pmap_g4_copy_loop
274
275        sync									// wait for stores to take
        subi	r4,r4,PPC_PGBYTES				// restore ptr to destination page
277        li		r8,PPC_PGBYTES-32				// point to last line in page
278pmap_g4_icache_flush:
279        subic.	r9,r8,32						// more to go?
280        icbi	r4,r8							// flush from icache
281        subi	r8,r9,32						// get offset to next line
282        icbi	r4,r9
283        bne		pmap_g4_icache_flush
284
285        sync
286        mtmsr	r2								// turn DR back on
287        isync
288        la		r9,FM_SIZE+16(r1)				// get base of VR save area
289        lvx		v0,0,r9							// restore the VRs
290        lvx		v1,r5,r9
291        lvx		v2,r6,r9
292        lvx		v3,r7,r9
293
294pmap_g4_restore:								// r11=MSR
        mtmsr	r11								// turn EE on, VEC and FP off
296        isync									// wait for it to happen
297        addi	r1,r1,kSFSize					// pop off our stack frame
298        lwz		r0,8(r1)						// restore return address
299        mtlr	r0
300        blr
301
302
303        // 64-bit/128-byte processor: copy using VRs
304
305pmap_copy_64:									// r10=features, r11=old MSR
306 		sldi	r3,r3,12						// get page address from page num
307		sldi	r4,r4,12						// get page address from page num
308		la		r9,FM_SIZE+16(r1)				// get base of VR save area
309        li		r5,16							// load x-form offsets into r5-r9
310        li		r6,32							// another offset
311        bf		pfAltivecb,pmap_novmx_copy		// altivec suppressed...
        stvx	v0,0,r9							// save 8 VRs so we can copy without bubbles
313        stvx	v1,r5,r9
314        li		r7,48							// another offset
315        li		r0,PPC_PGBYTES/128				// we loop over 128-byte chunks
316        stvx	v2,r6,r9
317        stvx	v3,r7,r9
318        addi	r9,r9,64						// advance base ptr so we can store another 4
319        mtctr	r0
320        li		r0,MASK(MSR_DR)					// get DR bit
321        stvx	v4,0,r9
322        stvx	v5,r5,r9
323        andc	r12,r2,r0						// turn off DR bit
324        li		r0,1							// get a 1 to slam into SF
325        stvx	v6,r6,r9
326        stvx	v7,r7,r9
327        rldimi	r12,r0,63,MSR_SF_BIT			// set SF bit (bit 0)
328        li		r8,-128							// offset so we can reach back one line
329        mtmsrd	r12								// now we've saved VRs, turn DR off and SF on
330        isync									// wait for it to happen
331        dcbt128	0,r3,1							// start a forward stream
332        b		pmap_64_copy_loop
333
334        .align	5								// align inner loops
335pmap_64_copy_loop:								// loop over 128-byte chunks
336        dcbz128	0,r4							// avoid read of destination line
337        lvx		v0,0,r3							// offset 0
338        lvx		v1,r5,r3						// offset 16
339        lvx		v2,r6,r3						// offset 32
340        lvx		v3,r7,r3						// offset 48
341        addi	r3,r3,64						// don't have enough GPRs so add 64 2x
342        lvx		v4,0,r3							// offset 64
343        lvx		v5,r5,r3						// offset 80
344        lvx		v6,r6,r3						// offset 96
345        lvx		v7,r7,r3						// offset 112
346        addi	r3,r3,64
347        stvx	v0,0,r4							// offset 0
348        stvx	v1,r5,r4						// offset 16
349        stvx	v2,r6,r4						// offset 32
350        stvx	v3,r7,r4						// offset 48
351        addi	r4,r4,64
352        stvx	v4,0,r4							// offset 64
353        stvx	v5,r5,r4						// offset 80
354        stvx	v6,r6,r4						// offset 96
355        stvx	v7,r7,r4						// offset 112
356        addi	r4,r4,64
357        dcbf	r8,r4							// flush the line we just wrote
358        bdnz	pmap_64_copy_loop
359
360        sync									// wait for stores to take
        subi	r4,r4,PPC_PGBYTES				// restore ptr to destination page
362        li		r8,PPC_PGBYTES-128				// point to last line in page
363pmap_64_icache_flush:
364        subic.	r9,r8,128						// more to go?
365        icbi	r4,r8							// flush from icache
366        subi	r8,r9,128						// get offset to next line
367        icbi	r4,r9
368        bne		pmap_64_icache_flush
369
370        sync
371        mtmsrd	r2								// turn DR back on, SF off
372        isync
373        la		r9,FM_SIZE+16(r1)				// get base address of VR save area on stack
374        lvx		v0,0,r9							// restore the VRs
375        lvx		v1,r5,r9
376        lvx		v2,r6,r9
377        lvx		v3,r7,r9
378        addi	r9,r9,64
379        lvx		v4,0,r9
380        lvx		v5,r5,r9
381        lvx		v6,r6,r9
382        lvx		v7,r7,r9
383
384        b		pmap_g4_restore					// restore lower half of MSR and return
385
386 //
387 //		Copy on 64-bit without VMX
388 //
389
390pmap_novmx_copy:
391		li		r0,PPC_PGBYTES/128				// we loop over 128-byte chunks
392		mtctr	r0
393		li		r0,MASK(MSR_DR)					// get DR bit
394		andc	r12,r2,r0						// turn off DR bit
395		li		r0,1							// get a 1 to slam into SF
396		rldimi	r12,r0,63,MSR_SF_BIT			// set SF bit (bit 0)
397		mtmsrd	r12								// now we've saved VRs, turn DR off and SF on
398		isync									// wait for it to happen
399		dcbt128	0,r3,1							// start a forward stream
400
401pmap_novmx_copy_loop:							// loop over 128-byte cache lines
402        dcbz128	0,r4							// avoid read of dest line
403
404        ld		r0,0(r3)						// Load half a line
405        ld		r12,8(r3)
406        ld		r5,16(r3)
407        ld		r6,24(r3)
408        ld		r7,32(r3)
409        ld		r8,40(r3)
410        ld		r9,48(r3)
411        ld		r10,56(r3)
412
413        std		r0,0(r4)						// Store half a line
414        std		r12,8(r4)
415        std		r5,16(r4)
416        std		r6,24(r4)
417        std		r7,32(r4)
418        std		r8,40(r4)
419        std		r9,48(r4)
420        std		r10,56(r4)
421
422        ld		r0,64(r3)						// Load half a line
423        ld		r12,72(r3)
424        ld		r5,80(r3)
425        ld		r6,88(r3)
426        ld		r7,96(r3)
427        ld		r8,104(r3)
428        ld		r9,112(r3)
429        ld		r10,120(r3)
430
431        addi	r3,r3,128
432
433        std		r0,64(r4)						// Store half a line
434        std		r12,72(r4)
435        std		r5,80(r4)
436        std		r6,88(r4)
437        std		r7,96(r4)
438        std		r8,104(r4)
439        std		r9,112(r4)
440        std		r10,120(r4)
441
442        dcbf	0,r4							// flush the line we just wrote
443		addi	r4,r4,128
444        bdnz	pmap_novmx_copy_loop
445
446        sync									// wait for stores to take
        subi	r4,r4,PPC_PGBYTES				// restore ptr to destination page
448        li		r8,PPC_PGBYTES-128				// point to last line in page
449
450pmap_novmx_icache_flush:
451        subic.	r9,r8,128						// more to go?
452        icbi	r4,r8							// flush from icache
453        subi	r8,r9,128						// get offset to next line
454        icbi	r4,r9
455        bne		pmap_novmx_icache_flush
456
457        sync
458        mtmsrd	r2								// turn DR back on, SF off
459        isync
460
461        b		pmap_g4_restore					// restore lower half of MSR and return
462
463
464
465//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
466
467// Stack frame format used by copyin, copyout, copyinstr and copyoutstr.
468// These routines all run both on 32 and 64-bit machines, though because they are called
469// by the BSD kernel they are always in 32-bit mode when entered.  The mapped ptr returned
// by MapUserMemoryWindow will nevertheless be 64 bits on 64-bit machines.  Be careful not to
// use (32-bit) compare instructions on this ptr.  This mapped ptr is kept globally in r31, so there
472// is no need to store or load it, which are mode-dependent operations since it could be
473// 32 or 64 bits.
474
475#define	kkFrameSize	(FM_SIZE+32)
476
477#define	kkBufSize	(FM_SIZE+0)
478#define	kkCR3		(FM_SIZE+4)
479#define	kkSource	(FM_SIZE+8)
480#define	kkDest		(FM_SIZE+12)
481#define	kkCountPtr	(FM_SIZE+16)
482#define	kkR31Save	(FM_SIZE+20)
483#define	kkThrErrJmp	(FM_SIZE+24)
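// For reference, the save area can be pictured as the following struct located FM_SIZE
// bytes above the stack pointer (illustrative only; the offsets are the defines above):
//
//      struct {
//          uint32_t bufSize;       // kkBufSize   - buffer length or count
//          uint32_t cr3;           // kkCR3       - caller's cr3
//          uint32_t source;        // kkSource    - source arg (r3 at entry)
//          uint32_t dest;          // kkDest      - dest arg (r5 at entry)
//          uint32_t countPtr;      // kkCountPtr  - ptr to the #bytes-moved word
//          uint32_t r31save;       // kkR31Save   - caller's r31
//          uint32_t thrErrJmp;     // kkThrErrJmp - caller's thread_recover value
//      };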
484
485
486// nonvolatile CR bits we use as flags in cr3
487
488#define	kk64bit		12
489#define	kkNull		13
490#define	kkIn		14
491#define	kkString	15
492#define	kkZero		15
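// Note: kkZero deliberately shares a CR bit with kkString; the string flag is only needed
// to reach the string path, after which the bit is reused to remember whether a 0 was found.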
493
494
495//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
496/*
497 * int
498 * copyoutstr(src, dst, maxcount, count)
499 *	vm_offset_t	src;        // r3
500 *	addr64_t	dst;        // r4 and r5
501 *	vm_size_t	maxcount;   // r6
502 *	vm_size_t*	count;      // r7
503 *
504 * Set *count to the number of bytes copied.
505 */
506
507ENTRY(copyoutstr, TAG_NO_FRAME_USED)
508        mfcr	r2,0x10                         // save caller's cr3, which we use for flags
509        mr      r10,r4                          // move high word of 64-bit user address to r10
510        li		r0,0
511        crset	kkString						// flag as a string op
512        mr      r11,r5                          // move low word of 64-bit user address to r11
513        stw		r0,0(r7)						// initialize #bytes moved
514        crclr	kkIn							// flag as copyout
515        b		copyJoin
516
517
518//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
519/*
520 * int
521 * copyinstr(src, dst, maxcount, count)
522 *	addr64_t	src;        // r3 and r4
523 *	vm_offset_t	dst;        // r5
524 *	vm_size_t	maxcount;   // r6
525 *	vm_size_t*	count;      // r7
526 *
527 * Set *count to the number of bytes copied
528 * If dst == NULL, don't copy, just count bytes.
529 * Only currently called from klcopyinstr.
530 */
531
532ENTRY(copyinstr, TAG_NO_FRAME_USED)
533        mfcr	r2,0x10                         // save caller's cr3, which we use for flags
534        cmplwi	r5,0							// dst==NULL?
535        mr      r10,r3                          // move high word of 64-bit user address to r10
536        li		r0,0
537        crset	kkString						// flag as a string op
538        mr      r11,r4                          // move low word of 64-bit user address to r11
539        crmove	kkNull,cr0_eq					// remember if (dst==NULL)
540        stw		r0,0(r7)						// initialize #bytes moved
541        crset	kkIn							// flag as copyin (rather than copyout)
542        b		copyJoin1						// skip over the "crclr kkNull"
543
544
545//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
546/*
547 * int
548 * copyout(src, dst, count)
549 *	vm_offset_t	src;        // r3
550 *	addr64_t	dst;        // r4 and r5
551 *	size_t		count;      // r6
552 */
553
554			.align	5
555			.globl	EXT(copyout)
556			.globl	EXT(copyoutmsg)
557
558LEXT(copyout)
559LEXT(copyoutmsg)
560
561#if INSTRUMENT
562        mfspr	r12,pmc1						; INSTRUMENT - saveinstr[12] - Take stamp at copyout
563        stw		r12,0x6100+(12*16)+0x0(0)		; INSTRUMENT - Save it
564        mfspr	r12,pmc2						; INSTRUMENT - Get stamp
565        stw		r12,0x6100+(12*16)+0x4(0)		; INSTRUMENT - Save it
566        mfspr	r12,pmc3						; INSTRUMENT - Get stamp
567        stw		r12,0x6100+(12*16)+0x8(0)		; INSTRUMENT - Save it
568        mfspr	r12,pmc4						; INSTRUMENT - Get stamp
569        stw		r12,0x6100+(12*16)+0xC(0)		; INSTRUMENT - Save it
570#endif
571        mfcr	r2,0x10                         // save caller's cr3, which we use for flags
572        mr      r10,r4                          // move high word of 64-bit user address to r10
573        crclr	kkString						// not a string version
574        mr      r11,r5                          // move low word of 64-bit user address to r11
575        crclr	kkIn							// flag as copyout
576        b		copyJoin
577
578
579//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
580/*
581 * int
582 * copyin(src, dst, count)
583 *	addr64_t	src;        // r3 and r4
584 *	vm_offset_t	dst;        // r5
585 *	size_t		count;      // r6
586 */
587
588
589			.align	5
590			.globl	EXT(copyin)
591			.globl	EXT(copyinmsg)
592
593LEXT(copyin)
594LEXT(copyinmsg)
595
596        mfcr	r2,0x10                         // save caller's cr3, which we use for flags
597        mr      r10,r3                          // move high word of 64-bit user address to r10
598        crclr	kkString						// not a string version
599        mr      r11,r4                          // move low word of 64-bit user address to r11
600        crset	kkIn							// flag as copyin
601
602
603// Common code to handle setup for all the copy variants:
604//		r2 = caller's cr3
605//      r3 = source if copyout
606//      r5 = dest if copyin
607//      r6 = buffer length or count
608//      r7 = count output ptr (if kkString set)
609//	   r10 = high word of 64-bit user-space address (source if copyin, dest if copyout)
610//	   r11 = low word of 64-bit user-space address
611//     cr3 = kkIn, kkString, kkNull flags
612
613copyJoin:
614        crclr	kkNull							// (dst==NULL) convention not used with this call
615copyJoin1:										// enter from copyinstr with kkNull set
616		mflr	r0								// get return address
617        cmplwi	r6,0							// buffer length 0?
618        lis		r9,0x1000						// r9 <- 0x10000000 (256MB)
619		stw		r0,FM_LR_SAVE(r1)				// save return
620        cmplw	cr1,r6,r9						// buffer length > 256MB ?
621        mfsprg	r8,2							// get the features
622        beq--	copyinout_0						// 0 length is degenerate case
623		stwu	r1,-kkFrameSize(r1)				// set up stack frame
624        stw		r2,kkCR3(r1)                    // save caller's cr3, which we use for flags
625        mtcrf	0x02,r8							// move pf64Bit to cr6
626        stw		r3,kkSource(r1)					// save args across MapUserMemoryWindow
627        stw		r5,kkDest(r1)
628        stw		r6,kkBufSize(r1)
629        crmove	kk64bit,pf64Bitb				// remember if this is a 64-bit processor
630        stw		r7,kkCountPtr(r1)
631        stw		r31,kkR31Save(r1)				// we use r31 globally for mapped user ptr
632
633
634
635// Handle buffer length > 256MB.  This is an error (ENAMETOOLONG) on copyin and copyout.
636// The string ops are passed -1 lengths by some BSD callers, so for them we silently clamp
637// the buffer length to 256MB.  This isn't an issue if the string is less than 256MB
// (as most are!), but if it is longer than that we eventually return ENAMETOOLONG.  This restriction
639// is due to MapUserMemoryWindow; we don't want to consume more than two segments for
640// the mapping.
641
642        ble++	cr1,copyin0						// skip if buffer length <= 256MB
643        bf		kkString,copyinout_too_big		// error if not string op
644        mr		r6,r9							// silently clamp buffer length to 256MB
645        stw		r9,kkBufSize(r1)				// update saved copy too
646
647
648// Set up thread_recover in case we hit an illegal address.
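// In effect (illustrative pseudo-C of the code below):
//
//      saved_recover = thread->recover;        // stash the old recovery ptr in our frame
//      thread->recover = copyinout_error;      // a DSI in user space now lands there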
649
650copyin0:
651		li		r31,0							// no mapped ptr yet
652		mfsprg  r8,1							// Get the current thread
653		lis		r2,hi16(copyinout_error)
654		ori		r2,r2,lo16(copyinout_error)
655		lwz		r4,THREAD_RECOVER(r8)
656		lwz		r3,ACT_VMMAP(r8)				// r3 <- vm_map virtual address
657		stw		r2,THREAD_RECOVER(r8)
658		stw		r4,kkThrErrJmp(r1)
659
660
661// Map user segment into kernel map, turn on 64-bit mode.  At this point:
662//		r3 = vm map
663//		r6 = buffer length
664// r10/r11 = 64-bit user-space ptr (source if copyin, dest if copyout)
665//
666// When we call MapUserMemoryWindow, we pass:
667//      r3 = vm map ptr
668//   r4/r5 = 64-bit user space address as an addr64_t
669
670        mr      r4,r10                          // copy user ptr into r4/r5
671        mr      r5,r11
672#if INSTRUMENT
673        mfspr	r12,pmc1						; INSTRUMENT - saveinstr[13] - Take stamp before mapuseraddressspace
674        stw		r12,0x6100+(13*16)+0x0(0)		; INSTRUMENT - Save it
675        mfspr	r12,pmc2						; INSTRUMENT - Get stamp
676        stw		r12,0x6100+(13*16)+0x4(0)		; INSTRUMENT - Save it
677        mfspr	r12,pmc3						; INSTRUMENT - Get stamp
678        stw		r12,0x6100+(13*16)+0x8(0)		; INSTRUMENT - Save it
679        mfspr	r12,pmc4						; INSTRUMENT - Get stamp
680        stw		r12,0x6100+(13*16)+0xC(0)		; INSTRUMENT - Save it
681#endif
682        bl		EXT(MapUserMemoryWindow)		// get r3/r4 <- 64-bit address in kernel map of user operand
683#if INSTRUMENT
684        mfspr	r12,pmc1						; INSTRUMENT - saveinstr[14] - Take stamp after mapuseraddressspace
685        stw		r12,0x6100+(14*16)+0x0(0)		; INSTRUMENT - Save it
686        mfspr	r12,pmc2						; INSTRUMENT - Get stamp
687        stw		r12,0x6100+(14*16)+0x4(0)		; INSTRUMENT - Save it
688        mfspr	r12,pmc3						; INSTRUMENT - Get stamp
689        stw		r12,0x6100+(14*16)+0x8(0)		; INSTRUMENT - Save it
690        mfspr	r12,pmc4						; INSTRUMENT - Get stamp
691        stw		r12,0x6100+(14*16)+0xC(0)		; INSTRUMENT - Save it
692#endif
693        mr		r31,r4							// r31 <- mapped ptr into user space (may be 64-bit)
694        bf--	kk64bit,copyin1					// skip if a 32-bit processor
695
696 		rldimi	r31,r3,32,0						// slam high-order bits into mapped ptr
697        mfmsr	r4								// if 64-bit, turn on SF so we can use returned ptr
698        li		r0,1
699        rldimi	r4,r0,63,MSR_SF_BIT				// light bit 0
700        mtmsrd	r4								// turn on 64-bit mode
701        isync									// wait for mode to change
702
703
704// Load r3-r5, substituting mapped ptr as appropriate.
705
706copyin1:
707        lwz		r5,kkBufSize(r1)				// restore length to copy
708        bf		kkIn,copyin2					// skip if copyout
709        lwz		r4,kkDest(r1)					// copyin: dest is kernel ptr
710        mr		r3,r31							// source is mapped ptr
711        b		copyin3
712copyin2:										// handle copyout
713        lwz		r3,kkSource(r1)					// source is kernel buffer (r3 at entry)
714        mr		r4,r31							// dest is mapped ptr into user space
715
716
717// Finally, all set up to copy:
718//		r3 = source ptr (mapped if copyin)
719//		r4 = dest ptr (mapped if copyout)
720//		r5 = length
721//	   r31 = mapped ptr returned by MapUserMemoryWindow
722//	   cr3 = kkIn, kkString, kk64bit, and kkNull flags
723
724copyin3:
725        bt		kkString,copyString				// handle copyinstr and copyoutstr
726        bl		EXT(bcopy)						// copyin and copyout: let bcopy do the work
727        li		r3,0							// return success
728
729
730// Main exit point for copyin, copyout, copyinstr, and copyoutstr.  Also reached
731// from error recovery if we get a DSI accessing user space.  Clear recovery ptr,
732// and pop off frame.
733//		r3 = 0, EFAULT, or ENAMETOOLONG
734
735copyinx:
        lwz		r2,kkCR3(r1)                    // get caller's cr3
737		mfsprg  r6,1							// Get the current thread
738        bf--	kk64bit,copyinx1				// skip if 32-bit processor
739        mfmsr	r12
740        rldicl	r12,r12,0,MSR_SF_BIT+1			// if 64-bit processor, turn 64-bit mode off
741        mtmsrd	r12								// turn SF off
742        isync									// wait for the mode to change
743copyinx1:
744		lwz		r0,FM_LR_SAVE+kkFrameSize(r1)   // get return address
        lwz		r31,kkR31Save(r1)				// restore caller's r31
746        lwz		r4,kkThrErrJmp(r1)				// load saved thread recover
747        addi	r1,r1,kkFrameSize				// pop off our stack frame
748		mtlr	r0
749		stw		r4,THREAD_RECOVER(r6)			// restore thread recover
750        mtcrf	0x10,r2							// restore cr3
751		blr
752
753
754/* We get here via the exception handler if an illegal
755 * user memory reference was made.  This error handler is used by
756 * copyin, copyout, copyinstr, and copyoutstr.  Registers are as
757 * they were at point of fault, so for example cr3 flags are valid.
758 */
759
760copyinout_error:
761        li		r3,EFAULT						// return error
762        b		copyinx
763
764copyinout_0:									// degenerate case: 0-length copy
765		mtcrf	0x10,r2							// restore cr3
766        li		r3,0							// return success
767        blr
768
769copyinout_too_big:								// degenerate case
770        mtcrf	0x10,r2							// restore cr3
771        lwz		r1,0(r1)						// pop off stack frame
772        li		r3,ENAMETOOLONG
773        blr
774
775
776//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
777// Handle copyinstr and copyoutstr.  At this point the stack frame is set up,
778// the recovery ptr is set, the user's buffer is mapped, we're in 64-bit mode
779// if necessary, and:
780//		r3 = source ptr, mapped if copyinstr
781//		r4 = dest ptr, mapped if copyoutstr
782//		r5 = buffer length
783//	   r31 = mapped ptr returned by MapUserMemoryWindow
784//     cr3 = kkIn, kkString, kkNull, and kk64bit flags
// We do word copies unless the buffer is very short, and use a byte copy loop
// for any leftover bytes.  The crossover at which the word loop becomes
787// faster is about seven bytes, counting the zero.
788//
789// We first must word-align the source ptr, in order to avoid taking a spurious
790// page fault.
791
792copyString:
793        cmplwi	cr1,r5,15						// is buffer very short?
794        mr      r12,r3                          // remember ptr to 1st source byte
795        mtctr	r5								// assuming short, set up loop count for bytes
796        blt--   cr1,copyinstr8					// too short for word loop
797        rlwinm  r2,r3,0,0x3                     // get byte offset of 1st byte within word
798        rlwinm  r9,r3,3,0x18                    // get bit offset of 1st byte within word
799        li      r7,-1
800        sub     r3,r3,r2                        // word-align source address
801        add     r6,r5,r2                        // get length starting at byte 0 in word
802        srw     r7,r7,r9                        // get mask for bytes in first word
803        srwi	r0,r6,2							// get #words in buffer
804        lwz     r5,0(r3)                        // get aligned word with first source byte
805        lis		r10,hi16(0xFEFEFEFF)			// load magic constants into r10 and r11
806        lis		r11,hi16(0x80808080)
807        mtctr	r0								// set up word loop count
808        addi    r3,r3,4                         // advance past the source word
809        ori		r10,r10,lo16(0xFEFEFEFF)
810        ori		r11,r11,lo16(0x80808080)
        orc     r8,r5,r7                        // map bytes preceding first source byte into 0xFF
812        bt--	kkNull,copyinstr5enter          // enter loop that just counts
813
// Special case the 1st word, which has been 0xFF-filled on the left.  Note that we use
// "and.", even though we execute in both 32- and 64-bit mode.  This is OK.
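// For example (illustrative): if the string starts at byte offset 2 of its word, then
// r9 = 16, the mask is r7 = 0xFFFFFFFF >> 16 = 0x0000FFFF, and the "orc" above forces
// the two bytes preceding the string to 0xFF so they can never look like a terminator.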
816
817        slw     r5,r5,r9                        // left justify payload bytes
818        add		r9,r10,r8						// r9 =  data + 0xFEFEFEFF
819        andc	r7,r11,r8						// r7 = ~data & 0x80808080
820		subfic  r0,r2,4							// get r0 <- #payload bytes in 1st word
821        and.    r7,r9,r7						// if r7==0, then all bytes in r8 are nonzero
822        stw     r5,0(r4)                        // copy payload bytes to dest buffer
823        add		r4,r4,r0						// then point to next byte in dest buffer
824        bdnzt   cr0_eq,copyinstr6               // use loop that copies if 0 not found
825
826        b		copyinstr7                      // 0 found (buffer can't be full)
827
828
829// Word loop(s).  They do a word-parallel search for 0s, using the following
// non-obvious but very efficient test:
831//		y =  data + 0xFEFEFEFF
832//		z = ~data & 0x80808080
833// If (y & z)==0, then all bytes in dataword are nonzero.  There are two copies
834// of this loop, one that just counts and another that copies.
835//		r3 = ptr to next word of source (word aligned)
836//		r4 = ptr to next byte in buffer
837//      r6 = original buffer length (adjusted to be word origin)
//     r10 = 0xFEFEFEFF
839//     r11 = 0x80808080
840//     r12 = ptr to 1st source byte (used to determine string length)
841
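// Illustrative C equivalent of the test (assuming 32-bit unsigned words):
//
//      uint32_t y = w + 0xFEFEFEFF;
//      uint32_t z = ~w & 0x80808080;
//      if ((y & z) == 0)
//          ;   // every byte of w is nonzero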
842        .align	5								// align inner loops for speed
843copyinstr5:										// version that counts but does not copy
844        lwz     r8,0(r3)						// get next word of source
845        addi    r3,r3,4                         // advance past it
846copyinstr5enter:
847        add		r9,r10,r8						// r9 =  data + 0xFEFEFEFF
848        andc	r7,r11,r8						// r7 = ~data & 0x80808080
849        and.    r7,r9,r7                        // r7 = r9 & r7 ("." ok even in 64-bit mode)
850        bdnzt   cr0_eq,copyinstr5				// if r7==0, then all bytes in r8 are nonzero
851
852        b		copyinstr7
853
854        .align	5								// align inner loops for speed
855copyinstr6:										// version that counts and copies
856        lwz     r8,0(r3)						// get next word of source
857        addi    r3,r3,4                         // advance past it
858        addi	r4,r4,4							// increment dest ptr while we wait for data
859        add		r9,r10,r8						// r9 =  data + 0xFEFEFEFF
860        andc	r7,r11,r8						// r7 = ~data & 0x80808080
861        and.    r7,r9,r7                        // r7 = r9 & r7 ("." ok even in 64-bit mode)
862        stw		r8,-4(r4)						// pack all 4 bytes into buffer
863        bdnzt	cr0_eq,copyinstr6				// if r7==0, then all bytes are nonzero
864
865
866// Either 0 found or buffer filled.  The above algorithm has mapped nonzero bytes to 0
// and 0 bytes to 0x80, with one exception: 0x01 bytes preceding the first 0 are also
// mapped to 0x80.  We must mask out these false hits before searching for a 0x80 byte.
869//		r3 = word aligned ptr to next word of source (ie, r8==mem(r3-4))
870//      r6 = original buffer length (adjusted to be word origin)
871//      r7 = computed vector of 0x00 and 0x80 bytes
872//      r8 = original source word, coming from -4(r3), possibly padded with 0xFFs on left if 1st word
873//     r12 = ptr to 1st source byte (used to determine string length)
874//     cr0 = beq set iff 0 not found
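// Worked example (illustrative): for the big-endian source word 0x61620063 ("ab\0c"),
// the loop above leaves r7 = 0x00008000.  There are no 0x01 bytes, so the andc below
// changes nothing; srwi 8 gives 0x00000080, cntlzw gives 24, and srwi 3 gives 3, so r3
// ends up pointing one byte past the terminating 0.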
875
876copyinstr7:
877        rlwinm	r2,r8,7,0,31					// move 0x01 bits to 0x80 position
878		rlwinm  r6,r6,0,0x3						// mask down to partial byte count in last word
879        andc	r7,r7,r2						// turn off false hits from 0x0100 worst case
880        crnot	kkZero,cr0_eq					// 0 found iff cr0_eq is off
881        srwi    r7,r7,8                         // we want to count the 0 as a byte xferred
882		cmpwi   r6,0							// any bytes left over in last word?
883        cntlzw	r7,r7							// now we can find the 0 byte (ie, the 0x80)
884        subi    r3,r3,4                         // back up r3 to point to 1st byte in r8
885        srwi	r7,r7,3							// convert 8,16,24,32 to 1,2,3,4
886        add     r3,r3,r7                        // now r3 points one past 0 byte, or at 1st byte not xferred
887        bt++	kkZero,copyinstr10				// 0 found, so done
888
889        beq		copyinstr10						// r6==0, so buffer truly full
890        mtctr	r6								// 0 not found, loop over r6 bytes
891        b		copyinstr8						// enter byte loop for last 1-3 leftover bytes
892
893
894// Byte loop.  This is used for very small buffers and for the odd bytes left over
895// after searching and copying words at a time.
896//      r3 = ptr to next byte of source
897//      r4 = ptr to next dest byte
898//     r12 = ptr to first byte of source
899//     ctr = count of bytes to check
900
901        .align	5								// align inner loops for speed
902copyinstr8:										// loop over bytes of source
903        lbz		r0,0(r3)						// get next byte of source
904        addi	r3,r3,1
905        addi	r4,r4,1							// increment dest addr whether we store or not
906        cmpwi	r0,0							// the 0?
907        bt--	kkNull,copyinstr9				// don't store if copyinstr with NULL ptr
908        stb		r0,-1(r4)
909copyinstr9:
910        bdnzf	cr0_eq,copyinstr8				// loop if byte not 0 and more room in buffer
911
912        crmove	kkZero,cr0_eq					// remember if 0 found or buffer filled
913
914
915// Buffer filled or 0 found.  Unwind and return.
916//      r3 = ptr to 1st source byte not transferred
917//     r12 = ptr to 1st source byte
918//     r31 = mapped ptr returned by MapUserMemoryWindow
919//     cr3 = kkZero set iff 0 found
920
921copyinstr10:
922        lwz		r9,kkCountPtr(r1)				// get ptr to place to store count of bytes moved
923        sub     r2,r3,r12                       // compute #bytes copied (including the 0)
924        li		r3,0							// assume success return status
925        stw		r2,0(r9)						// store #bytes moved
926        bt++	kkZero,copyinx					// we did find the 0 so return 0
927        li		r3,ENAMETOOLONG					// buffer filled
928        b		copyinx							// join main exit routine
929
930//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
931/*
932 * int
933 * copypv(source, sink, size, which)
934 *	addr64_t	src;        // r3 and r4
935 *	addr64_t	dst;        // r5 and r6
936 *	size_t		size;		// r7
937 *	int			which;		// r8
938 *
939 * Operand size bytes are copied from operand src into operand dst. The source and
940 * destination operand addresses are given as addr64_t, and may designate starting
941 * locations in physical or virtual memory in any combination except where both are
942 * virtual. Virtual memory locations may be in either the kernel or the current thread's
943 * address space. Operand size may be up to 256MB.
944 *
945 * Operation is controlled by operand which, which offers these options:
946 *		cppvPsrc : source operand is (1) physical or (0) virtual
947 *		cppvPsnk : destination operand is (1) physical or (0) virtual
948 *		cppvKmap : virtual operand is in (1) kernel or (0) current thread
949 *		cppvFsnk : (1) flush destination before and after transfer
950 *		cppvFsrc : (1) flush source before and after transfer
 *		cppvNoModSnk : (1) don't set destination operand's changed bit(s)
 *		cppvNoRefSrc : (1) don't set source operand's referenced bit(s)
953 *
954 * Implementation is now split into this new 64-bit path and the old path, hw_copypv_32().
955 * This section describes the operation of the new 64-bit path.
956 *
957 * The 64-bit path utilizes the more capacious 64-bit kernel address space to create a
958 * window in the kernel address space into all of physical RAM plus the I/O hole. Since
959 * the window's mappings specify the proper access policies for the underlying memory,
960 * the new path does not have to flush caches to avoid a cache paradox, so cppvFsnk
 * and cppvFsrc are ignored. Physical operand addresses are relocated into the physical
962 * memory window, and are accessed with data relocation on. Virtual addresses are either
963 * within the kernel, or are mapped into the kernel address space through the user memory
964 * window. Because accesses to a virtual operand are performed with data relocation on,
965 * the new path does not have to translate the address, disable/enable interrupts, lock
966 * the mapping, or update referenced and changed bits.
967 *
968 * The IBM 970 (a.k.a. G5) processor treats real-mode accesses as guarded, so there is
969 * a substantial performance penalty for copypv operating in real mode. Utilizing the
970 * new 64-bit path, transfer performance increases >100% on the G5.
971 *
972 * The attentive reader may notice that mtmsrd ops are not followed by isync ops as
973 * might be expected. The 970 follows PowerPC architecture version 2.01, which defines
974 * mtmsrd with L=0 as a context synchronizing op, so a following isync is no longer
975 * required.
976 *
977 * To keep things exciting, we develop 64-bit values in non-volatiles, but we also need
978 * to call 32-bit functions, which would lead to the high-order 32 bits of our values
979 * getting clobbered unless we do something special. So, we preserve our 64-bit non-volatiles
980 * in our own stack frame across calls to 32-bit functions.
981 *
982 */
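// Typical usage (illustrative only; the flag combination is just an example): copy a page
// from a physical frame into a kernel-virtual buffer without touching the source's
// referenced bit:
//
//      copypv(phys_addr, (addr64_t)(uintptr_t)kernel_buf, PAGE_SIZE,
//             cppvPsrc | cppvKmap | cppvNoRefSrc);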
983
984// Map operand which bits into non-volatile CR2 and CR3 bits.
985#define whichAlign	((3+1)*4)
986#define whichMask	0x007F0000
987#define pvPsnk		(cppvPsnkb - whichAlign)
988#define pvPsrc		(cppvPsrcb - whichAlign)
989#define pvFsnk		(cppvFsnkb - whichAlign)
990#define pvFsrc		(cppvFsrcb - whichAlign)
991#define pvNoModSnk	(cppvNoModSnkb - whichAlign)
992#define pvNoRefSrc	(cppvNoRefSrcb - whichAlign)
993#define pvKmap		(cppvKmapb - whichAlign)
994#define pvNoCache	cr2_lt
995
996		.align	5
997		.globl	EXT(copypv)
998
999LEXT(copypv)
1000        mfsprg	r10,2							// get feature flags
1001        mtcrf	0x02,r10						// we need to test pf64Bit
1002        bt++	pf64Bitb,copypv_64				// skip if 64-bit processor (only they take hint)
1003
1004        b		EXT(hw_copypv_32)				// carry on with 32-bit copypv
1005
1006// Push a 32-bit ABI-compliant stack frame and preserve all non-volatiles that we'll clobber.
1007copypv_64:
1008		mfsprg	r9,1							// get current thread
1009		stwu	r1,-(FM_ALIGN((31-26+11)*4)+FM_SIZE)(r1)
1010												// allocate stack frame and link it
1011		mflr	r0								// get return address
1012		mfcr	r10								// get cr2 and cr3
1013		lwz		r12,THREAD_RECOVER(r9)			// get error callback
1014		stw		r26,FM_ARG0+0x00(r1)			// save non-volatile r26
1015		stw		r27,FM_ARG0+0x04(r1)			// save non-volatile r27
1016		stw		r28,FM_ARG0+0x08(r1)			// save non-volatile r28
1017		stw		r29,FM_ARG0+0x0C(r1)			// save non-volatile r29
1018		stw		r30,FM_ARG0+0x10(r1)			// save non-volatile r30
1019		stw		r31,FM_ARG0+0x14(r1)			// save non-volatile r31
1020		stw		r12,FM_ARG0+0x20(r1)			// save error callback
1021		stw		r0,(FM_ALIGN((31-26+11)*4)+FM_SIZE+FM_LR_SAVE)(r1)
1022												// save return address
1023		stw		r10,(FM_ALIGN((31-26+11)*4)+FM_SIZE+FM_CR_SAVE)(r1)
1024												// save non-volatile cr2 and cr3
1025
1026// Non-volatile register usage in this routine is:
1027//	r26: saved msr image
1028//	r27: current pmap_t / virtual source address
1029//	r28: destination virtual address
1030//	r29: source address
1031//	r30: destination address
1032//	r31: byte count to copy
1033//	cr2/3: parameter 'which' bits
1034
1035		rlwinm	r8,r8,whichAlign,whichMask		// align and mask which bits
1036		mr		r31,r7							// copy size to somewhere non-volatile
1037		mtcrf	0x20,r8							// insert which bits into cr2 and cr3
1038		mtcrf	0x10,r8							// insert which bits into cr2 and cr3
1039		rlwinm	r29,r3,0,1,0					// form source address high-order bits
1040		rlwinm	r30,r5,0,1,0					// form destination address high-order bits
1041		rlwimi	r29,r4,0,0,31					// form source address low-order bits
1042		rlwimi	r30,r6,0,0,31					// form destination address low-order bits
1043		crand	cr7_lt,pvPsnk,pvPsrc			// are both operand addresses physical?
1044		cntlzw	r0,r31							// count leading zeroes in byte count
1045		cror	cr7_eq,pvPsnk,pvPsrc			// cr7_eq <- source or destination is physical
		bf--	cr7_eq,copypv_einval			// at least one operand must be physical
		cmplwi	r0,4							// is byte count >= 256MB (2**28)?
1048		blt--	copypv_einval					// byte count too big, give EINVAL
1049		cmplwi	r31,0							// byte count zero?
1050		beq--	copypv_zero						// early out
1051		bt		cr7_lt,copypv_phys				// both operand addresses are physical
1052		mr		r28,r30							// assume destination is virtual
1053		bf		pvPsnk,copypv_dv				// is destination virtual?
1054		mr		r28,r29							// no, so source must be virtual
1055copypv_dv:
1056		lis		r27,ha16(EXT(kernel_pmap))		// get kernel's pmap_t *, high-order
1057		lwz		r27,lo16(EXT(kernel_pmap))(r27) // get kernel's pmap_t
1058		bt		pvKmap,copypv_kern				// virtual address in kernel map?
1059		lwz		r3,ACT_VMMAP(r9)				// get user's vm_map *
1060		rldicl	r4,r28,32,32					// r4, r5 <- addr64_t virtual address
1061		rldicl	r5,r28,0,32
1062		std		r29,FM_ARG0+0x30(r1)			// preserve 64-bit r29 across 32-bit call
1063		std		r30,FM_ARG0+0x38(r1)			// preserve 64-bit r30 across 32-bit call
1064		bl		EXT(MapUserMemoryWindow)		// map slice of user space into kernel space
1065		ld		r29,FM_ARG0+0x30(r1)			// restore 64-bit r29
1066		ld		r30,FM_ARG0+0x38(r1)			// restore 64-bit r30
1067		rlwinm	r28,r3,0,1,0					// convert relocated addr64_t virtual address
1068		rlwimi	r28,r4,0,0,31					//  into a single 64-bit scalar
1069copypv_kern:
1070
1071// Since we'll be accessing the virtual operand with data-relocation on, we won't need to
1072// update the referenced and changed bits manually after the copy. So, force the appropriate
1073// flag bit on for the virtual operand.
1074		crorc	pvNoModSnk,pvNoModSnk,pvPsnk	// for virtual dest, let hardware do ref/chg bits
1075		crorc	pvNoRefSrc,pvNoRefSrc,pvPsrc	// for virtual source, let hardware do ref bit
1076
// We'll be finding a mapping and looking at it, so we need to disable 'rupts.
1078		lis		r0,hi16(MASK(MSR_VEC))			// get vector mask
1079		ori		r0,r0,lo16(MASK(MSR_FP))		// insert fp mask
1080		mfmsr	r26								// save current msr
1081		andc	r26,r26,r0						// turn off VEC and FP in saved copy
1082		ori		r0,r0,lo16(MASK(MSR_EE))		// add EE to our mask
1083		andc	r0,r26,r0						// disable EE in our new msr image
1084		mtmsrd	r0								// introduce new msr image
1085
1086// We're now holding the virtual operand's pmap_t in r27 and its virtual address in r28. We now
1087// try to find a mapping corresponding to this address in order to determine whether the address
1088// is cacheable. If we don't find a mapping, we can safely assume that the operand is cacheable
1089// (a non-cacheable operand must be a block mapping, which will always exist); otherwise, we
1090// examine the mapping's caching-inhibited bit.
1091		mr		r3,r27							// r3 <- pmap_t pmap
1092		rldicl	r4,r28,32,32					// r4, r5 <- addr64_t va
1093		rldicl	r5,r28,0,32
1094		la		r6,FM_ARG0+0x18(r1)				// r6 <- addr64_t *nextva
1095		li		r7,1							// r7 <- int full, search nested mappings
1096		std		r26,FM_ARG0+0x28(r1)			// preserve 64-bit r26 across 32-bit calls
1097		std		r28,FM_ARG0+0x30(r1)			// preserve 64-bit r28 across 32-bit calls
1098		std		r29,FM_ARG0+0x38(r1)			// preserve 64-bit r29 across 32-bit calls
1099		std		r30,FM_ARG0+0x40(r1)			// preserve 64-bit r30 across 32-bit calls
1100		bl		EXT(mapping_find)				// find mapping for virtual operand
1101		mr.		r3,r3							// did we find it?
1102		beq		copypv_nomapping				// nope, so we'll assume it's cacheable
1103		lwz		r4,mpVAddr+4(r3)				// get low half of virtual addr for hw flags
1104		rlwinm.	r4,r4,0,mpIb-32,mpIb-32			// caching-inhibited bit set?
1105		crnot	pvNoCache,cr0_eq				// if it is, use bcopy_nc
1106		bl		EXT(mapping_drop_busy)			// drop busy on the mapping
1107copypv_nomapping:
1108		ld		r26,FM_ARG0+0x28(r1)			// restore 64-bit r26
1109		ld		r28,FM_ARG0+0x30(r1)			// restore 64-bit r28
1110		ld		r29,FM_ARG0+0x38(r1)			// restore 64-bit r29
1111		ld		r30,FM_ARG0+0x40(r1)			// restore 64-bit r30
		mtmsrd	r26								// restore msr to its previous state
1113
1114// Set both the source and destination virtual addresses to the virtual operand's address --
1115// we'll overlay one of them with the physical operand's address.
1116		mr		r27,r28							// make virtual operand BOTH source AND destination
1117
1118// Now we're ready to relocate the physical operand address(es) into the physical memory window.
1119// Recall that we've mapped physical memory (including the I/O hole) into the kernel's address
1120// space somewhere at or over the 2**32 line. If one or both of the operands are in the I/O hole,
1121// we'll set the pvNoCache flag, forcing use of non-caching bcopy_nc() to do the copy.
1122copypv_phys:
1123		ld		r6,lgPMWvaddr(0)				// get physical memory window virtual address
1124		bf		pvPsnk,copypv_dstvirt			// is destination address virtual?
1125		cntlzd	r4,r30							// count leading zeros in destination address
		cmplwi	r4,32							// if it's 32, the address is in the I/O hole (2**31 to 2**32-1)
1127		cror	pvNoCache,cr0_eq,pvNoCache		// use bcopy_nc for I/O hole locations
1128		add		r28,r30,r6						// relocate physical destination into physical window
1129copypv_dstvirt:
1130		bf		pvPsrc,copypv_srcvirt			// is source address virtual?
1131		cntlzd	r4,r29							// count leading zeros in source address
		cmplwi	r4,32							// if it's 32, the address is in the I/O hole (2**31 to 2**32-1)
1133		cror	pvNoCache,cr0_eq,pvNoCache		// use bcopy_nc for I/O hole locations
1134		add		r27,r29,r6						// relocate physical source into physical window
1135copypv_srcvirt:
1136
1137// Once the copy is under way (bcopy or bcopy_nc), we will want to get control if anything
1138// funny happens during the copy. So, we set a pointer to our error handler in the per-thread
1139// control block.
		mfsprg	r8,1							// get current thread's stuff
1141		lis		r3,hi16(copypv_error)			// get our error callback's address, high
1142		ori		r3,r3,lo16(copypv_error)		// get our error callback's address, low
1143		stw		r3,THREAD_RECOVER(r8)			// set our error callback
1144
1145// Since our physical operand(s) are relocated at or above the 2**32 line, we must enter
1146// 64-bit mode.
1147		li		r0,1							// get a handy one bit
1148		mfmsr	r3								// get current msr
1149		rldimi	r3,r0,63,MSR_SF_BIT				// set SF bit on in our msr copy
1150		mtmsrd	r3								// enter 64-bit mode
1151
// If requested, flush the data cache.
// Note that we don't actually flush; the code is retained "just in case".
1154#if 0
1155		bf		pvFsrc,copypv_nfs				// do we flush the source?
1156		rldicl	r3,r27,32,32					// r3, r4 <- addr64_t source virtual address
1157		rldicl	r4,r27,0,32
1158		mr		r5,r31							// r5 <- count (in bytes)
1159		li		r6,0							// r6 <- boolean phys (false, not physical)
1160		bl		EXT(flush_dcache)				// flush the source operand
1161copypv_nfs:
1162		bf		pvFsnk,copypv_nfdx				// do we flush the destination?
1163		rldicl	r3,r28,32,32					// r3, r4 <- addr64_t destination virtual address
1164		rldicl	r4,r28,0,32
1165		mr		r5,r31							// r5 <- count (in bytes)
1166		li		r6,0							// r6 <- boolean phys (false, not physical)
1167		bl		EXT(flush_dcache)				// flush the destination operand
1168copypv_nfdx:
1169#endif
1170
1171// Call bcopy or bcopy_nc to perform the copy.
1172		mr		r3,r27							// r3 <- source virtual address
1173		mr		r4,r28							// r4 <- destination virtual address
1174		mr		r5,r31							// r5 <- bytes to copy
1175		bt		pvNoCache,copypv_nc				// take non-caching route
1176		bl		EXT(bcopy)						// call bcopy to do the copying
1177		b		copypv_copydone
1178copypv_nc:
1179		bl		EXT(bcopy_nc)					// call bcopy_nc to do the copying
1180copypv_copydone:
1181
// If requested, flush the data cache.
// Note that we don't actually flush; the code is retained "just in case".
1184#if 0
1185		bf		pvFsrc,copypv_nfsx				// do we flush the source?
1186		rldicl	r3,r27,32,32					// r3, r4 <- addr64_t source virtual address
1187		rldicl	r4,r27,0,32
1188		mr		r5,r31							// r5 <- count (in bytes)
1189		li		r6,0							// r6 <- boolean phys (false, not physical)
1190		bl		EXT(flush_dcache)				// flush the source operand
1191copypv_nfsx:
1192		bf		pvFsnk,copypv_nfd				// do we flush the destination?
1193		rldicl	r3,r28,32,32					// r3, r4 <- addr64_t destination virtual address
1194		rldicl	r4,r28,0,32
1195		mr		r5,r31							// r5 <- count (in bytes)
1196		li		r6,0							// r6 <- boolean phys (false, not physical)
1197		bl		EXT(flush_dcache)				// flush the destination operand
1198copypv_nfd:
1199#endif
1200
1201// Leave 64-bit mode.
1202		mfmsr	r3								// get current msr
1203		rldicl	r3,r3,0,MSR_SF_BIT+1			// clear SF bit in our copy
1204		mtmsrd	r3								// leave 64-bit mode
1205
1206// If requested, set ref/chg on source/dest physical operand(s). It is possible that copy is
1207// from/to a RAM disk situated outside of mapped physical RAM, so we check each page by calling
1208// mapping_phys_lookup() before we try to set its ref/chg bits; otherwise, we might panic.
1209// Note that this code is page-size sensitive, so it should probably be a part of our low-level
1210// code in hw_vm.s.
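// Illustrative pseudo-C of the update loops below:
//
//      for (each 4K page p spanned by the destination) {
//          if (!mapping_phys_lookup(p, &pindex))   // page not in managed RAM?
//              break;                              //  then stop updating
//          mapping_set_mod(p);                     // mark the page modified
//      }
//      (similarly with mapping_set_ref() for the source pages)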
1211		bt		pvNoModSnk,copypv_nomod			// skip destination update if not requested
1212		std		r29,FM_ARG0+0x30(r1)			// preserve 64-bit r29 across 32-bit calls
1213		li		r26,1							// r26 <- 4K-page count
1214		mr		r27,r31							// r27 <- byte count
1215		rlwinm	r3,r30,0,20,31					// does destination cross a page boundary?
		subfic	r3,r3,4096						// r3 <- bytes from destination to end of its first page
		cmplw	r3,r27							// does the byte count extend past that first page?
1218		blt		copypv_modnox					// skip if not crossing case
1219		subf	r27,r3,r27						// r27 <- byte count less initial fragment
1220		addi	r26,r26,1						// increment page count
1221copypv_modnox:
1222		srdi	r3,r27,12						// pages to update (not including crosser)
1223		add		r26,r26,r3						// add in crosser
1224		srdi	r27,r30,12						// r27 <- destination page number
1225copypv_modloop:
1226		mr		r3,r27							// r3 <- destination page number
1227		la		r4,FM_ARG0+0x18(r1)				// r4 <- unsigned int *pindex
1228		bl		EXT(mapping_phys_lookup)		// see if page is really there
1229		mr.		r3,r3							// is it?
1230		beq--	copypv_modend					// nope, break out of modify loop
1231		mr		r3,r27							// r3 <- destination page number
1232		bl		EXT(mapping_set_mod)			// set page changed status
1233		subi	r26,r26,1						// decrement page count
1234		cmpwi	r26,0							// done yet?
1235		bgt		copypv_modloop					// nope, iterate
1236copypv_modend:
1237		ld		r29,FM_ARG0+0x30(r1)			// restore 64-bit r29
1238copypv_nomod:
1239		bt		pvNoRefSrc,copypv_done			// skip source update if not requested
1240copypv_debugref:
1241		li		r26,1							// r26 <- 4K-page count
1242		mr		r27,r31							// r27 <- byte count
1243		rlwinm	r3,r29,0,20,31					// does source cross a page boundary?
		subfic	r3,r3,4096						// r3 <- bytes from source to end of its first page
		cmplw	r3,r27							// does the byte count extend past that first page?
1246		blt		copypv_refnox					// skip if not crossing case
1247		subf	r27,r3,r27						// r27 <- byte count less initial fragment
1248		addi	r26,r26,1						// increment page count
1249copypv_refnox:
1250		srdi	r3,r27,12						// pages to update (not including crosser)
1251		add		r26,r26,r3						// add in crosser
1252		srdi	r27,r29,12						// r27 <- source page number
1253copypv_refloop:
1254		mr		r3,r27							// r3 <- source page number
1255		la		r4,FM_ARG0+0x18(r1)				// r4 <- unsigned int *pindex
1256		bl		EXT(mapping_phys_lookup)		// see if page is really there
1257		mr.		r3,r3							// is it?
		beq--	copypv_done						// nope, break out of reference loop
		mr		r3,r27							// r3 <- source page number
1260		bl		EXT(mapping_set_ref)			// set page referenced status
1261		subi	r26,r26,1						// decrement page count
1262		cmpwi	r26,0							// done yet?
1263		bgt		copypv_refloop					// nope, iterate
1264
1265// Return, indicating success.
1266copypv_done:
1267copypv_zero:
1268		li		r3,0							// our efforts were crowned with success
1269
1270// Pop frame, restore caller's non-volatiles, clear recovery routine pointer.
1271copypv_return:
		mfsprg	r9,1							// get current thread's stuff
1273		lwz		r0,(FM_ALIGN((31-26+11)*4)+FM_SIZE+FM_LR_SAVE)(r1)
1274												// get return address
1275		lwz		r4,(FM_ALIGN((31-26+11)*4)+FM_SIZE+FM_CR_SAVE)(r1)
1276												// get non-volatile cr2 and cr3
1277		lwz		r26,FM_ARG0+0x00(r1)			// restore non-volatile r26
1278		lwz		r27,FM_ARG0+0x04(r1)			// restore non-volatile r27
1279		mtlr	r0								// restore return address
1280		lwz		r28,FM_ARG0+0x08(r1)			// restore non-volatile r28
1281		mtcrf	0x20,r4							// restore non-volatile cr2
1282		mtcrf	0x10,r4							// restore non-volatile cr3
		lwz		r11,FM_ARG0+0x20(r1)			// get saved error callback
1284		lwz		r29,FM_ARG0+0x0C(r1)			// restore non-volatile r29
1285		lwz		r30,FM_ARG0+0x10(r1)			// restore non-volatile r30
1286		lwz		r31,FM_ARG0+0x14(r1)			// restore non-volatile r31
1287		stw		r11,THREAD_RECOVER(r9)			// restore our error callback
1288		lwz		r1,0(r1)						// release stack frame
1289
1290		blr										// y'all come back now
1291
1292// Invalid argument handler.
1293copypv_einval:
1294		li		r3,EINVAL						// invalid argument
1295		b		copypv_return					// return
1296
1297// Error encountered during bcopy or bcopy_nc.
1298copypv_error:
1299		mfmsr	r3								// get current msr
1300		rldicl	r3,r3,0,MSR_SF_BIT+1			// clear SF bit in our copy
1301		mtmsrd	r3								// leave 64-bit mode
1302		li		r3,EFAULT						// it was all his fault
1303		b		copypv_return					// return
1304