/*
 * Copyright (c) 2009 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <mach_assert.h>
#include <sys/errno.h>
#include <i386/param.h>
#include <i386/misc_protos.h>
#include <i386/cpu_data.h>
#include <i386/machine_routines.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_fault.h>

#include <sys/kdebug.h>

/*
 * the copy engine has the following characteristics
 *   - copyio handles copies to/from user or kernel space
 *   - copypv deals with physical or virtual addresses
 *
 * implementation details as follows
 *   - a cache of up to NCOPY_WINDOWS is maintained per thread for
 *     access of user virtual space
 *   - the window size is determined by the amount of virtual space
 *     that can be mapped by a single page table
 *   - the mapping is done by copying the page table pointer from
 *     the user's directory entry corresponding to the window's
 *     address in user space to the directory entry corresponding
 *     to the window slot in the kernel's address space
 *   - the set of mappings is preserved across context switches,
 *     so the copy can run with pre-emption enabled
 *   - there is a gdt entry set up to anchor the kernel window on
 *     each processor
 *   - the copies are done using the selector corresponding to the
 *     gdt entry
 *   - the addresses corresponding to the user virtual address are
 *     relative to the beginning of the window being used to map
 *     that region... thus the thread can be pre-empted and switched
 *     to a different processor while in the midst of a copy
 *   - the window caches must be invalidated if the pmap changes out
 *     from under the thread... this can happen during vfork/exec...
 *     inval_copy_windows is the invalidation routine to be used
 *   - the copyio engine has 4 different states associated with it
 *     that allow for lazy tlb flushes and the ability to avoid
 *     a flush altogether if we've just come from user space
 *     the 4 states are as follows...
 *
 *	WINDOWS_OPENED - set by copyio to indicate to the context
 *	  switch code that it is necessary to do a tlbflush after
 *	  switching the windows since we're in the middle of a copy
 *
 *	WINDOWS_CLOSED - set by copyio to indicate that it's done
 *	  using the windows, so that the context switch code need
 *	  not do the tlbflush... instead it will set the state to...
 *
 *	WINDOWS_DIRTY - set by the context switch code to indicate
 *	  to the copy engine that it is responsible for doing a
 *	  tlbflush before using the windows again... it's also
 *	  set by the inval_copy_windows routine to indicate the
 *	  same responsibility.
 *
 *	WINDOWS_CLEAN - set by the return to user path to indicate
 *	  that a tlbflush has happened and that there is no need
 *	  for copyio to do another when it is entered next...
 *
 *   - a window for mapping single physical pages is provided for copypv
 *   - this window is maintained across context switches and has the
 *     same characteristics as the user space windows w/r to pre-emption
 */
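
/*
 * Illustrative sketch (not part of this file): one plausible shape for
 * the context switch side of the lazy-flush protocol described above.
 * The state names, copyio_state field, and flush_tlb() are real; the
 * function itself and its placement are hypothetical, shown only to
 * make the state transitions concrete.
 */
#if 0	/* illustrative only */
static void
example_switch_copy_windows(thread_t new_thread)
{
	/* ...at this point the incoming thread's cached window pdes
	 * have been copied into this processor's kernel window slots... */

	if (new_thread->machine.copyio_state == WINDOWS_OPENED) {
		/* a copy is in flight on this thread: the stale
		 * translations must be flushed before it resumes */
		flush_tlb();
	} else {
		/* no copy in flight: defer the flush... copyio will
		 * do it lazily if it finds the windows DIRTY */
		new_thread->machine.copyio_state = WINDOWS_DIRTY;
	}
}
#endif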

extern int copyout_user(const char *, vm_offset_t, vm_size_t);
extern int copyout_kern(const char *, vm_offset_t, vm_size_t);
extern int copyin_user(const vm_offset_t, char *, vm_size_t);
extern int copyin_kern(const vm_offset_t, char *, vm_size_t);
extern int copyoutphys_user(const char *, vm_offset_t, vm_size_t);
extern int copyoutphys_kern(const char *, vm_offset_t, vm_size_t);
extern int copyinphys_user(const vm_offset_t, char *, vm_size_t);
extern int copyinphys_kern(const vm_offset_t, char *, vm_size_t);
extern int copyinstr_user(const vm_offset_t, char *, vm_size_t, vm_size_t *);
extern int copyinstr_kern(const vm_offset_t, char *, vm_size_t, vm_size_t *);

static int copyio(int, user_addr_t, char *, vm_size_t, vm_size_t *, int);
static int copyio_phys(addr64_t, addr64_t, vm_size_t, int);


#define COPYIN		0
#define COPYOUT		1
#define COPYINSTR	2
#define COPYINPHYS	3
#define COPYOUTPHYS	4

void inval_copy_windows(thread_t thread)
{
        int	i;

	for (i = 0; i < NCOPY_WINDOWS; i++) {
                thread->machine.copy_window[i].user_base = -1;
	}
	thread->machine.nxt_window = 0;
	thread->machine.copyio_state = WINDOWS_DIRTY;

	KERNEL_DEBUG(0xeff70058 | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), (int)thread->map, 0, 0, 0);
}
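
/*
 * Illustrative sketch (not part of this file): the vfork/exec style
 * caller described in the block comment above, shown only to make the
 * invalidation contract concrete.  The function and the way the map is
 * swapped here are hypothetical; only inval_copy_windows is real.
 */
#if 0	/* illustrative only */
static void
example_swap_task_map(thread_t thread, vm_map_t new_map)
{
	thread->map = new_map;
	/* the cached windows still refer to the old pmap's page tables,
	 * so force copyio to rebuild and flush before its next use */
	inval_copy_windows(thread);
}
#endif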


static int
copyio(int copy_type, user_addr_t user_addr, char *kernel_addr,
       vm_size_t nbytes, vm_size_t *lencopied, int use_kernel_map)
{
        thread_t	thread;
	pmap_t		pmap;
	pt_entry_t	*updp;
	pt_entry_t	*kpdp;
	user_addr_t	user_base;
	vm_offset_t	user_offset;
	vm_offset_t	kern_vaddr;
	vm_size_t	cnt;
	vm_size_t	bytes_copied;
	int		error = 0;
	int		window_index;
	int		copyio_state;
        boolean_t	istate;
#if KDEBUG
	int		debug_type = 0xeff70010;
	debug_type += (copy_type << 2);
#endif

	thread = current_thread();

	KERNEL_DEBUG(debug_type | DBG_FUNC_START, (int)(user_addr >> 32), (int)user_addr,
		     (int)nbytes, thread->machine.copyio_state, 0);

	if (nbytes == 0) {
	        KERNEL_DEBUG(debug_type | DBG_FUNC_END, (unsigned)user_addr,
			     (unsigned)kernel_addr, (unsigned)nbytes, 0, 0);
	        return (0);
	}
        pmap = thread->map->pmap;

        if (pmap == kernel_pmap || use_kernel_map) {

	        kern_vaddr = (vm_offset_t)user_addr;

	        switch (copy_type) {

		case COPYIN:
		        error = copyin_kern(kern_vaddr, kernel_addr, nbytes);
			break;

		case COPYOUT:
		        error = copyout_kern(kernel_addr, kern_vaddr, nbytes);
			break;

		case COPYINSTR:
		        error = copyinstr_kern(kern_vaddr, kernel_addr, nbytes, lencopied);
			break;

		case COPYINPHYS:
		        error = copyinphys_kern(kern_vaddr, kernel_addr, nbytes);
			break;

		case COPYOUTPHYS:
		        error = copyoutphys_kern(kernel_addr, kern_vaddr, nbytes);
			break;
		}
		KERNEL_DEBUG(debug_type | DBG_FUNC_END, (unsigned)kern_vaddr,
			     (unsigned)kernel_addr, (unsigned)nbytes,
			     error | 0x80000000, 0);
		return (error);
	}

#if CONFIG_DTRACE
	thread->machine.specFlags |= CopyIOActive;
#endif /* CONFIG_DTRACE */

	if ((nbytes && (user_addr + nbytes <= user_addr)) ||
	    (user_addr          < vm_map_min(thread->map)) ||
	    (user_addr + nbytes > vm_map_max(thread->map))) {
		error = EFAULT;
		goto done;
	}

	user_base = user_addr & ~((user_addr_t)(NBPDE - 1));
	user_offset = (vm_offset_t)(user_addr & (NBPDE - 1));

	KERNEL_DEBUG(debug_type | DBG_FUNC_NONE, (int)(user_base >> 32), (int)user_base,
		     (int)user_offset, 0, 0);

	cnt = NBPDE - user_offset;

	if (cnt > nbytes)
	        cnt = nbytes;

	istate = ml_set_interrupts_enabled(FALSE);

	copyio_state = thread->machine.copyio_state;
	thread->machine.copyio_state = WINDOWS_OPENED;

	(void) ml_set_interrupts_enabled(istate);


	for (;;) {

	        for (window_index = 0; window_index < NCOPY_WINDOWS; window_index++) {
		        if (thread->machine.copy_window[window_index].user_base == user_base)
					break;
		}
	        if (window_index >= NCOPY_WINDOWS) {

		        window_index = thread->machine.nxt_window;
			thread->machine.nxt_window++;

			if (thread->machine.nxt_window >= NCOPY_WINDOWS)
			        thread->machine.nxt_window = 0;

			/*
			 * it's necessary to disable interrupts (and thus
			 * pre-emption) since I have to compute the kernel
			 * descriptor pointer for the new window
			 */
			istate = ml_set_interrupts_enabled(FALSE);

			thread->machine.copy_window[window_index].user_base = user_base;

		        updp = pmap_pde(pmap, user_base);

			kpdp = current_cpu_datap()->cpu_copywindow_pdp;
			kpdp += window_index;

			pmap_store_pte(kpdp, updp ? *updp : 0);

			(void) ml_set_interrupts_enabled(istate);

		        copyio_state = WINDOWS_DIRTY;

			KERNEL_DEBUG(0xeff70040 | DBG_FUNC_NONE, window_index,
				     (unsigned)user_base, (unsigned)updp,
				     (unsigned)kpdp, 0);

		}
#if JOE_DEBUG
		else {
			istate = ml_set_interrupts_enabled(FALSE);

		        updp = pmap_pde(pmap, user_base);

			kpdp = current_cpu_datap()->cpu_copywindow_pdp;

			kpdp += window_index;

			if ((*kpdp & PG_FRAME) != (*updp & PG_FRAME)) {
				panic("copyio: user pdp mismatch - kpdp = 0x%qx,  updp = 0x%qx\n", *kpdp, *updp);
			}
			(void) ml_set_interrupts_enabled(istate);
		}
#endif
		if (copyio_state == WINDOWS_DIRTY) {
		        flush_tlb();

		        copyio_state = WINDOWS_CLEAN;

			KERNEL_DEBUG(0xeff70054 | DBG_FUNC_NONE, window_index, 0, 0, 0, 0);
		}
		user_offset += (window_index * NBPDE);

		KERNEL_DEBUG(0xeff70044 | DBG_FUNC_NONE, (unsigned)user_offset,
			     (unsigned)kernel_addr, cnt, 0, 0);

	        switch (copy_type) {

		case COPYIN:
		        error = copyin_user(user_offset, kernel_addr, cnt);
			break;

		case COPYOUT:
		        error = copyout_user(kernel_addr, user_offset, cnt);
			break;

		case COPYINPHYS:
		        error = copyinphys_user(user_offset, kernel_addr, cnt);
			break;

		case COPYOUTPHYS:
		        error = copyoutphys_user(kernel_addr, user_offset, cnt);
			break;

		case COPYINSTR:
		        error = copyinstr_user(user_offset, kernel_addr, cnt, &bytes_copied);

			/*
			 * lencopied should be updated on success
			 * or ENAMETOOLONG...  but not EFAULT
			 */
			if (error != EFAULT)
			        *lencopied += bytes_copied;

			/*
			 * if we still have room, then the ENAMETOOLONG
			 * is just an artifact of the buffer straddling
			 * a window boundary and we should continue
			 */
			if (error == ENAMETOOLONG && nbytes > cnt)
			        error = 0;

			if (error) {
#if KDEBUG
			        nbytes = *lencopied;
#endif
			        break;
			}
			if (*(kernel_addr + bytes_copied - 1) == 0) {
			        /*
				 * we found a NULL terminator... we're done
				 */
#if KDEBUG
			        nbytes = *lencopied;
#endif
				goto done;
			}
			if (cnt == nbytes) {
			        /*
				 * no more room in the buffer and we haven't
				 * yet come across a NULL terminator
				 */
#if KDEBUG
			        nbytes = *lencopied;
#endif
			        error = ENAMETOOLONG;
				break;
			}
			assert(cnt == bytes_copied);

			break;
		}
		if (error)
		        break;
		if ((nbytes -= cnt) == 0)
		        break;

		kernel_addr += cnt;
		user_base += NBPDE;
		user_offset = 0;

		if (nbytes > NBPDE)
		        cnt = NBPDE;
		else
		        cnt = nbytes;
	}
done:
	thread->machine.copyio_state = WINDOWS_CLOSED;

	KERNEL_DEBUG(debug_type | DBG_FUNC_END, (unsigned)user_addr,
		     (unsigned)kernel_addr, (unsigned)nbytes, error, 0);

#if CONFIG_DTRACE
	thread->machine.specFlags &= ~CopyIOActive;
#endif /* CONFIG_DTRACE */

	return (error);
}

static int
copyio_phys(addr64_t source, addr64_t sink, vm_size_t csize, int which)
{
        pmap_paddr_t paddr;
	user_addr_t vaddr;
	char        *window_offset;
	pt_entry_t  pentry;
	int         ctype;
	int	    retval;
	boolean_t   istate;


	if (which & cppvPsnk) {
		paddr  = (pmap_paddr_t)sink;
	        vaddr  = (user_addr_t)source;
		ctype  = COPYINPHYS;
		pentry = (pt_entry_t)(INTEL_PTE_VALID | (paddr & PG_FRAME) | INTEL_PTE_RW);
	} else {
	        paddr  = (pmap_paddr_t)source;
		vaddr  = (user_addr_t)sink;
		ctype  = COPYOUTPHYS;
		pentry = (pt_entry_t)(INTEL_PTE_VALID | (paddr & PG_FRAME));
	}
	/* Fold in cache attributes for this physical page */
	pentry |= pmap_get_cache_attributes(i386_btop(paddr));
	window_offset = (char *)(uintptr_t)((uint32_t)paddr & (PAGE_SIZE - 1));

	assert(!((current_thread()->machine.specFlags & CopyIOActive) && ((which & cppvKmap) == 0)));

	if (current_thread()->machine.physwindow_busy) {
	        pt_entry_t	old_pentry;

	        KERNEL_DEBUG(0xeff70048 | DBG_FUNC_NONE, paddr, csize, 0, -1, 0);
		/*
		 * we had better be targeting wired memory at this point...
		 * we will not be able to handle a fault with interrupts
		 * disabled, and we disable them because we can't tolerate
		 * being preempted during this nested use of the window
		 */
		istate = ml_set_interrupts_enabled(FALSE);

		old_pentry = *(current_cpu_datap()->cpu_physwindow_ptep);
		pmap_store_pte((current_cpu_datap()->cpu_physwindow_ptep), pentry);

		invlpg((uintptr_t)current_cpu_datap()->cpu_physwindow_base);

		retval = copyio(ctype, vaddr, window_offset, csize, NULL, which & cppvKmap);

		pmap_store_pte((current_cpu_datap()->cpu_physwindow_ptep), old_pentry);

		invlpg((uintptr_t)current_cpu_datap()->cpu_physwindow_base);

		(void) ml_set_interrupts_enabled(istate);
	} else {
	        /*
		 * mark the window as in use... if an interrupt hits while we're
		 * busy, or we trigger another copypv from the fault path into
		 * the driver on a user address space page fault due to a copyin/out,
		 * then we need to save and restore the current window state instead
		 * of caching the window and preserving it across context switches
		 */
	        current_thread()->machine.physwindow_busy = 1;

	        if (current_thread()->machine.physwindow_pte != pentry) {
		        KERNEL_DEBUG(0xeff70048 | DBG_FUNC_NONE, paddr, csize, 0, 0, 0);

			current_thread()->machine.physwindow_pte = pentry;

			/*
			 * preemption at this point would be bad since we
			 * could end up on the other processor after we grabbed
			 * the pointer to the current cpu data area, but before
			 * we finished using it to stuff the page table entry,
			 * and we would then be modifying a window that no longer
			 * belonged to us... the invlpg can be done unprotected
			 * since it only flushes this page address from the tlb...
			 * if it flushes the wrong one, no harm is done, and the
			 * context switch that moved us to the other processor
			 * will have already taken care of flushing the tlb after
			 * it reloaded the page table from machine.physwindow_pte
			 */
			istate = ml_set_interrupts_enabled(FALSE);

			pmap_store_pte((current_cpu_datap()->cpu_physwindow_ptep), pentry);
			(void) ml_set_interrupts_enabled(istate);

			invlpg((uintptr_t)current_cpu_datap()->cpu_physwindow_base);
		}
#if JOE_DEBUG
		else {
		        if (pentry !=
			    (*(current_cpu_datap()->cpu_physwindow_ptep) & (INTEL_PTE_VALID | PG_FRAME | INTEL_PTE_RW)))
			        panic("copyio_phys: pentry != *physwindow_ptep");
		}
#endif
		retval = copyio(ctype, vaddr, window_offset, csize, NULL, which & cppvKmap);

	        current_thread()->machine.physwindow_busy = 0;
	}
	return (retval);
}

int
copyinmsg(const user_addr_t user_addr, char *kernel_addr, mach_msg_size_t nbytes)
{
        return (copyio(COPYIN, user_addr, kernel_addr, nbytes, NULL, 0));
}

int
copyin(const user_addr_t user_addr, char *kernel_addr, vm_size_t nbytes)
{
        return (copyio(COPYIN, user_addr, kernel_addr, nbytes, NULL, 0));
}

int
copyinstr(const user_addr_t user_addr, char *kernel_addr, vm_size_t nbytes, vm_size_t *lencopied)
{
	*lencopied = 0;

        return (copyio(COPYINSTR, user_addr, kernel_addr, nbytes, lencopied, 0));
}

int
copyoutmsg(const char *kernel_addr, user_addr_t user_addr, mach_msg_size_t nbytes)
{
	return (copyio(COPYOUT, user_addr, (char *)(uintptr_t)kernel_addr, nbytes, NULL, 0));
}

int
copyout(const void *kernel_addr, user_addr_t user_addr, vm_size_t nbytes)
{
	return (copyio(COPYOUT, user_addr, (char *)(uintptr_t)kernel_addr, nbytes, NULL, 0));
}
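
/*
 * Usage sketch (hypothetical caller, for exposition only): moving a
 * fixed-size record and a NUL-terminated string in from user space
 * with the wrappers above.  'struct example_rec', 'u_rec', 'u_path',
 * and the locals are invented names for the example.
 */
#if 0	/* illustrative only */
{
	struct example_rec	krec;
	char			kpath[MAXPATHLEN];
	vm_size_t		pathlen;
	int			error;

	/* EFAULT if the user range is bad or wraps */
	error = copyin(u_rec, (char *)&krec, sizeof(krec));

	if (error == 0)
		/* ENAMETOOLONG if no NUL fits in kpath... on success
		 * pathlen counts the bytes copied including the NUL */
		error = copyinstr(u_path, kpath, sizeof(kpath), &pathlen);
}
#endif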


kern_return_t
copypv(addr64_t src64, addr64_t snk64, unsigned int size, int which)
{
	unsigned int lop, csize;
	int bothphys = 0;

	KERNEL_DEBUG(0xeff7004c | DBG_FUNC_START, (unsigned)src64,
		     (unsigned)snk64, size, which, 0);

	if ((which & (cppvPsrc | cppvPsnk)) == 0 )				/* Make sure that at most one is virtual */
		panic("copypv: no more than 1 parameter may be virtual\n");	/* Not allowed */

	if ((which & (cppvPsrc | cppvPsnk)) == (cppvPsrc | cppvPsnk))
	        bothphys = 1;							/* both are physical */

	while (size) {

	        if (bothphys) {
		        lop = (unsigned int)(PAGE_SIZE - (snk64 & (PAGE_SIZE - 1)));		/* Assume sink smallest */

			if (lop > (unsigned int)(PAGE_SIZE - (src64 & (PAGE_SIZE - 1))))
			        lop = (unsigned int)(PAGE_SIZE - (src64 & (PAGE_SIZE - 1)));	/* No, source is smaller */
		} else {
		        /*
			 * only need to compute the resid for the physical page
			 * address... we don't care about where we start/finish in
			 * the virtual since we just call the normal copyin/copyout
			 */
		        if (which & cppvPsrc)
			        lop = (unsigned int)(PAGE_SIZE - (src64 & (PAGE_SIZE - 1)));
			else
			        lop = (unsigned int)(PAGE_SIZE - (snk64 & (PAGE_SIZE - 1)));
		}
		csize = size;						/* Assume we can copy it all */
		if (lop < size)
		        csize = lop;					/* Nope, we can't do it all */
#if 0
		/*
		 * flush_dcache64 is currently a nop on the i386...
		 * it's used when copying to non-system memory such
		 * as video capture cards... on PPC there was a need
		 * to flush due to how we mapped this memory... not
		 * sure if it's needed on i386.
		 */
		if (which & cppvFsrc)
		        flush_dcache64(src64, csize, 1);		/* If requested, flush source before move */
		if (which & cppvFsnk)
		        flush_dcache64(snk64, csize, 1);		/* If requested, flush sink before move */
#endif
		if (bothphys) {
			bcopy_phys(src64, snk64, csize);		/* Do a physical copy, virtually */
		}
		else {
			if (copyio_phys(src64, snk64, csize, which)) {
				return (KERN_FAILURE);
			}
		}
#if 0
		if (which & cppvFsrc)
		        flush_dcache64(src64, csize, 1);	/* If requested, flush source after move */
		if (which & cppvFsnk)
		        flush_dcache64(snk64, csize, 1);	/* If requested, flush sink after move */
#endif
		size  -= csize;						/* Calculate what is left */
		snk64 += csize;						/* Bump sink to next address */
		src64 += csize;						/* Bump source to next address */
	}
	KERNEL_DEBUG(0xeff7004c | DBG_FUNC_END, (unsigned)src64,
		     (unsigned)snk64, size, which, 0);

	return KERN_SUCCESS;
}
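
/*
 * Usage sketch (hypothetical, for exposition only): copying one page
 * from a physical source to a user virtual sink.  'phys_src' and
 * 'user_dst' are invented names; the flag usage follows the checks
 * at the top of copypv above.
 */
#if 0	/* illustrative only */
{
	kern_return_t	kr;

	/* source is physical, sink is virtual... copyio_phys maps the
	 * physical page through the per-thread phys window and then
	 * runs the normal copyout path against the user address */
	kr = copypv((addr64_t)phys_src, (addr64_t)user_dst, PAGE_SIZE, cppvPsrc);
}
#endif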

void
copy_window_fault(thread_t thread, vm_map_t map, int window)
{
	pt_entry_t	*updp;
	pt_entry_t	*kpdp;

	/*
	 * in case there was no page table assigned
	 * for the user base address and the pmap
	 * got 'expanded' due to this fault, we'll
	 * copy in the descriptor
	 *
	 * we're either setting the page table descriptor
	 * to the same value or it was 0... no need
	 * for a TLB flush in either case
	 */

        updp = pmap_pde(map->pmap, thread->machine.copy_window[window].user_base);
	assert(updp);
	if (0 == updp) panic("trap: updp 0"); /* XXX DEBUG */
	kpdp = current_cpu_datap()->cpu_copywindow_pdp;
	kpdp += window;

#if JOE_DEBUG
	if (*kpdp && (*kpdp & PG_FRAME) != (*updp & PG_FRAME))
	        panic("kernel_fault: user pdp doesn't match - updp = 0x%qx, kpdp = 0x%qx\n", *updp, *kpdp);
#endif
	pmap_store_pte(kpdp, *updp);
}