physmem.c revision 7656:2621e50fdf4a
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26
27#include <sys/types.h>
28#include <sys/modctl.h>
29#include <sys/conf.h>
30#include <sys/ddi.h>
31#include <sys/sunddi.h>
32#include <sys/devops.h>
33#include <sys/stat.h>
34#include <sys/file.h>
35#include <sys/cred.h>
36#include <sys/policy.h>
37#include <sys/errno.h>
38#include <vm/seg_dev.h>
39#include <vm/seg_vn.h>
40#include <vm/page.h>
41#include <sys/fs/swapnode.h>
42#include <sys/sysmacros.h>
43#include <sys/fcntl.h>
44#include <sys/vmsystm.h>
45#include <sys/physmem.h>
46#include <sys/vfs_opreg.h>
47
48static dev_info_t		*physmem_dip = NULL;
49
/*
 * Linked list element hanging off physmem_proc_hash below, which holds all
 * the information for a given segment which has been setup for this process.
 * This is a simple linked list as we are assuming that for a given process
 * the setup ioctl will only be called a handful of times.  If this assumption
 * changes in the future, a quicker to traverse data structure should be used.
 */
struct physmem_hash {
	struct physmem_hash *ph_next;	/* next segment for this process */
	uint64_t ph_base_pa;		/* starting physical address */
	caddr_t ph_base_va;		/* user VA the segment is mapped at */
	size_t ph_seg_len;		/* length of the segment in bytes */
	struct vnode *ph_vnode;		/* vnode backing this segment */
};
64
/*
 * Hash of all of the processes which have setup mappings with the driver with
 * pointers to per process data.
 */
struct physmem_proc_hash {
	struct proc *pph_proc;			/* process owning pph_hash */
	struct physmem_hash *pph_hash;		/* this process's segments */
	struct physmem_proc_hash *pph_next;	/* hash-bucket chain link */
};
74
75
76/* Needs to be a power of two for simple hash algorithm */
77#define	PPH_SIZE	8
78struct physmem_proc_hash *pph[PPH_SIZE];
79
80/*
81 * Lock which protects the pph hash above.  To add an element (either a new
82 * process or a new segment) the WRITE lock must be held.  To traverse the
83 * list, only a READ lock is needed.
84 */
85krwlock_t pph_rwlock;
86
87#define	PHYSMEM_HASH(procp) ((int)((((uintptr_t)procp) >> 8) & (PPH_SIZE - 1)))
88
89/*
90 * Need to keep a reference count of how many processes have the driver
91 * open to prevent it from disappearing.
92 */
93uint64_t physmem_vnodecnt;
kmutex_t physmem_mutex;		/* protects physmem_vnodecnt */
95
96static int physmem_getpage(struct vnode *vp, offset_t off, size_t len,
97    uint_t *protp, page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
98    enum seg_rw rw, struct cred *cr, caller_context_t *ct);
99
100static int physmem_addmap(struct vnode *vp, offset_t off, struct as *as,
101    caddr_t addr, size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
102    struct cred *cred, caller_context_t *ct);
103
104static int physmem_delmap(struct vnode *vp, offset_t off, struct as *as,
105    caddr_t addr, size_t len, uint_t prot, uint_t maxprot, uint_t flags,
106    struct cred *cred, caller_context_t *ct);
107
108static void physmem_inactive(vnode_t *vp, cred_t *crp, caller_context_t *ct);
109
/* Vnode operations for the physmem pseudo-vnodes created at setup time. */
const fs_operation_def_t physmem_vnodeops_template[] = {
	VOPNAME_GETPAGE,	{ .vop_getpage = physmem_getpage },
	VOPNAME_ADDMAP,		{ .vop_addmap = physmem_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = physmem_delmap },
	VOPNAME_INACTIVE,	{ .vop_inactive = physmem_inactive },
	NULL,			NULL
};
117
118vnodeops_t *physmem_vnodeops = NULL;
119
120/*
121 * Removes the current process from the hash if the process has no more
122 * physmem segments active.
123 */
124void
125physmem_remove_hash_proc()
126{
127	int index;
128	struct physmem_proc_hash **walker;
129	struct physmem_proc_hash *victim = NULL;
130
131	index = PHYSMEM_HASH(curproc);
132	rw_enter(&pph_rwlock, RW_WRITER);
133	walker = &pph[index];
134	while (*walker != NULL) {
135		if ((*walker)->pph_proc == curproc &&
136		    (*walker)->pph_hash == NULL) {
137			victim = *walker;
138			*walker = victim->pph_next;
139			break;
140		}
141		walker = &((*walker)->pph_next);
142	}
143	rw_exit(&pph_rwlock);
144	if (victim != NULL)
145		kmem_free(victim, sizeof (struct physmem_proc_hash));
146}
147
/*
 * Add a new entry to the hash for the given process to cache the
 * address ranges that it is working on.  If this is the first hash
 * item to be added for this process, we will create the head pointer
 * for this process.
 * Returns 0 on success, ERANGE when the physical address is already in the
 * hash.
 */
int
physmem_add_hash(struct physmem_hash *php)
{
	int index;
	struct physmem_proc_hash *iterator;
	struct physmem_proc_hash *newp = NULL;
	struct physmem_hash *temp;
	int ret = 0;

	index = PHYSMEM_HASH(curproc);

insert:
	rw_enter(&pph_rwlock, RW_WRITER);
	iterator = pph[index];
	while (iterator != NULL) {
		if (iterator->pph_proc == curproc) {
			/*
			 * check to make sure a single process does not try to
			 * map the same region twice.
			 */
			for (temp = iterator->pph_hash; temp != NULL;
			    temp = temp->ph_next) {
				/* Reject any PA overlap in either direction */
				if ((php->ph_base_pa >= temp->ph_base_pa &&
				    php->ph_base_pa < temp->ph_base_pa +
				    temp->ph_seg_len) ||
				    (temp->ph_base_pa >= php->ph_base_pa &&
				    temp->ph_base_pa < php->ph_base_pa +
				    php->ph_seg_len)) {
					ret = ERANGE;
					break;
				}
			}
			if (ret == 0) {
				/* No overlap; link php at the list head. */
				php->ph_next = iterator->pph_hash;
				iterator->pph_hash = php;
			}
			rw_exit(&pph_rwlock);
			/*
			 * Need to check for two threads in sync: another
			 * thread may have created this process's entry while
			 * we had dropped the lock to allocate newp, in which
			 * case our allocation is unused and must be freed.
			 */
			if (newp != NULL)
				kmem_free(newp, sizeof (*newp));
			return (ret);
		}
		iterator = iterator->pph_next;
	}

	/* No entry for curproc yet; use the one allocated on a prior pass. */
	if (newp != NULL) {
		newp->pph_proc = curproc;
		newp->pph_next = pph[index];
		newp->pph_hash = php;
		php->ph_next = NULL;
		pph[index] = newp;
		rw_exit(&pph_rwlock);
		return (0);
	}

	rw_exit(&pph_rwlock);
	/* Dropped the lock so we could use KM_SLEEP */
	newp = kmem_zalloc(sizeof (struct physmem_proc_hash), KM_SLEEP);
	goto insert;
}
216
217/*
218 * Will return the pointer to the physmem_hash struct if the setup routine
219 * has previously been called for this memory.
220 * Returns NULL on failure.
221 */
222struct physmem_hash *
223physmem_get_hash(uint64_t req_paddr, size_t len, proc_t *procp)
224{
225	int index;
226	struct physmem_proc_hash *proc_hp;
227	struct physmem_hash *php;
228
229	ASSERT(rw_lock_held(&pph_rwlock));
230
231	index = PHYSMEM_HASH(procp);
232	proc_hp = pph[index];
233	while (proc_hp != NULL) {
234		if (proc_hp->pph_proc == procp) {
235			php = proc_hp->pph_hash;
236			while (php != NULL) {
237				if ((req_paddr >= php->ph_base_pa) &&
238				    (req_paddr + len <=
239				    php->ph_base_pa + php->ph_seg_len)) {
240					return (php);
241				}
242				php = php->ph_next;
243			}
244		}
245		proc_hp = proc_hp->pph_next;
246	}
247	return (NULL);
248}
249
250int
251physmem_validate_cookie(uint64_t p_cookie)
252{
253	int index;
254	struct physmem_proc_hash *proc_hp;
255	struct physmem_hash *php;
256
257	ASSERT(rw_lock_held(&pph_rwlock));
258
259	index = PHYSMEM_HASH(curproc);
260	proc_hp = pph[index];
261	while (proc_hp != NULL) {
262		if (proc_hp->pph_proc == curproc) {
263			php = proc_hp->pph_hash;
264			while (php != NULL) {
265				if ((uint64_t)(uintptr_t)php == p_cookie) {
266					return (1);
267				}
268				php = php->ph_next;
269			}
270		}
271		proc_hp = proc_hp->pph_next;
272	}
273	return (0);
274}
275
/*
 * Remove the given vnode from the pph hash.  If it exists in the hash the
 * process still has to be around as the vnode is obviously still around and
 * since it's a physmem vnode, it must be in the hash.
 * If it is not in the hash that must mean that the setup ioctl failed.
 * Return 0 in this instance, 1 if it is in the hash.
 */
int
physmem_remove_vnode_hash(vnode_t *vp)
{
	int index;
	struct physmem_proc_hash *proc_hp;
	struct physmem_hash **phpp;
	struct physmem_hash *victim;

	index = PHYSMEM_HASH(curproc);
	/* synchronize with the map routine */
	rw_enter(&pph_rwlock, RW_WRITER);
	proc_hp = pph[index];
	while (proc_hp != NULL) {
		if (proc_hp->pph_proc == curproc) {
			/* Walk this process's segment list by reference. */
			phpp = &proc_hp->pph_hash;
			while (*phpp != NULL) {
				if ((*phpp)->ph_vnode == vp) {
					/* Unlink, then free off-lock. */
					victim = *phpp;
					*phpp = victim->ph_next;

					rw_exit(&pph_rwlock);
					kmem_free(victim, sizeof (*victim));
					return (1);
				}
				phpp = &(*phpp)->ph_next;
			}
		}
		proc_hp = proc_hp->pph_next;
	}
	rw_exit(&pph_rwlock);

	/* not found */
	return (0);
}
317
318int
319physmem_setup_vnops()
320{
321	int error;
322	char *name = "physmem";
323	if (physmem_vnodeops != NULL)
324		cmn_err(CE_PANIC, "physmem vnodeops already set\n");
325	error = vn_make_ops(name, physmem_vnodeops_template, &physmem_vnodeops);
326	if (error != 0) {
327		cmn_err(CE_WARN, "physmem_setup_vnops: bad vnode ops template");
328	}
329	return (error);
330}
331
332/*
333 * The guts of the PHYSMEM_SETUP ioctl.
334 * Create a segment in the address space with the specified parameters.
335 * If pspp->user_va is NULL, as_gap will be used to find an appropriate VA.
336 * We do not do bounds checking on the requested physical addresses, if they
337 * do not exist in the system, they will not be mappable.
338 * Returns 0 on success with the following error codes on failure:
339 *	ENOMEM - The VA range requested was already mapped if pspp->user_va is
340 *		non-NULL or the system was unable to find enough VA space for
 *		the desired length if user_va was NULL.
342 *	EINVAL - The requested PA, VA, or length was not PAGESIZE aligned.
343 */
344int
345physmem_setup_addrs(struct physmem_setup_param *pspp)
346{
347	struct as *as = curproc->p_as;
348	struct segvn_crargs vn_a;
349	int ret = 0;
350	uint64_t base_pa;
351	size_t len;
352	caddr_t uvaddr;
353	struct vnode *vp;
354	struct physmem_hash *php;
355
356	ASSERT(pspp != NULL);
357	base_pa = pspp->req_paddr;
358	len = pspp->len;
359	uvaddr = (caddr_t)(uintptr_t)pspp->user_va;
360
361	/* Sanity checking */
362	if (!IS_P2ALIGNED(base_pa, PAGESIZE))
363		return (EINVAL);
364	if (!IS_P2ALIGNED(len, PAGESIZE))
365		return (EINVAL);
366	if (uvaddr != NULL && !IS_P2ALIGNED(uvaddr, PAGESIZE))
367		return (EINVAL);
368
369	php = kmem_zalloc(sizeof (struct physmem_hash), KM_SLEEP);
370
371	/* Need to bump vnode count so that the driver can not be unloaded */
372	mutex_enter(&physmem_mutex);
373	physmem_vnodecnt++;
374	mutex_exit(&physmem_mutex);
375
376	vp = vn_alloc(KM_SLEEP);
377	ASSERT(vp != NULL);	/* SLEEP can't return NULL */
378	vn_setops(vp, physmem_vnodeops);
379
380	php->ph_vnode = vp;
381
382	vn_a.vp = vp;
383	vn_a.offset = (u_offset_t)base_pa;
384	vn_a.type = MAP_SHARED;
385	vn_a.prot = PROT_ALL;
386	vn_a.maxprot = PROT_ALL;
387	vn_a.flags = 0;
388	vn_a.cred = NULL;
389	vn_a.amp = NULL;
390	vn_a.szc = 0;
391	vn_a.lgrp_mem_policy_flags = 0;
392
393	as_rangelock(as);
394	if (uvaddr != NULL) {
395		if (as_gap(as, len, &uvaddr, &len, AH_LO, NULL) == -1) {
396			ret = ENOMEM;
397fail:
398			as_rangeunlock(as);
399			vn_free(vp);
400			kmem_free(php, sizeof (*php));
401			mutex_enter(&physmem_mutex);
402			physmem_vnodecnt--;
403			mutex_exit(&physmem_mutex);
404			return (ret);
405		}
406	} else {
407		/* We pick the address for the user */
408		map_addr(&uvaddr, len, 0, 1, 0);
409		if (uvaddr == NULL) {
410			ret = ENOMEM;
411			goto fail;
412		}
413	}
414	ret = as_map(as, uvaddr, len, segvn_create, &vn_a);
415
416	if (ret == 0) {
417		as_rangeunlock(as);
418		php->ph_base_pa = base_pa;
419		php->ph_base_va = uvaddr;
420		php->ph_seg_len = len;
421		pspp->user_va = (uint64_t)(uintptr_t)uvaddr;
422		pspp->cookie = (uint64_t)(uintptr_t)php;
423		ret = physmem_add_hash(php);
424		if (ret == 0)
425			return (0);
426
427		/* Note that the call to as_unmap will free the vnode */
428		(void) as_unmap(as, uvaddr, len);
429		kmem_free(php, sizeof (*php));
430		return (ret);
431	}
432
433	goto fail;
434	/*NOTREACHED*/
435}
436
437/*
438 * The guts of the PHYSMEM_MAP ioctl.
439 * Map the given PA to the appropriate VA if PHYSMEM_SETUP ioctl has already
440 * been called for this PA range.
441 * Returns 0 on success with the following error codes on failure:
442 *	EPERM - The requested page is long term locked, and thus repeated
443 *		requests to allocate this page will likely fail.
444 *	EAGAIN - The requested page could not be allocated, but it is believed
445 *		that future attempts could succeed.
446 *	ENOMEM - There was not enough free memory in the system to safely
447 *		map the requested page.
448 *	EINVAL - The requested paddr was not PAGESIZE aligned or the
449 *		PHYSMEM_SETUP ioctl was not called for this page.
 *	ENOENT - The requested page was inside the kernel cage, and the
451 *		PHYSMEM_CAGE flag was not set.
452 *	EBUSY - The requested page is retired and the PHYSMEM_RETIRE flag
453 *		was not set.
454 */
static int
physmem_map_addrs(struct physmem_map_param *pmpp)
{
	caddr_t uvaddr;
	page_t *pp;
	uint64_t req_paddr;
	struct vnode *vp;
	int ret = 0;
	struct physmem_hash *php;
	uint_t flags = 0;

	ASSERT(pmpp != NULL);
	req_paddr = pmpp->req_paddr;

	if (!IS_P2ALIGNED(req_paddr, PAGESIZE))
		return (EINVAL);
	/* Find the vnode for this map request */
	rw_enter(&pph_rwlock, RW_READER);
	php = physmem_get_hash(req_paddr, PAGESIZE, curproc);
	if (php == NULL) {
		rw_exit(&pph_rwlock);
		return (EINVAL);
	}
	vp = php->ph_vnode;
	/* The VA for this PA is its offset into the segment set up earlier. */
	uvaddr = php->ph_base_va + (req_paddr - php->ph_base_pa);
	rw_exit(&pph_rwlock);

	/* No page_t for this PA means it can never be mapped. */
	pp = page_numtopp_nolock(btop((size_t)req_paddr));
	if (pp == NULL) {
		pmpp->ret_va = NULL;
		return (EPERM);
	}

	/*
	 * Check to see if page already mapped correctly.  This can happen
	 * when we failed to capture a page previously and it was captured
	 * asynchronously for us.  Return success in this case.
	 */
	if (pp->p_vnode == vp) {
		ASSERT(pp->p_offset == (u_offset_t)req_paddr);
		pmpp->ret_va = (uint64_t)(uintptr_t)uvaddr;
		return (0);
	}

	/*
	 * physmem should be responsible for checking for cage
	 * and prom pages.
	 */
	if (pmpp->flags & PHYSMEM_CAGE)
		flags = CAPTURE_GET_CAGE;
	if (pmpp->flags & PHYSMEM_RETIRED)
		flags |= CAPTURE_GET_RETIRED;

	/* Try to capture the page for this process (see map_page_proc). */
	ret = page_trycapture(pp, 0, flags | CAPTURE_PHYSMEM, curproc);

	if (ret != 0) {
		pmpp->ret_va = NULL;
		return (ret);
	} else {
		pmpp->ret_va = (uint64_t)(uintptr_t)uvaddr;
		return (0);
	}
}
518
/*
 * Map the given page into the process's address space if possible.
 * We actually only hash the page in on the correct vnode as the page
 * will be mapped via segvn_pagefault.
 * returns 0 on success
 * returns 1 if there is no need to map this page anymore (process exited)
 * returns -1 if we failed to map the page.
 */
int
map_page_proc(page_t *pp, void *arg, uint_t flags)
{
	struct vnode *vp;
	proc_t *procp = (proc_t *)arg;
	int ret;
	u_offset_t paddr = (u_offset_t)ptob(pp->p_pagenum);
	struct physmem_hash *php;

	ASSERT(pp != NULL);

	/*
	 * Check against availrmem to make sure that we're not low on memory.
	 * We check again here as ASYNC requests do not do this check elsewhere.
	 * We return 1 as we don't want the page to have the PR_CAPTURE bit
	 * set or be on the page capture hash.
	 */
	if (swapfs_minfree > availrmem + 1) {
		page_free(pp, 1);
		return (1);
	}

	/*
	 * If this is an asynchronous request for the current process,
	 * we can not map the page as it's possible that we are also in the
	 * process of unmapping the page which could result in a deadlock
	 * with the as lock.
	 */
	if ((flags & CAPTURE_ASYNC) && (curproc == procp)) {
		page_free(pp, 1);
		return (-1);
	}

	/* only return zeroed out pages */
	pagezero(pp, 0, PAGESIZE);

	rw_enter(&pph_rwlock, RW_READER);
	php = physmem_get_hash(paddr, PAGESIZE, procp);
	if (php == NULL) {
		rw_exit(&pph_rwlock);
		/*
		 * Free the page as there is no longer a valid outstanding
		 * request for this page.
		 */
		page_free(pp, 1);
		return (1);
	}

	vp = php->ph_vnode;

	/*
	 * We need to protect against a possible deadlock here where we own
	 * the vnode page hash mutex and want to acquire it again as there
	 * are locations in the code, where we unlock a page while holding
	 * the mutex which can lead to the page being captured and eventually
	 * end up here.
	 */
	if (mutex_owned(page_vnode_mutex(vp))) {
		rw_exit(&pph_rwlock);
		page_free(pp, 1);
		return (-1);
	}

	/* Hash the page in on the segment's vnode at its PA offset. */
	ret = page_hashin(pp, vp, paddr, NULL);
	rw_exit(&pph_rwlock);
	if (ret == 0) {
		/* hashin failed; free the page and report failure */
		page_free(pp, 1);
		return (-1);
	}

	/* Downgrade the exclusive page lock to shared for the fault path. */
	page_downgrade(pp);

	/* The page is now long-term claimed; account for it. */
	mutex_enter(&freemem_lock);
	availrmem--;
	mutex_exit(&freemem_lock);

	return (0);
}
605
606/*
607 * The guts of the PHYSMEM_DESTROY ioctl.
608 * The cookie passed in will provide all of the information needed to
609 * free up the address space and physical memory associated with the
 * corresponding PHYSMEM_SETUP ioctl.
611 * Returns 0 on success with the following error codes on failure:
612 *	EINVAL - The cookie supplied is not valid.
613 */
614int
615physmem_destroy_addrs(uint64_t p_cookie)
616{
617	struct as *as = curproc->p_as;
618	size_t len;
619	caddr_t uvaddr;
620
621	rw_enter(&pph_rwlock, RW_READER);
622	if (physmem_validate_cookie(p_cookie) == 0) {
623		rw_exit(&pph_rwlock);
624		return (EINVAL);
625	}
626
627	len = ((struct physmem_hash *)(uintptr_t)p_cookie)->ph_seg_len;
628	uvaddr = ((struct physmem_hash *)(uintptr_t)p_cookie)->ph_base_va;
629	rw_exit(&pph_rwlock);
630
631	(void) as_unmap(as, uvaddr, len);
632
633	return (0);
634}
635
636/*
637 * If the page has been hashed into the physmem vnode, then just look it up
638 * and return it via pl, otherwise return ENOMEM as the map ioctl has not
639 * succeeded on the given page.
640 */
641/*ARGSUSED*/
642static int
643physmem_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
644    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw,
645    struct cred *cr, caller_context_t *ct)
646{
647	page_t *pp;
648
649	ASSERT(len == PAGESIZE);
650	ASSERT(AS_READ_HELD(seg->s_as, &seg->s_as->a_lock));
651
652	/*
653	 * If the page is in the hash, then we successfully claimed this
654	 * page earlier, so return it to the caller.
655	 */
656	pp = page_lookup(vp, off, SE_SHARED);
657	if (pp != NULL) {
658		pl[0] = pp;
659		pl[1] = NULL;
660		*protp = PROT_ALL;
661		return (0);
662	}
663	return (ENOMEM);
664}
665
666/*
667 * We can not allow a process mapping /dev/physmem pages to fork as there can
668 * only be a single mapping to a /dev/physmem page at a given time.  Thus, the
669 * return of EINVAL when we are not working on our own address space.
670 * Otherwise we return zero as this function is required for normal operation.
671 */
672/*ARGSUSED*/
673static int
674physmem_addmap(struct vnode *vp, offset_t off, struct as *as,
675    caddr_t addr, size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
676    struct cred *cred, caller_context_t *ct)
677{
678	if (curproc->p_as != as) {
679		return (EINVAL);
680	}
681	return (0);
682}
683
/*
 * Will always get called for removing a whole segment.
 * The segment hold on the vnode is dropped here; the final VN_RELE
 * triggers physmem_inactive, which performs the real teardown.
 */
/*ARGSUSED*/
static int
physmem_delmap(struct vnode *vp, offset_t off, struct as *as,
    caddr_t addr, size_t len, uint_t prot, uint_t maxprot, uint_t flags,
    struct cred *cred, caller_context_t *ct)
{
	/*
	 * Release our hold on the vnode so that the final VN_RELE will
	 * call physmem_inactive to clean things up.
	 */
	VN_RELE(vp);

	return (0);
}
699
/*
 * Clean up all the pages belonging to this vnode and then free it.
 */
/*ARGSUSED*/
static void
physmem_inactive(vnode_t *vp, cred_t *crp, caller_context_t *ct)
{
	page_t *pp;

	/*
	 * Remove the vnode from the hash now, to prevent asynchronous
	 * attempts to map into this vnode.  This avoids a deadlock
	 * where two threads try to get into this logic at the same
	 * time and try to map the pages they are destroying into the
	 * other's address space.
	 * If it's not in the hash, just free it.
	 */
	if (physmem_remove_vnode_hash(vp) == 0) {
		/* Setup ioctl must have failed; no pages were ever hashed. */
		ASSERT(vp->v_pages == NULL);
		vn_free(vp);
		physmem_remove_hash_proc();
		mutex_enter(&physmem_mutex);
		physmem_vnodecnt--;
		mutex_exit(&physmem_mutex);
		return;
	}

	/*
	 * At this point in time, no other logic can be adding or removing
	 * pages from the vnode, otherwise the v_pages list could be inaccurate.
	 */

	/* Destroy every page still hashed on this vnode. */
	while ((pp = vp->v_pages) != NULL) {
		page_t *rpp;
		if (page_tryupgrade(pp)) {
			/*
			 * set lckcnt for page_destroy to do availrmem
			 * accounting
			 */
			pp->p_lckcnt = 1;
			page_destroy(pp, 0);
		} else {
			/* failure to lock should be transient */
			rpp = page_lookup(vp, ptob(pp->p_pagenum), SE_SHARED);
			if (rpp != pp) {
				/* page moved on; retry from v_pages head */
				page_unlock(rpp);
				continue;
			}
			page_unlock(pp);
		}
	}
	vn_free(vp);
	physmem_remove_hash_proc();
	mutex_enter(&physmem_mutex);
	physmem_vnodecnt--;
	mutex_exit(&physmem_mutex);
}
757
/*
 * ioctl entry point: dispatch the three physmem commands, copying the
 * argument structures in and (for SETUP/MAP) back out to the user.
 */
/*ARGSUSED*/
static int
physmem_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
    int *rvalp)
{
	int ret;

	switch (cmd) {
	case PHYSMEM_SETUP:
		{
			struct physmem_setup_param psp;
			if (ddi_copyin((void *)arg, &psp,
			    sizeof (struct physmem_setup_param), 0))
				return (EFAULT);
			ret = physmem_setup_addrs(&psp);
			/* copy out user_va and cookie filled in on success */
			if (ddi_copyout(&psp, (void *)arg, sizeof (psp), 0))
				return (EFAULT);
		}
		break;
	case PHYSMEM_MAP:
		{
			struct physmem_map_param pmp;
			if (ddi_copyin((void *)arg, &pmp,
			    sizeof (struct physmem_map_param), 0))
				return (EFAULT);
			ret = physmem_map_addrs(&pmp);
			/* copy out ret_va filled in by the map routine */
			if (ddi_copyout(&pmp, (void *)arg, sizeof (pmp), 0))
				return (EFAULT);
		}
		break;
	case PHYSMEM_DESTROY:
		{
			uint64_t cookie;
			if (ddi_copyin((void *)arg, &cookie,
			    sizeof (uint64_t), 0))
				return (EFAULT);
			ret = physmem_destroy_addrs(cookie);
		}
		break;
	default:
		return (ENOTSUP);
	}
	return (ret);
}
802
803/*ARGSUSED*/
804static int
805physmem_open(dev_t *devp, int flag, int otyp, cred_t *credp)
806{
807	int ret;
808	static int msg_printed = 0;
809
810	if ((flag & (FWRITE | FREAD)) != (FWRITE | FREAD)) {
811		return (EINVAL);
812	}
813
814	/* need to make sure we have the right privileges */
815	if ((ret = secpolicy_resource(credp)) != 0)
816		return (ret);
817	if ((ret = secpolicy_lock_memory(credp)) != 0)
818		return (ret);
819
820	if (msg_printed == 0) {
821		cmn_err(CE_NOTE, "!driver has been opened. This driver may "
822		    "take out long term locks on pages which may impact "
823		    "dynamic reconfiguration events");
824		msg_printed = 1;
825	}
826
827	return (0);
828}
829
/* Close entry point: nothing to do, cleanup happens via vnode inactive. */
/*ARGSUSED*/
static int
physmem_close(dev_t dev, int flag, int otyp, cred_t *credp)
{
	return (0);
}
836
837/*ARGSUSED*/
838static int
839physmem_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd,
840    void *arg, void **resultp)
841{
842	switch (infocmd) {
843	case DDI_INFO_DEVT2DEVINFO:
844		*resultp = physmem_dip;
845		return (DDI_SUCCESS);
846
847	case DDI_INFO_DEVT2INSTANCE:
848		*resultp = (void *)(ulong_t)getminor((dev_t)arg);
849		return (DDI_SUCCESS);
850
851	default:
852		return (DDI_FAILURE);
853	}
854}
855
856static int
857physmem_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
858{
859	int i;
860
861	if (cmd == DDI_RESUME) {
862		return (DDI_SUCCESS);
863	}
864
865	if (cmd != DDI_ATTACH)
866		return (DDI_FAILURE);
867
868	if (ddi_create_minor_node(dip, ddi_get_name(dip), S_IFCHR,
869	    ddi_get_instance(dip), DDI_PSEUDO, 0) != DDI_SUCCESS)
870		return (DDI_FAILURE);
871
872	physmem_dip = dip;
873
874	/* Initialize driver specific data */
875	if (physmem_setup_vnops()) {
876		ddi_remove_minor_node(dip, ddi_get_name(dip));
877		return (DDI_FAILURE);
878	}
879
880	for (i = 0; i < PPH_SIZE; i++)
881		pph[i] = NULL;
882
883	page_capture_register_callback(PC_PHYSMEM, 10000,
884	    map_page_proc);
885
886	return (DDI_SUCCESS);
887}
888
889static int
890physmem_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
891{
892	int ret = DDI_SUCCESS;
893
894	if (cmd == DDI_SUSPEND) {
895		return (DDI_SUCCESS);
896	}
897
898	if (cmd != DDI_DETACH)
899		return (DDI_FAILURE);
900
901	ASSERT(physmem_dip == dip);
902
903	mutex_enter(&physmem_mutex);
904	if (physmem_vnodecnt == 0) {
905		if (physmem_vnodeops != NULL) {
906			vn_freevnodeops(physmem_vnodeops);
907			physmem_vnodeops = NULL;
908			page_capture_unregister_callback(PC_PHYSMEM);
909		}
910	} else {
911		ret = EBUSY;
912	}
913	mutex_exit(&physmem_mutex);
914	if (ret == DDI_SUCCESS)
915		ddi_remove_minor_node(dip, ddi_get_name(dip));
916	return (ret);
917}
918
/* Character device entry points; ioctl does all the real work. */
static struct cb_ops physmem_cb_ops = {
	physmem_open,	/* open */
	physmem_close,	/* close */
	nodev,		/* strategy */
	nodev,		/* print */
	nodev,		/* dump */
	nodev,		/* read */
	nodev,		/* write */
	physmem_ioctl,	/* ioctl */
	nodev,		/* devmap */
	nodev,		/* mmap */
	nodev,		/* segmap */
	nochpoll,	/* chpoll */
	ddi_prop_op,	/* prop_op */
	NULL,		/* cb_str */
	D_NEW | D_MP | D_DEVMAP,
	CB_REV,
	NULL,
	NULL
};
939
/* Device operations vector wired into the module linkage below. */
static struct dev_ops physmem_ops = {
	DEVO_REV,
	0,
	physmem_getinfo,
	nulldev,
	nulldev,
	physmem_attach,
	physmem_detach,
	nodev,
	&physmem_cb_ops,
	NULL,
	NULL,
	ddi_quiesce_not_needed,		/* quiesce */
};
954
/* Loadable module linkage: a single character driver. */
static struct modldrv modldrv = {
	&mod_driverops,
	"physmem driver",
	&physmem_ops
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};
966
/* Module load entry point: install the driver linkage. */
int
_init(void)
{
	return (mod_install(&modlinkage));
}
972
/* Module info entry point. */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
978
/* Module unload entry point: remove the driver linkage. */
int
_fini(void)
{
	return (mod_remove(&modlinkage));
}
984