1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27/*
28 * vm_usage
29 *
30 * This file implements the getvmusage() private system call.
31 * getvmusage() counts the resident memory pages and the swap reserved by
32 * the specified process collective.  A "process collective" is the set of
33 * processes owned by a particular zone, project, task, or user.
34 *
35 * rss and swap are counted so that for a given process collective, a page is
36 * only counted once.  For example, this means that if multiple processes in
37 * the same project map the same page, then the project will only be charged
38 * once for that page.  On the other hand, if two processes in different
39 * projects map the same page, then both projects will be charged
40 * for the page.
41 *
42 * The vm_getusage() calculation is implemented so that the first thread
43 * performs the rss/swap counting.  Other callers wait for that thread to
44 * finish and then copy its results.  This enables multiple rcapd and prstat
45 * instances to consume data from the same calculation.  The results are also
46 * cached so that a caller interested in recent results can just copy them
47 * instead of starting a new calculation.  The caller passes the maximum age
48 * (in seconds) of the data.  If the cached data is young enough, the cache
49 * is copied; otherwise, a new calculation is executed and the cache is
50 * replaced with the new data.
51 *
52 * The rss calculation for each process collective is as follows:
53 *
54 *   - Inspect flags, determine if counting rss for zones, projects, tasks,
55 *     and/or users.
56 *   - For each proc:
57 *	- Figure out proc's collectives (zone, project, task, and/or user).
58 *	- For each seg in proc's address space:
59 *		- If seg is private:
60 *			- Lookup anons in the amp.
61 *			- For incore pages not previously visited for each of
62 *			  the proc's collectives, add the incore pagesize to
63 *			  each collective.
64 *			  Anons with a refcnt of 1 can be assumed not to have
65 *			  been previously visited.
66 *			- For address ranges without anons in the amp:
67 *				- Lookup pages in underlying vnode.
68 *			- For incore pages not previously visited for
69 *				  each of the proc's collectives, add incore
70 *				  pagesize to each collective.
71 *		- If seg is shared:
72 *			- Lookup pages in the shared amp or vnode.
73 *			- For incore pages not previously visited for each of
74 *			  the proc's collectives, add incore pagesize to each
75 *			  collective.
76 *
77 * Swap is reserved by private segments, and shared anonymous segments.
78 * The only shared anon segments which do not reserve swap are ISM segments
79 * and schedctl segments, both of which can be identified by having
80 * amp->swresv == 0.
81 *
82 * The swap calculation for each collective is as follows:
83 *
84 *   - Inspect flags, determine if counting swap for zones, projects, tasks,
85 *     and/or users.
86 *   - For each proc:
87 *	- Figure out proc's collectives (zone, project, task, and/or user).
88 *	- For each seg in proc's address space:
89 *		- If seg is private:
90 *			- Add svd->swresv to the swap count for each of the
91 *			  proc's collectives.
92 *		- If seg is anon, shared, and amp->swresv != 0
93 *			- For address ranges in amp not previously visited for
94 *			  each of the proc's collectives, add size of address
95 *			  range to the swap count for each collective.
96 *
97 * These two calculations are done simultaneously, with most of the work
98 * being done in vmu_calculate_seg().  The results of the calculation are
99 * copied into "vmu_data.vmu_cache_results".
100 *
101 * To perform the calculation, various things are tracked and cached:
102 *
103 *    - incore/not-incore page ranges for all vnodes.
104 *	(vmu_data.vmu_all_vnodes_hash)
105 *	This eliminates looking up the same page more than once.
106 *
107 *    - incore/not-incore page ranges for all shared amps.
108 *	(vmu_data.vmu_all_amps_hash)
109 *	This eliminates looking up the same page more than once.
110 *
111 *    - visited page ranges for each collective.
112 *	   - per vnode (entity->vme_vnode_hash)
113 *	   - per shared amp (entity->vme_amp_hash)
114 *	For accurate counting of map-shared and COW-shared pages.
115 *
116 *    - visited private anons (refcnt > 1) for each collective.
117 *	(entity->vme_anon_hash)
118 *	For accurate counting of COW-shared pages.
119 *
120 * The common accounting structure is the vmu_entity_t, which represents
121 * collectives:
122 *
123 *    - A zone.
124 *    - A project, task, or user within a zone.
125 *    - The entire system (vmu_data.vmu_system).
126 *    - Each collapsed (col) project and user.  This means a given projid or
127 *	uid, regardless of which zone the process is in.  For instance,
128 *	project 0 in the global zone and project 0 in a non-global zone are
129 *	the same collapsed project.
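 *	Collapsed results are reported with vmu_zoneid set to ALL_ZONES.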
130 *
131 *  Each entity structure tracks which pages have been already visited for
132 *  that entity (via previously inspected processes) so that these pages are
133 *  not double counted.
134 */
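
/*
 * Illustrative only: a minimal sketch of how a user-level consumer (an
 * rcapd- or prstat-like tool) might drive this code through the
 * getvmusage(2) interface.  The prototype and flag usage below are assumed
 * from <sys/vm_usage.h>; error handling is abbreviated.
 *
 *	size_t nres = 0;
 *	vmusage_t *buf;
 *
 *	// Pass a NULL buffer to learn how many results are available.
 *	if (getvmusage(VMUSAGE_ZONE | VMUSAGE_PROJECTS, 30, NULL, &nres) != 0)
 *		return (errno);
 *	buf = malloc(nres * sizeof (vmusage_t));
 *
 *	// Accept cached results up to 30 seconds old.
 *	if (getvmusage(VMUSAGE_ZONE | VMUSAGE_PROJECTS, 30, buf, &nres) != 0)
 *		return (errno);
 */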
135
136#include <sys/errno.h>
137#include <sys/types.h>
138#include <sys/zone.h>
139#include <sys/proc.h>
140#include <sys/project.h>
141#include <sys/task.h>
142#include <sys/thread.h>
143#include <sys/time.h>
144#include <sys/mman.h>
145#include <sys/modhash.h>
146#include <sys/modhash_impl.h>
147#include <sys/shm.h>
148#include <sys/swap.h>
149#include <sys/synch.h>
150#include <sys/systm.h>
151#include <sys/var.h>
152#include <sys/vm_usage.h>
153#include <sys/zone.h>
154#include <sys/sunddi.h>
155#include <sys/avl.h>
156#include <vm/anon.h>
157#include <vm/as.h>
158#include <vm/seg_vn.h>
159#include <vm/seg_spt.h>
160
161#define	VMUSAGE_HASH_SIZE		512
162
163#define	VMUSAGE_TYPE_VNODE		1
164#define	VMUSAGE_TYPE_AMP		2
165#define	VMUSAGE_TYPE_ANON		3
166
167#define	VMUSAGE_BOUND_UNKNOWN		0
168#define	VMUSAGE_BOUND_INCORE		1
169#define	VMUSAGE_BOUND_NOT_INCORE	2
170
171#define	ISWITHIN(node, addr)	((node)->vmb_start <= addr && \
172				    (node)->vmb_end >= addr ? 1 : 0)
173
174/*
175 * Bounds for vnodes and shared amps.
176 * Each bound is either entirely incore, entirely not incore, or
177 * entirely unknown.  Bounds are stored in an AVL tree sorted by the start
178 * member while in use; otherwise (on the free or temporary lists) they are
179 * strung together off of vmb_next.
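 *
 * For example (illustrative), a vnode whose pages 0-3 are resident and
 * whose pages 4-9 are paged out would be described by two bounds:
 * { vmb_start = 0, vmb_end = 3, vmb_type = VMUSAGE_BOUND_INCORE } and
 * { vmb_start = 4, vmb_end = 9, vmb_type = VMUSAGE_BOUND_NOT_INCORE }.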
180 */
181typedef struct vmu_bound {
182	avl_node_t vmb_node;
183	struct vmu_bound *vmb_next; /* NULL in tree else on free or temp list */
184	pgcnt_t vmb_start;  /* page offset in vnode/amp on which bound starts */
185	pgcnt_t	vmb_end;    /* page offset in vnode/amp on which bound ends */
186	char	vmb_type;   /* One of VMUSAGE_BOUND_* */
187} vmu_bound_t;
188
189/*
190 * hash of visited objects (vnodes or shared amps)
191 * key is address of vnode or amp.  Bounds lists known incore/non-incore
192 * bounds for vnode/amp.
193 */
194typedef struct vmu_object {
195	struct vmu_object	*vmo_next;	/* free list */
196	caddr_t		vmo_key;
197	short		vmo_type;
198	avl_tree_t	vmo_bounds;
199} vmu_object_t;
200
201/*
202 * Entity by which to count results.
203 *
204 * The entity structure keeps the current rss/swap counts for each entity
205 * (zone, project, etc), and hashes of vm structures that have already
206 * been visited for the entity.
207 *
208 * vme_next:	links the list of all entities currently being counted by
209 *		vmu_calculate().
210 *
211 * vme_next_calc: links the list of entities related to the current process
212 *		 being counted by vmu_calculate_proc().
213 *
214 * vmu_calculate_proc() walks all processes.  For each process, it makes a
215 * list of the entities related to that process using vme_next_calc.  This
216 * list changes each time vmu_calculate_proc() is called.
217 *
218 */
219typedef struct vmu_entity {
220	struct vmu_entity *vme_next;
221	struct vmu_entity *vme_next_calc;
222	mod_hash_t	*vme_vnode_hash; /* vnodes visited for entity */
223	mod_hash_t	*vme_amp_hash;	 /* shared amps visited for entity */
224	mod_hash_t	*vme_anon_hash;	 /* COW anons visited for entity */
225	vmusage_t	vme_result;	 /* identifies entity and results */
226} vmu_entity_t;
227
228/*
229 * Hash of entities visited within a zone, and an entity for the zone
230 * itself.
231 */
232typedef struct vmu_zone {
233	struct vmu_zone	*vmz_next;	/* free list */
234	id_t		vmz_id;
235	vmu_entity_t	*vmz_zone;
236	mod_hash_t	*vmz_projects_hash;
237	mod_hash_t	*vmz_tasks_hash;
238	mod_hash_t	*vmz_rusers_hash;
239	mod_hash_t	*vmz_eusers_hash;
240} vmu_zone_t;
241
242/*
243 * Cache of results from last calculation
244 */
245typedef struct vmu_cache {
246	vmusage_t	*vmc_results;	/* Results from last call to */
247					/* vm_getusage(). */
248	uint64_t	vmc_nresults;	/* Count of cached results */
249	uint64_t	vmc_refcnt;	/* refcnt for free */
250	uint_t		vmc_flags;	/* Flags for vm_getusage() */
251	hrtime_t	vmc_timestamp;	/* when cache was created */
252} vmu_cache_t;
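
/*
 * Illustrative note (assumed behavior; the actual check lives in
 * vm_getusage()): a cached result set is considered fresh enough to reuse
 * when, roughly,
 *
 *	vmc_timestamp > gethrtime() - (hrtime_t)age * NANOSEC
 *
 * where "age" is the maximum age in seconds passed by the caller.
 */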
253
254/*
255 * top level rss info for the system
256 */
257typedef struct vmu_data {
258	kmutex_t	vmu_lock;		/* Protects vmu_data */
259	kcondvar_t	vmu_cv;			/* Used to signal threads */
260						/* waiting for the calc */
261						/* thread to finish */
262	vmu_entity_t	*vmu_system;		/* Entity for tracking */
263						/* rss/swap for all processes */
264						/* in all zones */
265	mod_hash_t	*vmu_zones_hash;	/* Zones visited */
266	mod_hash_t	*vmu_projects_col_hash; /* These *_col_hash hashes */
267	mod_hash_t	*vmu_rusers_col_hash;	/* keep track of entities, */
268	mod_hash_t	*vmu_eusers_col_hash;	/* ignoring zoneid, in order */
269						/* to implement VMUSAGE_COL_* */
270						/* flags, which aggregate by */
271						/* project or user regardless */
272						/* of zoneid. */
273	mod_hash_t	*vmu_all_vnodes_hash;	/* System wide visited vnodes */
274						/* to track incore/not-incore */
275	mod_hash_t	*vmu_all_amps_hash;	/* System wide visited shared */
276						/* amps to track incore/not- */
277						/* incore */
278	vmu_entity_t	*vmu_entities;		/* Linked list of entities */
279	size_t		vmu_nentities;		/* Count of entities in list */
280	vmu_cache_t	*vmu_cache;		/* Cached results */
281	kthread_t	*vmu_calc_thread;	/* NULL, or thread running */
282						/* vmu_calculate() */
283	uint_t		vmu_calc_flags;		/* Flags being used by */
284						/* currently running calc */
285						/* thread */
286	uint_t		vmu_pending_flags;	/* Flags of vm_getusage() */
287						/* threads waiting for */
288						/* calc thread to finish */
289	uint_t		vmu_pending_waiters;	/* Number of threads waiting */
290						/* for calc thread */
291	vmu_bound_t	*vmu_free_bounds;
292	vmu_object_t	*vmu_free_objects;
293	vmu_entity_t	*vmu_free_entities;
294	vmu_zone_t	*vmu_free_zones;
295} vmu_data_t;
296
297extern struct as kas;
298extern proc_t *practive;
299extern zone_t *global_zone;
300extern struct seg_ops segvn_ops;
301extern struct seg_ops segspt_shmops;
302
303static vmu_data_t vmu_data;
304static kmem_cache_t *vmu_bound_cache;
305static kmem_cache_t *vmu_object_cache;
306
307/*
308 * Comparison routine for AVL tree. We base our comparison on vmb_start.
309 */
310static int
311bounds_cmp(const void *bnd1, const void *bnd2)
312{
313	const vmu_bound_t *bound1 = bnd1;
314	const vmu_bound_t *bound2 = bnd2;
315
316	if (bound1->vmb_start == bound2->vmb_start) {
317		return (0);
318	}
319	if (bound1->vmb_start < bound2->vmb_start) {
320		return (-1);
321	}
322
323	return (1);
324}
325
326/*
327 * Save a bound on the free list.
328 */
329static void
330vmu_free_bound(vmu_bound_t *bound)
331{
332	bound->vmb_next = vmu_data.vmu_free_bounds;
333	bound->vmb_start = 0;
334	bound->vmb_end = 0;
335	bound->vmb_type = 0;
336	vmu_data.vmu_free_bounds = bound;
337}
338
339/*
340 * Free an object, and all visited bound info.
341 */
342static void
343vmu_free_object(mod_hash_val_t val)
344{
345	vmu_object_t *obj = (vmu_object_t *)val;
346	avl_tree_t *tree = &(obj->vmo_bounds);
347	vmu_bound_t *bound;
348	void *cookie = NULL;
349
350	while ((bound = avl_destroy_nodes(tree, &cookie)) != NULL)
351		vmu_free_bound(bound);
352	avl_destroy(tree);
353
354	obj->vmo_type = 0;
355	obj->vmo_next = vmu_data.vmu_free_objects;
356	vmu_data.vmu_free_objects = obj;
357}
358
359/*
360 * Free an entity, and hashes of visited objects for that entity.
361 */
362static void
363vmu_free_entity(mod_hash_val_t val)
364{
365	vmu_entity_t *entity = (vmu_entity_t *)val;
366
367	if (entity->vme_vnode_hash != NULL)
368		i_mod_hash_clear_nosync(entity->vme_vnode_hash);
369	if (entity->vme_amp_hash != NULL)
370		i_mod_hash_clear_nosync(entity->vme_amp_hash);
371	if (entity->vme_anon_hash != NULL)
372		i_mod_hash_clear_nosync(entity->vme_anon_hash);
373
374	entity->vme_next = vmu_data.vmu_free_entities;
375	vmu_data.vmu_free_entities = entity;
376}
377
378/*
379 * Free zone entity, and all hashes of entities inside that zone,
380 * which are projects, tasks, and users.
381 */
382static void
383vmu_free_zone(mod_hash_val_t val)
384{
385	vmu_zone_t *zone = (vmu_zone_t *)val;
386
387	if (zone->vmz_zone != NULL) {
388		vmu_free_entity((mod_hash_val_t)zone->vmz_zone);
389		zone->vmz_zone = NULL;
390	}
391	if (zone->vmz_projects_hash != NULL)
392		i_mod_hash_clear_nosync(zone->vmz_projects_hash);
393	if (zone->vmz_tasks_hash != NULL)
394		i_mod_hash_clear_nosync(zone->vmz_tasks_hash);
395	if (zone->vmz_rusers_hash != NULL)
396		i_mod_hash_clear_nosync(zone->vmz_rusers_hash);
397	if (zone->vmz_eusers_hash != NULL)
398		i_mod_hash_clear_nosync(zone->vmz_eusers_hash);
399	zone->vmz_next = vmu_data.vmu_free_zones;
400	vmu_data.vmu_free_zones = zone;
401}
402
403/*
404 * Initialize synchronization primitives and hashes for system-wide tracking
405 * of visited vnodes and shared amps.  Initialize results cache.
406 */
407void
408vm_usage_init()
409{
410	mutex_init(&vmu_data.vmu_lock, NULL, MUTEX_DEFAULT, NULL);
411	cv_init(&vmu_data.vmu_cv, NULL, CV_DEFAULT, NULL);
412
413	vmu_data.vmu_system = NULL;
414	vmu_data.vmu_zones_hash = NULL;
415	vmu_data.vmu_projects_col_hash = NULL;
416	vmu_data.vmu_rusers_col_hash = NULL;
417	vmu_data.vmu_eusers_col_hash = NULL;
418
419	vmu_data.vmu_free_bounds = NULL;
420	vmu_data.vmu_free_objects = NULL;
421	vmu_data.vmu_free_entities = NULL;
422	vmu_data.vmu_free_zones = NULL;
423
424	vmu_data.vmu_all_vnodes_hash = mod_hash_create_ptrhash(
425	    "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
426	    sizeof (vnode_t));
427	vmu_data.vmu_all_amps_hash = mod_hash_create_ptrhash(
428	    "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
429	    sizeof (struct anon_map));
430	vmu_data.vmu_projects_col_hash = mod_hash_create_idhash(
431	    "vmusage collapsed project hash", VMUSAGE_HASH_SIZE,
432	    vmu_free_entity);
433	vmu_data.vmu_rusers_col_hash = mod_hash_create_idhash(
434	    "vmusage collapsed ruser hash", VMUSAGE_HASH_SIZE,
435	    vmu_free_entity);
436	vmu_data.vmu_eusers_col_hash = mod_hash_create_idhash(
437	    "vmusage collapsed euser hash", VMUSAGE_HASH_SIZE,
438	    vmu_free_entity);
439	vmu_data.vmu_zones_hash = mod_hash_create_idhash(
440	    "vmusage zone hash", VMUSAGE_HASH_SIZE, vmu_free_zone);
441
442	vmu_bound_cache = kmem_cache_create("vmu_bound_cache",
443	    sizeof (vmu_bound_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
444	vmu_object_cache = kmem_cache_create("vmu_object_cache",
445	    sizeof (vmu_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
446
447	vmu_data.vmu_entities = NULL;
448	vmu_data.vmu_nentities = 0;
449
450	vmu_data.vmu_cache = NULL;
451	vmu_data.vmu_calc_thread = NULL;
452	vmu_data.vmu_calc_flags = 0;
453	vmu_data.vmu_pending_flags = 0;
454	vmu_data.vmu_pending_waiters = 0;
455}
456
457/*
458 * Allocate hashes for tracking vm objects visited for an entity.
459 * Update list of entities.
460 */
461static vmu_entity_t *
462vmu_alloc_entity(id_t id, int type, id_t zoneid)
463{
464	vmu_entity_t *entity;
465
466	if (vmu_data.vmu_free_entities != NULL) {
467		entity = vmu_data.vmu_free_entities;
468		vmu_data.vmu_free_entities =
469		    vmu_data.vmu_free_entities->vme_next;
470		bzero(&entity->vme_result, sizeof (vmusage_t));
471	} else {
472		entity = kmem_zalloc(sizeof (vmu_entity_t), KM_SLEEP);
473	}
474	entity->vme_result.vmu_id = id;
475	entity->vme_result.vmu_zoneid = zoneid;
476	entity->vme_result.vmu_type = type;
477
478	if (entity->vme_vnode_hash == NULL)
479		entity->vme_vnode_hash = mod_hash_create_ptrhash(
480		    "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
481		    sizeof (vnode_t));
482
483	if (entity->vme_amp_hash == NULL)
484		entity->vme_amp_hash = mod_hash_create_ptrhash(
485		    "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
486		    sizeof (struct anon_map));
487
488	if (entity->vme_anon_hash == NULL)
489		entity->vme_anon_hash = mod_hash_create_ptrhash(
490		    "vmusage anon hash", VMUSAGE_HASH_SIZE,
491		    mod_hash_null_valdtor, sizeof (struct anon));
492
493	entity->vme_next = vmu_data.vmu_entities;
494	vmu_data.vmu_entities = entity;
495	vmu_data.vmu_nentities++;
496
497	return (entity);
498}
499
500/*
501 * Allocate a zone entity, and hashes for tracking visited vm objects
502 * for projects, tasks, and users within that zone.
503 */
504static vmu_zone_t *
505vmu_alloc_zone(id_t id)
506{
507	vmu_zone_t *zone;
508
509	if (vmu_data.vmu_free_zones != NULL) {
510		zone = vmu_data.vmu_free_zones;
511		vmu_data.vmu_free_zones =
512		    vmu_data.vmu_free_zones->vmz_next;
513		zone->vmz_next = NULL;
514		zone->vmz_zone = NULL;
515	} else {
516		zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP);
517	}
518
519	zone->vmz_id = id;
520
521	if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0)
522		zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);
523
524	if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
525	    VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL)
526		zone->vmz_projects_hash = mod_hash_create_idhash(
527		    "vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
528
529	if ((vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
530	    != 0 && zone->vmz_tasks_hash == NULL)
531		zone->vmz_tasks_hash = mod_hash_create_idhash(
532		    "vmusage task hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
533
534	if ((vmu_data.vmu_calc_flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS))
535	    != 0 && zone->vmz_rusers_hash == NULL)
536		zone->vmz_rusers_hash = mod_hash_create_idhash(
537		    "vmusage ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
538
539	if ((vmu_data.vmu_calc_flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS))
540	    != 0 && zone->vmz_eusers_hash == NULL)
541		zone->vmz_eusers_hash = mod_hash_create_idhash(
542		    "vmusage euser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
543
544	return (zone);
545}
546
547/*
548 * Allocate a structure for tracking visited bounds for a vm object.
549 */
550static vmu_object_t *
551vmu_alloc_object(caddr_t key, int type)
552{
553	vmu_object_t *object;
554
555	if (vmu_data.vmu_free_objects != NULL) {
556		object = vmu_data.vmu_free_objects;
557		vmu_data.vmu_free_objects =
558		    vmu_data.vmu_free_objects->vmo_next;
559	} else {
560		object = kmem_cache_alloc(vmu_object_cache, KM_SLEEP);
561	}
562
563	object->vmo_next = NULL;
564	object->vmo_key = key;
565	object->vmo_type = type;
566	avl_create(&(object->vmo_bounds), bounds_cmp, sizeof (vmu_bound_t), 0);
567
568	return (object);
569}
570
571/*
572 * Allocate and return a bound structure.
573 */
574static vmu_bound_t *
575vmu_alloc_bound()
576{
577	vmu_bound_t *bound;
578
579	if (vmu_data.vmu_free_bounds != NULL) {
580		bound = vmu_data.vmu_free_bounds;
581		vmu_data.vmu_free_bounds =
582		    vmu_data.vmu_free_bounds->vmb_next;
583	} else {
584		bound = kmem_cache_alloc(vmu_bound_cache, KM_SLEEP);
585	}
586
587	bound->vmb_next = NULL;
588	bound->vmb_start = 0;
589	bound->vmb_end = 0;
590	bound->vmb_type = 0;
591	return (bound);
592}
593
594/*
595 * vmu_find_insert_* functions implement hash lookup or allocate and
596 * insert operations.
597 */
598static vmu_object_t *
599vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type)
600{
601	int ret;
602	vmu_object_t *object;
603
604	ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
605	    (mod_hash_val_t *)&object);
606	if (ret != 0) {
607		object = vmu_alloc_object(key, type);
608		ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
609		    (mod_hash_val_t)object, (mod_hash_hndl_t)0);
610		ASSERT(ret == 0);
611	}
612	return (object);
613}
614
615static int
616vmu_find_insert_anon(mod_hash_t *hash, caddr_t key)
617{
618	int ret;
619	caddr_t val;
620
621	ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
622	    (mod_hash_val_t *)&val);
623
624	if (ret == 0)
625		return (0);
626
627	ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
628	    (mod_hash_val_t)key, (mod_hash_hndl_t)0);
629
630	ASSERT(ret == 0);
631
632	return (1);
633}
634
635static vmu_entity_t *
636vmu_find_insert_entity(mod_hash_t *hash, id_t id, uint_t type, id_t zoneid)
637{
638	int ret;
639	vmu_entity_t *entity;
640
641	ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)(uintptr_t)id,
642	    (mod_hash_val_t *)&entity);
643	if (ret != 0) {
644		entity = vmu_alloc_entity(id, type, zoneid);
645		ret = i_mod_hash_insert_nosync(hash,
646		    (mod_hash_key_t)(uintptr_t)id, (mod_hash_val_t)entity,
647		    (mod_hash_hndl_t)0);
648		ASSERT(ret == 0);
649	}
650	return (entity);
651}
652
656/*
657 * Returns the list of object bounds between start and end.  New bounds
658 * inserted by this call are given the specified type.
659 *
660 * Returns the number of pages newly covered if new bounds are created, or 0
661 * if the region between start and end was already covered by existing bounds.
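 *
 * Worked example (illustrative): if the tree already holds the bounds
 * [5, 10] INCORE and [16, 19] NOT_INCORE, then a lookup over [8, 18] with
 * type VMUSAGE_BOUND_UNKNOWN inserts a new bound [11, 15] UNKNOWN, sets
 * *first to [5, 10] and *last to [16, 19], and returns 5 (the number of
 * pages newly covered).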
662 */
663static pgcnt_t
664vmu_insert_lookup_object_bounds(vmu_object_t *ro, pgcnt_t start, pgcnt_t
665    end, char type, vmu_bound_t **first, vmu_bound_t **last)
666{
667	avl_tree_t	*tree = &(ro->vmo_bounds);
668	avl_index_t	where;
669	vmu_bound_t	*walker, *tmp;
670	pgcnt_t		ret = 0;
671
672	ASSERT(start <= end);
673
674	*first = *last = NULL;
675
676	tmp = vmu_alloc_bound();
677	tmp->vmb_start = start;
678	tmp->vmb_type = type;
679
680	/* Hopelessly optimistic case. */
681	if (walker = avl_find(tree, tmp, &where)) {
682		/* We got lucky. */
683		vmu_free_bound(tmp);
684		*first = walker;
685	}
686
687	if (walker == NULL) {
688		/* Is start in the previous node? */
689		walker = avl_nearest(tree, where, AVL_BEFORE);
690		if (walker != NULL) {
691			if (ISWITHIN(walker, start)) {
692				/* We found start. */
693				vmu_free_bound(tmp);
694				*first = walker;
695			}
696		}
697	}
698
699	/*
700	 * At this point, if *first is still NULL, then we
701	 * didn't get a direct hit and start isn't covered
702	 * by the previous node. We know that the next node
703	 * must have a greater start value than we require
704	 * because avl_find tells us where the AVL routines would
705	 * insert our new node. We have some gap between the
706	 * start we want and the next node.
707	 */
708	if (*first == NULL) {
709		walker = avl_nearest(tree, where, AVL_AFTER);
710		if (walker != NULL && walker->vmb_start <= end) {
711			/* Fill the gap. */
712			tmp->vmb_end = walker->vmb_start - 1;
713			*first = tmp;
714		} else {
715			/* We have a gap over [start, end]. */
716			tmp->vmb_end = end;
717			*first = *last = tmp;
718		}
719		ret += tmp->vmb_end - tmp->vmb_start + 1;
720		avl_insert(tree, tmp, where);
721	}
722
723	ASSERT(*first != NULL);
724
725	if (*last != NULL) {
726		/* We're done. */
727		return (ret);
728	}
729
730	/*
731	 * If we are here we still need to set *last and
732	 * that may involve filling in some gaps.
733	 */
734	*last = *first;
735	for (;;) {
736		if (ISWITHIN(*last, end)) {
737			/* We're done. */
738			break;
739		}
740		walker = AVL_NEXT(tree, *last);
741		if (walker == NULL || walker->vmb_start > end) {
742			/* Bottom or mid tree with gap. */
743			tmp = vmu_alloc_bound();
744			tmp->vmb_start = (*last)->vmb_end + 1;
745			tmp->vmb_end = end;
746			tmp->vmb_type = type;
747			ret += tmp->vmb_end - tmp->vmb_start + 1;
748			avl_insert_here(tree, tmp, *last, AVL_AFTER);
749			*last = tmp;
750			break;
751		} else {
752			if ((*last)->vmb_end + 1 != walker->vmb_start) {
753				/* Non-contiguous. */
754				tmp = vmu_alloc_bound();
755				tmp->vmb_start = (*last)->vmb_end + 1;
756				tmp->vmb_end = walker->vmb_start - 1;
757				tmp->vmb_type = type;
758				ret += tmp->vmb_end - tmp->vmb_start + 1;
759				avl_insert_here(tree, tmp, *last, AVL_AFTER);
760				*last = tmp;
761			} else {
762				*last = walker;
763			}
764		}
765	}
766
767	return (ret);
768}
769
770/*
771 * vmu_update_bounds()
772 *
773 * tree: avl_tree in which first and last hang.
774 *
775 * first, last:	list of contiguous bounds, of which zero or more are of
776 * 		type VMUSAGE_BOUND_UNKNOWN.
777 *
778 * new_tree: avl_tree in which new_first and new_last hang.
779 *
780 * new_first, new_last:	list of contiguous bounds, of which none are of
781 *			type VMUSAGE_BOUND_UNKNOWN.  These bounds are used to
782 *			update the types of bounds in (first,last) with
783 *			type VMUSAGE_BOUND_UNKNOWN.
784 *
785 * For the list of bounds (first,last), this function updates any bounds
786 * with type VMUSAGE_BOUND_UNKNOWN using the type of the corresponding bound in
787 * the list (new_first, new_last).
788 *
789 * If a bound of type VMUSAGE_BOUND_UNKNOWN spans multiple bounds in the list
790 * (new_first, new_last), it will be split into multiple bounds.
791 *
792 * Return value:
793 * 	The number of pages in the list of bounds (first,last) that were of
794 *	type VMUSAGE_BOUND_UNKNOWN, which have been updated to be of type
795 *	VMUSAGE_BOUND_INCORE.
796 *
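 * Worked example (illustrative): if (first, last) is the single bound
 * [0, 9] UNKNOWN and (new_first, new_last) is [0, 3] INCORE followed by
 * [4, 9] NOT_INCORE, then [0, 9] is split into [0, 3] INCORE and
 * [4, 9] NOT_INCORE, and 4 is returned (the number of pages that became
 * INCORE).
 *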
797 */
798static pgcnt_t
799vmu_update_bounds(avl_tree_t *tree, vmu_bound_t **first, vmu_bound_t **last,
800    avl_tree_t *new_tree, vmu_bound_t *new_first, vmu_bound_t *new_last)
801{
802	vmu_bound_t *next, *new_next, *tmp;
803	pgcnt_t rss = 0;
804
805	next = *first;
806	new_next = new_first;
807
808	/*
809	 * Verify first and last bound are covered by new bounds if they
810	 * have unknown type.
811	 */
812	ASSERT((*first)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
813	    (*first)->vmb_start >= new_first->vmb_start);
814	ASSERT((*last)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
815	    (*last)->vmb_end <= new_last->vmb_end);
816	for (;;) {
817		/* If bound already has type, proceed to next bound. */
818		if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
819			if (next == *last)
820				break;
821			next = AVL_NEXT(tree, next);
822			continue;
823		}
824		while (new_next->vmb_end < next->vmb_start)
825			new_next = AVL_NEXT(new_tree, new_next);
826		ASSERT(new_next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
827		next->vmb_type = new_next->vmb_type;
828		if (new_next->vmb_end < next->vmb_end) {
829			/* need to split bound */
830			tmp = vmu_alloc_bound();
831			tmp->vmb_type = VMUSAGE_BOUND_UNKNOWN;
832			tmp->vmb_start = new_next->vmb_end + 1;
833			tmp->vmb_end = next->vmb_end;
834			avl_insert_here(tree, tmp, next, AVL_AFTER);
835			next->vmb_end = new_next->vmb_end;
836			if (*last == next)
837				*last = tmp;
838			if (next->vmb_type == VMUSAGE_BOUND_INCORE)
839				rss += next->vmb_end - next->vmb_start + 1;
840			next = tmp;
841		} else {
842			if (next->vmb_type == VMUSAGE_BOUND_INCORE)
843				rss += next->vmb_end - next->vmb_start + 1;
844			if (next == *last)
845				break;
846			next = AVL_NEXT(tree, next);
847		}
848	}
849	return (rss);
850}
851
852/*
853 * Merges adjacent bounds with same type between first and last bound.
854 * After merge, last pointer may point to a different bound, as (incoming)
855 * last bound may have been merged away.
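 *
 * For example (illustrative), the adjacent bounds [0, 3] INCORE and
 * [4, 9] INCORE are merged into the single bound [0, 9] INCORE, and *last
 * is updated if it pointed to the bound that was merged away.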
856 */
857static void
858vmu_merge_bounds(avl_tree_t *tree, vmu_bound_t **first, vmu_bound_t **last)
859{
860	vmu_bound_t *current;
861	vmu_bound_t *next;
862
863	ASSERT(tree != NULL);
864	ASSERT(*first != NULL);
865	ASSERT(*last != NULL);
866
867	current = *first;
868	while (current != *last) {
869		next = AVL_NEXT(tree, current);
870		if ((current->vmb_end + 1) == next->vmb_start &&
871		    current->vmb_type == next->vmb_type) {
872			current->vmb_end = next->vmb_end;
873			avl_remove(tree, next);
874			vmu_free_bound(next);
875			if (next == *last) {
876				*last = current;
877			}
878		} else {
879			current = AVL_NEXT(tree, current);
880		}
881	}
882}
883
884/*
885 * Given an amp and a list of bounds, updates each bound's type with
886 * VMUSAGE_BOUND_INCORE or VMUSAGE_BOUND_NOT_INCORE.
887 *
888 * If a bound is partially incore, it will be split into two bounds.
889 * first and last may be modified, as bounds may be split into multiple
890 * bounds if they are partially incore/not-incore.
891 *
892 * Set incore to B_TRUE if the bounds are already known to be incore.
893 *
894 */
895static void
896vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp,
897    vmu_bound_t **first, vmu_bound_t **last, boolean_t incore)
898{
899	vmu_bound_t *next;
900	vmu_bound_t *tmp;
901	pgcnt_t index;
902	short bound_type;
903	short page_type;
904	vnode_t *vn;
905	anoff_t off;
906	struct anon *ap;
907
908	next = *first;
909	/* Shared anon slots don't change once set. */
910	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
911	for (;;) {
912		if (incore == B_TRUE)
913			next->vmb_type = VMUSAGE_BOUND_INCORE;
914
915		if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
916			if (next == *last)
917				break;
918			next = AVL_NEXT(tree, next);
919			continue;
920		}
921		bound_type = next->vmb_type;
922		index = next->vmb_start;
923		while (index <= next->vmb_end) {
924
925			/*
926			 * These are used to determine how much to increment
927			 * index when a large page is found.
928			 */
929			page_t *page;
930			pgcnt_t pgcnt = 1;
931			uint_t pgshft;
932			pgcnt_t pgmsk;
933
934			ap = anon_get_ptr(amp->ahp, index);
935			if (ap != NULL)
936				swap_xlate(ap, &vn, &off);
937
938			if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
939			    (page = page_exists(vn, off)) != NULL) {
940				page_type = VMUSAGE_BOUND_INCORE;
941				if (page->p_szc > 0) {
942					pgcnt = page_get_pagecnt(page->p_szc);
943					pgshft = page_get_shift(page->p_szc);
944					pgmsk = (0x1 << (pgshft - PAGESHIFT))
945					    - 1;
946				}
947			} else {
948				page_type = VMUSAGE_BOUND_NOT_INCORE;
949			}
950			if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
951				next->vmb_type = page_type;
952			} else if (next->vmb_type != page_type) {
953				/*
954				 * If current bound type does not match page
955				 * type, need to split off new bound.
956				 */
957				tmp = vmu_alloc_bound();
958				tmp->vmb_type = page_type;
959				tmp->vmb_start = index;
960				tmp->vmb_end = next->vmb_end;
961				avl_insert_here(tree, tmp, next, AVL_AFTER);
962				next->vmb_end = index - 1;
963				if (*last == next)
964					*last = tmp;
965				next = tmp;
966			}
967			if (pgcnt > 1) {
968				/*
969				 * If inside large page, jump to next large
970				 * page
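				 * (e.g., with 4K base pages and a 2M large
				 * page, pgcnt is 512 and pgmsk is 0x1ff, so
				 * index rounds down to the large-page
				 * boundary and advances by 512 pages)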
971				 */
972				index = (index & ~pgmsk) + pgcnt;
973			} else {
974				index++;
975			}
976		}
977		if (next == *last) {
978			ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
979			break;
980		} else
981			next = AVL_NEXT(tree, next);
982	}
983	ANON_LOCK_EXIT(&amp->a_rwlock);
984}
985
986/*
987 * Same as vmu_amp_update_incore_bounds(), except for tracking
988 * incore-/not-incore for vnodes.
989 */
990static void
991vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode,
992    vmu_bound_t **first, vmu_bound_t **last)
993{
994	vmu_bound_t *next;
995	vmu_bound_t *tmp;
996	pgcnt_t index;
997	short bound_type;
998	short page_type;
999
1000	next = *first;
1001	for (;;) {
1002		if (vnode->v_pages == NULL)
1003			next->vmb_type = VMUSAGE_BOUND_NOT_INCORE;
1004
1005		if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
1006			if (next == *last)
1007				break;
1008			next = AVL_NEXT(tree, next);
1009			continue;
1010		}
1011
1012		bound_type = next->vmb_type;
1013		index = next->vmb_start;
1014		while (index <= next->vmb_end) {
1015
1016			/*
1017			 * These are used to determine how much to increment
1018			 * index when a large page is found.
1019			 */
1020			page_t *page;
1021			pgcnt_t pgcnt = 1;
1022			uint_t pgshft;
1023			pgcnt_t pgmsk;
1024
1025			if (vnode->v_pages != NULL &&
1026			    (page = page_exists(vnode, ptob(index))) != NULL) {
1027				page_type = VMUSAGE_BOUND_INCORE;
1028				if (page->p_szc > 0) {
1029					pgcnt = page_get_pagecnt(page->p_szc);
1030					pgshft = page_get_shift(page->p_szc);
1031					pgmsk = (0x1 << (pgshft - PAGESHIFT))
1032					    - 1;
1033				}
1034			} else {
1035				page_type = VMUSAGE_BOUND_NOT_INCORE;
1036			}
1037			if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
1038				next->vmb_type = page_type;
1039			} else if (next->vmb_type != page_type) {
1040				/*
1041				 * If current bound type does not match page
1042				 * type, need to split off new bound.
1043				 */
1044				tmp = vmu_alloc_bound();
1045				tmp->vmb_type = page_type;
1046				tmp->vmb_start = index;
1047				tmp->vmb_end = next->vmb_end;
1048				avl_insert_here(tree, tmp, next, AVL_AFTER);
1049				next->vmb_end = index - 1;
1050				if (*last == next)
1051					*last = tmp;
1052				next = tmp;
1053			}
1054			if (pgcnt > 1) {
1055				/*
1056				 * If inside large page, jump to next large
1057				 * page
1058				 */
1059				index = (index & ~pgmsk) + pgcnt;
1060			} else {
1061				index++;
1062			}
1063		}
1064		if (next == *last) {
1065			ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
1066			break;
1067		} else
1068			next = AVL_NEXT(tree, next);
1069	}
1070}
1071
1072/*
1073 * Calculate the rss and swap consumed by a segment.  vmu_entities is the
1074 * list of entities to visit.  For shared segments, the vnode or amp
1075 * is looked up in each entity to see if it has already been counted.  Private
1076 * anon pages are checked per entity to ensure that COW pages are not
1077 * double counted.
1078 *
1079 * For private mapped files, first the amp is checked for private pages.
1080 * Bounds not backed by the amp are looked up in the vnode for each entity
1081 * to avoid double counting of private COW vnode pages.
1082 */
1083static void
1084vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
1085{
1086	struct segvn_data *svd;
1087	struct shm_data *shmd;
1088	struct spt_data *sptd;
1089	vmu_object_t *shared_object = NULL;
1090	vmu_object_t *entity_object = NULL;
1091	vmu_entity_t *entity;
1092	vmusage_t *result;
1093	vmu_bound_t *first = NULL;
1094	vmu_bound_t *last = NULL;
1095	vmu_bound_t *cur = NULL;
1096	vmu_bound_t *e_first = NULL;
1097	vmu_bound_t *e_last = NULL;
1098	vmu_bound_t *tmp;
1099	pgcnt_t p_index, s_index, p_start, p_end, s_start, s_end, rss, virt;
1100	struct anon_map *private_amp = NULL;
1101	boolean_t incore = B_FALSE;
1102	boolean_t shared = B_FALSE;
1103	int file = 0;
1104	pgcnt_t swresv = 0;
1105	pgcnt_t panon = 0;
1106
1107	/* Can zero-length segments exist?  Not sure, so paranoia. */
1108	if (seg->s_size <= 0)
1109		return;
1110
1111	/*
1112	 * Figure out if there is a shared object (such as a named vnode or
1113	 * a shared amp), then figure out if there is a private amp, which
1114	 * identifies private pages.
1115	 */
1116	if (seg->s_ops == &segvn_ops) {
1117		svd = (struct segvn_data *)seg->s_data;
1118		if (svd->type == MAP_SHARED) {
1119			shared = B_TRUE;
1120		} else {
1121			swresv = svd->swresv;
1122
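			/*
			 * Note (behavior as implemented): if the segvn lock
			 * cannot be acquired without blocking, the private
			 * amp is simply skipped for this segment and its
			 * anon pages go uncounted on this pass.
			 */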
1123			if (SEGVN_LOCK_TRYENTER(seg->s_as, &svd->lock,
1124			    RW_READER) != 0) {
1125				/*
1126				 * Text replication anon maps can be shared
1127				 * across all zones. Space used for text
1128				 * replication is typically capped as a small %
1129				 * of memory.  To keep it simple for now we
1130				 * don't account for swap and memory space used
1131				 * for text replication.
1132				 */
1133				if (svd->tr_state == SEGVN_TR_OFF &&
1134				    svd->amp != NULL) {
1135					private_amp = svd->amp;
1136					p_start = svd->anon_index;
1137					p_end = svd->anon_index +
1138					    btop(seg->s_size) - 1;
1139				}
1140				SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
1141			}
1142		}
1143		if (svd->vp != NULL) {
1144			file = 1;
1145			shared_object = vmu_find_insert_object(
1146			    vmu_data.vmu_all_vnodes_hash, (caddr_t)svd->vp,
1147			    VMUSAGE_TYPE_VNODE);
1148			s_start = btop(svd->offset);
1149			s_end = btop(svd->offset + seg->s_size) - 1;
1150		}
1151		if (svd->amp != NULL && svd->type == MAP_SHARED) {
1152			ASSERT(shared_object == NULL);
1153			shared_object = vmu_find_insert_object(
1154			    vmu_data.vmu_all_amps_hash, (caddr_t)svd->amp,
1155			    VMUSAGE_TYPE_AMP);
1156			s_start = svd->anon_index;
1157			s_end = svd->anon_index + btop(seg->s_size) - 1;
1158			/* schedctl mappings are always in core */
1159			if (svd->amp->swresv == 0)
1160				incore = B_TRUE;
1161		}
1162	} else if (seg->s_ops == &segspt_shmops) {
1163		shared = B_TRUE;
1164		shmd = (struct shm_data *)seg->s_data;
1165		shared_object = vmu_find_insert_object(
1166		    vmu_data.vmu_all_amps_hash, (caddr_t)shmd->shm_amp,
1167		    VMUSAGE_TYPE_AMP);
1168		s_start = 0;
1169		s_end = btop(seg->s_size) - 1;
1170		sptd = shmd->shm_sptseg->s_data;
1171
1172		/* ism segments are always incore and do not reserve swap */
1173		if (sptd->spt_flags & SHM_SHARE_MMU)
1174			incore = B_TRUE;
1175
1176	} else {
1177		return;
1178	}
1179
1180	/*
1181	 * If there is a private amp, count anon pages that exist.  If an
1182	 * anon has a refcnt > 1 (COW sharing), then save the anon in a
1183	 * hash so that it is not double counted.
1184	 *
1185	 * If there is also a shared object, then figure out the bounds
1186	 * which are not mapped by the private amp.
1187	 */
1188	if (private_amp != NULL) {
1189
1190		/* Enter as writer to prevent COW anons from being freed */
1191		ANON_LOCK_ENTER(&private_amp->a_rwlock, RW_WRITER);
1192
1193		p_index = p_start;
1194		s_index = s_start;
1195
1196		while (p_index <= p_end) {
1197
1198			pgcnt_t p_index_next;
1199			pgcnt_t p_bound_size;
1200			int cnt;
1201			anoff_t off;
1202			struct vnode *vn;
1203			struct anon *ap;
1204			page_t *page;		/* For handling of large */
1205			pgcnt_t pgcnt = 1;	/* pages */
1206			pgcnt_t pgstart;
1207			pgcnt_t pgend;
1208			uint_t pgshft;
1209			pgcnt_t pgmsk;
1210
1211			p_index_next = p_index;
1212			ap = anon_get_next_ptr(private_amp->ahp,
1213			    &p_index_next);
1214
1215			/*
1216			 * If next anon is past end of mapping, simulate
1217			 * end of anon so loop terminates.
1218			 */
1219			if (p_index_next > p_end) {
1220				p_index_next = p_end + 1;
1221				ap = NULL;
1222			}
1223			/*
1224			 * For COW segments, keep track of bounds not
1225			 * backed by private amp so they can be looked
1226			 * up in the backing vnode
1227			 */
1228			if (p_index_next != p_index) {
1229
1230				/*
1231				 * Compute index difference between anon and
1232				 * previous anon.
1233				 */
1234				p_bound_size = p_index_next - p_index - 1;
1235
1236				if (shared_object != NULL) {
1237					cur = vmu_alloc_bound();
1238					cur->vmb_start = s_index;
1239					cur->vmb_end = s_index + p_bound_size;
1240					cur->vmb_type = VMUSAGE_BOUND_UNKNOWN;
1241					if (first == NULL) {
1242						first = cur;
1243						last = cur;
1244					} else {
1245						last->vmb_next = cur;
1246						last = cur;
1247					}
1248				}
1249				p_index = p_index + p_bound_size + 1;
1250				s_index = s_index + p_bound_size + 1;
1251			}
1252
1253			/* Detect end of anons in amp */
1254			if (ap == NULL)
1255				break;
1256
1257			cnt = ap->an_refcnt;
1258			swap_xlate(ap, &vn, &off);
1259
1260			if (vn == NULL || vn->v_pages == NULL ||
1261			    (page = page_exists(vn, off)) == NULL) {
1262				p_index++;
1263				s_index++;
1264				continue;
1265			}
1266
1267			/*
1268			 * If large page is found, compute portion of large
1269			 * page in mapping, and increment indices to the next
1270			 * large page.
1271			 */
1272			if (page->p_szc > 0) {
1273
1274				pgcnt = page_get_pagecnt(page->p_szc);
1275				pgshft = page_get_shift(page->p_szc);
1276				pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1;
1277
1278				/* First page in large page */
1279				pgstart = p_index & ~pgmsk;
1280				/* Last page in large page */
1281				pgend = pgstart + pgcnt - 1;
1282				/*
1283				 * Artificially end page if page extends past
1284				 * end of mapping.
1285				 */
1286				if (pgend > p_end)
1287					pgend = p_end;
1288
1289				/*
1290				 * Compute number of pages from large page
1291				 * which are mapped.
1292				 */
1293				pgcnt = pgend - p_index + 1;
1294
1295				/*
1296				 * Point indices at page after large page,
1297				 * or at page after end of mapping.
1298				 */
1299				p_index += pgcnt;
1300				s_index += pgcnt;
1301			} else {
1302				p_index++;
1303				s_index++;
1304			}
1305
1306			/*
1307			 * Assume anon structs with a refcnt
1308			 * of 1 are not COW shared, so there
1309			 * is no reason to track them per entity.
1310			 */
1311			if (cnt == 1) {
1312				panon += pgcnt;
1313				continue;
1314			}
1315			for (entity = vmu_entities; entity != NULL;
1316			    entity = entity->vme_next_calc) {
1317
1318				result = &entity->vme_result;
1319				/*
1320				 * Track COW anons per entity so
1321				 * they are not double counted.
1322				 */
1323				if (vmu_find_insert_anon(entity->vme_anon_hash,
1324				    (caddr_t)ap) == 0)
1325					continue;
1326
1327				result->vmu_rss_all += (pgcnt << PAGESHIFT);
1328				result->vmu_rss_private +=
1329				    (pgcnt << PAGESHIFT);
1330			}
1331		}
1332		ANON_LOCK_EXIT(&private_amp->a_rwlock);
1333	}
1334
1335	/* Add up resident anon and swap reserved for private mappings */
1336	if (swresv > 0 || panon > 0) {
1337		for (entity = vmu_entities; entity != NULL;
1338		    entity = entity->vme_next_calc) {
1339			result = &entity->vme_result;
1340			result->vmu_swap_all += swresv;
1341			result->vmu_swap_private += swresv;
1342			result->vmu_rss_all += (panon << PAGESHIFT);
1343			result->vmu_rss_private += (panon << PAGESHIFT);
1344		}
1345	}
1346
1347	/* Compute resident pages backing shared amp or named vnode */
1348	if (shared_object != NULL) {
1349		avl_tree_t *tree = &(shared_object->vmo_bounds);
1350
1351		if (first == NULL) {
1352			/*
1353			 * No private amp, or private amp has no anon
1354			 * structs.  This means entire segment is backed by
1355			 * the shared object.
1356			 */
1357			first = vmu_alloc_bound();
1358			first->vmb_start = s_start;
1359			first->vmb_end = s_end;
1360			first->vmb_type = VMUSAGE_BOUND_UNKNOWN;
1361		}
1362		/*
1363		 * Iterate bounds not backed by private amp, and compute
1364		 * resident pages.
1365		 */
1366		cur = first;
1367		while (cur != NULL) {
1368
1369			if (vmu_insert_lookup_object_bounds(shared_object,
1370			    cur->vmb_start, cur->vmb_end, VMUSAGE_BOUND_UNKNOWN,
1371			    &first, &last) > 0) {
1372				/* new bounds, find incore/not-incore */
1373				if (shared_object->vmo_type ==
1374				    VMUSAGE_TYPE_VNODE) {
1375					vmu_vnode_update_incore_bounds(
1376					    tree,
1377					    (vnode_t *)
1378					    shared_object->vmo_key, &first,
1379					    &last);
1380				} else {
1381					vmu_amp_update_incore_bounds(
1382					    tree,
1383					    (struct anon_map *)
1384					    shared_object->vmo_key, &first,
1385					    &last, incore);
1386				}
1387				vmu_merge_bounds(tree, &first, &last);
1388			}
1389			for (entity = vmu_entities; entity != NULL;
1390			    entity = entity->vme_next_calc) {
1391				avl_tree_t *e_tree;
1392
1393				result = &entity->vme_result;
1394
1395				entity_object = vmu_find_insert_object(
1396				    shared_object->vmo_type ==
1397				    VMUSAGE_TYPE_VNODE ? entity->vme_vnode_hash:
1398				    entity->vme_amp_hash,
1399				    shared_object->vmo_key,
1400				    shared_object->vmo_type);
1401
1402				virt = vmu_insert_lookup_object_bounds(
1403				    entity_object, cur->vmb_start, cur->vmb_end,
1404				    VMUSAGE_BOUND_UNKNOWN, &e_first, &e_last);
1405
1406				if (virt == 0)
1407					continue;
1408				/*
1409				 * Range visited for this entity
1410				 */
1411				e_tree = &(entity_object->vmo_bounds);
1412				rss = vmu_update_bounds(e_tree, &e_first,
1413				    &e_last, tree, first, last);
1414				result->vmu_rss_all += (rss << PAGESHIFT);
1415				if (shared == B_TRUE && file == B_FALSE) {
1416					/* shared anon mapping */
1417					result->vmu_swap_all +=
1418					    (virt << PAGESHIFT);
1419					result->vmu_swap_shared +=
1420					    (virt << PAGESHIFT);
1421					result->vmu_rss_shared +=
1422					    (rss << PAGESHIFT);
1423				} else if (shared == B_TRUE && file == B_TRUE) {
1424					/* shared file mapping */
1425					result->vmu_rss_shared +=
1426					    (rss << PAGESHIFT);
1427				} else if (shared == B_FALSE &&
1428				    file == B_TRUE) {
1429					/* private file mapping */
1430					result->vmu_rss_private +=
1431					    (rss << PAGESHIFT);
1432				}
1433				vmu_merge_bounds(e_tree, &e_first, &e_last);
1434			}
1435			tmp = cur;
1436			cur = cur->vmb_next;
1437			vmu_free_bound(tmp);
1438		}
1439	}
1440}
1441
1442/*
1443 * Based on the current calculation flags, find the entities which are
1444 * relevant to the process.  Then calculate each segment in the process's
1445 * address space for each relevant entity.
1446 */
1447static void
1448vmu_calculate_proc(proc_t *p)
1449{
1450	vmu_entity_t *entities = NULL;
1451	vmu_zone_t *zone;
1452	vmu_entity_t *tmp;
1453	struct as *as;
1454	struct seg *seg;
1455	int ret;
1456
1457	/* Figure out which entities are being computed */
1458	if ((vmu_data.vmu_system) != NULL) {
1459		tmp = vmu_data.vmu_system;
1460		tmp->vme_next_calc = entities;
1461		entities = tmp;
1462	}
1463	if (vmu_data.vmu_calc_flags &
1464	    (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS |
1465	    VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
1466	    VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
1467	    VMUSAGE_ALL_EUSERS)) {
1468		ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
1469		    (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1470		    (mod_hash_val_t *)&zone);
1471		if (ret != 0) {
1472			zone = vmu_alloc_zone(p->p_zone->zone_id);
1473			ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash,
1474			    (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1475			    (mod_hash_val_t)zone, (mod_hash_hndl_t)0);
1476			ASSERT(ret == 0);
1477		}
1478		if (zone->vmz_zone != NULL) {
1479			tmp = zone->vmz_zone;
1480			tmp->vme_next_calc = entities;
1481			entities = tmp;
1482		}
1483		if (vmu_data.vmu_calc_flags &
1484		    (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) {
1485			tmp = vmu_find_insert_entity(zone->vmz_projects_hash,
1486			    p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS,
1487			    zone->vmz_id);
1488			tmp->vme_next_calc = entities;
1489			entities = tmp;
1490		}
1491		if (vmu_data.vmu_calc_flags &
1492		    (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) {
1493			tmp = vmu_find_insert_entity(zone->vmz_tasks_hash,
1494			    p->p_task->tk_tkid, VMUSAGE_TASKS, zone->vmz_id);
1495			tmp->vme_next_calc = entities;
1496			entities = tmp;
1497		}
1498		if (vmu_data.vmu_calc_flags &
1499		    (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) {
1500			tmp = vmu_find_insert_entity(zone->vmz_rusers_hash,
1501			    crgetruid(p->p_cred), VMUSAGE_RUSERS, zone->vmz_id);
1502			tmp->vme_next_calc = entities;
1503			entities = tmp;
1504		}
1505		if (vmu_data.vmu_calc_flags &
1506		    (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) {
1507			tmp = vmu_find_insert_entity(zone->vmz_eusers_hash,
1508			    crgetuid(p->p_cred), VMUSAGE_EUSERS, zone->vmz_id);
1509			tmp->vme_next_calc = entities;
1510			entities = tmp;
1511		}
1512	}
1513	/* Entities which collapse projects and users for all zones */
1514	if (vmu_data.vmu_calc_flags & VMUSAGE_COL_PROJECTS) {
1515		tmp = vmu_find_insert_entity(vmu_data.vmu_projects_col_hash,
1516		    p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, ALL_ZONES);
1517		tmp->vme_next_calc = entities;
1518		entities = tmp;
1519	}
1520	if (vmu_data.vmu_calc_flags & VMUSAGE_COL_RUSERS) {
1521		tmp = vmu_find_insert_entity(vmu_data.vmu_rusers_col_hash,
1522		    crgetruid(p->p_cred), VMUSAGE_RUSERS, ALL_ZONES);
1523		tmp->vme_next_calc = entities;
1524		entities = tmp;
1525	}
1526	if (vmu_data.vmu_calc_flags & VMUSAGE_COL_EUSERS) {
1527		tmp = vmu_find_insert_entity(vmu_data.vmu_eusers_col_hash,
1528		    crgetuid(p->p_cred), VMUSAGE_EUSERS, ALL_ZONES);
1529		tmp->vme_next_calc = entities;
1530		entities = tmp;
1531	}
1532
1533	ASSERT(entities != NULL);
1534	/* process all segs in process's address space */
1535	as = p->p_as;
1536	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1537	for (seg = AS_SEGFIRST(as); seg != NULL;
1538	    seg = AS_SEGNEXT(as, seg)) {
1539		vmu_calculate_seg(entities, seg);
1540	}
1541	AS_LOCK_EXIT(as, &as->a_lock);
1542}
1543
1544/*
1545 * Free data created by previous call to vmu_calculate().
1546 */
1547static void
1548vmu_clear_calc()
1549{
1550	if (vmu_data.vmu_system != NULL)
1551		vmu_free_entity(vmu_data.vmu_system);
1552	vmu_data.vmu_system = NULL;
1553	if (vmu_data.vmu_zones_hash != NULL)
1554		i_mod_hash_clear_nosync(vmu_data.vmu_zones_hash);
1555	if (vmu_data.vmu_projects_col_hash != NULL)
1556		i_mod_hash_clear_nosync(vmu_data.vmu_projects_col_hash);
1557	if (vmu_data.vmu_rusers_col_hash != NULL)
1558		i_mod_hash_clear_nosync(vmu_data.vmu_rusers_col_hash);
1559	if (vmu_data.vmu_eusers_col_hash != NULL)
1560		i_mod_hash_clear_nosync(vmu_data.vmu_eusers_col_hash);
1561
1562	i_mod_hash_clear_nosync(vmu_data.vmu_all_vnodes_hash);
1563	i_mod_hash_clear_nosync(vmu_data.vmu_all_amps_hash);
1564}
1565
1566/*
1567 * Free unused data structures.  These can result if the system workload
1568 * decreases between calculations.
1569 */
1570static void
1571vmu_free_extra()
1572{
1573	vmu_bound_t *tb;
1574	vmu_object_t *to;
1575	vmu_entity_t *te;
1576	vmu_zone_t *tz;
1577
1578	while (vmu_data.vmu_free_bounds != NULL) {
1579		tb = vmu_data.vmu_free_bounds;
1580		vmu_data.vmu_free_bounds = vmu_data.vmu_free_bounds->vmb_next;
1581		kmem_cache_free(vmu_bound_cache, tb);
1582	}
1583	while (vmu_data.vmu_free_objects != NULL) {
1584		to = vmu_data.vmu_free_objects;
1585		vmu_data.vmu_free_objects =
1586		    vmu_data.vmu_free_objects->vmo_next;
1587		kmem_cache_free(vmu_object_cache, to);
1588	}
1589	while (vmu_data.vmu_free_entities != NULL) {
1590		te = vmu_data.vmu_free_entities;
1591		vmu_data.vmu_free_entities =
1592		    vmu_data.vmu_free_entities->vme_next;
1593		if (te->vme_vnode_hash != NULL)
1594			mod_hash_destroy_hash(te->vme_vnode_hash);
1595		if (te->vme_amp_hash != NULL)
1596			mod_hash_destroy_hash(te->vme_amp_hash);
1597		if (te->vme_anon_hash != NULL)
1598			mod_hash_destroy_hash(te->vme_anon_hash);
1599		kmem_free(te, sizeof (vmu_entity_t));
1600	}
1601	while (vmu_data.vmu_free_zones != NULL) {
1602		tz = vmu_data.vmu_free_zones;
1603		vmu_data.vmu_free_zones =
1604		    vmu_data.vmu_free_zones->vmz_next;
1605		if (tz->vmz_projects_hash != NULL)
1606			mod_hash_destroy_hash(tz->vmz_projects_hash);
1607		if (tz->vmz_tasks_hash != NULL)
1608			mod_hash_destroy_hash(tz->vmz_tasks_hash);
1609		if (tz->vmz_rusers_hash != NULL)
1610			mod_hash_destroy_hash(tz->vmz_rusers_hash);
1611		if (tz->vmz_eusers_hash != NULL)
1612			mod_hash_destroy_hash(tz->vmz_eusers_hash);
1613		kmem_free(tz, sizeof (vmu_zone_t));
1614	}
1615}
1616
1617extern kcondvar_t *pr_pid_cv;
1618
1619/*
1620 * Determine which entity types are relevant and allocate the hashes to
1621 * track them.  Then walk the process table and count rss and swap
1622 * for each process's address space.  Address space objects such as
1623 * vnodes, amps, and anons are tracked per entity, so that they are
1624 * not double counted in the results.
1625 *
1626 */
1627static void
1628vmu_calculate()
1629{
1630	int i = 0;
1631	int ret;
1632	proc_t *p;
1633
1634	vmu_clear_calc();
1635
1636	if (vmu_data.vmu_calc_flags & VMUSAGE_SYSTEM)
1637		vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM,
1638		    ALL_ZONES);
1639
1640	/*
1641	 * Walk process table and calculate rss of each proc.
1642	 *
1643	 * Pidlock and p_lock cannot be held while doing the rss calculation.
1644	 * This is because:
1645	 *	1.  The calculation allocates using KM_SLEEP.
1646	 *	2.  The calculation grabs a_lock, which cannot be grabbed
1647	 *	    after p_lock.
1648	 *
1649	 * Since pidlock must be dropped, we cannot simply just walk the
1650	 * practive list.  Instead, we walk the process table, and sprlock
1651	 * each process to ensure that it does not exit during the
1652	 * calculation.
1653	 */
1654
1655	mutex_enter(&pidlock);
1656	for (i = 0; i < v.v_proc; i++) {
1657again:
1658		p = pid_entry(i);
1659		if (p == NULL)
1660			continue;
1661
1662		mutex_enter(&p->p_lock);
1663		mutex_exit(&pidlock);
1664
1665		if (panicstr) {
1666			mutex_exit(&p->p_lock);
1667			return;
1668		}
1669
1670		/* Try to set P_PR_LOCK */
1671		ret = sprtrylock_proc(p);
1672		if (ret == -1) {
1673			/* Process in invalid state */
1674			mutex_exit(&p->p_lock);
1675			mutex_enter(&pidlock);
1676			continue;
1677		} else if (ret == 1) {
1678			/*
1679			 * P_PR_LOCK is already set.  Wait and try again.
1680			 * This also drops p_lock.
1681			 */
1682			sprwaitlock_proc(p);
1683			mutex_enter(&pidlock);
1684			goto again;
1685		}
1686		mutex_exit(&p->p_lock);
1687
1688		vmu_calculate_proc(p);
1689
1690		mutex_enter(&p->p_lock);
1691		sprunlock(p);
1692		mutex_enter(&pidlock);
1693	}
1694	mutex_exit(&pidlock);
1695
1696	vmu_free_extra();
1697}
1698
1699/*
1700 * allocate a new cache for N results satisfying flags
1701 */
1702vmu_cache_t *
1703vmu_cache_alloc(size_t nres, uint_t flags)
1704{
1705	vmu_cache_t *cache;
1706
1707	cache = kmem_zalloc(sizeof (vmu_cache_t), KM_SLEEP);
1708	cache->vmc_results = kmem_zalloc(sizeof (vmusage_t) * nres, KM_SLEEP);
1709	cache->vmc_nresults = nres;
1710	cache->vmc_flags = flags;
1711	cache->vmc_refcnt = 1;
1712	return (cache);
1713}
1714
1715/*
1716 * Make sure cached results are not freed
1717 */
1718static void
1719vmu_cache_hold(vmu_cache_t *cache)
1720{
1721	ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
1722	cache->vmc_refcnt++;
1723}
1724
1725/*
1726 * free cache data
1727 */
1728static void
1729vmu_cache_rele(vmu_cache_t *cache)
1730{
1731	ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
1732	ASSERT(cache->vmc_refcnt > 0);
1733	cache->vmc_refcnt--;
1734	if (cache->vmc_refcnt == 0) {
1735		kmem_free(cache->vmc_results, sizeof (vmusage_t) *
1736		    cache->vmc_nresults);
1737		kmem_free(cache, sizeof (vmu_cache_t));
1738	}
1739}
1740
1741/*
1742 * Copy out the cached results to a caller.  Inspect the caller's flags
1743 * and zone to determine which cached results should be copied.
1744 */
1745static int
1746vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
1747    uint_t flags, int cpflg)
1748{
1749	vmusage_t *result, *out_result;
1750	vmusage_t dummy;
1751	size_t i, count = 0;
1752	size_t bufsize;
1753	int ret = 0;
1754	uint_t types = 0;
1755
1756	if (nres != NULL) {
1757		if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
1758			return (set_errno(EFAULT));
1759	} else {
1760		bufsize = 0;
1761	}
1762
1763	/* figure out what results the caller is interested in. */
1764	if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
1765		types |= VMUSAGE_SYSTEM;
1766	if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES))
1767		types |= VMUSAGE_ZONE;
1768	if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
1769	    VMUSAGE_COL_PROJECTS))
1770		types |= VMUSAGE_PROJECTS;
1771	if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
1772		types |= VMUSAGE_TASKS;
1773	if (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS))
1774		types |= VMUSAGE_RUSERS;
1775	if (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS))
1776		types |= VMUSAGE_EUSERS;

	/* count results for current zone */
	out_result = buf;
	for (result = cache->vmc_results, i = 0;
	    i < cache->vmc_nresults; result++, i++) {

		/* Do not return "other-zone" results to non-global zones */
		if (curproc->p_zone != global_zone &&
		    curproc->p_zone->zone_id != result->vmu_zoneid)
			continue;

		/*
		 * If non-global zone requests VMUSAGE_SYSTEM, fake
		 * up VMUSAGE_ZONE result as VMUSAGE_SYSTEM result.
		 */
		if (curproc->p_zone != global_zone &&
		    (flags & VMUSAGE_SYSTEM) != 0 &&
		    result->vmu_type == VMUSAGE_ZONE) {
			count++;
			if (out_result != NULL) {
				if (bufsize < count) {
					ret = set_errno(EOVERFLOW);
				} else {
					dummy = *result;
					dummy.vmu_zoneid = ALL_ZONES;
					dummy.vmu_id = 0;
					dummy.vmu_type = VMUSAGE_SYSTEM;
					if (ddi_copyout(&dummy, out_result,
					    sizeof (vmusage_t), cpflg))
						return (set_errno(EFAULT));
					out_result++;
				}
			}
		}

		/* Skip results that do not match requested type */
		if ((result->vmu_type & types) == 0)
			continue;

		/* Skip collated results if not requested */
		if (result->vmu_zoneid == ALL_ZONES) {
			if (result->vmu_type == VMUSAGE_PROJECTS &&
			    (flags & VMUSAGE_COL_PROJECTS) == 0)
				continue;
			if (result->vmu_type == VMUSAGE_EUSERS &&
			    (flags & VMUSAGE_COL_EUSERS) == 0)
				continue;
			if (result->vmu_type == VMUSAGE_RUSERS &&
			    (flags & VMUSAGE_COL_RUSERS) == 0)
				continue;
		}

		/* Skip "other zone" results if not requested */
		if (result->vmu_zoneid != curproc->p_zone->zone_id) {
			if (result->vmu_type == VMUSAGE_ZONE &&
			    (flags & VMUSAGE_ALL_ZONES) == 0)
				continue;
			if (result->vmu_type == VMUSAGE_PROJECTS &&
			    (flags & (VMUSAGE_ALL_PROJECTS |
			    VMUSAGE_COL_PROJECTS)) == 0)
				continue;
			if (result->vmu_type == VMUSAGE_TASKS &&
			    (flags & VMUSAGE_ALL_TASKS) == 0)
				continue;
			if (result->vmu_type == VMUSAGE_RUSERS &&
			    (flags & (VMUSAGE_ALL_RUSERS |
			    VMUSAGE_COL_RUSERS)) == 0)
				continue;
			if (result->vmu_type == VMUSAGE_EUSERS &&
			    (flags & (VMUSAGE_ALL_EUSERS |
			    VMUSAGE_COL_EUSERS)) == 0)
				continue;
		}
		count++;
		if (out_result != NULL) {
			if (bufsize < count) {
				ret = set_errno(EOVERFLOW);
			} else {
				if (ddi_copyout(result, out_result,
				    sizeof (vmusage_t), cpflg))
					return (set_errno(EFAULT));
				out_result++;
			}
		}
	}
	if (nres != NULL)
		if (ddi_copyout(&count, (void *)nres, sizeof (size_t), cpflg))
			return (set_errno(EFAULT));

	return (ret);
}

/*
 * vm_getusage()
 *
 * Counts rss and swap by zone, project, task, and/or user.  The flags argument
 * determines the type of results structures returned.  Flags requesting
 * results from more than one zone are "flattened" to the local zone if the
 * caller is not the global zone.
 *
 * args:
 *	flags:	bitmap consisting of one or more of VMUSAGE_*.
 *	age:	maximum allowable age (time since counting was done) in
 *		seconds of the results.  Results from previous callers are
 *		cached in the kernel.
 *	buf:	pointer to buffer array of vmusage_t.  If NULL, then only nres
 *		is set on success.
 *	nres:	Set to the number of vmusage_t structures pointed to by buf
 *		before calling vm_getusage().
 *		On a return of 0 (success) or EOVERFLOW, it is set to the
 *		number of result structures returned, or needed to return
 *		them all.
 *
 * returns 0 on success, -1 on failure:
 *	EINTR (interrupted)
 *	EOVERFLOW (nres too small for results; nres set to the value needed
 *		for success)
 *	EINVAL (flags invalid)
 *	EFAULT (bad address for buf or nres)
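 *
 * A minimal, illustrative userland sketch of one possible call sequence via
 * the getvmusage() library wrapper (size first, then fetch; error handling
 * omitted):
 *
 *	#include <sys/vm_usage.h>
 *
 *	size_t nres = 0;
 *	vmusage_t *results;
 *
 *	(void) getvmusage(VMUSAGE_ZONE, 60, NULL, &nres);
 *	results = malloc(nres * sizeof (vmusage_t));
 *	if (getvmusage(VMUSAGE_ZONE, 60, results, &nres) != 0)
 *		... EOVERFLOW means nres grew; reallocate and retry ...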
 */
int
vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
{
	vmu_entity_t *entity;
	vmusage_t *result;
	int ret = 0;
	int cacherecent = 0;
	hrtime_t now;
	uint_t flags_orig;

	/*
	 * Non-global zones cannot request system-wide (all-zones) results,
	 * collated results, or the VMUSAGE_SYSTEM result, so munge the
	 * flags accordingly.
	 */
	flags_orig = flags;
	if (curproc->p_zone != global_zone) {
		if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) {
			flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS);
			flags |= VMUSAGE_PROJECTS;
		}
		if (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) {
			flags &= ~(VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS);
			flags |= VMUSAGE_RUSERS;
		}
		if (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) {
			flags &= ~(VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS);
			flags |= VMUSAGE_EUSERS;
		}
		if (flags & VMUSAGE_SYSTEM) {
			flags &= ~VMUSAGE_SYSTEM;
			flags |= VMUSAGE_ZONE;
		}
	}
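
	/*
	 * For example (illustrative only): a non-global-zone caller that
	 * passed VMUSAGE_ALL_PROJECTS | VMUSAGE_SYSTEM now proceeds as if
	 * it had passed VMUSAGE_PROJECTS | VMUSAGE_ZONE.
	 */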

	/* Check for unknown flags */
	if ((flags & (~VMUSAGE_MASK)) != 0)
		return (set_errno(EINVAL));

	/* Check for no flags */
	if ((flags & VMUSAGE_MASK) == 0)
		return (set_errno(EINVAL));

	mutex_enter(&vmu_data.vmu_lock);
	now = gethrtime();

start:
	if (vmu_data.vmu_cache != NULL) {

		vmu_cache_t *cache;

		if ((vmu_data.vmu_cache->vmc_timestamp +
		    ((hrtime_t)age * NANOSEC)) > now)
			cacherecent = 1;

		if ((vmu_data.vmu_cache->vmc_flags & flags) == flags &&
		    cacherecent == 1) {
			cache = vmu_data.vmu_cache;
			vmu_cache_hold(cache);
			mutex_exit(&vmu_data.vmu_lock);

			ret = vmu_copyout_results(cache, buf, nres, flags_orig,
			    cpflg);
			mutex_enter(&vmu_data.vmu_lock);
			vmu_cache_rele(cache);
			if (vmu_data.vmu_pending_waiters > 0)
				cv_broadcast(&vmu_data.vmu_cv);
			mutex_exit(&vmu_data.vmu_lock);
			return (ret);
		}
		/*
		 * If the cache is recent, it is likely that there are other
		 * consumers of vm_getusage running, so add their flags to the
		 * desired flags for the calculation.
		 */
		if (cacherecent == 1)
			flags = vmu_data.vmu_cache->vmc_flags | flags;
	}
	if (vmu_data.vmu_calc_thread == NULL) {

		vmu_cache_t *cache;

		vmu_data.vmu_calc_thread = curthread;
		vmu_data.vmu_calc_flags = flags;
		vmu_data.vmu_entities = NULL;
		vmu_data.vmu_nentities = 0;
		if (vmu_data.vmu_pending_waiters > 0)
			vmu_data.vmu_calc_flags |=
			    vmu_data.vmu_pending_flags;

		vmu_data.vmu_pending_flags = 0;
		mutex_exit(&vmu_data.vmu_lock);
		vmu_calculate();
		mutex_enter(&vmu_data.vmu_lock);
		/* copy results to cache */
		if (vmu_data.vmu_cache != NULL)
			vmu_cache_rele(vmu_data.vmu_cache);
		cache = vmu_data.vmu_cache =
		    vmu_cache_alloc(vmu_data.vmu_nentities,
		    vmu_data.vmu_calc_flags);

		result = cache->vmc_results;
		for (entity = vmu_data.vmu_entities; entity != NULL;
		    entity = entity->vme_next) {
			*result = entity->vme_result;
			result++;
		}
		cache->vmc_timestamp = gethrtime();
		vmu_cache_hold(cache);

		vmu_data.vmu_calc_flags = 0;
		vmu_data.vmu_calc_thread = NULL;

		if (vmu_data.vmu_pending_waiters > 0)
			cv_broadcast(&vmu_data.vmu_cv);

		mutex_exit(&vmu_data.vmu_lock);

		/* copy cache */
		ret = vmu_copyout_results(cache, buf, nres, flags_orig, cpflg);
		mutex_enter(&vmu_data.vmu_lock);
		vmu_cache_rele(cache);
		mutex_exit(&vmu_data.vmu_lock);

		return (ret);
	}
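
	/*
	 * Another thread is performing the calculation.  Add this caller's
	 * flags to the pending set so the next calculation includes them,
	 * then wait for the current pass to finish and retry from the top
	 * (the newly built cache may already satisfy this request).
	 */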
	vmu_data.vmu_pending_flags |= flags;
	vmu_data.vmu_pending_waiters++;
	while (vmu_data.vmu_calc_thread != NULL) {
		if (cv_wait_sig(&vmu_data.vmu_cv,
		    &vmu_data.vmu_lock) == 0) {
			vmu_data.vmu_pending_waiters--;
			mutex_exit(&vmu_data.vmu_lock);
			return (set_errno(EINTR));
		}
	}
	vmu_data.vmu_pending_waiters--;
	goto start;
}