rctl.c revision 7240:c4957ab6a78e
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28#include <sys/atomic.h>
29#include <sys/cmn_err.h>
30#include <sys/id_space.h>
31#include <sys/kmem.h>
32#include <sys/kstat.h>
33#include <sys/log.h>
34#include <sys/modctl.h>
35#include <sys/modhash.h>
36#include <sys/mutex.h>
37#include <sys/proc.h>
38#include <sys/procset.h>
39#include <sys/project.h>
40#include <sys/resource.h>
41#include <sys/rctl.h>
42#include <sys/siginfo.h>
43#include <sys/strlog.h>
44#include <sys/systm.h>
45#include <sys/task.h>
46#include <sys/types.h>
47#include <sys/policy.h>
48#include <sys/zone.h>
49
50/*
51 * Resource controls (rctls)
52 *
53 *   The rctl subsystem provides a mechanism for kernel components to
54 *   register their individual resource controls with the system as a whole,
55 *   such that those controls can subscribe to specific actions while being
56 *   associated with the various process-model entities provided by the kernel:
57 *   the process, the task, the project, and the zone.  (In principle, only
58 *   minor modifications would be required to connect the resource control
59 *   functionality to non-process-model entities associated with the system.)
60 *
61 *   Subsystems register their rctls via rctl_register().  Subsystems
62 *   also wishing to provide additional limits on a given rctl can modify
63 *   them once they have the rctl handle.  Each subsystem should store the
64 *   handle to their rctl for direct access.
65 *
66 *   A primary dictionary, rctl_dict, contains a hash of id to the default
67 *   control definition for each controlled resource-entity pair on the system.
68 *   A secondary dictionary, rctl_dict_by_name, contains a hash of name to
69 *   resource control handles.  The resource control handles are distributed by
70 *   the rctl_ids ID space.  The handles are private and not to be
71 *   advertised to userland; all userland interactions are via the rctl
72 *   names.
73 *
74 *   Entities inherit their rctls from their predecessor.  Since projects have
75 *   no ancestor, they inherit their rctls from the rctl dict for project
76 *   rctls.  It is expected that project controls will be set to their
77 *   appropriate values shortly after project creation, presumably from a
78 *   policy source such as the project database.
79 *
80 * Data structures
81 *   The rctl_set_t attached to each of the process model entities is a simple
82 *   hash table keyed on the rctl handle assigned at registration.  The entries
83 *   in the hash table are rctl_t's, whose relationship with the active control
84 *   values on that resource and with the global state of the resource we
85 *   illustrate below:
86 *
87 *   rctl_dict[key] --> rctl_dict_entry
88 *			   ^
89 *			   |
90 *			+--+---+
91 *   rctl_set[key] ---> | rctl | --> value <-> value <-> system value --> NULL
92 *			+--+---+		 ^
93 *			   |			 |
94 *			   +------- cursor ------+
95 *
96 *   That is, the rctl contains a back pointer to the global resource control
97 *   state for this resource, which is also available in the rctl_dict hash
98 *   table mentioned earlier.  The rctl contains two pointers to resource
99 *   control values:  one, values, indicates the entire sequence of control
100 *   values; the other, cursor, indicates the currently active control
101 *   value--the next value to be enforced.  The value list itself is an open,
102 *   doubly-linked list, the last non-NULL member of which is the system value
103 *   for that resource (being the theoretical/conventional maximum allowable
104 *   value for the resource on this OS instance).
105 *
106 * Ops Vector
107 *   Subsystems publishing rctls need not provide instances of all of the
108 *   functions specified by the ops vector.  In particular, if general
109 *   rctl_*() entry points are not being called, certain functions can be
110 *   omitted.  These align as follows:
111 *
112 *   rctl_set()
113 *     You may wish to provide a set callback if locking circumstances prevent
114 *     it or if the performance cost of requesting the enforced value from the
115 *     resource control is prohibitively expensive.  For instance, the currently
116 *     enforced file size limit is stored on the process in the p_fsz_ctl to
117 *     maintain read()/write() performance.
118 *
119 *   rctl_test()
120 *     You must provide a test callback if you are using the rctl_test()
121 *     interface.  An action callback is optional.
122 *
123 *   rctl_action()
124 *     You may wish to provide an action callback.
125 *
126 * Registration
127 *   New resource controls can be added to a running instance by loaded modules
128 *   via registration.  (The current implementation does not support unloadable
129 *   modules; this functionality can be added if needed, via an
130 *   activation/deactivation interface involving the manipulation of the
131 *   ops vector for the resource control(s) needing to support unloading.)
132 *
133 * Control value ordering
134 *   Because the rctl_val chain on each rctl must be navigable in a
135 *   deterministic way, we have to define an ordering on the rctl_val_t's.  The
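 *   defined order is (flags & [maximal], value, flags & [deny-action],
 *   privilege, action recipient pid); the recipient pid is ignored when an
 *   imprecise comparison is requested (see rctl_val_cmp()).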
138 *
139 * Locking
 *   rctl_dict_lock must be acquired prior to rctl_lists_lock.  Since
 *   rctl_dict_lock or rctl_lists_lock can be acquired at the enforcement point
 *   of any subsystem, while subsystem locks are held, it is at all times
 *   inappropriate to call kmem_alloc(., KM_SLEEP) while holding either of
 *   these locks.  Traversing any of the various resource control entity lists
 *   requires holding rctl_lists_lock.
146 *
147 *   Each individual resource control set associated with an entity must have
148 *   its rcs_lock held for the duration of any operations that would add
149 *   resource controls or control values to the set.
150 *
151 *   The locking subsequence of interest is: p_lock, rctl_dict_lock,
152 *   rctl_lists_lock, entity->rcs_lock.
153 *
154 * The projects(4) database and project entity resource controls
155 *   A special case is made for RCENTITY_PROJECT values set through the
156 *   setproject(3PROJECT) interface.  setproject() makes use of a private
157 *   interface, setprojrctl(), which passes through an array of resource control
158 *   blocks that need to be set while holding the entity->rcs_lock.  This
159 *   ensures that the act of modifying a project's resource controls is
160 *   "atomic" within the kernel.
161 *
162 *   Within the rctl sub-system, we provide two interfaces that are only used by
163 *   the setprojrctl() code path - rctl_local_insert_all() and
164 *   rctl_local_replace_all().  rctl_local_insert_all() will ensure that the
165 *   resource values specified in *new_values are applied.
166 *   rctl_local_replace_all() will purge the current rctl->rc_projdb and
167 *   rctl->rc_values entries, and apply the *new_values.
168 *
169 *   These functions modify not only the linked list of active resource controls
170 *   (rctl->rc_values), but also a "cached" linked list (rctl->rc_projdb) of
171 *   values set through these interfaces.  To clarify:
172 *
173 *      rctl->rc_values - a linked list of rctl_val_t.  These are the active
174 *      resource values associated with this rctl, and may have been set by
175 *      setrctl() - via prctl(1M), or by setprojrctl() - via
176 *      setproject(3PROJECT).
177 *
178 *      rctl->rc_projdb - a linked list of rctl_val_t.  These reflect the
179 *      resource values set by the setprojrctl() code path.  rc_projdb is not
180 *      referenced by any other component of the rctl sub-system.
181 *
182 *   As various locks are held when calling these functions, we ensure that all
183 *   the possible memory allocations are performed prior to calling the
184 *   function.  *alloc_values is a linked list of uninitialized rctl_val_t,
185 *   which may be used to duplicate a new resource control value (passed in as
186 *   one of the members of the *new_values linked list), in order to populate
187 *   rctl->rc_values.
188 */
189
/*
 * Global state of the rctl subsystem.
 */
/* presumably the upper bound of the rctl_ids handle space -- TODO confirm */
id_t max_rctl_hndl = 32768;
/* modulus applied to the key by rctl_dict_hash_by_id() */
int rctl_dict_size = 64;
/* number of hash chains in each rctl_set_t (rcs_ctls[] length) */
int rctl_set_size = 8;
/* held around lookups in rctl_dict_by_name (see rctl_dict_lookup()) */
kmutex_t rctl_dict_lock;
mod_hash_t *rctl_dict;
mod_hash_t *rctl_dict_by_name;
id_space_t *rctl_ids;
kmem_cache_t *rctl_cache;	/* kmem cache for rctl structures */
kmem_cache_t *rctl_val_cache;	/* kmem cache for rctl values */

/* rctl_lists_lock is held while walking any of the rctl_lists chains */
kmutex_t rctl_lists_lock;
/* per-entity chains of dictionary entries, linked through rcd_next */
rctl_dict_entry_t *rctl_lists[RC_MAX_ENTITY + 1];
202
203/*
204 * Default resource control operations and ops vector
205 *   To be used if the particular rcontrol has no specific actions defined, or
206 *   if the subsystem providing the control is quiescing (in preparation for
207 *   unloading, presumably.)
208 *
209 *   Resource controls with callbacks should fill the unused operations with the
210 *   appropriate default impotent callback.
211 */
/*
 * Default action callback:  ignores all of its arguments and does nothing.
 */
/*ARGSUSED*/
void
rcop_no_action(struct rctl *r, struct proc *p, rctl_entity_p_t *e)
{
}
217
/*
 * Default usage callback:  ignores its arguments and always reports a
 * usage of zero.
 */
/*ARGSUSED*/
rctl_qty_t
rcop_no_usage(struct rctl *r, struct proc *p)
{
	return (0);
}
224
/*
 * Default set callback:  ignores its arguments, records nothing, and always
 * returns 0.
 */
/*ARGSUSED*/
int
rcop_no_set(struct rctl *r, struct proc *p, rctl_entity_p_t *e, rctl_qty_t l)
{
	return (0);
}
231
/*
 * Default test callback:  ignores its arguments and always returns 0.
 * (Compare rcop_absolute_test(), which returns non-zero when the increment
 * exceeds the control value.)
 */
/*ARGSUSED*/
int
rcop_no_test(struct rctl *r, struct proc *p, rctl_entity_p_t *e,
    struct rctl_val *rv, rctl_qty_t i, uint_t f)
{
	return (0);
}
239
/*
 * Ops vector built entirely from the do-nothing callbacks above.  The
 * initializer is positional; presumably the slots are action, usage, set
 * and test, in that order -- confirm against the rctl_ops_t definition.
 */
rctl_ops_t rctl_default_ops = {
	rcop_no_action,
	rcop_no_usage,
	rcop_no_set,
	rcop_no_test
};
246
247/*
248 * Default "absolute" resource control operation and ops vector
249 *   Useful if there is no usage associated with the
250 *   resource control.
251 */
252/*ARGSUSED*/
253int
254rcop_absolute_test(struct rctl *r, struct proc *p, rctl_entity_p_t *e,
255    struct rctl_val *rv, rctl_qty_t i, uint_t f)
256{
257	return (i > rv->rcv_value);
258}
259
/*
 * Ops vector identical to rctl_default_ops except for its last slot, which
 * uses rcop_absolute_test().  The initializer is positional; presumably the
 * slots are action, usage, set and test -- confirm against rctl_ops_t.
 */
rctl_ops_t rctl_absolute_ops = {
	rcop_no_action,
	rcop_no_usage,
	rcop_no_set,
	rcop_absolute_test
};
266
267/*ARGSUSED*/
268static uint_t
269rctl_dict_hash_by_id(void *hash_data, mod_hash_key_t key)
270{
271	return ((uint_t)(uintptr_t)key % rctl_dict_size);
272}
273
274static int
275rctl_dict_id_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
276{
277	uint_t u1 = (uint_t)(uintptr_t)key1;
278	uint_t u2 = (uint_t)(uintptr_t)key2;
279
280	if (u1 > u2)
281		return (1);
282
283	if (u1 == u2)
284		return (0);
285
286	return (-1);
287}
288
289static void
290rctl_dict_val_dtor(mod_hash_val_t val)
291{
292	rctl_dict_entry_t *kr = (rctl_dict_entry_t *)val;
293
294	kmem_free(kr, sizeof (rctl_dict_entry_t));
295}
296
297/*
298 * size_t rctl_build_name_buf()
299 *
300 * Overview
301 *   rctl_build_name_buf() walks all active resource controls in the dictionary,
302 *   building a buffer of continguous NUL-terminated strings.
303 *
304 * Return values
305 *   The size of the buffer is returned, the passed pointer's contents are
306 *   modified to that of the location of the buffer.
307 *
308 * Caller's context
309 *   Caller must be in a context suitable for KM_SLEEP allocations.
310 */
311size_t
312rctl_build_name_buf(char **rbufp)
313{
314	size_t req_size, cpy_size;
315	char *rbufloc;
316	int i;
317
318rctl_rebuild_name_buf:
319	req_size = cpy_size = 0;
320
321	/*
322	 * Calculate needed buffer length.
323	 */
324	mutex_enter(&rctl_lists_lock);
325	for (i = 0; i < RC_MAX_ENTITY + 1; i++) {
326		rctl_dict_entry_t *rde;
327
328		for (rde = rctl_lists[i];
329		    rde != NULL;
330		    rde = rde->rcd_next)
331			req_size += strlen(rde->rcd_name) + 1;
332	}
333	mutex_exit(&rctl_lists_lock);
334
335	rbufloc = *rbufp = kmem_alloc(req_size, KM_SLEEP);
336
337	/*
338	 * Copy rctl names into our buffer.  If the copy length exceeds the
339	 * allocate length (due to registration changes), stop copying, free the
340	 * buffer, and start again.
341	 */
342	mutex_enter(&rctl_lists_lock);
343	for (i = 0; i < RC_MAX_ENTITY + 1; i++) {
344		rctl_dict_entry_t *rde;
345
346		for (rde = rctl_lists[i];
347		    rde != NULL;
348		    rde = rde->rcd_next) {
349			size_t length = strlen(rde->rcd_name) + 1;
350
351			cpy_size += length;
352
353			if (cpy_size > req_size) {
354				kmem_free(*rbufp, req_size);
355				mutex_exit(&rctl_lists_lock);
356				goto rctl_rebuild_name_buf;
357			}
358
359			bcopy(rde->rcd_name, rbufloc, length);
360			rbufloc += length;
361		}
362	}
363	mutex_exit(&rctl_lists_lock);
364
365	return (req_size);
366}
367
368/*
369 * rctl_dict_entry_t *rctl_dict_lookup(const char *)
370 *
371 * Overview
372 *   rctl_dict_lookup() returns the resource control dictionary entry for the
373 *   named resource control.
374 *
375 * Return values
376 *   A pointer to the appropriate resource control dictionary entry, or NULL if
377 *   no such named entry exists.
378 *
379 * Caller's context
380 *   Caller must not be holding rctl_dict_lock.
381 */
382rctl_dict_entry_t *
383rctl_dict_lookup(const char *name)
384{
385	rctl_dict_entry_t *rde;
386
387	mutex_enter(&rctl_dict_lock);
388
389	if (mod_hash_find(rctl_dict_by_name, (mod_hash_key_t)name,
390	    (mod_hash_val_t *)&rde) == MH_ERR_NOTFOUND) {
391		mutex_exit(&rctl_dict_lock);
392		return (NULL);
393	}
394
395	mutex_exit(&rctl_dict_lock);
396
397	return (rde);
398}
399
400/*
401 * rctl_hndl_t rctl_hndl_lookup(const char *)
402 *
403 * Overview
404 *   rctl_hndl_lookup() returns the resource control id (the "handle") for the
405 *   named resource control.
406 *
407 * Return values
408 *   The appropriate id, or -1 if no such named entry exists.
409 *
410 * Caller's context
411 *   Caller must not be holding rctl_dict_lock.
412 */
413rctl_hndl_t
414rctl_hndl_lookup(const char *name)
415{
416	rctl_dict_entry_t *rde;
417
418	if ((rde = rctl_dict_lookup(name)) == NULL)
419		return (-1);
420
421	return (rde->rcd_id);
422}
423
424/*
425 * rctl_dict_entry_t * rctl_dict_lookup_hndl(rctl_hndl_t)
426 *
427 * Overview
428 *   rctl_dict_lookup_hndl() completes the public lookup functions, by returning
429 *   the resource control dictionary entry matching a given resource control id.
430 *
431 * Return values
432 *   A pointer to the matching resource control dictionary entry, or NULL if the
433 *   id does not match any existing entries.
434 *
435 * Caller's context
436 *   Caller must not be holding rctl_lists_lock.
437 */
438rctl_dict_entry_t *
439rctl_dict_lookup_hndl(rctl_hndl_t hndl)
440{
441	uint_t i;
442
443	mutex_enter(&rctl_lists_lock);
444	for (i = 0; i < RC_MAX_ENTITY + 1; i++) {
445		rctl_dict_entry_t *rde;
446
447		for (rde = rctl_lists[i];
448		    rde != NULL;
449		    rde = rde->rcd_next)
450			if (rde->rcd_id == hndl) {
451				mutex_exit(&rctl_lists_lock);
452				return (rde);
453			}
454	}
455	mutex_exit(&rctl_lists_lock);
456
457	return (NULL);
458}
459
460/*
461 * void rctl_add_default_limit(const char *name, rctl_qty_t value,
462 *     rctl_priv_t privilege, uint_t action)
463 *
464 * Overview
465 *   Create a default limit with specified value, privilege, and action.
466 *
467 * Return value
468 *   No value returned.
469 */
470void
471rctl_add_default_limit(const char *name, rctl_qty_t value,
472    rctl_priv_t privilege, uint_t action)
473{
474	rctl_val_t *dval;
475	rctl_dict_entry_t *rde;
476
477	dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
478	bzero(dval, sizeof (rctl_val_t));
479	dval->rcv_value = value;
480	dval->rcv_privilege = privilege;
481	dval->rcv_flagaction = action;
482	dval->rcv_action_recip_pid = -1;
483
484	rde = rctl_dict_lookup(name);
485	(void) rctl_val_list_insert(&rde->rcd_default_value, dval);
486}
487
488/*
489 * void rctl_add_legacy_limit(const char *name, const char *mname,
490 *     const char *lname, rctl_qty_t dflt)
491 *
492 * Overview
493 *   Create a default privileged limit, using the value obtained from
494 *   /etc/system if it exists and is greater than the specified default
495 *   value.  Exists primarily for System V IPC.
496 *
497 * Return value
498 *   No value returned.
499 */
500void
501rctl_add_legacy_limit(const char *name, const char *mname, const char *lname,
502    rctl_qty_t dflt, rctl_qty_t max)
503{
504	rctl_qty_t qty;
505
506	if (!mod_sysvar(mname, lname, &qty) || (qty < dflt))
507		qty = dflt;
508
509	if (qty > max)
510		qty = max;
511
512	rctl_add_default_limit(name, qty, RCPRIV_PRIVILEGED, RCTL_LOCAL_DENY);
513}
514
515static rctl_set_t *
516rctl_entity_obtain_rset(rctl_dict_entry_t *rcd, struct proc *p)
517{
518	rctl_set_t *rset = NULL;
519
520	if (rcd == NULL)
521		return (NULL);
522
523	switch (rcd->rcd_entity) {
524	case RCENTITY_PROCESS:
525		rset = p->p_rctls;
526		break;
527	case RCENTITY_TASK:
528		ASSERT(MUTEX_HELD(&p->p_lock));
529		if (p->p_task != NULL)
530			rset = p->p_task->tk_rctls;
531		break;
532	case RCENTITY_PROJECT:
533		ASSERT(MUTEX_HELD(&p->p_lock));
534		if (p->p_task != NULL &&
535		    p->p_task->tk_proj != NULL)
536			rset = p->p_task->tk_proj->kpj_rctls;
537		break;
538	case RCENTITY_ZONE:
539		ASSERT(MUTEX_HELD(&p->p_lock));
540		if (p->p_zone != NULL)
541			rset = p->p_zone->zone_rctls;
542		break;
543	default:
544		panic("unknown rctl entity type %d seen", rcd->rcd_entity);
545		break;
546	}
547
548	return (rset);
549}
550
551static void
552rctl_entity_obtain_entity_p(rctl_entity_t entity, struct proc *p,
553    rctl_entity_p_t *e)
554{
555	e->rcep_p.proc = NULL;
556	e->rcep_t = entity;
557
558	switch (entity) {
559	case RCENTITY_PROCESS:
560		e->rcep_p.proc = p;
561		break;
562	case RCENTITY_TASK:
563		ASSERT(MUTEX_HELD(&p->p_lock));
564		if (p->p_task != NULL)
565			e->rcep_p.task = p->p_task;
566		break;
567	case RCENTITY_PROJECT:
568		ASSERT(MUTEX_HELD(&p->p_lock));
569		if (p->p_task != NULL &&
570		    p->p_task->tk_proj != NULL)
571			e->rcep_p.proj = p->p_task->tk_proj;
572		break;
573	case RCENTITY_ZONE:
574		ASSERT(MUTEX_HELD(&p->p_lock));
575		if (p->p_zone != NULL)
576			e->rcep_p.zone = p->p_zone;
577		break;
578	default:
579		panic("unknown rctl entity type %d seen", entity);
580		break;
581	}
582}
583
584static void
585rctl_gp_alloc(rctl_alloc_gp_t *rcgp)
586{
587	uint_t i;
588
589	if (rcgp->rcag_nctls > 0) {
590		rctl_t *prev = kmem_cache_alloc(rctl_cache, KM_SLEEP);
591		rctl_t *rctl = prev;
592
593		rcgp->rcag_ctls = prev;
594
595		for (i = 1; i < rcgp->rcag_nctls; i++) {
596			rctl = kmem_cache_alloc(rctl_cache, KM_SLEEP);
597			prev->rc_next = rctl;
598			prev = rctl;
599		}
600
601		rctl->rc_next = NULL;
602	}
603
604	if (rcgp->rcag_nvals > 0) {
605		rctl_val_t *prev = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
606		rctl_val_t *rval = prev;
607
608		rcgp->rcag_vals = prev;
609
610		for (i = 1; i < rcgp->rcag_nvals; i++) {
611			rval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
612			prev->rcv_next = rval;
613			prev = rval;
614		}
615
616		rval->rcv_next = NULL;
617	}
618
619}
620
621static rctl_val_t *
622rctl_gp_detach_val(rctl_alloc_gp_t *rcgp)
623{
624	rctl_val_t *rval = rcgp->rcag_vals;
625
626	ASSERT(rcgp->rcag_nvals > 0);
627	rcgp->rcag_nvals--;
628	rcgp->rcag_vals = rval->rcv_next;
629
630	rval->rcv_next = NULL;
631
632	return (rval);
633}
634
635static rctl_t *
636rctl_gp_detach_ctl(rctl_alloc_gp_t *rcgp)
637{
638	rctl_t *rctl = rcgp->rcag_ctls;
639
640	ASSERT(rcgp->rcag_nctls > 0);
641	rcgp->rcag_nctls--;
642	rcgp->rcag_ctls = rctl->rc_next;
643
644	rctl->rc_next = NULL;
645
646	return (rctl);
647
648}
649
650static void
651rctl_gp_free(rctl_alloc_gp_t *rcgp)
652{
653	rctl_val_t *rval = rcgp->rcag_vals;
654	rctl_t *rctl = rcgp->rcag_ctls;
655
656	while (rval != NULL) {
657		rctl_val_t *next = rval->rcv_next;
658
659		kmem_cache_free(rctl_val_cache, rval);
660		rval = next;
661	}
662
663	while (rctl != NULL) {
664		rctl_t *next = rctl->rc_next;
665
666		kmem_cache_free(rctl_cache, rctl);
667		rctl = next;
668	}
669}
670
671/*
672 * void rctl_prealloc_destroy(rctl_alloc_gp_t *)
673 *
674 * Overview
675 *   Release all unused memory allocated via one of the "prealloc" functions:
676 *   rctl_set_init_prealloc, rctl_set_dup_prealloc, or rctl_rlimit_set_prealloc.
677 *
678 * Return values
679 *   None.
680 *
681 * Caller's context
682 *   No restrictions on context.
683 */
684void
685rctl_prealloc_destroy(rctl_alloc_gp_t *gp)
686{
687	rctl_gp_free(gp);
688	kmem_free(gp, sizeof (rctl_alloc_gp_t));
689}
690
691/*
692 * int rctl_val_cmp(rctl_val_t *, rctl_val_t *, int)
693 *
694 * Overview
695 *   This function defines an ordering to rctl_val_t's in order to allow
696 *   for correct placement in value lists. When the imprecise flag is set,
697 *   the action recipient is ignored. This is to facilitate insert,
698 *   delete, and replace operations by rctlsys.
699 *
700 * Return values
701 *   0 if the val_t's are are considered identical
702 *   -1 if a is ordered lower than b
703 *   1 if a is lowered higher than b
704 *
705 * Caller's context
706 *   No restrictions on context.
707 */
708int
709rctl_val_cmp(rctl_val_t *a, rctl_val_t *b, int imprecise)
710{
711	if ((a->rcv_flagaction & RCTL_LOCAL_MAXIMAL) <
712	    (b->rcv_flagaction & RCTL_LOCAL_MAXIMAL))
713		return (-1);
714
715	if ((a->rcv_flagaction & RCTL_LOCAL_MAXIMAL) >
716	    (b->rcv_flagaction & RCTL_LOCAL_MAXIMAL))
717		return (1);
718
719	if (a->rcv_value < b->rcv_value)
720		return (-1);
721
722	if (a->rcv_value > b->rcv_value)
723		return (1);
724
725	if ((a->rcv_flagaction & RCTL_LOCAL_DENY) <
726	    (b->rcv_flagaction & RCTL_LOCAL_DENY))
727		return (-1);
728
729	if ((a->rcv_flagaction & RCTL_LOCAL_DENY) >
730	    (b->rcv_flagaction & RCTL_LOCAL_DENY))
731		return (1);
732
733	if (a->rcv_privilege < b->rcv_privilege)
734		return (-1);
735
736	if (a->rcv_privilege > b->rcv_privilege)
737		return (1);
738
739	if (imprecise)
740		return (0);
741
742	if (a->rcv_action_recip_pid < b->rcv_action_recip_pid)
743		return (-1);
744
745	if (a->rcv_action_recip_pid > b->rcv_action_recip_pid)
746		return (1);
747
748	return (0);
749}
750
751static rctl_val_t *
752rctl_val_list_find(rctl_val_t **head, rctl_val_t *cval)
753{
754	rctl_val_t *rval = *head;
755
756	while (rval != NULL) {
757		if (rctl_val_cmp(cval, rval, 0) == 0)
758			return (rval);
759
760		rval = rval->rcv_next;
761	}
762
763	return (NULL);
764
765}
766
767/*
768 * int rctl_val_list_insert(rctl_val_t **, rctl_val_t *)
769 *
770 * Overview
771 *   This function inserts the rctl_val_t into the value list provided.
772 *   The insert is always successful unless if the value is a duplicate
773 *   of one already in the list.
774 *
775 * Return values
776 *    1 if the value was a duplicate of an existing value in the list.
777 *    0 if the insert was successful.
778 */
779int
780rctl_val_list_insert(rctl_val_t **root, rctl_val_t *rval)
781{
782	rctl_val_t *prev;
783	int equiv;
784
785	rval->rcv_next = NULL;
786	rval->rcv_prev = NULL;
787
788	if (*root == NULL) {
789		*root = rval;
790		return (0);
791	}
792
793	equiv = rctl_val_cmp(rval, *root, 0);
794
795	if (equiv == 0)
796		return (1);
797
798	if (equiv < 0) {
799		rval->rcv_next = *root;
800		rval->rcv_next->rcv_prev = rval;
801		*root = rval;
802
803		return (0);
804	}
805
806	prev = *root;
807	while (prev->rcv_next != NULL &&
808	    (equiv = rctl_val_cmp(rval, prev->rcv_next, 0)) > 0) {
809		prev = prev->rcv_next;
810	}
811
812	if (equiv == 0)
813		return (1);
814
815	rval->rcv_next = prev->rcv_next;
816	if (rval->rcv_next != NULL)
817		rval->rcv_next->rcv_prev = rval;
818	prev->rcv_next = rval;
819	rval->rcv_prev = prev;
820
821	return (0);
822}
823
824static int
825rctl_val_list_delete(rctl_val_t **root, rctl_val_t *rval)
826{
827	rctl_val_t *prev;
828
829	if (*root == NULL)
830		return (-1);
831
832	prev = *root;
833	if (rctl_val_cmp(rval, prev, 0) == 0) {
834		*root = prev->rcv_next;
835		if (*root != NULL)
836			(*root)->rcv_prev = NULL;
837
838		kmem_cache_free(rctl_val_cache, prev);
839
840		return (0);
841	}
842
843	while (prev->rcv_next != NULL &&
844	    rctl_val_cmp(rval, prev->rcv_next, 0) != 0) {
845		prev = prev->rcv_next;
846	}
847
848	if (prev->rcv_next == NULL) {
849		/*
850		 * If we navigate the entire list and cannot find a match, then
851		 * return failure.
852		 */
853		return (-1);
854	}
855
856	prev = prev->rcv_next;
857	prev->rcv_prev->rcv_next = prev->rcv_next;
858	if (prev->rcv_next != NULL)
859		prev->rcv_next->rcv_prev = prev->rcv_prev;
860
861	kmem_cache_free(rctl_val_cache, prev);
862
863	return (0);
864}
865
866static rctl_val_t *
867rctl_val_list_dup(rctl_val_t *rval, rctl_alloc_gp_t *ragp, struct proc *oldp,
868    struct proc *newp)
869{
870	rctl_val_t *head = NULL;
871
872	for (; rval != NULL; rval = rval->rcv_next) {
873		rctl_val_t *dval = rctl_gp_detach_val(ragp);
874
875		bcopy(rval, dval, sizeof (rctl_val_t));
876		dval->rcv_prev = dval->rcv_next = NULL;
877
878		if (oldp == NULL ||
879		    rval->rcv_action_recipient == NULL ||
880		    rval->rcv_action_recipient == oldp) {
881			if (rval->rcv_privilege == RCPRIV_BASIC) {
882				dval->rcv_action_recipient = newp;
883				dval->rcv_action_recip_pid = newp->p_pid;
884			} else {
885				dval->rcv_action_recipient = NULL;
886				dval->rcv_action_recip_pid = -1;
887			}
888
889			(void) rctl_val_list_insert(&head, dval);
890		} else {
891			kmem_cache_free(rctl_val_cache, dval);
892		}
893	}
894
895	return (head);
896}
897
898static void
899rctl_val_list_reset(rctl_val_t *rval)
900{
901	for (; rval != NULL; rval = rval->rcv_next)
902		rval->rcv_firing_time = 0;
903}
904
905static uint_t
906rctl_val_list_count(rctl_val_t *rval)
907{
908	uint_t n = 0;
909
910	for (; rval != NULL; rval = rval->rcv_next)
911		n++;
912
913	return (n);
914}
915
916
917static void
918rctl_val_list_free(rctl_val_t *rval)
919{
920	while (rval != NULL) {
921		rctl_val_t *next = rval->rcv_next;
922
923		kmem_cache_free(rctl_val_cache, rval);
924
925		rval = next;
926	}
927}
928
929/*
930 * rctl_qty_t rctl_model_maximum(rctl_dict_entry_t *, struct proc *)
931 *
932 * Overview
933 *   In cases where the operating system supports more than one process
934 *   addressing model, the operating system capabilities will exceed those of
935 *   one or more of these models.  Processes in a less capable model must have
936 *   their resources accurately controlled, without diluting those of their
937 *   descendants reached via exec().  rctl_model_maximum() returns the governing
938 *   value for the specified process with respect to a resource control, such
939 *   that the value can used for the RCTLOP_SET callback or compatability
940 *   support.
941 *
942 * Return values
943 *   The maximum value for the given process for the specified resource control.
944 *
945 * Caller's context
946 *   No restrictions on context.
947 */
948rctl_qty_t
949rctl_model_maximum(rctl_dict_entry_t *rde, struct proc *p)
950{
951	if (p->p_model == DATAMODEL_NATIVE)
952		return (rde->rcd_max_native);
953
954	return (rde->rcd_max_ilp32);
955}
956
957/*
958 * rctl_qty_t rctl_model_value(rctl_dict_entry_t *, struct proc *, rctl_qty_t)
959 *
960 * Overview
961 *   Convenience function wrapping the rctl_model_maximum() functionality.
962 *
963 * Return values
964 *   The lesser of the process's maximum value and the given value for the
965 *   specified resource control.
966 *
967 * Caller's context
968 *   No restrictions on context.
969 */
970rctl_qty_t
971rctl_model_value(rctl_dict_entry_t *rde, struct proc *p, rctl_qty_t value)
972{
973	rctl_qty_t max = rctl_model_maximum(rde, p);
974
975	return (value < max ? value : max);
976}
977
978static void
979rctl_set_insert(rctl_set_t *set, rctl_hndl_t hndl, rctl_t *rctl)
980{
981	uint_t index = hndl % rctl_set_size;
982	rctl_t *next_ctl, *prev_ctl;
983
984	ASSERT(MUTEX_HELD(&set->rcs_lock));
985
986	rctl->rc_next = NULL;
987
988	if (set->rcs_ctls[index] == NULL) {
989		set->rcs_ctls[index] = rctl;
990		return;
991	}
992
993	if (hndl < set->rcs_ctls[index]->rc_id) {
994		rctl->rc_next = set->rcs_ctls[index];
995		set->rcs_ctls[index] = rctl;
996
997		return;
998	}
999
1000	for (next_ctl = set->rcs_ctls[index]->rc_next,
1001	    prev_ctl = set->rcs_ctls[index];
1002	    next_ctl != NULL;
1003	    prev_ctl = next_ctl,
1004	    next_ctl = next_ctl->rc_next) {
1005		if (next_ctl->rc_id > hndl) {
1006			rctl->rc_next = next_ctl;
1007			prev_ctl->rc_next = rctl;
1008
1009			return;
1010		}
1011	}
1012
1013	rctl->rc_next = next_ctl;
1014	prev_ctl->rc_next = rctl;
1015}
1016
1017/*
1018 * rctl_set_t *rctl_set_create()
1019 *
1020 * Overview
1021 *   Create an empty resource control set, suitable for attaching to a
1022 *   controlled entity.
1023 *
1024 * Return values
1025 *   A pointer to the newly created set.
1026 *
1027 * Caller's context
1028 *   Safe for KM_SLEEP allocations.
1029 */
1030rctl_set_t *
1031rctl_set_create()
1032{
1033	rctl_set_t *rset = kmem_zalloc(sizeof (rctl_set_t), KM_SLEEP);
1034
1035	mutex_init(&rset->rcs_lock, NULL, MUTEX_DEFAULT, NULL);
1036	rset->rcs_ctls = kmem_zalloc(rctl_set_size * sizeof (rctl_t *),
1037	    KM_SLEEP);
1038	rset->rcs_entity = -1;
1039
1040	return (rset);
1041}
1042
1043/*
1044 * rctl_gp_alloc_t *rctl_set_init_prealloc(rctl_entity_t)
1045 *
1046 * Overview
1047 *    rctl_set_init_prealloc() examines the globally defined resource controls
1048 *    and their default values and returns a resource control allocation group
1049 *    populated with sufficient controls and values to form a representative
1050 *    resource control set for the specified entity.
1051 *
1052 * Return values
1053 *    A pointer to the newly created allocation group.
1054 *
1055 * Caller's context
1056 *    Caller must be in a context suitable for KM_SLEEP allocations.
1057 */
1058rctl_alloc_gp_t *
1059rctl_set_init_prealloc(rctl_entity_t entity)
1060{
1061	rctl_dict_entry_t *rde;
1062	rctl_alloc_gp_t *ragp = kmem_zalloc(sizeof (rctl_alloc_gp_t), KM_SLEEP);
1063
1064	ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
1065
1066	if (rctl_lists[entity] == NULL)
1067		return (ragp);
1068
1069	mutex_enter(&rctl_lists_lock);
1070
1071	for (rde = rctl_lists[entity]; rde != NULL; rde = rde->rcd_next) {
1072		ragp->rcag_nctls++;
1073		ragp->rcag_nvals += rctl_val_list_count(rde->rcd_default_value);
1074	}
1075
1076	mutex_exit(&rctl_lists_lock);
1077
1078	rctl_gp_alloc(ragp);
1079
1080	return (ragp);
1081}
1082
1083/*
1084 * rctl_set_t *rctl_set_init(rctl_entity_t)
1085 *
1086 * Overview
1087 *   rctl_set_create() creates a resource control set, initialized with the
1088 *   system infinite values on all registered controls, for attachment to a
1089 *   system entity requiring resource controls, such as a process or a task.
1090 *
1091 * Return values
1092 *   A pointer to the newly filled set.
1093 *
1094 * Caller's context
1095 *   Caller must be holding p_lock on entry so that RCTLOP_SET() functions
1096 *   may modify task and project members based on the proc structure
1097 *   they are passed.
1098 */
1099rctl_set_t *
1100rctl_set_init(rctl_entity_t entity, struct proc *p, rctl_entity_p_t *e,
1101    rctl_set_t *rset, rctl_alloc_gp_t *ragp)
1102{
1103	rctl_dict_entry_t *rde;
1104
1105	ASSERT(MUTEX_HELD(&p->p_lock));
1106	ASSERT(e);
1107	rset->rcs_entity = entity;
1108
1109	if (rctl_lists[entity] == NULL)
1110		return (rset);
1111
1112	mutex_enter(&rctl_lists_lock);
1113	mutex_enter(&rset->rcs_lock);
1114
1115	for (rde = rctl_lists[entity]; rde != NULL; rde = rde->rcd_next) {
1116		rctl_t *rctl = rctl_gp_detach_ctl(ragp);
1117
1118		rctl->rc_dict_entry = rde;
1119		rctl->rc_id = rde->rcd_id;
1120		rctl->rc_projdb = NULL;
1121
1122		rctl->rc_values = rctl_val_list_dup(rde->rcd_default_value,
1123		    ragp, NULL, p);
1124		rctl->rc_cursor = rctl->rc_values;
1125
1126		ASSERT(rctl->rc_cursor != NULL);
1127
1128		rctl_set_insert(rset, rde->rcd_id, rctl);
1129
1130		RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1131		    rctl->rc_cursor->rcv_value));
1132	}
1133
1134	mutex_exit(&rset->rcs_lock);
1135	mutex_exit(&rctl_lists_lock);
1136
1137	return (rset);
1138}
1139
1140static rctl_t *
1141rctl_dup(rctl_t *rctl, rctl_alloc_gp_t *ragp, struct proc *oldp,
1142    struct proc *newp)
1143{
1144	rctl_t *dup = rctl_gp_detach_ctl(ragp);
1145	rctl_val_t *dval;
1146
1147	dup->rc_id = rctl->rc_id;
1148	dup->rc_dict_entry = rctl->rc_dict_entry;
1149	dup->rc_next = NULL;
1150	dup->rc_cursor = NULL;
1151	dup->rc_values = rctl_val_list_dup(rctl->rc_values, ragp, oldp, newp);
1152
1153	for (dval = dup->rc_values;
1154	    dval != NULL; dval = dval->rcv_next) {
1155		if (rctl_val_cmp(rctl->rc_cursor, dval, 0) >= 0) {
1156			dup->rc_cursor = dval;
1157			break;
1158		}
1159	}
1160
1161	if (dup->rc_cursor == NULL)
1162		dup->rc_cursor = dup->rc_values;
1163
1164	return (dup);
1165}
1166
1167static void
1168rctl_set_fill_alloc_gp(rctl_set_t *set, rctl_alloc_gp_t *ragp)
1169{
1170	uint_t i;
1171
1172	bzero(ragp, sizeof (rctl_alloc_gp_t));
1173
1174	for (i = 0; i < rctl_set_size; i++) {
1175		rctl_t *r = set->rcs_ctls[i];
1176
1177		while (r != NULL) {
1178			ragp->rcag_nctls++;
1179
1180			ragp->rcag_nvals += rctl_val_list_count(r->rc_values);
1181
1182			r = r->rc_next;
1183		}
1184	}
1185}
1186
1187/*
1188 * rctl_alloc_gp_t *rctl_set_dup_prealloc(rctl_set_t *)
1189 *
1190 * Overview
1191 *   Given a resource control set, allocate a sufficiently large allocation
1192 *   group to contain a duplicate of the set.
1193 *
1194 * Return value
1195 *   A pointer to the newly created allocation group.
1196 *
1197 * Caller's context
1198 *   Safe for KM_SLEEP allocations.
1199 */
1200rctl_alloc_gp_t *
1201rctl_set_dup_prealloc(rctl_set_t *set)
1202{
1203	rctl_alloc_gp_t *ragp = kmem_zalloc(sizeof (rctl_alloc_gp_t), KM_SLEEP);
1204
1205	ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
1206
1207	mutex_enter(&set->rcs_lock);
1208	rctl_set_fill_alloc_gp(set, ragp);
1209	mutex_exit(&set->rcs_lock);
1210
1211	rctl_gp_alloc(ragp);
1212
1213	return (ragp);
1214}
1215
1216/*
1217 * int rctl_set_dup_ready(rctl_set_t *, rctl_alloc_gp_t *)
1218 *
1219 * Overview
1220 *   Verify that the allocation group provided is large enough to allow a
1221 *   duplicate of the given resource control set to be constructed from its
1222 *   contents.
1223 *
1224 * Return values
1225 *   1 if the allocation group is sufficiently large, 0 otherwise.
1226 *
1227 * Caller's context
1228 *   rcs_lock must be held prior to entry.
1229 */
1230int
1231rctl_set_dup_ready(rctl_set_t *set, rctl_alloc_gp_t *ragp)
1232{
1233	rctl_alloc_gp_t curr_gp;
1234
1235	ASSERT(MUTEX_HELD(&set->rcs_lock));
1236
1237	rctl_set_fill_alloc_gp(set, &curr_gp);
1238
1239	if (curr_gp.rcag_nctls <= ragp->rcag_nctls &&
1240	    curr_gp.rcag_nvals <= ragp->rcag_nvals)
1241		return (1);
1242
1243	return (0);
1244}
1245
1246/*
1247 * rctl_set_t *rctl_set_dup(rctl_set_t *, struct proc *, struct proc *,
1248 *   rctl_set_t *, rctl_alloc_gp_t *, int)
1249 *
1250 * Overview
1251 *   Make a duplicate of the resource control set.  The proc pointers are those
1252 *   of the owning process and of the process associated with the entity
1253 *   receiving the duplicate.
1254 *
1255 *   Duplication is a 3 stage process. Stage 1 is memory allocation for
1256 *   the duplicate set, which is taken care of by rctl_set_dup_prealloc().
1257 *   Stage 2 consists of copying all rctls and values from the old set into
1258 *   the new. Stage 3 completes the duplication by performing the appropriate
1259 *   callbacks for each rctl in the new set.
1260 *
1261 *   Stages 2 and 3 are handled by calling rctl_set_dup with the RCD_DUP and
1262 *   RCD_CALLBACK functions, respectively. The RCD_CALLBACK flag may only
1263 *   be supplied if the newp proc structure reflects the new task and
1264 *   project linkage.
1265 *
1266 * Return value
1267 *   A pointer to the duplicate set.
1268 *
1269 * Caller's context
1270 *   The rcs_lock of the set to be duplicated must be held prior to entry.
1271 */
1272rctl_set_t *
1273rctl_set_dup(rctl_set_t *set, struct proc *oldp, struct proc *newp,
1274    rctl_entity_p_t *e, rctl_set_t *dup, rctl_alloc_gp_t *ragp, int flag)
1275{
1276	uint_t i;
1277	rctl_set_t	*iter;
1278
1279	ASSERT((flag & RCD_DUP) || (flag & RCD_CALLBACK));
1280	ASSERT(e);
1281	/*
1282	 * When copying the old set, iterate over that. Otherwise, when
1283	 * only callbacks have been requested, iterate over the dup set.
1284	 */
1285	if (flag & RCD_DUP) {
1286		ASSERT(MUTEX_HELD(&set->rcs_lock));
1287		iter = set;
1288		dup->rcs_entity = set->rcs_entity;
1289	} else {
1290		iter = dup;
1291	}
1292
1293	mutex_enter(&dup->rcs_lock);
1294
1295	for (i = 0; i < rctl_set_size; i++) {
1296		rctl_t *r = iter->rcs_ctls[i];
1297		rctl_t *d;
1298
1299		while (r != NULL) {
1300			if (flag & RCD_DUP) {
1301				d = rctl_dup(r, ragp, oldp, newp);
1302				rctl_set_insert(dup, r->rc_id, d);
1303			} else {
1304				d = r;
1305			}
1306
1307			if (flag & RCD_CALLBACK)
1308				RCTLOP_SET(d, newp, e,
1309				    rctl_model_value(d->rc_dict_entry, newp,
1310				    d->rc_cursor->rcv_value));
1311
1312			r = r->rc_next;
1313		}
1314	}
1315
1316	mutex_exit(&dup->rcs_lock);
1317
1318	return (dup);
1319}
1320
1321/*
1322 * void rctl_set_free(rctl_set_t *)
1323 *
1324 * Overview
1325 *   Delete resource control set and all attached values.
1326 *
1327 * Return values
1328 *   No value returned.
1329 *
1330 * Caller's context
1331 *   No restrictions on context.
1332 */
1333void
1334rctl_set_free(rctl_set_t *set)
1335{
1336	uint_t i;
1337
1338	mutex_enter(&set->rcs_lock);
1339	for (i = 0; i < rctl_set_size; i++) {
1340		rctl_t *r = set->rcs_ctls[i];
1341
1342		while (r != NULL) {
1343			rctl_val_t *v = r->rc_values;
1344			rctl_t *n = r->rc_next;
1345
1346			kmem_cache_free(rctl_cache, r);
1347
1348			rctl_val_list_free(v);
1349
1350			r = n;
1351		}
1352	}
1353	mutex_exit(&set->rcs_lock);
1354
1355	kmem_free(set->rcs_ctls, sizeof (rctl_t *) * rctl_set_size);
1356	kmem_free(set, sizeof (rctl_set_t));
1357}
1358
1359/*
1360 * void rctl_set_reset(rctl_set_t *)
1361 *
1362 * Overview
1363 *   Resets all rctls within the set such that the lowest value becomes active.
1364 *
1365 * Return values
1366 *   No value returned.
1367 *
1368 * Caller's context
1369 *   No restrictions on context.
1370 */
1371void
1372rctl_set_reset(rctl_set_t *set, struct proc *p, rctl_entity_p_t *e)
1373{
1374	uint_t i;
1375
1376	ASSERT(e);
1377
1378	mutex_enter(&set->rcs_lock);
1379	for (i = 0; i < rctl_set_size; i++) {
1380		rctl_t *r = set->rcs_ctls[i];
1381
1382		while (r != NULL) {
1383			r->rc_cursor = r->rc_values;
1384			rctl_val_list_reset(r->rc_cursor);
1385			RCTLOP_SET(r, p, e, rctl_model_value(r->rc_dict_entry,
1386			    p, r->rc_cursor->rcv_value));
1387
1388			ASSERT(r->rc_cursor != NULL);
1389
1390			r = r->rc_next;
1391		}
1392	}
1393
1394	mutex_exit(&set->rcs_lock);
1395}
1396
1397/*
1398 * void rctl_set_tearoff(rctl_set *, struct proc *)
1399 *
1400 * Overview
1401 *   Tear off any resource control values on this set with an action recipient
1402 *   equal to the specified process (as they are becoming invalid with the
1403 *   process's departure from this set as an observer).
1404 *
1405 * Return values
1406 *   No value returned.
1407 *
1408 * Caller's context
1409 *   No restrictions on context
1410 */
1411void
1412rctl_set_tearoff(rctl_set_t *set, struct proc *p)
1413{
1414	uint_t i;
1415
1416	mutex_enter(&set->rcs_lock);
1417	for (i = 0; i < rctl_set_size; i++) {
1418		rctl_t *r = set->rcs_ctls[i];
1419
1420		while (r != NULL) {
1421			rctl_val_t *rval;
1422
1423tearoff_rewalk_list:
1424			rval = r->rc_values;
1425
1426			while (rval != NULL) {
1427				if (rval->rcv_privilege == RCPRIV_BASIC &&
1428				    rval->rcv_action_recipient == p) {
1429					if (r->rc_cursor == rval)
1430						r->rc_cursor = rval->rcv_next;
1431
1432					(void) rctl_val_list_delete(
1433					    &r->rc_values, rval);
1434
1435					goto tearoff_rewalk_list;
1436				}
1437
1438				rval = rval->rcv_next;
1439			}
1440
1441			ASSERT(r->rc_cursor != NULL);
1442
1443			r = r->rc_next;
1444		}
1445	}
1446
1447	mutex_exit(&set->rcs_lock);
1448}
1449
1450static int
1451rctl_set_find(rctl_set_t *set, rctl_hndl_t hndl, rctl_t **rctl)
1452{
1453	uint_t index = hndl % rctl_set_size;
1454	rctl_t *curr_ctl;
1455
1456	ASSERT(MUTEX_HELD(&set->rcs_lock));
1457
1458	for (curr_ctl = set->rcs_ctls[index]; curr_ctl != NULL;
1459	    curr_ctl = curr_ctl->rc_next) {
1460		if (curr_ctl->rc_id == hndl) {
1461			*rctl = curr_ctl;
1462
1463			return (0);
1464		}
1465	}
1466
1467	return (-1);
1468}
1469
1470/*
1471 * rlim64_t rctl_enforced_value(rctl_hndl_t, rctl_set_t *, struct proc *)
1472 *
1473 * Overview
1474 *   Given a process, get the next enforced value on the rctl of the specified
1475 *   handle.
1476 *
1477 * Return value
1478 *   The enforced value.
1479 *
1480 * Caller's context
1481 *   For controls on process collectives, p->p_lock must be held across the
1482 *   operation.
1483 */
1484/*ARGSUSED*/
1485rctl_qty_t
1486rctl_enforced_value(rctl_hndl_t hndl, rctl_set_t *rset, struct proc *p)
1487{
1488	rctl_t *rctl;
1489	rlim64_t ret;
1490
1491	mutex_enter(&rset->rcs_lock);
1492
1493	if (rctl_set_find(rset, hndl, &rctl) == -1)
1494		panic("unknown resource control handle %d requested", hndl);
1495	else
1496		ret = rctl_model_value(rctl->rc_dict_entry, p,
1497		    rctl->rc_cursor->rcv_value);
1498
1499	mutex_exit(&rset->rcs_lock);
1500
1501	return (ret);
1502}
1503
1504/*
1505 * int rctl_global_get(const char *, rctl_dict_entry_t *)
1506 *
1507 * Overview
1508 *   Copy a sanitized version of the global rctl for a given resource control
1509 *   name.  (By sanitization, we mean that the unsafe data pointers have been
1510 *   zeroed.)
1511 *
1512 * Return value
1513 *   -1 if name not defined, 0 otherwise.
1514 *
1515 * Caller's context
1516 *   No restrictions on context.  rctl_dict_lock must not be held.
1517 */
1518int
1519rctl_global_get(const char *name, rctl_dict_entry_t *drde)
1520{
1521	rctl_dict_entry_t *rde = rctl_dict_lookup(name);
1522
1523	if (rde == NULL)
1524		return (-1);
1525
1526	bcopy(rde, drde, sizeof (rctl_dict_entry_t));
1527
1528	drde->rcd_next = NULL;
1529	drde->rcd_ops = NULL;
1530
1531	return (0);
1532}
1533
1534/*
1535 * int rctl_global_set(const char *, rctl_dict_entry_t *)
1536 *
1537 * Overview
1538 *   Transfer the settable fields of the named rctl to the global rctl matching
1539 *   the given resource control name.
1540 *
1541 * Return value
1542 *   -1 if name not defined, 0 otherwise.
1543 *
1544 * Caller's context
1545 *   No restrictions on context.  rctl_dict_lock must not be held.
1546 */
1547int
1548rctl_global_set(const char *name, rctl_dict_entry_t *drde)
1549{
1550	rctl_dict_entry_t *rde = rctl_dict_lookup(name);
1551
1552	if (rde == NULL)
1553		return (-1);
1554
1555	rde->rcd_flagaction = drde->rcd_flagaction;
1556	rde->rcd_syslog_level = drde->rcd_syslog_level;
1557	rde->rcd_strlog_flags = drde->rcd_strlog_flags;
1558
1559	return (0);
1560}
1561
1562static int
1563rctl_local_op(rctl_hndl_t hndl, rctl_val_t *oval, rctl_val_t *nval,
1564    int (*cbop)(rctl_hndl_t, struct proc *p, rctl_entity_p_t *e, rctl_t *,
1565    rctl_val_t *, rctl_val_t *), struct proc *p)
1566{
1567	rctl_t *rctl;
1568	rctl_set_t *rset;
1569	rctl_entity_p_t e;
1570	int ret = 0;
1571	rctl_dict_entry_t *rde = rctl_dict_lookup_hndl(hndl);
1572
1573local_op_retry:
1574
1575	ASSERT(MUTEX_HELD(&p->p_lock));
1576
1577	rset = rctl_entity_obtain_rset(rde, p);
1578
1579	if (rset == NULL) {
1580		return (-1);
1581	}
1582	rctl_entity_obtain_entity_p(rset->rcs_entity, p, &e);
1583
1584	mutex_enter(&rset->rcs_lock);
1585
1586	/* using rctl's hndl, get rctl from local set */
1587	if (rctl_set_find(rset, hndl, &rctl) == -1) {
1588		mutex_exit(&rset->rcs_lock);
1589		return (-1);
1590	}
1591
1592	ret = cbop(hndl, p, &e, rctl, oval, nval);
1593
1594	mutex_exit(&rset->rcs_lock);
1595	return (ret);
1596}
1597
1598/*ARGSUSED*/
1599static int
1600rctl_local_get_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1601    rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1602{
1603	if (oval == NULL) {
1604		/*
1605		 * RCTL_FIRST
1606		 */
1607		bcopy(rctl->rc_values, nval, sizeof (rctl_val_t));
1608	} else {
1609		/*
1610		 * RCTL_NEXT
1611		 */
1612		rctl_val_t *tval = rctl_val_list_find(&rctl->rc_values, oval);
1613
1614		if (tval == NULL)
1615			return (ESRCH);
1616		else if (tval->rcv_next == NULL)
1617			return (ENOENT);
1618		else
1619			bcopy(tval->rcv_next, nval, sizeof (rctl_val_t));
1620	}
1621
1622	return (0);
1623}
1624
1625/*
1626 * int rctl_local_get(rctl_hndl_t, rctl_val_t *)
1627 *
1628 * Overview
1629 *   Get the rctl value for the given flags.
1630 *
1631 * Return values
1632 *   0 for successful get, errno otherwise.
1633 */
1634int
1635rctl_local_get(rctl_hndl_t hndl, rctl_val_t *oval, rctl_val_t *nval,
1636    struct proc *p)
1637{
1638	return (rctl_local_op(hndl, oval, nval, rctl_local_get_cb, p));
1639}
1640
1641/*ARGSUSED*/
1642static int
1643rctl_local_delete_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1644    rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1645{
1646	if ((oval = rctl_val_list_find(&rctl->rc_values, nval)) == NULL)
1647		return (ESRCH);
1648
1649	if (rctl->rc_cursor == oval) {
1650		rctl->rc_cursor = oval->rcv_next;
1651		rctl_val_list_reset(rctl->rc_cursor);
1652		RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1653		    rctl->rc_cursor->rcv_value));
1654
1655		ASSERT(rctl->rc_cursor != NULL);
1656	}
1657
1658	(void) rctl_val_list_delete(&rctl->rc_values, oval);
1659
1660	return (0);
1661}
1662
1663/*
1664 * int rctl_local_delete(rctl_hndl_t, rctl_val_t *)
1665 *
1666 * Overview
1667 *   Delete the rctl value for the given flags.
1668 *
1669 * Return values
1670 *   0 for successful delete, errno otherwise.
1671 */
1672int
1673rctl_local_delete(rctl_hndl_t hndl, rctl_val_t *val, struct proc *p)
1674{
1675	return (rctl_local_op(hndl, NULL, val, rctl_local_delete_cb, p));
1676}
1677
1678/*
1679 * rctl_local_insert_cb()
1680 *
1681 * Overview
1682 *   Insert a new value into the rctl's val list. If an error occurs,
1683 *   the val list must be left in the same state as when the function
1684 *   was entered.
1685 *
1686 * Return Values
1687 *   0 for successful insert, EINVAL if the value is duplicated in the
1688 *   existing list.
1689 */
1690/*ARGSUSED*/
1691static int
1692rctl_local_insert_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1693    rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1694{
1695	/*
1696	 * Before inserting, confirm there are no duplicates of this value
1697	 * and flag level. If there is a duplicate, flag an error and do
1698	 * nothing.
1699	 */
1700	if (rctl_val_list_insert(&rctl->rc_values, nval) != 0)
1701		return (EINVAL);
1702
1703	if (rctl_val_cmp(nval, rctl->rc_cursor, 0) < 0) {
1704		rctl->rc_cursor = nval;
1705		rctl_val_list_reset(rctl->rc_cursor);
1706		RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1707		    rctl->rc_cursor->rcv_value));
1708
1709		ASSERT(rctl->rc_cursor != NULL);
1710	}
1711
1712	return (0);
1713}
1714
1715/*
1716 * int rctl_local_insert(rctl_hndl_t, rctl_val_t *)
1717 *
1718 * Overview
1719 *   Insert the rctl value into the appropriate rctl set for the calling
1720 *   process, given the handle.
1721 */
1722int
1723rctl_local_insert(rctl_hndl_t hndl, rctl_val_t *val, struct proc *p)
1724{
1725	return (rctl_local_op(hndl, NULL, val, rctl_local_insert_cb, p));
1726}
1727
1728/*
1729 * rctl_local_insert_all_cb()
1730 *
1731 * Overview
1732 *   Called for RCENTITY_PROJECT rctls only, via rctlsys_projset().
1733 *
1734 *   Inserts new values from the project database (new_values).  alloc_values
1735 *   should be a linked list of pre-allocated rctl_val_t, which are used to
1736 *   populate (rc_projdb).
1737 *
1738 *   Should the *new_values linked list match the contents of the rctl's
1739 *   rp_projdb then we do nothing.
1740 *
1741 * Return Values
1742 *   0 is always returned.
1743 */
1744/*ARGSUSED*/
1745static int
1746rctl_local_insert_all_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1747    rctl_t *rctl, rctl_val_t *new_values, rctl_val_t *alloc_values)
1748{
1749	rctl_val_t *val;
1750	rctl_val_t *tmp_val;
1751	rctl_val_t *next;
1752	int modified = 0;
1753
1754	/*
1755	 * If this the first time we've set this project rctl, then we delete
1756	 * all the privilege values.  These privilege values have been set by
1757	 * rctl_add_default_limit().
1758	 *
1759	 * We save some cycles here by not calling rctl_val_list_delete().
1760	 */
1761	if (rctl->rc_projdb == NULL) {
1762		val = rctl->rc_values;
1763
1764		while (val != NULL) {
1765			if (val->rcv_privilege == RCPRIV_PRIVILEGED) {
1766				if (val->rcv_prev != NULL)
1767					val->rcv_prev->rcv_next = val->rcv_next;
1768				else
1769					rctl->rc_values = val->rcv_next;
1770
1771				if (val->rcv_next != NULL)
1772					val->rcv_next->rcv_prev = val->rcv_prev;
1773
1774				tmp_val = val;
1775				val = val->rcv_next;
1776				kmem_cache_free(rctl_val_cache, tmp_val);
1777			} else {
1778				val = val->rcv_next;
1779			}
1780		}
1781		modified = 1;
1782	}
1783
1784	/*
1785	 * Delete active values previously set through the project database.
1786	 */
1787	val = rctl->rc_projdb;
1788
1789	while (val != NULL) {
1790
1791		/* Is the old value found in the new values? */
1792		if (rctl_val_list_find(&new_values, val) == NULL) {
1793
1794			/*
1795			 * Delete from the active values if it originated from
1796			 * the project database.
1797			 */
1798			if (((tmp_val = rctl_val_list_find(&rctl->rc_values,
1799			    val)) != NULL) &&
1800			    (tmp_val->rcv_flagaction & RCTL_LOCAL_PROJDB)) {
1801				(void) rctl_val_list_delete(&rctl->rc_values,
1802				    tmp_val);
1803			}
1804
1805			tmp_val = val->rcv_next;
1806			(void) rctl_val_list_delete(&rctl->rc_projdb, val);
1807			val = tmp_val;
1808			modified = 1;
1809
1810		} else
1811			val = val->rcv_next;
1812	}
1813
1814	/*
1815	 * Insert new values from the project database.
1816	 */
1817	while (new_values != NULL) {
1818		next = new_values->rcv_next;
1819
1820		/*
1821		 * Insert this new value into the rc_projdb, and duplicate this
1822		 * entry to the active list.
1823		 */
1824		if (rctl_val_list_insert(&rctl->rc_projdb, new_values) == 0) {
1825
1826			tmp_val = alloc_values->rcv_next;
1827			bcopy(new_values, alloc_values, sizeof (rctl_val_t));
1828			alloc_values->rcv_next = tmp_val;
1829
1830			if (rctl_val_list_insert(&rctl->rc_values,
1831			    alloc_values) == 0) {
1832				/* inserted move alloc_values on */
1833				alloc_values = tmp_val;
1834				modified = 1;
1835			}
1836		} else {
1837			/*
1838			 * Unlike setrctl() we don't want to return an error on
1839			 * a duplicate entry; we are concerned solely with
1840			 * ensuring that all the values specified are set.
1841			 */
1842			kmem_cache_free(rctl_val_cache, new_values);
1843		}
1844		new_values = next;
1845	}
1846
1847	/* Teardown any unused rctl_val_t */
1848	while (alloc_values != NULL) {
1849		tmp_val = alloc_values;
1850		alloc_values = alloc_values->rcv_next;
1851		kmem_cache_free(rctl_val_cache, tmp_val);
1852	}
1853
1854	/* Reset the cursor if rctl values have been modified */
1855	if (modified) {
1856		rctl->rc_cursor = rctl->rc_values;
1857		rctl_val_list_reset(rctl->rc_cursor);
1858		RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1859		    rctl->rc_cursor->rcv_value));
1860	}
1861
1862	return (0);
1863}
1864
1865int
1866rctl_local_insert_all(rctl_hndl_t hndl, rctl_val_t *new_values,
1867    rctl_val_t *alloc_values, struct proc *p)
1868{
1869	return (rctl_local_op(hndl, new_values, alloc_values,
1870	    rctl_local_insert_all_cb, p));
1871}
1872
1873/*
1874 * rctl_local_replace_all_cb()
1875 *
1876 * Overview
1877 *   Called for RCENTITY_PROJECT rctls only, via rctlsys_projset().
1878 *
1879 *   Clears the active rctl values (rc_values), and stored values from the
1880 *   previous insertions from the project database (rc_projdb).
1881 *
1882 *   Inserts new values from the project database (new_values).  alloc_values
1883 *   should be a linked list of pre-allocated rctl_val_t, which are used to
1884 *   populate (rc_projdb).
1885 *
1886 * Return Values
1887 *   0 is always returned.
1888 */
1889/*ARGSUSED*/
1890static int
1891rctl_local_replace_all_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1892    rctl_t *rctl, rctl_val_t *new_values, rctl_val_t *alloc_values)
1893{
1894	rctl_val_t *val;
1895	rctl_val_t *next;
1896	rctl_val_t *tmp_val;
1897
1898	/* Delete all the privilege vaules */
1899	val = rctl->rc_values;
1900
1901	while (val != NULL) {
1902		if (val->rcv_privilege == RCPRIV_PRIVILEGED) {
1903			if (val->rcv_prev != NULL)
1904				val->rcv_prev->rcv_next = val->rcv_next;
1905			else
1906				rctl->rc_values = val->rcv_next;
1907
1908			if (val->rcv_next != NULL)
1909				val->rcv_next->rcv_prev = val->rcv_prev;
1910
1911			tmp_val = val;
1912			val = val->rcv_next;
1913			kmem_cache_free(rctl_val_cache, tmp_val);
1914		} else {
1915			val = val->rcv_next;
1916		}
1917	}
1918
1919	/* Delete the contents of rc_projdb */
1920	val = rctl->rc_projdb;
1921	while (val != NULL) {
1922
1923		tmp_val = val;
1924		val = val->rcv_next;
1925		kmem_cache_free(rctl_val_cache, tmp_val);
1926	}
1927	rctl->rc_projdb = NULL;
1928
1929	/*
1930	 * Insert new values from the project database.
1931	 */
1932	while (new_values != NULL) {
1933		next = new_values->rcv_next;
1934
1935		if (rctl_val_list_insert(&rctl->rc_projdb, new_values) == 0) {
1936			tmp_val = alloc_values->rcv_next;
1937			bcopy(new_values, alloc_values, sizeof (rctl_val_t));
1938			alloc_values->rcv_next = tmp_val;
1939
1940			if (rctl_val_list_insert(&rctl->rc_values,
1941			    alloc_values) == 0) {
1942				/* inserted, so move alloc_values on */
1943				alloc_values = tmp_val;
1944			}
1945		} else {
1946			/*
1947			 * Unlike setrctl() we don't want to return an error on
1948			 * a duplicate entry; we are concerned solely with
1949			 * ensuring that all the values specified are set.
1950			 */
1951			kmem_cache_free(rctl_val_cache, new_values);
1952		}
1953
1954		new_values = next;
1955	}
1956
1957	/* Teardown any unused rctl_val_t */
1958	while (alloc_values != NULL) {
1959		tmp_val = alloc_values;
1960		alloc_values = alloc_values->rcv_next;
1961		kmem_cache_free(rctl_val_cache, tmp_val);
1962	}
1963
1964	/* Always reset the cursor */
1965	rctl->rc_cursor = rctl->rc_values;
1966	rctl_val_list_reset(rctl->rc_cursor);
1967	RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1968	    rctl->rc_cursor->rcv_value));
1969
1970	return (0);
1971}
1972
1973int
1974rctl_local_replace_all(rctl_hndl_t hndl, rctl_val_t *new_values,
1975    rctl_val_t *alloc_values, struct proc *p)
1976{
1977	return (rctl_local_op(hndl, new_values, alloc_values,
1978	    rctl_local_replace_all_cb, p));
1979}
1980
1981static int
1982rctl_local_replace_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1983    rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1984{
1985	int ret;
1986	rctl_val_t *tmp;
1987
1988	/* Verify that old will be delete-able */
1989	tmp = rctl_val_list_find(&rctl->rc_values, oval);
1990	if (tmp == NULL)
1991		return (ESRCH);
1992	/*
1993	 * Caller should verify that value being deleted is not the
1994	 * system value.
1995	 */
1996	ASSERT(tmp->rcv_privilege != RCPRIV_SYSTEM);
1997
1998	/*
1999	 * rctl_local_insert_cb() does the job of flagging an error
2000	 * for any duplicate values. So, call rctl_local_insert_cb()
2001	 * for the new value first, then do deletion of the old value.
2002	 * Since this is a callback function to rctl_local_op, we can
2003	 * count on rcs_lock being held at this point. This guarantees
2004	 * that there is at no point a visible list which contains both
2005	 * new and old values.
2006	 */
2007	if (ret = rctl_local_insert_cb(hndl, p, e, rctl, NULL, nval))
2008		return (ret);
2009
2010	ret = rctl_local_delete_cb(hndl, p, e, rctl, NULL, oval);
2011	ASSERT(ret == 0);
2012	return (0);
2013}
2014
2015/*
2016 * int rctl_local_replace(rctl_hndl_t, void *, int, uint64_t *)
2017 *
2018 * Overview
2019 *   Replace the rctl value with a new one.
2020 *
2021 * Return values
2022 *   0 for successful replace, errno otherwise.
2023 */
2024int
2025rctl_local_replace(rctl_hndl_t hndl, rctl_val_t *oval, rctl_val_t *nval,
2026    struct proc *p)
2027{
2028	return (rctl_local_op(hndl, oval, nval, rctl_local_replace_cb, p));
2029}
2030
2031/*
2032 * int rctl_rlimit_get(rctl_hndl_t, struct proc *, struct rlimit64 *)
2033 *
2034 * Overview
2035 *   To support rlimit compatibility, we need a function which takes a 64-bit
2036 *   rlimit and encodes it as appropriate rcontrol values on the given rcontrol.
2037 *   This operation is only intended for legacy rlimits.
2038 */
2039int
2040rctl_rlimit_get(rctl_hndl_t rc, struct proc *p, struct rlimit64 *rlp64)
2041{
2042	rctl_t *rctl;
2043	rctl_val_t *rval;
2044	rctl_set_t *rset = p->p_rctls;
2045	int soft_limit_seen = 0;
2046	int test_for_deny = 1;
2047
2048	mutex_enter(&rset->rcs_lock);
2049	if (rctl_set_find(rset, rc, &rctl) == -1) {
2050		mutex_exit(&rset->rcs_lock);
2051		return (-1);
2052	}
2053
2054	rval = rctl->rc_values;
2055
2056	if (rctl->rc_dict_entry->rcd_flagaction & (RCTL_GLOBAL_DENY_NEVER |
2057	    RCTL_GLOBAL_DENY_ALWAYS))
2058		test_for_deny = 0;
2059
2060	/*
2061	 * 1.  Find the first control value with the RCTL_LOCAL_DENY bit set.
2062	 */
2063	while (rval != NULL && rval->rcv_privilege != RCPRIV_SYSTEM) {
2064		if (test_for_deny &&
2065		    (rval->rcv_flagaction & RCTL_LOCAL_DENY) == 0) {
2066			rval = rval->rcv_next;
2067			continue;
2068		}
2069
2070		/*
2071		 * 2.  If this is an RCPRIV_BASIC value, then we've found the
2072		 * effective soft limit and should set rlim_cur.  We should then
2073		 * continue looking for another control value with the DENY bit
2074		 * set.
2075		 */
2076		if (rval->rcv_privilege == RCPRIV_BASIC) {
2077			if (soft_limit_seen) {
2078				rval = rval->rcv_next;
2079				continue;
2080			}
2081
2082			if ((rval->rcv_flagaction & RCTL_LOCAL_MAXIMAL) == 0 &&
2083			    rval->rcv_value < rctl_model_maximum(
2084			    rctl->rc_dict_entry, p))
2085				rlp64->rlim_cur = rval->rcv_value;
2086			else
2087				rlp64->rlim_cur = RLIM64_INFINITY;
2088			soft_limit_seen = 1;
2089
2090			rval = rval->rcv_next;
2091			continue;
2092		}
2093
2094		/*
2095		 * 3.  This is an RCPRIV_PRIVILEGED value.  If we haven't found
2096		 * a soft limit candidate, then we've found the effective hard
2097		 * and soft limits and should set both  If we had found a soft
2098		 * limit, then this is only the hard limit and we need only set
2099		 * rlim_max.
2100		 */
2101		if ((rval->rcv_flagaction & RCTL_LOCAL_MAXIMAL) == 0 &&
2102		    rval->rcv_value < rctl_model_maximum(rctl->rc_dict_entry,
2103		    p))
2104			rlp64->rlim_max = rval->rcv_value;
2105		else
2106			rlp64->rlim_max = RLIM64_INFINITY;
2107		if (!soft_limit_seen)
2108			rlp64->rlim_cur = rlp64->rlim_max;
2109
2110		mutex_exit(&rset->rcs_lock);
2111		return (0);
2112	}
2113
2114	if (rval == NULL) {
2115		/*
2116		 * This control sequence is corrupt, as it is not terminated by
2117		 * a system privileged control value.
2118		 */
2119		mutex_exit(&rset->rcs_lock);
2120		return (-1);
2121	}
2122
2123	/*
2124	 * 4.  If we run into a RCPRIV_SYSTEM value, then the hard limit (and
2125	 * the soft, if we haven't a soft candidate) should be the value of the
2126	 * system control value.
2127	 */
2128	if ((rval->rcv_flagaction & RCTL_LOCAL_MAXIMAL) == 0 &&
2129	    rval->rcv_value < rctl_model_maximum(rctl->rc_dict_entry, p))
2130		rlp64->rlim_max = rval->rcv_value;
2131	else
2132		rlp64->rlim_max = RLIM64_INFINITY;
2133
2134	if (!soft_limit_seen)
2135		rlp64->rlim_cur = rlp64->rlim_max;
2136
2137	mutex_exit(&rset->rcs_lock);
2138	return (0);
2139}
2140
2141/*
2142 * rctl_alloc_gp_t *rctl_rlimit_set_prealloc(uint_t)
2143 *
2144 * Overview
2145 *   Before making a series of calls to rctl_rlimit_set(), we must have a
2146 *   preallocated batch of resource control values, as rctl_rlimit_set() can
2147 *   potentially consume two resource control values per call.
2148 *
2149 * Return values
2150 *   A populated resource control allocation group with 2n resource control
2151 *   values.
2152 *
2153 * Caller's context
2154 *   Must be safe for KM_SLEEP allocations.
2155 */
2156rctl_alloc_gp_t *
2157rctl_rlimit_set_prealloc(uint_t n)
2158{
2159	rctl_alloc_gp_t *gp = kmem_zalloc(sizeof (rctl_alloc_gp_t), KM_SLEEP);
2160
2161	ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
2162
2163	gp->rcag_nvals = 2 * n;
2164
2165	rctl_gp_alloc(gp);
2166
2167	return (gp);
2168}
2169
2170/*
2171 * int rctl_rlimit_set(rctl_hndl_t, struct proc *, struct rlimit64 *, int,
2172 *   int)
2173 *
2174 * Overview
2175 *   To support rlimit compatibility, we need a function which takes a 64-bit
2176 *   rlimit and encodes it as appropriate rcontrol values on the given rcontrol.
2177 *   This operation is only intended for legacy rlimits.
2178 *
2179 *   The implementation of rctl_rlimit_set() is a bit clever, as it tries to
2180 *   minimize the number of values placed on the value sequence in various
2181 *   cases.  Furthermore, we don't allow multiple identical privilege-action
2182 *   values on the same sequence.  (That is, we don't want a sequence like
2183 *   "while (1) { rlim.rlim_cur++; setrlimit(..., rlim); }" to exhaust kernel
2184 *   memory.)  So we want to delete any values with the same privilege value and
2185 *   action.
2186 *
2187 * Return values
2188 *   0 for successful set, errno otherwise. Errno will be either EINVAL
2189 *   or EPERM, in keeping with defined errnos for ulimit() and setrlimit()
2190 *   system calls.
2191 */
2192/*ARGSUSED*/
2193int
2194rctl_rlimit_set(rctl_hndl_t rc, struct proc *p, struct rlimit64 *rlp64,
2195    rctl_alloc_gp_t *ragp, int flagaction, int signal, const cred_t *cr)
2196{
2197	rctl_t *rctl;
2198	rctl_val_t *rval, *rval_priv, *rval_basic;
2199	rctl_set_t *rset = p->p_rctls;
2200	rctl_qty_t max;
2201	rctl_entity_p_t e;
2202	struct rlimit64 cur_rl;
2203
2204	e.rcep_t = RCENTITY_PROCESS;
2205	e.rcep_p.proc = p;
2206
2207	if (rlp64->rlim_cur > rlp64->rlim_max)
2208		return (EINVAL);
2209
2210	if (rctl_rlimit_get(rc, p, &cur_rl) == -1)
2211		return (EINVAL);
2212
2213	/*
2214	 * If we are not privileged, we can only lower the hard limit.
2215	 */
2216	if ((rlp64->rlim_max > cur_rl.rlim_max) &&
2217	    cur_rl.rlim_max != RLIM64_INFINITY &&
2218	    secpolicy_resource(cr) != 0)
2219		return (EPERM);
2220
2221	mutex_enter(&rset->rcs_lock);
2222
2223	if (rctl_set_find(rset, rc, &rctl) == -1) {
2224		mutex_exit(&rset->rcs_lock);
2225		return (EINVAL);
2226	}
2227
2228	rval_priv = rctl_gp_detach_val(ragp);
2229
2230	rval = rctl->rc_values;
2231
2232	while (rval != NULL) {
2233		rctl_val_t *next = rval->rcv_next;
2234
2235		if (rval->rcv_privilege == RCPRIV_SYSTEM)
2236			break;
2237
2238		if ((rval->rcv_privilege == RCPRIV_BASIC) ||
2239		    (rval->rcv_flagaction & ~RCTL_LOCAL_ACTION_MASK) ==
2240		    (flagaction & ~RCTL_LOCAL_ACTION_MASK)) {
2241			if (rctl->rc_cursor == rval) {
2242				rctl->rc_cursor = rval->rcv_next;
2243				rctl_val_list_reset(rctl->rc_cursor);
2244				RCTLOP_SET(rctl, p, &e, rctl_model_value(
2245				    rctl->rc_dict_entry, p,
2246				    rctl->rc_cursor->rcv_value));
2247			}
2248			(void) rctl_val_list_delete(&rctl->rc_values, rval);
2249		}
2250
2251		rval = next;
2252	}
2253
2254	rval_priv->rcv_privilege = RCPRIV_PRIVILEGED;
2255	rval_priv->rcv_flagaction = flagaction;
2256	if (rlp64->rlim_max == RLIM64_INFINITY) {
2257		rval_priv->rcv_flagaction |= RCTL_LOCAL_MAXIMAL;
2258		max = rctl->rc_dict_entry->rcd_max_native;
2259	} else {
2260		max = rlp64->rlim_max;
2261	}
2262	rval_priv->rcv_value = max;
2263	rval_priv->rcv_action_signal = signal;
2264	rval_priv->rcv_action_recipient = NULL;
2265	rval_priv->rcv_action_recip_pid = -1;
2266	rval_priv->rcv_firing_time = 0;
2267	rval_priv->rcv_prev = rval_priv->rcv_next = NULL;
2268
2269	(void) rctl_val_list_insert(&rctl->rc_values, rval_priv);
2270	rctl->rc_cursor = rval_priv;
2271	rctl_val_list_reset(rctl->rc_cursor);
2272	RCTLOP_SET(rctl, p, &e, rctl_model_value(rctl->rc_dict_entry, p,
2273	    rctl->rc_cursor->rcv_value));
2274
2275	if (rlp64->rlim_cur != RLIM64_INFINITY && rlp64->rlim_cur < max) {
2276		rval_basic = rctl_gp_detach_val(ragp);
2277
2278		rval_basic->rcv_privilege = RCPRIV_BASIC;
2279		rval_basic->rcv_value = rlp64->rlim_cur;
2280		rval_basic->rcv_flagaction = flagaction;
2281		rval_basic->rcv_action_signal = signal;
2282		rval_basic->rcv_action_recipient = p;
2283		rval_basic->rcv_action_recip_pid = p->p_pid;
2284		rval_basic->rcv_firing_time = 0;
2285		rval_basic->rcv_prev = rval_basic->rcv_next = NULL;
2286
2287		(void) rctl_val_list_insert(&rctl->rc_values, rval_basic);
2288		rctl->rc_cursor = rval_basic;
2289		rctl_val_list_reset(rctl->rc_cursor);
2290		RCTLOP_SET(rctl, p, &e, rctl_model_value(rctl->rc_dict_entry, p,
2291		    rctl->rc_cursor->rcv_value));
2292	}
2293
2294	ASSERT(rctl->rc_cursor != NULL);
2295
2296	mutex_exit(&rset->rcs_lock);
2297	return (0);
2298}
2299
2300
2301/*
2302 * rctl_hndl_t rctl_register(const char *, rctl_entity_t, int, rlim64_t,
2303 *   rlim64_t, rctl_ops_t *)
2304 *
2305 * Overview
2306 *   rctl_register() performs a look-up in the dictionary of rctls
2307 *   active on the system; if a rctl of that name is absent, an entry is
2308 *   made into the dictionary.  The rctl is returned with its reference
2309 *   count incremented by one.  If the rctl name already exists, we panic.
2310 *   (Were the resource control system to support dynamic loading and unloading,
2311 *   which it is structured for, duplicate registration should lead to load
2312 *   failure instead of panicking.)
2313 *
2314 *   Each registered rctl has a requirement that a RCPRIV_SYSTEM limit be
2315 *   defined.  This limit contains the highest possible value for this quantity
2316 *   on the system.  Furthermore, the registered control must provide infinite
2317 *   values for all applicable address space models supported by the operating
2318 *   system.  Attempts to set resource control values beyond the system limit
2319 *   will fail.
2320 *
2321 * Return values
2322 *   The rctl's ID.
2323 *
2324 * Caller's context
2325 *   Caller must be in a context suitable for KM_SLEEP allocations.
2326 */
2327rctl_hndl_t
2328rctl_register(
2329    const char *name,
2330    rctl_entity_t entity,
2331    int global_flags,
2332    rlim64_t max_native,
2333    rlim64_t max_ilp32,
2334    rctl_ops_t *ops)
2335{
2336	rctl_t *rctl = kmem_cache_alloc(rctl_cache, KM_SLEEP);
2337	rctl_val_t *rctl_val = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2338	rctl_dict_entry_t *rctl_de = kmem_zalloc(sizeof (rctl_dict_entry_t),
2339	    KM_SLEEP);
2340	rctl_t *old_rctl;
2341	rctl_hndl_t rhndl;
2342	int localflags;
2343
2344	ASSERT(ops != NULL);
2345
2346	bzero(rctl, sizeof (rctl_t));
2347	bzero(rctl_val, sizeof (rctl_val_t));
2348
2349	if (global_flags & RCTL_GLOBAL_DENY_NEVER)
2350		localflags = RCTL_LOCAL_MAXIMAL;
2351	else
2352		localflags = RCTL_LOCAL_MAXIMAL | RCTL_LOCAL_DENY;
2353
2354	rctl_val->rcv_privilege = RCPRIV_SYSTEM;
2355	rctl_val->rcv_value = max_native;
2356	rctl_val->rcv_flagaction = localflags;
2357	rctl_val->rcv_action_signal = 0;
2358	rctl_val->rcv_action_recipient = NULL;
2359	rctl_val->rcv_action_recip_pid = -1;
2360	rctl_val->rcv_firing_time = 0;
2361	rctl_val->rcv_next = NULL;
2362	rctl_val->rcv_prev = NULL;
2363
2364	rctl_de->rcd_name = (char *)name;
2365	rctl_de->rcd_default_value = rctl_val;
2366	rctl_de->rcd_max_native = max_native;
2367	rctl_de->rcd_max_ilp32 = max_ilp32;
2368	rctl_de->rcd_entity = entity;
2369	rctl_de->rcd_ops = ops;
2370	rctl_de->rcd_flagaction = global_flags;
2371
2372	rctl->rc_dict_entry = rctl_de;
2373	rctl->rc_values = rctl_val;
2374
2375	/*
2376	 * 1.  Take global lock, validate nonexistence of name, get ID.
2377	 */
2378	mutex_enter(&rctl_dict_lock);
2379
2380	if (mod_hash_find(rctl_dict_by_name, (mod_hash_key_t)name,
2381	    (mod_hash_val_t *)&rhndl) != MH_ERR_NOTFOUND)
2382		panic("duplicate registration of rctl %s", name);
2383
2384	rhndl = rctl_de->rcd_id = rctl->rc_id =
2385	    (rctl_hndl_t)id_alloc(rctl_ids);
2386
2387	/*
2388	 * 2.  Insert name-entry pair in rctl_dict_by_name.
2389	 */
2390	if (mod_hash_insert(rctl_dict_by_name, (mod_hash_key_t)name,
2391	    (mod_hash_val_t)rctl_de))
2392		panic("unable to insert rctl dict entry for %s (%u)", name,
2393		    (uint_t)rctl->rc_id);
2394
2395	/*
2396	 * 3.  Insert ID-rctl_t * pair in rctl_dict.
2397	 */
2398	if (mod_hash_find(rctl_dict, (mod_hash_key_t)(uintptr_t)rctl->rc_id,
2399	    (mod_hash_val_t *)&old_rctl) != MH_ERR_NOTFOUND)
2400		panic("duplicate rctl ID %u registered", rctl->rc_id);
2401
2402	if (mod_hash_insert(rctl_dict, (mod_hash_key_t)(uintptr_t)rctl->rc_id,
2403	    (mod_hash_val_t)rctl))
2404		panic("unable to insert rctl %s/%u (%p)", name,
2405		    (uint_t)rctl->rc_id, (void *)rctl);
2406
2407	/*
2408	 * 3a. Insert rctl_dict_entry_t * in appropriate entity list.
2409	 */
2410
2411	mutex_enter(&rctl_lists_lock);
2412
2413	switch (entity) {
2414	case RCENTITY_ZONE:
2415	case RCENTITY_PROJECT:
2416	case RCENTITY_TASK:
2417	case RCENTITY_PROCESS:
2418		rctl_de->rcd_next = rctl_lists[entity];
2419		rctl_lists[entity] = rctl_de;
2420		break;
2421	default:
2422		panic("registering unknown rctl entity %d (%s)", entity,
2423		    name);
2424		break;
2425	}
2426
2427	mutex_exit(&rctl_lists_lock);
2428
2429	/*
2430	 * 4.  Drop lock.
2431	 */
2432	mutex_exit(&rctl_dict_lock);
2433
2434	return (rhndl);
2435}
2436
2437/*
2438 * static int rctl_global_action(rctl_t *r, rctl_set_t *rset, struct proc *p,
2439 *    rctl_val_t *v)
2440 *
2441 * Overview
2442 *   rctl_global_action() takes, in according with the flags on the rctl_dict
2443 *   entry for the given control, the appropriate actions on the exceeded
2444 *   control value.  Additionally, rctl_global_action() updates the firing time
2445 *   on the exceeded value.
2446 *
2447 * Return values
2448 *   A bitmask reflecting the actions actually taken.
2449 *
2450 * Caller's context
2451 *   No restrictions on context.
2452 */
2453/*ARGSUSED*/
2454static int
2455rctl_global_action(rctl_t *r, rctl_set_t *rset, struct proc *p, rctl_val_t *v)
2456{
2457	rctl_dict_entry_t *rde = r->rc_dict_entry;
2458	const char *pr, *en, *idstr;
2459	id_t id;
2460	enum {
2461		SUFFIX_NONE,	/* id consumed directly */
2462		SUFFIX_NUMERIC,	/* id consumed in suffix */
2463		SUFFIX_STRING	/* idstr consumed in suffix */
2464	} suffix = SUFFIX_NONE;
2465	int ret = 0;
2466
2467	v->rcv_firing_time = gethrtime();
2468
2469	switch (v->rcv_privilege) {
2470	case RCPRIV_BASIC:
2471		pr = "basic";
2472		break;
2473	case RCPRIV_PRIVILEGED:
2474		pr = "privileged";
2475		break;
2476	case RCPRIV_SYSTEM:
2477		pr = "system";
2478		break;
2479	default:
2480		pr = "unknown";
2481		break;
2482	}
2483
2484	switch (rde->rcd_entity) {
2485	case RCENTITY_PROCESS:
2486		en = "process";
2487		id = p->p_pid;
2488		suffix = SUFFIX_NONE;
2489		break;
2490	case RCENTITY_TASK:
2491		en = "task";
2492		id = p->p_task->tk_tkid;
2493		suffix = SUFFIX_NUMERIC;
2494		break;
2495	case RCENTITY_PROJECT:
2496		en = "project";
2497		id = p->p_task->tk_proj->kpj_id;
2498		suffix = SUFFIX_NUMERIC;
2499		break;
2500	case RCENTITY_ZONE:
2501		en = "zone";
2502		idstr = p->p_zone->zone_name;
2503		suffix = SUFFIX_STRING;
2504		break;
2505	default:
2506		en = "unknown entity associated with process";
2507		id = p->p_pid;
2508		suffix = SUFFIX_NONE;
2509		break;
2510	}
2511
2512	if (rde->rcd_flagaction & RCTL_GLOBAL_SYSLOG) {
2513		switch (suffix) {
2514		default:
2515		case SUFFIX_NONE:
2516			(void) strlog(0, 0, 0,
2517			    rde->rcd_strlog_flags | log_global.lz_active,
2518			    "%s rctl %s (value %llu) exceeded by %s %d.",
2519			    pr, rde->rcd_name, v->rcv_value, en, id);
2520			break;
2521		case SUFFIX_NUMERIC:
2522			(void) strlog(0, 0, 0,
2523			    rde->rcd_strlog_flags | log_global.lz_active,
2524			    "%s rctl %s (value %llu) exceeded by process %d"
2525			    " in %s %d.",
2526			    pr, rde->rcd_name, v->rcv_value, p->p_pid,
2527			    en, id);
2528			break;
2529		case SUFFIX_STRING:
2530			(void) strlog(0, 0, 0,
2531			    rde->rcd_strlog_flags | log_global.lz_active,
2532			    "%s rctl %s (value %llu) exceeded by process %d"
2533			    " in %s %s.",
2534			    pr, rde->rcd_name, v->rcv_value, p->p_pid,
2535			    en, idstr);
2536			break;
2537		}
2538	}
2539
2540	if (rde->rcd_flagaction & RCTL_GLOBAL_DENY_ALWAYS)
2541		ret |= RCT_DENY;
2542
2543	return (ret);
2544}
2545
/*
 * static int rctl_local_action(rctl_t *, rctl_set_t *, struct proc *,
 *    rctl_val_t *, uint_t)
 *
 * Overview
 *   Takes the local actions recorded on the value v (RCTL_LOCAL_SIGNAL and
 *   RCTL_LOCAL_DENY) on behalf of process p.  The safety argument limits
 *   what may be done:  with RCA_UNSAFE_ALL only denial is reported and no
 *   signal is sent; with RCA_SAFE a sigqueue_t is allocated so the signal
 *   carries SI_RCTL siginfo; otherwise a signal is posted without siginfo.
 *
 *   A signal goes to p when the value names no recipient or names p itself.
 *   When another recipient is named and the control is not
 *   RCTL_GLOBAL_UNOBSERVABLE, the recipient is looked up by pid and
 *   signalled only if it is still the same proc_t and still resolves to
 *   rset.  Denial is only reported when the recipient is NULL or p.
 *
 * Return values
 *   Bitmask of RCT_DENY, RCT_SIGNAL and RCT_LK_ABANDONED.  When
 *   RCT_LK_ABANDONED is set, the function returns with p->p_lock held but
 *   without rset->rcs_lock.
 *
 * Caller's context
 *   NOTE(review): the body drops and retakes rset->rcs_lock and p->p_lock,
 *   so both are presumably held on entry for the signalling paths -- confirm
 *   against the callers.
 */
2546static int
2547rctl_local_action(rctl_t *r, rctl_set_t *rset, struct proc *p, rctl_val_t *v,
2548    uint_t safety)
2549{
2550	int ret = 0;
2551	sigqueue_t *sqp = NULL;
2552	rctl_dict_entry_t *rde = r->rc_dict_entry;
2553	int unobservable = (rde->rcd_flagaction & RCTL_GLOBAL_UNOBSERVABLE);
2554
	/*
	 * Everything needed from v is copied into locals here, before any
	 * lock is dropped below; v itself is not dereferenced again.
	 */
2555	proc_t *recipient = v->rcv_action_recipient;
2556	id_t recip_pid = v->rcv_action_recip_pid;
2557	int recip_signal = v->rcv_action_signal;
2558	uint_t flagaction = v->rcv_flagaction;
2559
	/* Fully unsafe context: report denial only, never signal. */
2560	if (safety == RCA_UNSAFE_ALL) {
2561		if (flagaction & RCTL_LOCAL_DENY) {
2562			ret |= RCT_DENY;
2563		}
2564		return (ret);
2565	}
2566
2567	if (flagaction & RCTL_LOCAL_SIGNAL) {
2568		/*
2569		 * We can build a siginfo only in the case that it is
2570		 * safe for us to drop p_lock.  (For asynchronous
2571		 * checks this is currently not true.)
2572		 */
2573		if (safety == RCA_SAFE) {
			/*
			 * Both locks are released around the KM_SLEEP
			 * allocation and retaken in the opposite order.
			 */
2574			mutex_exit(&rset->rcs_lock);
2575			mutex_exit(&p->p_lock);
2576			sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
2577			mutex_enter(&p->p_lock);
2578			mutex_enter(&rset->rcs_lock);
2579
2580			sqp->sq_info.si_signo = recip_signal;
2581			sqp->sq_info.si_code = SI_RCTL;
2582			sqp->sq_info.si_errno = 0;
2583			sqp->sq_info.si_entity = (int)rde->rcd_entity;
2584		}
2585
		/* Case 1: the violating process is itself the recipient. */
2586		if (recipient == NULL || recipient == p) {
2587			ret |= RCT_SIGNAL;
2588
2589			if (sqp == NULL) {
2590				sigtoproc(p, NULL, recip_signal);
2591			} else if (p == curproc) {
2592				/*
2593				 * Then this is a synchronous test and we can
2594				 * direct the signal at the violating thread.
2595				 */
2596				sigaddqa(curproc, curthread, sqp);
2597			} else {
2598				sigaddqa(p, NULL, sqp);
2599			}
		/* Case 2: a different, observable recipient process. */
2600		} else if (!unobservable) {
2601			proc_t *rp;
2602
2603			mutex_exit(&rset->rcs_lock);
2604			mutex_exit(&p->p_lock);
2605
			/*
			 * The stored pid must still map to the very same
			 * proc_t that was recorded on the value.
			 */
2606			mutex_enter(&pidlock);
2607			if ((rp = prfind(recip_pid)) == recipient) {
2608				/*
2609				 * Recipient process is still alive, but may not
2610				 * be in this task or project any longer.  In
2611				 * this case, the recipient's resource control
2612				 * set pertinent to this control will have
2613				 * changed--and we will not deliver the signal,
2614				 * as the recipient process is trying to tear
2615				 * itself off of its former set.
2616				 */
2617				mutex_enter(&rp->p_lock);
2618				mutex_exit(&pidlock);
2619
2620				if (rctl_entity_obtain_rset(rde, rp) == rset) {
2621					ret |= RCT_SIGNAL;
2622
2623					if (sqp == NULL)
2624						sigtoproc(rp, NULL,
2625						    recip_signal);
2626					else
2627						sigaddqa(rp, NULL, sqp);
2628				} else if (sqp) {
					/* Not delivered: release the siginfo. */
2629					kmem_free(sqp, sizeof (sigqueue_t));
2630				}
2631				mutex_exit(&rp->p_lock);
2632			} else {
2633				mutex_exit(&pidlock);
2634				if (sqp)
2635					kmem_free(sqp, sizeof (sigqueue_t));
2636			}
2637
2638			mutex_enter(&p->p_lock);
2639			/*
2640			 * Since we dropped p_lock, we may no longer be in the
2641			 * same task or project as we were at entry.  It is thus
2642			 * unsafe for us to reacquire the set lock at this
2643			 * point; callers of rctl_local_action() must handle
2644			 * this possibility.
2645			 */
2646			ret |= RCT_LK_ABANDONED;
		/*
		 * Case 3: another recipient on an unobservable control; no
		 * signal is sent and any siginfo built above is released.
		 */
2647		} else if (sqp) {
2648			kmem_free(sqp, sizeof (sigqueue_t));
2649		}
2650	}
2651
	/* Denial applies only when the value targets p (or no one). */
2652	if ((flagaction & RCTL_LOCAL_DENY) &&
2653	    (recipient == NULL || recipient == p)) {
2654		ret |= RCT_DENY;
2655	}
2656
2657	return (ret);
2658}
2659
2660/*
2661 * int rctl_action(rctl_hndl_t, rctl_set_t *, struct proc *, uint_t)
2662 *
2663 * Overview
2664 *   Take the action associated with the enforced value (as defined by
2665 *   rctl_get_enforced_value()) being exceeded or encountered.  Possibly perform
2666 *   a restricted subset of the available actions, if circumstances dictate that
2667 *   we cannot safely allocate memory (for a sigqueue_t) or guarantee process
2668 *   persistence across the duration of the function (an asynchronous action).
2669 *
2670 * Return values
2671 *   Actions taken, according to the rctl_test bitmask.
2672 *
2673 * Caller's context
2674 *   Safe to acquire rcs_lock.
2675 */
2676int
2677rctl_action(rctl_hndl_t hndl, rctl_set_t *rset, struct proc *p, uint_t safety)
2678{
2679	return (rctl_action_entity(hndl, rset, p, NULL, safety));
2680}
2681
2682int
2683rctl_action_entity(rctl_hndl_t hndl, rctl_set_t *rset, struct proc *p,
2684    rctl_entity_p_t *e, uint_t safety)
2685{
2686	int ret = RCT_NONE;
2687	rctl_t *lrctl;
2688	rctl_entity_p_t e_tmp;
2689
2690rctl_action_acquire:
2691	mutex_enter(&rset->rcs_lock);
2692	if (rctl_set_find(rset, hndl, &lrctl) == -1) {
2693		mutex_exit(&rset->rcs_lock);
2694		return (ret);
2695	}
2696
2697	if (e == NULL) {
2698		rctl_entity_obtain_entity_p(lrctl->rc_dict_entry->rcd_entity,
2699		    p, &e_tmp);
2700		e = &e_tmp;
2701	}
2702
2703	if ((ret & RCT_LK_ABANDONED) == 0) {
2704		ret |= rctl_global_action(lrctl, rset, p, lrctl->rc_cursor);
2705
2706		RCTLOP_ACTION(lrctl, p, e);
2707
2708		ret |= rctl_local_action(lrctl, rset, p,
2709		    lrctl->rc_cursor, safety);
2710
2711		if (ret & RCT_LK_ABANDONED)
2712			goto rctl_action_acquire;
2713	}
2714
2715	ret &= ~RCT_LK_ABANDONED;
2716
2717	if (!(ret & RCT_DENY) &&
2718	    lrctl->rc_cursor->rcv_next != NULL) {
2719		lrctl->rc_cursor = lrctl->rc_cursor->rcv_next;
2720
2721		RCTLOP_SET(lrctl, p, e, rctl_model_value(lrctl->rc_dict_entry,
2722		    p, lrctl->rc_cursor->rcv_value));
2723
2724	}
2725	mutex_exit(&rset->rcs_lock);
2726
2727	return (ret);
2728}
2729
2730/*
2731 * int rctl_test(rctl_hndl_t, rctl_set_t *, struct proc *, rctl_qty_t, uint_t)
2732 *
2733 * Overview
2734 *   Increment the resource associated with the given handle, returning zero if
2735 *   the incremented value does not exceed the threshold for the current limit
2736 *   on the resource.
2737 *
2738 * Return values
2739 *   Actions taken, according to the rctl_test bitmask.
2740 *
2741 * Caller's context
2742 *   p_lock held by caller.
2743 */
2744/*ARGSUSED*/
2745int
2746rctl_test(rctl_hndl_t rhndl, rctl_set_t *rset, struct proc *p,
2747    rctl_qty_t incr, uint_t flags)
2748{
2749	return (rctl_test_entity(rhndl, rset, p, NULL, incr, flags));
2750}
2751
2752int
2753rctl_test_entity(rctl_hndl_t rhndl, rctl_set_t *rset, struct proc *p,
2754    rctl_entity_p_t *e, rctl_qty_t incr, uint_t flags)
2755{
2756	rctl_t *lrctl;
2757	int ret = RCT_NONE;
2758	rctl_entity_p_t e_tmp;
2759	if (p == &p0) {
2760		/*
2761		 * We don't enforce rctls on the kernel itself.
2762		 */
2763		return (ret);
2764	}
2765
2766rctl_test_acquire:
2767	ASSERT(MUTEX_HELD(&p->p_lock));
2768
2769	mutex_enter(&rset->rcs_lock);
2770
2771	/*
2772	 * Dereference from rctl_set.  We don't enforce newly loaded controls
2773	 * that haven't been set on this entity (since the only valid value is
2774	 * the infinite system value).
2775	 */
2776	if (rctl_set_find(rset, rhndl, &lrctl) == -1) {
2777		mutex_exit(&rset->rcs_lock);
2778		return (ret);
2779	}
2780
2781	/*
2782	 * This control is currently unenforced:  maximal value on control
2783	 * supporting infinitely available resource.
2784	 */
2785	if ((lrctl->rc_dict_entry->rcd_flagaction & RCTL_GLOBAL_INFINITE) &&
2786	    (lrctl->rc_cursor->rcv_flagaction & RCTL_LOCAL_MAXIMAL)) {
2787
2788		mutex_exit(&rset->rcs_lock);
2789		return (ret);
2790	}
2791
2792	/*
2793	 * If we have been called by rctl_test, look up the entity pointer
2794	 * from the proc pointer.
2795	 */
2796	if (e == NULL) {
2797		rctl_entity_obtain_entity_p(lrctl->rc_dict_entry->rcd_entity,
2798		    p, &e_tmp);
2799		e = &e_tmp;
2800	}
2801
2802	/*
2803	 * Get enforced rctl value and current usage.  Test the increment
2804	 * with the current usage against the enforced value--take action as
2805	 * necessary.
2806	 */
2807	while (RCTLOP_TEST(lrctl, p, e, lrctl->rc_cursor, incr, flags)) {
2808		if ((ret & RCT_LK_ABANDONED) == 0) {
2809			ret |= rctl_global_action(lrctl, rset, p,
2810			    lrctl->rc_cursor);
2811
2812			RCTLOP_ACTION(lrctl, p, e);
2813
2814			ret |= rctl_local_action(lrctl, rset, p,
2815			    lrctl->rc_cursor, flags);
2816
2817			if (ret & RCT_LK_ABANDONED)
2818				goto rctl_test_acquire;
2819		}
2820
2821		ret &= ~RCT_LK_ABANDONED;
2822
2823		if ((ret & RCT_DENY) == RCT_DENY ||
2824		    lrctl->rc_cursor->rcv_next == NULL) {
2825			ret |= RCT_DENY;
2826			break;
2827		}
2828
2829		lrctl->rc_cursor = lrctl->rc_cursor->rcv_next;
2830		RCTLOP_SET(lrctl, p, e, rctl_model_value(lrctl->rc_dict_entry,
2831		    p, lrctl->rc_cursor->rcv_value));
2832	}
2833
2834	mutex_exit(&rset->rcs_lock);
2835
2836	return (ret);
2837}
2838
2839/*
2840 * void rctl_init(void)
2841 *
2842 * Overview
2843 *   Initialize the rctl subsystem, including the primoridal rctls
2844 *   provided by the system.  New subsystem-specific rctls should _not_ be
2845 *   initialized here.  (Do it in your own file.)
2846 *
2847 * Return values
2848 *   None.
2849 *
2850 * Caller's context
2851 *   Safe for KM_SLEEP allocations.  Must be called prior to any process model
2852 *   initialization.
2853 */
2854void
2855rctl_init(void)
2856{
2857	rctl_cache = kmem_cache_create("rctl_cache", sizeof (rctl_t),
2858	    0, NULL, NULL, NULL, NULL, NULL, 0);
2859	rctl_val_cache = kmem_cache_create("rctl_val_cache",
2860	    sizeof (rctl_val_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
2861
2862	rctl_dict = mod_hash_create_extended("rctl_dict",
2863	    rctl_dict_size, mod_hash_null_keydtor, rctl_dict_val_dtor,
2864	    rctl_dict_hash_by_id, NULL, rctl_dict_id_cmp, KM_SLEEP);
2865	rctl_dict_by_name = mod_hash_create_strhash(
2866	    "rctl_handles_by_name", rctl_dict_size,
2867	    mod_hash_null_valdtor);
2868	rctl_ids = id_space_create("rctl_ids", 1, max_rctl_hndl);
2869	bzero(rctl_lists, (RC_MAX_ENTITY + 1) * sizeof (rctl_dict_entry_t *));
2870
2871	rctlproc_init();
2872}
2873
2874/*
2875 * rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc)
2876 *
2877 * Increments the amount of locked memory on a project, and
2878 * zone. If proj is NULL, the proj and zone of proc_t p is used.  If
2879 * chargeproc is non-zero, then the charged amount is cached on p->p_locked_mem
2880 * so that the charge can be migrated when a process changes projects.
2881 *
2882 * Return values
2883 *    0 - success
2884 *    EAGAIN - attempting to increment locked memory is denied by one
2885 *      or more resource entities.
2886 */
2887int
2888rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
2889    int chargeproc)
2890{
2891	kproject_t *projp;
2892	zone_t *zonep;
2893	rctl_entity_p_t e;
2894	int ret = 0;
2895
2896	ASSERT(p != NULL);
2897	ASSERT(MUTEX_HELD(&p->p_lock));
2898	if (proj != NULL) {
2899		projp = proj;
2900		zonep = zone_find_by_id(projp->kpj_zoneid);
2901	} else {
2902		projp = p->p_task->tk_proj;
2903		zonep = p->p_zone;
2904	}
2905
2906	mutex_enter(&zonep->zone_mem_lock);
2907
2908	e.rcep_p.proj = projp;
2909	e.rcep_t = RCENTITY_PROJECT;
2910	if (projp->kpj_data.kpd_locked_mem + inc >
2911	    projp->kpj_data.kpd_locked_mem_ctl) {
2912		if (rctl_test_entity(rc_project_locked_mem, projp->kpj_rctls,
2913		    p, &e, inc, 0) & RCT_DENY) {
2914			ret = EAGAIN;
2915			goto out;
2916		}
2917	}
2918	e.rcep_p.zone = zonep;
2919	e.rcep_t = RCENTITY_ZONE;
2920	if (zonep->zone_locked_mem + inc > zonep->zone_locked_mem_ctl) {
2921		if (rctl_test_entity(rc_zone_locked_mem, zonep->zone_rctls,
2922		    p, &e, inc, 0) & RCT_DENY) {
2923			ret = EAGAIN;
2924			goto out;
2925		}
2926	}
2927
2928	zonep->zone_locked_mem += inc;
2929	projp->kpj_data.kpd_locked_mem += inc;
2930	if (chargeproc != 0) {
2931		p->p_locked_mem += inc;
2932	}
2933out:
2934	mutex_exit(&zonep->zone_mem_lock);
2935	if (proj != NULL)
2936		zone_rele(zonep);
2937	return (ret);
2938}
2939
2940/*
2941 * rctl_decr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc)
2942 *
2943 * Decrements the amount of locked memory on a project and
2944 * zone.  If proj is NULL, the proj and zone of proc_t p is used.  If
2945 * creditproc is non-zero, then the quantity of locked memory is subtracted
2946 * from p->p_locked_mem.
2947 *
2948 * Return values
2949 *   none
2950 */
2951void
2952rctl_decr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
2953    int creditproc)
2954{
2955	kproject_t *projp;
2956	zone_t *zonep;
2957
2958	if (proj != NULL) {
2959		projp = proj;
2960		zonep = zone_find_by_id(projp->kpj_zoneid);
2961	} else {
2962		ASSERT(p != NULL);
2963		ASSERT(MUTEX_HELD(&p->p_lock));
2964		projp = p->p_task->tk_proj;
2965		zonep = p->p_zone;
2966	}
2967
2968	mutex_enter(&zonep->zone_mem_lock);
2969	zonep->zone_locked_mem -= inc;
2970	projp->kpj_data.kpd_locked_mem -= inc;
2971	if (creditproc != 0) {
2972		ASSERT(p != NULL);
2973		ASSERT(MUTEX_HELD(&p->p_lock));
2974		p->p_locked_mem -= inc;
2975	}
2976	mutex_exit(&zonep->zone_mem_lock);
2977	if (proj != NULL)
2978		zone_rele(zonep);
2979}
2980
2981/*
2982 * rctl_incr_swap(proc_t *, zone_t *, size_t)
2983 *
2984 * Overview
2985 *   Increments the swap charge on the specified zone.
2986 *
2987 * Return values
2988 *   0 on success.  EAGAIN if swap increment fails due an rctl value
2989 *   on the zone.
2990 *
2991 * Callers context
2992 *   p_lock held on specified proc.
2993 *   swap must be even multiple of PAGESIZE
2994 */
2995int
2996rctl_incr_swap(proc_t *proc, zone_t *zone, size_t swap)
2997{
2998	rctl_entity_p_t e;
2999
3000	ASSERT(MUTEX_HELD(&proc->p_lock));
3001	ASSERT((swap & PAGEOFFSET) == 0);
3002	e.rcep_p.zone = zone;
3003	e.rcep_t = RCENTITY_ZONE;
3004
3005	mutex_enter(&zone->zone_mem_lock);
3006
3007	if ((zone->zone_max_swap + swap) >
3008	    zone->zone_max_swap_ctl) {
3009
3010		if (rctl_test_entity(rc_zone_max_swap, zone->zone_rctls,
3011		    proc, &e, swap, 0) & RCT_DENY) {
3012			mutex_exit(&zone->zone_mem_lock);
3013			return (EAGAIN);
3014		}
3015	}
3016	zone->zone_max_swap += swap;
3017	mutex_exit(&zone->zone_mem_lock);
3018	return (0);
3019}
3020
3021/*
3022 * rctl_decr_swap(zone_t *, size_t)
3023 *
3024 * Overview
3025 *   Decrements the swap charge on the specified zone.
3026 *
3027 * Return values
3028 *   None
3029 *
3030 * Callers context
3031 *   swap must be even multiple of PAGESIZE
3032 */
3033void
3034rctl_decr_swap(zone_t *zone, size_t swap)
3035{
3036	ASSERT((swap & PAGEOFFSET) == 0);
3037	mutex_enter(&zone->zone_mem_lock);
3038	ASSERT(zone->zone_max_swap >= swap);
3039	zone->zone_max_swap -= swap;
3040	mutex_exit(&zone->zone_mem_lock);
3041}
3042
3043/*
3044 * Create resource kstat
3045 */
3046static kstat_t *
3047rctl_kstat_create_common(char *ks_name, int ks_instance, char *ks_class,
3048    uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags, int ks_zoneid)
3049{
3050	kstat_t *ksp = NULL;
3051	char name[KSTAT_STRLEN];
3052
3053	(void) snprintf(name, KSTAT_STRLEN, "%s_%d", ks_name, ks_instance);
3054
3055	if ((ksp = kstat_create_zone("caps", ks_zoneid,
3056	    name, ks_class, ks_type,
3057	    ks_ndata, ks_flags, ks_zoneid)) != NULL) {
3058		if (ks_zoneid != GLOBAL_ZONEID)
3059			kstat_zone_add(ksp, GLOBAL_ZONEID);
3060	}
3061	return (ksp);
3062}
3063
3064/*
3065 * Create zone-specific resource kstat
3066 */
3067kstat_t *
3068rctl_kstat_create_zone(zone_t *zone, char *ks_name, uchar_t ks_type,
3069    uint_t ks_ndata, uchar_t ks_flags)
3070{
3071	char name[KSTAT_STRLEN];
3072
3073	(void) snprintf(name, KSTAT_STRLEN, "%s_zone", ks_name);
3074
3075	return (rctl_kstat_create_common(name, zone->zone_id, "zone_caps",
3076	    ks_type, ks_ndata, ks_flags, zone->zone_id));
3077}
3078
3079/*
3080 * Create project-specific resource kstat
3081 */
3082kstat_t *
3083rctl_kstat_create_project(kproject_t *kpj, char *ks_name, uchar_t ks_type,
3084    uint_t ks_ndata, uchar_t ks_flags)
3085{
3086	char name[KSTAT_STRLEN];
3087
3088	(void) snprintf(name, KSTAT_STRLEN, "%s_project", ks_name);
3089
3090	return (rctl_kstat_create_common(name, kpj->kpj_id, "project_caps",
3091	    ks_type, ks_ndata, ks_flags, kpj->kpj_zoneid));
3092}
3093