/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/disp.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/atomic.h>
#include <sys/cpucaps_impl.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
#include <sys/debug.h>
#include <sys/rctl.h>
#include <sys/errno.h>

/*
 * CPU Caps implementation
 * =======================
 *
 * A CPU cap can be set on any project or any zone. The zone CPU cap limits
 * the CPU usage of all projects running inside the zone. If the zone CPU cap
 * is set below the project CPU cap, the latter will have no effect.
 *
 * When CPU usage of projects and/or zones reaches specified caps, threads in
 * them do not get scheduled and instead are placed on wait queues associated
 * with a cap. Such threads will start running again only when CPU usage drops
 * below the cap level. Each zone and each project has its own wait queue.
 *
 * When a CPU cap is set, the kernel continuously keeps track of the CPU time
 * used by capped zones and/or projects over a short time interval and
 * calculates their current CPU usage as a percentage. When the accumulated
 * usage reaches the CPU cap, LWPs running in user-land (when they are not
 * holding any critical kernel locks) are placed on special wait queues until
 * their project's or zone's CPU usage drops below the cap.
 *
 * The system maintains a list of all capped projects and all capped zones. On
 * every clock tick every active thread belonging to a capped project adds its
 * CPU usage to its project. Usage from all projects belonging to a capped zone
 * is aggregated to get the zone usage.
 *
 * When the current CPU usage is above the cap, a project or zone is considered
 * over-capped. Every user thread caught running in an over-capped project or
 * zone is marked by setting the TS_PROJWAITQ flag in the thread's t_schedflag
 * field and is requested to surrender its CPU. This causes the scheduling
 * class specific CL_PREEMPT() callback to be invoked. The callback function
 * places threads marked as TS_PROJWAITQ on a wait queue and calls swtch().
 *
 * Threads are only placed on wait queues after trapping from user-land
 * (they could be holding some user locks, but no kernel locks) and while
 * returning from the trap back to the user-land when no kernel locks are held.
 * Putting threads on wait queues in random places while running in the
 * kernel might lead to all kinds of locking problems.
 *
 * Accounting
 * ==========
 *
 * Accounting of CPU usage is based on per-thread micro-state accounting data.
 * On every clock tick clock() adds new on-CPU time for every thread found on
 * CPU. Scheduling classes also add new on-CPU time for any thread leaving CPU.
 * New time means the time since the thread was last accounted for. On-CPU
 * times greater than 1 tick are truncated to 1 tick.
 *
 * Project CPU usage is aggregated from all threads within the project.
 * Zone CPU usage is the sum of usages for all projects within the zone. Zone
 * CPU usage is calculated on every clock tick by walking the list of projects
 * and adding their usage together.
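 *
 * For illustration (assuming hz = 100, i.e. 10ms ticks): a thread that ran
 * for 3ms since it was last accounted for charges 3,000,000ns to its project,
 * while one that ran for 15ms uninterrupted charges only one tick's worth,
 * 10,000,000ns.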
 *
 * Decay
 * =====
 *
 * CPU usage is decayed by the caps_update() routine, which is called once
 * every clock tick. It walks the lists of project caps and decays their
 * usages by one percent. If CPU usage drops below cap levels, threads on the
 * wait queue are made runnable again, one thread per clock tick.
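 *
 * A rough consequence of the one-percent decay: usage settles where the
 * per-tick decay (usage / 100) balances the average per-tick charge, i.e. at
 * about 100 times that charge. A thread that is on CPU all the time thus
 * drives its project's usage towards 100 * cap_tick_cost, which is exactly
 * the cap_value stored for a cap of 100 (one full CPU).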
 *
 * Interfaces
 * ==========
 *
 * The CPU Caps facility provides the following interfaces to the rest of the
 * system:
 *
 *   cpucaps_project_add(kproject_t *)
 *
 * Notifies the framework of a new project. It should be put on the
 * capped_projects list if its zone has a cap.
 *
 *   cpucaps_project_remove(kproject_t *)
 *
 * Removes the association between the specified project and its cap.
 * Called right before the project is destroyed.
 *
 *   cpucaps_project_set(kproject_t *, rctl_qty_t)
 *
 * Sets the cap of the specified project to the specified value. Setting the
 * value to NOCAP is equivalent to removing the cap.
 *
 *   cpucaps_zone_set(zone_t *, rctl_qty_t)
 *
 * Sets the cap of the specified zone to the specified value. Setting the
 * value to NOCAP is equivalent to removing the cap.
 *
 *   cpucaps_zone_remove(zone_t *)
 *
 * Removes the association between the zone and its cap.
 *
 *   cpucaps_charge(kthread_id_t, caps_sc_t *, cpucaps_charge_t)
 *
 * Charges the specified thread's project for the on-CPU time it used.
 * If the third argument is CPUCAPS_CHARGE_ONLY, returns False. Otherwise
 * returns True if the thread should be penalized because its project or zone
 * is exceeding its cap, and sets the TS_PROJWAITQ or TS_ZONEWAITQ bits in
 * t_schedflag in that case.
 *
 *   CPUCAPS_ENFORCE(kthread_id_t)
 *
 * Enforces CPU caps for the specified thread. Places LWPs running in
 * LWP_USER state on project or zone wait queues, as requested by the
 * TS_PROJWAITQ or TS_ZONEWAITQ bits in t_schedflag. Returns True if the
 * thread was placed on a wait queue or False otherwise.
 *
 *   cpucaps_sc_init(caps_sc_t *)
 *
 * Initializes the scheduling-class specific CPU Caps data for a thread.
 *
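 * A scheduling class is expected to wire these interfaces up roughly as
 * sketched below. This is an illustrative sketch only: ts_caps is the TS
 * class flavor of the per-thread caps_sc_t, and the exact call sites vary
 * per scheduling class.
 *
 *	(on the clock tick path)
 *	if (CPUCAPS_ON() &&
 *	    cpucaps_charge(t, &tspp->ts_caps, CPUCAPS_CHARGE_ENFORCE))
 *		cpu_surrender(t);
 *
 *	(on the preemption path, from the CL_PREEMPT() callback)
 *	if (CPUCAPS_ENFORCE(t))
 *		return;		(the thread is now on a wait queue)
 *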
 * LOCKS
 * =====
 *
 * All the individual cap structures and their lists are protected by a global
 * caps_lock mutex. The lock is grabbed either by clock() or by events
 * modifying caps, so it is usually uncontended. We avoid all blocking memory
 * allocations while holding caps_lock to prevent clock() from blocking.
 *
 * Thread state is protected by the thread lock. It protects the association
 * between a thread and its project and, as a consequence, its zone. The
 * association cannot break while the thread lock is held, so the project or
 * zone cap is not going to disappear while the thread lock is held.
 *
 * The cap usage field is protected by the high-PIL spin lock cap_usagelock.
 * It is grabbed by scheduling classes already holding the thread lock at high
 * PIL and by the clock thread performing usage decay. We should do as little
 * work as possible while holding the lock, since it may be very hot. All
 * threads in the project contend for the same cache line doing cap usage
 * updates.
 */

/*
 * caps_lock protects the list of capped projects and zones, changes in the
 * cap state and changes of the global cpucaps_enabled flag.
 *
 * Changing zone caps also sets cpucaps_busy to avoid races when a zone cap is
 * modified in parallel. This could be a per-zone flag, but we don't keep any
 * per-zone cap state for now.
 */
static kmutex_t caps_lock;		/* lock to protect: */
static list_t capped_zones;		/* - list of zones with caps */
static list_t capped_projects;		/* - list of projects with caps */
boolean_t cpucaps_enabled;		/* - are there any caps defined? */
boolean_t cpucaps_busy;			/* - is framework busy? */

/*
 * The accounting is based on the number of nanoseconds threads spend running
 * during a tick. The length of one tick in nanoseconds is kept in the
 * cap_tick_cost variable.
 */
static hrtime_t cap_tick_cost;

/*
 * How much of the usage value is decayed every clock tick.
 * Decay one percent of the value per tick.
 */
#define	CAP_DECAY_FACTOR 100

/*
 * Scale the value and round it to the closest integer value.
 */
#define	ROUND_SCALE(x, y) (((x) + (y) / 2) / (y))
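
/*
 * For example, ROUND_SCALE(149, 100) evaluates to 1 and ROUND_SCALE(150, 100)
 * to 2: for the non-negative values used here, exact halves round up.
 */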

static void caps_update();

/*
 * CAP kstats.
 */
struct cap_kstat {
	kstat_named_t	cap_value;
	kstat_named_t	cap_usage;
	kstat_named_t	cap_nwait;
	kstat_named_t	cap_below;
	kstat_named_t	cap_above;
	kstat_named_t	cap_maxusage;
	kstat_named_t	cap_zonename;
} cap_kstat = {
	{ "value",	KSTAT_DATA_UINT64 },
	{ "usage",	KSTAT_DATA_UINT64 },
	{ "nwait",	KSTAT_DATA_UINT64 },
	{ "below_sec",	KSTAT_DATA_UINT64 },
	{ "above_sec",	KSTAT_DATA_UINT64 },
	{ "maxusage",	KSTAT_DATA_UINT64 },
	{ "zonename",	KSTAT_DATA_STRING },
};

static kmutex_t cap_kstat_lock;
static int cap_kstat_update(kstat_t *, int);

/*
 * Initialize CPU caps infrastructure.
 *   - Initialize lists of capped zones and capped projects
 *   - Set cpucaps_clock_callout to NULL
 */
void
cpucaps_init()
{
	/*
	 * Initialize global variables
	 */
	cap_tick_cost = TICK_TO_NSEC((hrtime_t)1);

	list_create(&capped_zones, sizeof (cpucap_t),
	    offsetof(cpucap_t, cap_link));
	list_create(&capped_projects, sizeof (cpucap_t),
	    offsetof(cpucap_t, cap_link));

	cpucaps_enabled = B_FALSE;
	cpucaps_busy = B_FALSE;
	cpucaps_clock_callout = NULL;
}

/*
 * Initialize scheduling-class specific CPU Caps data.
 */
void
cpucaps_sc_init(caps_sc_t *csc)
{
	csc->csc_cputime = 0;
}

/*
 * Allocate and initialize cpucap structure
 */
static cpucap_t *
cap_alloc(void)
{
	cpucap_t *cap = kmem_zalloc(sizeof (cpucap_t), KM_SLEEP);

	DISP_LOCK_INIT(&cap->cap_usagelock);
	waitq_init(&cap->cap_waitq);

	return (cap);
}

/*
 * Free cpucap structure
 */
static void
cap_free(cpucap_t *cap)
{
	if (cap == NULL)
		return;

	/*
	 * This cap should not be active
	 */
	ASSERT(!list_link_active(&cap->cap_link));
	ASSERT(cap->cap_value == 0);
	ASSERT(!DISP_LOCK_HELD(&cap->cap_usagelock));

	waitq_fini(&cap->cap_waitq);
	DISP_LOCK_DESTROY(&cap->cap_usagelock);

	kmem_free(cap, sizeof (cpucap_t));
}

/*
 * Activate the cap - insert it into the active list and unblock its
 * wait queue. Should be called with caps_lock held.
 * The cap_value field is set to the value supplied.
 */
static void
cap_enable(list_t *l, cpucap_t *cap, hrtime_t value)
{
	ASSERT(MUTEX_HELD(&caps_lock));

	/*
	 * The cap cannot already be enabled.
	 */
	ASSERT(!CAP_ENABLED(cap));
	ASSERT(!list_link_active(&cap->cap_link));

	list_insert_tail(l, cap);
	cap->cap_below = cap->cap_above = 0;
	cap->cap_maxusage = 0;
	cap->cap_usage = 0;
	cap->cap_value = value;
	waitq_unblock(&cap->cap_waitq);
	if (CPUCAPS_OFF()) {
		cpucaps_enabled = B_TRUE;
		cpucaps_clock_callout = caps_update;
	}
}

/*
 * Deactivate cap
 *   - Block its wait queue. This prevents any new threads from being
 *	enqueued there and moves all enqueued threads to the run queue.
 *   - Remove cap from list l.
 *   - Disable CPU caps globally if there are no capped projects or zones
 *
 * Should be called with caps_lock held.
 */
static void
cap_disable(list_t *l, cpucap_t *cap)
{
	ASSERT(MUTEX_HELD(&caps_lock));
	/*
	 * Cap should be currently active
	 */
	ASSERT(CPUCAPS_ON());
	ASSERT(list_link_active(&cap->cap_link));
	ASSERT(CAP_ENABLED(cap));

	waitq_block(&cap->cap_waitq);
	list_remove(l, cap);
	if (list_is_empty(&capped_projects) && list_is_empty(&capped_zones)) {
		cpucaps_enabled = B_FALSE;
		cpucaps_clock_callout = NULL;
	}
	cap->cap_value = 0;
	cap->cap_project = NULL;
	cap->cap_zone = NULL;
	if (cap->cap_kstat != NULL) {
		kstat_delete(cap->cap_kstat);
		cap->cap_kstat = NULL;
	}
}

/*
 * Enable cap for project kpj.
 * It is safe to enable an already enabled project cap.
 * Should be called with caps_lock held.
 */
static void
cap_project_enable(kproject_t *kpj, hrtime_t value)
{
	cpucap_t *cap = kpj->kpj_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);

	if (CAP_DISABLED(cap)) {
		ASSERT(cap->cap_kstat == NULL);
		cap_enable(&capped_projects, cap, value);
		cap->cap_project = kpj;
		cap->cap_zone = kpj->kpj_zone;

		/*
		 * Create cap kstats.
		 */
		if ((cap->cap_kstat = rctl_kstat_create_project(kpj, "cpucaps",
		    KSTAT_TYPE_NAMED,
		    sizeof (cap_kstat) / sizeof (kstat_named_t),
		    KSTAT_FLAG_VIRTUAL)) != NULL) {
			cap->cap_kstat->ks_data_size +=
			    strlen(cap->cap_zone->zone_name) + 1;
			cap->cap_kstat->ks_lock = &cap_kstat_lock;
			cap->cap_kstat->ks_data = &cap_kstat;
			cap->cap_kstat->ks_update = cap_kstat_update;
			cap->cap_kstat->ks_private = cap;
			kstat_install(cap->cap_kstat);
		}
	}
}

/*
 * Disable the project cap.
 * It is safe to disable an already disabled project cap.
 * Should be called with caps_lock held.
 */
static void
cap_project_disable(kproject_t *kpj)
{
	cpucap_t *cap = kpj->kpj_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);
	ASSERT(cap->cap_project == kpj);

	if (CAP_ENABLED(cap))
		cap_disable(&capped_projects, cap);
}

/*
 * Enable cap for a zone.
 * It is safe to enable an already enabled zone cap.
 * Should be called with caps_lock held.
 */
static void
cap_zone_enable(zone_t *zone, hrtime_t value)
{
	cpucap_t *cap = zone->zone_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);

	if (CAP_DISABLED(cap)) {
		ASSERT(cap->cap_kstat == NULL);
		cap_enable(&capped_zones, cap, value);
		cap->cap_zone = zone;

		/*
		 * Create cap kstats.
		 */
		if ((cap->cap_kstat = rctl_kstat_create_zone(zone, "cpucaps",
		    KSTAT_TYPE_NAMED,
		    sizeof (cap_kstat) / sizeof (kstat_named_t),
		    KSTAT_FLAG_VIRTUAL)) != NULL) {
			cap->cap_kstat->ks_data_size +=
			    strlen(cap->cap_zone->zone_name) + 1;
			cap->cap_kstat->ks_lock = &cap_kstat_lock;
			cap->cap_kstat->ks_data = &cap_kstat;
			cap->cap_kstat->ks_update = cap_kstat_update;
			cap->cap_kstat->ks_private = cap;
			kstat_install(cap->cap_kstat);
		}
	}
}

/*
 * Disable the zone cap.
 * It is safe to disable an already disabled zone cap.
 * Should be called with caps_lock held.
 */
static void
cap_zone_disable(zone_t *zone)
{
	cpucap_t *cap = zone->zone_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);
	ASSERT(cap->cap_zone == zone);

	if (CAP_ENABLED(cap))
		cap_disable(&capped_zones, cap);
}

/*
 * Apply specified callback to all caps contained in the list `l'.
 */
static void
cap_walk(list_t *l, void (*cb)(cpucap_t *, int64_t))
{
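	/*
	 * Walk generation counter, bumped after every full walk. Callbacks
	 * compare it against per-cap state to detect the first time they see
	 * a cap during a given walk (see cap_project_usage_walker(), which
	 * uses it to reset each zone's usage once per tick).
	 */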
	static uint64_t cpucap_walk_gen;
	cpucap_t *cap;

	ASSERT(MUTEX_HELD(&caps_lock));

	for (cap = list_head(l); cap != NULL; cap = list_next(l, cap)) {
		(*cb)(cap, cpucap_walk_gen);
	}

	atomic_inc_64(&cpucap_walk_gen);
}

/*
 * If cap limit is not reached, make one thread from wait queue runnable.
 * The waitq_isempty check is performed without the waitq lock. If a new thread
 * is placed on the waitq right after the check, it will be picked up during the
 * next invocation of cap_poke_waitq().
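 *
 * Since caps_update() invokes this at most once per clock tick for each cap,
 * at most hz threads per second can be made runnable from any one wait queue
 * (for example, 100 threads per second with the common hz = 100).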
 */
/* ARGSUSED */
static void
cap_poke_waitq(cpucap_t *cap, int64_t gen)
{
	ASSERT(MUTEX_HELD(&caps_lock));

	if (cap->cap_usage >= cap->cap_value) {
		cap->cap_above++;
	} else {
		waitq_t *wq = &cap->cap_waitq;

		cap->cap_below++;

		if (!waitq_isempty(wq))
			waitq_runone(wq);
	}
}

/*
 * The callback function called for every cap on the capped_projects list.
 * Decay the cap usage by CAP_DECAY_FACTOR.
 * Add this cap's project usage to its zone usage.
 * Kick off a thread from the cap waitq if the cap is not reached.
 */
static void
cap_project_usage_walker(cpucap_t *cap, int64_t gen)
{
	zone_t		*zone = cap->cap_zone;
	hrtime_t	cap_usage = cap->cap_usage;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap->cap_project->kpj_cpucap == cap);
	ASSERT(zone == cap->cap_project->kpj_zone);
	ASSERT(CAP_ENABLED(cap));

	/*
	 * Account for the project being above or below its cap and, if the
	 * cap is not reached, make one waiting thread runnable.
	 */
	cap_poke_waitq(cap, 0);

	/*
	 * Add the project's CPU usage to our zone's CPU usage.
	 */
	if (ZONE_IS_CAPPED(zone)) {
		cpucap_t *zcap = zone->zone_cpucap;

		ASSERT(zcap->cap_zone == zone);

		/*
		 * If we haven't reset this zone's usage during this clock tick
		 * yet, then do it now. The cap_gen field is used to check
		 * whether this is the first of the zone's projects we see
		 * during this tick or a subsequent one.
		 */
		if (zcap->cap_gen != gen) {
			if (zcap->cap_usage > zcap->cap_maxusage)
				zcap->cap_maxusage = zcap->cap_usage;
			zcap->cap_usage = 0;
			zcap->cap_gen = gen;
		}
		DTRACE_PROBE2(cpucaps__zusage, cpucap_t *, zcap,
		    hrtime_t, cap_usage);
		zcap->cap_usage += cap_usage;
		/* Check for overflows */
		if (zcap->cap_usage < 0)
			zcap->cap_usage = MAX_USAGE - 1;
	}

	/*
	 * Decay project usage.
	 */
	disp_lock_enter(&cap->cap_usagelock);
	cap->cap_usage -= ROUND_SCALE(cap_usage, CAP_DECAY_FACTOR);
	disp_lock_exit(&cap->cap_usagelock);
}

/*
 * On every clock tick walk the list of project caps and update the CPU usage.
 * Also walk the list of zone caps checking whether any threads should
 * transition from wait queue to run queue.
 *
 * This function gets called by the clock thread directly when there are any
 * defined caps. The only lock that it grabs is caps_lock. Nothing else grabs
 * caps_lock for long periods of time, so there should be almost no contention
 * for it.
 */
static void
caps_update()
{
	mutex_enter(&caps_lock);
	cap_walk(&capped_projects, cap_project_usage_walker);
	cap_walk(&capped_zones, cap_poke_waitq);
	mutex_exit(&caps_lock);
}

/*
 * This function is called for each project in a zone when the zone cap is
 * modified. It enables project caps if the zone cap is enabled, and disables
 * them if the zone cap is disabled and the project doesn't have its own cap.
 *
 * For each project that does not have a cpucap structure allocated it
 * allocates a new structure and assigns it to kpj->kpj_cpucap. The allocation
 * is performed without holding caps_lock to avoid using KM_SLEEP allocation
 * with caps_lock held.
 */
static int
cap_project_zone_modify_walker(kproject_t *kpj, void *arg)
{
	cpucap_t *project_cap = NULL;
	cpucap_t *zone_cap = (cpucap_t *)arg;

	ASSERT(zone_cap != NULL);

	if (kpj->kpj_cpucap == NULL) {
		/*
		 * This is the first time any cap was established for this
		 * project. Allocate a new cpucap structure for it.
		 */
		project_cap = cap_alloc();
	}

	mutex_enter(&caps_lock);

	/*
	 * Double-check that kpj_cpucap is still NULL - now with caps_lock
	 * held - and assign the newly allocated cpucap structure to it.
	 */
	if (kpj->kpj_cpucap == NULL) {
		kpj->kpj_cpucap = project_cap;
	} else if (project_cap != NULL) {
		cap_free(project_cap);
	}

	project_cap = kpj->kpj_cpucap;

	if (CAP_DISABLED(zone_cap)) {
		/*
		 * Remove all projects in this zone without their own caps
		 * from the capped_projects list.
		 */
		if (project_cap->cap_value == MAX_USAGE) {
			cap_project_disable(kpj);
		}
	} else if (CAP_DISABLED(project_cap)) {
		/*
		 * Add the project to the capped_projects list.
		 */
		ASSERT(project_cap->cap_value == 0);
		cap_project_enable(kpj, MAX_USAGE);
	}
	mutex_exit(&caps_lock);

	return (0);
}

/*
 * Set zone cap to cap_val.
 * If cap_val is equal to NOCAP, disable the zone cap.
 *
 * If this is the first time a cap is set on a zone, allocate the cpucap
 * structure without holding caps_lock to avoid KM_SLEEP allocation with
 * caps_lock held.
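 *
 * For illustration: cap_val is expressed as a percentage of a single CPU
 * (so cap_val == 150 allows about one and a half CPUs worth of on-CPU time)
 * and is stored internally as cap_val * cap_tick_cost nanoseconds - with
 * hz = 100, a cap of 150 becomes 150 * 10,000,000ns - which is the value the
 * decayed usage is compared against.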
 */
int
cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val)
{
	cpucap_t *cap = NULL;
	hrtime_t value;

	if (cap_val == 0)
		return (EINVAL);

	ASSERT(cap_val <= MAXCAP);
	if (cap_val > MAXCAP)
		cap_val = MAXCAP;

	/*
	 * Nothing to do if trying to disable a cap on a zone when caps are
	 * off, or on a zone which does not have a cap yet.
	 */
	if ((CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) && (cap_val == NOCAP))
		return (0);

	if (zone->zone_cpucap == NULL)
		cap = cap_alloc();

	mutex_enter(&caps_lock);

	if (cpucaps_busy) {
		mutex_exit(&caps_lock);
		return (EBUSY);
	}

	/*
	 * Double-check whether zone->zone_cpucap is NULL, now with caps_lock
	 * held. If it is still NULL, assign the newly allocated cpucap to it.
	 */
	if (zone->zone_cpucap == NULL) {
		zone->zone_cpucap = cap;
	} else if (cap != NULL) {
		cap_free(cap);
	}

	cap = zone->zone_cpucap;
	value = cap_val * cap_tick_cost;
	if (value < 0)
		value = MAX_USAGE;

	/* Nothing to do if the value is staying the same */
	if (value == cap->cap_value) {
		mutex_exit(&caps_lock);
		return (0);
	}

	/*
	 * Clear cap statistics since the cap value itself changes.
	 */
	cap->cap_above = cap->cap_below = 0;

	if (cap_val == NOCAP) {
		if (CAP_ENABLED(cap)) {
			/*
			 * Remove the cap for the zone.
			 */
			cap_zone_disable(zone);
			cpucaps_busy = B_TRUE;
			mutex_exit(&caps_lock);
			/*
			 * Disable caps for all projects belonging to this zone
			 * unless they have their own caps.
			 */
			(void) project_walk_all(zone->zone_id,
			    cap_project_zone_modify_walker, cap);

			mutex_enter(&caps_lock);
			cpucaps_busy = B_FALSE;
		}
	} else if (CAP_DISABLED(cap)) {
		/*
		 * Set a cap on a zone which previously was not capped.
		 */
		cap_zone_enable(zone, value);
		cpucaps_busy = B_TRUE;
		mutex_exit(&caps_lock);

		/*
		 * Enable caps for all projects belonging to this zone.
		 */
		(void) project_walk_all(zone->zone_id,
		    cap_project_zone_modify_walker, cap);

		mutex_enter(&caps_lock);
		cpucaps_busy = B_FALSE;
	} else {
		/*
		 * No state transitions, just change the value.
		 */
		cap->cap_value = value;
	}

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(!cpucaps_busy);
	mutex_exit(&caps_lock);

	return (0);
}

/*
 * The project is going away, so disable its cap.
 */
void
cpucaps_project_remove(kproject_t *kpj)
{
	mutex_enter(&caps_lock);
	if (PROJECT_IS_CAPPED(kpj))
		cap_project_disable(kpj);
	if (kpj->kpj_cpucap != NULL) {
		cap_free(kpj->kpj_cpucap);
		kpj->kpj_cpucap = NULL;
	}
	mutex_exit(&caps_lock);
}

/*
 * The zone is going away, so disable its cap.
 */
void
cpucaps_zone_remove(zone_t *zone)
{
	mutex_enter(&caps_lock);
	while (ZONE_IS_CAPPED(zone)) {
		mutex_exit(&caps_lock);
		(void) cpucaps_zone_set(zone, NOCAP);
		mutex_enter(&caps_lock);
	}
	if (zone->zone_cpucap != NULL) {
		cap_free(zone->zone_cpucap);
		zone->zone_cpucap = NULL;
	}
	mutex_exit(&caps_lock);
}

/*
 * A new project was created. It should be put on the capped_projects list if
 * its zone has a cap.
 */
void
cpucaps_project_add(kproject_t *kpj)
{
	cpucap_t *cap = NULL;

	if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(kpj->kpj_zone))
		return;

	/*
	 * This project was never capped before, so allocate its cap structure.
	 */
	if (kpj->kpj_cpucap == NULL)
		cap = cap_alloc();

	mutex_enter(&caps_lock);
	/*
	 * Double-check with caps_lock held.
	 */
	if (kpj->kpj_cpucap == NULL) {
		kpj->kpj_cpucap = cap;
	} else if (cap != NULL) {
		cap_free(cap);
	}

	if (ZONE_IS_CAPPED(kpj->kpj_zone))
		cap_project_enable(kpj, MAX_USAGE);

	mutex_exit(&caps_lock);
}

/*
 * Set project cap to cap_val.
 * If cap_val is equal to NOCAP, disable the project cap.
 *
 * If this is the first time a cap is set on a project, allocate the cpucap
 * structure without holding caps_lock to avoid KM_SLEEP allocation with
 * caps_lock held.
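 *
 * Note that a project inside a capped zone always stays on the
 * capped_projects list: dropping its private cap below merely resets
 * cap_value to the MAX_USAGE sentinel, so the per-project accounting that
 * the zone cap relies on keeps running.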
 */
int
cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val)
{
	cpucap_t *cap = NULL;
	hrtime_t value;

	if (cap_val == 0)
		return (EINVAL);

	ASSERT(cap_val <= MAXCAP);
	if (cap_val > MAXCAP)
		cap_val = MAXCAP;

	/*
	 * Nothing to do if trying to disable a project cap when caps are not
	 * enabled, or when trying to disable a cap on a project that does not
	 * have a cap enabled.
	 */
	if ((cap_val == NOCAP) && (CPUCAPS_OFF() || !PROJECT_IS_CAPPED(kpj)))
		return (0);

	if (kpj->kpj_cpucap == NULL) {
		/*
		 * This project was never capped before, so allocate its cap
		 * structure.
		 */
		cap = cap_alloc();
	}

	mutex_enter(&caps_lock);

	/*
	 * Double-check with caps_lock held.
	 */
	if (kpj->kpj_cpucap == NULL) {
		kpj->kpj_cpucap = cap;
	} else if (cap != NULL) {
		cap_free(cap);
	}

	/*
	 * Get the actual pointer to the project cap.
	 */
	cap = kpj->kpj_cpucap;
	value = cap_val * cap_tick_cost;
	if (value < 0)
		value = MAX_USAGE;

	/*
	 * Nothing to do if the value is not changing.
	 */
	if (value == cap->cap_value) {
		mutex_exit(&caps_lock);
		return (0);
	}

	/*
	 * Clear cap statistics since the cap value itself changes.
	 */
	cap->cap_above = cap->cap_below = 0;
	cap->cap_maxusage = 0;

	if (cap_val != NOCAP) {
		/*
		 * Enable this cap if it is not already enabled.
		 */
		if (CAP_DISABLED(cap))
			cap_project_enable(kpj, value);
		else
			cap->cap_value = value;
	} else if (CAP_ENABLED(cap)) {
		/*
		 * The user requested to drop the cap on the project. If it is
		 * part of a capped zone, keep the cap and set the value to
		 * MAX_USAGE, otherwise disable the cap.
		 */
		if (ZONE_IS_CAPPED(kpj->kpj_zone)) {
			cap->cap_value = MAX_USAGE;
		} else {
			cap_project_disable(kpj);
		}
	}
	mutex_exit(&caps_lock);

	return (0);
}

/*
 * Get cap usage.
 */
static rctl_qty_t
cap_get(cpucap_t *cap)
{
	return (cap != NULL ? (rctl_qty_t)(cap->cap_usage / cap_tick_cost) : 0);
}

/*
 * Get current project usage.
 */
rctl_qty_t
cpucaps_project_get(kproject_t *kpj)
{
	return (cap_get(kpj->kpj_cpucap));
}

/*
 * Get current zone usage.
 */
rctl_qty_t
cpucaps_zone_get(zone_t *zone)
{
	return (cap_get(zone->zone_cpucap));
}

/*
 * Charge the project of thread t the time thread t spent on CPU since it was
 * last adjusted.
 *
 * Record the current on-CPU time in the csc structure.
 *
 * Do not adjust for more than one tick worth of time.
 *
 * It is possible that the project cap is being disabled while this routine is
 * executed. This should not cause any issues since the association between the
 * thread and its project is protected by the thread lock.
 */
static void
caps_charge_adjust(kthread_id_t t, caps_sc_t *csc)
{
	kproject_t	*kpj = ttoproj(t);
	hrtime_t	new_usage;
	hrtime_t	usage_delta;

	ASSERT(THREAD_LOCK_HELD(t));
	ASSERT(kpj->kpj_cpucap != NULL);

	/* Get on-CPU time since birth of the thread */
	new_usage = mstate_thread_onproc_time(t);

	/* Time spent on CPU since last checked */
	usage_delta = new_usage - csc->csc_cputime;

	/* Save the accumulated on-CPU time */
	csc->csc_cputime = new_usage;

	/* Charge at most one tick worth of on-CPU time */
	if (usage_delta > cap_tick_cost)
		usage_delta = cap_tick_cost;

	/* Add usage_delta to the project usage value. */
	if (usage_delta > 0) {
		cpucap_t *cap = kpj->kpj_cpucap;

		DTRACE_PROBE2(cpucaps__project__charge,
		    kthread_id_t, t, hrtime_t, usage_delta);

		disp_lock_enter_high(&cap->cap_usagelock);
		cap->cap_usage += usage_delta;

		/* Check for overflows */
		if (cap->cap_usage < 0)
			cap->cap_usage = MAX_USAGE - 1;

		disp_lock_exit_high(&cap->cap_usagelock);

		/*
		 * cap_maxusage is only kept for observability. Update it
		 * outside the lock to reduce the time spent while holding
		 * the lock.
		 */
		if (cap->cap_usage > cap->cap_maxusage)
			cap->cap_maxusage = cap->cap_usage;
	}
}

/*
 * Charge the thread's project and return True if the thread should be
 * penalized because its project or zone is exceeding its cap. Also set the
 * TS_PROJWAITQ or TS_ZONEWAITQ flag in that case.
 *
 * It is possible that the project cap is being disabled while this routine is
 * executed. This should not cause any issues since the association between the
 * thread and its project is protected by the thread lock. It will still set
 * TS_PROJWAITQ/TS_ZONEWAITQ in this case, but cpucaps_enforce will not place
 * anything on the blocked wait queue.
 */
boolean_t
cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type)
{
	kproject_t	*kpj = ttoproj(t);
	klwp_t		*lwp = t->t_lwp;
	zone_t		*zone;
	cpucap_t	*project_cap;
	boolean_t	rc = B_FALSE;

	ASSERT(THREAD_LOCK_HELD(t));

	/* Nothing to do for projects that are not capped. */
	if (lwp == NULL || !PROJECT_IS_CAPPED(kpj))
		return (B_FALSE);

	caps_charge_adjust(t, csc);

	/*
	 * The caller only requested that we charge the project usage, with
	 * no enforcement part.
	 */
	if (charge_type == CPUCAPS_CHARGE_ONLY)
		return (B_FALSE);

	project_cap = kpj->kpj_cpucap;

	if (project_cap->cap_usage >= project_cap->cap_value) {
		t->t_schedflag |= TS_PROJWAITQ;
		rc = B_TRUE;
	} else if (t->t_schedflag & TS_PROJWAITQ) {
		t->t_schedflag &= ~TS_PROJWAITQ;
	}

	zone = ttozone(t);
	if (!ZONE_IS_CAPPED(zone)) {
		if (t->t_schedflag & TS_ZONEWAITQ)
			t->t_schedflag &= ~TS_ZONEWAITQ;
	} else {
		cpucap_t *zone_cap = zone->zone_cpucap;

		if (zone_cap->cap_usage >= zone_cap->cap_value) {
			t->t_schedflag |= TS_ZONEWAITQ;
			rc = B_TRUE;
		} else if (t->t_schedflag & TS_ZONEWAITQ) {
			t->t_schedflag &= ~TS_ZONEWAITQ;
		}
	}

	return (rc);
}

/*
 * Enforce CPU caps. If the thread got preempted in user-land, we know that it
 * does not hold any kernel locks, so we can safely enqueue it on a wait
 * queue, if needed.
 *
 * CPU caps are only enforced for user threads.
 *
 * Threads flagged with TS_PROJWAITQ are placed on their project wait queues
 * and threads marked with TS_ZONEWAITQ are placed on their zone wait queue.
 *
 * It is possible that by the time we enter cpucaps_enforce() the cap is
 * already disabled. In this case waitq_enqueue() fails and doesn't enqueue
 * anything. We still clear the TS_PROJWAITQ/TS_ZONEWAITQ flags in this case
 * since they no longer apply.
 */
boolean_t
cpucaps_enforce(kthread_t *t)
{
	klwp_t *lwp = t->t_lwp;

	ASSERT(THREAD_LOCK_HELD(t));

	if (lwp != NULL && lwp->lwp_state == LWP_USER) {
		if (t->t_schedflag & TS_PROJWAITQ) {
			ASSERT(ttoproj(t)->kpj_cpucap != NULL);
			t->t_schedflag &= ~TS_ANYWAITQ;
			if (waitq_enqueue(&(ttoproj(t)->kpj_cpucap->cap_waitq),
			    t)) {
				return (B_TRUE);
			}
		}
		if (t->t_schedflag & TS_ZONEWAITQ) {
			ASSERT(ttozone(t)->zone_cpucap != NULL);
			t->t_schedflag &= ~TS_ZONEWAITQ;
			if (waitq_enqueue(&(ttozone(t)->zone_cpucap->cap_waitq),
			    t)) {
				return (B_TRUE);
			}
		}
	}

	/*
	 * The thread is not enqueued on the wait queue.
	 */
	return (B_FALSE);
}

/*
 * Convert internal cap statistics into values exported by cap kstat.
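 *
 * value, usage and maxusage are scaled by cap_tick_cost and are therefore
 * exported in the same percent-of-a-CPU units in which caps are set (e.g. an
 * exported usage of 150 corresponds to about 1.5 CPUs worth of recent on-CPU
 * time). below_sec and above_sec count ticks internally and are exported as
 * seconds.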
 */
static int
cap_kstat_update(kstat_t *ksp, int rw)
{
	struct cap_kstat *capsp = &cap_kstat;
	cpucap_t *cap = ksp->ks_private;
	clock_t	tick_sec = SEC_TO_TICK(1);
	char *zonename = cap->cap_zone->zone_name;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	capsp->cap_value.value.ui64 =
	    ROUND_SCALE(cap->cap_value, cap_tick_cost);
	capsp->cap_usage.value.ui64 =
	    ROUND_SCALE(cap->cap_usage, cap_tick_cost);
	capsp->cap_maxusage.value.ui64 =
	    ROUND_SCALE(cap->cap_maxusage, cap_tick_cost);
	capsp->cap_nwait.value.ui64 = cap->cap_waitq.wq_count;
	capsp->cap_below.value.ui64 = ROUND_SCALE(cap->cap_below, tick_sec);
	capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec);
	kstat_named_setstr(&capsp->cap_zonename, zonename);

	return (0);
}