1/*
2 * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_FREE_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
49 *  School of Computer Science
50 *  Carnegie Mellon University
51 *  Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 *	File:	kern/task.c
58 *	Author:	Avadis Tevanian, Jr., Michael Wayne Young, David Golub,
59 *		David Black
60 *
61 *	Task management primitives implementation.
62 */
63/*
64 * Copyright (c) 1993 The University of Utah and
65 * the Computer Systems Laboratory (CSL).  All rights reserved.
66 *
67 * Permission to use, copy, modify and distribute this software and its
68 * documentation is hereby granted, provided that both the copyright
69 * notice and this permission notice appear in all copies of the
70 * software, derivative works or modified versions, and any portions
71 * thereof, and that both notices appear in supporting documentation.
72 *
73 * THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF THIS SOFTWARE IN ITS "AS
74 * IS" CONDITION.  THE UNIVERSITY OF UTAH AND CSL DISCLAIM ANY LIABILITY OF
75 * ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
76 *
77 * CSL requests users of this software to return to csl-dist@cs.utah.edu any
78 * improvements that they make and grant CSL redistribution rights.
79 *
80 */
81/*
82 * NOTICE: This file was modified by McAfee Research in 2004 to introduce
83 * support for mandatory and extensible security protections.  This notice
84 * is included in support of clause 2.2 (b) of the Apple Public License,
85 * Version 2.0.
86 * Copyright (c) 2005 SPARTA, Inc.
87 */
88
89#include <mach/mach_types.h>
90#include <mach/boolean.h>
91#include <mach/host_priv.h>
92#include <mach/machine/vm_types.h>
93#include <mach/vm_param.h>
94#include <mach/semaphore.h>
95#include <mach/task_info.h>
96#include <mach/task_special_ports.h>
97
98#include <ipc/ipc_importance.h>
99#include <ipc/ipc_types.h>
100#include <ipc/ipc_space.h>
101#include <ipc/ipc_entry.h>
102#include <ipc/ipc_hash.h>
103
104#include <kern/kern_types.h>
105#include <kern/mach_param.h>
106#include <kern/misc_protos.h>
107#include <kern/task.h>
108#include <kern/thread.h>
109#include <kern/coalition.h>
110#include <kern/zalloc.h>
111#include <kern/kalloc.h>
112#include <kern/processor.h>
113#include <kern/sched_prim.h>	/* for thread_wakeup */
114#include <kern/ipc_tt.h>
115#include <kern/host.h>
116#include <kern/clock.h>
117#include <kern/timer.h>
118#include <kern/assert.h>
119#include <kern/sync_lock.h>
120#include <kern/affinity.h>
121#include <kern/exc_resource.h>
122#if CONFIG_TELEMETRY
123#include <kern/telemetry.h>
124#endif
125
126#include <vm/pmap.h>
127#include <vm/vm_map.h>
128#include <vm/vm_kern.h>		/* for kernel_map, ipc_kernel_map */
129#include <vm/vm_pageout.h>
130#include <vm/vm_protos.h>
131#include <vm/vm_purgeable_internal.h>
132
133#include <sys/resource.h>
134/*
135 * Exported interfaces
136 */
137
138#include <mach/task_server.h>
139#include <mach/mach_host_server.h>
140#include <mach/host_security_server.h>
141#include <mach/mach_port_server.h>
142
143#include <vm/vm_shared_region.h>
144
145#if CONFIG_COUNTERS
146#include <pmc/pmc.h>
147#endif /* CONFIG_COUNTERS */
148
149#include <libkern/OSDebug.h>
150#include <libkern/OSAtomic.h>
151
152#if CONFIG_ATM
153#include <atm/atm_internal.h>
154#endif
155
156#include <kern/sfi.h>
157
158#if KPERF
159extern int kpc_force_all_ctrs(task_t, int);
160#endif
161
162task_t			kernel_task;
163zone_t			task_zone;
164lck_attr_t      task_lck_attr;
165lck_grp_t       task_lck_grp;
166lck_grp_attr_t  task_lck_grp_attr;
167
168/* Flag set by core audio when audio is playing. Used to stifle EXC_RESOURCE generation when active. */
169int audio_active = 0;
170
171zinfo_usage_store_t tasks_tkm_private;
172zinfo_usage_store_t tasks_tkm_shared;
173
174/* A container to accumulate statistics for expired tasks */
175expired_task_statistics_t		dead_task_statistics;
176lck_spin_t		dead_task_statistics_lock;
177
178ledger_template_t task_ledger_template = NULL;
179
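/*
 * Ledger entry indices for the per-task ledger template. Each field starts
 * out as -1 ("not yet registered") and is filled in by init_task_ledgers(),
 * which panics if any entry fails to register.
 */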
180struct _task_ledger_indices task_ledgers __attribute__((used)) =
181	{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
182	 { 0 /* initialized at runtime */},
183#ifdef CONFIG_BANK
184	 -1, -1,
185#endif
186	};
187
188void init_task_ledgers(void);
189void task_footprint_exceeded(int warning, __unused const void *param0, __unused const void *param1);
190void task_wakeups_rate_exceeded(int warning, __unused const void *param0, __unused const void *param1);
191void __attribute__((noinline)) THIS_PROCESS_IS_CAUSING_TOO_MANY_WAKEUPS__SENDING_EXC_RESOURCE(void);
192void __attribute__((noinline)) THIS_PROCESS_CROSSED_HIGH_WATERMARK__SENDING_EXC_RESOURCE(int max_footprint_mb);
193int coredump(void *core_proc, int reserve_mb, int ignore_ulimit);
194
195kern_return_t task_suspend_internal(task_t);
196kern_return_t task_resume_internal(task_t);
197
198void proc_init_cpumon_params(void);
199
200// Warn tasks when they hit 80% of their memory limit.
201#define	PHYS_FOOTPRINT_WARNING_LEVEL 80
202
203#define TASK_WAKEUPS_MONITOR_DEFAULT_LIMIT		150 /* wakeups per second */
204#define TASK_WAKEUPS_MONITOR_DEFAULT_INTERVAL	300 /* in seconds. */
205
206/*
207 * Level (in terms of percentage of the limit) at which the wakeups monitor triggers telemetry.
208 *
 * (i.e., when the task's wakeup rate exceeds 70% of the limit, start taking
 *  user stacktraces, a.k.a. micro-stackshots)
211 */
212#define	TASK_WAKEUPS_MONITOR_DEFAULT_USTACKSHOTS_TRIGGER	70
213
214int task_wakeups_monitor_interval; /* In seconds. Time period over which wakeups rate is observed */
215int task_wakeups_monitor_rate;     /* In hz. Maximum allowable wakeups per task before EXC_RESOURCE is sent */
216
217int task_wakeups_monitor_ustackshots_trigger_pct; /* Percentage. Level at which we start gathering telemetry. */
218
int disable_exc_resource; /* Global override to suppress EXC_RESOURCE for resource monitor violations. */
220
221int max_task_footprint = 0; /* Per-task limit on physical memory consumption */
222#if MACH_ASSERT
223int pmap_ledgers_panic = 1;
224#endif /* MACH_ASSERT */
225
226int task_max = CONFIG_TASK_MAX; /* Max number of tasks */
227
228int hwm_user_cores = 0; /* high watermark violations generate user core files */
229
230#ifdef MACH_BSD
231extern void	proc_getexecutableuuid(void *, unsigned char *, unsigned long);
232extern int	proc_pid(struct proc *p);
233extern int	proc_selfpid(void);
234extern char	*proc_name_address(struct proc *p);
235#if CONFIG_JETSAM
236extern void	memorystatus_on_ledger_footprint_exceeded(int warning, const int max_footprint_mb);
237#endif
238#endif
239#if MACH_ASSERT
240extern int pmap_ledgers_panic;
241#endif /* MACH_ASSERT */
242
243/* Forwards */
244
245void		task_hold_locked(
246			task_t		task);
247void		task_wait_locked(
248			task_t		task,
249			boolean_t	until_not_runnable);
250void		task_release_locked(
251			task_t		task);
252void		task_free(
253			task_t		task );
254void		task_synchronizer_destroy_all(
255			task_t		task);
256
257int check_for_tasksuspend(
258			task_t task);
259
260void
261task_backing_store_privileged(
262			task_t task)
263{
264	task_lock(task);
265	task->priv_flags |= VM_BACKING_STORE_PRIV;
266	task_unlock(task);
267	return;
268}
269
270
271void
272task_set_64bit(
273		task_t task,
274		boolean_t is64bit)
275{
276#if defined(__i386__) || defined(__x86_64__) || defined(__arm64__)
277	thread_t thread;
278#endif /* defined(__i386__) || defined(__x86_64__) || defined(__arm64__) */
279
280	task_lock(task);
281
282	if (is64bit) {
283		if (task_has_64BitAddr(task))
284			goto out;
285		task_set_64BitAddr(task);
286	} else {
287		if ( !task_has_64BitAddr(task))
288			goto out;
289		task_clear_64BitAddr(task);
290	}
291	/* FIXME: On x86, the thread save state flavor can diverge from the
292	 * task's 64-bit feature flag due to the 32-bit/64-bit register save
293	 * state dichotomy. Since we can be pre-empted in this interval,
294	 * certain routines may observe the thread as being in an inconsistent
295	 * state with respect to its task's 64-bitness.
296	 */
297
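	/*
	 * Update each existing thread's machine-dependent state so its register
	 * save area matches the task's new address-space model.
	 */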
298#if defined(__i386__) || defined(__x86_64__) || defined(__arm64__)
299	queue_iterate(&task->threads, thread, thread_t, task_threads) {
300		thread_mtx_lock(thread);
301		machine_thread_switch_addrmode(thread);
302		thread_mtx_unlock(thread);
303	}
304#endif /* defined(__i386__) || defined(__x86_64__) || defined(__arm64__) */
305
306out:
307	task_unlock(task);
308}
309
310
311void
312task_set_dyld_info(task_t task, mach_vm_address_t addr, mach_vm_size_t size)
313{
314	task_lock(task);
315	task->all_image_info_addr = addr;
316	task->all_image_info_size = size;
317	task_unlock(task);
318}
319
320void
321task_atm_reset(__unused task_t task) {
322
323#if CONFIG_ATM
324	if (task->atm_context != NULL) {
325		 atm_task_descriptor_destroy(task->atm_context);
326		 task->atm_context = NULL;
327	}
328#endif
329
330}
331
332#if TASK_REFERENCE_LEAK_DEBUG
333#include <kern/btlog.h>
334
335decl_simple_lock_data(static,task_ref_lock);
336static btlog_t *task_ref_btlog;
337#define TASK_REF_OP_INCR	0x1
338#define TASK_REF_OP_DECR	0x2
339
340#define TASK_REF_BTDEPTH	7
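/*
 * With TASK_REFERENCE_LEAK_DEBUG, every reference-count increment and
 * decrement is recorded in task_ref_btlog together with up to
 * TASK_REF_BTDEPTH backtrace frames, so a leaked task reference can be
 * traced back to its call site.
 */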
341
342static void
343task_ref_lock_lock(void *context)
344{
345	simple_lock((simple_lock_t)context);
346}
347static void
348task_ref_lock_unlock(void *context)
349{
350	simple_unlock((simple_lock_t)context);
351}
352
353void
354task_reference_internal(task_t task)
355{
356	void *       bt[TASK_REF_BTDEPTH];
357	int             numsaved = 0;
358
359	numsaved = OSBacktrace(bt, TASK_REF_BTDEPTH);
360
361	(void)hw_atomic_add(&(task)->ref_count, 1);
362	btlog_add_entry(task_ref_btlog, task, TASK_REF_OP_INCR,
363					bt, numsaved);
364}
365
366uint32_t
367task_deallocate_internal(task_t task)
368{
369	void *       bt[TASK_REF_BTDEPTH];
370	int             numsaved = 0;
371
372	numsaved = OSBacktrace(bt, TASK_REF_BTDEPTH);
373
374	btlog_add_entry(task_ref_btlog, task, TASK_REF_OP_DECR,
375					bt, numsaved);
376	return hw_atomic_sub(&(task)->ref_count, 1);
377}
378
379#endif /* TASK_REFERENCE_LEAK_DEBUG */
380
381void
382task_init(void)
383{
384
385	lck_grp_attr_setdefault(&task_lck_grp_attr);
386	lck_grp_init(&task_lck_grp, "task", &task_lck_grp_attr);
387	lck_attr_setdefault(&task_lck_attr);
388	lck_mtx_init(&tasks_threads_lock, &task_lck_grp, &task_lck_attr);
389
390	task_zone = zinit(
391			sizeof(struct task),
392			task_max * sizeof(struct task),
393			TASK_CHUNK * sizeof(struct task),
394			"tasks");
395
396	zone_change(task_zone, Z_NOENCRYPT, TRUE);
397
398	/*
399	 * Configure per-task memory limit.
400	 * The boot-arg is interpreted as Megabytes,
401	 * and takes precedence over the device tree.
402	 * Setting the boot-arg to 0 disables task limits.
403	 */
404	if (!PE_parse_boot_argn("max_task_pmem", &max_task_footprint,
405			sizeof (max_task_footprint))) {
406		/*
407		 * No limit was found in boot-args, so go look in the device tree.
408		 */
409		if (!PE_get_default("kern.max_task_pmem", &max_task_footprint,
410				sizeof(max_task_footprint))) {
411			/*
412			 * No limit was found in device tree.
413			 */
414			max_task_footprint = 0;
415		}
416	}
417
418	if (max_task_footprint != 0) {
419#if CONFIG_JETSAM
420		if (max_task_footprint < 50) {
421				printf("Warning: max_task_pmem %d below minimum.\n",
422				max_task_footprint);
423				max_task_footprint = 50;
424		}
425		printf("Limiting task physical memory footprint to %d MB\n",
426			max_task_footprint);
427		max_task_footprint *= 1024 * 1024; // Convert MB to bytes
428#else
429		printf("Warning: max_task_footprint specified, but jetsam not configured; ignoring.\n");
430#endif
431	}
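	/*
	 * Example (CONFIG_JETSAM, hypothetical value): booting with
	 * "max_task_pmem=500" caps each task's phys_footprint ledger at 500 MB
	 * (converted to bytes above), with the warning callback firing at 80%
	 * of that (PHYS_FOOTPRINT_WARNING_LEVEL), i.e. 400 MB.
	 */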
432
433#if MACH_ASSERT
434	PE_parse_boot_argn("pmap_ledgers_panic", &pmap_ledgers_panic,
435			  sizeof (pmap_ledgers_panic));
436#endif /* MACH_ASSERT */
437
438	if (!PE_parse_boot_argn("hwm_user_cores", &hwm_user_cores,
439			sizeof (hwm_user_cores))) {
440		hwm_user_cores = 0;
441	}
442
443	proc_init_cpumon_params();
444
445	if (!PE_parse_boot_argn("task_wakeups_monitor_rate", &task_wakeups_monitor_rate, sizeof (task_wakeups_monitor_rate))) {
446		task_wakeups_monitor_rate = TASK_WAKEUPS_MONITOR_DEFAULT_LIMIT;
447	}
448
449	if (!PE_parse_boot_argn("task_wakeups_monitor_interval", &task_wakeups_monitor_interval, sizeof (task_wakeups_monitor_interval))) {
450		task_wakeups_monitor_interval = TASK_WAKEUPS_MONITOR_DEFAULT_INTERVAL;
451	}
452
453	if (!PE_parse_boot_argn("task_wakeups_monitor_ustackshots_trigger_pct", &task_wakeups_monitor_ustackshots_trigger_pct,
454		sizeof (task_wakeups_monitor_ustackshots_trigger_pct))) {
455		task_wakeups_monitor_ustackshots_trigger_pct = TASK_WAKEUPS_MONITOR_DEFAULT_USTACKSHOTS_TRIGGER;
456	}
457
458	if (!PE_parse_boot_argn("disable_exc_resource", &disable_exc_resource,
459		sizeof (disable_exc_resource))) {
460		disable_exc_resource = 0;
461	}
462
463/*
464 * If we have coalitions, coalition_init() will call init_task_ledgers() as it
465 * sets up the ledgers for the default coalition. If we don't have coalitions,
466 * then we have to call it now.
467 */
468#if CONFIG_COALITIONS
469	assert(task_ledger_template);
470#else /* CONFIG_COALITIONS */
471	init_task_ledgers();
472#endif /* CONFIG_COALITIONS */
473
474#if TASK_REFERENCE_LEAK_DEBUG
475	simple_lock_init(&task_ref_lock, 0);
476	task_ref_btlog = btlog_create(100000,
477								  TASK_REF_BTDEPTH,
478								  task_ref_lock_lock,
479								  task_ref_lock_unlock,
480								  &task_ref_lock);
481	assert(task_ref_btlog);
482#endif
483
484	/*
485	 * Create the kernel task as the first task.
486	 */
487#ifdef __LP64__
488	if (task_create_internal(TASK_NULL, COALITION_NULL, FALSE, TRUE, &kernel_task) != KERN_SUCCESS)
489#else
490	if (task_create_internal(TASK_NULL, COALITION_NULL, FALSE, FALSE, &kernel_task) != KERN_SUCCESS)
491#endif
492		panic("task_init\n");
493
494	vm_map_deallocate(kernel_task->map);
495	kernel_task->map = kernel_map;
496	lck_spin_init(&dead_task_statistics_lock, &task_lck_grp, &task_lck_attr);
497
498}
499
500/*
501 * Create a task running in the kernel address space.  It may
502 * have its own map of size mem_size and may have ipc privileges.
503 */
504kern_return_t
505kernel_task_create(
506	__unused task_t		parent_task,
507	__unused vm_offset_t		map_base,
508	__unused vm_size_t		map_size,
509	__unused task_t		*child_task)
510{
511	return (KERN_INVALID_ARGUMENT);
512}
513
514kern_return_t
515task_create(
516	task_t				parent_task,
517	__unused ledger_port_array_t	ledger_ports,
518	__unused mach_msg_type_number_t	num_ledger_ports,
519	__unused boolean_t		inherit_memory,
520	__unused task_t			*child_task)	/* OUT */
521{
522	if (parent_task == TASK_NULL)
523		return(KERN_INVALID_ARGUMENT);
524
525	/*
526	 * No longer supported: too many calls assume that a task has a valid
527	 * process attached.
528	 */
529	return(KERN_FAILURE);
530}
531
532kern_return_t
533host_security_create_task_token(
534	host_security_t			host_security,
535	task_t				parent_task,
536	__unused security_token_t	sec_token,
537	__unused audit_token_t		audit_token,
538	__unused host_priv_t		host_priv,
539	__unused ledger_port_array_t	ledger_ports,
540	__unused mach_msg_type_number_t	num_ledger_ports,
541	__unused boolean_t		inherit_memory,
542	__unused task_t			*child_task)	/* OUT */
543{
544	if (parent_task == TASK_NULL)
545		return(KERN_INVALID_ARGUMENT);
546
547	if (host_security == HOST_NULL)
548		return(KERN_INVALID_SECURITY);
549
550	/*
551	 * No longer supported.
552	 */
553	return(KERN_FAILURE);
554}
555
556/*
557 * Task ledgers
558 * ------------
559 *
560 * phys_footprint
561 *   Physical footprint: This is the sum of:
562 *     + internal
563 *     + internal_compressed
564 *     + iokit_mapped
565 *     - alternate_accounting
566 *
567 * internal
568 *   The task's anonymous memory, which on iOS is always resident.
569 *
570 * internal_compressed
571 *   Amount of this task's internal memory which is held by the compressor.
572 *   Such memory is no longer actually resident for the task [i.e., resident in its pmap],
573 *   and could be either decompressed back into memory, or paged out to storage, depending
574 *   on our implementation.
575 *
576 * iokit_mapped
 *   IOKit mappings: The total size of all IOKit mappings in this task,
 *   regardless of clean/dirty or internal/external state.
579 *
580 * alternate_accounting
581 *   The number of internal dirty pages which are part of IOKit mappings. By definition, these pages
582 *   are counted in both internal *and* iokit_mapped, so we must subtract them from the total to avoid
583 *   double counting.
584 */
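/*
 * Example (hypothetical numbers): a task with
 *   internal             = 100 MB
 *   internal_compressed  =  20 MB
 *   iokit_mapped         =  30 MB
 *   alternate_accounting =  10 MB
 * has phys_footprint = 100 + 20 + 30 - 10 = 140 MB.
 */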
585void
586init_task_ledgers(void)
587{
588	ledger_template_t t;
589
590	assert(task_ledger_template == NULL);
591	assert(kernel_task == TASK_NULL);
592
593	if ((t = ledger_template_create("Per-task ledger")) == NULL)
594		panic("couldn't create task ledger template");
595
596	task_ledgers.cpu_time = ledger_entry_add(t, "cpu_time", "sched", "ns");
597	task_ledgers.tkm_private = ledger_entry_add(t, "tkm_private",
598	    "physmem", "bytes");
599	task_ledgers.tkm_shared = ledger_entry_add(t, "tkm_shared", "physmem",
600	    "bytes");
601	task_ledgers.phys_mem = ledger_entry_add(t, "phys_mem", "physmem",
602	    "bytes");
603	task_ledgers.wired_mem = ledger_entry_add(t, "wired_mem", "physmem",
604	    "bytes");
605	task_ledgers.internal = ledger_entry_add(t, "internal", "physmem",
606	    "bytes");
607	task_ledgers.iokit_mapped = ledger_entry_add(t, "iokit_mapped", "mappings",
608 	    "bytes");
609	task_ledgers.alternate_accounting = ledger_entry_add(t, "alternate_accounting", "physmem",
610 	    "bytes");
611	task_ledgers.phys_footprint = ledger_entry_add(t, "phys_footprint", "physmem",
612 	    "bytes");
613	task_ledgers.internal_compressed = ledger_entry_add(t, "internal_compressed", "physmem",
614 	    "bytes");
615	task_ledgers.purgeable_volatile = ledger_entry_add(t, "purgeable_volatile", "physmem", "bytes");
616	task_ledgers.purgeable_nonvolatile = ledger_entry_add(t, "purgeable_nonvolatile", "physmem", "bytes");
617	task_ledgers.purgeable_volatile_compressed = ledger_entry_add(t, "purgeable_volatile_compress", "physmem", "bytes");
618	task_ledgers.purgeable_nonvolatile_compressed = ledger_entry_add(t, "purgeable_nonvolatile_compress", "physmem", "bytes");
619	task_ledgers.platform_idle_wakeups = ledger_entry_add(t, "platform_idle_wakeups", "power",
620 	    "count");
621	task_ledgers.interrupt_wakeups = ledger_entry_add(t, "interrupt_wakeups", "power",
622 	    "count");
623
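	/*
	 * Register SFI wait-time ledger entries. Several SFI classes may share
	 * a single ledger entry: sfi_get_ledger_alias_for_class() maps a class
	 * to the class whose entry it should reuse, and only unaliased classes
	 * get an entry of their own.
	 */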
624	sfi_class_id_t class_id, ledger_alias;
625	for (class_id = SFI_CLASS_UNSPECIFIED; class_id < MAX_SFI_CLASS_ID; class_id++) {
626		task_ledgers.sfi_wait_times[class_id] = -1;
627	}
628
629	/* don't account for UNSPECIFIED */
630	for (class_id = SFI_CLASS_UNSPECIFIED + 1; class_id < MAX_SFI_CLASS_ID; class_id++) {
631		ledger_alias = sfi_get_ledger_alias_for_class(class_id);
632		if (ledger_alias != SFI_CLASS_UNSPECIFIED) {
633			/* Check to see if alias has been registered yet */
634			if (task_ledgers.sfi_wait_times[ledger_alias] != -1) {
635				task_ledgers.sfi_wait_times[class_id] = task_ledgers.sfi_wait_times[ledger_alias];
636			} else {
637				/* Otherwise, initialize it first */
638				task_ledgers.sfi_wait_times[class_id] = task_ledgers.sfi_wait_times[ledger_alias] = sfi_ledger_entry_add(t, ledger_alias);
639			}
640		} else {
641			task_ledgers.sfi_wait_times[class_id] = sfi_ledger_entry_add(t, class_id);
642		}
643
644		if (task_ledgers.sfi_wait_times[class_id] < 0) {
645			panic("couldn't create entries for task ledger template for SFI class 0x%x", class_id);
646		}
647	}
648
649#ifdef CONFIG_BANK
650	task_ledgers.cpu_time_billed_to_me = ledger_entry_add(t, "cpu_time_billed_to_me", "sched", "ns");
651	task_ledgers.cpu_time_billed_to_others = ledger_entry_add(t, "cpu_time_billed_to_others", "sched", "ns");
652#endif
653
654	assert(task_ledgers.sfi_wait_times[MAX_SFI_CLASS_ID -1] != -1);
655
656	if ((task_ledgers.cpu_time < 0) ||
657	    (task_ledgers.tkm_private < 0) ||
658	    (task_ledgers.tkm_shared < 0) ||
659	    (task_ledgers.phys_mem < 0) ||
660	    (task_ledgers.wired_mem < 0) ||
661	    (task_ledgers.internal < 0) ||
662	    (task_ledgers.iokit_mapped < 0) ||
663	    (task_ledgers.alternate_accounting < 0) ||
664	    (task_ledgers.phys_footprint < 0) ||
665	    (task_ledgers.internal_compressed < 0) ||
666	    (task_ledgers.purgeable_volatile < 0) ||
667	    (task_ledgers.purgeable_nonvolatile < 0) ||
668	    (task_ledgers.purgeable_volatile_compressed < 0) ||
669	    (task_ledgers.purgeable_nonvolatile_compressed < 0) ||
670	    (task_ledgers.platform_idle_wakeups < 0) ||
671	    (task_ledgers.interrupt_wakeups < 0)
672#ifdef CONFIG_BANK
673	    || (task_ledgers.cpu_time_billed_to_me < 0) || (task_ledgers.cpu_time_billed_to_others < 0)
674#endif
675	    ) {
676		panic("couldn't create entries for task ledger template");
677	}
678
679	ledger_track_maximum(t, task_ledgers.phys_footprint, 60);
680#if MACH_ASSERT
681	if (pmap_ledgers_panic) {
682		ledger_panic_on_negative(t, task_ledgers.phys_footprint);
683		ledger_panic_on_negative(t, task_ledgers.internal);
684		ledger_panic_on_negative(t, task_ledgers.internal_compressed);
685		ledger_panic_on_negative(t, task_ledgers.iokit_mapped);
686		ledger_panic_on_negative(t, task_ledgers.alternate_accounting);
687		ledger_panic_on_negative(t, task_ledgers.purgeable_volatile);
688		ledger_panic_on_negative(t, task_ledgers.purgeable_nonvolatile);
689		ledger_panic_on_negative(t, task_ledgers.purgeable_volatile_compressed);
690		ledger_panic_on_negative(t, task_ledgers.purgeable_nonvolatile_compressed);
691	}
692#endif /* MACH_ASSERT */
693
694#if CONFIG_JETSAM
695	ledger_set_callback(t, task_ledgers.phys_footprint, task_footprint_exceeded, NULL, NULL);
696#endif
697
698	ledger_set_callback(t, task_ledgers.interrupt_wakeups,
699		task_wakeups_rate_exceeded, NULL, NULL);
700
701	task_ledger_template = t;
702}
703
704kern_return_t
705task_create_internal(
706	task_t		parent_task,
707	coalition_t	parent_coalition __unused,
708	boolean_t	inherit_memory,
709	boolean_t	is_64bit,
710	task_t		*child_task)		/* OUT */
711{
712	task_t			new_task;
713	vm_shared_region_t	shared_region;
714	ledger_t		ledger = NULL;
715
716	new_task = (task_t) zalloc(task_zone);
717
718	if (new_task == TASK_NULL)
719		return(KERN_RESOURCE_SHORTAGE);
720
721	/* one ref for just being alive; one for our caller */
722	new_task->ref_count = 2;
723
724	/* allocate with active entries */
725	assert(task_ledger_template != NULL);
726	if ((ledger = ledger_instantiate(task_ledger_template,
727			LEDGER_CREATE_ACTIVE_ENTRIES)) == NULL) {
728		zfree(task_zone, new_task);
729		return(KERN_RESOURCE_SHORTAGE);
730	}
731
732	new_task->ledger = ledger;
733
734#if defined(CONFIG_SCHED_MULTIQ)
735	new_task->sched_group = sched_group_create();
736#endif
737
738	/* if inherit_memory is true, parent_task MUST not be NULL */
739	if (inherit_memory)
740		new_task->map = vm_map_fork(ledger, parent_task->map);
741	else
742		new_task->map = vm_map_create(pmap_create(ledger, 0, is_64bit),
743				(vm_map_offset_t)(VM_MIN_ADDRESS),
744				(vm_map_offset_t)(VM_MAX_ADDRESS), TRUE);
745
746	/* Inherit memlock limit from parent */
747	if (parent_task)
748		vm_map_set_user_wire_limit(new_task->map, (vm_size_t)parent_task->map->user_wire_limit);
749
750	lck_mtx_init(&new_task->lock, &task_lck_grp, &task_lck_attr);
751	queue_init(&new_task->threads);
752	new_task->suspend_count = 0;
753	new_task->thread_count = 0;
754	new_task->active_thread_count = 0;
755	new_task->user_stop_count = 0;
756	new_task->legacy_stop_count = 0;
757	new_task->active = TRUE;
758	new_task->halting = FALSE;
759	new_task->user_data = NULL;
760	new_task->faults = 0;
761	new_task->cow_faults = 0;
762	new_task->pageins = 0;
763	new_task->messages_sent = 0;
764	new_task->messages_received = 0;
765	new_task->syscalls_mach = 0;
766	new_task->priv_flags = 0;
767	new_task->syscalls_unix=0;
768	new_task->c_switch = new_task->p_switch = new_task->ps_switch = 0;
769	new_task->t_flags = 0;
770	new_task->importance = 0;
771
772#if CONFIG_ATM
773	new_task->atm_context = NULL;
774#endif
775#if CONFIG_BANK
776	new_task->bank_context = NULL;
777#endif
778
779	zinfo_task_init(new_task);
780
781#ifdef MACH_BSD
782	new_task->bsd_info = NULL;
783#endif /* MACH_BSD */
784
785#if CONFIG_JETSAM
786	if (max_task_footprint != 0) {
787		ledger_set_limit(ledger, task_ledgers.phys_footprint, max_task_footprint, PHYS_FOOTPRINT_WARNING_LEVEL);
788	}
789#endif
790
791	if (task_wakeups_monitor_rate != 0) {
792		uint32_t flags = WAKEMON_ENABLE | WAKEMON_SET_DEFAULTS;
793		int32_t  rate; // Ignored because of WAKEMON_SET_DEFAULTS
794		task_wakeups_monitor_ctl(new_task, &flags, &rate);
795	}
796
797#if defined(__i386__) || defined(__x86_64__)
798	new_task->i386_ldt = 0;
799#endif
800
801	new_task->task_debug = NULL;
802
803	queue_init(&new_task->semaphore_list);
804	new_task->semaphores_owned = 0;
805
806	ipc_task_init(new_task, parent_task);
807
808	new_task->total_user_time = 0;
809	new_task->total_system_time = 0;
810
811	new_task->vtimers = 0;
812
813	new_task->shared_region = NULL;
814
815	new_task->affinity_space = NULL;
816
817#if CONFIG_COUNTERS
818	new_task->t_chud = 0U;
819#endif
820
821	new_task->pidsuspended = FALSE;
822	new_task->frozen = FALSE;
823	new_task->changing_freeze_state = FALSE;
824	new_task->rusage_cpu_flags = 0;
825	new_task->rusage_cpu_percentage = 0;
826	new_task->rusage_cpu_interval = 0;
827	new_task->rusage_cpu_deadline = 0;
828	new_task->rusage_cpu_callt = NULL;
829#if MACH_ASSERT
830	new_task->suspends_outstanding = 0;
831#endif
832
833#if HYPERVISOR
834	new_task->hv_task_target = NULL;
835#endif /* HYPERVISOR */
836
837
838	new_task->low_mem_notified_warn = 0;
839	new_task->low_mem_notified_critical = 0;
840	new_task->purged_memory_warn = 0;
841	new_task->purged_memory_critical = 0;
842	new_task->mem_notify_reserved = 0;
843#if IMPORTANCE_INHERITANCE
844	new_task->task_imp_base = NULL;
845#endif /* IMPORTANCE_INHERITANCE */
846
847#if	defined(__x86_64__)
848	new_task->uexc_range_start = new_task->uexc_range_size = new_task->uexc_handler = 0;
849#endif
850
851	new_task->requested_policy = default_task_requested_policy;
852	new_task->effective_policy = default_task_effective_policy;
853	new_task->pended_policy    = default_task_pended_policy;
854
855	if (parent_task != TASK_NULL) {
856		new_task->sec_token = parent_task->sec_token;
857		new_task->audit_token = parent_task->audit_token;
858
859		/* inherit the parent's shared region */
860		shared_region = vm_shared_region_get(parent_task);
861		vm_shared_region_set(new_task, shared_region);
862
863		if(task_has_64BitAddr(parent_task))
864			task_set_64BitAddr(new_task);
865		new_task->all_image_info_addr = parent_task->all_image_info_addr;
866		new_task->all_image_info_size = parent_task->all_image_info_size;
867
868#if defined(__i386__) || defined(__x86_64__)
869		if (inherit_memory && parent_task->i386_ldt)
870			new_task->i386_ldt = user_ldt_copy(parent_task->i386_ldt);
871#endif
872		if (inherit_memory && parent_task->affinity_space)
873			task_affinity_create(parent_task, new_task);
874
875		new_task->pset_hint = parent_task->pset_hint = task_choose_pset(parent_task);
876
877#if IMPORTANCE_INHERITANCE
878		ipc_importance_task_t new_task_imp = IIT_NULL;
879
880		if (task_is_marked_importance_donor(parent_task)) {
881			new_task_imp = ipc_importance_for_task(new_task, FALSE);
882			assert(IIT_NULL != new_task_imp);
883			ipc_importance_task_mark_donor(new_task_imp, TRUE);
884		}
885		/* Embedded doesn't want this to inherit */
886		if (task_is_marked_importance_receiver(parent_task)) {
887			if (IIT_NULL == new_task_imp)
888				new_task_imp = ipc_importance_for_task(new_task, FALSE);
889			assert(IIT_NULL != new_task_imp);
890			ipc_importance_task_mark_receiver(new_task_imp, TRUE);
891		}
892		if (task_is_marked_importance_denap_receiver(parent_task)) {
893			if (IIT_NULL == new_task_imp)
894				new_task_imp = ipc_importance_for_task(new_task, FALSE);
895			assert(IIT_NULL != new_task_imp);
896			ipc_importance_task_mark_denap_receiver(new_task_imp, TRUE);
897		}
898
899		if (IIT_NULL != new_task_imp) {
900			assert(new_task->task_imp_base == new_task_imp);
901			ipc_importance_task_release(new_task_imp);
902		}
903#endif /* IMPORTANCE_INHERITANCE */
904
905		new_task->priority = BASEPRI_DEFAULT;
906		new_task->max_priority = MAXPRI_USER;
907
908		new_task->requested_policy.t_apptype     = parent_task->requested_policy.t_apptype;
909
910		new_task->requested_policy.int_darwinbg  = parent_task->requested_policy.int_darwinbg;
911		new_task->requested_policy.ext_darwinbg  = parent_task->requested_policy.ext_darwinbg;
912		new_task->requested_policy.int_iotier    = parent_task->requested_policy.int_iotier;
913		new_task->requested_policy.ext_iotier    = parent_task->requested_policy.ext_iotier;
914		new_task->requested_policy.int_iopassive = parent_task->requested_policy.int_iopassive;
915		new_task->requested_policy.ext_iopassive = parent_task->requested_policy.ext_iopassive;
916		new_task->requested_policy.bg_iotier     = parent_task->requested_policy.bg_iotier;
917		new_task->requested_policy.terminated    = parent_task->requested_policy.terminated;
918		new_task->requested_policy.t_qos_clamp   = parent_task->requested_policy.t_qos_clamp;
919
920		task_policy_create(new_task, parent_task->requested_policy.t_boosted);
921	} else {
922		new_task->sec_token = KERNEL_SECURITY_TOKEN;
923		new_task->audit_token = KERNEL_AUDIT_TOKEN;
924#ifdef __LP64__
925		if(is_64bit)
926			task_set_64BitAddr(new_task);
927#endif
928		new_task->all_image_info_addr = (mach_vm_address_t)0;
929		new_task->all_image_info_size = (mach_vm_size_t)0;
930
931		new_task->pset_hint = PROCESSOR_SET_NULL;
932
933		if (kernel_task == TASK_NULL) {
934			new_task->priority = BASEPRI_KERNEL;
935			new_task->max_priority = MAXPRI_KERNEL;
936		} else {
937			new_task->priority = BASEPRI_DEFAULT;
938			new_task->max_priority = MAXPRI_USER;
939		}
940	}
941
942	new_task->coalition = COALITION_NULL;
943
944#if CONFIG_COALITIONS
945	if (parent_coalition) {
946		coalition_adopt_task(parent_coalition, new_task);
947	} else if (parent_task && parent_task->coalition) {
948		coalition_adopt_task(parent_task->coalition, new_task);
949	} else {
950		coalition_default_adopt_task(new_task);
951	}
952
953	if (new_task->coalition == COALITION_NULL) {
954		panic("created task is not a member of any coalition");
955	}
956#endif /* CONFIG_COALITIONS */
957
958	/* Allocate I/O Statistics */
959	new_task->task_io_stats = (io_stat_info_t)kalloc(sizeof(struct io_stat_info));
960	assert(new_task->task_io_stats != NULL);
961	bzero(new_task->task_io_stats, sizeof(struct io_stat_info));
962
963	bzero(&(new_task->cpu_time_qos_stats), sizeof(struct _cpu_time_qos_stats));
964
965	bzero(&new_task->extmod_statistics, sizeof(new_task->extmod_statistics));
966	new_task->task_timer_wakeups_bin_1 = new_task->task_timer_wakeups_bin_2 = 0;
967	new_task->task_gpu_ns = 0;
968	lck_mtx_lock(&tasks_threads_lock);
969	queue_enter(&tasks, new_task, task_t, tasks);
970	tasks_count++;
971	lck_mtx_unlock(&tasks_threads_lock);
972
973	if (vm_backing_store_low && parent_task != NULL)
974		new_task->priv_flags |= (parent_task->priv_flags&VM_BACKING_STORE_PRIV);
975
976	new_task->task_volatile_objects = 0;
977	new_task->task_nonvolatile_objects = 0;
978	new_task->task_purgeable_disowning = FALSE;
979	new_task->task_purgeable_disowned = FALSE;
980
981	ipc_task_enable(new_task);
982
983	*child_task = new_task;
984	return(KERN_SUCCESS);
985}
986
987int task_dropped_imp_count = 0;
988
989/*
990 *	task_deallocate:
991 *
992 *	Drop a reference on a task.
993 */
994void
995task_deallocate(
996	task_t		task)
997{
998	ledger_amount_t credit, debit, interrupt_wakeups, platform_idle_wakeups;
999	uint32_t refs;
1000
1001	if (task == TASK_NULL)
1002	    return;
1003
1004	refs = task_deallocate_internal(task);
1005
1006#if IMPORTANCE_INHERITANCE
1007	if (refs > 1)
1008		return;
1009
1010	if (refs == 1) {
1011		/*
1012		 * If last ref potentially comes from the task's importance,
1013		 * disconnect it.  But more task refs may be added before
1014		 * that completes, so wait for the reference to go to zero
		 * naturally (it may happen on a recursive task_deallocate()
1016		 * from the ipc_importance_disconnect_task() call).
1017		 */
1018		if (IIT_NULL != task->task_imp_base)
1019			ipc_importance_disconnect_task(task);
1020		return;
1021	}
1022#else
1023	if (refs > 0)
1024		return;
1025#endif /* IMPORTANCE_INHERITANCE */
1026
1027	lck_mtx_lock(&tasks_threads_lock);
1028	queue_remove(&terminated_tasks, task, task_t, tasks);
1029	terminated_tasks_count--;
1030	lck_mtx_unlock(&tasks_threads_lock);
1031
1032	/*
1033	 * remove the reference on atm descriptor
1034	 */
1035	 task_atm_reset(task);
1036
1037#if CONFIG_BANK
1038	/*
1039	 * remove the reference on bank context
1040	 */
1041	if (task->bank_context != NULL) {
1042		bank_task_destroy(task->bank_context);
1043		task->bank_context = NULL;
1044	}
1045#endif
1046
1047	if (task->task_io_stats)
1048		kfree(task->task_io_stats, sizeof(struct io_stat_info));
1049
1050	/*
1051	 *	Give the machine dependent code a chance
1052	 *	to perform cleanup before ripping apart
1053	 *	the task.
1054	 */
1055	machine_task_terminate(task);
1056
1057	ipc_task_terminate(task);
1058
1059	if (task->affinity_space)
1060		task_affinity_deallocate(task);
1061
1062#if MACH_ASSERT
1063	if (task->ledger != NULL &&
1064	    task->map != NULL &&
1065	    task->map->pmap != NULL &&
1066	    task->map->pmap->ledger != NULL) {
1067		assert(task->ledger == task->map->pmap->ledger);
1068	}
1069#endif /* MACH_ASSERT */
1070
1071	vm_purgeable_disown(task);
1072	assert(task->task_purgeable_disowned);
1073	if (task->task_volatile_objects != 0 ||
1074	    task->task_nonvolatile_objects != 0) {
1075		panic("task_deallocate(%p): "
1076		      "volatile_objects=%d nonvolatile_objects=%d\n",
1077		      task,
1078		      task->task_volatile_objects,
1079		      task->task_nonvolatile_objects);
1080	}
1081
1082	vm_map_deallocate(task->map);
1083	is_release(task->itk_space);
1084
1085	ledger_get_entries(task->ledger, task_ledgers.interrupt_wakeups,
1086	                   &interrupt_wakeups, &debit);
1087	ledger_get_entries(task->ledger, task_ledgers.platform_idle_wakeups,
1088	                   &platform_idle_wakeups, &debit);
1089
1090#if defined(CONFIG_SCHED_MULTIQ)
1091	sched_group_destroy(task->sched_group);
1092#endif
1093
1094	/* Accumulate statistics for dead tasks */
1095	lck_spin_lock(&dead_task_statistics_lock);
1096	dead_task_statistics.total_user_time += task->total_user_time;
1097	dead_task_statistics.total_system_time += task->total_system_time;
1098
1099	dead_task_statistics.task_interrupt_wakeups += interrupt_wakeups;
1100	dead_task_statistics.task_platform_idle_wakeups += platform_idle_wakeups;
1101
1102	dead_task_statistics.task_timer_wakeups_bin_1 += task->task_timer_wakeups_bin_1;
1103	dead_task_statistics.task_timer_wakeups_bin_2 += task->task_timer_wakeups_bin_2;
1104
1105	lck_spin_unlock(&dead_task_statistics_lock);
1106	lck_mtx_destroy(&task->lock, &task_lck_grp);
1107
1108	if (!ledger_get_entries(task->ledger, task_ledgers.tkm_private, &credit,
1109	    &debit)) {
1110		OSAddAtomic64(credit, (int64_t *)&tasks_tkm_private.alloc);
1111		OSAddAtomic64(debit, (int64_t *)&tasks_tkm_private.free);
1112	}
1113	if (!ledger_get_entries(task->ledger, task_ledgers.tkm_shared, &credit,
1114	    &debit)) {
1115		OSAddAtomic64(credit, (int64_t *)&tasks_tkm_shared.alloc);
1116		OSAddAtomic64(debit, (int64_t *)&tasks_tkm_shared.free);
1117	}
1118	ledger_dereference(task->ledger);
1119	zinfo_task_free(task);
1120
1121#if TASK_REFERENCE_LEAK_DEBUG
1122	btlog_remove_entries_for_element(task_ref_btlog, task);
1123#endif
1124
1125#if CONFIG_COALITIONS
1126	if (!task->coalition) {
1127		panic("deallocating task was not a member of any coalition");
1128	}
1129	coalition_release(task->coalition);
1130#endif /* CONFIG_COALITIONS */
1131
1132	task->coalition = COALITION_NULL;
1133
1134	zfree(task_zone, task);
1135}
1136
1137/*
1138 *	task_name_deallocate:
1139 *
1140 *	Drop a reference on a task name.
1141 */
1142void
1143task_name_deallocate(
1144	task_name_t		task_name)
1145{
1146	return(task_deallocate((task_t)task_name));
1147}
1148
1149/*
1150 *	task_suspension_token_deallocate:
1151 *
1152 *	Drop a reference on a task suspension token.
1153 */
1154void
1155task_suspension_token_deallocate(
1156	task_suspension_token_t		token)
1157{
1158	return(task_deallocate((task_t)token));
1159}
1160
1161/*
1162 *	task_terminate:
1163 *
1164 *	Terminate the specified task.  See comments on thread_terminate
1165 *	(kern/thread.c) about problems with terminating the "current task."
1166 */
1167
1168kern_return_t
1169task_terminate(
1170	task_t		task)
1171{
1172	if (task == TASK_NULL)
1173		return (KERN_INVALID_ARGUMENT);
1174
1175	if (task->bsd_info)
1176		return (KERN_FAILURE);
1177
1178	return (task_terminate_internal(task));
1179}
1180
1181#if MACH_ASSERT
1182extern int proc_pid(struct proc *);
1183extern void proc_name_kdp(task_t t, char *buf, int size);
1184#endif /* MACH_ASSERT */
1185
1186#define VM_MAP_PARTIAL_REAP 0x54  /* 0x150 */
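/*
 * Reclaim what can be released from a task's address space ahead of full
 * teardown; vm_map_partial_reap() reports how many resident and compressed
 * pages were reclaimed, which is traced along with the task's footprint.
 */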
1187static void
1188__unused task_partial_reap(task_t task, __unused int pid)
1189{
1190        unsigned int    reclaimed_resident = 0;
1191        unsigned int    reclaimed_compressed = 0;
1192	uint64_t        task_page_count;
1193
1194	task_page_count = (get_task_phys_footprint(task) / PAGE_SIZE_64);
1195
1196	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_MAP_PARTIAL_REAP) | DBG_FUNC_START),
1197                              pid, task_page_count, 0, 0, 0);
1198
1199	vm_map_partial_reap(task->map, &reclaimed_resident, &reclaimed_compressed);
1200
1201        KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_MAP_PARTIAL_REAP) | DBG_FUNC_END),
1202                              pid, reclaimed_resident, reclaimed_compressed, 0, 0);
1203}
1204
1205kern_return_t
1206task_terminate_internal(
1207	task_t			task)
1208{
1209	thread_t			thread, self;
1210	task_t				self_task;
1211	boolean_t			interrupt_save;
1212	int				pid = 0;
1213
1214	assert(task != kernel_task);
1215
1216	self = current_thread();
1217	self_task = self->task;
1218
1219	/*
1220	 *	Get the task locked and make sure that we are not racing
1221	 *	with someone else trying to terminate us.
1222	 */
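	/*
	 * When two distinct tasks must be locked, take them in ascending
	 * address order so that two concurrent terminators cannot deadlock
	 * against each other.
	 */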
1223	if (task == self_task)
1224		task_lock(task);
1225	else
1226	if (task < self_task) {
1227		task_lock(task);
1228		task_lock(self_task);
1229	}
1230	else {
1231		task_lock(self_task);
1232		task_lock(task);
1233	}
1234
1235	if (!task->active) {
1236		/*
1237		 *	Task is already being terminated.
1238		 *	Just return an error. If we are dying, this will
1239		 *	just get us to our AST special handler and that
1240		 *	will get us to finalize the termination of ourselves.
1241		 */
1242		task_unlock(task);
1243		if (self_task != task)
1244			task_unlock(self_task);
1245
1246		return (KERN_FAILURE);
1247	}
1248
1249	if (self_task != task)
1250		task_unlock(self_task);
1251
1252	/*
1253	 * Make sure the current thread does not get aborted out of
1254	 * the waits inside these operations.
1255	 */
1256	interrupt_save = thread_interrupt_level(THREAD_UNINT);
1257
1258	/*
1259	 *	Indicate that we want all the threads to stop executing
1260	 *	at user space by holding the task (we would have held
1261	 *	each thread independently in thread_terminate_internal -
1262	 *	but this way we may be more likely to already find it
1263	 *	held there).  Mark the task inactive, and prevent
1264	 *	further task operations via the task port.
1265	 */
1266	task_hold_locked(task);
1267	task->active = FALSE;
1268	ipc_task_disable(task);
1269
1270#if CONFIG_TELEMETRY
1271	/*
1272	 * Notify telemetry that this task is going away.
1273	 */
1274	telemetry_task_ctl_locked(task, TF_TELEMETRY, 0);
1275#endif
1276
1277	/*
1278	 *	Terminate each thread in the task.
1279	 */
1280	queue_iterate(&task->threads, thread, thread_t, task_threads) {
1281			thread_terminate_internal(thread);
1282	}
1283
1284#ifdef MACH_BSD
1285	if (task->bsd_info != NULL) {
1286		pid = proc_pid(task->bsd_info);
1287	}
1288#endif /* MACH_BSD */
1289
1290	task_unlock(task);
1291
1292        /* Early object reap phase */
1293
1294// PR-17045188: Revisit implementation
1295//        task_partial_reap(task, pid);
1296
1297
1298	/*
1299	 *	Destroy all synchronizers owned by the task.
1300	 */
1301	task_synchronizer_destroy_all(task);
1302
1303	/*
1304	 *	Destroy the IPC space, leaving just a reference for it.
1305	 */
1306	ipc_space_terminate(task->itk_space);
1307
1308#if 00
1309	/* if some ledgers go negative on tear-down again... */
1310	ledger_disable_panic_on_negative(task->map->pmap->ledger,
1311					 task_ledgers.phys_footprint);
1312	ledger_disable_panic_on_negative(task->map->pmap->ledger,
1313					 task_ledgers.internal);
1314	ledger_disable_panic_on_negative(task->map->pmap->ledger,
1315					 task_ledgers.internal_compressed);
1316	ledger_disable_panic_on_negative(task->map->pmap->ledger,
1317					 task_ledgers.iokit_mapped);
1318	ledger_disable_panic_on_negative(task->map->pmap->ledger,
1319					 task_ledgers.alternate_accounting);
1320#endif
1321
1322	/*
1323	 * If the current thread is a member of the task
1324	 * being terminated, then the last reference to
1325	 * the task will not be dropped until the thread
1326	 * is finally reaped.  To avoid incurring the
1327	 * expense of removing the address space regions
	 * at reap time, we do it explicitly here.
1329	 */
1330	vm_map_remove(task->map,
1331		      task->map->min_offset,
1332		      task->map->max_offset,
1333		      VM_MAP_NO_FLAGS);
1334
1335	/* release our shared region */
1336	vm_shared_region_set(task, NULL);
1337
1338#if MACH_ASSERT
1339	/*
1340	 * Identify the pmap's process, in case the pmap ledgers drift
1341	 * and we have to report it.
1342	 */
1343	char procname[17];
1344	if (task->bsd_info) {
1345		pid = proc_pid(task->bsd_info);
1346		proc_name_kdp(task, procname, sizeof (procname));
1347	} else {
1348		pid = 0;
1349		strlcpy(procname, "<unknown>", sizeof (procname));
1350	}
1351	pmap_set_process(task->map->pmap, pid, procname);
1352#endif /* MACH_ASSERT */
1353
1354	lck_mtx_lock(&tasks_threads_lock);
1355	queue_remove(&tasks, task, task_t, tasks);
1356	queue_enter(&terminated_tasks, task, task_t, tasks);
1357	tasks_count--;
1358	terminated_tasks_count++;
1359	lck_mtx_unlock(&tasks_threads_lock);
1360
1361	/*
1362	 * We no longer need to guard against being aborted, so restore
1363	 * the previous interruptible state.
1364	 */
1365	thread_interrupt_level(interrupt_save);
1366
1367#if KPERF
1368	/* force the task to release all ctrs */
1369	if (task->t_chud & TASK_KPC_FORCED_ALL_CTRS)
1370		kpc_force_all_ctrs(task, 0);
1371#endif
1372
1373#if CONFIG_COALITIONS
1374	/*
1375	 * Leave our coalition. (drop activation but not reference)
1376	 */
1377	coalition_remove_task(task);
1378#endif
1379
1380	/*
1381	 * Get rid of the task active reference on itself.
1382	 */
1383	task_deallocate(task);
1384
1385	return (KERN_SUCCESS);
1386}
1387
1388/*
1389 * task_start_halt:
1390 *
1391 * 	Shut the current task down (except for the current thread) in
1392 *	preparation for dramatic changes to the task (probably exec).
1393 *	We hold the task and mark all other threads in the task for
1394 *	termination.
1395 */
1396kern_return_t
1397task_start_halt(
1398	task_t		task)
1399{
1400	thread_t	thread, self;
1401
1402	assert(task != kernel_task);
1403
1404	self = current_thread();
1405
1406	if (task != self->task)
1407		return (KERN_INVALID_ARGUMENT);
1408
1409	task_lock(task);
1410
1411	if (task->halting || !task->active || !self->active) {
1412		/*
1413		 *	Task or current thread is already being terminated.
1414		 *	Hurry up and return out of the current kernel context
1415		 *	so that we run our AST special handler to terminate
1416		 *	ourselves.
1417		 */
1418		task_unlock(task);
1419
1420		return (KERN_FAILURE);
1421	}
1422
1423	task->halting = TRUE;
1424
1425	if (task->thread_count > 1) {
1426
1427		/*
1428		 * Mark all the threads to keep them from starting any more
1429		 * user-level execution.  The thread_terminate_internal code
1430		 * would do this on a thread by thread basis anyway, but this
1431		 * gives us a better chance of not having to wait there.
1432		 */
1433		task_hold_locked(task);
1434
1435		/*
1436		 *	Terminate all the other threads in the task.
1437		 */
1438		queue_iterate(&task->threads, thread, thread_t, task_threads) {
1439			if (thread != self)
1440				thread_terminate_internal(thread);
1441		}
1442
1443		task_release_locked(task);
1444	}
1445	task_unlock(task);
1446	return KERN_SUCCESS;
1447}
1448
1449
1450/*
1451 * task_complete_halt:
1452 *
1453 *	Complete task halt by waiting for threads to terminate, then clean
1454 *	up task resources (VM, port namespace, etc...) and then let the
1455 *	current thread go in the (practically empty) task context.
1456 */
1457void
1458task_complete_halt(task_t task)
1459{
1460	task_lock(task);
1461	assert(task->halting);
1462	assert(task == current_task());
1463
1464	/*
1465	 *	Wait for the other threads to get shut down.
1466	 *      When the last other thread is reaped, we'll be
1467	 *	woken up.
1468	 */
1469	if (task->thread_count > 1) {
1470		assert_wait((event_t)&task->halting, THREAD_UNINT);
1471		task_unlock(task);
1472		thread_block(THREAD_CONTINUE_NULL);
1473	} else {
1474		task_unlock(task);
1475	}
1476
1477	/*
1478	 *	Give the machine dependent code a chance
1479	 *	to perform cleanup of task-level resources
1480	 *	associated with the current thread before
1481	 *	ripping apart the task.
1482	 */
1483	machine_task_terminate(task);
1484
1485	/*
1486	 *	Destroy all synchronizers owned by the task.
1487	 */
1488	task_synchronizer_destroy_all(task);
1489
1490	/*
1491	 *	Destroy the contents of the IPC space, leaving just
1492	 *	a reference for it.
1493	 */
1494	ipc_space_clean(task->itk_space);
1495
1496	/*
1497	 * Clean out the address space, as we are going to be
1498	 * getting a new one.
1499	 */
1500	vm_map_remove(task->map, task->map->min_offset,
1501		      task->map->max_offset, VM_MAP_NO_FLAGS);
1502
1503	task->halting = FALSE;
1504}
1505
1506/*
1507 *	task_hold_locked:
1508 *
1509 *	Suspend execution of the specified task.
1510 *	This is a recursive-style suspension of the task, a count of
1511 *	suspends is maintained.
1512 *
1513 * 	CONDITIONS: the task is locked and active.
1514 */
1515void
1516task_hold_locked(
1517	register task_t		task)
1518{
1519	register thread_t	thread;
1520
1521	assert(task->active);
1522
1523	if (task->suspend_count++ > 0)
1524		return;
1525
1526	/*
1527	 *	Iterate through all the threads and hold them.
1528	 */
1529	queue_iterate(&task->threads, thread, thread_t, task_threads) {
1530		thread_mtx_lock(thread);
1531		thread_hold(thread);
1532		thread_mtx_unlock(thread);
1533	}
1534}
1535
1536/*
1537 *	task_hold:
1538 *
 *	Same as the internal routine above, except that it must lock
 *	and verify that the task is active.  This differs from task_suspend
 *	in that it places a kernel hold on the task rather than just a
 *	user-level hold.  This keeps users from over-resuming and setting
 *	it running out from under the kernel.
1544 *
1545 * 	CONDITIONS: the caller holds a reference on the task
1546 */
1547kern_return_t
1548task_hold(
1549	register task_t		task)
1550{
1551	if (task == TASK_NULL)
1552		return (KERN_INVALID_ARGUMENT);
1553
1554	task_lock(task);
1555
1556	if (!task->active) {
1557		task_unlock(task);
1558
1559		return (KERN_FAILURE);
1560	}
1561
1562	task_hold_locked(task);
1563	task_unlock(task);
1564
1565	return (KERN_SUCCESS);
1566}
1567
1568kern_return_t
1569task_wait(
1570		task_t		task,
1571		boolean_t	until_not_runnable)
1572{
1573	if (task == TASK_NULL)
1574		return (KERN_INVALID_ARGUMENT);
1575
1576	task_lock(task);
1577
1578	if (!task->active) {
1579		task_unlock(task);
1580
1581		return (KERN_FAILURE);
1582	}
1583
1584	task_wait_locked(task, until_not_runnable);
1585	task_unlock(task);
1586
1587	return (KERN_SUCCESS);
1588}
1589
1590/*
1591 *	task_wait_locked:
1592 *
1593 *	Wait for all threads in task to stop.
1594 *
1595 * Conditions:
1596 *	Called with task locked, active, and held.
1597 */
1598void
1599task_wait_locked(
1600	register task_t		task,
1601	boolean_t		until_not_runnable)
1602{
1603	register thread_t	thread, self;
1604
1605	assert(task->active);
1606	assert(task->suspend_count > 0);
1607
1608	self = current_thread();
1609
1610	/*
1611	 *	Iterate through all the threads and wait for them to
1612	 *	stop.  Do not wait for the current thread if it is within
1613	 *	the task.
1614	 */
1615	queue_iterate(&task->threads, thread, thread_t, task_threads) {
1616		if (thread != self)
1617			thread_wait(thread, until_not_runnable);
1618	}
1619}
1620
1621/*
1622 *	task_release_locked:
1623 *
1624 *	Release a kernel hold on a task.
1625 *
1626 * 	CONDITIONS: the task is locked and active
1627 */
1628void
1629task_release_locked(
1630	register task_t		task)
1631{
1632	register thread_t	thread;
1633
1634	assert(task->active);
1635	assert(task->suspend_count > 0);
1636
1637	if (--task->suspend_count > 0)
1638		return;
1639
1640	queue_iterate(&task->threads, thread, thread_t, task_threads) {
1641		thread_mtx_lock(thread);
1642		thread_release(thread);
1643		thread_mtx_unlock(thread);
1644	}
1645}
1646
1647/*
1648 *	task_release:
1649 *
1650 *	Same as the internal routine above, except that it must lock
1651 *	and verify that the task is active.
1652 *
1653 * 	CONDITIONS: The caller holds a reference to the task
1654 */
1655kern_return_t
1656task_release(
1657	task_t		task)
1658{
1659	if (task == TASK_NULL)
1660		return (KERN_INVALID_ARGUMENT);
1661
1662	task_lock(task);
1663
1664	if (!task->active) {
1665		task_unlock(task);
1666
1667		return (KERN_FAILURE);
1668	}
1669
1670	task_release_locked(task);
1671	task_unlock(task);
1672
1673	return (KERN_SUCCESS);
1674}
1675
1676kern_return_t
1677task_threads(
1678	task_t					task,
1679	thread_act_array_t		*threads_out,
1680	mach_msg_type_number_t	*count)
1681{
1682	mach_msg_type_number_t	actual;
1683	thread_t				*thread_list;
1684	thread_t				thread;
1685	vm_size_t				size, size_needed;
1686	void					*addr;
1687	unsigned int			i, j;
1688
1689	if (task == TASK_NULL)
1690		return (KERN_INVALID_ARGUMENT);
1691
1692	size = 0; addr = NULL;
1693
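	/*
	 * Allocate-and-retry: the buffer is sized and allocated with the task
	 * unlocked, so the thread count may have grown by the time the lock is
	 * reacquired; loop until the allocation covers the count observed
	 * under the lock.
	 */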
1694	for (;;) {
1695		task_lock(task);
1696		if (!task->active) {
1697			task_unlock(task);
1698
1699			if (size != 0)
1700				kfree(addr, size);
1701
1702			return (KERN_FAILURE);
1703		}
1704
1705		actual = task->thread_count;
1706
1707		/* do we have the memory we need? */
1708		size_needed = actual * sizeof (mach_port_t);
1709		if (size_needed <= size)
1710			break;
1711
1712		/* unlock the task and allocate more memory */
1713		task_unlock(task);
1714
1715		if (size != 0)
1716			kfree(addr, size);
1717
1718		assert(size_needed > 0);
1719		size = size_needed;
1720
1721		addr = kalloc(size);
1722		if (addr == 0)
1723			return (KERN_RESOURCE_SHORTAGE);
1724	}
1725
1726	/* OK, have memory and the task is locked & active */
1727	thread_list = (thread_t *)addr;
1728
1729	i = j = 0;
1730
1731	for (thread = (thread_t)queue_first(&task->threads); i < actual;
1732				++i, thread = (thread_t)queue_next(&thread->task_threads)) {
1733		thread_reference_internal(thread);
1734		thread_list[j++] = thread;
1735	}
1736
1737	assert(queue_end(&task->threads, (queue_entry_t)thread));
1738
1739	actual = j;
1740	size_needed = actual * sizeof (mach_port_t);
1741
1742	/* can unlock task now that we've got the thread refs */
1743	task_unlock(task);
1744
1745	if (actual == 0) {
1746		/* no threads, so return null pointer and deallocate memory */
1747
1748		*threads_out = NULL;
1749		*count = 0;
1750
1751		if (size != 0)
1752			kfree(addr, size);
1753	}
1754	else {
1755		/* if we allocated too much, must copy */
1756
1757		if (size_needed < size) {
1758			void *newaddr;
1759
1760			newaddr = kalloc(size_needed);
1761			if (newaddr == 0) {
1762				for (i = 0; i < actual; ++i)
1763					thread_deallocate(thread_list[i]);
1764				kfree(addr, size);
1765				return (KERN_RESOURCE_SHORTAGE);
1766			}
1767
1768			bcopy(addr, newaddr, size_needed);
1769			kfree(addr, size);
1770			thread_list = (thread_t *)newaddr;
1771		}
1772
1773		*threads_out = thread_list;
1774		*count = actual;
1775
1776		/* do the conversion that Mig should handle */
1777
1778		for (i = 0; i < actual; ++i)
1779			((ipc_port_t *) thread_list)[i] = convert_thread_to_port(thread_list[i]);
1780	}
1781
1782	return (KERN_SUCCESS);
1783}
1784
1785#define TASK_HOLD_NORMAL	0
1786#define TASK_HOLD_PIDSUSPEND	1
1787#define TASK_HOLD_LEGACY	2
1788#define TASK_HOLD_LEGACY_ALL	3
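/*
 * Hold modes for place_task_hold()/release_task_hold():
 *   TASK_HOLD_NORMAL      - ordinary hold/release pairing
 *   TASK_HOLD_PIDSUSPEND  - pid-suspend holds, gated by task->pidsuspended
 *   TASK_HOLD_LEGACY      - old-style task_suspend(), counted in legacy_stop_count
 *   TASK_HOLD_LEGACY_ALL  - release every outstanding legacy hold at once
 */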
1789
1790static kern_return_t
1791place_task_hold    (
1792	register task_t task,
1793	int mode)
1794{
1795	if (!task->active) {
1796		return (KERN_FAILURE);
1797	}
1798
1799	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1800	    MACHDBG_CODE(DBG_MACH_IPC,MACH_TASK_SUSPEND) | DBG_FUNC_NONE,
1801	    proc_pid(task->bsd_info), ((thread_t)queue_first(&task->threads))->thread_id,
1802	    task->user_stop_count, task->user_stop_count + 1, 0);
1803
1804#if MACH_ASSERT
1805	current_task()->suspends_outstanding++;
1806#endif
1807
1808	if (mode == TASK_HOLD_LEGACY)
1809		task->legacy_stop_count++;
1810
1811	if (task->user_stop_count++ > 0) {
1812		/*
1813		 *	If the stop count was positive, the task is
1814		 *	already stopped and we can exit.
1815		 */
1816		return (KERN_SUCCESS);
1817	}
1818
1819	/*
1820	 * Put a kernel-level hold on the threads in the task (all
1821	 * user-level task suspensions added together represent a
1822	 * single kernel-level hold).  We then wait for the threads
1823	 * to stop executing user code.
1824	 */
1825	task_hold_locked(task);
1826	task_wait_locked(task, FALSE);
1827
1828	return (KERN_SUCCESS);
1829}
1830
1831static kern_return_t
1832release_task_hold    (
1833	register task_t		task,
1834	int           		mode)
1835{
1836	register boolean_t release = FALSE;
1837
1838	if (!task->active) {
1839		return (KERN_FAILURE);
1840	}
1841
1842	if (mode == TASK_HOLD_PIDSUSPEND) {
1843	    if (task->pidsuspended == FALSE) {
1844		    return (KERN_FAILURE);
1845	    }
1846	    task->pidsuspended = FALSE;
1847	}
1848
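	/*
	 * A task that is still pidsuspended keeps one stop outstanding; only
	 * stops above that baseline may be released here.
	 */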
1849	if (task->user_stop_count > (task->pidsuspended ? 1 : 0)) {
1850
1851		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1852		    MACHDBG_CODE(DBG_MACH_IPC,MACH_TASK_RESUME) | DBG_FUNC_NONE,
1853		    proc_pid(task->bsd_info), ((thread_t)queue_first(&task->threads))->thread_id,
1854		    task->user_stop_count, mode, task->legacy_stop_count);
1855
1856#if MACH_ASSERT
1857		/*
1858		 * This is obviously not robust; if we suspend one task and then resume a different one,
1859		 * we'll fly under the radar. This is only meant to catch the common case of a crashed
1860		 * or buggy suspender.
1861		 */
1862		current_task()->suspends_outstanding--;
1863#endif
1864
1865		if (mode == TASK_HOLD_LEGACY_ALL) {
1866			if (task->legacy_stop_count >= task->user_stop_count) {
1867				task->user_stop_count = 0;
1868				release = TRUE;
1869			} else {
1870				task->user_stop_count -= task->legacy_stop_count;
1871			}
1872			task->legacy_stop_count = 0;
1873		} else {
1874			if (mode == TASK_HOLD_LEGACY && task->legacy_stop_count > 0)
1875				task->legacy_stop_count--;
1876			if (--task->user_stop_count == 0)
1877				release = TRUE;
1878		}
1879	}
1880	else {
1881		return (KERN_FAILURE);
1882	}
1883
1884	/*
1885	 *	Release the task if necessary.
1886	 */
1887	if (release)
1888		task_release_locked(task);
1889
1890	return (KERN_SUCCESS);
1891}
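
/*
 * Worked illustration of the bookkeeping above (hypothetical caller
 * sequence, not additional code): place_task_hold()/release_task_hold()
 * are purely count based, so nested legacy suspends stack and the threads
 * only run again once the count drops back to zero.
 *
 *	task_suspend(t);   user_stop_count 0 -> 1, legacy_stop_count 0 -> 1, threads held
 *	task_suspend(t);   user_stop_count 1 -> 2, legacy_stop_count 1 -> 2, already held
 *	task_resume(t);    user_stop_count 2 -> 1, legacy_stop_count 2 -> 1, still held
 *	task_resume(t);    user_stop_count 1 -> 0, legacy_stop_count 1 -> 0, threads released
 */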
1892
1893
1894/*
1895 *	task_suspend:
1896 *
1897 *	Implement an (old-fashioned) user-level suspension on a task.
1898 *
1899 *	Because the user isn't expecting to have to manage a suspension
1900 *	token, we'll track it for them in the kernel in the form of a naked
1901 *	send right to the task's resume port.  All such send rights
1902 *	account for a single suspension against the task (unlike task_suspend2()
1903 *	where each caller gets a unique suspension count represented by a
1904 *	unique send-once right).
1905 *
1906 * Conditions:
1907 * 	The caller holds a reference to the task
1908 */
1909kern_return_t
1910task_suspend(
1911	register task_t		task)
1912{
1913	kern_return_t	 		kr;
1914	mach_port_t			port, send, old_notify;
1915	mach_port_name_t		name;
1916
1917	if (task == TASK_NULL || task == kernel_task)
1918		return (KERN_INVALID_ARGUMENT);
1919
1920	task_lock(task);
1921
1922	/*
1923	 * Claim a send right on the task resume port, and request a no-senders
1924	 * notification on that port (if none outstanding).
1925	 */
1926	if (task->itk_resume == IP_NULL) {
1927		task->itk_resume = ipc_port_alloc_kernel();
1928		if (!IP_VALID(task->itk_resume))
1929			panic("failed to create resume port");
1930		ipc_kobject_set(task->itk_resume, (ipc_kobject_t)task, IKOT_TASK_RESUME);
1931	}
1932
1933	port = task->itk_resume;
1934	ip_lock(port);
1935	assert(ip_active(port));
1936
1937	send = ipc_port_make_send_locked(port);
1938	assert(IP_VALID(send));
1939
1940	if (port->ip_nsrequest == IP_NULL) {
1941		ipc_port_nsrequest(port, port->ip_mscount, ipc_port_make_sonce_locked(port), &old_notify);
1942		assert(old_notify == IP_NULL);
1943		/* port unlocked */
1944	} else {
1945		ip_unlock(port);
1946	}
1947
1948	/*
1949	 * place a legacy hold on the task.
1950	 */
1951	kr = place_task_hold(task, TASK_HOLD_LEGACY);
1952	if (kr != KERN_SUCCESS) {
1953		task_unlock(task);
1954		ipc_port_release_send(send);
1955		return kr;
1956	}
1957
1958	task_unlock(task);
1959
1960	/*
1961	 * Copyout the send right into the calling task's IPC space.  It won't know it is there,
1962	 * but we'll look it up when calling a traditional resume.  Any IPC operations that
1963	 * deallocate the send right will auto-release the suspension.
1964	 */
1965	if ((kr = ipc_kmsg_copyout_object(current_task()->itk_space, (ipc_object_t)send,
1966		MACH_MSG_TYPE_MOVE_SEND, &name)) != KERN_SUCCESS) {
1967		printf("warning: %s(%d) failed to copyout suspension token for task %s(%d) with error: %d\n",
1968			proc_name_address(current_task()->bsd_info), proc_pid(current_task()->bsd_info),
1969			proc_name_address(task->bsd_info), proc_pid(task->bsd_info), kr);
1970		return (kr);
1971	}
1972
1973	return (kr);
1974}
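
/*
 * Illustrative sketch only (hypothetical user-space caller, not compiled as
 * part of this file): the legacy interface needs no explicit token
 * management because the kernel parks the resume-port send right in the
 * caller's IPC space on its behalf.
 *
 *	kern_return_t kr;
 *	task_t child;			// e.g. obtained via task_for_pid()
 *
 *	kr = task_suspend(child);	// threads stop running user code
 *	if (kr == KERN_SUCCESS) {
 *		// ... inspect the target ...
 *		kr = task_resume(child);	// finds and releases the hidden send right
 *	}
 */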
1975
1976/*
1977 *	task_resume:
1978 *		Release a user hold on a task.
1979 *
1980 * Conditions:
1981 *		The caller holds a reference to the task
1982 */
1983kern_return_t
1984task_resume(
1985	register task_t	task)
1986{
1987	kern_return_t	 kr;
1988	mach_port_name_t resume_port_name;
1989	ipc_entry_t		 resume_port_entry;
1990	ipc_space_t		 space = current_task()->itk_space;
1991
1992	if (task == TASK_NULL || task == kernel_task )
1993		return (KERN_INVALID_ARGUMENT);
1994
1995	/* release a legacy task hold */
1996	task_lock(task);
1997	kr = release_task_hold(task, TASK_HOLD_LEGACY);
1998	task_unlock(task);
1999
2000	is_write_lock(space);
2001	if (is_active(space) && IP_VALID(task->itk_resume) &&
2002	    ipc_hash_lookup(space, (ipc_object_t)task->itk_resume, &resume_port_name, &resume_port_entry) == TRUE) {
2003		/*
2004		 * We found a suspension token in the caller's IPC space. Release a send right to indicate that
2005		 * we are holding one less legacy hold on the task from this caller.  If the release failed,
2006		 * go ahead and drop all the rights, as someone either already released our holds or the task
2007		 * is gone.
2008		 */
2009		if (kr == KERN_SUCCESS)
2010			ipc_right_dealloc(space, resume_port_name, resume_port_entry);
2011		else
2012			ipc_right_destroy(space, resume_port_name, resume_port_entry, FALSE, 0);
2013		/* space unlocked */
2014	} else {
2015		is_write_unlock(space);
2016		if (kr == KERN_SUCCESS)
2017			printf("warning: %s(%d) performed out-of-band resume on %s(%d)\n",
2018			       proc_name_address(current_task()->bsd_info), proc_pid(current_task()->bsd_info),
2019			       proc_name_address(task->bsd_info), proc_pid(task->bsd_info));
2020	}
2021
2022	return kr;
2023}
2024
2025/*
2026 * Suspend the target task.
2027 * Making/holding a token/reference/port is the caller's responsibility.
2028 */
2029kern_return_t
2030task_suspend_internal(task_t task)
2031{
2032	kern_return_t	 kr;
2033
2034	if (task == TASK_NULL || task == kernel_task)
2035		return (KERN_INVALID_ARGUMENT);
2036
2037	task_lock(task);
2038	kr = place_task_hold(task, TASK_HOLD_NORMAL);
2039	task_unlock(task);
2040	return (kr);
2041}
2042
2043/*
2044 * Suspend the target task, and return a suspension token. The token
2045 * represents a reference on the suspended task.
2046 */
2047kern_return_t
2048task_suspend2(
2049	register task_t			task,
2050	task_suspension_token_t *suspend_token)
2051{
2052	kern_return_t	 kr;
2053
2054	kr = task_suspend_internal(task);
2055	if (kr != KERN_SUCCESS) {
2056		*suspend_token = TASK_NULL;
2057		return (kr);
2058	}
2059
2060	/*
2061	 * Take a reference on the target task and return that to the caller
2062	 * as a "suspension token," which can be converted into a send-once right to
2063	 * the now-suspended task's resume port.
2064	 */
2065	task_reference_internal(task);
2066	*suspend_token = task;
2067
2068	return (KERN_SUCCESS);
2069}
2070
2071/*
2072 * Resume the task
2073 * (reference/token/port management is caller's responsibility).
2074 */
2075kern_return_t
2076task_resume_internal(
2077	register task_suspension_token_t		task)
2078{
2079	kern_return_t kr;
2080
2081	if (task == TASK_NULL || task == kernel_task)
2082		return (KERN_INVALID_ARGUMENT);
2083
2084	task_lock(task);
2085	kr = release_task_hold(task, TASK_HOLD_NORMAL);
2086	task_unlock(task);
2087	return (kr);
2088}
2089
2090/*
2091 * Resume the task using a suspension token. Consumes the token's ref.
2092 */
2093kern_return_t
2094task_resume2(
2095	register task_suspension_token_t		task)
2096{
2097	kern_return_t kr;
2098
2099	kr = task_resume_internal(task);
2100	task_suspension_token_deallocate(task);
2101
2102	return (kr);
2103}
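
/*
 * Illustrative sketch only (hypothetical user-space caller, not compiled as
 * part of this file): unlike task_suspend()/task_resume(), every caller of
 * task_suspend2() gets its own suspension token and must pair it with
 * task_resume2(), which consumes the token's reference.
 *
 *	task_suspension_token_t token = TASK_NULL;
 *
 *	if (task_suspend2(child, &token) == KERN_SUCCESS) {
 *		// the target stays suspended for as long as we hold the token
 *		(void) task_resume2(token);
 *	}
 */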
2104
2105boolean_t
2106task_suspension_notify(mach_msg_header_t *request_header)
2107{
2108	ipc_port_t port = (ipc_port_t) request_header->msgh_remote_port;
2109	task_t task = convert_port_to_task_suspension_token(port);
2110	mach_msg_type_number_t not_count;
2111
2112	if (task == TASK_NULL || task == kernel_task)
2113		return TRUE;  /* nothing to do */
2114
2115	switch (request_header->msgh_id) {
2116
2117	case MACH_NOTIFY_SEND_ONCE:
2118		/* release the hold held by this specific send-once right */
2119		task_lock(task);
2120		release_task_hold(task, TASK_HOLD_NORMAL);
2121		task_unlock(task);
2122		break;
2123
2124	case MACH_NOTIFY_NO_SENDERS:
2125		not_count = ((mach_no_senders_notification_t *)request_header)->not_count;
2126
2127		task_lock(task);
2128		ip_lock(port);
2129		if (port->ip_mscount == not_count) {
2130
2131			/* release all the [remaining] outstanding legacy holds */
2132			assert(port->ip_nsrequest == IP_NULL);
2133			ip_unlock(port);
2134			release_task_hold(task, TASK_HOLD_LEGACY_ALL);
2135			task_unlock(task);
2136
2137		} else if (port->ip_nsrequest == IP_NULL) {
2138			ipc_port_t old_notify;
2139
2140			task_unlock(task);
2141			/* new send rights, re-arm notification at current make-send count */
2142			ipc_port_nsrequest(port, port->ip_mscount, ipc_port_make_sonce_locked(port), &old_notify);
2143			assert(old_notify == IP_NULL);
2144			/* port unlocked */
2145		} else {
2146			ip_unlock(port);
2147			task_unlock(task);
2148		}
2149		break;
2150
2151	default:
2152		break;
2153	}
2154
2155	task_suspension_token_deallocate(task); /* drop token reference */
2156	return TRUE;
2157}
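
/*
 * Worked illustration of the no-senders handling above (numbers are
 * hypothetical): the notification is only honored when its not_count still
 * matches the port's current make-send count; otherwise new suspension
 * send rights were created after the notification was generated, so the
 * request is simply re-armed.
 *
 *	mscount when notification fired:  3
 *	not_count == 3  -> release all remaining legacy holds
 *	not_count == 2  -> stale; re-arm nsrequest at mscount 3 and keep holds
 */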
2158
2159kern_return_t
2160task_pidsuspend_locked(task_t task)
2161{
2162	kern_return_t kr;
2163
2164	if (task->pidsuspended) {
2165		kr = KERN_FAILURE;
2166		goto out;
2167	}
2168
2169	task->pidsuspended = TRUE;
2170
2171	kr = place_task_hold(task, TASK_HOLD_PIDSUSPEND);
2172	if (kr != KERN_SUCCESS) {
2173		task->pidsuspended = FALSE;
2174	}
2175out:
2176	return(kr);
2177}
2178
2179
2180/*
2181 *	task_pidsuspend:
2182 *
2183 *	Suspends a task by placing a hold on its threads.
2184 *
2185 * Conditions:
2186 * 	The caller holds a reference to the task
2187 */
2188kern_return_t
2189task_pidsuspend(
2190	register task_t		task)
2191{
2192	kern_return_t	 kr;
2193
2194	if (task == TASK_NULL || task == kernel_task)
2195		return (KERN_INVALID_ARGUMENT);
2196
2197	task_lock(task);
2198
2199	kr = task_pidsuspend_locked(task);
2200
2201	task_unlock(task);
2202
2203	return (kr);
2204}
2205
2206/* If enabled, we bring all the frozen pages back in prior to resumption; otherwise, they're faulted back in on demand */
2207#define THAW_ON_RESUME 1
2208
2209/*
2210 *	task_pidresume:
2211 *		Resumes a previously suspended task.
2212 *
2213 * Conditions:
2214 *		The caller holds a reference to the task
2215 */
2216kern_return_t
2217task_pidresume(
2218	register task_t	task)
2219{
2220	kern_return_t	 kr;
2221
2222	if (task == TASK_NULL || task == kernel_task)
2223		return (KERN_INVALID_ARGUMENT);
2224
2225	task_lock(task);
2226
2227#if (CONFIG_FREEZE && THAW_ON_RESUME)
2228
2229	while (task->changing_freeze_state) {
2230
2231		assert_wait((event_t)&task->changing_freeze_state, THREAD_UNINT);
2232		task_unlock(task);
2233		thread_block(THREAD_CONTINUE_NULL);
2234
2235		task_lock(task);
2236	}
2237	task->changing_freeze_state = TRUE;
2238#endif
2239
2240	kr = release_task_hold(task, TASK_HOLD_PIDSUSPEND);
2241
2242	task_unlock(task);
2243
2244#if (CONFIG_FREEZE && THAW_ON_RESUME)
2245	if ((kr == KERN_SUCCESS) && (task->frozen == TRUE)) {
2246
2247		if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) {
2248
2249			kr = KERN_SUCCESS;
2250		} else {
2251
2252			kr = vm_map_thaw(task->map);
2253		}
2254	}
2255	task_lock(task);
2256
2257	if (kr == KERN_SUCCESS)
2258		task->frozen = FALSE;
2259	task->changing_freeze_state = FALSE;
2260	thread_wakeup(&task->changing_freeze_state);
2261
2262	task_unlock(task);
2263#endif
2264
2265	return (kr);
2266}
2267
2268#if CONFIG_FREEZE
2269
2270/*
2271 *	task_freeze:
2272 *
2273 *	Freeze a task.
2274 *
2275 * Conditions:
2276 * 	The caller holds a reference to the task
2277 */
2278kern_return_t
2279task_freeze(
2280	register task_t    task,
2281	uint32_t           *purgeable_count,
2282	uint32_t           *wired_count,
2283	uint32_t           *clean_count,
2284	uint32_t           *dirty_count,
2285	uint32_t           dirty_budget,
2286	boolean_t          *shared,
2287	boolean_t          walk_only)
2288{
2289	kern_return_t kr;
2290
2291	if (task == TASK_NULL || task == kernel_task)
2292		return (KERN_INVALID_ARGUMENT);
2293
2294	task_lock(task);
2295
2296	while (task->changing_freeze_state) {
2297
2298		assert_wait((event_t)&task->changing_freeze_state, THREAD_UNINT);
2299		task_unlock(task);
2300		thread_block(THREAD_CONTINUE_NULL);
2301
2302		task_lock(task);
2303	}
2304	if (task->frozen) {
2305		task_unlock(task);
2306		return (KERN_FAILURE);
2307	}
2308	task->changing_freeze_state = TRUE;
2309
2310	task_unlock(task);
2311
2312	if (walk_only) {
2313		kr = vm_map_freeze_walk(task->map, purgeable_count, wired_count, clean_count, dirty_count, dirty_budget, shared);
2314	} else {
2315		kr = vm_map_freeze(task->map, purgeable_count, wired_count, clean_count, dirty_count, dirty_budget, shared);
2316	}
2317
2318	task_lock(task);
2319
2320	if (walk_only == FALSE && kr == KERN_SUCCESS)
2321		task->frozen = TRUE;
2322	task->changing_freeze_state = FALSE;
2323	thread_wakeup(&task->changing_freeze_state);
2324
2325	task_unlock(task);
2326
2327	return (kr);
2328}
2329
2330/*
2331 *	task_thaw:
2332 *
2333 *	Thaw a currently frozen task.
2334 *
2335 * Conditions:
2336 * 	The caller holds a reference to the task
2337 */
2338extern void
2339vm_consider_waking_compactor_swapper(void);
2340
2341kern_return_t
2342task_thaw(
2343	register task_t		task)
2344{
2345	kern_return_t kr;
2346
2347	if (task == TASK_NULL || task == kernel_task)
2348		return (KERN_INVALID_ARGUMENT);
2349
2350	task_lock(task);
2351
2352	while (task->changing_freeze_state) {
2353
2354		assert_wait((event_t)&task->changing_freeze_state, THREAD_UNINT);
2355		task_unlock(task);
2356		thread_block(THREAD_CONTINUE_NULL);
2357
2358		task_lock(task);
2359	}
2360	if (!task->frozen) {
2361		task_unlock(task);
2362		return (KERN_FAILURE);
2363	}
2364	task->changing_freeze_state = TRUE;
2365
2366	if (DEFAULT_PAGER_IS_ACTIVE || DEFAULT_FREEZER_IS_ACTIVE) {
2367		task_unlock(task);
2368
2369		kr = vm_map_thaw(task->map);
2370
2371		task_lock(task);
2372
2373		if (kr == KERN_SUCCESS)
2374			task->frozen = FALSE;
2375	} else {
2376		task->frozen = FALSE;
2377		kr = KERN_SUCCESS;
2378	}
2379
2380	task->changing_freeze_state = FALSE;
2381	thread_wakeup(&task->changing_freeze_state);
2382
2383	task_unlock(task);
2384
2385	if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) {
2386		vm_consider_waking_compactor_swapper();
2387	}
2388
2389	return (kr);
2390}
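
/*
 * Illustrative sketch only (hypothetical in-kernel caller, not compiled
 * here): a freezer typically suspends the task before freezing its map and
 * thaws it again around resumption; "budget" below is a hypothetical
 * dirty-page budget chosen by the caller.
 *
 *	uint32_t purgeable, wired, clean, dirty;
 *	boolean_t shared;
 *
 *	(void) task_suspend_internal(task);
 *	if (task_freeze(task, &purgeable, &wired, &clean, &dirty,
 *			budget, &shared, FALSE) == KERN_SUCCESS) {
 *		// ... later, before letting the task run again ...
 *		(void) task_thaw(task);
 *	}
 *	(void) task_resume_internal(task);
 */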
2391
2392#endif /* CONFIG_FREEZE */
2393
2394kern_return_t
2395host_security_set_task_token(
2396        host_security_t  host_security,
2397        task_t		 task,
2398        security_token_t sec_token,
2399	audit_token_t	 audit_token,
2400	host_priv_t	 host_priv)
2401{
2402	ipc_port_t	 host_port;
2403	kern_return_t	 kr;
2404
2405	if (task == TASK_NULL)
2406		return(KERN_INVALID_ARGUMENT);
2407
2408	if (host_security == HOST_NULL)
2409		return(KERN_INVALID_SECURITY);
2410
2411	task_lock(task);
2412	task->sec_token = sec_token;
2413	task->audit_token = audit_token;
2414
2415	task_unlock(task);
2416
2417	if (host_priv != HOST_PRIV_NULL) {
2418		kr = host_get_host_priv_port(host_priv, &host_port);
2419	} else {
2420		kr = host_get_host_port(host_priv_self(), &host_port);
2421	}
2422	assert(kr == KERN_SUCCESS);
2423	kr = task_set_special_port(task, TASK_HOST_PORT, host_port);
2424	return(kr);
2425}
2426
2427kern_return_t
2428task_send_trace_memory(
2429	task_t        target_task,
2430	__unused uint32_t pid,
2431	__unused uint64_t uniqueid)
2432{
2433	kern_return_t kr = KERN_INVALID_ARGUMENT;
2434	if (target_task == TASK_NULL)
2435		return (KERN_INVALID_ARGUMENT);
2436
2437#if CONFIG_ATM
2438	kr = atm_send_proc_inspect_notification(target_task,
2439				  pid,
2440				  uniqueid);
2441
2442#endif
2443	return (kr);
2444}
2445/*
2446 * This routine was added, pretty much exclusively, for registering the
2447 * RPC glue vector for in-kernel short-circuited tasks.  Rather than
2448 * removing it completely, I have only disabled that feature (which was
2449 * the only feature at the time).  It just appears that we are going to
2450 * want to add some user data to tasks in the future (e.g. bsd info,
2451 * task names, etc.), so I left it in the formal task interface.
2452 */
2453kern_return_t
2454task_set_info(
2455	task_t		task,
2456	task_flavor_t	flavor,
2457	__unused task_info_t	task_info_in,		/* pointer to IN array */
2458	__unused mach_msg_type_number_t	task_info_count)
2459{
2460	if (task == TASK_NULL)
2461		return(KERN_INVALID_ARGUMENT);
2462
2463	switch (flavor) {
2464
2465#if CONFIG_ATM
2466		case TASK_TRACE_MEMORY_INFO:
2467		{
2468			if (task_info_count != TASK_TRACE_MEMORY_INFO_COUNT)
2469				return (KERN_INVALID_ARGUMENT);
2470
2471			assert(task_info_in != NULL);
2472			task_trace_memory_info_t mem_info;
2473			mem_info = (task_trace_memory_info_t) task_info_in;
2474			kern_return_t kr = atm_register_trace_memory(task,
2475						mem_info->user_memory_address,
2476						mem_info->buffer_size,
2477						mem_info->mailbox_array_size);
2478			return kr;
2479			break;
2480		}
2481
2482#endif
2483	    default:
2484		return (KERN_INVALID_ARGUMENT);
2485	}
2486	return (KERN_SUCCESS);
2487}
2488
2489kern_return_t
2490task_info(
2491	task_t			task,
2492	task_flavor_t		flavor,
2493	task_info_t		task_info_out,
2494	mach_msg_type_number_t	*task_info_count)
2495{
2496	kern_return_t error = KERN_SUCCESS;
2497
2498	if (task == TASK_NULL)
2499		return (KERN_INVALID_ARGUMENT);
2500
2501	task_lock(task);
2502
2503	if ((task != current_task()) && (!task->active)) {
2504		task_unlock(task);
2505		return (KERN_INVALID_ARGUMENT);
2506	}
2507
2508	switch (flavor) {
2509
2510	case TASK_BASIC_INFO_32:
2511	case TASK_BASIC2_INFO_32:
2512	{
2513		task_basic_info_32_t	basic_info;
2514		vm_map_t				map;
2515		clock_sec_t				secs;
2516		clock_usec_t			usecs;
2517
2518		if (*task_info_count < TASK_BASIC_INFO_32_COUNT) {
2519		    error = KERN_INVALID_ARGUMENT;
2520		    break;
2521		}
2522
2523		basic_info = (task_basic_info_32_t)task_info_out;
2524
2525		map = (task == kernel_task)? kernel_map: task->map;
2526		basic_info->virtual_size = (typeof(basic_info->virtual_size))map->size;
2527		if (flavor == TASK_BASIC2_INFO_32) {
2528			/*
2529			 * The "BASIC2" flavor gets the maximum resident
2530			 * size instead of the current resident size...
2531			 */
2532			basic_info->resident_size = pmap_resident_max(map->pmap);
2533		} else {
2534			basic_info->resident_size = pmap_resident_count(map->pmap);
2535		}
2536		basic_info->resident_size *= PAGE_SIZE;
2537
2538		basic_info->policy = ((task != kernel_task)?
2539										  POLICY_TIMESHARE: POLICY_RR);
2540		basic_info->suspend_count = task->user_stop_count;
2541
2542		absolutetime_to_microtime(task->total_user_time, &secs, &usecs);
2543		basic_info->user_time.seconds =
2544			(typeof(basic_info->user_time.seconds))secs;
2545		basic_info->user_time.microseconds = usecs;
2546
2547		absolutetime_to_microtime(task->total_system_time, &secs, &usecs);
2548		basic_info->system_time.seconds =
2549			(typeof(basic_info->system_time.seconds))secs;
2550		basic_info->system_time.microseconds = usecs;
2551
2552		*task_info_count = TASK_BASIC_INFO_32_COUNT;
2553		break;
2554	}
2555
2556	case TASK_BASIC_INFO_64:
2557	{
2558		task_basic_info_64_t	basic_info;
2559		vm_map_t				map;
2560		clock_sec_t				secs;
2561		clock_usec_t			usecs;
2562
2563		if (*task_info_count < TASK_BASIC_INFO_64_COUNT) {
2564		    error = KERN_INVALID_ARGUMENT;
2565		    break;
2566		}
2567
2568		basic_info = (task_basic_info_64_t)task_info_out;
2569
2570		map = (task == kernel_task)? kernel_map: task->map;
2571		basic_info->virtual_size  = map->size;
2572		basic_info->resident_size =
2573			(mach_vm_size_t)(pmap_resident_count(map->pmap))
2574			* PAGE_SIZE_64;
2575
2576		basic_info->policy = ((task != kernel_task)?
2577										  POLICY_TIMESHARE: POLICY_RR);
2578		basic_info->suspend_count = task->user_stop_count;
2579
2580		absolutetime_to_microtime(task->total_user_time, &secs, &usecs);
2581		basic_info->user_time.seconds =
2582			(typeof(basic_info->user_time.seconds))secs;
2583		basic_info->user_time.microseconds = usecs;
2584
2585		absolutetime_to_microtime(task->total_system_time, &secs, &usecs);
2586		basic_info->system_time.seconds =
2587			(typeof(basic_info->system_time.seconds))secs;
2588		basic_info->system_time.microseconds = usecs;
2589
2590		*task_info_count = TASK_BASIC_INFO_64_COUNT;
2591		break;
2592	}
2593
2594	case MACH_TASK_BASIC_INFO:
2595	{
2596		mach_task_basic_info_t  basic_info;
2597		vm_map_t                map;
2598		clock_sec_t             secs;
2599		clock_usec_t            usecs;
2600
2601		if (*task_info_count < MACH_TASK_BASIC_INFO_COUNT) {
2602		    error = KERN_INVALID_ARGUMENT;
2603		    break;
2604		}
2605
2606		basic_info = (mach_task_basic_info_t)task_info_out;
2607
2608		map = (task == kernel_task) ? kernel_map : task->map;
2609
2610		basic_info->virtual_size  = map->size;
2611
2612		basic_info->resident_size =
2613		    (mach_vm_size_t)(pmap_resident_count(map->pmap));
2614		basic_info->resident_size *= PAGE_SIZE_64;
2615
2616		basic_info->resident_size_max =
2617		    (mach_vm_size_t)(pmap_resident_max(map->pmap));
2618		basic_info->resident_size_max *= PAGE_SIZE_64;
2619
2620		basic_info->policy = ((task != kernel_task) ?
2621		                      POLICY_TIMESHARE : POLICY_RR);
2622
2623		basic_info->suspend_count = task->user_stop_count;
2624
2625		absolutetime_to_microtime(task->total_user_time, &secs, &usecs);
2626		basic_info->user_time.seconds =
2627		    (typeof(basic_info->user_time.seconds))secs;
2628		basic_info->user_time.microseconds = usecs;
2629
2630		absolutetime_to_microtime(task->total_system_time, &secs, &usecs);
2631		basic_info->system_time.seconds =
2632		    (typeof(basic_info->system_time.seconds))secs;
2633		basic_info->system_time.microseconds = usecs;
2634
2635		*task_info_count = MACH_TASK_BASIC_INFO_COUNT;
2636		break;
2637	}
2638
2639	case TASK_THREAD_TIMES_INFO:
2640	{
2641		register task_thread_times_info_t	times_info;
2642		register thread_t					thread;
2643
2644		if (*task_info_count < TASK_THREAD_TIMES_INFO_COUNT) {
2645		    error = KERN_INVALID_ARGUMENT;
2646		    break;
2647		}
2648
2649		times_info = (task_thread_times_info_t) task_info_out;
2650		times_info->user_time.seconds = 0;
2651		times_info->user_time.microseconds = 0;
2652		times_info->system_time.seconds = 0;
2653		times_info->system_time.microseconds = 0;
2654
2655
2656		queue_iterate(&task->threads, thread, thread_t, task_threads) {
2657			time_value_t	user_time, system_time;
2658
2659			if (thread->options & TH_OPT_IDLE_THREAD)
2660				continue;
2661
2662			thread_read_times(thread, &user_time, &system_time);
2663
2664			time_value_add(&times_info->user_time, &user_time);
2665			time_value_add(&times_info->system_time, &system_time);
2666		}
2667
2668		*task_info_count = TASK_THREAD_TIMES_INFO_COUNT;
2669		break;
2670	}
2671
2672	case TASK_ABSOLUTETIME_INFO:
2673	{
2674		task_absolutetime_info_t	info;
2675		register thread_t			thread;
2676
2677		if (*task_info_count < TASK_ABSOLUTETIME_INFO_COUNT) {
2678			error = KERN_INVALID_ARGUMENT;
2679			break;
2680		}
2681
2682		info = (task_absolutetime_info_t)task_info_out;
2683		info->threads_user = info->threads_system = 0;
2684
2685
2686		info->total_user = task->total_user_time;
2687		info->total_system = task->total_system_time;
2688
2689		queue_iterate(&task->threads, thread, thread_t, task_threads) {
2690			uint64_t	tval;
2691			spl_t 		x;
2692
2693			if (thread->options & TH_OPT_IDLE_THREAD)
2694				continue;
2695
2696			x = splsched();
2697			thread_lock(thread);
2698
2699			tval = timer_grab(&thread->user_timer);
2700			info->threads_user += tval;
2701			info->total_user += tval;
2702
2703			tval = timer_grab(&thread->system_timer);
2704			if (thread->precise_user_kernel_time) {
2705				info->threads_system += tval;
2706				info->total_system += tval;
2707			} else {
2708				/* system_timer may represent either sys or user */
2709				info->threads_user += tval;
2710				info->total_user += tval;
2711			}
2712
2713			thread_unlock(thread);
2714			splx(x);
2715		}
2716
2717
2718		*task_info_count = TASK_ABSOLUTETIME_INFO_COUNT;
2719		break;
2720	}
2721
2722	case TASK_DYLD_INFO:
2723	{
2724		task_dyld_info_t info;
2725
2726		/*
2727		 * We added the format field to TASK_DYLD_INFO output.  For
2728		 * temporary backward compatibility, accept the fact that
2729		 * clients may ask for the old version - distinguished by the
2730		 * size of the expected result structure.
2731		 */
2732#define TASK_LEGACY_DYLD_INFO_COUNT \
2733		offsetof(struct task_dyld_info, all_image_info_format)/sizeof(natural_t)
2734
2735		if (*task_info_count < TASK_LEGACY_DYLD_INFO_COUNT) {
2736			error = KERN_INVALID_ARGUMENT;
2737			break;
2738		}
2739
2740		info = (task_dyld_info_t)task_info_out;
2741		info->all_image_info_addr = task->all_image_info_addr;
2742		info->all_image_info_size = task->all_image_info_size;
2743
2744		/* only set format on output for those expecting it */
2745		if (*task_info_count >= TASK_DYLD_INFO_COUNT) {
2746			info->all_image_info_format = task_has_64BitAddr(task) ?
2747				                 TASK_DYLD_ALL_IMAGE_INFO_64 :
2748				                 TASK_DYLD_ALL_IMAGE_INFO_32 ;
2749			*task_info_count = TASK_DYLD_INFO_COUNT;
2750		} else {
2751			*task_info_count = TASK_LEGACY_DYLD_INFO_COUNT;
2752		}
2753		break;
2754	}
2755
2756	case TASK_EXTMOD_INFO:
2757	{
2758		task_extmod_info_t info;
2759		void *p;
2760
2761		if (*task_info_count < TASK_EXTMOD_INFO_COUNT) {
2762			error = KERN_INVALID_ARGUMENT;
2763			break;
2764		}
2765
2766		info = (task_extmod_info_t)task_info_out;
2767
2768		p = get_bsdtask_info(task);
2769		if (p) {
2770			proc_getexecutableuuid(p, info->task_uuid, sizeof(info->task_uuid));
2771		} else {
2772			bzero(info->task_uuid, sizeof(info->task_uuid));
2773		}
2774		info->extmod_statistics = task->extmod_statistics;
2775		*task_info_count = TASK_EXTMOD_INFO_COUNT;
2776
2777		break;
2778	}
2779
2780	case TASK_KERNELMEMORY_INFO:
2781	{
2782		task_kernelmemory_info_t	tkm_info;
2783		ledger_amount_t			credit, debit;
2784
2785		if (*task_info_count < TASK_KERNELMEMORY_INFO_COUNT) {
2786		   error = KERN_INVALID_ARGUMENT;
2787		   break;
2788		}
2789
2790		tkm_info = (task_kernelmemory_info_t) task_info_out;
2791		tkm_info->total_palloc = 0;
2792		tkm_info->total_pfree = 0;
2793		tkm_info->total_salloc = 0;
2794		tkm_info->total_sfree = 0;
2795
2796		if (task == kernel_task) {
2797			/*
2798			 * All shared allocs/frees from other tasks count against
2799			 * the kernel private memory usage.  If we are looking up
2800			 * info for the kernel task, gather from everywhere.
2801			 */
2802			task_unlock(task);
2803
2804			/* start by accounting for all the terminated tasks against the kernel */
2805			tkm_info->total_palloc = tasks_tkm_private.alloc + tasks_tkm_shared.alloc;
2806			tkm_info->total_pfree = tasks_tkm_private.free + tasks_tkm_shared.free;
2807
2808			/* count all other task/thread shared alloc/free against the kernel */
2809			lck_mtx_lock(&tasks_threads_lock);
2810
2811			/* XXX this really shouldn't be using the function parameter 'task' as a local var! */
2812			queue_iterate(&tasks, task, task_t, tasks) {
2813				if (task == kernel_task) {
2814					if (ledger_get_entries(task->ledger,
2815					    task_ledgers.tkm_private, &credit,
2816					    &debit) == KERN_SUCCESS) {
2817						tkm_info->total_palloc += credit;
2818						tkm_info->total_pfree += debit;
2819					}
2820				}
2821				if (!ledger_get_entries(task->ledger,
2822				    task_ledgers.tkm_shared, &credit, &debit)) {
2823					tkm_info->total_palloc += credit;
2824					tkm_info->total_pfree += debit;
2825				}
2826			}
2827			lck_mtx_unlock(&tasks_threads_lock);
2828		} else {
2829			if (!ledger_get_entries(task->ledger,
2830			    task_ledgers.tkm_private, &credit, &debit)) {
2831				tkm_info->total_palloc = credit;
2832				tkm_info->total_pfree = debit;
2833			}
2834			if (!ledger_get_entries(task->ledger,
2835			    task_ledgers.tkm_shared, &credit, &debit)) {
2836				tkm_info->total_salloc = credit;
2837				tkm_info->total_sfree = debit;
2838			}
2839			task_unlock(task);
2840		}
2841
2842		*task_info_count = TASK_KERNELMEMORY_INFO_COUNT;
2843		return KERN_SUCCESS;
2844	}
2845
2846	/* OBSOLETE */
2847	case TASK_SCHED_FIFO_INFO:
2848	{
2849
2850		if (*task_info_count < POLICY_FIFO_BASE_COUNT) {
2851			error = KERN_INVALID_ARGUMENT;
2852			break;
2853		}
2854
2855		error = KERN_INVALID_POLICY;
2856		break;
2857	}
2858
2859	/* OBSOLETE */
2860	case TASK_SCHED_RR_INFO:
2861	{
2862		register policy_rr_base_t	rr_base;
2863		uint32_t quantum_time;
2864		uint64_t quantum_ns;
2865
2866		if (*task_info_count < POLICY_RR_BASE_COUNT) {
2867			error = KERN_INVALID_ARGUMENT;
2868			break;
2869		}
2870
2871		rr_base = (policy_rr_base_t) task_info_out;
2872
2873		if (task != kernel_task) {
2874			error = KERN_INVALID_POLICY;
2875			break;
2876		}
2877
2878		rr_base->base_priority = task->priority;
2879
2880		quantum_time = SCHED(initial_quantum_size)(THREAD_NULL);
2881		absolutetime_to_nanoseconds(quantum_time, &quantum_ns);
2882
2883		rr_base->quantum = (uint32_t)(quantum_ns / 1000 / 1000);
2884
2885		*task_info_count = POLICY_RR_BASE_COUNT;
2886		break;
2887	}
2888
2889	/* OBSOLETE */
2890	case TASK_SCHED_TIMESHARE_INFO:
2891	{
2892		register policy_timeshare_base_t	ts_base;
2893
2894		if (*task_info_count < POLICY_TIMESHARE_BASE_COUNT) {
2895			error = KERN_INVALID_ARGUMENT;
2896			break;
2897		}
2898
2899		ts_base = (policy_timeshare_base_t) task_info_out;
2900
2901		if (task == kernel_task) {
2902			error = KERN_INVALID_POLICY;
2903			break;
2904		}
2905
2906		ts_base->base_priority = task->priority;
2907
2908		*task_info_count = POLICY_TIMESHARE_BASE_COUNT;
2909		break;
2910	}
2911
2912	case TASK_SECURITY_TOKEN:
2913	{
2914		register security_token_t	*sec_token_p;
2915
2916		if (*task_info_count < TASK_SECURITY_TOKEN_COUNT) {
2917		    error = KERN_INVALID_ARGUMENT;
2918		    break;
2919		}
2920
2921		sec_token_p = (security_token_t *) task_info_out;
2922
2923		*sec_token_p = task->sec_token;
2924
2925		*task_info_count = TASK_SECURITY_TOKEN_COUNT;
2926		break;
2927	}
2928
2929	case TASK_AUDIT_TOKEN:
2930	{
2931		register audit_token_t	*audit_token_p;
2932
2933		if (*task_info_count < TASK_AUDIT_TOKEN_COUNT) {
2934		    error = KERN_INVALID_ARGUMENT;
2935		    break;
2936		}
2937
2938		audit_token_p = (audit_token_t *) task_info_out;
2939
2940		*audit_token_p = task->audit_token;
2941
2942		*task_info_count = TASK_AUDIT_TOKEN_COUNT;
2943		break;
2944	}
2945
2946	case TASK_SCHED_INFO:
2947		error = KERN_INVALID_ARGUMENT;
2948		break;
2949
2950	case TASK_EVENTS_INFO:
2951	{
2952		register task_events_info_t	events_info;
2953		register thread_t			thread;
2954
2955		if (*task_info_count < TASK_EVENTS_INFO_COUNT) {
2956		   error = KERN_INVALID_ARGUMENT;
2957		   break;
2958		}
2959
2960		events_info = (task_events_info_t) task_info_out;
2961
2962
2963		events_info->faults = task->faults;
2964		events_info->pageins = task->pageins;
2965		events_info->cow_faults = task->cow_faults;
2966		events_info->messages_sent = task->messages_sent;
2967		events_info->messages_received = task->messages_received;
2968		events_info->syscalls_mach = task->syscalls_mach;
2969		events_info->syscalls_unix = task->syscalls_unix;
2970
2971		events_info->csw = task->c_switch;
2972
2973		queue_iterate(&task->threads, thread, thread_t, task_threads) {
2974			events_info->csw	   += thread->c_switch;
2975			events_info->syscalls_mach += thread->syscalls_mach;
2976			events_info->syscalls_unix += thread->syscalls_unix;
2977		}
2978
2979
2980		*task_info_count = TASK_EVENTS_INFO_COUNT;
2981		break;
2982	}
2983	case TASK_AFFINITY_TAG_INFO:
2984	{
2985		if (*task_info_count < TASK_AFFINITY_TAG_INFO_COUNT) {
2986		    error = KERN_INVALID_ARGUMENT;
2987		    break;
2988		}
2989
2990		error = task_affinity_info(task, task_info_out, task_info_count);
2991		break;
2992	}
2993	case TASK_POWER_INFO:
2994	{
2995		if (*task_info_count < TASK_POWER_INFO_COUNT) {
2996			error = KERN_INVALID_ARGUMENT;
2997			break;
2998		}
2999
3000		task_power_info_locked(task, (task_power_info_t)task_info_out, NULL);
3001		break;
3002	}
3003
3004	case TASK_POWER_INFO_V2:
3005	{
3006		if (*task_info_count < TASK_POWER_INFO_V2_COUNT) {
3007			error = KERN_INVALID_ARGUMENT;
3008			break;
3009		}
3010		task_power_info_v2_t tpiv2 = (task_power_info_v2_t) task_info_out;
3011		task_power_info_locked(task, &tpiv2->cpu_energy, &tpiv2->gpu_energy);
3012		break;
3013	}
3014
3015	case TASK_VM_INFO:
3016	case TASK_VM_INFO_PURGEABLE:
3017	{
3018		task_vm_info_t		vm_info;
3019		vm_map_t		map;
3020
3021		if (*task_info_count < TASK_VM_INFO_COUNT) {
3022		    error = KERN_INVALID_ARGUMENT;
3023		    break;
3024		}
3025
3026		vm_info = (task_vm_info_t)task_info_out;
3027
3028		if (task == kernel_task) {
3029			map = kernel_map;
3030			/* no lock */
3031		} else {
3032			map = task->map;
3033			vm_map_lock_read(map);
3034		}
3035
3036		vm_info->virtual_size = (typeof(vm_info->virtual_size))map->size;
3037		vm_info->region_count = map->hdr.nentries;
3038		vm_info->page_size = vm_map_page_size(map);
3039
3040		vm_info->resident_size = pmap_resident_count(map->pmap);
3041		vm_info->resident_size *= PAGE_SIZE;
3042		vm_info->resident_size_peak = pmap_resident_max(map->pmap);
3043		vm_info->resident_size_peak *= PAGE_SIZE;
3044
3045#define _VM_INFO(_name) \
3046	vm_info->_name = ((mach_vm_size_t) map->pmap->stats._name) * PAGE_SIZE
3047
3048		_VM_INFO(device);
3049		_VM_INFO(device_peak);
3050		_VM_INFO(external);
3051		_VM_INFO(external_peak);
3052		_VM_INFO(internal);
3053		_VM_INFO(internal_peak);
3054		_VM_INFO(reusable);
3055		_VM_INFO(reusable_peak);
3056		_VM_INFO(compressed);
3057		_VM_INFO(compressed_peak);
3058		_VM_INFO(compressed_lifetime);
3059
3060		vm_info->purgeable_volatile_pmap = 0;
3061		vm_info->purgeable_volatile_resident = 0;
3062		vm_info->purgeable_volatile_virtual = 0;
3063		if (task == kernel_task) {
3064			/*
3065			 * We do not maintain the detailed stats for the
3066			 * kernel_pmap, so just count everything as
3067			 * "internal"...
3068			 */
3069			vm_info->internal = vm_info->resident_size;
3070			/*
3071			 * ... but since the memory held by the VM compressor
3072			 * in the kernel address space ought to be attributed
3073			 * to user-space tasks, we subtract it from "internal"
3074			 * to give memory reporting tools a more accurate idea
3075			 * of what the kernel itself is actually using, instead
3076			 * of making it look like the kernel is leaking memory
3077			 * when the system is under memory pressure.
3078			 */
3079			vm_info->internal -= (VM_PAGE_COMPRESSOR_COUNT *
3080					      PAGE_SIZE);
3081		} else {
3082			mach_vm_size_t	volatile_virtual_size;
3083			mach_vm_size_t	volatile_resident_size;
3084			mach_vm_size_t	volatile_pmap_size;
3085			kern_return_t	kr;
3086
3087			if (flavor == TASK_VM_INFO_PURGEABLE) {
3088				kr = vm_map_query_volatile(
3089					map,
3090					&volatile_virtual_size,
3091					&volatile_resident_size,
3092					&volatile_pmap_size);
3093				if (kr == KERN_SUCCESS) {
3094					vm_info->purgeable_volatile_pmap =
3095						volatile_pmap_size;
3096					vm_info->purgeable_volatile_resident =
3097						volatile_resident_size;
3098					vm_info->purgeable_volatile_virtual =
3099						volatile_virtual_size;
3100				}
3101			}
3102			vm_map_unlock_read(map);
3103		}
3104
3105		*task_info_count = TASK_VM_INFO_COUNT;
3106		break;
3107	}
3108
3109	case TASK_WAIT_STATE_INFO:
3110	{
3111		/*
3112		 * Deprecated flavor. Currently allowing some results until all users
3113		 * stop calling it. The results may not be accurate.
3114		 */
3115		task_wait_state_info_t	wait_state_info;
3116		uint64_t total_sfi_ledger_val = 0;
3117
3118		if (*task_info_count < TASK_WAIT_STATE_INFO_COUNT) {
3119		   error = KERN_INVALID_ARGUMENT;
3120		   break;
3121		}
3122
3123		wait_state_info = (task_wait_state_info_t) task_info_out;
3124
3125		wait_state_info->total_wait_state_time = 0;
3126		bzero(wait_state_info->_reserved, sizeof(wait_state_info->_reserved));
3127
3128		int i, prev_lentry = -1;
3129		int64_t  val_credit, val_debit;
3130
3131		for (i = 0; i < MAX_SFI_CLASS_ID; i++){
3132			val_credit = 0;
3133			/*
3134			 * checking with prev_lentry != entry ensures adjacent classes
3135			 * which share the same ledger do not add wait times twice.
3136			 * Note: Use ledger() call to get data for each individual sfi class.
3137			 */
3138			if (prev_lentry != task_ledgers.sfi_wait_times[i] &&
3139				KERN_SUCCESS == ledger_get_entries(task->ledger,
3140				                task_ledgers.sfi_wait_times[i], &val_credit, &val_debit)) {
3141				total_sfi_ledger_val += val_credit;
3142			}
3143			prev_lentry = task_ledgers.sfi_wait_times[i];
3144		}
3145
3146		wait_state_info->total_wait_sfi_state_time = total_sfi_ledger_val;
3147		*task_info_count = TASK_WAIT_STATE_INFO_COUNT;
3148
3149		break;
3150	}
3151
3152	default:
3153		error = KERN_INVALID_ARGUMENT;
3154	}
3155
3156	task_unlock(task);
3157	return (error);
3158}
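
/*
 * Illustrative sketch only (hypothetical user-space caller, not compiled as
 * part of this file): the count argument is in/out, so it must be primed
 * with the capacity of the buffer being passed in.
 *
 *	mach_task_basic_info_data_t info;
 *	mach_msg_type_number_t count = MACH_TASK_BASIC_INFO_COUNT;
 *
 *	if (task_info(mach_task_self(), MACH_TASK_BASIC_INFO,
 *		      (task_info_t)&info, &count) == KERN_SUCCESS) {
 *		// info.resident_size and info.virtual_size are reported in bytes
 *	}
 */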
3159
3160/*
3161 *	task_power_info
3162 *
3163 *	Returns power stats for the task.
3164 *	Note: Called with task locked.
3165 */
3166void
3167task_power_info_locked(
3168	task_t			task,
3169	task_power_info_t	info,
3170	gpu_energy_data_t	ginfo)
3171{
3172	thread_t		thread;
3173	ledger_amount_t		tmp;
3174
3175	task_lock_assert_owned(task);
3176
3177	ledger_get_entries(task->ledger, task_ledgers.interrupt_wakeups,
3178		(ledger_amount_t *)&info->task_interrupt_wakeups, &tmp);
3179	ledger_get_entries(task->ledger, task_ledgers.platform_idle_wakeups,
3180		(ledger_amount_t *)&info->task_platform_idle_wakeups, &tmp);
3181
3182	info->task_timer_wakeups_bin_1 = task->task_timer_wakeups_bin_1;
3183	info->task_timer_wakeups_bin_2 = task->task_timer_wakeups_bin_2;
3184
3185	info->total_user = task->total_user_time;
3186	info->total_system = task->total_system_time;
3187
3188	if (ginfo) {
3189		ginfo->task_gpu_utilisation = task->task_gpu_ns;
3190	}
3191
3192	queue_iterate(&task->threads, thread, thread_t, task_threads) {
3193		uint64_t	tval;
3194		spl_t 		x;
3195
3196		if (thread->options & TH_OPT_IDLE_THREAD)
3197			continue;
3198
3199		x = splsched();
3200		thread_lock(thread);
3201
3202		info->task_timer_wakeups_bin_1 += thread->thread_timer_wakeups_bin_1;
3203		info->task_timer_wakeups_bin_2 += thread->thread_timer_wakeups_bin_2;
3204
3205		tval = timer_grab(&thread->user_timer);
3206		info->total_user += tval;
3207
3208		tval = timer_grab(&thread->system_timer);
3209		if (thread->precise_user_kernel_time) {
3210			info->total_system += tval;
3211		} else {
3212			/* system_timer may represent either sys or user */
3213			info->total_user += tval;
3214		}
3215
3216		if (ginfo) {
3217			ginfo->task_gpu_utilisation += ml_gpu_stat(thread);
3218		}
3219		thread_unlock(thread);
3220		splx(x);
3221	}
3222}
3223
3224/*
3225 *	task_gpu_utilisation
3226 *
3227 *	Returns the total GPU time used by all the threads of the task
3228 *	(both dead and alive).
3229 */
3230uint64_t
3231task_gpu_utilisation(
3232	task_t	task)
3233{
3234	uint64_t gpu_time = 0;
3235	thread_t thread;
3236
3237	task_lock(task);
3238	gpu_time += task->task_gpu_ns;
3239
3240	queue_iterate(&task->threads, thread, thread_t, task_threads) {
3241		spl_t x;
3242		x = splsched();
3243		thread_lock(thread);
3244		gpu_time += ml_gpu_stat(thread);
3245		thread_unlock(thread);
3246		splx(x);
3247	}
3248
3249	task_unlock(task);
3250	return gpu_time;
3251}
3252
3253kern_return_t
3254task_purgable_info(
3255	task_t			task,
3256	task_purgable_info_t	*stats)
3257{
3258	if (task == TASK_NULL || stats == NULL)
3259		return KERN_INVALID_ARGUMENT;
3260	/* Take task reference */
3261	task_reference(task);
3262	vm_purgeable_stats((vm_purgeable_info_t)stats, task);
3263	/* Drop task reference */
3264	task_deallocate(task);
3265	return KERN_SUCCESS;
3266}
3267
3268void
3269task_vtimer_set(
3270	task_t		task,
3271	integer_t	which)
3272{
3273	thread_t	thread;
3274	spl_t		x;
3275
3276	/* assert(task == current_task()); */ /* bogus assert 4803227 4807483 */
3277
3278	task_lock(task);
3279
3280	task->vtimers |= which;
3281
3282	switch (which) {
3283
3284	case TASK_VTIMER_USER:
3285		queue_iterate(&task->threads, thread, thread_t, task_threads) {
3286			x = splsched();
3287			thread_lock(thread);
3288			if (thread->precise_user_kernel_time)
3289				thread->vtimer_user_save = timer_grab(&thread->user_timer);
3290			else
3291				thread->vtimer_user_save = timer_grab(&thread->system_timer);
3292			thread_unlock(thread);
3293			splx(x);
3294		}
3295		break;
3296
3297	case TASK_VTIMER_PROF:
3298		queue_iterate(&task->threads, thread, thread_t, task_threads) {
3299			x = splsched();
3300			thread_lock(thread);
3301			thread->vtimer_prof_save = timer_grab(&thread->user_timer);
3302			thread->vtimer_prof_save += timer_grab(&thread->system_timer);
3303			thread_unlock(thread);
3304			splx(x);
3305		}
3306		break;
3307
3308	case TASK_VTIMER_RLIM:
3309		queue_iterate(&task->threads, thread, thread_t, task_threads) {
3310			x = splsched();
3311			thread_lock(thread);
3312			thread->vtimer_rlim_save = timer_grab(&thread->user_timer);
3313			thread->vtimer_rlim_save += timer_grab(&thread->system_timer);
3314			thread_unlock(thread);
3315			splx(x);
3316		}
3317		break;
3318	}
3319
3320	task_unlock(task);
3321}
3322
3323void
3324task_vtimer_clear(
3325	task_t		task,
3326	integer_t	which)
3327{
3328	assert(task == current_task());
3329
3330	task_lock(task);
3331
3332	task->vtimers &= ~which;
3333
3334	task_unlock(task);
3335}
3336
3337void
3338task_vtimer_update(
3339__unused
3340	task_t		task,
3341	integer_t	which,
3342	uint32_t	*microsecs)
3343{
3344	thread_t	thread = current_thread();
3345	uint32_t	tdelt;
3346	clock_sec_t	secs;
3347	uint64_t	tsum;
3348
3349	assert(task == current_task());
3350
3351	assert(task->vtimers & which);
3352
3353	secs = tdelt = 0;
3354
3355	switch (which) {
3356
3357	case TASK_VTIMER_USER:
3358		if (thread->precise_user_kernel_time) {
3359			tdelt = (uint32_t)timer_delta(&thread->user_timer,
3360								&thread->vtimer_user_save);
3361		} else {
3362			tdelt = (uint32_t)timer_delta(&thread->system_timer,
3363								&thread->vtimer_user_save);
3364		}
3365		absolutetime_to_microtime(tdelt, &secs, microsecs);
3366		break;
3367
3368	case TASK_VTIMER_PROF:
3369		tsum = timer_grab(&thread->user_timer);
3370		tsum += timer_grab(&thread->system_timer);
3371		tdelt = (uint32_t)(tsum - thread->vtimer_prof_save);
3372		absolutetime_to_microtime(tdelt, &secs, microsecs);
3373		/* if the time delta is smaller than a usec, ignore */
3374		if (*microsecs != 0)
3375			thread->vtimer_prof_save = tsum;
3376		break;
3377
3378	case TASK_VTIMER_RLIM:
3379		tsum = timer_grab(&thread->user_timer);
3380		tsum += timer_grab(&thread->system_timer);
3381		tdelt = (uint32_t)(tsum - thread->vtimer_rlim_save);
3382		thread->vtimer_rlim_save = tsum;
3383		absolutetime_to_microtime(tdelt, &secs, microsecs);
3384		break;
3385	}
3386
3387}
3388
3389/*
3390 *	task_assign:
3391 *
3392 *	Change the assigned processor set for the task
3393 */
3394kern_return_t
3395task_assign(
3396	__unused task_t		task,
3397	__unused processor_set_t	new_pset,
3398	__unused boolean_t	assign_threads)
3399{
3400	return(KERN_FAILURE);
3401}
3402
3403/*
3404 *	task_assign_default:
3405 *
3406 *	Version of task_assign to assign to default processor set.
3407 */
3408kern_return_t
3409task_assign_default(
3410	task_t		task,
3411	boolean_t	assign_threads)
3412{
3413	return (task_assign(task, &pset0, assign_threads));
3414}
3415
3416/*
3417 *	task_get_assignment
3418 *
3419 *	Return name of processor set that task is assigned to.
3420 */
3421kern_return_t
3422task_get_assignment(
3423	task_t		task,
3424	processor_set_t	*pset)
3425{
3426	if (!task->active)
3427		return(KERN_FAILURE);
3428
3429	*pset = &pset0;
3430
3431	return (KERN_SUCCESS);
3432}
3433
3434
3435/*
3436 * 	task_policy
3437 *
3438 *	Set scheduling policy and parameters, both base and limit, for
3439 *	the given task. Policy must be a policy which is enabled for the
3440 *	processor set. Change contained threads if requested.
3441 */
3442kern_return_t
3443task_policy(
3444	__unused task_t			task,
3445	__unused policy_t			policy_id,
3446	__unused policy_base_t		base,
3447	__unused mach_msg_type_number_t	count,
3448	__unused boolean_t			set_limit,
3449	__unused boolean_t			change)
3450{
3451	return(KERN_FAILURE);
3452}
3453
3454/*
3455 *	task_set_policy
3456 *
3457 *	Set scheduling policy and parameters, both base and limit, for
3458 *	the given task. Policy can be any policy implemented by the
3459 *	processor set, whether enabled or not. Change contained threads
3460 *	if requested.
3461 */
3462kern_return_t
3463task_set_policy(
3464	__unused task_t			task,
3465	__unused processor_set_t		pset,
3466	__unused policy_t			policy_id,
3467	__unused policy_base_t		base,
3468	__unused mach_msg_type_number_t	base_count,
3469	__unused policy_limit_t		limit,
3470	__unused mach_msg_type_number_t	limit_count,
3471	__unused boolean_t			change)
3472{
3473	return(KERN_FAILURE);
3474}
3475
3476kern_return_t
3477task_set_ras_pc(
3478 	__unused task_t	task,
3479 	__unused vm_offset_t	pc,
3480 	__unused vm_offset_t	endpc)
3481{
3482	return KERN_FAILURE;
3483}
3484
3485void
3486task_synchronizer_destroy_all(task_t task)
3487{
3488	semaphore_t	semaphore;
3489
3490	/*
3491	 *  Destroy owned semaphores
3492	 */
3493
3494	while (!queue_empty(&task->semaphore_list)) {
3495		semaphore = (semaphore_t) queue_first(&task->semaphore_list);
3496		(void) semaphore_destroy(task, semaphore);
3497	}
3498}
3499
3500/*
3501 * Install default (machine-dependent) initial thread state
3502 * on the task.  Subsequent thread creation will have this initial
3503 * state set on the thread by machine_thread_inherit_taskwide().
3504 * Flavors and structures are exactly the same as those to thread_set_state()
3505 */
3506kern_return_t
3507task_set_state(
3508	task_t task,
3509	int flavor,
3510	thread_state_t state,
3511	mach_msg_type_number_t state_count)
3512{
3513	kern_return_t ret;
3514
3515	if (task == TASK_NULL) {
3516		return (KERN_INVALID_ARGUMENT);
3517	}
3518
3519	task_lock(task);
3520
3521	if (!task->active) {
3522		task_unlock(task);
3523		return (KERN_FAILURE);
3524	}
3525
3526	ret = machine_task_set_state(task, flavor, state, state_count);
3527
3528	task_unlock(task);
3529	return ret;
3530}
3531
3532/*
3533 * Examine the default (machine-dependent) initial thread state
3534 * on the task, as set by task_set_state().  Flavors and structures
3535 * are exactly the same as those passed to thread_get_state().
3536 */
3537kern_return_t
3538task_get_state(
3539	task_t 	task,
3540	int	flavor,
3541	thread_state_t state,
3542	mach_msg_type_number_t *state_count)
3543{
3544	kern_return_t ret;
3545
3546	if (task == TASK_NULL) {
3547		return (KERN_INVALID_ARGUMENT);
3548	}
3549
3550	task_lock(task);
3551
3552	if (!task->active) {
3553		task_unlock(task);
3554		return (KERN_FAILURE);
3555	}
3556
3557	ret = machine_task_get_state(task, flavor, state, state_count);
3558
3559	task_unlock(task);
3560	return ret;
3561}
3562
3563#if CONFIG_JETSAM
3564#define HWM_USERCORE_MINSPACE 250 // free space (in MB) required *after* core file creation
3565
3566void __attribute__((noinline))
3567THIS_PROCESS_CROSSED_HIGH_WATERMARK__SENDING_EXC_RESOURCE(int max_footprint_mb)
3568{
3569	task_t						task 		= current_task();
3570	int							pid         = 0;
3571	char        				*procname 	= (char *) "unknown";
3572	mach_exception_data_type_t	code[EXCEPTION_CODE_MAX];
3573
3574#ifdef MACH_BSD
3575	pid = proc_selfpid();
3576
3577	if (pid == 1) {
3578		/*
3579		 * Cannot have ReportCrash analyzing
3580		 * a suspended initproc.
3581		 */
3582		return;
3583	}
3584
3585	if (task->bsd_info != NULL)
3586		procname = proc_name_address(current_task()->bsd_info);
3587#endif
3588
3589	if (hwm_user_cores) {
3590		int				error;
3591		uint64_t		starttime, end;
3592		clock_sec_t		secs = 0;
3593		uint32_t		microsecs = 0;
3594
3595		starttime = mach_absolute_time();
3596		/*
3597		 * Trigger a coredump of this process. Don't proceed unless we know we won't
3598		 * be filling up the disk; and ignore the core size resource limit for this
3599		 * core file.
3600		 */
3601		if ((error = coredump(current_task()->bsd_info, HWM_USERCORE_MINSPACE, 1)) != 0) {
3602			printf("couldn't take coredump of %s[%d]: %d\n", procname, pid, error);
3603		}
3604		/*
3605		 * coredump() leaves the task suspended.
3606		 */
3607		task_resume_internal(current_task());
3608
3609		end = mach_absolute_time();
3610		absolutetime_to_microtime(end - starttime, &secs, &microsecs);
3611		printf("coredump of %s[%d] taken in %d secs %d microsecs\n",
3612		       proc_name_address(current_task()->bsd_info), pid, (int)secs, microsecs);
3613	}
3614
3615	if (disable_exc_resource) {
3616		printf("process %s[%d] crossed memory high watermark (%d MB); EXC_RESOURCE "
3617			"suppressed by a boot-arg.\n", procname, pid, max_footprint_mb);
3618		return;
3619	}
3620
3621	printf("process %s[%d] crossed memory high watermark (%d MB); sending "
3622		"EXC_RESOURCE.\n", procname, pid, max_footprint_mb);
3623
3624	code[0] = code[1] = 0;
3625	EXC_RESOURCE_ENCODE_TYPE(code[0], RESOURCE_TYPE_MEMORY);
3626	EXC_RESOURCE_ENCODE_FLAVOR(code[0], FLAVOR_HIGH_WATERMARK);
3627	EXC_RESOURCE_HWM_ENCODE_LIMIT(code[0], max_footprint_mb);
3628
3629	/*
3630	 * Use the _internal_ variant so that no user-space
3631	 * process can resume our task from under us.
3632	 */
3633	task_suspend_internal(task);
3634	exception_triage(EXC_RESOURCE, code, EXCEPTION_CODE_MAX);
3635	task_resume_internal(task);
3636}
3637
3638/*
3639 * Callback invoked when a task exceeds its physical footprint limit.
3640 */
3641void
3642task_footprint_exceeded(int warning, __unused const void *param0, __unused const void *param1)
3643{
3644	ledger_amount_t max_footprint, max_footprint_mb;
3645	ledger_amount_t footprint_after_purge;
3646	task_t task;
3647
3648	if (warning == LEDGER_WARNING_DIPPED_BELOW) {
3649		/*
3650		 * Task memory limits only provide a warning on the way up.
3651		 */
3652		return;
3653	}
3654
3655	task = current_task();
3656
3657	ledger_get_limit(task->ledger, task_ledgers.phys_footprint, &max_footprint);
3658	max_footprint_mb = max_footprint >> 20;
3659
3660	/*
3661	 * Try and purge all "volatile" memory in that task first.
3662	 */
3663	(void) task_purge_volatile_memory(task);
3664	/* are we still over the limit ? */
3665	ledger_get_balance(task->ledger,
3666			   task_ledgers.phys_footprint,
3667			   &footprint_after_purge);
3668	if ((!warning &&
3669	     footprint_after_purge <= max_footprint) ||
3670	    (warning &&
3671	     footprint_after_purge <= ((max_footprint *
3672					PHYS_FOOTPRINT_WARNING_LEVEL) / 100))) {
3673		/* all better now */
3674		ledger_reset_callback_state(task->ledger,
3675					    task_ledgers.phys_footprint);
3676		return;
3677	}
3678	/* still over the limit after purging... */
3679
3680	/*
3681	 * If this an actual violation (not a warning),
3682	 * generate a non-fatal high watermark EXC_RESOURCE.
3683	 */
3684	if ((warning == 0) && (task->rusage_cpu_flags & TASK_RUSECPU_FLAGS_PHYS_FOOTPRINT_EXCEPTION)) {
3685		THIS_PROCESS_CROSSED_HIGH_WATERMARK__SENDING_EXC_RESOURCE((int)max_footprint_mb);
3686	}
3687
3688	memorystatus_on_ledger_footprint_exceeded((warning == LEDGER_WARNING_ROSE_ABOVE) ? TRUE : FALSE,
3689		(int)max_footprint_mb);
3690}
3691
3692extern int proc_check_footprint_priv(void);
3693
3694kern_return_t
3695task_set_phys_footprint_limit(
3696	task_t task,
3697	int new_limit_mb,
3698	int *old_limit_mb)
3699{
3700	kern_return_t error;
3701
3702	if ((error = proc_check_footprint_priv())) {
3703		return (KERN_NO_ACCESS);
3704	}
3705
3706	return task_set_phys_footprint_limit_internal(task, new_limit_mb, old_limit_mb, FALSE);
3707}
3708
3709kern_return_t
3710task_set_phys_footprint_limit_internal(
3711	task_t task,
3712	int new_limit_mb,
3713	int *old_limit_mb,
3714	boolean_t trigger_exception)
3715{
3716	ledger_amount_t	old;
3717
3718	ledger_get_limit(task->ledger, task_ledgers.phys_footprint, &old);
3719
3720	if (old_limit_mb) {
3721		*old_limit_mb = old >> 20;
3722	}
3723
3724	if (new_limit_mb == -1) {
3725		/*
3726		 * Caller wishes to remove the limit.
3727		 */
3728		ledger_set_limit(task->ledger, task_ledgers.phys_footprint,
3729		                 max_task_footprint ? max_task_footprint : LEDGER_LIMIT_INFINITY,
3730		                 max_task_footprint ? PHYS_FOOTPRINT_WARNING_LEVEL : 0);
3731		return (KERN_SUCCESS);
3732	}
3733
3734#ifdef CONFIG_NOMONITORS
3735	return (KERN_SUCCESS);
3736#endif /* CONFIG_NOMONITORS */
3737
3738	task_lock(task);
3739
3740	if (trigger_exception) {
3741		task->rusage_cpu_flags |= TASK_RUSECPU_FLAGS_PHYS_FOOTPRINT_EXCEPTION;
3742	} else {
3743		task->rusage_cpu_flags &= ~TASK_RUSECPU_FLAGS_PHYS_FOOTPRINT_EXCEPTION;
3744	}
3745
3746	ledger_set_limit(task->ledger, task_ledgers.phys_footprint,
3747		(ledger_amount_t)new_limit_mb << 20, PHYS_FOOTPRINT_WARNING_LEVEL);
3748
3749	task_unlock(task);
3750
3751	return (KERN_SUCCESS);
3752}
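
/*
 * Illustrative sketch only (hypothetical in-kernel caller, not compiled
 * here): limits are passed in megabytes and stored in the ledger in bytes
 * ((ledger_amount_t)new_limit_mb << 20); passing -1 removes the caller's
 * limit and restores the boot-time default, if any.
 *
 *	int old_mb;
 *
 *	// cap the task at 128 MB and ask for a high-watermark EXC_RESOURCE
 *	task_set_phys_footprint_limit_internal(task, 128, &old_mb, TRUE);
 *	// ... later, drop back to the default limit ...
 *	task_set_phys_footprint_limit_internal(task, -1, NULL, FALSE);
 */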
3753
3754kern_return_t
3755task_get_phys_footprint_limit(
3756	task_t task,
3757	int *limit_mb)
3758{
3759	ledger_amount_t	limit;
3760
3761	ledger_get_limit(task->ledger, task_ledgers.phys_footprint, &limit);
3762	*limit_mb = limit >> 20;
3763
3764	return (KERN_SUCCESS);
3765}
3766#else /* CONFIG_JETSAM */
3767kern_return_t
3768task_set_phys_footprint_limit(
3769	__unused task_t task,
3770	__unused int new_limit_mb,
3771	__unused int *old_limit_mb)
3772{
3773	return (KERN_FAILURE);
3774}
3775
3776kern_return_t
3777task_get_phys_footprint_limit(
3778	__unused task_t task,
3779	__unused int *limit_mb)
3780{
3781	return (KERN_FAILURE);
3782}
3783#endif /* CONFIG_JETSAM */
3784
3785/*
3786 * We need to export some functions to other components that
3787 * are currently implemented in macros within the osfmk
3788 * component.  Just export them as functions of the same name.
3789 */
3790boolean_t is_kerneltask(task_t t)
3791{
3792	if (t == kernel_task)
3793		return (TRUE);
3794
3795	return (FALSE);
3796}
3797
3798int
3799check_for_tasksuspend(task_t task)
3800{
3801
3802	if (task == TASK_NULL)
3803		return (0);
3804
3805	return (task->suspend_count > 0);
3806}
3807
3808#undef current_task
3809task_t current_task(void);
3810task_t current_task(void)
3811{
3812	return (current_task_fast());
3813}
3814
3815#undef task_reference
3816void task_reference(task_t task);
3817void
3818task_reference(
3819	task_t		task)
3820{
3821	if (task != TASK_NULL)
3822		task_reference_internal(task);
3823}
3824
3825/*
3826 * This routine is always called with the task lock held.
3827 * It returns a thread handle without taking a reference, because the
3828 * caller operates on it while the task lock is held.
3829 */
3830thread_t
3831task_findtid(task_t task, uint64_t tid)
3832{
3833	thread_t thread= THREAD_NULL;
3834
3835	queue_iterate(&task->threads, thread, thread_t, task_threads) {
3836			if (thread->thread_id == tid)
3837				return(thread);
3838	}
3839	return(THREAD_NULL);
3840}
3841
3842/*
3843 * Control the CPU usage monitor for a task.
3844 */
3845kern_return_t
3846task_cpu_usage_monitor_ctl(task_t task, uint32_t *flags)
3847{
3848	int error = KERN_SUCCESS;
3849
3850	if (*flags & CPUMON_MAKE_FATAL) {
3851		task->rusage_cpu_flags |= TASK_RUSECPU_FLAGS_FATAL_CPUMON;
3852	} else {
3853		error = KERN_INVALID_ARGUMENT;
3854	}
3855
3856	return error;
3857}
3858
3859/*
3860 * Control the wakeups monitor for a task.
3861 */
3862kern_return_t
3863task_wakeups_monitor_ctl(task_t task, uint32_t *flags, int32_t *rate_hz)
3864{
3865	ledger_t ledger = task->ledger;
3866
3867	task_lock(task);
3868	if (*flags & WAKEMON_GET_PARAMS) {
3869		ledger_amount_t	limit;
3870		uint64_t		period;
3871
3872		ledger_get_limit(ledger, task_ledgers.interrupt_wakeups, &limit);
3873		ledger_get_period(ledger, task_ledgers.interrupt_wakeups, &period);
3874
3875		if (limit != LEDGER_LIMIT_INFINITY) {
3876			/*
3877			 * An active limit means the wakeups monitor is enabled.
3878			 */
3879			*rate_hz = (int32_t)(limit / (int64_t)(period / NSEC_PER_SEC));
3880			*flags = WAKEMON_ENABLE;
3881			if (task->rusage_cpu_flags & TASK_RUSECPU_FLAGS_FATAL_WAKEUPSMON) {
3882				*flags |= WAKEMON_MAKE_FATAL;
3883			}
3884		} else {
3885			*flags = WAKEMON_DISABLE;
3886			*rate_hz = -1;
3887		}
3888
3889		/*
3890		 * If WAKEMON_GET_PARAMS is present in flags, all other flags are ignored.
3891		 */
3892		task_unlock(task);
3893		return KERN_SUCCESS;
3894	}
3895
3896	if (*flags & WAKEMON_ENABLE) {
3897		if (*flags & WAKEMON_SET_DEFAULTS) {
3898			*rate_hz = task_wakeups_monitor_rate;
3899		}
3900
3901#ifndef CONFIG_NOMONITORS
3902		if (*flags & WAKEMON_MAKE_FATAL) {
3903			task->rusage_cpu_flags |= TASK_RUSECPU_FLAGS_FATAL_WAKEUPSMON;
3904		}
3905#endif /* CONFIG_NOMONITORS */
3906
3907		if (*rate_hz < 0) {
3908			task_unlock(task);
3909			return KERN_INVALID_ARGUMENT;
3910		}
3911
3912#ifndef CONFIG_NOMONITORS
3913		ledger_set_limit(ledger, task_ledgers.interrupt_wakeups, *rate_hz * task_wakeups_monitor_interval,
3914			task_wakeups_monitor_ustackshots_trigger_pct);
3915		ledger_set_period(ledger, task_ledgers.interrupt_wakeups, task_wakeups_monitor_interval * NSEC_PER_SEC);
3916		ledger_enable_callback(ledger, task_ledgers.interrupt_wakeups);
3917#endif /* CONFIG_NOMONITORS */
3918	} else if (*flags & WAKEMON_DISABLE) {
3919		/*
3920		 * Caller wishes to disable wakeups monitor on the task.
3921		 *
3922		 * Disable telemetry if it was triggered by the wakeups monitor, and
3923		 * remove the limit & callback on the wakeups ledger entry.
3924		 */
3925#if CONFIG_TELEMETRY
		telemetry_task_ctl_locked(task, TF_WAKEMON_WARNING, 0);
3927#endif
3928		ledger_disable_refill(ledger, task_ledgers.interrupt_wakeups);
3929		ledger_disable_callback(ledger, task_ledgers.interrupt_wakeups);
3930	}
3931
3932	task_unlock(task);
3933	return KERN_SUCCESS;
3934}
3935
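/*
 * Callback for the interrupt_wakeups ledger entry.  Crossing the warning
 * level upwards enables task telemetry so micro-stackshots are available;
 * dipping back below the warning level, or hitting the limit itself
 * (warning == 0), disables that telemetry, and hitting the limit also
 * sends EXC_RESOURCE.
 */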
3936void
3937task_wakeups_rate_exceeded(int warning, __unused const void *param0, __unused const void *param1)
3938{
3939	if (warning == LEDGER_WARNING_ROSE_ABOVE) {
3940#if CONFIG_TELEMETRY
3941		/*
3942		 * This task is in danger of violating the wakeups monitor. Enable telemetry on this task
3943		 * so there are micro-stackshots available if and when EXC_RESOURCE is triggered.
3944		 */
3945		telemetry_task_ctl(current_task(), TF_WAKEMON_WARNING, 1);
3946#endif
3947		return;
3948	}
3949
3950#if CONFIG_TELEMETRY
3951	/*
3952	 * If the balance has dipped below the warning level (LEDGER_WARNING_DIPPED_BELOW) or
3953	 * exceeded the limit, turn telemetry off for the task.
3954	 */
3955	telemetry_task_ctl(current_task(), TF_WAKEMON_WARNING, 0);
3956#endif
3957
3958	if (warning == 0) {
3959		THIS_PROCESS_IS_CAUSING_TOO_MANY_WAKEUPS__SENDING_EXC_RESOURCE();
3960	}
3961}
3962
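/*
 * The conspicuous name and the noinline attribute appear intended to make
 * this frame easy to spot in backtraces.  The routine snapshots the
 * wakeups ledger, disables further wakeups-monitor notifications, logs the
 * observed and permitted rates, raises EXC_RESOURCE unless suppressed by
 * boot-arg or active audio playback, and terminates the task if the
 * monitor was configured as fatal.
 */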
3963void __attribute__((noinline))
3964THIS_PROCESS_IS_CAUSING_TOO_MANY_WAKEUPS__SENDING_EXC_RESOURCE(void)
3965{
3966	task_t						task 		= current_task();
3967	int							pid         = 0;
3968	char        				*procname 	= (char *) "unknown";
3969	uint64_t					observed_wakeups_rate;
3970	uint64_t					permitted_wakeups_rate;
3971	uint64_t					observation_interval;
3972	mach_exception_data_type_t	code[EXCEPTION_CODE_MAX];
3973	struct ledger_entry_info	lei;
3974
3975#ifdef MACH_BSD
3976	pid = proc_selfpid();
3977	if (task->bsd_info != NULL)
		procname = proc_name_address(task->bsd_info);
3979#endif
3980
3981	ledger_get_entry_info(task->ledger, task_ledgers.interrupt_wakeups, &lei);
3982
3983	/*
3984	 * Disable the exception notification so we don't overwhelm
3985	 * the listener with an endless stream of redundant exceptions.
3986	 */
3987	uint32_t flags = WAKEMON_DISABLE;
3988	task_wakeups_monitor_ctl(task, &flags, NULL);
3989
3990	observed_wakeups_rate = (lei.lei_balance * (int64_t)NSEC_PER_SEC) / lei.lei_last_refill;
3991	permitted_wakeups_rate = lei.lei_limit / task_wakeups_monitor_interval;
3992	observation_interval = lei.lei_refill_period / NSEC_PER_SEC;
3993
3994	if (disable_exc_resource) {
3995		printf("process %s[%d] caught causing excessive wakeups. EXC_RESOURCE "
		"suppressed by a boot-arg\n", procname, pid);
3997		return;
3998	}
3999	if (audio_active) {
4000		printf("process %s[%d] caught causing excessive wakeups. EXC_RESOURCE "
		       "suppressed due to audio playback\n", procname, pid);
4002		return;
4003	}
4004	printf("process %s[%d] caught causing excessive wakeups. Observed wakeups rate "
4005		"(per sec): %lld; Maximum permitted wakeups rate (per sec): %lld; Observation "
4006		"period: %lld seconds; Task lifetime number of wakeups: %lld\n",
4007		procname, pid, observed_wakeups_rate, permitted_wakeups_rate,
4008		observation_interval, lei.lei_credit);
4009
4010	code[0] = code[1] = 0;
4011	EXC_RESOURCE_ENCODE_TYPE(code[0], RESOURCE_TYPE_WAKEUPS);
4012	EXC_RESOURCE_ENCODE_FLAVOR(code[0], FLAVOR_WAKEUPS_MONITOR);
4013	EXC_RESOURCE_CPUMONITOR_ENCODE_WAKEUPS_PERMITTED(code[0], task_wakeups_monitor_rate);
4014	EXC_RESOURCE_CPUMONITOR_ENCODE_OBSERVATION_INTERVAL(code[0], observation_interval);
4015	EXC_RESOURCE_CPUMONITOR_ENCODE_WAKEUPS_OBSERVED(code[1], lei.lei_balance * (int64_t)NSEC_PER_SEC / lei.lei_last_refill);
4016	exception_triage(EXC_RESOURCE, code, EXCEPTION_CODE_MAX);
4017
4018	if (task->rusage_cpu_flags & TASK_RUSECPU_FLAGS_FATAL_WAKEUPSMON) {
4019		task_terminate_internal(task);
4020	}
4021}
4022
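/*
 * Purge the task's volatile VM objects.  A reference is taken on the
 * task's map under the task lock, and the purge itself runs on that
 * reference after the lock is dropped, so a concurrent task termination
 * cannot free the map out from under the purge.
 */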
4023kern_return_t
4024task_purge_volatile_memory(
4025	task_t	task)
4026{
4027	vm_map_t	map;
4028	int		num_object_purged;
4029
4030	if (task == TASK_NULL)
4031		return KERN_INVALID_TASK;
4032
4033	task_lock(task);
4034
4035	if (!task->active) {
4036		task_unlock(task);
4037		return KERN_INVALID_TASK;
4038	}
4039	map = task->map;
4040	if (map == VM_MAP_NULL) {
4041		task_unlock(task);
4042		return KERN_INVALID_TASK;
4043	}
4044	vm_map_reference(task->map);
4045
4046	task_unlock(task);
4047
4048	num_object_purged = vm_map_purge(map);
4049	vm_map_deallocate(map);
4050
4051	return KERN_SUCCESS;
4052}
4053
4054/* Placeholders for the task set/get voucher interfaces */
4055kern_return_t
4056task_get_mach_voucher(
4057	task_t			task,
4058	mach_voucher_selector_t __unused which,
4059	ipc_voucher_t		*voucher)
4060{
4061	if (TASK_NULL == task)
4062		return KERN_INVALID_TASK;
4063
4064	*voucher = NULL;
4065	return KERN_SUCCESS;
4066}
4067
4068kern_return_t
4069task_set_mach_voucher(
4070	task_t			task,
4071	ipc_voucher_t		__unused voucher)
4072{
4073	if (TASK_NULL == task)
4074		return KERN_INVALID_TASK;
4075
4076	return KERN_SUCCESS;
4077}
4078
4079kern_return_t
4080task_swap_mach_voucher(
4081	task_t			task,
4082	ipc_voucher_t		new_voucher,
4083	ipc_voucher_t		*in_out_old_voucher)
4084{
4085	if (TASK_NULL == task)
4086		return KERN_INVALID_TASK;
4087
4088	*in_out_old_voucher = new_voucher;
4089	return KERN_SUCCESS;
4090}
4091
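/*
 * Set or clear TF_GPU_DENIED under the task lock; task_is_gpu_denied()
 * reads the flag without taking the lock.
 */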
4092void task_set_gpu_denied(task_t task, boolean_t denied)
4093{
4094	task_lock(task);
4095
4096	if (denied) {
4097		task->t_flags |= TF_GPU_DENIED;
4098	} else {
4099		task->t_flags &= ~TF_GPU_DENIED;
4100	}
4101
4102	task_unlock(task);
4103}
4104
4105boolean_t task_is_gpu_denied(task_t task)
4106{
4107	/* We don't need the lock to read this flag */
4108	return (task->t_flags & TF_GPU_DENIED) ? TRUE : FALSE;
4109}
4110