/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 *
 */

#include <kern/sched_prim.h>
#include <kern/kalloc.h>
#include <kern/assert.h>
#include <kern/debug.h>
#include <kern/locks.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/host.h>
#include <libkern/libkern.h>
#include <mach/mach_time.h>
#include <mach/task.h>
#include <mach/host_priv.h>
#include <mach/mach_host.h>
#include <pexpert/pexpert.h>
#include <sys/kern_event.h>
#include <sys/proc.h>
#include <sys/proc_info.h>
#include <sys/signal.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/wait.h>
#include <sys/tree.h>
#include <sys/priv.h>
#include <vm/vm_pageout.h>
#include <vm/vm_protos.h>

#if CONFIG_FREEZE
#include <vm/vm_map.h>
#endif /* CONFIG_FREEZE */

#include <sys/kern_memorystatus.h>

#if CONFIG_JETSAM
/* For logging clarity */
static const char *jetsam_kill_cause_name[] = {
	"",
	"jettisoned",           /* kMemorystatusKilled			*/
	"highwater",            /* kMemorystatusKilledHiwat		*/
	"vnode-limit",          /* kMemorystatusKilledVnodes		*/
	"vm-pageshortage",      /* kMemorystatusKilledVMPageShortage	*/
	"vm-thrashing",         /* kMemorystatusKilledVMThrashing	*/
	"fc-thrashing",         /* kMemorystatusKilledFCThrashing	*/
	"per-process-limit",    /* kMemorystatusKilledPerProcessLimit	*/
	"diagnostic",           /* kMemorystatusKilledDiagnostic	*/
	"idle-exit",            /* kMemorystatusKilledIdleExit		*/
};

/* Does cause indicate vm or fc thrashing? */
static boolean_t
is_thrashing(unsigned cause)
{
	switch (cause) {
	case kMemorystatusKilledVMThrashing:
	case kMemorystatusKilledFCThrashing:
		return TRUE;
	default:
		return FALSE;
	}
}

/* Callback into vm_compressor.c to signal that thrashing has been mitigated. */
extern void vm_thrashing_jetsam_done(void);
#endif

/*
 * These printf()s are very verbose; enable with MEMORYSTATUS_DEBUG_LOG.
 */
#if MEMORYSTATUS_DEBUG_LOG
#define MEMORYSTATUS_DEBUG(cond, format, ...)      \
do {                                              \
	if (cond) { printf(format, ##__VA_ARGS__); } \
} while(0)
#else
#define MEMORYSTATUS_DEBUG(cond, format, ...)
#endif
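
/*
 * Example usage (a sketch; only compiled in when MEMORYSTATUS_DEBUG_LOG is
 * defined). The first argument gates the printf(), so noisy call sites can be
 * left in place but switched off individually:
 *
 *	MEMORYSTATUS_DEBUG(1, "memorystatus: pid %d moved to priority %d\n",
 *	    p->p_pid, priority);
 */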

/* General tunables */

unsigned long delta_percentage = 5;
unsigned long critical_threshold_percentage = 5;
unsigned long idle_offset_percentage = 5;
unsigned long pressure_threshold_percentage = 15;
unsigned long freeze_threshold_percentage = 50;

/* General memorystatus stuff */

struct klist memorystatus_klist;
static lck_mtx_t memorystatus_klist_mutex;

static void memorystatus_klist_lock(void);
static void memorystatus_klist_unlock(void);

static uint64_t memorystatus_idle_delay_time = 0;

/*
 * Memorystatus kevents
 */

static int filt_memorystatusattach(struct knote *kn);
static void filt_memorystatusdetach(struct knote *kn);
static int filt_memorystatus(struct knote *kn, long hint);

struct filterops memorystatus_filtops = {
	.f_attach = filt_memorystatusattach,
	.f_detach = filt_memorystatusdetach,
	.f_event = filt_memorystatus,
};

enum {
	kMemorystatusNoPressure = 0x1,
	kMemorystatusPressure = 0x2,
	kMemorystatusLowSwap = 0x4
};
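
/*
 * These are the KNOTE() hints passed to filt_memorystatus(); user-space
 * consumers see the corresponding NOTE_MEMORYSTATUS_* fflags via the private
 * EVFILT_MEMORYSTATUS filter. A minimal registration sketch (an assumption,
 * not a supported interface; ident is ignored on attach):
 *
 *	struct kevent ev;
 *	int kq = kqueue();
 *	EV_SET(&ev, 0, EVFILT_MEMORYSTATUS, EV_ADD | EV_ENABLE,
 *	    NOTE_MEMORYSTATUS_PRESSURE_WARN, 0, NULL);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);
 */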

/* Idle guard handling */

static int32_t memorystatus_scheduled_idle_demotions = 0;

static thread_call_t memorystatus_idle_demotion_call;

static void memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2);
static void memorystatus_schedule_idle_demotion_locked(proc_t p, boolean_t set_state);
static void memorystatus_invalidate_idle_demotion_locked(proc_t p, boolean_t clear_state);
static void memorystatus_reschedule_idle_demotion_locked(void);

static void memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_insert);

boolean_t is_knote_registered_modify_task_pressure_bits(struct knote*, int, task_t, vm_pressure_level_t, vm_pressure_level_t);
void memorystatus_send_low_swap_note(void);

int memorystatus_wakeup = 0;

unsigned int memorystatus_level = 0;

static int memorystatus_list_count = 0;

#define MEMSTAT_BUCKET_COUNT (JETSAM_PRIORITY_MAX + 1)

typedef struct memstat_bucket {
	TAILQ_HEAD(, proc) list;
	int count;
} memstat_bucket_t;

memstat_bucket_t memstat_bucket[MEMSTAT_BUCKET_COUNT];

uint64_t memstat_idle_demotion_deadline = 0;

static unsigned int memorystatus_dirty_count = 0;


int
memorystatus_get_level(__unused struct proc *p, struct memorystatus_get_level_args *args, __unused int *ret)
{
	user_addr_t	level = 0;

	level = args->level;

	if (copyout(&memorystatus_level, level, sizeof(memorystatus_level)) != 0) {
		return EFAULT;
	}

	return 0;
}

static proc_t memorystatus_get_first_proc_locked(unsigned int *bucket_index, boolean_t search);
static proc_t memorystatus_get_next_proc_locked(unsigned int *bucket_index, proc_t p, boolean_t search);

static void memorystatus_thread(void *param __unused, wait_result_t wr __unused);

/* Jetsam */

#if CONFIG_JETSAM

int proc_get_memstat_priority(proc_t, boolean_t);

/*
 * When 1, processes exceeding their high-water limit are killed under memory
 * pressure; when 0, they are killed as soon as the limit is exceeded.
 */
#define LEGACY_HIWATER 1

static boolean_t memorystatus_idle_snapshot = 0;

static int memorystatus_highwater_enabled = 1;

unsigned int memorystatus_delta = 0;

static unsigned int memorystatus_available_pages_critical_base = 0;
//static unsigned int memorystatus_last_foreground_pressure_pages = (unsigned int)-1;
static unsigned int memorystatus_available_pages_critical_idle_offset = 0;

#if DEVELOPMENT || DEBUG
static unsigned int memorystatus_jetsam_panic_debug = 0;

static unsigned int memorystatus_jetsam_policy = kPolicyDefault;
static unsigned int memorystatus_jetsam_policy_offset_pages_diagnostic = 0;
#endif

static unsigned int memorystatus_thread_wasted_wakeup = 0;

static uint32_t kill_under_pressure_cause = 0;

static memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot;
#define memorystatus_jetsam_snapshot_list memorystatus_jetsam_snapshot->entries

static unsigned int memorystatus_jetsam_snapshot_count = 0;
static unsigned int memorystatus_jetsam_snapshot_max = 0;

static void memorystatus_clear_errors(void);
static void memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages);
static uint32_t memorystatus_build_state(proc_t p);
static void memorystatus_update_levels_locked(boolean_t critical_only);
//static boolean_t memorystatus_issue_pressure_kevent(boolean_t pressured);

static boolean_t memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause);
static boolean_t memorystatus_kill_top_process(boolean_t any, uint32_t cause, int32_t *priority, uint32_t *errors);
#if LEGACY_HIWATER
static boolean_t memorystatus_kill_hiwat_proc(uint32_t *errors);
#endif

static boolean_t memorystatus_kill_process_async(pid_t victim_pid, uint32_t cause);
static boolean_t memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause);

#endif /* CONFIG_JETSAM */

/* VM pressure */

extern unsigned int    vm_page_free_count;
extern unsigned int    vm_page_active_count;
extern unsigned int    vm_page_inactive_count;
extern unsigned int    vm_page_throttled_count;
extern unsigned int    vm_page_purgeable_count;
extern unsigned int    vm_page_wire_count;

#if VM_PRESSURE_EVENTS

#include "vm_pressure.h"

extern boolean_t memorystatus_warn_process(pid_t pid, boolean_t critical);

vm_pressure_level_t memorystatus_vm_pressure_level = kVMPressureNormal;

#if CONFIG_MEMORYSTATUS
unsigned int memorystatus_available_pages = (unsigned int)-1;
unsigned int memorystatus_available_pages_pressure = 0;
unsigned int memorystatus_available_pages_critical = 0;
unsigned int memorystatus_frozen_count = 0;
unsigned int memorystatus_suspended_count = 0;

/*
 * We use this flag to signal if we have any HWM offenders
 * on the system. This way we can reduce the number of wakeups
 * of the memorystatus_thread when the system is between the
 * "pressure" and "critical" threshold.
 *
 * The (re-)setting of this variable is done without any locks
 * or synchronization simply because it is not possible (currently)
 * to keep track of HWM offenders that drop down below their memory
 * limit and/or exit. So, we choose to burn a couple of wasted wakeups
 * by allowing the unguarded modification of this variable.
 */
boolean_t memorystatus_hwm_candidates = 0;

static int memorystatus_send_note(int event_code, void *data, size_t data_length);
#endif /* CONFIG_MEMORYSTATUS */

#endif /* VM_PRESSURE_EVENTS */

/* Freeze */

#if CONFIG_FREEZE

boolean_t memorystatus_freeze_enabled = FALSE;
int memorystatus_freeze_wakeup = 0;

static inline boolean_t memorystatus_can_freeze_processes(void);
static boolean_t memorystatus_can_freeze(boolean_t *memorystatus_freeze_swap_low);

static void memorystatus_freeze_thread(void *param __unused, wait_result_t wr __unused);

/* Thresholds */
static unsigned int memorystatus_freeze_threshold = 0;

static unsigned int memorystatus_freeze_pages_min = 0;
static unsigned int memorystatus_freeze_pages_max = 0;

static unsigned int memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_DEFAULT;

/* Stats */
static uint64_t memorystatus_freeze_count = 0;
static uint64_t memorystatus_freeze_pageouts = 0;

/* Throttling */
static throttle_interval_t throttle_intervals[] = {
	{      60,  8, 0, 0, { 0, 0 }, FALSE }, /* 1 hour intermediate interval, 8x burst */
	{ 24 * 60,  1, 0, 0, { 0, 0 }, FALSE }, /* 24 hour long interval, no burst */
};

static uint64_t memorystatus_freeze_throttle_count = 0;

static unsigned int memorystatus_suspended_footprint_total = 0;

#endif /* CONFIG_FREEZE */

/* Debug */

extern struct knote *vm_find_knote_from_pid(pid_t, struct klist *);

#if DEVELOPMENT || DEBUG

#if CONFIG_JETSAM

/* Debug aid to help determine the effective memory limit */

static int
sysctl_memorystatus_highwater_enable SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
	proc_t p;
	unsigned int b = 0;
	int error, enable = 0;
	int32_t memlimit;

	error = SYSCTL_OUT(req, arg1, sizeof(int));
	if (error || !req->newptr) {
		return (error);
	}

	error = SYSCTL_IN(req, &enable, sizeof(int));
	if (error || !req->newptr) {
		return (error);
	}

	if (!(enable == 0 || enable == 1)) {
		return EINVAL;
	}

	proc_list_lock();

	p = memorystatus_get_first_proc_locked(&b, TRUE);
	while (p) {
		if (enable) {
			if ((p->p_memstat_state & P_MEMSTAT_MEMLIMIT_BACKGROUND) && (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND)) {
				memlimit = -1;
			} else {
				memlimit = p->p_memstat_memlimit;
			}
		} else {
			memlimit = -1;
		}
		task_set_phys_footprint_limit_internal(p->task, (memlimit > 0) ? memlimit : -1, NULL, TRUE);

		if (memlimit == -1) {
			p->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT;
		} else {
			if (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_BACKGROUND) {
				p->p_memstat_state &= ~P_MEMSTAT_FATAL_MEMLIMIT;
			}
		}

		p = memorystatus_get_next_proc_locked(&b, p, TRUE);
	}

	memorystatus_highwater_enabled = enable;

	proc_list_unlock();

	return 0;
}

SYSCTL_INT(_kern, OID_AUTO, memorystatus_idle_snapshot, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_idle_snapshot, 0, "");

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_highwater_enabled, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_highwater_enabled, 0, sysctl_memorystatus_highwater_enable, "I", "");

SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_available_pages, 0, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_available_pages_critical, 0, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical_base, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_available_pages_critical_base, 0, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical_idle_offset, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_available_pages_critical_idle_offset, 0, "");

/* Diagnostic code */

enum {
	kJetsamDiagnosticModeNone =              0,
	kJetsamDiagnosticModeAll  =              1,
	kJetsamDiagnosticModeStopAtFirstActive = 2,
	kJetsamDiagnosticModeCount
} jetsam_diagnostic_mode = kJetsamDiagnosticModeNone;

static int jetsam_diagnostic_suspended_one_active_proc = 0;

static int
sysctl_jetsam_diagnostic_mode SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)

	const char *diagnosticStrings[] = {
		"jetsam: diagnostic mode: resetting critical level.",
		"jetsam: diagnostic mode: will examine all processes",
		"jetsam: diagnostic mode: will stop at first active process"
	};

	int error, val = jetsam_diagnostic_mode;
	boolean_t changed = FALSE;

	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error || !req->newptr)
		return (error);
	if ((val < 0) || (val >= kJetsamDiagnosticModeCount)) {
		printf("jetsam: diagnostic mode: invalid value - %d\n", val);
		return EINVAL;
	}

	proc_list_lock();

	if ((unsigned int) val != jetsam_diagnostic_mode) {
		jetsam_diagnostic_mode = val;

		memorystatus_jetsam_policy &= ~kPolicyDiagnoseActive;

		switch (jetsam_diagnostic_mode) {
		case kJetsamDiagnosticModeNone:
			/* Already cleared */
			break;
		case kJetsamDiagnosticModeAll:
			memorystatus_jetsam_policy |= kPolicyDiagnoseAll;
			break;
		case kJetsamDiagnosticModeStopAtFirstActive:
			memorystatus_jetsam_policy |= kPolicyDiagnoseFirst;
			break;
		default:
			/* Already validated */
			break;
		}

		memorystatus_update_levels_locked(FALSE);
		changed = TRUE;
	}

	proc_list_unlock();

	if (changed) {
		printf("%s\n", diagnosticStrings[val]);
	}

	return (0);
}

SYSCTL_PROC(_debug, OID_AUTO, jetsam_diagnostic_mode, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED|CTLFLAG_ANYBODY,
	&jetsam_diagnostic_mode, 0, sysctl_jetsam_diagnostic_mode, "I", "Jetsam Diagnostic Mode");

SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jetsam_policy_offset_pages_diagnostic, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_jetsam_policy_offset_pages_diagnostic, 0, "");

#if VM_PRESSURE_EVENTS

SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_pressure, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_available_pages_pressure, 0, "");


/*
 * This routine is used for targeted notifications
 * regardless of system memory pressure.
 * "memnote" is the current user.
 */

static int
sysctl_memorystatus_vm_pressure_send SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)

	int error = 0, pid = 0;
	int ret = 0;
	struct knote *kn = NULL;

	error = sysctl_handle_int(oidp, &pid, 0, req);
	if (error || !req->newptr)
		return (error);

	/*
	 * We inspect 3 lists here for targeted notifications:
	 * - memorystatus_klist
	 * - vm_pressure_klist
	 * - vm_pressure_dormant_klist
	 *
	 * The vm_pressure_* lists are tied to the old VM_PRESSURE
	 * notification mechanism. We intend to stop using that
	 * mechanism and, in turn, get rid of the 2 lists and
	 * vm_dispatch_pressure_note_to_pid() too.
	 */

	memorystatus_klist_lock();
	kn = vm_find_knote_from_pid(pid, &memorystatus_klist);
	if (kn) {
		/*
		 * Forcibly send this pid a "warning" memory pressure notification.
		 */
		kn->kn_fflags |= NOTE_MEMORYSTATUS_PRESSURE_WARN;
		KNOTE(&memorystatus_klist, kMemorystatusPressure);
		ret = 0;
	} else {
		ret = vm_dispatch_pressure_note_to_pid(pid, FALSE);
	}
	memorystatus_klist_unlock();

	return ret;
}

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_send, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
    0, 0, &sysctl_memorystatus_vm_pressure_send, "I", "");
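
/*
 * Usage sketch (dev/debug kernels; the node is write-only):
 *
 *	sysctl -w kern.memorystatus_vm_pressure_send=<pid>
 *
 * If the pid has a knote on memorystatus_klist it receives a "warning"-level
 * notification; otherwise the request falls back to the legacy vm_pressure
 * lists via vm_dispatch_pressure_note_to_pid().
 */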

#endif /* VM_PRESSURE_EVENTS */

#endif /* CONFIG_JETSAM */

#if CONFIG_FREEZE

SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_threshold, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_threshold, 0, "");

SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_min, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_pages_min, 0, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_max, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_pages_max, 0, "");

SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_count, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_count, "");
SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_pageouts, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_pageouts, "");
SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_throttle_count, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_throttle_count, "");
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_min_processes, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_suspended_threshold, 0, "");

boolean_t memorystatus_freeze_throttle_enabled = TRUE;
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_throttle_enabled, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_throttle_enabled, 0, "");

/*
 * Manual trigger of freeze and thaw for dev / debug kernels only.
 */
static int
sysctl_memorystatus_freeze SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)

	int error, pid = 0;
	proc_t p;

	if (memorystatus_freeze_enabled == FALSE) {
		return ENOTSUP;
	}

	error = sysctl_handle_int(oidp, &pid, 0, req);
	if (error || !req->newptr)
		return (error);

	p = proc_find(pid);
	if (p != NULL) {
		uint32_t purgeable, wired, clean, dirty;
		boolean_t shared;
		uint32_t max_pages = 0;

		if (DEFAULT_FREEZER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED) {
			max_pages = MIN(default_pager_swap_pages_free(), memorystatus_freeze_pages_max);
		} else {
			max_pages = UINT32_MAX - 1;
		}
		error = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, FALSE);
		proc_rele(p);

		if (error)
			error = EIO;
		return error;
	}
	return EINVAL;
}

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_freeze, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
    0, 0, &sysctl_memorystatus_freeze, "I", "");

static int
sysctl_memorystatus_available_pages_thaw SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)

	int error, pid = 0;
	proc_t p;

	if (memorystatus_freeze_enabled == FALSE) {
		return ENOTSUP;
	}

	error = sysctl_handle_int(oidp, &pid, 0, req);
	if (error || !req->newptr)
		return (error);

	p = proc_find(pid);
	if (p != NULL) {
		error = task_thaw(p->task);
		proc_rele(p);

		if (error)
			error = EIO;
		return error;
	}

	return EINVAL;
}

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_thaw, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
    0, 0, &sysctl_memorystatus_available_pages_thaw, "I", "");
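
/*
 * Usage sketch for the two triggers above (dev/debug kernels, and only when
 * memorystatus_freeze_enabled is set):
 *
 *	sysctl -w kern.memorystatus_freeze=<pid>
 *	sysctl -w kern.memorystatus_thaw=<pid>
 *
 * Both return EINVAL for an unknown pid, ENOTSUP when freezing is disabled,
 * and EIO if the underlying task_freeze()/task_thaw() call fails.
 */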

#endif /* CONFIG_FREEZE */

#endif /* DEVELOPMENT || DEBUG */

extern kern_return_t kernel_thread_start_priority(thread_continue_t continuation,
                                                  void *parameter,
                                                  integer_t priority,
                                                  thread_t *new_thread);

#if CONFIG_JETSAM
/*
 * Sort processes by size for a single jetsam bucket.
 */

static void memorystatus_sort_by_largest_process_locked(unsigned int bucket_index)
{
	proc_t p = NULL, insert_after_proc = NULL, max_proc = NULL;
	uint32_t pages = 0, max_pages = 0;
	memstat_bucket_t *current_bucket;

	if (bucket_index >= MEMSTAT_BUCKET_COUNT) {
		return;
	}

	current_bucket = &memstat_bucket[bucket_index];

	/*
	 * Selection sort: each pass scans the unsorted tail of the list for its
	 * largest process and moves that process to the front of the tail, i.e.
	 * to just after insert_after_proc (the end of the sorted head). The
	 * previous version terminated early whenever the head of the unsorted
	 * region was already the largest, leaving the remainder unsorted.
	 */
	p = TAILQ_FIRST(&current_bucket->list);

	while (p) {
		proc_t unsorted_head = p;

		memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
		max_pages = pages;
		max_proc = p;

		for (p = TAILQ_NEXT(p, p_memstat_list); p; p = TAILQ_NEXT(p, p_memstat_list)) {
			memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
			if (pages > max_pages) {
				max_pages = pages;
				max_proc = p;
			}
		}

		if (max_proc != unsorted_head) {
			TAILQ_REMOVE(&current_bucket->list, max_proc, p_memstat_list);

			if (insert_after_proc == NULL) {
				TAILQ_INSERT_HEAD(&current_bucket->list, max_proc, p_memstat_list);
			} else {
				TAILQ_INSERT_AFTER(&current_bucket->list, insert_after_proc, max_proc, p_memstat_list);
			}
		}

		/* The sorted head now extends through max_proc; search the rest. */
		insert_after_proc = max_proc;
		p = TAILQ_NEXT(max_proc, p_memstat_list);
	}
}

#endif /* CONFIG_JETSAM */

static proc_t memorystatus_get_first_proc_locked(unsigned int *bucket_index, boolean_t search) {
	memstat_bucket_t *current_bucket;
	proc_t next_p;

	if ((*bucket_index) >= MEMSTAT_BUCKET_COUNT) {
		return NULL;
	}

	current_bucket = &memstat_bucket[*bucket_index];
	next_p = TAILQ_FIRST(&current_bucket->list);
	if (!next_p && search) {
		while (!next_p && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) {
			current_bucket = &memstat_bucket[*bucket_index];
			next_p = TAILQ_FIRST(&current_bucket->list);
		}
	}

	return next_p;
}

static proc_t memorystatus_get_next_proc_locked(unsigned int *bucket_index, proc_t p, boolean_t search) {
	memstat_bucket_t *current_bucket;
	proc_t next_p;

	if (!p || ((*bucket_index) >= MEMSTAT_BUCKET_COUNT)) {
		return NULL;
	}

	next_p = TAILQ_NEXT(p, p_memstat_list);
	while (!next_p && search && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) {
		current_bucket = &memstat_bucket[*bucket_index];
		next_p = TAILQ_FIRST(&current_bucket->list);
	}

	return next_p;
}

__private_extern__ void
memorystatus_init(void)
{
	thread_t thread = THREAD_NULL;
	kern_return_t result;
	int i;

#if CONFIG_FREEZE
	memorystatus_freeze_pages_min = FREEZE_PAGES_MIN;
	memorystatus_freeze_pages_max = FREEZE_PAGES_MAX;
#endif

	nanoseconds_to_absolutetime((uint64_t)DEFERRED_IDLE_EXIT_TIME_SECS * NSEC_PER_SEC, &memorystatus_idle_delay_time);

	/* Init buckets */
	for (i = 0; i < MEMSTAT_BUCKET_COUNT; i++) {
		TAILQ_INIT(&memstat_bucket[i].list);
		memstat_bucket[i].count = 0;
	}

	memorystatus_idle_demotion_call = thread_call_allocate((thread_call_func_t)memorystatus_perform_idle_demotion, NULL);

	/* Apply overrides */
	PE_get_default("kern.jetsam_delta", &delta_percentage, sizeof(delta_percentage));
	assert(delta_percentage < 100);
	PE_get_default("kern.jetsam_critical_threshold", &critical_threshold_percentage, sizeof(critical_threshold_percentage));
	assert(critical_threshold_percentage < 100);
	PE_get_default("kern.jetsam_idle_offset", &idle_offset_percentage, sizeof(idle_offset_percentage));
	assert(idle_offset_percentage < 100);
	PE_get_default("kern.jetsam_pressure_threshold", &pressure_threshold_percentage, sizeof(pressure_threshold_percentage));
	assert(pressure_threshold_percentage < 100);
	PE_get_default("kern.jetsam_freeze_threshold", &freeze_threshold_percentage, sizeof(freeze_threshold_percentage));
	assert(freeze_threshold_percentage < 100);

#if CONFIG_JETSAM
	memorystatus_delta = delta_percentage * atop_64(max_mem) / 100;
	memorystatus_available_pages_critical_idle_offset = idle_offset_percentage * atop_64(max_mem) / 100;
	memorystatus_available_pages_critical_base = (critical_threshold_percentage / delta_percentage) * memorystatus_delta;
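	/*
	 * Illustrative arithmetic (an example, not from the source): with 1 GB
	 * of 4 KB pages (atop_64(max_mem) == 262144) and the default 5%
	 * settings, memorystatus_delta, the critical base, and the idle offset
	 * all come to 13107 pages, roughly 51 MB.
	 */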

	memorystatus_jetsam_snapshot_max = maxproc;
	memorystatus_jetsam_snapshot =
		(memorystatus_jetsam_snapshot_t*)kalloc(sizeof(memorystatus_jetsam_snapshot_t) +
		sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_max);
	if (!memorystatus_jetsam_snapshot) {
		panic("Could not allocate memorystatus_jetsam_snapshot");
	}

	/* No contention at this point */
	memorystatus_update_levels_locked(FALSE);
#endif

#if CONFIG_FREEZE
	memorystatus_freeze_threshold = (freeze_threshold_percentage / delta_percentage) * memorystatus_delta;
#endif

	result = kernel_thread_start_priority(memorystatus_thread, NULL, 95 /* MAXPRI_KERNEL */, &thread);
	if (result == KERN_SUCCESS) {
		thread_deallocate(thread);
	} else {
		panic("Could not create memorystatus_thread");
	}
}

/* Centralised for the purposes of allowing panic-on-jetsam */
extern void
vm_wake_compactor_swapper(void);

/*
 * The jetsam no-frills kill call
 * 	Return: 0 on success
 *		error code on failure (EINVAL...)
 */
static int
jetsam_do_kill(proc_t p, int jetsam_flags) {
	int error = 0;
	error = exit1_internal(p, W_EXITCODE(0, SIGKILL), (int *)NULL, FALSE, FALSE, jetsam_flags);
	return(error);
}

/*
 * Wrapper for processes exiting with memorystatus details
 */
static boolean_t
memorystatus_do_kill(proc_t p, uint32_t cause) {

	int error = 0;
	__unused pid_t victim_pid = p->p_pid;

	KERNEL_DEBUG_CONSTANT( (BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)) | DBG_FUNC_START,
			       victim_pid, cause, vm_page_free_count, 0, 0);

#if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
	if (memorystatus_jetsam_panic_debug & (1 << cause)) {
		panic("memorystatus_do_kill(): jetsam debug panic (cause: %d)", cause);
	}
#else
#pragma unused(cause)
#endif
	int jetsam_flags = P_LTERM_JETSAM;
	switch (cause) {
		case kMemorystatusKilledHiwat:			jetsam_flags |= P_JETSAM_HIWAT; break;
		case kMemorystatusKilledVnodes:			jetsam_flags |= P_JETSAM_VNODE; break;
		case kMemorystatusKilledVMPageShortage:		jetsam_flags |= P_JETSAM_VMPAGESHORTAGE; break;
		case kMemorystatusKilledVMThrashing:		jetsam_flags |= P_JETSAM_VMTHRASHING; break;
		case kMemorystatusKilledFCThrashing:		jetsam_flags |= P_JETSAM_FCTHRASHING; break;
		case kMemorystatusKilledPerProcessLimit:	jetsam_flags |= P_JETSAM_PID; break;
		case kMemorystatusKilledIdleExit:		jetsam_flags |= P_JETSAM_IDLEEXIT; break;
	}
	error = jetsam_do_kill(p, jetsam_flags);

	KERNEL_DEBUG_CONSTANT( (BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)) | DBG_FUNC_END,
			       victim_pid, cause, vm_page_free_count, error, 0);

	if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) {
		vm_wake_compactor_swapper();
	}

	return (error == 0);
}

/*
 * Node manipulation
 */

static void
memorystatus_check_levels_locked(void) {
#if CONFIG_JETSAM
	/* Update levels */
	memorystatus_update_levels_locked(TRUE);
#endif
}

static void
memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2)
{
	proc_t p;
	uint64_t current_time;
	memstat_bucket_t *demotion_bucket;

	MEMORYSTATUS_DEBUG(1, "memorystatus_perform_idle_demotion()\n");

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_START, 0, 0, 0, 0, 0);

	current_time = mach_absolute_time();

	proc_list_lock();

	demotion_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE_DEFERRED];
	p = TAILQ_FIRST(&demotion_bucket->list);

	while (p) {
		MEMORYSTATUS_DEBUG(1, "memorystatus_perform_idle_demotion() found %d\n", p->p_pid);

		assert(p->p_memstat_idledeadline);
		assert(p->p_memstat_dirty & P_DIRTY_DEFER_IN_PROGRESS);
		assert((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED|P_DIRTY_IS_DIRTY)) == P_DIRTY_IDLE_EXIT_ENABLED);

		if (current_time >= p->p_memstat_idledeadline) {
#if DEBUG || DEVELOPMENT
			if (!(p->p_memstat_dirty & P_DIRTY_MARKED)) {
				printf("memorystatus_perform_idle_demotion: moving process %d [%s] to idle band, but never dirtied (0x%x)!\n",
					p->p_pid, (p->p_comm ? p->p_comm : "(unknown)"), p->p_memstat_dirty);
			}
#endif
			memorystatus_invalidate_idle_demotion_locked(p, TRUE);
			memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, false);

			// The prior process has moved out of the demotion bucket, so grab the new head and continue
			p = TAILQ_FIRST(&demotion_bucket->list);
			continue;
		}

		// No further candidates
		break;
	}

	memorystatus_reschedule_idle_demotion_locked();

	proc_list_unlock();

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
}

static void
memorystatus_schedule_idle_demotion_locked(proc_t p, boolean_t set_state)
{
	boolean_t present_in_deferred_bucket = FALSE;

	if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) {
		present_in_deferred_bucket = TRUE;
	}

	MEMORYSTATUS_DEBUG(1, "memorystatus_schedule_idle_demotion_locked: scheduling demotion to idle band for process %d (dirty:0x%x, set_state %d, demotions %d).\n",
	    p->p_pid, p->p_memstat_dirty, set_state, memorystatus_scheduled_idle_demotions);

	assert((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED);

	if (set_state) {
		assert(p->p_memstat_idledeadline == 0);
		p->p_memstat_dirty |= P_DIRTY_DEFER_IN_PROGRESS;
		p->p_memstat_idledeadline = mach_absolute_time() + memorystatus_idle_delay_time;
	}

	assert(p->p_memstat_idledeadline);

	if (present_in_deferred_bucket == FALSE) {
		memorystatus_scheduled_idle_demotions++;
	}
}

static void
memorystatus_invalidate_idle_demotion_locked(proc_t p, boolean_t clear_state)
{
	boolean_t present_in_deferred_bucket = FALSE;

	if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) {
		present_in_deferred_bucket = TRUE;
		assert(p->p_memstat_idledeadline);
	}

	MEMORYSTATUS_DEBUG(1, "memorystatus_invalidate_idle_demotion(): invalidating demotion to idle band for process %d (clear_state %d, demotions %d).\n",
	    p->p_pid, clear_state, memorystatus_scheduled_idle_demotions);

	if (clear_state) {
		p->p_memstat_idledeadline = 0;
		p->p_memstat_dirty &= ~P_DIRTY_DEFER_IN_PROGRESS;
	}

	if (present_in_deferred_bucket == TRUE) {
		memorystatus_scheduled_idle_demotions--;
	}

	assert(memorystatus_scheduled_idle_demotions >= 0);
}

static void
memorystatus_reschedule_idle_demotion_locked(void) {
	if (0 == memorystatus_scheduled_idle_demotions) {
		if (memstat_idle_demotion_deadline) {
			/* Transitioned 1->0, so cancel next call */
			thread_call_cancel(memorystatus_idle_demotion_call);
			memstat_idle_demotion_deadline = 0;
		}
	} else {
		memstat_bucket_t *demotion_bucket;
		proc_t p;
		demotion_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE_DEFERRED];
		p = TAILQ_FIRST(&demotion_bucket->list);

		assert(p && p->p_memstat_idledeadline);

		if (memstat_idle_demotion_deadline != p->p_memstat_idledeadline){
			thread_call_enter_delayed(memorystatus_idle_demotion_call, p->p_memstat_idledeadline);
			memstat_idle_demotion_deadline = p->p_memstat_idledeadline;
		}
	}
}

/*
 * List manipulation
 */

int
memorystatus_add(proc_t p, boolean_t locked)
{
	memstat_bucket_t *bucket;

	MEMORYSTATUS_DEBUG(1, "memorystatus_list_add(): adding process %d with priority %d.\n", p->p_pid, p->p_memstat_effectivepriority);

	if (!locked) {
		proc_list_lock();
	}

	/* Processes marked internal do not have priority tracked */
	if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
		goto exit;
	}

	bucket = &memstat_bucket[p->p_memstat_effectivepriority];

	if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) {
		assert(bucket->count == memorystatus_scheduled_idle_demotions);
	}

	TAILQ_INSERT_TAIL(&bucket->list, p, p_memstat_list);
	bucket->count++;

	memorystatus_list_count++;

	memorystatus_check_levels_locked();

exit:
	if (!locked) {
		proc_list_unlock();
	}

	return 0;
}

static void
memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_insert)
{
	memstat_bucket_t *old_bucket, *new_bucket;

	assert(priority < MEMSTAT_BUCKET_COUNT);

	/* Ensure that exit isn't underway, leaving the proc retained but removed from its bucket */
	if ((p->p_listflag & P_LIST_EXITED) != 0) {
		return;
	}

	MEMORYSTATUS_DEBUG(1, "memorystatus_update_priority_locked(): setting process %d to priority %d, inserting at %s\n",
	                   p->p_pid, priority, head_insert ? "head" : "tail");

	old_bucket = &memstat_bucket[p->p_memstat_effectivepriority];
	if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) {
		assert(old_bucket->count == (memorystatus_scheduled_idle_demotions + 1));
	}

	TAILQ_REMOVE(&old_bucket->list, p, p_memstat_list);
	old_bucket->count--;

	new_bucket = &memstat_bucket[priority];
	if (head_insert)
		TAILQ_INSERT_HEAD(&new_bucket->list, p, p_memstat_list);
	else
		TAILQ_INSERT_TAIL(&new_bucket->list, p, p_memstat_list);
	new_bucket->count++;

#if CONFIG_JETSAM
	if (memorystatus_highwater_enabled && (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_BACKGROUND)) {

		/*
		 * Adjust the memory limit when the task transitions between
		 * foreground and background.
		 */

		if (((priority >= JETSAM_PRIORITY_FOREGROUND) && (p->p_memstat_effectivepriority < JETSAM_PRIORITY_FOREGROUND)) ||
			((priority < JETSAM_PRIORITY_FOREGROUND) && (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND))) {
			int32_t memlimit = (priority >= JETSAM_PRIORITY_FOREGROUND) ? -1 : p->p_memstat_memlimit;
			task_set_phys_footprint_limit_internal(p->task, (memlimit > 0) ? memlimit : -1, NULL, TRUE);

			if (memlimit <= 0) {
				p->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT;
			} else {
				p->p_memstat_state &= ~P_MEMSTAT_FATAL_MEMLIMIT;
			}
		}
	}
#endif

	p->p_memstat_effectivepriority = priority;

	memorystatus_check_levels_locked();
}

int
memorystatus_update(proc_t p, int priority, uint64_t user_data, boolean_t effective, boolean_t update_memlimit, int32_t memlimit, boolean_t memlimit_background, boolean_t is_fatal_limit)
{
	int ret;
	boolean_t head_insert = false;

#if !CONFIG_JETSAM
#pragma unused(update_memlimit, memlimit, memlimit_background, is_fatal_limit)
#endif

	MEMORYSTATUS_DEBUG(1, "memorystatus_update: changing process %d: priority %d, user_data 0x%llx\n", p->p_pid, priority, user_data);

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_UPDATE) | DBG_FUNC_START, p->p_pid, priority, user_data, effective, 0);

	if (priority == -1) {
		/* Use as shorthand for default priority */
		priority = JETSAM_PRIORITY_DEFAULT;
	} else if (priority == JETSAM_PRIORITY_IDLE_DEFERRED) {
		/* JETSAM_PRIORITY_IDLE_DEFERRED is reserved for internal use; if requested, adjust to JETSAM_PRIORITY_IDLE. */
		priority = JETSAM_PRIORITY_IDLE;
	} else if (priority == JETSAM_PRIORITY_IDLE_HEAD) {
		/* JETSAM_PRIORITY_IDLE_HEAD inserts at the head of the idle queue */
		priority = JETSAM_PRIORITY_IDLE;
		head_insert = true;
	} else if ((priority < 0) || (priority >= MEMSTAT_BUCKET_COUNT)) {
		/* Sanity check */
		ret = EINVAL;
		goto out;
	}

	proc_list_lock();

	assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL));

	if (effective && (p->p_memstat_state & P_MEMSTAT_PRIORITYUPDATED)) {
		ret = EALREADY;
		proc_list_unlock();
		MEMORYSTATUS_DEBUG(1, "memorystatus_update: effective change specified for pid %d, but change already occurred.\n", p->p_pid);
		goto out;
	}

	if ((p->p_memstat_state & P_MEMSTAT_TERMINATED) || ((p->p_listflag & P_LIST_EXITED) != 0)) {
		/*
		 * This could happen when a process calling posix_spawn() is exiting on the jetsam thread.
		 */
		ret = EBUSY;
		proc_list_unlock();
		goto out;
	}

	p->p_memstat_state |= P_MEMSTAT_PRIORITYUPDATED;
	p->p_memstat_userdata = user_data;
	p->p_memstat_requestedpriority = priority;

#if CONFIG_JETSAM
	if (update_memlimit) {
		p->p_memstat_memlimit = memlimit;
		if (memlimit_background) {
			/* Will be set as priority is updated */
			p->p_memstat_state |= P_MEMSTAT_MEMLIMIT_BACKGROUND;

			/* Cannot have a background memory limit and be fatal. */
			is_fatal_limit = FALSE;

		} else {
			/* Otherwise, apply now */
			if (memorystatus_highwater_enabled) {
				task_set_phys_footprint_limit_internal(p->task, (memlimit > 0) ? memlimit : -1, NULL, TRUE);
			}
		}

		if (is_fatal_limit || memlimit <= 0) {
			p->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT;
		} else {
			p->p_memstat_state &= ~P_MEMSTAT_FATAL_MEMLIMIT;
		}
	}
#endif

	/*
	 * We can't add to the JETSAM_PRIORITY_IDLE_DEFERRED bucket here.
	 * But, we could be removing it from the bucket.
	 * Check and take appropriate steps if so.
	 */

	if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) {

		memorystatus_invalidate_idle_demotion_locked(p, TRUE);
	}

	memorystatus_update_priority_locked(p, priority, head_insert);

	proc_list_unlock();
	ret = 0;

out:
	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_UPDATE) | DBG_FUNC_END, ret, 0, 0, 0, 0);

	return ret;
}

int
memorystatus_remove(proc_t p, boolean_t locked)
{
	int ret;
	memstat_bucket_t *bucket;

	MEMORYSTATUS_DEBUG(1, "memorystatus_list_remove: removing process %d\n", p->p_pid);

	if (!locked) {
		proc_list_lock();
	}

	assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL));

	bucket = &memstat_bucket[p->p_memstat_effectivepriority];
	if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) {
		assert(bucket->count == memorystatus_scheduled_idle_demotions);
	}

	TAILQ_REMOVE(&bucket->list, p, p_memstat_list);
	bucket->count--;

	memorystatus_list_count--;

	/* If awaiting demotion to the idle band, clean up */
	if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) {
		memorystatus_invalidate_idle_demotion_locked(p, TRUE);
		memorystatus_reschedule_idle_demotion_locked();
	}

	memorystatus_check_levels_locked();

#if CONFIG_FREEZE
	if (p->p_memstat_state & (P_MEMSTAT_FROZEN)) {
		memorystatus_frozen_count--;
	}

	if (p->p_memstat_state & P_MEMSTAT_SUSPENDED) {
		memorystatus_suspended_footprint_total -= p->p_memstat_suspendedfootprint;
		memorystatus_suspended_count--;
	}
#endif

	if (!locked) {
		proc_list_unlock();
	}

	if (p) {
		ret = 0;
	} else {
		ret = ESRCH;
	}

	return ret;
}

static boolean_t
memorystatus_validate_track_flags(struct proc *target_p, uint32_t pcontrol) {
	/* See that the process isn't marked for termination */
	if (target_p->p_memstat_dirty & P_DIRTY_TERMINATED) {
		return FALSE;
	}

	/* Idle exit requires that process be tracked */
	if ((pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) &&
	   !(pcontrol & PROC_DIRTY_TRACK)) {
		return FALSE;
	}

	/* 'Launch in progress' tracking requires that process have enabled dirty tracking too. */
	if ((pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) &&
	   !(pcontrol & PROC_DIRTY_TRACK)) {
		return FALSE;
	}

	/* Deferral is only relevant if idle exit is specified */
	if ((pcontrol & PROC_DIRTY_DEFER) &&
	   !(pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT)) {
		return FALSE;
	}

	return TRUE;
}

static void
memorystatus_update_idle_priority_locked(proc_t p) {
	int32_t priority;

	MEMORYSTATUS_DEBUG(1, "memorystatus_update_idle_priority_locked(): pid %d dirty 0x%X\n", p->p_pid, p->p_memstat_dirty);

	if ((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED|P_DIRTY_IS_DIRTY)) == P_DIRTY_IDLE_EXIT_ENABLED) {
		priority = (p->p_memstat_dirty & P_DIRTY_DEFER_IN_PROGRESS) ? JETSAM_PRIORITY_IDLE_DEFERRED : JETSAM_PRIORITY_IDLE;
	} else {
		priority = p->p_memstat_requestedpriority;
	}

	if (priority != p->p_memstat_effectivepriority) {
		memorystatus_update_priority_locked(p, priority, false);
	}
}

/*
 * Processes can opt to have their state tracked by the kernel, indicating when they are busy (dirty) or idle
 * (clean). They may also indicate that they support termination when idle, with the result that they are promoted
 * to their desired, higher, jetsam priority when dirty (and are therefore killed later), and demoted to the low
 * priority idle band when clean (and killed earlier, protecting higher priority processes).
 *
 * If the deferral flag is set, then newly tracked processes will be protected for an initial period (as determined by
 * memorystatus_idle_delay_time); if they go clean during this time, then they will be moved to a deferred-idle band
 * with a slightly higher priority, guarding against immediate termination under memory pressure and being unable to
 * make forward progress. Finally, when the guard expires, they will be moved to the standard, lowest-priority, idle
 * band. The deferral can be cleared early by clearing the appropriate flag.
 *
 * The deferral timer is active only for the duration that the process is marked as guarded and clean; if the process
 * is marked dirty, the timer will be cancelled. Upon being subsequently marked clean, the deferment will either be
 * re-enabled or the guard state cleared, depending on whether the guard deadline has passed.
 */
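
/*
 * Example sequence (a sketch; for memorystatus_dirty_set(), a nonzero pcontrol
 * marks the process dirty and zero marks it clean):
 *
 *	memorystatus_dirty_track(p, PROC_DIRTY_TRACK |
 *	    PROC_DIRTY_ALLOW_IDLE_EXIT | PROC_DIRTY_DEFER);
 *	memorystatus_dirty_set(p, TRUE, 1);	// dirty: moves to its requested band
 *	memorystatus_dirty_set(p, TRUE, 0);	// clean: moves to the (deferred) idle band
 *
 * While clean and within the deferral window the process sits in
 * JETSAM_PRIORITY_IDLE_DEFERRED; once the window lapses, it drops to
 * JETSAM_PRIORITY_IDLE.
 */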

int
memorystatus_dirty_track(proc_t p, uint32_t pcontrol) {
	unsigned int old_dirty;
	boolean_t reschedule = FALSE;
	boolean_t already_deferred = FALSE;
	boolean_t defer_now = FALSE;
	int ret;

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_TRACK),
		p->p_pid, p->p_memstat_dirty, pcontrol, 0, 0);

	proc_list_lock();

	if ((p->p_listflag & P_LIST_EXITED) != 0) {
		/*
		 * Process is on its way out.
		 */
		ret = EBUSY;
		goto exit;
	}

	if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
		ret = EPERM;
		goto exit;
	}

	if (!memorystatus_validate_track_flags(p, pcontrol)) {
		ret = EINVAL;
		goto exit;
	}

	old_dirty = p->p_memstat_dirty;

	/* These bits are cumulative, as per <rdar://problem/11159924> */
	if (pcontrol & PROC_DIRTY_TRACK) {
		p->p_memstat_dirty |= P_DIRTY_TRACK;
	}

	if (pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) {
		p->p_memstat_dirty |= P_DIRTY_ALLOW_IDLE_EXIT;
	}

	if (pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) {
		p->p_memstat_dirty |= P_DIRTY_LAUNCH_IN_PROGRESS;
	}

	if (old_dirty & P_DIRTY_DEFER_IN_PROGRESS) {
		already_deferred = TRUE;
	}

	/* This can be set and cleared exactly once. */
	if (pcontrol & PROC_DIRTY_DEFER) {

		if (!(old_dirty & P_DIRTY_DEFER)) {
			p->p_memstat_dirty |= P_DIRTY_DEFER;
		}

		defer_now = TRUE;
	}

	MEMORYSTATUS_DEBUG(1, "memorystatus_on_track_dirty(): set idle-exit %s / defer %s / dirty %s for process %d\n",
		((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) ? "Y" : "N",
		defer_now ? "Y" : "N",
		p->p_memstat_dirty & P_DIRTY ? "Y" : "N",
		p->p_pid);

	/* Kick off or invalidate the idle exit deferment if there's a state transition. */
	if (!(p->p_memstat_dirty & P_DIRTY_IS_DIRTY)) {
		if (((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) &&
			defer_now && !already_deferred) {

			/*
			 * Request to defer a clean process that's idle-exit enabled
			 * and not already in the jetsam deferred band.
			 */
			memorystatus_schedule_idle_demotion_locked(p, TRUE);
			reschedule = TRUE;

		} else if (!defer_now && already_deferred) {

			/*
			 * Either the process is no longer idle-exit enabled OR
			 * there's a request to cancel a currently active deferral.
			 */
			memorystatus_invalidate_idle_demotion_locked(p, TRUE);
			reschedule = TRUE;
		}
	} else {

		/*
		 * We are trying to operate on a dirty process. Dirty processes have to
		 * be removed from the deferred band. The question is do we reset the
		 * deferred state or not?
		 *
		 * This could be a legal request like:
		 * - this process had opted into the JETSAM_DEFERRED band
		 * - but it's now dirty and requests to opt out.
		 * In this case, we remove the process from the band and reset its
		 * state too. It'll opt back in properly when needed.
		 *
		 * OR, this request could be a user-space bug. E.g.:
		 * - this process had opted into the JETSAM_DEFERRED band when clean
		 * - and, then issues another request to again put it into the band except
		 *   this time the process is dirty.
		 * The process going dirty, as a transition in memorystatus_dirty_set(), will pull the process out of
		 * the deferred band with its state intact. So our request below is a no-op.
		 * But we do it here anyway for coverage.
		 *
		 * memorystatus_update_idle_priority_locked()
		 * single-mindedly treats a dirty process as "cannot be in the deferred band".
		 */

		if (!defer_now && already_deferred) {
			memorystatus_invalidate_idle_demotion_locked(p, TRUE);
			reschedule = TRUE;
		} else {
			memorystatus_invalidate_idle_demotion_locked(p, FALSE);
			reschedule = TRUE;
		}
	}

	memorystatus_update_idle_priority_locked(p);

	if (reschedule) {
		memorystatus_reschedule_idle_demotion_locked();
	}

	ret = 0;

exit:
	proc_list_unlock();

	return ret;
}

int
memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol) {
	int ret;
	boolean_t kill = false;
	boolean_t reschedule = FALSE;
	boolean_t was_dirty = FALSE;
	boolean_t now_dirty = FALSE;

	MEMORYSTATUS_DEBUG(1, "memorystatus_dirty_set(): %d %d 0x%x 0x%x\n", self, p->p_pid, pcontrol, p->p_memstat_dirty);

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_SET), p->p_pid, self, pcontrol, 0, 0);

	proc_list_lock();

	if ((p->p_listflag & P_LIST_EXITED) != 0) {
		/*
		 * Process is on its way out.
		 */
		ret = EBUSY;
		goto exit;
	}

	if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
		ret = EPERM;
		goto exit;
	}

	if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY)
		was_dirty = TRUE;

	if (!(p->p_memstat_dirty & P_DIRTY_TRACK)) {
		/* Dirty tracking not enabled */
		ret = EINVAL;
	} else if (pcontrol && (p->p_memstat_dirty & P_DIRTY_TERMINATED)) {
		/*
		 * Process is set to be terminated and we're attempting to mark it dirty.
		 * Set for termination and marking as clean is OK - see <rdar://problem/10594349>.
		 */
		ret = EBUSY;
	} else {
		int flag = (self == TRUE) ? P_DIRTY : P_DIRTY_SHUTDOWN;
		if (pcontrol && !(p->p_memstat_dirty & flag)) {
			/* Mark the process as having been dirtied at some point */
			p->p_memstat_dirty |= (flag | P_DIRTY_MARKED);
			memorystatus_dirty_count++;
			ret = 0;
		} else if ((pcontrol == 0) && (p->p_memstat_dirty & flag)) {
			if ((flag == P_DIRTY_SHUTDOWN) && !(p->p_memstat_dirty & P_DIRTY)) {
				/* Clearing the dirty shutdown flag, and the process is otherwise clean - kill */
				p->p_memstat_dirty |= P_DIRTY_TERMINATED;
				kill = true;
			} else if ((flag == P_DIRTY) && (p->p_memstat_dirty & P_DIRTY_TERMINATED)) {
				/* Kill previously terminated processes if set clean */
				kill = true;
			}
			p->p_memstat_dirty &= ~flag;
			memorystatus_dirty_count--;
			ret = 0;
		} else {
			/* Already set */
			ret = EALREADY;
		}
	}

	if (ret != 0) {
		goto exit;
	}

	if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY)
		now_dirty = TRUE;

	if ((was_dirty == TRUE && now_dirty == FALSE) ||
	    (was_dirty == FALSE && now_dirty == TRUE)) {

		/* Manage idle exit deferral, if applied */
		if ((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED|P_DIRTY_DEFER_IN_PROGRESS)) ==
		    (P_DIRTY_IDLE_EXIT_ENABLED|P_DIRTY_DEFER_IN_PROGRESS)) {

			/*
			 * P_DIRTY_DEFER_IN_PROGRESS means the process is in the deferred band OR it might be heading back
			 * there once it's clean again and has some protection window left.
			 */

			if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
				/*
				 * New dirty process i.e. "was_dirty == FALSE && now_dirty == TRUE"
				 *
				 * The process will move from the deferred band to its higher requested
				 * jetsam band. But we don't clear its state i.e. we want to remember that
				 * this process was part of the "deferred" band and will return to it.
				 *
				 * This way, we don't let it age beyond the protection
				 * window when it returns to "clean". All the while giving
				 * it a chance to perform its work while "dirty".
				 *
				 */
				memorystatus_invalidate_idle_demotion_locked(p, FALSE);
				reschedule = TRUE;
			} else {

				/*
				 * Process is back from "dirty" to "clean".
				 *
				 * Is its timer up OR does it still have some protection
				 * window left?
				 */

				if (mach_absolute_time() >= p->p_memstat_idledeadline) {
					/*
					 * The process' deadline has expired. It currently
					 * does not reside in the DEFERRED bucket.
					 *
					 * It's on its way to the JETSAM_PRIORITY_IDLE
					 * bucket via memorystatus_update_idle_priority_locked()
					 * below.
					 *
					 * So all we need to do is reset all the state on the
					 * process that's related to the DEFERRED bucket i.e.
					 * the DIRTY_DEFER_IN_PROGRESS flag and the timer deadline.
					 */

					memorystatus_invalidate_idle_demotion_locked(p, TRUE);
					reschedule = TRUE;
				} else {
					/*
					 * It still has some protection window left and so
					 * we just re-arm the timer without modifying any
					 * state on the process.
					 */
					memorystatus_schedule_idle_demotion_locked(p, FALSE);
					reschedule = TRUE;
				}
			}
		}

		memorystatus_update_idle_priority_locked(p);

		/* If the deferral state changed, reschedule the demotion timer */
		if (reschedule) {
			memorystatus_reschedule_idle_demotion_locked();
		}
	}

	if (kill) {
		psignal(p, SIGKILL);
	}

exit:
	proc_list_unlock();

	return ret;
}

int
memorystatus_dirty_clear(proc_t p, uint32_t pcontrol) {

	int ret = 0;

	MEMORYSTATUS_DEBUG(1, "memorystatus_dirty_clear(): %d 0x%x 0x%x\n", p->p_pid, pcontrol, p->p_memstat_dirty);

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DIRTY_CLEAR), p->p_pid, pcontrol, 0, 0, 0);

	proc_list_lock();

	if ((p->p_listflag & P_LIST_EXITED) != 0) {
		/*
		 * Process is on its way out.
		 */
		ret = EBUSY;
		goto exit;
	}

	if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
		ret = EPERM;
		goto exit;
	}

	if (!(p->p_memstat_dirty & P_DIRTY_TRACK)) {
		/* Dirty tracking not enabled */
		ret = EINVAL;
		goto exit;
	}

	if (!pcontrol || (pcontrol & (PROC_DIRTY_LAUNCH_IN_PROGRESS | PROC_DIRTY_DEFER)) == 0) {
		ret = EINVAL;
		goto exit;
	}

	if (pcontrol & PROC_DIRTY_LAUNCH_IN_PROGRESS) {
		p->p_memstat_dirty &= ~P_DIRTY_LAUNCH_IN_PROGRESS;
	}

	/* This can be set and cleared exactly once. */
	if (pcontrol & PROC_DIRTY_DEFER) {

		if (p->p_memstat_dirty & P_DIRTY_DEFER) {

			p->p_memstat_dirty &= ~P_DIRTY_DEFER;

			memorystatus_invalidate_idle_demotion_locked(p, TRUE);
			memorystatus_update_idle_priority_locked(p);
			memorystatus_reschedule_idle_demotion_locked();
		}
	}

	ret = 0;
exit:
	proc_list_unlock();

	return ret;
}
1664
1665int
1666memorystatus_dirty_get(proc_t p) {
1667	int ret = 0;
1668
1669	proc_list_lock();
1670
1671	if (p->p_memstat_dirty & P_DIRTY_TRACK) {
1672		ret |= PROC_DIRTY_TRACKED;
1673		if (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) {
1674			ret |= PROC_DIRTY_ALLOWS_IDLE_EXIT;
1675		}
1676		if (p->p_memstat_dirty & P_DIRTY) {
1677			ret |= PROC_DIRTY_IS_DIRTY;
1678		}
1679		if (p->p_memstat_dirty & P_DIRTY_LAUNCH_IN_PROGRESS) {
1680			ret |= PROC_DIRTY_LAUNCH_IS_IN_PROGRESS;
1681		}
1682	}
1683
1684	proc_list_unlock();
1685
1686	return ret;
1687}
1688
1689int
1690memorystatus_on_terminate(proc_t p) {
1691	int sig;
1692
1693	proc_list_lock();
1694
1695	p->p_memstat_dirty |= P_DIRTY_TERMINATED;
1696
1697	if ((p->p_memstat_dirty & (P_DIRTY_TRACK|P_DIRTY_IS_DIRTY)) == P_DIRTY_TRACK) {
1698		/* Clean; mark as terminated and issue SIGKILL */
1699		sig = SIGKILL;
1700	} else {
1701		/* Dirty, terminated, or state tracking is unsupported; issue SIGTERM to allow cleanup */
1702		sig = SIGTERM;
1703	}
1704
1705	proc_list_unlock();
1706
1707	return sig;
1708}
1709
1710void
1711memorystatus_on_suspend(proc_t p)
1712{
1713#if CONFIG_FREEZE
1714	uint32_t pages;
1715	memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
1716#endif
1717	proc_list_lock();
1718#if CONFIG_FREEZE
1719	p->p_memstat_suspendedfootprint = pages;
1720	memorystatus_suspended_footprint_total += pages;
1721	memorystatus_suspended_count++;
1722#endif
1723	p->p_memstat_state |= P_MEMSTAT_SUSPENDED;
1724	proc_list_unlock();
1725}
1726
1727void
1728memorystatus_on_resume(proc_t p)
1729{
1730#if CONFIG_FREEZE
1731	boolean_t frozen;
1732	pid_t pid;
1733#endif
1734
1735	proc_list_lock();
1736
1737#if CONFIG_FREEZE
1738	frozen = (p->p_memstat_state & P_MEMSTAT_FROZEN);
1739	if (frozen) {
1740		memorystatus_frozen_count--;
1741		p->p_memstat_state |= P_MEMSTAT_PRIOR_THAW;
1742	}
1743
1744	memorystatus_suspended_footprint_total -= p->p_memstat_suspendedfootprint;
1745	memorystatus_suspended_count--;
1746
1747	pid = p->p_pid;
1748#endif
1749
1750	p->p_memstat_state &= ~(P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN);
1751
1752	proc_list_unlock();
1753
1754#if CONFIG_FREEZE
1755	if (frozen) {
1756		memorystatus_freeze_entry_t data = { pid, FALSE, 0 };
1757		memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));
1758	}
1759#endif
1760}
1761
1762void
1763memorystatus_on_inactivity(proc_t p)
1764{
1765#pragma unused(p)
1766#if CONFIG_FREEZE
1767	/* Wake the freeze thread */
1768	thread_wakeup((event_t)&memorystatus_freeze_wakeup);
1769#endif
1770}
1771
1772static uint32_t
1773memorystatus_build_state(proc_t p) {
1774	uint32_t snapshot_state = 0;
1775
1776	/* General */
1777	if (p->p_memstat_state & P_MEMSTAT_SUSPENDED) {
1778		snapshot_state |= kMemorystatusSuspended;
1779	}
1780	if (p->p_memstat_state & P_MEMSTAT_FROZEN) {
1781		snapshot_state |= kMemorystatusFrozen;
1782	}
1783	if (p->p_memstat_state & P_MEMSTAT_PRIOR_THAW) {
		snapshot_state |= kMemorystatusWasThawed;
1785	}
1786
1787	/* Tracking */
1788	if (p->p_memstat_dirty & P_DIRTY_TRACK) {
1789		snapshot_state |= kMemorystatusTracked;
1790	}
1791	if ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) {
1792		snapshot_state |= kMemorystatusSupportsIdleExit;
1793	}
1794	if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
1795		snapshot_state |= kMemorystatusDirty;
1796	}
1797
1798	return snapshot_state;
1799}
1800
1801#if !CONFIG_JETSAM
1802
1803static boolean_t
1804kill_idle_exit_proc(void)
1805{
1806	proc_t p, victim_p = PROC_NULL;
1807	uint64_t current_time;
1808	boolean_t killed = FALSE;
1809	unsigned int i = 0;
1810
1811	/* Pick next idle exit victim. */
1812	current_time = mach_absolute_time();
1813
1814	proc_list_lock();
1815
1816	p = memorystatus_get_first_proc_locked(&i, FALSE);
1817	while (p) {
1818		/* No need to look beyond the idle band */
1819		if (p->p_memstat_effectivepriority != JETSAM_PRIORITY_IDLE) {
1820			break;
1821		}
1822
1823		if ((p->p_memstat_dirty & (P_DIRTY_ALLOW_IDLE_EXIT|P_DIRTY_IS_DIRTY|P_DIRTY_TERMINATED)) == (P_DIRTY_ALLOW_IDLE_EXIT)) {
1824			if (current_time >= p->p_memstat_idledeadline) {
1825				p->p_memstat_dirty |= P_DIRTY_TERMINATED;
1826				victim_p = proc_ref_locked(p);
1827				break;
1828			}
1829		}
1830
1831		p = memorystatus_get_next_proc_locked(&i, p, FALSE);
1832	}
1833
1834	proc_list_unlock();
1835
1836	if (victim_p) {
1837		printf("memorystatus_thread: idle exiting pid %d [%s]\n", victim_p->p_pid, (victim_p->p_comm ? victim_p->p_comm : "(unknown)"));
1838		killed = memorystatus_do_kill(victim_p, kMemorystatusKilledIdleExit);
1839		proc_rele(victim_p);
1840	}
1841
1842	return killed;
1843}
1844#endif
1845
1846#if CONFIG_JETSAM
1847static void
1848memorystatus_thread_wake(void) {
1849	thread_wakeup((event_t)&memorystatus_wakeup);
1850}
1851#endif /* CONFIG_JETSAM */
1852
1853extern void vm_pressure_response(void);
1854
1855static int
1856memorystatus_thread_block(uint32_t interval_ms, thread_continue_t continuation)
1857{
1858	if (interval_ms) {
1859		assert_wait_timeout(&memorystatus_wakeup, THREAD_UNINT, interval_ms, 1000 * NSEC_PER_USEC);
1860	} else {
1861		assert_wait(&memorystatus_wakeup, THREAD_UNINT);
1862	}
1863
1864	return thread_block(continuation);
1865}
1866
1867static void
1868memorystatus_thread(void *param __unused, wait_result_t wr __unused)
1869{
1870	static boolean_t is_vm_privileged = FALSE;
1871#if CONFIG_JETSAM
1872	boolean_t post_snapshot = FALSE;
1873	uint32_t errors = 0;
1874	uint32_t hwm_kill = 0;
1875#endif
1876
1877	if (is_vm_privileged == FALSE) {
1878		/*
1879		 * It's the first time the thread has run, so just mark the thread as privileged and block.
1880		 * This avoids a spurious pass with unset variables, as set out in <rdar://problem/9609402>.
1881		 */
1882		thread_wire(host_priv_self(), current_thread(), TRUE);
1883		is_vm_privileged = TRUE;
1884
1885		memorystatus_thread_block(0, memorystatus_thread);
1886	}
1887
1888#if CONFIG_JETSAM
1889
1890	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_SCAN) | DBG_FUNC_START,
1891		memorystatus_available_pages, 0, 0, 0, 0);
1892
1893	/*
1894	 * Jetsam aware version.
1895	 *
	 * The VM pressure notification thread is working its way through clients in parallel.
	 *
	 * So, while the pressure notification thread is targeting processes in order of
	 * increasing jetsam priority, we can hopefully reduce or stop its work by killing
	 * any processes that have exceeded their highwater mark.
1901	 *
1902	 * If we run out of HWM processes and our available pages drops below the critical threshold, then,
1903	 * we target the least recently used process in order of increasing jetsam priority (exception: the FG band).
1904	 */
1905	while (is_thrashing(kill_under_pressure_cause) ||
1906	       memorystatus_available_pages <= memorystatus_available_pages_pressure) {
1907		boolean_t killed;
1908		int32_t priority;
1909		uint32_t cause;
1910
1911		if (kill_under_pressure_cause) {
1912			cause = kill_under_pressure_cause;
1913		} else {
1914			cause = kMemorystatusKilledVMPageShortage;
1915		}
1916
1917#if LEGACY_HIWATER
1918		/* Highwater */
1919		killed = memorystatus_kill_hiwat_proc(&errors);
1920		if (killed) {
1921			hwm_kill++;
1922			post_snapshot = TRUE;
1923			goto done;
1924		} else {
1925			memorystatus_hwm_candidates = FALSE;
1926		}
1927
1928		/* No highwater processes to kill. Continue or stop for now? */
1929		if (!is_thrashing(kill_under_pressure_cause) &&
1930		    (memorystatus_available_pages > memorystatus_available_pages_critical)) {
1931			/*
1932			 * We are _not_ out of pressure but we are above the critical threshold and there's:
1933			 * - no compressor thrashing
1934			 * - no more HWM processes left.
1935			 * For now, don't kill any other processes.
1936			 */
1937
1938			if (hwm_kill == 0) {
				memorystatus_thread_wasted_wakeup++;
1940			}
1941
1942			break;
1943		}
1944#endif
1945
1946		/* LRU */
1947		killed = memorystatus_kill_top_process(TRUE, cause, &priority, &errors);
1948		if (killed) {
1949			/* Don't generate logs for steady-state idle-exit kills (unless overridden for debug) */
1950			if ((priority != JETSAM_PRIORITY_IDLE) || memorystatus_idle_snapshot) {
				post_snapshot = TRUE;
1952			}
1953			goto done;
1954		}
1955
1956		if (memorystatus_available_pages <= memorystatus_available_pages_critical) {
1957			/* Under pressure and unable to kill a process - panic */
1958			panic("memorystatus_jetsam_thread: no victim! available pages:%d\n", memorystatus_available_pages);
1959		}
1960
1961done:
1962
1963		/*
1964		 * We do not want to over-kill when thrashing has been detected.
1965		 * To avoid that, we reset the flag here and notify the
1966		 * compressor.
1967		 */
1968		if (is_thrashing(kill_under_pressure_cause)) {
1969			kill_under_pressure_cause = 0;
1970			vm_thrashing_jetsam_done();
1971		}
1972	}
1973
1974	kill_under_pressure_cause = 0;
1975
1976	if (errors) {
1977		memorystatus_clear_errors();
1978	}
1979
1980#if VM_PRESSURE_EVENTS
1981	/*
1982	 * LD: We used to target the foreground process first and foremost here.
1983	 * Now, we target all processes, starting from the non-suspended, background
1984	 * processes first. We will target foreground too.
1985	 *
1986	 * memorystatus_update_vm_pressure(TRUE);
1987	 */
1988	//vm_pressure_response();
1989#endif
1990
1991	if (post_snapshot) {
1992		size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
1993			sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count);
1994		memorystatus_jetsam_snapshot->notification_time = mach_absolute_time();
1995		memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size));
1996	}
1997
1998	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_SCAN) | DBG_FUNC_END,
1999		memorystatus_available_pages, 0, 0, 0, 0);
2000
2001#else /* CONFIG_JETSAM */
2002
2003	/*
2004	 * Jetsam not enabled
2005	 */
2006
2007#endif /* CONFIG_JETSAM */
2008
2009	memorystatus_thread_block(0, memorystatus_thread);
2010}
2011
2012#if !CONFIG_JETSAM
2013/*
2014 * Returns TRUE:
2015 * 	when an idle-exitable proc was killed
2016 * Returns FALSE:
2017 *	when there are no more idle-exitable procs found
2018 * 	when the attempt to kill an idle-exitable proc failed
2019 */
2020boolean_t memorystatus_idle_exit_from_VM(void) {
2021	return(kill_idle_exit_proc());
2022}
2023#endif /* !CONFIG_JETSAM */
2024
2025#if CONFIG_JETSAM
2026
2027/*
2028 * Callback invoked when allowable physical memory footprint exceeded
2029 * (dirty pages + IOKit mappings)
2030 *
 * This is invoked both for advisory, non-fatal per-task high watermarks
 * and for fatal task memory limits.
2033 */
2034void
2035memorystatus_on_ledger_footprint_exceeded(boolean_t warning, const int max_footprint_mb)
2036{
2037	proc_t p = current_proc();
2038
	if (warning == FALSE) {
2040		printf("process %d (%s) exceeded physical memory footprint limit of %d MB\n",
2041		       p->p_pid, p->p_comm, max_footprint_mb);
2042	}
2043
2044#if VM_PRESSURE_EVENTS
2045	if (warning == TRUE) {
2046		if (memorystatus_warn_process(p->p_pid, TRUE /* critical? */) != TRUE) {
2047			/* Print warning, since it's possible that task has not registered for pressure notifications */
2048			printf("task_exceeded_footprint: failed to warn the current task (exiting, or no handler registered?).\n");
2049		}
2050		return;
2051	}
2052#endif /* VM_PRESSURE_EVENTS */
2053
2054	if ((p->p_memstat_state & P_MEMSTAT_FATAL_MEMLIMIT) == P_MEMSTAT_FATAL_MEMLIMIT) {
2055		/*
2056		 * If this process has no high watermark or has a fatal task limit, then we have been invoked because the task
2057		 * has violated either the system-wide per-task memory limit OR its own task limit.
2058		 */
2059		if (memorystatus_kill_process_sync(p->p_pid, kMemorystatusKilledPerProcessLimit) != TRUE) {
2060			printf("task_exceeded_footprint: failed to kill the current task (exiting?).\n");
2061		}
2062	} else {
2063		/*
2064		 * HWM offender exists. Done without locks or synchronization.
2065		 * See comment near its declaration for more details.
2066		 */
2067		memorystatus_hwm_candidates = TRUE;
2068	}
2069}
2070
2071/*
2072 * This is invoked when cpulimits have been exceeded while in fatal mode.
2073 * The jetsam_flags do not apply as those are for memory related kills.
2074 * We call this routine so that the offending process is killed with
2075 * a non-zero exit status.
2076 */
2077void
2078jetsam_on_ledger_cpulimit_exceeded(void)
2079{
2080	int retval = 0;
2081	int jetsam_flags = 0;  /* make it obvious */
2082	proc_t p = current_proc();
2083
2084	printf("task_exceeded_cpulimit: killing pid %d [%s]\n",
2085	       p->p_pid, (p->p_comm ? p->p_comm : "(unknown)"));
2086
2087	retval = jetsam_do_kill(p, jetsam_flags);
2088
2089	if (retval) {
2090		printf("task_exceeded_cpulimit: failed to kill current task (exiting?).\n");
2091	}
2092}
2093
2094static void
2095memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages)
2096{
2097	assert(task);
2098	assert(footprint);
2099
2100	*footprint = (uint32_t)(get_task_phys_footprint(task) / PAGE_SIZE_64);
2101	if (max_footprint) {
2102		*max_footprint = (uint32_t)(get_task_phys_footprint_max(task) / PAGE_SIZE_64);
2103	}
2104	if (max_footprint_lifetime) {
2105		*max_footprint_lifetime = (uint32_t)(get_task_resident_max(task) / PAGE_SIZE_64);
2106	}
2107	if (purgeable_pages) {
2108		*purgeable_pages = (uint32_t)(get_task_purgeable_size(task) / PAGE_SIZE_64);
2109	}
2110}
2111
2112
2113static void
2114memorystatus_update_snapshot_locked(proc_t p, uint32_t kill_cause)
2115{
2116	unsigned int i;
2117
2118	for (i = 0; i < memorystatus_jetsam_snapshot_count; i++) {
2119		if (memorystatus_jetsam_snapshot_list[i].pid == p->p_pid) {
2120			/* Update if the priority has changed since the snapshot was taken */
2121			if (memorystatus_jetsam_snapshot_list[i].priority != p->p_memstat_effectivepriority) {
2122				memorystatus_jetsam_snapshot_list[i].priority = p->p_memstat_effectivepriority;
2123				strlcpy(memorystatus_jetsam_snapshot_list[i].name, p->p_comm, MAXCOMLEN+1);
2124				memorystatus_jetsam_snapshot_list[i].state = memorystatus_build_state(p);
2125				memorystatus_jetsam_snapshot_list[i].user_data = p->p_memstat_userdata;
2126				memorystatus_jetsam_snapshot_list[i].fds = p->p_fd->fd_nfiles;
2127			}
2128			memorystatus_jetsam_snapshot_list[i].killed = kill_cause;
2129			return;
2130		}
2131	}
2132}
2133
2134void memorystatus_pages_update(unsigned int pages_avail)
2135{
2136	memorystatus_available_pages = pages_avail;
2137
2138#if VM_PRESSURE_EVENTS
2139	/*
2140	 * Since memorystatus_available_pages changes, we should
2141	 * re-evaluate the pressure levels on the system and
2142	 * check if we need to wake the pressure thread.
2143	 * We also update memorystatus_level in that routine.
2144	 */
2145	vm_pressure_response();
2146
2147	if (memorystatus_available_pages <= memorystatus_available_pages_pressure) {
2148
2149		if (memorystatus_hwm_candidates || (memorystatus_available_pages <= memorystatus_available_pages_critical)) {
2150			memorystatus_thread_wake();
2151		}
2152	}
2153#else /* VM_PRESSURE_EVENTS */
2154
2155	boolean_t critical, delta;
2156
2157	if (!memorystatus_delta) {
2158	    return;
2159	}
2160
2161	critical = (pages_avail < memorystatus_available_pages_critical) ? TRUE : FALSE;
2162	delta = ((pages_avail >= (memorystatus_available_pages + memorystatus_delta))
2163                || (memorystatus_available_pages >= (pages_avail + memorystatus_delta))) ? TRUE : FALSE;
2164
2165	if (critical || delta) {
2166  		memorystatus_level = memorystatus_available_pages * 100 / atop_64(max_mem);
2167		memorystatus_thread_wake();
2168	}
2169#endif /* VM_PRESSURE_EVENTS */
2170}
2171
2172static boolean_t
2173memorystatus_get_snapshot_properties_for_proc_locked(proc_t p, memorystatus_jetsam_snapshot_entry_t *entry)
2174{
2175	clock_sec_t                     tv_sec;
2176	clock_usec_t                    tv_usec;
2177
2178	memset(entry, 0, sizeof(memorystatus_jetsam_snapshot_entry_t));
2179
2180	entry->pid = p->p_pid;
2181	strlcpy(&entry->name[0], p->p_comm, MAXCOMLEN+1);
2182	entry->priority = p->p_memstat_effectivepriority;
2183	memorystatus_get_task_page_counts(p->task, &entry->pages, &entry->max_pages, &entry->max_pages_lifetime, &entry->purgeable_pages);
2184	entry->state = memorystatus_build_state(p);
2185	entry->user_data = p->p_memstat_userdata;
2186	memcpy(&entry->uuid[0], &p->p_uuid[0], sizeof(p->p_uuid));
2187	entry->fds = p->p_fd->fd_nfiles;
2188
2189	absolutetime_to_microtime(get_task_cpu_time(p->task), &tv_sec, &tv_usec);
2190	entry->cpu_time.tv_sec = tv_sec;
2191	entry->cpu_time.tv_usec = tv_usec;
2192
2193	return TRUE;
2194}
2195
2196static void
2197memorystatus_jetsam_snapshot_procs_locked(void)
2198{
2199	proc_t p, next_p;
2200	unsigned int b = 0, i = 0;
2201	kern_return_t kr = KERN_SUCCESS;
2202
2203	mach_msg_type_number_t	count = HOST_VM_INFO64_COUNT;
2204	vm_statistics64_data_t	vm_stat;
2205
	if ((kr = host_statistics64(host_self(), HOST_VM_INFO64, (host_info64_t)&vm_stat, &count)) != KERN_SUCCESS) {
2207		printf("memorystatus_jetsam_snapshot_procs_locked: host_statistics64 failed with %d\n", kr);
2208		memset(&memorystatus_jetsam_snapshot->stats, 0, sizeof(memorystatus_jetsam_snapshot->stats));
2209	} else {
2210		memorystatus_jetsam_snapshot->stats.free_pages = vm_stat.free_count;
2211		memorystatus_jetsam_snapshot->stats.active_pages = vm_stat.active_count;
2212		memorystatus_jetsam_snapshot->stats.inactive_pages = vm_stat.inactive_count;
2213		memorystatus_jetsam_snapshot->stats.throttled_pages = vm_stat.throttled_count;
2214		memorystatus_jetsam_snapshot->stats.purgeable_pages = vm_stat.purgeable_count;
2215		memorystatus_jetsam_snapshot->stats.wired_pages = vm_stat.wire_count;
2216
2217		memorystatus_jetsam_snapshot->stats.speculative_pages = vm_stat.speculative_count;
2218		memorystatus_jetsam_snapshot->stats.filebacked_pages = vm_stat.external_page_count;
2219		memorystatus_jetsam_snapshot->stats.anonymous_pages = vm_stat.internal_page_count;
2220		memorystatus_jetsam_snapshot->stats.compressions = vm_stat.compressions;
2221		memorystatus_jetsam_snapshot->stats.decompressions = vm_stat.decompressions;
2222		memorystatus_jetsam_snapshot->stats.compressor_pages = vm_stat.compressor_page_count;
2223		memorystatus_jetsam_snapshot->stats.total_uncompressed_pages_in_compressor = vm_stat.total_uncompressed_pages_in_compressor;
2224	}
2225
2226	next_p = memorystatus_get_first_proc_locked(&b, TRUE);
2227	while (next_p) {
2228		p = next_p;
2229		next_p = memorystatus_get_next_proc_locked(&b, p, TRUE);
2230
2231		if (FALSE == memorystatus_get_snapshot_properties_for_proc_locked(p, &memorystatus_jetsam_snapshot_list[i])) {
2232			continue;
2233		}
2234
2235		MEMORYSTATUS_DEBUG(0, "jetsam snapshot pid = %d, uuid = %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n",
2236			p->p_pid,
2237			p->p_uuid[0], p->p_uuid[1], p->p_uuid[2], p->p_uuid[3], p->p_uuid[4], p->p_uuid[5], p->p_uuid[6], p->p_uuid[7],
2238			p->p_uuid[8], p->p_uuid[9], p->p_uuid[10], p->p_uuid[11], p->p_uuid[12], p->p_uuid[13], p->p_uuid[14], p->p_uuid[15]);
2239
2240		if (++i == memorystatus_jetsam_snapshot_max) {
2241			break;
2242		}
2243	}
2244
2245	memorystatus_jetsam_snapshot->snapshot_time = mach_absolute_time();
2246	memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = i;
2247}
2248
2249#if DEVELOPMENT || DEBUG
2250
2251static int
2252memorystatus_cmd_set_panic_bits(user_addr_t buffer, uint32_t buffer_size) {
2253	int ret;
2254	memorystatus_jetsam_panic_options_t debug;
2255
2256	if (buffer_size != sizeof(memorystatus_jetsam_panic_options_t)) {
2257		return EINVAL;
2258	}
2259
2260	ret = copyin(buffer, &debug, buffer_size);
2261	if (ret) {
2262		return ret;
2263	}
2264
2265	/* Panic bits match kMemorystatusKilled* enum */
2266	memorystatus_jetsam_panic_debug = (memorystatus_jetsam_panic_debug & ~debug.mask) | (debug.data & debug.mask);
2267
2268	/* Copyout new value */
2269	debug.data = memorystatus_jetsam_panic_debug;
2270	ret = copyout(&debug, buffer, sizeof(memorystatus_jetsam_panic_options_t));
2271
2272	return ret;
2273}
2274
2275#endif
2276
2277/*
2278 * Jetsam a specific process.
2279 */
2280static boolean_t
2281memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause) {
2282	boolean_t killed;
2283	proc_t p;
2284
2285	/* TODO - add a victim queue and push this into the main jetsam thread */
2286
2287	p = proc_find(victim_pid);
2288	if (!p) {
2289		return FALSE;
2290	}
2291
2292	printf("memorystatus: specifically killing pid %d [%s] (%s) - memorystatus_available_pages: %d\n",
2293		victim_pid, (p->p_comm ? p->p_comm : "(unknown)"),
2294	        jetsam_kill_cause_name[cause], memorystatus_available_pages);
2295
2296	proc_list_lock();
2297
2298	if (memorystatus_jetsam_snapshot_count == 0) {
2299		memorystatus_jetsam_snapshot_procs_locked();
2300	}
2301
2302	memorystatus_update_snapshot_locked(p, cause);
2303	proc_list_unlock();
2304
2305	killed = memorystatus_do_kill(p, cause);
2306	proc_rele(p);
2307
2308	return killed;
2309}
2310
2311/*
2312 * Jetsam the first process in the queue.
2313 */
2314static boolean_t
2315memorystatus_kill_top_process(boolean_t any, uint32_t cause, int32_t *priority, uint32_t *errors)
2316{
2317	pid_t aPid;
2318	proc_t p = PROC_NULL, next_p = PROC_NULL;
2319	boolean_t new_snapshot = FALSE, killed = FALSE;
2320	unsigned int i = 0;
2321
2322#ifndef CONFIG_FREEZE
2323#pragma unused(any)
2324#endif
2325
2326	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
2327		memorystatus_available_pages, 0, 0, 0, 0);
2328
2329	proc_list_lock();
2330
2331	memorystatus_sort_by_largest_process_locked(JETSAM_PRIORITY_FOREGROUND);
2332
2333	next_p = memorystatus_get_first_proc_locked(&i, TRUE);
2334	while (next_p) {
2335#if DEVELOPMENT || DEBUG
2336		int activeProcess;
2337		int procSuspendedForDiagnosis;
2338#endif /* DEVELOPMENT || DEBUG */
2339
2340		p = next_p;
2341		next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
2342
2343#if DEVELOPMENT || DEBUG
2344		activeProcess = p->p_memstat_state & P_MEMSTAT_FOREGROUND;
2345		procSuspendedForDiagnosis = p->p_memstat_state & P_MEMSTAT_DIAG_SUSPENDED;
2346#endif /* DEVELOPMENT || DEBUG */
2347
2348		aPid = p->p_pid;
2349
2350		if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) {
2351			continue;
2352		}
2353
2354#if DEVELOPMENT || DEBUG
2355		if ((memorystatus_jetsam_policy & kPolicyDiagnoseActive) && procSuspendedForDiagnosis) {
2356			printf("jetsam: continuing after ignoring proc suspended already for diagnosis - %d\n", aPid);
2357			continue;
2358		}
2359#endif /* DEVELOPMENT || DEBUG */
2360
		if (cause == kMemorystatusKilledVnodes) {
2363			/*
2364			 * If the system runs out of vnodes, we systematically jetsam
2365			 * processes in hopes of stumbling onto a vnode gain that helps
2366			 * the system recover.  The process that happens to trigger
2367			 * this path has no known relationship to the vnode consumption.
			 * We attempt to safeguard that process, i.e., we do not jetsam it.
2369			 */
2370
2371			if (p == current_proc()) {
2372				/* do not jetsam the current process */
2373				continue;
2374			}
2375		}
2376
2377#if CONFIG_FREEZE
2378		boolean_t skip;
2379		boolean_t reclaim_proc = !(p->p_memstat_state & (P_MEMSTAT_LOCKED | P_MEMSTAT_NORECLAIM));
2380		if (any || reclaim_proc) {
2381			skip = FALSE;
2382		} else {
2383			skip = TRUE;
2384		}
2385
2386		if (skip) {
2387			continue;
2388		} else
2389#endif
2390		{
2391			if (priority) {
2392				*priority = p->p_memstat_effectivepriority;
2393			}
2394
2395		        /*
2396		         * Capture a snapshot if none exists and:
2397		         * - priority was not requested (this is something other than an ambient kill)
2398		         * - the priority was requested *and* the targeted process is not at idle priority
2399		         */
2400                	if ((memorystatus_jetsam_snapshot_count == 0) &&
2401                		(memorystatus_idle_snapshot || ((!priority) || (priority && (*priority != JETSAM_PRIORITY_IDLE))))) {
2402                		memorystatus_jetsam_snapshot_procs_locked();
2403                		new_snapshot = TRUE;
2404                	}
2405
2406			/*
2407			 * Mark as terminated so that if exit1() indicates success, but the process (for example)
2408			 * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
2409			 * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
2410			 * acquisition of the proc lock.
2411			 */
2412			p->p_memstat_state |= P_MEMSTAT_TERMINATED;
2413
2414#if DEVELOPMENT || DEBUG
2415			if ((memorystatus_jetsam_policy & kPolicyDiagnoseActive) && activeProcess) {
2416				MEMORYSTATUS_DEBUG(1, "jetsam: suspending pid %d [%s] (active) for diagnosis - memory_status_level: %d\n",
2417					aPid, (p->p_comm ? p->p_comm: "(unknown)"), memorystatus_level);
2418				memorystatus_update_snapshot_locked(p, kMemorystatusKilledDiagnostic);
2419				p->p_memstat_state |= P_MEMSTAT_DIAG_SUSPENDED;
2420				if (memorystatus_jetsam_policy & kPolicyDiagnoseFirst) {
2421					jetsam_diagnostic_suspended_one_active_proc = 1;
2422					printf("jetsam: returning after suspending first active proc - %d\n", aPid);
2423				}
2424
2425				p = proc_ref_locked(p);
2426				proc_list_unlock();
2427				if (p) {
2428					task_suspend(p->task);
2429					proc_rele(p);
2430					killed = TRUE;
2431				}
2432
2433				goto exit;
2434			} else
2435#endif /* DEVELOPMENT || DEBUG */
2436			{
2437				/* Shift queue, update stats */
2438				memorystatus_update_snapshot_locked(p, cause);
2439
2440				p = proc_ref_locked(p);
2441				proc_list_unlock();
2442				if (p) {
2443					printf("memorystatus: %s %d [%s] (%s) - memorystatus_available_pages: %d\n",
2444					    ((p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) ?
2445					    "idle exiting pid" : "jetsam killing pid"),
2446					    aPid, (p->p_comm ? p->p_comm : "(unknown)"),
2447					    jetsam_kill_cause_name[cause], memorystatus_available_pages);
2448					killed = memorystatus_do_kill(p, cause);
2449				}
2450
2451				/* Success? */
2452				if (killed) {
2453					proc_rele(p);
2454					goto exit;
2455				}
2456
2457				/* Failure - unwind and restart. */
2458				proc_list_lock();
2459				proc_rele_locked(p);
2460				p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
2461				p->p_memstat_state |= P_MEMSTAT_ERROR;
2462				*errors += 1;
2463				i = 0;
2464				next_p = memorystatus_get_first_proc_locked(&i, TRUE);
2465			}
2466		}
2467	}
2468
2469	proc_list_unlock();
2470
2471exit:
2472	/* Clear snapshot if freshly captured and no target was found */
2473	if (new_snapshot && !killed) {
		memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
2475	}
2476
2477	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
2478	    memorystatus_available_pages, killed ? aPid : 0, 0, 0, 0);
2479
2480	return killed;
2481}
2482
2483#if LEGACY_HIWATER
2484
2485static boolean_t
2486memorystatus_kill_hiwat_proc(uint32_t *errors)
2487{
2488	pid_t aPid = 0;
2489	proc_t p = PROC_NULL, next_p = PROC_NULL;
2490	boolean_t new_snapshot = FALSE, killed = FALSE;
2491	unsigned int i = 0;
2492
2493	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_START,
2494		memorystatus_available_pages, 0, 0, 0, 0);
2495
2496	proc_list_lock();
2497	memorystatus_sort_by_largest_process_locked(JETSAM_PRIORITY_FOREGROUND);
2498
2499	next_p = memorystatus_get_first_proc_locked(&i, TRUE);
2500	while (next_p) {
2501		uint32_t footprint;
2502		boolean_t skip;
2503
2504		p = next_p;
2505		next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
2506
2507		aPid = p->p_pid;
2508
2509		if (p->p_memstat_state  & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) {
2510			continue;
2511		}
2512
2513		/* skip if no limit set */
2514		if (p->p_memstat_memlimit <= 0) {
2515			continue;
2516		}
2517
2518		/* skip if a currently inapplicable limit is encountered */
2519		if ((p->p_memstat_state & P_MEMSTAT_MEMLIMIT_BACKGROUND) && (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND)) {
2520			continue;
2521		}
2522
2523		footprint = (uint32_t)(get_task_phys_footprint(p->task) / (1024 * 1024));
2524		skip = (((int32_t)footprint) <= p->p_memstat_memlimit);
2525#if DEVELOPMENT || DEBUG
2526		if (!skip && (memorystatus_jetsam_policy & kPolicyDiagnoseActive)) {
2527			if (p->p_memstat_state & P_MEMSTAT_DIAG_SUSPENDED) {
2528				continue;
2529			}
2530		}
2531#endif /* DEVELOPMENT || DEBUG */
2532
2533#if CONFIG_FREEZE
2534		if (!skip) {
2535			if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
2536				skip = TRUE;
2537			} else {
2538				skip = FALSE;
2539			}
2540		}
2541#endif
2542
2543		if (skip) {
2544			continue;
2545		} else {
2546			MEMORYSTATUS_DEBUG(1, "jetsam: %s pid %d [%s] - %d Mb > 1 (%d Mb)\n",
2547				(memorystatus_jetsam_policy & kPolicyDiagnoseActive) ? "suspending": "killing", aPid, p->p_comm, footprint, p->p_memstat_memlimit);
2548
2549			if (memorystatus_jetsam_snapshot_count == 0) {
2550                		memorystatus_jetsam_snapshot_procs_locked();
2551                		new_snapshot = TRUE;
2552                	}
2553
2554			p->p_memstat_state |= P_MEMSTAT_TERMINATED;
2555
2556#if DEVELOPMENT || DEBUG
2557			if (memorystatus_jetsam_policy & kPolicyDiagnoseActive) {
2558			        MEMORYSTATUS_DEBUG(1, "jetsam: pid %d suspended for diagnosis - memorystatus_available_pages: %d\n", aPid, memorystatus_available_pages);
2559				memorystatus_update_snapshot_locked(p, kMemorystatusKilledDiagnostic);
2560				p->p_memstat_state |= P_MEMSTAT_DIAG_SUSPENDED;
2561
2562				p = proc_ref_locked(p);
2563				proc_list_unlock();
2564				if (p) {
2565					task_suspend(p->task);
2566					proc_rele(p);
2567					killed = TRUE;
2568				}
2569
2570				goto exit;
2571			} else
2572#endif /* DEVELOPMENT || DEBUG */
2573			{
2574				memorystatus_update_snapshot_locked(p, kMemorystatusKilledHiwat);
2575
2576				p = proc_ref_locked(p);
2577				proc_list_unlock();
2578				if (p) {
2579				    printf("memorystatus: jetsam killing pid %d [%s] (highwater) - memorystatus_available_pages: %d\n",
2580        					aPid, (p->p_comm ? p->p_comm : "(unknown)"), memorystatus_available_pages);
2581				    killed = memorystatus_do_kill(p, kMemorystatusKilledHiwat);
2582				}
2583
2584				/* Success? */
2585				if (killed) {
2586					proc_rele(p);
2587					goto exit;
2588				}
2589
2590				/* Failure - unwind and restart. */
2591				proc_list_lock();
2592				proc_rele_locked(p);
2593				p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
2594				p->p_memstat_state |= P_MEMSTAT_ERROR;
2595				*errors += 1;
2596				i = 0;
2597				next_p = memorystatus_get_first_proc_locked(&i, TRUE);
2598			}
2599		}
2600	}
2601
2602	proc_list_unlock();
2603
2604exit:
2605	/* Clear snapshot if freshly captured and no target was found */
2606	if (new_snapshot && !killed) {
2607		memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
2608	}
2609
2610	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_END,
2611	    memorystatus_available_pages, killed ? aPid : 0, 0, 0, 0);
2612
2613	return killed;
2614}
2615
2616#endif /* LEGACY_HIWATER */
2617
2618static boolean_t
2619memorystatus_kill_process_async(pid_t victim_pid, uint32_t cause) {
2620	/* TODO: allow a general async path */
2621	if ((victim_pid != -1) || (cause != kMemorystatusKilledVMPageShortage && cause != kMemorystatusKilledVMThrashing &&
2622				   cause != kMemorystatusKilledFCThrashing)) {
2623		return FALSE;
2624	}
2625
2626	kill_under_pressure_cause = cause;
2627	memorystatus_thread_wake();
2628	return TRUE;
2629}
2630
2631static boolean_t
2632memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause) {
2633	boolean_t res;
2634	uint32_t errors = 0;
2635
2636	if (victim_pid == -1) {
2637		/* No pid, so kill first process */
2638		res = memorystatus_kill_top_process(TRUE, cause, NULL, &errors);
2639	} else {
2640		res = memorystatus_kill_specific_process(victim_pid, cause);
2641	}
2642
2643	if (errors) {
2644		memorystatus_clear_errors();
2645	}
2646
2647	if (res == TRUE) {
2648		/* Fire off snapshot notification */
2649		size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
2650			sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_count;
2651		memorystatus_jetsam_snapshot->notification_time = mach_absolute_time();
2652		memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size));
2653	}
2654
2655	return res;
2656}
2657
2658boolean_t
2659memorystatus_kill_on_VM_page_shortage(boolean_t async) {
2660	if (async) {
2661		return memorystatus_kill_process_async(-1, kMemorystatusKilledVMPageShortage);
2662	} else {
2663		return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMPageShortage);
2664	}
2665}
2666
2667boolean_t
2668memorystatus_kill_on_VM_thrashing(boolean_t async) {
2669	if (async) {
2670		return memorystatus_kill_process_async(-1, kMemorystatusKilledVMThrashing);
2671	} else {
2672		return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMThrashing);
2673	}
2674}
2675
2676boolean_t
2677memorystatus_kill_on_FC_thrashing(boolean_t async) {
2678	if (async) {
2679		return memorystatus_kill_process_async(-1, kMemorystatusKilledFCThrashing);
2680	} else {
2681		return memorystatus_kill_process_sync(-1, kMemorystatusKilledFCThrashing);
2682	}
2683}
2684
2685boolean_t
2686memorystatus_kill_on_vnode_limit(void) {
2687	return memorystatus_kill_process_sync(-1, kMemorystatusKilledVnodes);
2688}
2689
2690#endif /* CONFIG_JETSAM */
2691
2692#if CONFIG_FREEZE
2693
2694__private_extern__ void
2695memorystatus_freeze_init(void)
2696{
2697	kern_return_t result;
2698	thread_t thread;
2699
2700	result = kernel_thread_start(memorystatus_freeze_thread, NULL, &thread);
2701	if (result == KERN_SUCCESS) {
2702		thread_deallocate(thread);
2703	} else {
2704		panic("Could not create memorystatus_freeze_thread");
2705	}
2706}
2707
2708static int
2709memorystatus_freeze_top_process(boolean_t *memorystatus_freeze_swap_low)
2710{
2711	pid_t aPid = 0;
2712	int ret = -1;
2713	proc_t p = PROC_NULL, next_p = PROC_NULL;
2714	unsigned int i = 0;
2715
2716	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_START,
2717		memorystatus_available_pages, 0, 0, 0, 0);
2718
2719	proc_list_lock();
2720
2721	next_p = memorystatus_get_first_proc_locked(&i, TRUE);
2722	while (next_p) {
2723		kern_return_t kr;
2724		uint32_t purgeable, wired, clean, dirty;
2725		boolean_t shared;
2726		uint32_t pages;
2727		uint32_t max_pages = 0;
2728		uint32_t state;
2729
2730		p = next_p;
2731		next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
2732
2733		aPid = p->p_pid;
2734		state = p->p_memstat_state;
2735
2736		/* Ensure the process is eligible for freezing */
2737		if ((state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FROZEN)) || !(state & P_MEMSTAT_SUSPENDED)) {
2738			continue; // with lock held
2739		}
2740
2741		/* Only freeze processes meeting our minimum resident page criteria */
2742		memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
2743		if (pages < memorystatus_freeze_pages_min) {
2744			continue; // with lock held
2745		}
2746
2747		if (DEFAULT_FREEZER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPBACKED) {
2748			/* Ensure there's enough free space to freeze this process. */
2749			max_pages = MIN(default_pager_swap_pages_free(), memorystatus_freeze_pages_max);
2750			if (max_pages < memorystatus_freeze_pages_min) {
2751				*memorystatus_freeze_swap_low = TRUE;
2752				proc_list_unlock();
2753				goto exit;
2754			}
2755		} else {
2756			max_pages = UINT32_MAX - 1;
2757		}
2758
2759		/* Mark as locked temporarily to avoid kill */
2760		p->p_memstat_state |= P_MEMSTAT_LOCKED;
2761
2762		p = proc_ref_locked(p);
2763		proc_list_unlock();
2764		if (!p) {
2765			goto exit;
2766		}
2767
2768		kr = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, FALSE);
2769
2770		MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_top_process: task_freeze %s for pid %d [%s] - "
2771    			"memorystatus_pages: %d, purgeable: %d, wired: %d, clean: %d, dirty: %d, shared %d, free swap: %d\n",
2772       		(kr == KERN_SUCCESS) ? "SUCCEEDED" : "FAILED", aPid, (p->p_comm ? p->p_comm : "(unknown)"),
2773       		memorystatus_available_pages, purgeable, wired, clean, dirty, shared, default_pager_swap_pages_free());
2774
2775		proc_list_lock();
2776		p->p_memstat_state &= ~P_MEMSTAT_LOCKED;
2777
2778		/* Success? */
2779		if (KERN_SUCCESS == kr) {
2780			memorystatus_freeze_entry_t data = { aPid, TRUE, dirty };
2781
2782			memorystatus_frozen_count++;
2783
2784			p->p_memstat_state |= (P_MEMSTAT_FROZEN | (shared ? 0: P_MEMSTAT_NORECLAIM));
2785
2786			/* Update stats */
2787			for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) {
2788       				throttle_intervals[i].pageouts += dirty;
2789			}
2790
2791			memorystatus_freeze_pageouts += dirty;
2792			memorystatus_freeze_count++;
2793
2794			proc_list_unlock();
2795
2796			memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));
2797
2798			/* Return the number of reclaimed pages */
2799			ret = dirty;
2800
2801		} else {
2802			proc_list_unlock();
2803		}
2804
2805		proc_rele(p);
2806		goto exit;
2807	}
2808
2809	proc_list_unlock();
2810
2811exit:
2812	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_END,
2813		memorystatus_available_pages, aPid, 0, 0, 0);
2814
2815	return ret;
2816}
2817
2818static inline boolean_t
2819memorystatus_can_freeze_processes(void)
2820{
2821	boolean_t ret;
2822
2823	proc_list_lock();
2824
2825	if (memorystatus_suspended_count) {
2826		uint32_t average_resident_pages, estimated_processes;
2827
2828		/* Estimate the number of suspended processes we can fit */
2829		average_resident_pages = memorystatus_suspended_footprint_total / memorystatus_suspended_count;
2830		estimated_processes = memorystatus_suspended_count +
2831			((memorystatus_available_pages - memorystatus_available_pages_critical) / average_resident_pages);
2832
2833		/* If it's predicted that no freeze will occur, lower the threshold temporarily */
2834		if (estimated_processes <= FREEZE_SUSPENDED_THRESHOLD_DEFAULT) {
2835			memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_LOW;
2836		} else {
2837			memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_DEFAULT;
2838		}
2839
2840		MEMORYSTATUS_DEBUG(1, "memorystatus_can_freeze_processes: %d suspended processes, %d average resident pages / process, %d suspended processes estimated\n",
2841			memorystatus_suspended_count, average_resident_pages, estimated_processes);
2842
2843		if ((memorystatus_suspended_count - memorystatus_frozen_count) > memorystatus_freeze_suspended_threshold) {
2844			ret = TRUE;
2845		} else {
2846			ret = FALSE;
2847		}
2848	} else {
2849		ret = FALSE;
2850	}
2851
2852	proc_list_unlock();
2853
2854	return ret;
2855}
2856
2857static boolean_t
2858memorystatus_can_freeze(boolean_t *memorystatus_freeze_swap_low)
2859{
	/* Only freeze if we're sufficiently low on memory; this holds off freeze right
	   after boot, and is generally a no-op once we've reached steady state. */
2862	if (memorystatus_available_pages > memorystatus_freeze_threshold) {
2863		return FALSE;
2864	}
2865
2866	/* Check minimum suspended process threshold. */
2867	if (!memorystatus_can_freeze_processes()) {
2868		return FALSE;
2869	}
2870
2871	/* Is swap running low? */
2872	if (*memorystatus_freeze_swap_low) {
2873		/* If there's been no movement in free swap pages since we last attempted freeze, return. */
2874		if (default_pager_swap_pages_free() < memorystatus_freeze_pages_min) {
2875			return FALSE;
2876		}
2877
2878		/* Pages have been freed - we can retry. */
2879		*memorystatus_freeze_swap_low = FALSE;
2880	}
2881
2882	/* OK */
2883	return TRUE;
2884}
2885
2886static void
2887memorystatus_freeze_update_throttle_interval(mach_timespec_t *ts, struct throttle_interval_t *interval)
2888{
2889	if (CMP_MACH_TIMESPEC(ts, &interval->ts) >= 0) {
2890		if (!interval->max_pageouts) {
2891			interval->max_pageouts = (interval->burst_multiple * (((uint64_t)interval->mins * FREEZE_DAILY_PAGEOUTS_MAX) / (24 * 60)));
2892		} else {
2893			printf("memorystatus_freeze_update_throttle_interval: %d minute throttle timeout, resetting\n", interval->mins);
2894		}
2895		interval->ts.tv_sec = interval->mins * 60;
2896		interval->ts.tv_nsec = 0;
2897		ADD_MACH_TIMESPEC(&interval->ts, ts);
2898		/* Since we update the throttle stats pre-freeze, adjust for overshoot here */
2899		if (interval->pageouts > interval->max_pageouts) {
2900			interval->pageouts -= interval->max_pageouts;
2901		} else {
2902			interval->pageouts = 0;
2903		}
2904		interval->throttle = FALSE;
2905	} else if (!interval->throttle && interval->pageouts >= interval->max_pageouts) {
2906		printf("memorystatus_freeze_update_throttle_interval: %d minute pageout limit exceeded; enabling throttle\n", interval->mins);
2907		interval->throttle = TRUE;
2908	}
2909
2910	MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_update_throttle_interval: throttle updated - %d frozen (%d max) within %dm; %dm remaining; throttle %s\n",
2911		interval->pageouts, interval->max_pageouts, interval->mins, (interval->ts.tv_sec - ts->tv_sec) / 60,
2912		interval->throttle ? "on" : "off");
2913}
2914
2915static boolean_t
2916memorystatus_freeze_update_throttle(void)
2917{
2918	clock_sec_t sec;
2919	clock_nsec_t nsec;
2920	mach_timespec_t ts;
2921	uint32_t i;
2922	boolean_t throttled = FALSE;
2923
2924#if DEVELOPMENT || DEBUG
2925	if (!memorystatus_freeze_throttle_enabled)
2926		return FALSE;
2927#endif
2928
2929	clock_get_system_nanotime(&sec, &nsec);
2930	ts.tv_sec = sec;
2931	ts.tv_nsec = nsec;
2932
2933	/* Check freeze pageouts over multiple intervals and throttle if we've exceeded our budget.
2934	 *
2935	 * This ensures that periods of inactivity can't be used as 'credit' towards freeze if the device has
2936	 * remained dormant for a long period. We do, however, allow increased thresholds for shorter intervals in
2937	 * order to allow for bursts of activity.
2938	 */
2939	for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) {
2940		memorystatus_freeze_update_throttle_interval(&ts, &throttle_intervals[i]);
2941		if (throttle_intervals[i].throttle == TRUE)
2942			throttled = TRUE;
2943	}
2944
2945	return throttled;
2946}
2947
2948static void
2949memorystatus_freeze_thread(void *param __unused, wait_result_t wr __unused)
2950{
2951	static boolean_t memorystatus_freeze_swap_low = FALSE;
2952
2953	if (memorystatus_freeze_enabled) {
2954		if (memorystatus_can_freeze(&memorystatus_freeze_swap_low)) {
			/* Only freeze if we're swapless (so no pageout budget applies) or we've not exceeded our pageout budget. */
2956			if (DEFAULT_FREEZER_COMPRESSED_PAGER_IS_SWAPLESS ||
2957				!memorystatus_freeze_update_throttle()) {
2958				memorystatus_freeze_top_process(&memorystatus_freeze_swap_low);
2959			} else {
2960				printf("memorystatus_freeze_thread: in throttle, ignoring freeze\n");
2961				memorystatus_freeze_throttle_count++; /* Throttled, update stats */
2962			}
2963		}
2964	}
2965
2966	assert_wait((event_t) &memorystatus_freeze_wakeup, THREAD_UNINT);
2967	thread_block((thread_continue_t) memorystatus_freeze_thread);
2968}
2969
2970#endif /* CONFIG_FREEZE */
2971
2972#if VM_PRESSURE_EVENTS
2973
2974#if CONFIG_MEMORYSTATUS
2975
2976static int
2977memorystatus_send_note(int event_code, void *data, size_t data_length) {
2978	int ret;
2979	struct kev_msg ev_msg;
2980
2981	ev_msg.vendor_code    = KEV_VENDOR_APPLE;
2982	ev_msg.kev_class      = KEV_SYSTEM_CLASS;
2983	ev_msg.kev_subclass   = KEV_MEMORYSTATUS_SUBCLASS;
2984
2985	ev_msg.event_code     = event_code;
2986
2987	ev_msg.dv[0].data_length = data_length;
2988	ev_msg.dv[0].data_ptr = data;
2989	ev_msg.dv[1].data_length = 0;
2990
2991	ret = kev_post_msg(&ev_msg);
2992	if (ret) {
2993		printf("%s: kev_post_msg() failed, err %d\n", __func__, ret);
2994	}
2995
2996	return ret;
2997}
2998
2999boolean_t
3000memorystatus_warn_process(pid_t pid, boolean_t critical) {
3001
3002	boolean_t ret = FALSE;
3003	struct knote *kn = NULL;
3004
3005	/*
3006	 * See comment in sysctl_memorystatus_vm_pressure_send.
3007	 */
3008
3009	memorystatus_klist_lock();
3010	kn = vm_find_knote_from_pid(pid, &memorystatus_klist);
3011	if (kn) {
3012		/*
3013		 * By setting the "fflags" here, we are forcing
3014		 * a process to deal with the case where it's
3015		 * bumping up into its memory limits. If we don't
3016		 * do this here, we will end up depending on the
3017		 * system pressure snapshot evaluation in
3018		 * filt_memorystatus().
3019		 */
3020
3021		if (critical) {
3022			kn->kn_fflags |= NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
3023		} else {
3024			kn->kn_fflags |= NOTE_MEMORYSTATUS_PRESSURE_WARN;
3025		}
3026    		KNOTE(&memorystatus_klist, kMemorystatusPressure);
3027    		ret = TRUE;
3028	} else {
3029		if (vm_dispatch_pressure_note_to_pid(pid, FALSE) == 0) {
3030			ret = TRUE;
3031		}
3032	}
3033	memorystatus_klist_unlock();
3034
3035	return ret;
3036}
3037
3038int
3039memorystatus_send_pressure_note(pid_t pid) {
3040 	MEMORYSTATUS_DEBUG(1, "memorystatus_send_pressure_note(): pid %d\n", pid);
3041 	return memorystatus_send_note(kMemorystatusPressureNote, &pid, sizeof(pid));
3042}
3043
3044void
3045memorystatus_send_low_swap_note(void) {
3046
3047	struct knote *kn = NULL;
3048
3049	memorystatus_klist_lock();
3050	SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
3051		if (is_knote_registered_modify_task_pressure_bits(kn, NOTE_MEMORYSTATUS_LOW_SWAP, NULL, 0, 0) == TRUE) {
3052    			KNOTE(&memorystatus_klist, kMemorystatusLowSwap);
3053		}
3054	}
3055	memorystatus_klist_unlock();
3056}
3057
3058boolean_t
3059memorystatus_bg_pressure_eligible(proc_t p) {
3060 	boolean_t eligible = FALSE;
3061
3062	proc_list_lock();
3063
3064	MEMORYSTATUS_DEBUG(1, "memorystatus_bg_pressure_eligible: pid %d, state 0x%x\n", p->p_pid, p->p_memstat_state);
3065
3066 	/* Foreground processes have already been dealt with at this point, so just test for eligibility */
3067 	if (!(p->p_memstat_state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN))) {
3068                eligible = TRUE;
3069	}
3070
3071	proc_list_unlock();
3072
3073 	return eligible;
3074}
3075
3076boolean_t
3077memorystatus_is_foreground_locked(proc_t p) {
3078        return ((p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND) ||
3079                (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND_SUPPORT));
3080}
3081#endif /* CONFIG_MEMORYSTATUS */
3082
3083/*
3084 * Trigger levels to test the mechanism.
3085 * Can be used via a sysctl.
3086 */
3087#define TEST_LOW_MEMORY_TRIGGER_ONE		1
3088#define TEST_LOW_MEMORY_TRIGGER_ALL		2
3089#define TEST_PURGEABLE_TRIGGER_ONE		3
3090#define TEST_PURGEABLE_TRIGGER_ALL		4
3091#define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE	5
3092#define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL	6
3093
3094boolean_t		memorystatus_manual_testing_on = FALSE;
3095vm_pressure_level_t	memorystatus_manual_testing_level = kVMPressureNormal;
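
/*
 * Illustrative sketch, not compiled: composing the value consumed by
 * sysctl_memorypressure_manual_trigger() below, which unpacks a trigger
 * request from the high 16 bits and a NOTE_MEMORYSTATUS_* level from the
 * low 16 bits. The helper name is hypothetical.
 */
#if 0
static int
memorystatus_compose_manual_trigger(void)
{
	/* Request a single low-memory kill at the warning pressure level. */
	return (TEST_LOW_MEMORY_TRIGGER_ONE << 16) | NOTE_MEMORYSTATUS_PRESSURE_WARN;
}
#endif /* 0 */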
3096
3097extern struct knote *
3098vm_pressure_select_optimal_candidate_to_notify(struct klist *, int, boolean_t);
3099
extern kern_return_t vm_pressure_notification_without_levels(boolean_t);
3102
3103extern void vm_pressure_klist_lock(void);
3104extern void vm_pressure_klist_unlock(void);
3105
3106extern void vm_reset_active_list(void);
3107
3108extern void delay(int);
3109
3110#define INTER_NOTIFICATION_DELAY	(250000)	/* .25 second */
3111
3112void memorystatus_on_pageout_scan_end(void) {
3113	/* No-op */
3114}
3115
3116/*
3117 * kn_max - knote
3118 *
3119 * knote_pressure_level - to check if the knote is registered for this notification level.
3120 *
3121 * task	- task whose bits we'll be modifying
3122 *
3123 * pressure_level_to_clear - if the task has been notified of this past level, clear that notification bit so that if/when we revert to that level, the task will be notified again.
3124 *
3125 * pressure_level_to_set - the task is about to be notified of this new level. Update the task's bit notification information appropriately.
3126 *
3127 */
3128
3129boolean_t
3130is_knote_registered_modify_task_pressure_bits(struct knote *kn_max, int knote_pressure_level, task_t task, vm_pressure_level_t pressure_level_to_clear, vm_pressure_level_t pressure_level_to_set)
3131{
3132	if (kn_max->kn_sfflags & knote_pressure_level) {
3133
3134		if (task_has_been_notified(task, pressure_level_to_clear) == TRUE) {
3135
3136			task_clear_has_been_notified(task, pressure_level_to_clear);
3137		}
3138
3139		task_mark_has_been_notified(task, pressure_level_to_set);
3140		return TRUE;
3141	}
3142
3143	return FALSE;
3144}
3145
3146extern kern_return_t vm_pressure_notify_dispatch_vm_clients(boolean_t target_foreground_process);
3147
3148#define VM_PRESSURE_DECREASED_SMOOTHING_PERIOD		5000	/* milliseconds */
3149
3150kern_return_t
3151memorystatus_update_vm_pressure(boolean_t target_foreground_process)
3152{
	struct knote			*kn_max = NULL;
	pid_t				target_pid = -1;
	struct klist			dispatch_klist = { NULL };
	proc_t				target_proc = PROC_NULL;
	struct task			*task = NULL;
	boolean_t			found_candidate = FALSE;
3159
3160	static vm_pressure_level_t 	level_snapshot = kVMPressureNormal;
3161	static vm_pressure_level_t	prev_level_snapshot = kVMPressureNormal;
3162	boolean_t			smoothing_window_started = FALSE;
3163	struct timeval			smoothing_window_start_tstamp = {0, 0};
3164	struct timeval			curr_tstamp = {0, 0};
3165	int				elapsed_msecs = 0;
3166
3167#if !CONFIG_JETSAM
3168#define MAX_IDLE_KILLS 100	/* limit the number of idle kills allowed */
3169
3170	int	idle_kill_counter = 0;
3171
3172	/*
3173	 * On desktop we take this opportunity to free up memory pressure
3174	 * by immediately killing idle exitable processes. We use a delay
3175	 * to avoid overkill.  And we impose a max counter as a fail safe
3176	 * in case daemons re-launch too fast.
3177	 */
3178	while ((memorystatus_vm_pressure_level != kVMPressureNormal) && (idle_kill_counter < MAX_IDLE_KILLS)) {
3179		if (memorystatus_idle_exit_from_VM() == FALSE) {
3180			/* No idle exitable processes left to kill */
3181			break;
3182		}
3183		idle_kill_counter++;
3184		delay(1000000);    /* 1 second */
3185	}
3186#endif /* !CONFIG_JETSAM */
3187
3188	while (1) {
3189
3190		/*
3191		 * There is a race window here. But it's not clear
3192		 * how much we benefit from having extra synchronization.
3193		 */
3194		level_snapshot = memorystatus_vm_pressure_level;
3195
3196		if (prev_level_snapshot > level_snapshot) {
3197			/*
3198			 * Pressure decreased? Let's take a little breather
3199			 * and see if this condition stays.
3200			 */
3201			if (smoothing_window_started == FALSE) {
3202
3203				smoothing_window_started = TRUE;
3204				microuptime(&smoothing_window_start_tstamp);
3205			}
3206
3207			microuptime(&curr_tstamp);
3208			timevalsub(&curr_tstamp, &smoothing_window_start_tstamp);
3209			elapsed_msecs = curr_tstamp.tv_sec * 1000 + curr_tstamp.tv_usec / 1000;
3210
3211			if (elapsed_msecs < VM_PRESSURE_DECREASED_SMOOTHING_PERIOD) {
3212
3213				delay(INTER_NOTIFICATION_DELAY);
3214				continue;
3215			}
3216		}
3217
3218		prev_level_snapshot = level_snapshot;
3219		smoothing_window_started = FALSE;
3220
3221		memorystatus_klist_lock();
3222		kn_max = vm_pressure_select_optimal_candidate_to_notify(&memorystatus_klist, level_snapshot, target_foreground_process);
3223
		if (kn_max == NULL) {
3225			memorystatus_klist_unlock();
3226
3227			/*
3228			 * No more level-based clients to notify.
3229			 * Try the non-level based notification clients.
3230			 *
3231			 * However, these non-level clients don't understand
3232			 * the "return-to-normal" notification.
3233			 *
3234			 * So don't consider them for those notifications. Just
3235			 * return instead.
3236			 *
3237			 */
3238
3239			if (level_snapshot != kVMPressureNormal) {
3240				goto try_dispatch_vm_clients;
3241			} else {
3242				return KERN_FAILURE;
3243			}
3244		}
3245
3246		target_proc = kn_max->kn_kq->kq_p;
3247
3248		proc_list_lock();
3249		if (target_proc != proc_ref_locked(target_proc)) {
3250			target_proc = PROC_NULL;
3251			proc_list_unlock();
3252			memorystatus_klist_unlock();
3253			continue;
3254		}
3255		proc_list_unlock();
3256		memorystatus_klist_unlock();
3257
3258		target_pid = target_proc->p_pid;
3259
3260		task = (struct task *)(target_proc->task);
3261
3262		if (level_snapshot != kVMPressureNormal) {
3263
3264			if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
3265
3266				if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_WARN, task, kVMPressureCritical, kVMPressureWarning) == TRUE) {
3267					found_candidate = TRUE;
3268				}
3269			} else {
3270				if (level_snapshot == kVMPressureCritical) {
3271
3272					if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_CRITICAL, task, kVMPressureWarning, kVMPressureCritical) == TRUE) {
3273						found_candidate = TRUE;
3274					}
3275				}
3276			}
3277		} else {
3278			if (kn_max->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
3279
3280				task_clear_has_been_notified(task, kVMPressureWarning);
3281				task_clear_has_been_notified(task, kVMPressureCritical);
3282
3283				found_candidate = TRUE;
3284			}
3285		}
3286
3287		if (found_candidate == FALSE) {
3288			continue;
3289		}
3290
3291		memorystatus_klist_lock();
3292		KNOTE_DETACH(&memorystatus_klist, kn_max);
3293		KNOTE_ATTACH(&dispatch_klist, kn_max);
3294		memorystatus_klist_unlock();
3295
3296		KNOTE(&dispatch_klist, (level_snapshot != kVMPressureNormal) ? kMemorystatusPressure : kMemorystatusNoPressure);
3297
3298		memorystatus_klist_lock();
3299		KNOTE_DETACH(&dispatch_klist, kn_max);
3300		KNOTE_ATTACH(&memorystatus_klist, kn_max);
3301		memorystatus_klist_unlock();
3302
3303		microuptime(&target_proc->vm_pressure_last_notify_tstamp);
3304		proc_rele(target_proc);
3305
3306		if (memorystatus_manual_testing_on == TRUE && target_foreground_process == TRUE) {
3307			break;
3308		}
3309
3310try_dispatch_vm_clients:
3311		if (kn_max == NULL && level_snapshot != kVMPressureNormal) {
3312			/*
3313			 * We will exit this loop when we are done with
3314			 * notification clients (level and non-level based).
3315			 */
3316			if ((vm_pressure_notify_dispatch_vm_clients(target_foreground_process) == KERN_FAILURE) && (kn_max == NULL)) {
3317				/*
3318				 * kn_max == NULL i.e. we didn't find any eligible clients for the level-based notifications
3319				 * AND
3320				 * we have failed to find any eligible clients for the non-level based notifications too.
3321				 * So, we are done.
3322				 */
3323
3324				return KERN_FAILURE;
3325			}
3326		}
3327
3328		/*
3329		 * LD: This block of code below used to be invoked in the older memory notification scheme on embedded everytime
3330		 * a process was sent a memory pressure notification. The "memorystatus_klist" list was used to hold these
3331		 * privileged listeners. But now we have moved to the newer scheme and are trying to move away from the extra
3332		 * notifications. So the code is here in case we break compat. and need to send out notifications to the privileged
3333		 * apps.
3334		 */
3335#if 0
3336#endif /* 0 */
3337
3338		if (memorystatus_manual_testing_on == TRUE) {
3339			/*
3340			 * Testing out the pressure notification scheme.
3341			 * No need for delays etc.
3342			 */
3343		} else {
3344
3345			uint32_t sleep_interval = INTER_NOTIFICATION_DELAY;
3346#if CONFIG_JETSAM
3347			unsigned int page_delta = 0;
3348			unsigned int skip_delay_page_threshold = 0;
3349
3350			assert(memorystatus_available_pages_pressure >= memorystatus_available_pages_critical_base);
3351
3352			page_delta = (memorystatus_available_pages_pressure - memorystatus_available_pages_critical_base) / 2;
3353			skip_delay_page_threshold = memorystatus_available_pages_pressure - page_delta;
3354
3355			if (memorystatus_available_pages <= skip_delay_page_threshold) {
3356				/*
				 * We are nearing the critical mark fast and can't afford to wait between
3358				 * notifications.
3359				 */
3360				sleep_interval = 0;
3361			}
3362#endif /* CONFIG_JETSAM */
3363
3364			if (sleep_interval) {
3365				delay(sleep_interval);
3366			}
3367		}
3368	}
3369
3370	return KERN_SUCCESS;
3371}

vm_pressure_level_t
convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t);

vm_pressure_level_t
convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t internal_pressure_level)
{
	vm_pressure_level_t	dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;

	switch (internal_pressure_level) {

		case kVMPressureNormal:
		{
			dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
			break;
		}

		case kVMPressureWarning:
		case kVMPressureUrgent:
		{
			dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_WARN;
			break;
		}

		case kVMPressureCritical:
		{
			dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
			break;
		}

		default:
			break;
	}

	return dispatch_level;
}

static int
sysctl_memorystatus_vm_pressure_level SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	vm_pressure_level_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(memorystatus_vm_pressure_level);

	return SYSCTL_OUT(req, &dispatch_level, sizeof(dispatch_level));
}

#if DEBUG || DEVELOPMENT

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_LOCKED,
    0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");

#else /* DEBUG || DEVELOPMENT */

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_LOCKED|CTLFLAG_MASKED,
    0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");

#endif /* DEBUG || DEVELOPMENT */

extern int memorystatus_purge_on_warning;
extern int memorystatus_purge_on_critical;

static int
sysctl_memorypressure_manual_trigger SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)

	int level = 0;
	int error = 0;
	int pressure_level = 0;
	int trigger_request = 0;
	int force_purge;

	error = sysctl_handle_int(oidp, &level, 0, req);
	if (error || !req->newptr) {
		return (error);
	}

	memorystatus_manual_testing_on = TRUE;

	trigger_request = (level >> 16) & 0xFFFF;
	pressure_level = (level & 0xFFFF);

	if (trigger_request < TEST_LOW_MEMORY_TRIGGER_ONE ||
	    trigger_request > TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL) {
		return EINVAL;
	}
	switch (pressure_level) {
	case NOTE_MEMORYSTATUS_PRESSURE_NORMAL:
	case NOTE_MEMORYSTATUS_PRESSURE_WARN:
	case NOTE_MEMORYSTATUS_PRESSURE_CRITICAL:
		break;
	default:
		return EINVAL;
	}
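
	/*
	 * Illustrative sketch of the user-space side (not compiled here): the
	 * 32-bit value written to this sysctl packs the trigger request into
	 * the upper 16 bits and the pressure level into the lower 16 bits,
	 * e.g.:
	 *
	 *	int level = (TEST_LOW_MEMORY_TRIGGER_ONE << 16) |
	 *	    NOTE_MEMORYSTATUS_PRESSURE_WARN;
	 *	sysctlbyname("kern.memorypressure_manual_trigger", NULL, NULL,
	 *	    &level, sizeof(level));
	 */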

	/*
	 * The pressure level is being set from user-space,
	 * which uses the constants in sys/event.h,
	 * so we translate those events to our internal levels here.
	 */
	if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {

		memorystatus_manual_testing_level = kVMPressureNormal;
		force_purge = 0;

	} else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_WARN) {

		memorystatus_manual_testing_level = kVMPressureWarning;
		force_purge = memorystatus_purge_on_warning;

	} else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {

		memorystatus_manual_testing_level = kVMPressureCritical;
		force_purge = memorystatus_purge_on_critical;
	}

	memorystatus_vm_pressure_level = memorystatus_manual_testing_level;

	/* purge according to the new pressure level */
	switch (trigger_request) {
	case TEST_PURGEABLE_TRIGGER_ONE:
	case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE:
		if (force_purge == 0) {
			/* no purging requested */
			break;
		}
		vm_purgeable_object_purge_one_unlocked(force_purge);
		break;
	case TEST_PURGEABLE_TRIGGER_ALL:
	case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL:
		if (force_purge == 0) {
			/* no purging requested */
			break;
		}
		while (vm_purgeable_object_purge_one_unlocked(force_purge)) {
			/* purge until no eligible object remains */
		}
		break;
	}

	if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ONE) ||
	    (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE)) {

		memorystatus_update_vm_pressure(TRUE);
	}

	if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ALL) ||
	    (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL)) {

		while (memorystatus_update_vm_pressure(FALSE) == KERN_SUCCESS) {
			continue;
		}
	}

	if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
		memorystatus_manual_testing_on = FALSE;

		vm_pressure_klist_lock();
		vm_reset_active_list();
		vm_pressure_klist_unlock();
	} else {

		vm_pressure_klist_lock();
		vm_pressure_notification_without_levels(FALSE);
		vm_pressure_klist_unlock();
	}

	return 0;
}

SYSCTL_PROC(_kern, OID_AUTO, memorypressure_manual_trigger, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
    0, 0, &sysctl_memorypressure_manual_trigger, "I", "");


extern int memorystatus_purge_on_urgent;

SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_warning, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_purge_on_warning, 0, "");
SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_urgent, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_purge_on_urgent, 0, "");
SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_critical, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_purge_on_critical, 0, "");


#endif /* VM_PRESSURE_EVENTS */

/* Return both allocated and actual size, since there's a race between allocation and list compilation */
static int
memorystatus_get_priority_list(memorystatus_priority_entry_t **list_ptr, size_t *buffer_size, size_t *list_size, boolean_t size_only)
{
	uint32_t list_count, i = 0;
	memorystatus_priority_entry_t *list_entry;
	proc_t p;

	list_count = memorystatus_list_count;
	*list_size = sizeof(memorystatus_priority_entry_t) * list_count;

	/* Just a size check? */
	if (size_only) {
		return 0;
	}

	/* Otherwise, validate the size of the buffer */
	if (*buffer_size < *list_size) {
		return EINVAL;
	}

	*list_ptr = (memorystatus_priority_entry_t*)kalloc(*list_size);
	if (!*list_ptr) {
		return ENOMEM;
	}

	memset(*list_ptr, 0, *list_size);

	*buffer_size = *list_size;
	*list_size = 0;

	list_entry = *list_ptr;

	proc_list_lock();

	p = memorystatus_get_first_proc_locked(&i, TRUE);
	while (p && (*list_size < *buffer_size)) {
		list_entry->pid = p->p_pid;
		list_entry->priority = p->p_memstat_effectivepriority;
		list_entry->user_data = p->p_memstat_userdata;
#if LEGACY_HIWATER
		if (((p->p_memstat_state & P_MEMSTAT_MEMLIMIT_BACKGROUND) && (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND)) ||
		    (p->p_memstat_memlimit <= 0)) {
			task_get_phys_footprint_limit(p->task, &list_entry->limit);
		} else {
			list_entry->limit = p->p_memstat_memlimit;
		}
#else
		task_get_phys_footprint_limit(p->task, &list_entry->limit);
#endif
		list_entry->state = memorystatus_build_state(p);
		list_entry++;

		*list_size += sizeof(memorystatus_priority_entry_t);

		p = memorystatus_get_next_proc_locked(&i, p, TRUE);
	}

	proc_list_unlock();

	MEMORYSTATUS_DEBUG(1, "memorystatus_get_priority_list: returning %lu for size\n", (unsigned long)*list_size);

	return 0;
}

static int
memorystatus_cmd_get_priority_list(user_addr_t buffer, size_t buffer_size, int32_t *retval) {
	int error = EINVAL;
	boolean_t size_only;
	memorystatus_priority_entry_t *list = NULL;
	size_t list_size;

	size_only = ((buffer == USER_ADDR_NULL) ? TRUE : FALSE);

	error = memorystatus_get_priority_list(&list, &buffer_size, &list_size, size_only);
	if (error) {
		goto out;
	}

	if (!size_only) {
		error = copyout(list, buffer, list_size);
	}

	if (error == 0) {
		*retval = list_size;
	}
out:

	if (list) {
		kfree(list, buffer_size);
	}

	return error;
}
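
#if 0
/*
 * Illustrative user-space sketch (not compiled here; assumes the user-space
 * memorystatus_control() wrapper from sys/kern_memorystatus.h): probe with a
 * NULL buffer to learn the required size, then fetch. The list can grow
 * between the two calls, which is why both sizes are reported above.
 */
static memorystatus_priority_entry_t *
fetch_priority_list(size_t *entry_count)
{
	memorystatus_priority_entry_t *list;
	int size;

	/* Size-only probe: retval carries the byte count */
	size = memorystatus_control(MEMORYSTATUS_CMD_GET_PRIORITY_LIST, 0, 0, NULL, 0);
	if (size <= 0) {
		return NULL;
	}

	list = malloc(size);
	if (list == NULL) {
		return NULL;
	}

	/* Real fetch: retval is the number of bytes actually copied out */
	size = memorystatus_control(MEMORYSTATUS_CMD_GET_PRIORITY_LIST, 0, 0, list, size);
	if (size <= 0) {
		free(list);
		return NULL;
	}

	*entry_count = (size_t)size / sizeof(memorystatus_priority_entry_t);
	return list;
}
#endif /* 0 */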

#if CONFIG_JETSAM

static void
memorystatus_clear_errors(void)
{
	proc_t p;
	unsigned int i = 0;

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_CLEAR_ERRORS) | DBG_FUNC_START, 0, 0, 0, 0, 0);

	proc_list_lock();

	p = memorystatus_get_first_proc_locked(&i, TRUE);
	while (p) {
		if (p->p_memstat_state & P_MEMSTAT_ERROR) {
			p->p_memstat_state &= ~P_MEMSTAT_ERROR;
		}
		p = memorystatus_get_next_proc_locked(&i, p, TRUE);
	}

	proc_list_unlock();

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_CLEAR_ERRORS) | DBG_FUNC_END, 0, 0, 0, 0, 0);
}

static void
memorystatus_update_levels_locked(boolean_t critical_only) {

	memorystatus_available_pages_critical = memorystatus_available_pages_critical_base;

	/*
	 * If there's an entry in the first bucket, we have idle processes.
	 */
	memstat_bucket_t *first_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
	if (first_bucket->count) {
		memorystatus_available_pages_critical += memorystatus_available_pages_critical_idle_offset;

		if (memorystatus_available_pages_critical > memorystatus_available_pages_pressure) {
			/*
			 * The critical threshold must never exceed the pressure threshold
			 */
			memorystatus_available_pages_critical = memorystatus_available_pages_pressure;
		}
	}

#if DEBUG || DEVELOPMENT
	if (memorystatus_jetsam_policy & kPolicyDiagnoseActive) {
		memorystatus_available_pages_critical += memorystatus_jetsam_policy_offset_pages_diagnostic;

		if (memorystatus_available_pages_critical > memorystatus_available_pages_pressure) {
			/*
			 * The critical threshold must never exceed the pressure threshold
			 */
			memorystatus_available_pages_critical = memorystatus_available_pages_pressure;
		}
	}
#endif

	if (critical_only) {
		return;
	}

#if VM_PRESSURE_EVENTS
	memorystatus_available_pages_pressure = (pressure_threshold_percentage / delta_percentage) * memorystatus_delta;
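	/*
	 * Note the integer math above: with illustrative boot-time values of
	 * pressure_threshold_percentage = 15 and delta_percentage = 5, the
	 * pressure threshold lands at 3 * memorystatus_delta pages.
	 */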
#if DEBUG || DEVELOPMENT
	if (memorystatus_jetsam_policy & kPolicyDiagnoseActive) {
		memorystatus_available_pages_pressure += memorystatus_jetsam_policy_offset_pages_diagnostic;
	}
#endif
#endif
}

static int
memorystatus_get_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only) {
	size_t input_size = *snapshot_size;

	if (memorystatus_jetsam_snapshot_count > 0) {
		*snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + (sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count));
	} else {
		*snapshot_size = 0;
	}

	if (size_only) {
		return 0;
	}

	if (input_size < *snapshot_size) {
		return EINVAL;
	}

	*snapshot = memorystatus_jetsam_snapshot;

	MEMORYSTATUS_DEBUG(1, "memorystatus_snapshot: returning %ld for size\n", (long)*snapshot_size);

	return 0;
}


static int
memorystatus_cmd_get_jetsam_snapshot(user_addr_t buffer, size_t buffer_size, int32_t *retval) {
	int error = EINVAL;
	boolean_t size_only;
	memorystatus_jetsam_snapshot_t *snapshot;

	size_only = ((buffer == USER_ADDR_NULL) ? TRUE : FALSE);

	error = memorystatus_get_snapshot(&snapshot, &buffer_size, size_only);
	if (error) {
		goto out;
	}

	/* Copy out and reset */
	if (!size_only) {
		if ((error = copyout(snapshot, buffer, buffer_size)) == 0) {
			snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
		}
	}

	if (error == 0) {
		*retval = buffer_size;
	}
out:
	return error;
}
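
#if 0
/*
 * Illustrative user-space sketch (not compiled here; assumes the user-space
 * memorystatus_control() wrapper): size-probe, then fetch. A successful
 * copyout resets the snapshot, so each snapshot is consumed by exactly one
 * reader.
 */
static memorystatus_jetsam_snapshot_t *
fetch_jetsam_snapshot(void)
{
	memorystatus_jetsam_snapshot_t *snapshot;
	int size;

	size = memorystatus_control(MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT, 0, 0, NULL, 0);
	if (size <= 0) {
		return NULL; /* no snapshot entries recorded */
	}

	snapshot = malloc(size);
	if (snapshot == NULL) {
		return NULL;
	}

	if (memorystatus_control(MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT, 0, 0, snapshot, size) <= 0) {
		free(snapshot);
		return NULL; /* raced with another reader, or the snapshot grew */
	}

	return snapshot; /* snapshot->entry_count entries follow the header */
}
#endif /* 0 */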

/*
 *	Routine:	memorystatus_cmd_grp_set_properties
 *	Purpose:	Update properties for a group of processes.
 *
 *	Supported Properties:
 *	[priority]
 *		Move each process out of its effective priority
 *		band and into a new priority band.
 *		Maintains relative order from lowest to highest priority.
 *		Within a single band, maintains relative order from head to tail.
 *
 *		eg: before	[effectivepriority | pid]
 *				[18 | p101              ]
 *				[17 | p55, p67, p19     ]
 *				[12 | p103 p10          ]
 *				[ 7 | p25               ]
 *				[ 0 | p71, p82,         ]
 *
 *		after	[ new band | pid]
 *			[ xxx | p71, p82, p25, p103, p10, p55, p67, p19, p101]
 *
 *	Returns:  0 on success, else non-zero.
 *
 *	Caveat:   We know there is a race window regarding recycled pids.
 *		  A process could be killed before the kernel can act on it here.
 *		  If a pid cannot be found in any of the jetsam priority bands,
 *		  then we simply ignore it.  No harm.
 *		  But, if the pid has been recycled then it could be an issue.
 *		  In that scenario, we might move an unsuspecting process to the new
 *		  priority band. It's not clear how the kernel can safeguard
 *		  against this, but it would be an extremely rare case anyway.
 *		  The caller of this API might avoid such race conditions by
 *		  ensuring that the processes passed in the pid list are suspended.
 */


/* This internal structure can expand when we add support for more properties */
typedef	struct memorystatus_internal_properties
{
	proc_t proc;
	int32_t priority;  /* see memorystatus_priority_entry_t : priority */
} memorystatus_internal_properties_t;


static int
memorystatus_cmd_grp_set_properties(int32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval) {

#pragma unused (flags)

	/*
	 * We only handle setting priority
	 * per process
	 */

	int error = 0;
	memorystatus_priority_entry_t *entries = NULL;
	uint32_t entry_count = 0;

	/* This will be the ordered proc list */
	memorystatus_internal_properties_t *table = NULL;
	size_t table_size = 0;
	uint32_t table_count = 0;

	uint32_t i = 0;
	uint32_t bucket_index = 0;
	boolean_t head_insert;
	int32_t new_priority;

	proc_t p;

	/* Verify inputs */
	if ((buffer == USER_ADDR_NULL) || (buffer_size == 0) || ((buffer_size % sizeof(memorystatus_priority_entry_t)) != 0)) {
		error = EINVAL;
		goto out;
	}

	entry_count = (buffer_size / sizeof(memorystatus_priority_entry_t));
	if ((entries = (memorystatus_priority_entry_t *)kalloc(buffer_size)) == NULL) {
		error = ENOMEM;
		goto out;
	}

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_START, entry_count, 0, 0, 0, 0);

	if ((error = copyin(buffer, entries, buffer_size)) != 0) {
		goto out;
	}

	/* Verify sanity of input priorities */
	for (i = 0; i < entry_count; i++) {
		if (entries[i].priority == -1) {
			/* Use as shorthand for default priority */
			entries[i].priority = JETSAM_PRIORITY_DEFAULT;
		} else if (entries[i].priority == JETSAM_PRIORITY_IDLE_DEFERRED) {
			/* JETSAM_PRIORITY_IDLE_DEFERRED is reserved for internal use;
			 * if requested, adjust to JETSAM_PRIORITY_IDLE. */
			entries[i].priority = JETSAM_PRIORITY_IDLE;
		} else if (entries[i].priority == JETSAM_PRIORITY_IDLE_HEAD) {
			/* JETSAM_PRIORITY_IDLE_HEAD inserts at the head of the idle
			 * queue; deal with this later */
		} else if ((entries[i].priority < 0) || (entries[i].priority >= MEMSTAT_BUCKET_COUNT)) {
			/* Sanity check */
			error = EINVAL;
			goto out;
		}
	}

	table_size = sizeof(memorystatus_internal_properties_t) * entry_count;
	if ((table = (memorystatus_internal_properties_t *)kalloc(table_size)) == NULL) {
		error = ENOMEM;
		goto out;
	}
	memset(table, 0, table_size);


	/*
	 * For each jetsam bucket entry, spin through the input property list.
	 * When a matching pid is found, populate an adjacent table with the
	 * appropriate proc pointer and new property values.
	 * This traversal automatically preserves order from lowest
	 * to highest priority.
	 */

	bucket_index = 0;

	proc_list_lock();

	/* Create the ordered table */
	p = memorystatus_get_first_proc_locked(&bucket_index, TRUE);
	while (p && (table_count < entry_count)) {
		for (i = 0; i < entry_count; i++) {
			if (p->p_pid == entries[i].pid) {
				/* Build the table data */
				table[table_count].proc = p;
				table[table_count].priority = entries[i].priority;
				table_count++;
				break;
			}
		}
		p = memorystatus_get_next_proc_locked(&bucket_index, p, TRUE);
	}

	/* We now have an ordered list of procs ready to move */
	for (i = 0; i < table_count; i++) {
		p = table[i].proc;
		assert(p != NULL);

		/* Allow head inserts -- but relative order is no longer guaranteed */
		if (table[i].priority == JETSAM_PRIORITY_IDLE_HEAD) {
			new_priority = JETSAM_PRIORITY_IDLE;
			head_insert = true;
		} else {
			new_priority = table[i].priority;
			head_insert = false;
		}

		/* Not allowed */
		if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
			continue;
		}

		/*
		 * Take appropriate steps if moving proc out of the
		 * JETSAM_PRIORITY_IDLE_DEFERRED band.
		 */
		if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) {
			memorystatus_invalidate_idle_demotion_locked(p, TRUE);
		}

		memorystatus_update_priority_locked(p, new_priority, head_insert);
	}

	proc_list_unlock();

	/*
	 * If (table_count != entry_count), then some pids were not found
	 * in a jetsam band -- harmless but interesting...
	 */
	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_END, entry_count, table_count, 0, 0, 0);

out:
	if (entries)
		kfree(entries, buffer_size);
	if (table)
		kfree(table, table_size);

	return (error);
}


/*
 * This routine is meant solely for the purpose of adjusting jetsam priorities and bands.
 * It is _not_ meant to be used for the setting of memory limits, especially since we can't
 * tell if the memory limit being set is fatal or not.
 *
 * So the last 5 args to the memorystatus_update() call below, related to memory limits, are all 0 or FALSE.
 */

static int
memorystatus_cmd_set_priority_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval) {
	const uint32_t MAX_ENTRY_COUNT = 2; /* Cap the entry count */

	int error;
	uint32_t i;
	uint32_t entry_count;
	memorystatus_priority_properties_t *entries;

	/* Validate inputs */
	if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size == 0)) {
		return EINVAL;
	}

	/* Make sure the buffer is a multiple of the entry size, and that an excessive size isn't specified */
	entry_count = (buffer_size / sizeof(memorystatus_priority_properties_t));
	if (((buffer_size % sizeof(memorystatus_priority_properties_t)) != 0) || (entry_count > MAX_ENTRY_COUNT)) {
		return EINVAL;
	}

	entries = (memorystatus_priority_properties_t *)kalloc(buffer_size);
	if (entries == NULL) {
		return ENOMEM;
	}

	error = copyin(buffer, entries, buffer_size);

	for (i = 0; i < entry_count; i++) {
		proc_t p;

		if (error) {
			break;
		}

		p = proc_find(pid);
		if (!p) {
			error = ESRCH;
			break;
		}

		if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
			error = EPERM;
			proc_rele(p);
			break;
		}

		error = memorystatus_update(p, entries[i].priority, entries[i].user_data, FALSE, FALSE, 0, 0, FALSE);
		proc_rele(p);
	}

	kfree(entries, buffer_size);

	return error;
}

static int
memorystatus_cmd_get_pressure_status(int32_t *retval) {
	int error;

	/* Need privilege for check */
	error = priv_check_cred(kauth_cred_get(), PRIV_VM_PRESSURE, 0);
	if (error) {
		return (error);
	}

	/* Inherently racy, so it's not worth taking a lock here */
	*retval = (kVMPressureNormal != memorystatus_vm_pressure_level) ? 1 : 0;

	return error;
}

/*
 * Every process, including a P_MEMSTAT_INTERNAL process (currently only pid 1), is allowed to set a HWM.
 */

static int
memorystatus_cmd_set_jetsam_memory_limit(pid_t pid, int32_t high_water_mark, __unused int32_t *retval, boolean_t is_fatal_limit) {
	int error = 0;

	proc_t p = proc_find(pid);
	if (!p) {
		return ESRCH;
	}

	if (high_water_mark <= 0) {
		high_water_mark = -1; /* Disable */
	}

	proc_list_lock();

	p->p_memstat_memlimit = high_water_mark;
	if (memorystatus_highwater_enabled) {
		if (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_BACKGROUND) {

			memorystatus_update_priority_locked(p, p->p_memstat_effectivepriority, false);

			/*
			 * The update priority call above takes care to set/reset the fatal memory limit state
			 * IF the process is transitioning between foreground <-> background and has a background
			 * memory limit.
			 * Here, however, the process won't be doing any such transitions and so we explicitly tackle
			 * the fatal limit state.
			 */
			is_fatal_limit = FALSE;

		} else {
			error = (task_set_phys_footprint_limit_internal(p->task, high_water_mark, NULL, TRUE) == 0) ? 0 : EINVAL;
		}
	}

	if (error == 0) {
		if (is_fatal_limit == TRUE) {
			p->p_memstat_state |= P_MEMSTAT_FATAL_MEMLIMIT;
		} else {
			p->p_memstat_state &= ~P_MEMSTAT_FATAL_MEMLIMIT;
		}
	}

	proc_list_unlock();
	proc_rele(p);

	return error;
}
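
#if 0
/*
 * Illustrative user-space sketch (not compiled here; assumes the user-space
 * memorystatus_control() wrapper): the high water mark travels in the flags
 * argument. A non-positive value disables the limit; the ..._TASK_LIMIT
 * command is the fatal variant of the same operation (see the dispatch in
 * memorystatus_control() below).
 */
static int
set_jetsam_hwm(pid_t pid, int32_t limit, boolean_t fatal)
{
	uint32_t cmd = fatal ? MEMORYSTATUS_CMD_SET_JETSAM_TASK_LIMIT
	    : MEMORYSTATUS_CMD_SET_JETSAM_HIGH_WATER_MARK;

	return memorystatus_control(cmd, pid, (uint32_t)limit, NULL, 0);
}
#endif /* 0 */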

/*
 * Returns the jetsam priority (effective or requested) of the process
 * associated with this task.
 */
int
proc_get_memstat_priority(proc_t p, boolean_t effective_priority)
{
	if (p) {
		if (effective_priority) {
			return p->p_memstat_effectivepriority;
		} else {
			return p->p_memstat_requestedpriority;
		}
	}
	return 0;
}
#endif /* CONFIG_JETSAM */

int
memorystatus_control(struct proc *p __unused, struct memorystatus_control_args *args, int *ret) {
	int error = EINVAL;

#if !CONFIG_JETSAM
	#pragma unused(ret)
#endif

	/* Root only for now */
	if (!kauth_cred_issuser(kauth_cred_get())) {
		error = EPERM;
		goto out;
	}

	/* Sanity check */
	if (args->buffersize > MEMORYSTATUS_BUFFERSIZE_MAX) {
		error = EINVAL;
		goto out;
	}

	switch (args->command) {
	case MEMORYSTATUS_CMD_GET_PRIORITY_LIST:
		error = memorystatus_cmd_get_priority_list(args->buffer, args->buffersize, ret);
		break;
#if CONFIG_JETSAM
	case MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES:
		error = memorystatus_cmd_set_priority_properties(args->pid, args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_GRP_SET_PROPERTIES:
		error = memorystatus_cmd_grp_set_properties((int32_t)args->flags, args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT:
		error = memorystatus_cmd_get_jetsam_snapshot(args->buffer, args->buffersize, ret);
		break;
	case MEMORYSTATUS_CMD_GET_PRESSURE_STATUS:
		error = memorystatus_cmd_get_pressure_status(ret);
		break;
	case MEMORYSTATUS_CMD_SET_JETSAM_HIGH_WATER_MARK:
		error = memorystatus_cmd_set_jetsam_memory_limit(args->pid, (int32_t)args->flags, ret, FALSE);
		break;
	case MEMORYSTATUS_CMD_SET_JETSAM_TASK_LIMIT:
		error = memorystatus_cmd_set_jetsam_memory_limit(args->pid, (int32_t)args->flags, ret, TRUE);
		break;
	/* Test commands */
#if DEVELOPMENT || DEBUG
	case MEMORYSTATUS_CMD_TEST_JETSAM:
		error = memorystatus_kill_process_sync(args->pid, kMemorystatusKilled) ? 0 : EINVAL;
		break;
	case MEMORYSTATUS_CMD_SET_JETSAM_PANIC_BITS:
		error = memorystatus_cmd_set_panic_bits(args->buffer, args->buffersize);
		break;
#endif /* DEVELOPMENT || DEBUG */
#endif /* CONFIG_JETSAM */
	default:
		break;
	}

out:
	return error;
}


static int
filt_memorystatusattach(struct knote *kn)
{
	kn->kn_flags |= EV_CLEAR;
	return memorystatus_knote_register(kn);
}

static void
filt_memorystatusdetach(struct knote *kn)
{
	memorystatus_knote_unregister(kn);
}

static int
filt_memorystatus(struct knote *kn __unused, long hint)
{
	if (hint) {
		switch (hint) {
		case kMemorystatusNoPressure:
			if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
				kn->kn_fflags |= NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
			}
			break;
		case kMemorystatusPressure:
			if (memorystatus_vm_pressure_level == kVMPressureWarning || memorystatus_vm_pressure_level == kVMPressureUrgent) {
				if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
					kn->kn_fflags |= NOTE_MEMORYSTATUS_PRESSURE_WARN;
				}
			} else if (memorystatus_vm_pressure_level == kVMPressureCritical) {

				if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
					kn->kn_fflags |= NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
				}
			}
			break;
		case kMemorystatusLowSwap:
			if (kn->kn_sfflags & NOTE_MEMORYSTATUS_LOW_SWAP) {
				kn->kn_fflags |= NOTE_MEMORYSTATUS_LOW_SWAP;
			}
			break;
		default:
			break;
		}
	}

	return (kn->kn_fflags != 0);
}
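
#if 0
/*
 * Illustrative user-space sketch (not compiled here): registering for
 * memorystatus pressure events via kqueue and the (private)
 * EVFILT_MEMORYSTATUS filter. NOTE_MEMORYSTATUS_LOW_SWAP additionally
 * requires root (see memorystatus_knote_register below).
 */
static int
watch_memory_pressure(void)
{
	struct kevent ev;
	int kq = kqueue();

	if (kq < 0) {
		return -1;
	}

	EV_SET(&ev, 0, EVFILT_MEMORYSTATUS, EV_ADD,
	    NOTE_MEMORYSTATUS_PRESSURE_WARN | NOTE_MEMORYSTATUS_PRESSURE_CRITICAL,
	    0, NULL);
	if (kevent(kq, &ev, 1, NULL, 0, NULL) < 0) {
		close(kq);
		return -1;
	}

	return kq; /* wait with kevent(); ev.fflags carries the delivered level */
}
#endif /* 0 */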

static void
memorystatus_klist_lock(void) {
	lck_mtx_lock(&memorystatus_klist_mutex);
}

static void
memorystatus_klist_unlock(void) {
	lck_mtx_unlock(&memorystatus_klist_mutex);
}

void
memorystatus_kevent_init(lck_grp_t *grp, lck_attr_t *attr) {
	lck_mtx_init(&memorystatus_klist_mutex, grp, attr);
	klist_init(&memorystatus_klist);
}

int
memorystatus_knote_register(struct knote *kn) {
	int error = 0;

	memorystatus_klist_lock();

	if (kn->kn_sfflags & (NOTE_MEMORYSTATUS_PRESSURE_NORMAL | NOTE_MEMORYSTATUS_PRESSURE_WARN | NOTE_MEMORYSTATUS_PRESSURE_CRITICAL | NOTE_MEMORYSTATUS_LOW_SWAP)) {

		if (kn->kn_sfflags & NOTE_MEMORYSTATUS_LOW_SWAP) {
			error = suser(kauth_cred_get(), 0);
		}

		if (error == 0) {
			KNOTE_ATTACH(&memorystatus_klist, kn);
		}
	} else {
		error = ENOTSUP;
	}

	memorystatus_klist_unlock();

	return error;
}

void
memorystatus_knote_unregister(struct knote *kn __unused) {
	memorystatus_klist_lock();
	KNOTE_DETACH(&memorystatus_klist, kn);
	memorystatus_klist_unlock();
}


#if 0
#if CONFIG_JETSAM && VM_PRESSURE_EVENTS
static boolean_t
memorystatus_issue_pressure_kevent(boolean_t pressured) {
	memorystatus_klist_lock();
	KNOTE(&memorystatus_klist, pressured ? kMemorystatusPressure : kMemorystatusNoPressure);
	memorystatus_klist_unlock();
	return TRUE;
}
#endif /* CONFIG_JETSAM && VM_PRESSURE_EVENTS */
#endif /* 0 */