/*
 * Copyright (c) 2009-2010 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <libkern/libkern.h>
#include <mach/mach_types.h>
#include <mach/task.h>
#include <sys/proc_internal.h>
#include <sys/event.h>
#include <sys/eventvar.h>
#include <kern/locks.h>
#include <sys/queue.h>
#include <kern/vm_pressure.h>
#include <sys/malloc.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/sysctl.h>
#include <kern/assert.h>
#include <kern/task.h>
#include <vm/vm_pageout.h>

#if CONFIG_MEMORYSTATUS
#include <sys/kern_memorystatus.h>
#endif

/*
 * VM_PRESSURE_MINIMUM_RSIZE is the resident-size threshold a process must
 * meet to be considered for scavenging; VM_PRESSURE_NOTIFY_WAIT_PERIOD is
 * the minimum interval between successive notifications to the same process.
 */
#define VM_PRESSURE_MINIMUM_RSIZE		10	/* MB */
#define VM_PRESSURE_NOTIFY_WAIT_PERIOD		10000	/* milliseconds */

void vm_pressure_klist_lock(void);
void vm_pressure_klist_unlock(void);

static void vm_dispatch_memory_pressure(void);
void vm_reset_active_list(void);

#if !(CONFIG_MEMORYSTATUS && CONFIG_JETSAM)
static kern_return_t vm_try_pressure_candidates(void);
#endif

static lck_mtx_t vm_pressure_klist_mutex;

struct klist vm_pressure_klist;
struct klist vm_pressure_klist_dormant;

#if DEBUG
#define VM_PRESSURE_DEBUG(cond, format, ...)      \
do {                                              \
	if (cond) { printf(format, ##__VA_ARGS__); } \
} while(0)
#else
#define VM_PRESSURE_DEBUG(cond, format, ...)
#endif

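/*
 * Both klists above, and the knotes they carry, are protected by
 * vm_pressure_klist_mutex: vm_pressure_klist holds registrations that are
 * eligible for the next notification, while vm_pressure_klist_dormant holds
 * knotes that have already been notified; the dormant entries are recharged
 * back onto the active list by vm_reset_active_list().
 */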
void vm_pressure_init(lck_grp_t *grp, lck_attr_t *attr) {
	lck_mtx_init(&vm_pressure_klist_mutex, grp, attr);
}

void vm_pressure_klist_lock(void) {
	lck_mtx_lock(&vm_pressure_klist_mutex);
}

void vm_pressure_klist_unlock(void) {
	lck_mtx_unlock(&vm_pressure_klist_mutex);
}

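/*
 * vm_knote_register() attaches a knote to the active pressure list when the
 * caller asked for NOTE_VM_PRESSURE; any other fflags combination is refused
 * with ENOTSUP. As an illustrative userspace sketch (assuming the private
 * EVFILT_VM filter is visible to the calling process), such a knote is
 * typically created along these lines:
 *
 *	int kq = kqueue();
 *	struct kevent ev;
 *	EV_SET(&ev, 0, EVFILT_VM, EV_ADD | EV_ENABLE, NOTE_VM_PRESSURE, 0, NULL);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);	(registers; a later kevent() call waits for the note)
 */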
int vm_knote_register(struct knote *kn) {
	int rv = 0;

	vm_pressure_klist_lock();

	if ((kn->kn_sfflags) & (NOTE_VM_PRESSURE)) {
		KNOTE_ATTACH(&vm_pressure_klist, kn);
	} else {
		rv = ENOTSUP;
	}

	vm_pressure_klist_unlock();

	return rv;
}

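/*
 * vm_knote_unregister() detaches a previously registered knote, searching
 * the active list first and falling back to the dormant list, so a knote
 * that has already been notified can still be torn down cleanly.
 */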
void vm_knote_unregister(struct knote *kn) {
	struct knote *kn_temp;

	vm_pressure_klist_lock();

	VM_PRESSURE_DEBUG(0, "[vm_pressure] process %d cancelling pressure notification\n", kn->kn_kq->kq_p->p_pid);

	SLIST_FOREACH(kn_temp, &vm_pressure_klist, kn_selnext) {
		if (kn_temp == kn) {
			KNOTE_DETACH(&vm_pressure_klist, kn);
			vm_pressure_klist_unlock();
			return;
		}
	}

	SLIST_FOREACH(kn_temp, &vm_pressure_klist_dormant, kn_selnext) {
		if (kn_temp == kn) {
			KNOTE_DETACH(&vm_pressure_klist_dormant, kn);
			vm_pressure_klist_unlock();
			return;
		}
	}

	vm_pressure_klist_unlock();
}

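/*
 * vm_pressure_proc_cleanup() runs when a process exits; it detaches the
 * exiting process's pressure knote, if any, from whichever list currently
 * holds it so no stale knote outlives the process.
 */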
void vm_pressure_proc_cleanup(proc_t p)
{
	struct knote *kn = NULL;

	vm_pressure_klist_lock();

	VM_PRESSURE_DEBUG(0, "[vm_pressure] process %d exiting pressure notification\n", p->p_pid);

	SLIST_FOREACH(kn, &vm_pressure_klist, kn_selnext) {
		if (kn->kn_kq->kq_p == p) {
			KNOTE_DETACH(&vm_pressure_klist, kn);
			vm_pressure_klist_unlock();
			return;
		}
	}

	SLIST_FOREACH(kn, &vm_pressure_klist_dormant, kn_selnext) {
		if (kn->kn_kq->kq_p == p) {
			KNOTE_DETACH(&vm_pressure_klist_dormant, kn);
			vm_pressure_klist_unlock();
			return;
		}
	}

	vm_pressure_klist_unlock();
}

/*
 * Used by the vm_pressure_thread which is
 * signalled from within vm_pageout_scan().
 */
void consider_vm_pressure_events(void)
{
	vm_dispatch_memory_pressure();
}

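/*
 * Two dispatch strategies follow. When CONFIG_MEMORYSTATUS && CONFIG_JETSAM
 * are set, pressure handling defers to the memorystatus/jetsam subsystem,
 * which picks targets and calls back into this file per pid or for the
 * foreground set. Otherwise, candidates are selected directly from the
 * klist based on resident size, task importance and pressure level.
 */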
#if CONFIG_MEMORYSTATUS && CONFIG_JETSAM

static void vm_dispatch_memory_pressure(void)
{
	/* Update the pressure level and target the foreground or next-largest process as appropriate */
	memorystatus_update_vm_pressure(FALSE);
}

/* Jetsam aware version. Called with lock held */

static struct knote *vm_find_knote_from_pid(pid_t pid, struct klist *list) {
	struct knote *kn = NULL;

	SLIST_FOREACH(kn, list, kn_selnext) {
		struct proc *p;
		pid_t current_pid;

		p = kn->kn_kq->kq_p;
		current_pid = p->p_pid;

		if (current_pid == pid) {
			break;
		}
	}

	return kn;
}

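/*
 * Deliver a pressure note to one specific pid, typically on behalf of the
 * memorystatus/jetsam code. The caller indicates via 'locked' whether it
 * already holds the klist lock. Because a specific process is being
 * targeted, a knote that has already moved to the dormant list is still
 * honored.
 */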
int vm_dispatch_pressure_note_to_pid(pid_t pid, boolean_t locked) {
	int ret = EINVAL;
	struct knote *kn;

	VM_PRESSURE_DEBUG(1, "vm_dispatch_pressure_note_to_pid(): pid %d\n", pid);

	if (!locked) {
		vm_pressure_klist_lock();
	}

	/*
	 * Because we're specifically targeting a process here, we don't care
	 * if a warning has already been sent and it's moved to the dormant
	 * list; check that too.
	 */
	kn = vm_find_knote_from_pid(pid, &vm_pressure_klist);
	if (kn) {
		KNOTE(&vm_pressure_klist, pid);
		ret = 0;
	} else {
		kn = vm_find_knote_from_pid(pid, &vm_pressure_klist_dormant);
		if (kn) {
			KNOTE(&vm_pressure_klist_dormant, pid);
			ret = 0;
		}
	}

	if (!locked) {
		vm_pressure_klist_unlock();
	}

	return ret;
}

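/*
 * Notify every registered foreground process. Foreground knotes are moved
 * from both lists onto a local dispatch list while the proc_list lock is
 * held, then each one is fired and parked on the dormant list.
 */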
void vm_find_pressure_foreground_candidates(void)
{
	struct knote *kn, *kn_tmp;
	struct klist dispatch_klist = { NULL };

	vm_pressure_klist_lock();
	proc_list_lock();

	/* Find the foreground processes. */
	SLIST_FOREACH_SAFE(kn, &vm_pressure_klist, kn_selnext, kn_tmp) {
		proc_t p = kn->kn_kq->kq_p;

		if (memorystatus_is_foreground_locked(p)) {
			KNOTE_DETACH(&vm_pressure_klist, kn);
			KNOTE_ATTACH(&dispatch_klist, kn);
		}
	}

	SLIST_FOREACH_SAFE(kn, &vm_pressure_klist_dormant, kn_selnext, kn_tmp) {
		proc_t p = kn->kn_kq->kq_p;

		if (memorystatus_is_foreground_locked(p)) {
			KNOTE_DETACH(&vm_pressure_klist_dormant, kn);
			KNOTE_ATTACH(&dispatch_klist, kn);
		}
	}

	proc_list_unlock();

	/* Dispatch pressure notifications accordingly */
	SLIST_FOREACH_SAFE(kn, &dispatch_klist, kn_selnext, kn_tmp) {
		proc_t p = kn->kn_kq->kq_p;

		proc_list_lock();
		if (p != proc_ref_locked(p)) {
			proc_list_unlock();
			KNOTE_DETACH(&dispatch_klist, kn);
			KNOTE_ATTACH(&vm_pressure_klist_dormant, kn);
			continue;
		}
		proc_list_unlock();

		VM_PRESSURE_DEBUG(1, "[vm_pressure] sending event to pid %d\n", kn->kn_kq->kq_p->p_pid);
		KNOTE(&dispatch_klist, p->p_pid);
		KNOTE_DETACH(&dispatch_klist, kn);
		KNOTE_ATTACH(&vm_pressure_klist_dormant, kn);
		microuptime(&p->vm_pressure_last_notify_tstamp);
		memorystatus_send_pressure_note(p->p_pid);
		proc_rele(p);
	}

	vm_pressure_klist_unlock();
}

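/*
 * Pick a single candidate: the registered, background-pressure-eligible
 * process with the largest resident size (at least VM_PRESSURE_MINIMUM_RSIZE
 * and not notified within the last VM_PRESSURE_NOTIFY_WAIT_PERIOD), notify
 * it, and move its knote to the dormant list.
 */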
void vm_find_pressure_candidate(void)
{
	struct knote *kn = NULL, *kn_max = NULL;
	unsigned int resident_max = 0;
	pid_t target_pid = -1;
	struct klist dispatch_klist = { NULL };
	struct timeval curr_tstamp = {0, 0};
	int elapsed_msecs = 0;
	proc_t target_proc = PROC_NULL;
	kern_return_t kr = KERN_SUCCESS;

	microuptime(&curr_tstamp);

	vm_pressure_klist_lock();

	SLIST_FOREACH(kn, &vm_pressure_klist, kn_selnext) {
		struct mach_task_basic_info basic_info;
		mach_msg_type_number_t	size = MACH_TASK_BASIC_INFO_COUNT;
		unsigned int		resident_size = 0;
		proc_t			p = PROC_NULL;
		struct task*		t = TASK_NULL;
		struct timeval		elapsed_tstamp;

		p = kn->kn_kq->kq_p;
		proc_list_lock();
		if (p != proc_ref_locked(p)) {
			p = PROC_NULL;
			proc_list_unlock();
			continue;
		}
		proc_list_unlock();

		t = (struct task *)(p->task);

		/* Work on a copy so curr_tstamp isn't consumed across iterations */
		elapsed_tstamp = curr_tstamp;
		timevalsub(&elapsed_tstamp, &p->vm_pressure_last_notify_tstamp);
		elapsed_msecs = elapsed_tstamp.tv_sec * 1000 + elapsed_tstamp.tv_usec / 1000;

		if (elapsed_msecs < VM_PRESSURE_NOTIFY_WAIT_PERIOD) {
			proc_rele(p);
			continue;
		}

		if (!memorystatus_bg_pressure_eligible(p)) {
			VM_PRESSURE_DEBUG(1, "[vm_pressure] skipping process %d\n", p->p_pid);
			proc_rele(p);
			continue;
		}

		if ((kr = task_info(t, MACH_TASK_BASIC_INFO, (task_info_t)(&basic_info), &size)) != KERN_SUCCESS) {
			VM_PRESSURE_DEBUG(1, "[vm_pressure] task_info for pid %d failed\n", p->p_pid);
			proc_rele(p);
			continue;
		}

		/*
		 * We don't want a small process to block large processes from
		 * being notified again. <rdar://problem/7955532>
		 */
		resident_size = (basic_info.resident_size)/(1024 * 1024);
		if (resident_size >= VM_PRESSURE_MINIMUM_RSIZE) {
			if (resident_size > resident_max) {
				resident_max = resident_size;
				kn_max = kn;
				target_pid = p->p_pid;
				target_proc = p;
			}
		} else {
			/* There was no candidate with enough resident memory to scavenge */
			VM_PRESSURE_DEBUG(1, "[vm_pressure] threshold failed for pid %d with %u resident...\n", p->p_pid, resident_size);
		}
		proc_rele(p);
	}

	if (kn_max == NULL || target_pid == -1) {
		VM_PRESSURE_DEBUG(1, "[vm_pressure] - no target found!\n");
		goto exit;
	}

	VM_DEBUG_EVENT(vm_pageout_scan, VM_PRESSURE_EVENT, DBG_FUNC_NONE, target_pid, resident_max, 0, 0);
	VM_PRESSURE_DEBUG(1, "[vm_pressure] sending event to pid %d with %u resident\n", kn_max->kn_kq->kq_p->p_pid, resident_max);

	KNOTE_DETACH(&vm_pressure_klist, kn_max);

	target_proc = proc_find(target_pid);
	if (target_proc != PROC_NULL) {
		KNOTE_ATTACH(&dispatch_klist, kn_max);
		KNOTE(&dispatch_klist, target_pid);
		KNOTE_ATTACH(&vm_pressure_klist_dormant, kn_max);
		memorystatus_send_pressure_note(target_pid);
		microuptime(&target_proc->vm_pressure_last_notify_tstamp);
		proc_rele(target_proc);
	}

exit:
	vm_pressure_klist_unlock();
}

#else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */

struct knote *
vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level);

kern_return_t vm_pressure_notification_without_levels(void);
kern_return_t vm_pressure_notify_dispatch_vm_clients(void);

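/*
 * Non-jetsam dispatch entry point: if the active list is empty it is first
 * recharged from the dormant list, then a candidate notification is
 * attempted. Returns KERN_SUCCESS if a client was notified, KERN_FAILURE
 * otherwise.
 */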
kern_return_t
vm_pressure_notify_dispatch_vm_clients(void)
{
	vm_pressure_klist_lock();

	if (SLIST_EMPTY(&vm_pressure_klist)) {
		vm_reset_active_list();
	}

	if (!SLIST_EMPTY(&vm_pressure_klist)) {

		VM_PRESSURE_DEBUG(1, "[vm_pressure] vm_dispatch_memory_pressure\n");

		if (KERN_SUCCESS == vm_try_pressure_candidates()) {
			vm_pressure_klist_unlock();
			return KERN_SUCCESS;
		}
	}

	VM_PRESSURE_DEBUG(1, "[vm_pressure] could not find suitable event candidate\n");

	vm_pressure_klist_unlock();

	return KERN_FAILURE;
}

static void vm_dispatch_memory_pressure(void)
{
	memorystatus_update_vm_pressure(FALSE);
}

extern vm_pressure_level_t
convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t);

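/*
 * Walk 'candidate_list' and choose the knote to notify for the given
 * pressure 'level' (-1 means a level-free notification). A candidate must
 * be at least VM_PRESSURE_MINIMUM_RSIZE resident; among those, selection
 * weighs task importance against resident size, preferring unimportant
 * tasks while pressure is rising and important ones as it recedes, and
 * skips tasks already notified for the current level. Returns the chosen
 * knote, or NULL if none qualifies.
 */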
struct knote *
vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level)
{
	struct knote	*kn = NULL, *kn_max = NULL;
	unsigned int	resident_max = 0;
	kern_return_t	kr = KERN_SUCCESS;
	struct timeval	curr_tstamp = {0, 0};
	int		elapsed_msecs = 0;
	int		selected_task_importance = 0;
	static int	pressure_snapshot = -1;
	boolean_t	pressure_increase = FALSE;

	if (level != -1) {

		if (pressure_snapshot == -1) {
			/*
			 * Initial snapshot.
			 */
			pressure_snapshot = level;
			pressure_increase = TRUE;
		} else {

			if (level >= pressure_snapshot) {
				pressure_increase = TRUE;
			} else {
				pressure_increase = FALSE;
			}

			pressure_snapshot = level;
		}
	}

	if ((level > 0) && (pressure_increase == TRUE)) {
		/*
		 * We'll start by considering the largest
		 * unimportant task in our list.
		 */
		selected_task_importance = INT_MAX;
	} else {
		/*
		 * We'll start by considering the largest
		 * important task in our list.
		 */
		selected_task_importance = 0;
	}

	microuptime(&curr_tstamp);

	SLIST_FOREACH(kn, candidate_list, kn_selnext) {

		struct mach_task_basic_info basic_info;
		mach_msg_type_number_t	size = MACH_TASK_BASIC_INFO_COUNT;
		unsigned int		resident_size = 0;
		proc_t			p = PROC_NULL;
		struct task*		t = TASK_NULL;
		int			curr_task_importance = 0;
		boolean_t		consider_knote = FALSE;
		struct timeval		elapsed_tstamp;

		p = kn->kn_kq->kq_p;
		proc_list_lock();
		if (p != proc_ref_locked(p)) {
			p = PROC_NULL;
			proc_list_unlock();
			continue;
		}
		proc_list_unlock();

		t = (struct task *)(p->task);

		/* Work on a copy so curr_tstamp isn't consumed across iterations */
		elapsed_tstamp = curr_tstamp;
		timevalsub(&elapsed_tstamp, &p->vm_pressure_last_notify_tstamp);
		elapsed_msecs = elapsed_tstamp.tv_sec * 1000 + elapsed_tstamp.tv_usec / 1000;

		if ((level == -1) && (elapsed_msecs < VM_PRESSURE_NOTIFY_WAIT_PERIOD)) {
			proc_rele(p);
			continue;
		}

		if (level != -1) {
			/*
			 * For the level based notifications, check and see if this knote is
			 * registered for the current level.
			 */
			vm_pressure_level_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(level);

			if ((kn->kn_sfflags & dispatch_level) == 0) {
				proc_rele(p);
				continue;
			}
		}

		if ((kr = task_info(t, MACH_TASK_BASIC_INFO, (task_info_t)(&basic_info), &size)) != KERN_SUCCESS) {
			VM_PRESSURE_DEBUG(1, "[vm_pressure] task_info for pid %d failed with %d\n", p->p_pid, kr);
			proc_rele(p);
			continue;
		}

		curr_task_importance = task_importance_estimate(t);

		/*
		 * We don't want a small process to block large processes from
		 * being notified again. <rdar://problem/7955532>
		 */
		resident_size = (basic_info.resident_size)/(MB);

		if (resident_size >= VM_PRESSURE_MINIMUM_RSIZE) {

			if (level > 0) {
				/*
				 * Warning or Critical Pressure.
				 */
				if (pressure_increase) {
					if ((curr_task_importance <= selected_task_importance) && (resident_size > resident_max)) {
						if (task_has_been_notified(t, level) == FALSE) {
							consider_knote = TRUE;
						}
					}
				} else {
					if ((curr_task_importance >= selected_task_importance) && (resident_size > resident_max)) {
						if (task_has_been_notified(t, level) == FALSE) {
							consider_knote = TRUE;
						}
					}
				}
			} else if (level == 0) {
				/*
				 * Pressure back to normal.
				 */
				if ((curr_task_importance >= selected_task_importance) && (resident_size > resident_max)) {

					if ((task_has_been_notified(t, kVMPressureWarning) == TRUE) || (task_has_been_notified(t, kVMPressureCritical) == TRUE)) {
						consider_knote = TRUE;
					}
				}
			} else if (level == -1) {

				/*
				 * Simple (importance and level)-free behavior based solely on RSIZE.
				 */
				if (resident_size > resident_max) {
					consider_knote = TRUE;
				}
			}

			if (consider_knote) {
				resident_max = resident_size;
				kn_max = kn;
				selected_task_importance = curr_task_importance;
				consider_knote = FALSE; /* reset for the next candidate */
			}
		} else {
			/* There was no candidate with enough resident memory to scavenge */
			VM_PRESSURE_DEBUG(0, "[vm_pressure] threshold failed for pid %d with %u resident...\n", p->p_pid, resident_size);
		}
		proc_rele(p);
	}

	if (kn_max) {
		VM_PRESSURE_DEBUG(1, "[vm_pressure] sending event to pid %d with %u resident\n", kn_max->kn_kq->kq_p->p_pid, resident_max);
	}

	return kn_max;
}

/*
 * The vm_pressure_klist lock is held by the caller of this routine.
 */
kern_return_t vm_pressure_notification_without_levels(void)
{
	struct knote *kn_max = NULL;
	pid_t target_pid = -1;
	struct klist dispatch_klist = { NULL };
	proc_t	target_proc = PROC_NULL;

	kn_max = vm_pressure_select_optimal_candidate_to_notify(&vm_pressure_klist, -1);

	if (kn_max == NULL) {
		return KERN_FAILURE;
	}

	target_proc = kn_max->kn_kq->kq_p;

	KNOTE_DETACH(&vm_pressure_klist, kn_max);

	if (target_proc != PROC_NULL) {

		target_pid = target_proc->p_pid;

		memoryshot(VM_PRESSURE_EVENT, DBG_FUNC_NONE);

		KNOTE_ATTACH(&dispatch_klist, kn_max);
		KNOTE(&dispatch_klist, target_pid);
		KNOTE_ATTACH(&vm_pressure_klist_dormant, kn_max);

		microuptime(&target_proc->vm_pressure_last_notify_tstamp);
	}

	return KERN_SUCCESS;
}

static kern_return_t vm_try_pressure_candidates(void)
{
	/*
	 * This takes care of candidates that use NOTE_VM_PRESSURE.
	 * It's a notification without indication of the level
	 * of memory pressure.
	 */
	return (vm_pressure_notification_without_levels());
}

#endif /* !(CONFIG_MEMORYSTATUS && CONFIG_JETSAM) */

/*
 * Remove all elements from the dormant list and place them on the active list.
 * Called with klist lock held.
 */
void vm_reset_active_list(void) {
	/* Re-charge the main list from the dormant list if possible */
	if (!SLIST_EMPTY(&vm_pressure_klist_dormant)) {
		struct knote *kn;

		VM_PRESSURE_DEBUG(1, "[vm_pressure] recharging main list from dormant list\n");

		while (!SLIST_EMPTY(&vm_pressure_klist_dormant)) {
			kn = SLIST_FIRST(&vm_pressure_klist_dormant);
			SLIST_REMOVE_HEAD(&vm_pressure_klist_dormant, kn_selnext);
			SLIST_INSERT_HEAD(&vm_pressure_klist, kn, kn_selnext);
		}
	}
}