#define JEMALLOC_PAGES_C_
#include "jemalloc/internal/jemalloc_preamble.h"

#include "jemalloc/internal/pages.h"

#include "jemalloc/internal/jemalloc_internal_includes.h"

#include "jemalloc/internal/assert.h"
#include "jemalloc/internal/malloc_io.h"

#ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT
#include <sys/sysctl.h>
#ifdef __FreeBSD__
#include <sys/auxv.h>
#include <vm/vm_param.h>
#include <vm/vm.h>
#endif
#endif

/******************************************************************************/
/* Data. */

/* Actual operating system page size, detected during bootstrap, <= PAGE. */
static size_t	os_page;

#ifndef _WIN32
#  define PAGES_PROT_COMMIT (PROT_READ | PROT_WRITE)
#  define PAGES_PROT_DECOMMIT (PROT_NONE)
static int	mmap_flags;
#endif
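/*
 * Whether the OS overcommits virtual memory, detected in pages_boot().  When
 * it does, all mappings are treated as permanently committed: os_pages_map()
 * forces *commit to true and pages_commit_impl() short-circuits.
 */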
static bool	os_overcommits;

const char *thp_mode_names[] = {
	"default",
	"always",
	"never",
	"not supported"
};
thp_mode_t opt_thp = THP_MODE_DEFAULT;
thp_mode_t init_system_thp_mode;

/* Runtime support for lazy purge. Irrelevant when !pages_can_purge_lazy. */
static bool pages_can_purge_lazy_runtime = true;

/******************************************************************************/
/*
 * Function prototypes for static functions that are referenced prior to
 * definition.
 */

static void os_pages_unmap(void *addr, size_t size);

/******************************************************************************/
static void *
os_pages_map(void *addr, size_t size, size_t alignment, bool *commit) {
	assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr);
	assert(ALIGNMENT_CEILING(size, os_page) == size);
	assert(size != 0);

	if (os_overcommits) {
		*commit = true;
	}

	void *ret;
#ifdef _WIN32
	/*
	 * If a specific address is requested and VirtualAlloc can't allocate
	 * there, it fails and returns NULL.
	 */
	ret = VirtualAlloc(addr, size, MEM_RESERVE | (*commit ? MEM_COMMIT : 0),
	    PAGE_READWRITE);
#else
	/*
	 * We don't use MAP_FIXED here, because it can cause the *replacement*
	 * of existing mappings, and we only want to create new mappings.
	 */
	{
		int prot = *commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT;

		ret = mmap(addr, size, prot, mmap_flags, -1, 0);
	}
	assert(ret != NULL);

	if (ret == MAP_FAILED) {
		ret = NULL;
	} else if (addr != NULL && ret != addr) {
		/*
		 * We succeeded in mapping memory, but not in the right place.
		 */
		os_pages_unmap(ret, size);
		ret = NULL;
	}
#endif
	assert(ret == NULL || (addr == NULL && ret != addr) || (addr != NULL &&
	    ret == addr));
	return ret;
}

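/*
 * Carve the size-byte mapping that begins leadsize bytes into an
 * alloc_size-byte mapping.  Windows cannot release part of a VirtualAlloc
 * region, so the whole mapping is released and the target range remapped
 * (which can fail); elsewhere the excess head and tail are simply munmapped.
 */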
static void *
os_pages_trim(void *addr, size_t alloc_size, size_t leadsize, size_t size,
    bool *commit) {
	void *ret = (void *)((uintptr_t)addr + leadsize);

	assert(alloc_size >= leadsize + size);
#ifdef _WIN32
	os_pages_unmap(addr, alloc_size);
	void *new_addr = os_pages_map(ret, size, PAGE, commit);
	if (new_addr == ret) {
		return ret;
	}
	if (new_addr != NULL) {
		os_pages_unmap(new_addr, size);
	}
	return NULL;
#else
	size_t trailsize = alloc_size - leadsize - size;

	if (leadsize != 0) {
		os_pages_unmap(addr, leadsize);
	}
	if (trailsize != 0) {
		os_pages_unmap((void *)((uintptr_t)ret + size), trailsize);
	}
	return ret;
#endif
}

static void
os_pages_unmap(void *addr, size_t size) {
	assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr);
	assert(ALIGNMENT_CEILING(size, os_page) == size);

#ifdef _WIN32
	if (VirtualFree(addr, 0, MEM_RELEASE) == 0)
#else
	if (munmap(addr, size) == -1)
#endif
	{
		char buf[BUFERROR_BUF];

		buferror(get_errno(), buf, sizeof(buf));
		malloc_printf("<jemalloc>: Error in "
#ifdef _WIN32
		    "VirtualFree"
#else
		    "munmap"
#endif
		    "(): %s\n", buf);
		if (opt_abort) {
			abort();
		}
	}
}

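/*
 * Reliable-but-slow path: over-allocate by (alignment - os_page) bytes so
 * that some alignment-aligned address must fall within the mapping, then
 * trim.  For example, with 4 KiB pages, a 16 KiB request at 64 KiB alignment
 * maps 76 KiB; in the worst case the aligned base lies 60 KiB into the
 * mapping, leaving exactly 16 KiB after the trim.
 */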
static void *
pages_map_slow(size_t size, size_t alignment, bool *commit) {
	size_t alloc_size = size + alignment - os_page;
	/* Beware size_t wrap-around. */
	if (alloc_size < size) {
		return NULL;
	}

	void *ret;
	do {
		void *pages = os_pages_map(NULL, alloc_size, alignment, commit);
		if (pages == NULL) {
			return NULL;
		}
		size_t leadsize = ALIGNMENT_CEILING((uintptr_t)pages, alignment)
		    - (uintptr_t)pages;
		ret = os_pages_trim(pages, alloc_size, leadsize, size, commit);
	} while (ret == NULL);

	assert(ret != NULL);
	assert(PAGE_ADDR2BASE(ret) == ret);
	return ret;
}

void *
pages_map(void *addr, size_t size, size_t alignment, bool *commit) {
	assert(alignment >= PAGE);
	assert(ALIGNMENT_ADDR2BASE(addr, alignment) == addr);

#if defined(__FreeBSD__) && defined(MAP_EXCL)
	/*
	 * FreeBSD has mechanisms both to mmap at a specific address without
	 * touching existing mappings, and to mmap with a specific alignment.
	 */
	{
		if (os_overcommits) {
			*commit = true;
		}

		int prot = *commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT;
		int flags = mmap_flags;

		if (addr != NULL) {
			flags |= MAP_FIXED | MAP_EXCL;
		} else {
			unsigned alignment_bits = ffs_zu(alignment);
			assert(alignment_bits > 1);
			flags |= MAP_ALIGNED(alignment_bits - 1);
		}

		void *ret = mmap(addr, size, prot, flags, -1, 0);
		if (ret == MAP_FAILED) {
			ret = NULL;
		}

		return ret;
	}
#endif
	/*
	 * Ideally, there would be a way to specify alignment to mmap() (like
	 * NetBSD has), but in the absence of such a feature, we have to work
	 * hard to efficiently create aligned mappings.  The reliable, but
	 * slow method is to create a mapping that is over-sized, then trim the
	 * excess.  However, that always results in one or two calls to
	 * os_pages_unmap(), and it can leave holes in the process's virtual
	 * memory map if memory grows downward.
	 *
	 * Optimistically try mapping precisely the right amount before falling
	 * back to the slow method, with the expectation that the optimistic
	 * approach works most of the time.
	 */

	void *ret = os_pages_map(addr, size, os_page, commit);
	if (ret == NULL || ret == addr) {
		return ret;
	}
	assert(addr == NULL);
	if (ALIGNMENT_ADDR2OFFSET(ret, alignment) != 0) {
		os_pages_unmap(ret, size);
		return pages_map_slow(size, alignment, commit);
	}

	assert(PAGE_ADDR2BASE(ret) == ret);
	return ret;
}

void
pages_unmap(void *addr, size_t size) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);

	os_pages_unmap(addr, size);
}

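/*
 * As elsewhere in jemalloc, these return false on success and true on
 * failure.  Under overcommit, (de)commit is a no-op that reports failure, so
 * callers never track pages as having left the committed state.
 */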
static bool
pages_commit_impl(void *addr, size_t size, bool commit) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);

	if (os_overcommits) {
		return true;
	}

#ifdef _WIN32
	return (commit ? (addr != VirtualAlloc(addr, size, MEM_COMMIT,
	    PAGE_READWRITE)) : (!VirtualFree(addr, size, MEM_DECOMMIT)));
#else
	{
		int prot = commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT;
		void *result = mmap(addr, size, prot, mmap_flags | MAP_FIXED,
		    -1, 0);
		if (result == MAP_FAILED) {
			return true;
		}
		if (result != addr) {
			/*
			 * We succeeded in mapping memory, but not in the right
			 * place.
			 */
			os_pages_unmap(result, size);
			return true;
		}
		return false;
	}
#endif
}

bool
pages_commit(void *addr, size_t size) {
	return pages_commit_impl(addr, size, true);
}

bool
pages_decommit(void *addr, size_t size) {
	return pages_commit_impl(addr, size, false);
}

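/*
 * Lazy purge (e.g. MADV_FREE) merely marks pages as reclaimable, so their
 * contents remain readable until the kernel is under memory pressure; forced
 * purge (MADV_DONTNEED on Linux) drops the contents immediately, and
 * subsequent reads fault in zeroed pages.
 */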
bool
pages_purge_lazy(void *addr, size_t size) {
	assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr);
	assert(PAGE_CEILING(size) == size);

	if (!pages_can_purge_lazy) {
		return true;
	}
	if (!pages_can_purge_lazy_runtime) {
		/*
		 * Built with lazy purge enabled, but detected it was not
		 * supported on the current system.
		 */
		return true;
	}

#ifdef _WIN32
	VirtualAlloc(addr, size, MEM_RESET, PAGE_READWRITE);
	return false;
#elif defined(JEMALLOC_PURGE_MADVISE_FREE)
	return (madvise(addr, size,
#  ifdef MADV_FREE
	    MADV_FREE
#  else
	    JEMALLOC_MADV_FREE
#  endif
	    ) != 0);
#elif defined(JEMALLOC_PURGE_MADVISE_DONTNEED) && \
    !defined(JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS)
	return (madvise(addr, size, MADV_DONTNEED) != 0);
#else
	not_reached();
#endif
}

bool
pages_purge_forced(void *addr, size_t size) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);

	if (!pages_can_purge_forced) {
		return true;
	}

#if defined(JEMALLOC_PURGE_MADVISE_DONTNEED) && \
    defined(JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS)
	return (madvise(addr, size, MADV_DONTNEED) != 0);
#elif defined(JEMALLOC_MAPS_COALESCE)
	/* Try to overlay a new demand-zeroed mapping. */
	return pages_commit(addr, size);
#else
	not_reached();
#endif
}

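/*
 * The *_unaligned variants skip the hugepage alignment assertions; they exist
 * for callers such as pages_set_thp_state() that operate on ranges that are
 * only page-aligned.
 */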
static bool
pages_huge_impl(void *addr, size_t size, bool aligned) {
	if (aligned) {
		assert(HUGEPAGE_ADDR2BASE(addr) == addr);
		assert(HUGEPAGE_CEILING(size) == size);
	}
#ifdef JEMALLOC_HAVE_MADVISE_HUGE
	return (madvise(addr, size, MADV_HUGEPAGE) != 0);
#else
	return true;
#endif
}

bool
pages_huge(void *addr, size_t size) {
	return pages_huge_impl(addr, size, true);
}

static bool
pages_huge_unaligned(void *addr, size_t size) {
	return pages_huge_impl(addr, size, false);
}

static bool
pages_nohuge_impl(void *addr, size_t size, bool aligned) {
	if (aligned) {
		assert(HUGEPAGE_ADDR2BASE(addr) == addr);
		assert(HUGEPAGE_CEILING(size) == size);
	}

#ifdef JEMALLOC_HAVE_MADVISE_HUGE
	return (madvise(addr, size, MADV_NOHUGEPAGE) != 0);
#else
	return false;
#endif
}

bool
pages_nohuge(void *addr, size_t size) {
	return pages_nohuge_impl(addr, size, true);
}

static bool
pages_nohuge_unaligned(void *addr, size_t size) {
	return pages_nohuge_impl(addr, size, false);
}

bool
pages_dontdump(void *addr, size_t size) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);
#ifdef JEMALLOC_MADVISE_DONTDUMP
	return madvise(addr, size, MADV_DONTDUMP) != 0;
#else
	return false;
#endif
}

bool
pages_dodump(void *addr, size_t size) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);
#ifdef JEMALLOC_MADVISE_DONTDUMP
	return madvise(addr, size, MADV_DODUMP) != 0;
#else
	return false;
#endif
}

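/*
 * Note that os_page may be smaller than the compile-time PAGE;
 * pages_boot() fails on systems whose actual page size exceeds PAGE.
 */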
static size_t
os_page_detect(void) {
#ifdef _WIN32
	SYSTEM_INFO si;
	GetSystemInfo(&si);
	return si.dwPageSize;
#elif defined(__FreeBSD__)
	/*
	 * This returns the value obtained from
	 * the auxv vector, avoiding a syscall.
	 */
	return getpagesize();
#else
	long result = sysconf(_SC_PAGESIZE);
	if (result == -1) {
		return LG_PAGE;
	}
	return (size_t)result;
#endif
}

#ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT
static bool
os_overcommits_sysctl(void) {
	int vm_overcommit;
	size_t sz;

#ifdef ELF_BSDF_VMNOOVERCOMMIT
	int bsdflags;

	if (_elf_aux_info(AT_BSDFLAGS, &bsdflags, sizeof(bsdflags)) == 0)
		return ((bsdflags & ELF_BSDF_VMNOOVERCOMMIT) == 0);
#endif

	sz = sizeof(vm_overcommit);
#if defined(__FreeBSD__) && defined(VM_OVERCOMMIT)
	int mib[2];

	mib[0] = CTL_VM;
	mib[1] = VM_OVERCOMMIT;
	if (sysctl(mib, 2, &vm_overcommit, &sz, NULL, 0) != 0) {
		return false; /* Error. */
	}
#else
	if (sysctlbyname("vm.overcommit", &vm_overcommit, &sz, NULL, 0) != 0) {
		return false; /* Error. */
	}
#endif

#ifndef SWAP_RESERVE_FORCE_ON
#define	SWAP_RESERVE_FORCE_ON		(1 << 0)
#define	SWAP_RESERVE_RLIMIT_ON		(1 << 1)
#endif
	return ((vm_overcommit & (SWAP_RESERVE_FORCE_ON |
	    SWAP_RESERVE_RLIMIT_ON)) == 0);
}
#endif

#ifdef JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY
/*
 * Use syscall(2) rather than {open,read,close}(2) when possible to avoid
 * reentry during bootstrapping if another library has interposed system call
 * wrappers.
 */
static bool
os_overcommits_proc(void) {
	int fd;
	char buf[1];

#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_open)
	#if defined(O_CLOEXEC)
		fd = (int)syscall(SYS_open, "/proc/sys/vm/overcommit_memory", O_RDONLY |
			O_CLOEXEC);
	#else
		fd = (int)syscall(SYS_open, "/proc/sys/vm/overcommit_memory", O_RDONLY);
		if (fd != -1) {
			fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
		}
	#endif
#elif defined(JEMALLOC_USE_SYSCALL) && defined(SYS_openat)
	#if defined(O_CLOEXEC)
		fd = (int)syscall(SYS_openat,
			AT_FDCWD, "/proc/sys/vm/overcommit_memory", O_RDONLY | O_CLOEXEC);
	#else
		fd = (int)syscall(SYS_openat,
			AT_FDCWD, "/proc/sys/vm/overcommit_memory", O_RDONLY);
		if (fd != -1) {
			fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
		}
	#endif
#else
	#if defined(O_CLOEXEC)
		fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY | O_CLOEXEC);
	#else
		fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY);
		if (fd != -1) {
			fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
		}
	#endif
#endif

	if (fd == -1) {
		return false; /* Error. */
	}

	ssize_t nread = malloc_read_fd(fd, &buf, sizeof(buf));
#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_close)
	syscall(SYS_close, fd);
#else
	close(fd);
#endif

	if (nread < 1) {
		return false; /* Error. */
	}
	/*
	 * /proc/sys/vm/overcommit_memory meanings:
	 * 0: Heuristic overcommit.
	 * 1: Always overcommit.
	 * 2: Never overcommit.
	 */
	return (buf[0] == '0' || buf[0] == '1');
}
#endif

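/*
 * Apply the opt.thp setting to a newly mapped range.  Nothing needs to be
 * done when the requested mode matches the THP mode the system was already
 * in at startup; otherwise explicitly advise the kernel toward or away from
 * transparent huge pages.
 */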
void
pages_set_thp_state(void *ptr, size_t size) {
	if (opt_thp == thp_mode_default || opt_thp == init_system_thp_mode) {
		return;
	}
	assert(opt_thp != thp_mode_not_supported &&
	    init_system_thp_mode != thp_mode_not_supported);

	if (opt_thp == thp_mode_always
	    && init_system_thp_mode != thp_mode_never) {
		assert(init_system_thp_mode == thp_mode_default);
		pages_huge_unaligned(ptr, size);
	} else if (opt_thp == thp_mode_never) {
		assert(init_system_thp_mode == thp_mode_default ||
		    init_system_thp_mode == thp_mode_always);
		pages_nohuge_unaligned(ptr, size);
	}
}

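/*
 * Determine the system-wide THP mode by reading
 * /sys/kernel/mm/transparent_hugepage/enabled, whose contents match one of
 * the sys_state_* strings below, with the active mode in brackets.
 */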
static void
init_thp_state(void) {
	if (!have_madvise_huge) {
		if (metadata_thp_enabled() && opt_abort) {
			malloc_write("<jemalloc>: no MADV_HUGEPAGE support\n");
			abort();
		}
		goto label_error;
	}

	static const char sys_state_madvise[] = "always [madvise] never\n";
	static const char sys_state_always[] = "[always] madvise never\n";
	static const char sys_state_never[] = "always madvise [never]\n";
	char buf[sizeof(sys_state_madvise)];

#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_open)
	int fd = (int)syscall(SYS_open,
	    "/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY);
#else
	int fd = open("/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY);
#endif
	if (fd == -1) {
		goto label_error;
	}

	ssize_t nread = malloc_read_fd(fd, &buf, sizeof(buf));
#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_close)
	syscall(SYS_close, fd);
#else
	close(fd);
#endif

	if (nread < 0) {
		goto label_error;
	}

	if (strncmp(buf, sys_state_madvise, (size_t)nread) == 0) {
		init_system_thp_mode = thp_mode_default;
	} else if (strncmp(buf, sys_state_always, (size_t)nread) == 0) {
		init_system_thp_mode = thp_mode_always;
	} else if (strncmp(buf, sys_state_never, (size_t)nread) == 0) {
		init_system_thp_mode = thp_mode_never;
	} else {
		goto label_error;
	}
	return;
label_error:
	opt_thp = init_system_thp_mode = thp_mode_not_supported;
}

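/*
 * Bootstrap ordering matters here: the page size and mmap flags must be
 * established before any mapping is created, and the final probe maps a
 * scratch page to verify that lazy purge (e.g. MADV_FREE) actually works on
 * the running kernel rather than merely existing at compile time.
 */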
bool
pages_boot(void) {
	os_page = os_page_detect();
	if (os_page > PAGE) {
		malloc_write("<jemalloc>: Unsupported system page size\n");
		if (opt_abort) {
			abort();
		}
		return true;
	}

#ifndef _WIN32
	mmap_flags = MAP_PRIVATE | MAP_ANON;
#endif

#ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT
	os_overcommits = os_overcommits_sysctl();
#elif defined(JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY)
	os_overcommits = os_overcommits_proc();
#  ifdef MAP_NORESERVE
	if (os_overcommits) {
		mmap_flags |= MAP_NORESERVE;
	}
#  endif
#else
	os_overcommits = false;
#endif

	init_thp_state();

#ifdef __FreeBSD__
	/*
	 * FreeBSD doesn't need the check; madvise(2) is known to work.
	 */
#else
	/* Detect lazy purge runtime support. */
	if (pages_can_purge_lazy) {
		bool committed = false;
		void *madv_free_page = os_pages_map(NULL, PAGE, PAGE, &committed);
		if (madv_free_page == NULL) {
			return true;
		}
		assert(pages_can_purge_lazy_runtime);
		if (pages_purge_lazy(madv_free_page, PAGE)) {
			pages_can_purge_lazy_runtime = false;
		}
		os_pages_unmap(madv_free_page, PAGE);
	}
#endif

	return false;
}