xref: /netbsd-src/external/bsd/jemalloc/dist/src/pages.c (revision e60e3cf4f0b11e369ed5d70524e9ba533036e51e)
#include "jemalloc/internal/jemalloc_preamble.h"

#include "jemalloc/internal/pages.h"

#include "jemalloc/internal/jemalloc_internal_includes.h"

#include "jemalloc/internal/assert.h"
#include "jemalloc/internal/malloc_io.h"

#ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT
#include <sys/sysctl.h>
#ifdef __FreeBSD__
#include <vm/vm_param.h>
#endif
#endif
#ifdef __NetBSD__
#include <sys/bitops.h>	/* ilog2 */
#endif
#ifdef JEMALLOC_HAVE_VM_MAKE_TAG
#define PAGES_FD_TAG VM_MAKE_TAG(101U)
#else
#define PAGES_FD_TAG -1
#endif

/******************************************************************************/
/* Data. */

/* Actual operating system page size, detected during bootstrap, <= PAGE. */
static size_t	os_page;

#ifndef _WIN32
#  define PAGES_PROT_COMMIT (PROT_READ | PROT_WRITE)
#  define PAGES_PROT_DECOMMIT (PROT_NONE)
static int	mmap_flags;
#endif
static bool	os_overcommits;

const char *thp_mode_names[] = {
	"default",
	"always",
	"never",
	"not supported"
};
thp_mode_t opt_thp = THP_MODE_DEFAULT;
thp_mode_t init_system_thp_mode;

/* Runtime support for lazy purge. Irrelevant when !pages_can_purge_lazy. */
static bool pages_can_purge_lazy_runtime = true;

#ifdef JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS
static int madvise_dont_need_zeros_is_faulty = -1;
/*
 * Check that MADV_DONTNEED will actually zero pages on subsequent access.
 *
 * QEMU does not support this yet [1], and you can hit a very tricky
 * assertion failure if you run a program that uses jemalloc under QEMU:
 *
 *     <jemalloc>: ../contrib/jemalloc/src/extent.c:1195: Failed assertion: "p[i] == 0"
 *
 *   [1]: https://patchwork.kernel.org/patch/10576637/
 */
static int
madvise_MADV_DONTNEED_zeroes_pages(void) {
	int works = -1;
	size_t size = PAGE;

	void *addr = mmap(NULL, size, PROT_READ|PROT_WRITE,
	    MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);

	if (addr == MAP_FAILED) {
		malloc_write("<jemalloc>: Cannot allocate memory for "
		    "MADV_DONTNEED check\n");
		if (opt_abort) {
			abort();
		}
		/*
		 * Do not touch MAP_FAILED below; assume madvise() behaves,
		 * matching the opt_trust_madvise path.
		 */
		return 1;
	}

	memset(addr, 'A', size);
	if (madvise(addr, size, MADV_DONTNEED) == 0) {
		works = memchr(addr, 'A', size) == NULL;
	} else {
		/*
		 * madvise() failed, so MADV_DONTNEED is presumably
		 * unsupported; it will then never be used for purging, and
		 * the faulty zeroing behavior cannot be observed.
		 */
		works = 1;
	}

	if (munmap(addr, size) != 0) {
		malloc_write("<jemalloc>: Cannot deallocate memory for "
		    "MADV_DONTNEED check\n");
		if (opt_abort) {
			abort();
		}
	}

	return works;
}
#endif

/******************************************************************************/
/*
 * Function prototypes for static functions that are referenced prior to
 * definition.
 */

static void os_pages_unmap(void *addr, size_t size);

/******************************************************************************/

static void *
os_pages_map(void *addr, size_t size, size_t alignment, bool *commit) {
	assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr);
	assert(ALIGNMENT_CEILING(size, os_page) == size);
	assert(size != 0);

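	/*
	 * With system overcommit there is no separate commit step: every
	 * mapping is usable as soon as it exists, so report it committed.
	 */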
	if (os_overcommits) {
		*commit = true;
	}

	void *ret;
#ifdef _WIN32
	/*
	 * If VirtualAlloc can't allocate at the given address when one is
	 * given, it fails and returns NULL.
	 */
	ret = VirtualAlloc(addr, size, MEM_RESERVE | (*commit ? MEM_COMMIT : 0),
	    PAGE_READWRITE);
#else
	/*
	 * We don't use MAP_FIXED here, because it can cause the *replacement*
	 * of existing mappings, and we only want to create new mappings.
	 */
	{
		int flags = mmap_flags;
#ifdef __NetBSD__
		/*
		 * On NetBSD PAGE for a platform is defined to the
		 * maximum page size of all machine architectures
		 * for that platform, so that we can use the same
		 * binaries across all machine architectures.
		 */
		if (alignment > os_page || PAGE > os_page) {
			unsigned int a = ilog2(MAX(alignment, PAGE));
			flags |= MAP_ALIGNED(a);
		}
#endif
		int prot = *commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT;

		ret = mmap(addr, size, prot, flags, PAGES_FD_TAG, 0);
	}
	assert(ret != NULL);

	if (ret == MAP_FAILED) {
		ret = NULL;
	} else if (addr != NULL && ret != addr) {
		/*
		 * We succeeded in mapping memory, but not in the right place.
		 */
		os_pages_unmap(ret, size);
		ret = NULL;
	}
#endif
	assert(ret == NULL || (addr == NULL && ret != addr) || (addr != NULL &&
	    ret == addr));
	return ret;
}

static void *
os_pages_trim(void *addr, size_t alloc_size, size_t leadsize, size_t size,
    bool *commit) {
	void *ret = (void *)((uintptr_t)addr + leadsize);

	assert(alloc_size >= leadsize + size);
#ifdef _WIN32
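	/*
	 * Windows cannot unmap part of a reservation, so release the whole
	 * mapping and re-reserve just the trimmed range.  Another thread may
	 * grab the address in between, in which case the caller retries.
	 */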
	os_pages_unmap(addr, alloc_size);
	void *new_addr = os_pages_map(ret, size, PAGE, commit);
	if (new_addr == ret) {
		return ret;
	}
	if (new_addr != NULL) {
		os_pages_unmap(new_addr, size);
	}
	return NULL;
#else
	size_t trailsize = alloc_size - leadsize - size;

	if (leadsize != 0) {
		os_pages_unmap(addr, leadsize);
	}
	if (trailsize != 0) {
		os_pages_unmap((void *)((uintptr_t)ret + size), trailsize);
	}
	return ret;
#endif
}

static void
os_pages_unmap(void *addr, size_t size) {
	assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr);
	assert(ALIGNMENT_CEILING(size, os_page) == size);

#ifdef _WIN32
	if (VirtualFree(addr, 0, MEM_RELEASE) == 0)
#else
	if (munmap(addr, size) == -1)
#endif
	{
		char buf[BUFERROR_BUF];

		buferror(get_errno(), buf, sizeof(buf));
		malloc_printf("<jemalloc>: Error in "
#ifdef _WIN32
		    "VirtualFree"
#else
		    "munmap"
#endif
		    "(): %s\n", buf);
		if (opt_abort) {
			abort();
		}
	}
}

static void *
pages_map_slow(size_t size, size_t alignment, bool *commit) {
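	/*
	 * Over-allocate so that an aligned address is guaranteed to exist
	 * within the mapping: any span with alignment - os_page extra bytes
	 * contains an alignment boundary.
	 */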
	size_t alloc_size = size + alignment - os_page;
	/* Beware size_t wrap-around. */
	if (alloc_size < size) {
		return NULL;
	}

	void *ret;
	do {
		void *pages = os_pages_map(NULL, alloc_size, alignment, commit);
		if (pages == NULL) {
			return NULL;
		}
		size_t leadsize = ALIGNMENT_CEILING((uintptr_t)pages, alignment)
		    - (uintptr_t)pages;
		ret = os_pages_trim(pages, alloc_size, leadsize, size, commit);
	} while (ret == NULL);

	assert(ret != NULL);
	assert(PAGE_ADDR2BASE(ret) == ret);
	return ret;
}

void *
pages_map(void *addr, size_t size, size_t alignment, bool *commit) {
	assert(alignment >= PAGE);
	assert(ALIGNMENT_ADDR2BASE(addr, alignment) == addr);

#if defined(__FreeBSD__) && defined(MAP_EXCL)
	/*
	 * FreeBSD has mechanisms both to mmap at specific address without
	 * touching existing mappings, and to mmap with specific alignment.
	 */
	{
		if (os_overcommits) {
			*commit = true;
		}

		int prot = *commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT;
		int flags = mmap_flags;

		if (addr != NULL) {
			flags |= MAP_FIXED | MAP_EXCL;
		} else {
			unsigned alignment_bits = ffs_zu(alignment);
			assert(alignment_bits > 0);
			flags |= MAP_ALIGNED(alignment_bits);
		}

		void *ret = mmap(addr, size, prot, flags, -1, 0);
		if (ret == MAP_FAILED) {
			ret = NULL;
		}

		return ret;
	}
#endif
	/*
	 * Ideally, there would be a way to specify alignment to mmap() (like
	 * NetBSD has), but in the absence of such a feature, we have to work
	 * hard to efficiently create aligned mappings.  The reliable, but
	 * slow method is to create a mapping that is over-sized, then trim the
	 * excess.  However, that always results in one or two calls to
	 * os_pages_unmap(), and it can leave holes in the process's virtual
	 * memory map if memory grows downward.
	 *
	 * Optimistically try mapping precisely the right amount before falling
	 * back to the slow method, with the expectation that the optimistic
	 * approach works most of the time.
	 */

	void *ret = os_pages_map(addr, size, os_page, commit);
	if (ret == NULL || ret == addr) {
		return ret;
	}
	assert(addr == NULL);
	if (ALIGNMENT_ADDR2OFFSET(ret, alignment) != 0) {
		os_pages_unmap(ret, size);
		return pages_map_slow(size, alignment, commit);
	}

	assert(PAGE_ADDR2BASE(ret) == ret);
	return ret;
}

void
pages_unmap(void *addr, size_t size) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);

	os_pages_unmap(addr, size);
}

static bool
os_pages_commit(void *addr, size_t size, bool commit) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);

#ifdef _WIN32
	return (commit ? (addr != VirtualAlloc(addr, size, MEM_COMMIT,
	    PAGE_READWRITE)) : (!VirtualFree(addr, size, MEM_DECOMMIT)));
#else
	{
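		/*
		 * Overlay the range with a fresh anonymous mapping via
		 * MAP_FIXED: PROT_READ|PROT_WRITE commits, PROT_NONE
		 * decommits.  Replacing the mapping is safe here because the
		 * caller owns the entire range.
		 */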
		int prot = commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT;
		void *result = mmap(addr, size, prot, mmap_flags | MAP_FIXED,
		    PAGES_FD_TAG, 0);
		if (result == MAP_FAILED) {
			return true;
		}
		if (result != addr) {
			/*
			 * We succeeded in mapping memory, but not in the right
			 * place.
			 */
			os_pages_unmap(result, size);
			return true;
		}
		return false;
	}
#endif
}

static bool
pages_commit_impl(void *addr, size_t size, bool commit) {
	if (os_overcommits) {
		return true;
	}

	return os_pages_commit(addr, size, commit);
}

bool
pages_commit(void *addr, size_t size) {
	return pages_commit_impl(addr, size, true);
}

bool
pages_decommit(void *addr, size_t size) {
	return pages_commit_impl(addr, size, false);
}

void
pages_mark_guards(void *head, void *tail) {
	assert(head != NULL || tail != NULL);
	assert(head == NULL || tail == NULL ||
	    (uintptr_t)head < (uintptr_t)tail);
#ifdef JEMALLOC_HAVE_MPROTECT
	if (head != NULL) {
		mprotect(head, PAGE, PROT_NONE);
	}
	if (tail != NULL) {
		mprotect(tail, PAGE, PROT_NONE);
	}
#else
	/* Decommit sets to PROT_NONE / MEM_DECOMMIT. */
	if (head != NULL) {
		os_pages_commit(head, PAGE, false);
	}
	if (tail != NULL) {
		os_pages_commit(tail, PAGE, false);
	}
#endif
}

void
pages_unmark_guards(void *head, void *tail) {
	assert(head != NULL || tail != NULL);
	assert(head == NULL || tail == NULL ||
	    (uintptr_t)head < (uintptr_t)tail);
#ifdef JEMALLOC_HAVE_MPROTECT
	bool head_and_tail = (head != NULL) && (tail != NULL);
	size_t range = head_and_tail ?
	    (uintptr_t)tail - (uintptr_t)head + PAGE :
	    SIZE_T_MAX;
	/*
	 * The amount of work that the kernel does in mprotect depends on the
	 * range argument.  SC_LARGE_MINCLASS is an arbitrary threshold chosen
	 * to prevent the kernel from doing so much work that it would outweigh
	 * the savings of performing one less system call.
	 */
	bool ranged_mprotect = head_and_tail && range <= SC_LARGE_MINCLASS;
	if (ranged_mprotect) {
		mprotect(head, range, PROT_READ | PROT_WRITE);
	} else {
		if (head != NULL) {
			mprotect(head, PAGE, PROT_READ | PROT_WRITE);
		}
		if (tail != NULL) {
			mprotect(tail, PAGE, PROT_READ | PROT_WRITE);
		}
	}
#else
	if (head != NULL) {
		os_pages_commit(head, PAGE, true);
	}
	if (tail != NULL) {
		os_pages_commit(tail, PAGE, true);
	}
#endif
}

bool
pages_purge_lazy(void *addr, size_t size) {
	assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr);
	assert(PAGE_CEILING(size) == size);

	if (!pages_can_purge_lazy) {
		return true;
	}
	if (!pages_can_purge_lazy_runtime) {
		/*
		 * Built with lazy purge enabled, but detected it was not
		 * supported on the current system.
		 */
		return true;
	}

#ifdef _WIN32
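	/*
	 * MEM_RESET marks the pages as no longer needed: they stay mapped,
	 * and the OS may discard their contents at any time, which is the
	 * lazy-purge semantic we want.
	 */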
	VirtualAlloc(addr, size, MEM_RESET, PAGE_READWRITE);
	return false;
#elif defined(JEMALLOC_PURGE_MADVISE_FREE)
	return (madvise(addr, size,
#  ifdef MADV_FREE
	    MADV_FREE
#  else
	    JEMALLOC_MADV_FREE
#  endif
	    ) != 0);
#elif defined(JEMALLOC_PURGE_MADVISE_DONTNEED) && \
    !defined(JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS)
	return (madvise(addr, size, MADV_DONTNEED) != 0);
#elif defined(JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED) && \
    !defined(JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS)
	return (posix_madvise(addr, size, POSIX_MADV_DONTNEED) != 0);
#else
	not_reached();
#endif
}

bool
pages_purge_forced(void *addr, size_t size) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);

	if (!pages_can_purge_forced) {
		return true;
	}

#if defined(JEMALLOC_PURGE_MADVISE_DONTNEED) && \
    defined(JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS)
	return (unlikely(madvise_dont_need_zeros_is_faulty) ||
	    madvise(addr, size, MADV_DONTNEED) != 0);
#elif defined(JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED) && \
    defined(JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS)
	return (unlikely(madvise_dont_need_zeros_is_faulty) ||
	    posix_madvise(addr, size, POSIX_MADV_DONTNEED) != 0);
#elif defined(JEMALLOC_MAPS_COALESCE)
	/* Try to overlay a new demand-zeroed mapping. */
	return pages_commit(addr, size);
#else
	not_reached();
#endif
}

static bool
pages_huge_impl(void *addr, size_t size, bool aligned) {
	if (aligned) {
		assert(HUGEPAGE_ADDR2BASE(addr) == addr);
		assert(HUGEPAGE_CEILING(size) == size);
	}
#if defined(JEMALLOC_HAVE_MADVISE_HUGE)
	return (madvise(addr, size, MADV_HUGEPAGE) != 0);
#elif defined(JEMALLOC_HAVE_MEMCNTL)
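	/*
	 * Solaris: advise the HAT to use HUGEPAGE-sized mappings for this
	 * range (true means failure, as for the madvise() branch above).
	 */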
	struct memcntl_mha m = {0};
	m.mha_cmd = MHA_MAPSIZE_VA;
	m.mha_pagesize = HUGEPAGE;
	return (memcntl(addr, size, MC_HAT_ADVISE, (caddr_t)&m, 0, 0) != 0);
#else
	return true;
#endif
}

bool
pages_huge(void *addr, size_t size) {
	return pages_huge_impl(addr, size, true);
}

static bool
pages_huge_unaligned(void *addr, size_t size) {
	return pages_huge_impl(addr, size, false);
}

static bool
pages_nohuge_impl(void *addr, size_t size, bool aligned) {
	if (aligned) {
		assert(HUGEPAGE_ADDR2BASE(addr) == addr);
		assert(HUGEPAGE_CEILING(size) == size);
	}

#ifdef JEMALLOC_HAVE_MADVISE_HUGE
	return (madvise(addr, size, MADV_NOHUGEPAGE) != 0);
#else
	return false;
#endif
}

bool
pages_nohuge(void *addr, size_t size) {
	return pages_nohuge_impl(addr, size, true);
}

static bool
pages_nohuge_unaligned(void *addr, size_t size) {
	return pages_nohuge_impl(addr, size, false);
}

bool
pages_dontdump(void *addr, size_t size) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);
#if defined(JEMALLOC_MADVISE_DONTDUMP)
	return madvise(addr, size, MADV_DONTDUMP) != 0;
#elif defined(JEMALLOC_MADVISE_NOCORE)
	return madvise(addr, size, MADV_NOCORE) != 0;
#else
	return false;
#endif
}

bool
pages_dodump(void *addr, size_t size) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);
#if defined(JEMALLOC_MADVISE_DONTDUMP)
	return madvise(addr, size, MADV_DODUMP) != 0;
#elif defined(JEMALLOC_MADVISE_NOCORE)
	return madvise(addr, size, MADV_CORE) != 0;
#else
	return false;
#endif
}

static size_t
os_page_detect(void) {
#ifdef _WIN32
	SYSTEM_INFO si;
	GetSystemInfo(&si);
	return si.dwPageSize;
#elif defined(__FreeBSD__)
	/*
	 * This returns the value obtained from
	 * the auxv vector, avoiding a syscall.
	 */
	return getpagesize();
#else
	long result = sysconf(_SC_PAGESIZE);
	if (result == -1) {
		/* Fall back to the compile-time page size. */
		return PAGE;
	}
	return (size_t)result;
#endif
}

#ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT
static bool
os_overcommits_sysctl(void) {
	int vm_overcommit;
	size_t sz;

	sz = sizeof(vm_overcommit);
#if defined(__FreeBSD__) && defined(VM_OVERCOMMIT)
	int mib[2];

	mib[0] = CTL_VM;
	mib[1] = VM_OVERCOMMIT;
	if (sysctl(mib, 2, &vm_overcommit, &sz, NULL, 0) != 0) {
		return false; /* Error. */
	}
#else
	if (sysctlbyname("vm.overcommit", &vm_overcommit, &sz, NULL, 0) != 0) {
		return false; /* Error. */
	}
#endif

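	/*
	 * Bits 0x1 and 0x2 (SWAP_RESERVE_FORCE_ON and SWAP_RESERVE_RLIMIT_ON
	 * on FreeBSD) enable strict swap accounting; if neither is set, the
	 * kernel overcommits.
	 */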
	return ((vm_overcommit & 0x3) == 0);
}
#endif

#ifdef JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY
/*
 * Use syscall(2) rather than {open,read,close}(2) when possible to avoid
 * reentry during bootstrapping if another library has interposed system call
 * wrappers.
 */
static bool
os_overcommits_proc(void) {
	int fd;
	char buf[1];
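	/*
	 * Reading one byte suffices: the mode is the first character of
	 * /proc/sys/vm/overcommit_memory ('0', '1', or '2').
	 */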

#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_open)
	#if defined(O_CLOEXEC)
		fd = (int)syscall(SYS_open, "/proc/sys/vm/overcommit_memory", O_RDONLY |
			O_CLOEXEC);
	#else
		fd = (int)syscall(SYS_open, "/proc/sys/vm/overcommit_memory", O_RDONLY);
		if (fd != -1) {
			fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
		}
	#endif
#elif defined(JEMALLOC_USE_SYSCALL) && defined(SYS_openat)
	#if defined(O_CLOEXEC)
		fd = (int)syscall(SYS_openat,
			AT_FDCWD, "/proc/sys/vm/overcommit_memory", O_RDONLY | O_CLOEXEC);
	#else
		fd = (int)syscall(SYS_openat,
			AT_FDCWD, "/proc/sys/vm/overcommit_memory", O_RDONLY);
		if (fd != -1) {
			fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
		}
	#endif
#else
	#if defined(O_CLOEXEC)
		fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY | O_CLOEXEC);
	#else
		fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY);
		if (fd != -1) {
			fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
		}
	#endif
#endif

	if (fd == -1) {
		return false; /* Error. */
	}

	ssize_t nread = malloc_read_fd(fd, &buf, sizeof(buf));
#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_close)
	syscall(SYS_close, fd);
#else
	close(fd);
#endif

	if (nread < 1) {
		return false; /* Error. */
	}
	/*
	 * /proc/sys/vm/overcommit_memory meanings:
	 * 0: Heuristic overcommit.
	 * 1: Always overcommit.
	 * 2: Never overcommit.
	 */
	return (buf[0] == '0' || buf[0] == '1');
}
#endif

void
pages_set_thp_state(void *ptr, size_t size) {
	if (opt_thp == thp_mode_default || opt_thp == init_system_thp_mode) {
		return;
	}
	assert(opt_thp != thp_mode_not_supported &&
	    init_system_thp_mode != thp_mode_not_supported);

	if (opt_thp == thp_mode_always
	    && init_system_thp_mode != thp_mode_never) {
		assert(init_system_thp_mode == thp_mode_default);
		pages_huge_unaligned(ptr, size);
	} else if (opt_thp == thp_mode_never) {
		assert(init_system_thp_mode == thp_mode_default ||
		    init_system_thp_mode == thp_mode_always);
		pages_nohuge_unaligned(ptr, size);
	}
}

static void
init_thp_state(void) {
	if (!have_madvise_huge && !have_memcntl) {
		if (metadata_thp_enabled() && opt_abort) {
			malloc_write("<jemalloc>: no MADV_HUGEPAGE support\n");
			abort();
		}
		goto label_error;
	}
#if defined(JEMALLOC_HAVE_MADVISE_HUGE)
	static const char sys_state_madvise[] = "always [madvise] never\n";
	static const char sys_state_always[] = "[always] madvise never\n";
	static const char sys_state_never[] = "always madvise [never]\n";
	char buf[sizeof(sys_state_madvise)];
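	/*
	 * Each string above is the exact contents of
	 * /sys/kernel/mm/transparent_hugepage/enabled for one system-wide
	 * mode; comparing the whole read identifies the current default.
	 */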

#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_open)
	int fd = (int)syscall(SYS_open,
	    "/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY);
#elif defined(JEMALLOC_USE_SYSCALL) && defined(SYS_openat)
	int fd = (int)syscall(SYS_openat,
	    AT_FDCWD, "/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY);
#else
	int fd = open("/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY);
#endif
	if (fd == -1) {
		goto label_error;
	}

	ssize_t nread = malloc_read_fd(fd, &buf, sizeof(buf));
#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_close)
	syscall(SYS_close, fd);
#else
	close(fd);
#endif

	if (nread < 0) {
		goto label_error;
	}

	if (strncmp(buf, sys_state_madvise, (size_t)nread) == 0) {
		init_system_thp_mode = thp_mode_default;
	} else if (strncmp(buf, sys_state_always, (size_t)nread) == 0) {
		init_system_thp_mode = thp_mode_always;
	} else if (strncmp(buf, sys_state_never, (size_t)nread) == 0) {
		init_system_thp_mode = thp_mode_never;
	} else {
		goto label_error;
	}
	return;
#elif defined(JEMALLOC_HAVE_MEMCNTL)
	init_system_thp_mode = thp_mode_default;
	return;
#endif
label_error:
	opt_thp = init_system_thp_mode = thp_mode_not_supported;
}

bool
pages_boot(void) {
	os_page = os_page_detect();
	if (os_page > PAGE) {
		malloc_write("<jemalloc>: Unsupported system page size\n");
		if (opt_abort) {
			abort();
		}
		return true;
	}

#ifdef JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS
	if (!opt_trust_madvise) {
		madvise_dont_need_zeros_is_faulty =
		    !madvise_MADV_DONTNEED_zeroes_pages();
		if (madvise_dont_need_zeros_is_faulty) {
			malloc_write("<jemalloc>: MADV_DONTNEED does not "
			    "work (memset will be used instead)\n");
			malloc_write("<jemalloc>: (This is the expected "
			    "behaviour if you are running under QEMU)\n");
		}
	} else {
		/* opt_trust_madvise is enabled: skip the runtime check. */
		madvise_dont_need_zeros_is_faulty = 0;
	}
#endif

#ifndef _WIN32
	mmap_flags = MAP_PRIVATE | MAP_ANON;
#endif

#ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT
	os_overcommits = os_overcommits_sysctl();
#elif defined(JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY)
	os_overcommits = os_overcommits_proc();
#  ifdef MAP_NORESERVE
	if (os_overcommits) {
		mmap_flags |= MAP_NORESERVE;
	}
#  endif
#elif defined(__NetBSD__)
	os_overcommits = true;
#else
	os_overcommits = false;
#endif

	init_thp_state();

#ifdef __FreeBSD__
	/*
	 * FreeBSD doesn't need the check; madvise(2) is known to work.
	 */
#else
	/* Detect lazy purge runtime support. */
	if (pages_can_purge_lazy) {
		bool committed = false;
		void *madv_free_page = os_pages_map(NULL, PAGE, PAGE, &committed);
		if (madv_free_page == NULL) {
			return true;
		}
		assert(pages_can_purge_lazy_runtime);
		if (pages_purge_lazy(madv_free_page, PAGE)) {
			pages_can_purge_lazy_runtime = false;
		}
		os_pages_unmap(madv_free_page, PAGE);
	}
#endif

	return false;
}
826