// SPDX-License-Identifier: GPL-2.0
/*
 * HugeTLB Vmemmap Optimization (HVO)
 *
 * Copyright (c) 2020, ByteDance. All rights reserved.
 *
 *     Author: Muchun Song <songmuchun@bytedance.com>
 *
 * See Documentation/mm/vmemmap_dedup.rst
 */
#define pr_fmt(fmt)	"HugeTLB: " fmt

#include <linux/pgtable.h>
#include <linux/moduleparam.h>
#include <linux/bootmem_info.h>
#include <linux/mmdebug.h>
#include <linux/pagewalk.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "hugetlb_vmemmap.h"

/**
 * struct vmemmap_remap_walk - walk vmemmap page table
 *
 * @remap_pte:		called for each lowest-level entry (PTE).
 * @nr_walked:		the number of PTEs walked so far.
 * @reuse_page:		the page which is reused for the tail vmemmap pages.
 * @reuse_addr:		the virtual address of the @reuse_page page.
 * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
 *			or from which new mappings are taken.
 * @flags:		used to modify behavior in vmemmap page table walking
 *			operations.
 */
struct vmemmap_remap_walk {
	void			(*remap_pte)(pte_t *pte, unsigned long addr,
					     struct vmemmap_remap_walk *walk);
	unsigned long		nr_walked;
	struct page		*reuse_page;
	unsigned long		reuse_addr;
	struct list_head	*vmemmap_pages;

/* Skip the TLB flush when we split the PMD */
#define VMEMMAP_SPLIT_NO_TLB_FLUSH	BIT(0)
/* Skip the TLB flush when we remap the PTE */
#define VMEMMAP_REMAP_NO_TLB_FLUSH	BIT(1)
	unsigned long		flags;
};

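/*
 * Replace a leaf vmemmap PMD with a PTE-level page table that maps the same
 * PMD-sized range page by page, so that individual vmemmap pages can be
 * remapped and freed later.
 */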
static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start,
			     struct vmemmap_remap_walk *walk)
{
	pmd_t __pmd;
	int i;
	unsigned long addr = start;
	pte_t *pgtable;

	pgtable = pte_alloc_one_kernel(&init_mm);
	if (!pgtable)
		return -ENOMEM;

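	/*
	 * Hook the new page table into an on-stack dummy PMD so that
	 * pte_offset_kernel() below can reach its PTEs before the real
	 * PMD entry is switched over.
	 */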
	pmd_populate_kernel(&init_mm, &__pmd, pgtable);

	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
		pte_t entry, *pte;
		pgprot_t pgprot = PAGE_KERNEL;

		entry = mk_pte(head + i, pgprot);
		pte = pte_offset_kernel(&__pmd, addr);
		set_pte_at(&init_mm, addr, pte, entry);
	}

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_leaf(*pmd))) {
		/*
		 * Higher order allocations from the buddy allocator must be
		 * able to be treated as independent small pages (as they can
		 * be freed individually).
		 */
		if (!PageReserved(head))
			split_page(head, get_order(PMD_SIZE));

		/* Make pte visible before pmd. See comment in pmd_install(). */
		smp_wmb();
		pmd_populate_kernel(&init_mm, pmd, pgtable);
		if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH))
			flush_tlb_kernel_range(start, start + PMD_SIZE);
	} else {
		pte_free_kernel(&init_mm, pgtable);
	}
	spin_unlock(&init_mm.page_table_lock);

	return 0;
}

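/*
 * pagewalk callback invoked once per PMD in the vmemmap range: bail out on
 * self-hosted vmemmap (memory_hotplug.memmap_on_memory) and split any leaf
 * PMD so that the PTE level can be walked and remapped.
 */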
static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	int ret = 0;
	struct page *head;
	struct vmemmap_remap_walk *vmemmap_walk = walk->private;

	/* Only splitting, not remapping the vmemmap pages. */
	if (!vmemmap_walk->remap_pte)
		walk->action = ACTION_CONTINUE;

	spin_lock(&init_mm.page_table_lock);
	head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
	/*
	 * Due to HugeTLB alignment requirements, and since the vmemmap pages
	 * sit at the start of the hotplugged memory region in the
	 * memory_hotplug.memmap_on_memory case, it is sufficient to check
	 * whether the vmemmap page associated with the first vmemmap page
	 * is self-hosted.
	 *
	 * [                  hotplugged memory                  ]
	 * [        section        ][...][        section        ]
	 * [ vmemmap ][              usable memory               ]
	 *   ^  | ^                        |
	 *   +--+ |                        |
	 *        +------------------------+
	 */
	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) {
		struct page *page = head ? head + pte_index(addr) :
				    pte_page(ptep_get(pte_offset_kernel(pmd, addr)));

		if (PageVmemmapSelfHosted(page))
			ret = -ENOTSUPP;
	}
	spin_unlock(&init_mm.page_table_lock);
	if (!head || ret)
		return ret;

	return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk);
}

static int vmemmap_pte_entry(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	struct vmemmap_remap_walk *vmemmap_walk = walk->private;

	/*
	 * The reuse_page is found 'first' in the page table walk, before
	 * the remapping starts.
	 */
	if (!vmemmap_walk->reuse_page)
		vmemmap_walk->reuse_page = pte_page(ptep_get(pte));
	else
		vmemmap_walk->remap_pte(pte, addr, vmemmap_walk);
	vmemmap_walk->nr_walked++;

	return 0;
}

static const struct mm_walk_ops vmemmap_remap_ops = {
	.pmd_entry	= vmemmap_pmd_entry,
	.pte_entry	= vmemmap_pte_entry,
};

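/*
 * Walk the kernel page tables covering the vmemmap range [@start, @end) with
 * @walk, then flush the TLB for the range unless the walk is split-only
 * (no @remap_pte) or the caller deferred the flush with
 * VMEMMAP_REMAP_NO_TLB_FLUSH.
 */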
static int vmemmap_remap_range(unsigned long start, unsigned long end,
			       struct vmemmap_remap_walk *walk)
{
	int ret;

	VM_BUG_ON(!PAGE_ALIGNED(start | end));

	mmap_read_lock(&init_mm);
	ret = walk_page_range_novma(&init_mm, start, end, &vmemmap_remap_ops,
				    NULL, walk);
	mmap_read_unlock(&init_mm);
	if (ret)
		return ret;

	if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH))
		flush_tlb_kernel_range(start, end);

	return 0;
}

/*
 * Free a vmemmap page. A vmemmap page can be allocated from the memblock
 * allocator or the buddy allocator. If the PG_reserved flag is set, it means
 * that it was allocated from the memblock allocator, so free it via
 * free_bootmem_page(). Otherwise, use __free_page().
 */
static inline void free_vmemmap_page(struct page *page)
{
	if (PageReserved(page))
		free_bootmem_page(page);
	else
		__free_page(page);
}

/* Free a list of the vmemmap pages */
static void free_vmemmap_page_list(struct list_head *list)
{
	struct page *page, *next;

	list_for_each_entry_safe(page, next, list, lru)
		free_vmemmap_page(page);
}

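/*
 * Remap one vmemmap PTE to @walk->reuse_page and queue the page it used to
 * map on @walk->vmemmap_pages so that the caller can free it.
 */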
static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
			      struct vmemmap_remap_walk *walk)
{
	/*
	 * Remap the tail pages as read-only to catch illegal write operations
	 * to the tail pages.
	 */
	pgprot_t pgprot = PAGE_KERNEL_RO;
	struct page *page = pte_page(ptep_get(pte));
	pte_t entry;

	/* Remapping the head page requires r/w */
	if (unlikely(addr == walk->reuse_addr)) {
		pgprot = PAGE_KERNEL;
		list_del(&walk->reuse_page->lru);

		/*
		 * Makes sure that preceding stores to the page contents from
		 * vmemmap_remap_free() become visible before the set_pte_at()
		 * write.
		 */
		smp_wmb();
	}

	entry = mk_pte(walk->reuse_page, pgprot);
	list_add(&page->lru, walk->vmemmap_pages);
	set_pte_at(&init_mm, addr, pte, entry);
}

/*
 * How many struct page structs need to be reset. When we reuse the head
 * struct page, the special metadata (e.g. page->flags or page->mapping)
 * cannot be copied to the tail struct page structs. The invalid values will
 * be caught by free_tail_page_prepare(), so to avoid its "corrupted mapping
 * in tail page" message we need to reset at least 3 struct page structs
 * (one head struct page and two tail struct pages).
 */
#define NR_RESET_STRUCT_PAGE		3

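/*
 * A restored vmemmap page is filled from the shared reuse page, so its
 * leading struct pages still carry the head/first-tail metadata of the
 * folio.  Overwrite them with the plain tail-page pattern found in the
 * struct pages that immediately follow.
 */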
static inline void reset_struct_pages(struct page *start)
{
	struct page *from = start + NR_RESET_STRUCT_PAGE;

	BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page));
	memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE);
}

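/*
 * Restore one vmemmap PTE: take a freshly allocated page off
 * @walk->vmemmap_pages, fill it from the shared reuse page, fix up its
 * leading struct pages and map it back read-write.
 */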
static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
				struct vmemmap_remap_walk *walk)
{
	pgprot_t pgprot = PAGE_KERNEL;
	struct page *page;
	void *to;

	BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);

	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
	list_del(&page->lru);
	to = page_to_virt(page);
	copy_page(to, (void *)walk->reuse_addr);
	reset_struct_pages(to);

	/*
	 * Makes sure that preceding stores to the page contents become visible
	 * before the set_pte_at() write.
	 */
	smp_wmb();
	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}

/**
 * vmemmap_remap_split - split the PMDs backing the vmemmap virtual address
 *                      range [@start, @end) into PTE-level mappings
 * @start:     start address of the vmemmap virtual address range that we want
 *             to remap.
 * @end:       end address of the vmemmap virtual address range that we want to
 *             remap.
 * @reuse:     reuse address.
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_split(unsigned long start, unsigned long end,
			       unsigned long reuse)
{
	struct vmemmap_remap_walk walk = {
		.remap_pte	= NULL,
		.flags		= VMEMMAP_SPLIT_NO_TLB_FLUSH,
	};

	/* See the comment in vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	return vmemmap_remap_range(reuse, end, &walk);
}

/**
 * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
 *			to the page which @reuse is mapped to, then free the
 *			vmemmap pages which the range was mapped to.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 * @vmemmap_pages: list to deposit vmemmap pages to be freed.  It is the
 *		caller's responsibility to free the pages.
 * @flags:	modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_free(unsigned long start, unsigned long end,
			      unsigned long reuse,
			      struct list_head *vmemmap_pages,
			      unsigned long flags)
{
	int ret;
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_remap_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= vmemmap_pages,
		.flags		= flags,
	};
	int nid = page_to_nid((struct page *)reuse);
	gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;

	/*
	 * Allocate a new head vmemmap page to avoid breaking a contiguous
	 * block of struct page memory when freeing it back to the page
	 * allocator in free_vmemmap_page_list(). This keeps the likely
	 * contiguous struct page backing memory contiguous, allowing for
	 * more allocations of hugepages. Fall back to the currently mapped
	 * head page should the allocation fail.
	 */
	walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
	if (walk.reuse_page) {
		copy_page(page_to_virt(walk.reuse_page),
			  (void *)walk.reuse_addr);
		list_add(&walk.reuse_page->lru, vmemmap_pages);
	}

	/*
	 * To make the remapping routine most efficient for huge pages, the
	 * vmemmap page table walk obeys the following rules (see
	 * vmemmap_pte_entry() for more details):
	 *
	 * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
	 *   should be continuous.
	 * - The @reuse address is part of the range [@reuse, @end) that we are
	 *   walking which is passed to vmemmap_remap_range().
	 * - The @reuse address is the first in the complete range.
	 *
	 * So we need to make sure that @start and @reuse meet the above rules.
	 */
	BUG_ON(start - reuse != PAGE_SIZE);

	ret = vmemmap_remap_range(reuse, end, &walk);
	if (ret && walk.nr_walked) {
		end = reuse + walk.nr_walked * PAGE_SIZE;
		/*
		 * vmemmap_pages contains pages from the previous
		 * vmemmap_remap_range call which failed.  These
		 * are pages which were removed from the vmemmap.
		 * They will be restored in the following call.
		 */
		walk = (struct vmemmap_remap_walk) {
			.remap_pte	= vmemmap_restore_pte,
			.reuse_addr	= reuse,
			.vmemmap_pages	= vmemmap_pages,
			.flags		= 0,
		};

		vmemmap_remap_range(reuse, end, &walk);
	}

	return ret;
}

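/*
 * Allocate one page for every vmemmap page in [@start, @end), preferring the
 * node of the HugeTLB page being restored, and queue them on @list.  On
 * failure, free whatever was already allocated and return -ENOMEM.
 */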
static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
				   struct list_head *list)
{
	gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
	int nid = page_to_nid((struct page *)start);
	struct page *page, *next;

	while (nr_pages--) {
		page = alloc_pages_node(nid, gfp_mask, 0);
		if (!page)
			goto out;
		list_add(&page->lru, list);
	}

	return 0;
out:
	list_for_each_entry_safe(page, next, list, lru)
		__free_page(page);
	return -ENOMEM;
}

/**
 * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, @end)
 *			 to freshly allocated vmemmap pages.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 * @flags:	modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
			       unsigned long reuse, unsigned long flags)
{
	LIST_HEAD(vmemmap_pages);
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_restore_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= &vmemmap_pages,
		.flags		= flags,
	};

	/* See the comment in vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
		return -ENOMEM;

	return vmemmap_remap_range(reuse, end, &walk);
}

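/*
 * Incremented for every folio whose vmemmap is optimized and decremented when
 * it is restored; while non-zero, readers of compound pages must allow for
 * fake head pages (see page_fixed_fake_head()).
 */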
DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);

static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);

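/*
 * Common helper for the restore paths: reallocate and remap the vmemmap pages
 * of @folio if it is currently optimized.  @flags may carry
 * VMEMMAP_REMAP_NO_TLB_FLUSH when the caller batches the TLB flush.
 */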
static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
					   struct folio *folio, unsigned long flags)
{
	int ret;
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
	if (!folio_test_hugetlb_vmemmap_optimized(folio))
		return 0;

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * The pages to which the vmemmap virtual address range [@vmemmap_start,
	 * @vmemmap_end) was mapped have been freed to the buddy allocator, and
	 * the range is currently mapped to the page which @vmemmap_reuse is
	 * mapped to. When a HugeTLB page is freed to the buddy allocator, the
	 * previously discarded vmemmap pages must be allocated and remapped.
	 */
	ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags);
	if (!ret) {
		folio_clear_hugetlb_vmemmap_optimized(folio);
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
	}

	return ret;
}

/**
 * hugetlb_vmemmap_restore_folio - restore previously optimized (by
 *				hugetlb_vmemmap_optimize_folio()) vmemmap pages which
 *				will be reallocated and remapped.
 * @h:		struct hstate.
 * @folio:	the folio whose vmemmap pages will be restored.
 *
 * Return: %0 if @folio's vmemmap pages have been reallocated and remapped,
 * negative error code otherwise.
 */
int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
{
	return __hugetlb_vmemmap_restore_folio(h, folio, 0);
}

/**
 * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
 * @h:			hstate.
 * @folio_list:		list of folios.
 * @non_hvo_folios:	Output list of folios for which vmemmap exists.
 *
 * Return: number of folios for which vmemmap was restored, or an error code
 *		if an error was encountered restoring vmemmap for a folio.
 *		Folios that have vmemmap are moved to the non_hvo_folios
 *		list.  Processing of entries stops when the first error is
 *		encountered. The folio that experienced the error and all
 *		non-processed folios will remain on folio_list.
 */
long hugetlb_vmemmap_restore_folios(const struct hstate *h,
					struct list_head *folio_list,
					struct list_head *non_hvo_folios)
{
	struct folio *folio, *t_folio;
	long restored = 0;
	long ret = 0;

	list_for_each_entry_safe(folio, t_folio, folio_list, lru) {
		if (folio_test_hugetlb_vmemmap_optimized(folio)) {
			ret = __hugetlb_vmemmap_restore_folio(h, folio,
							      VMEMMAP_REMAP_NO_TLB_FLUSH);
			if (ret)
				break;
			restored++;
		}

		/* Add non-optimized folios to output list */
		list_move(&folio->lru, non_hvo_folios);
	}

	if (restored)
		flush_tlb_all();
	if (!ret)
		ret = restored;
	return ret;
}

/* Return true iff the vmemmap of a HugeTLB folio can and should be optimized. */
static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio)
{
	if (folio_test_hugetlb_vmemmap_optimized(folio))
		return false;

	if (!READ_ONCE(vmemmap_optimize_enabled))
		return false;

	if (!hugetlb_vmemmap_optimizable(h))
		return false;

	return true;
}

static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
					    struct folio *folio,
					    struct list_head *vmemmap_pages,
					    unsigned long flags)
{
	int ret = 0;
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
	if (!vmemmap_should_optimize_folio(h, folio))
		return ret;

	static_branch_inc(&hugetlb_optimize_vmemmap_key);
	/*
	 * Very Subtle
	 * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed
	 * immediately after remapping.  As a result, subsequent accesses
	 * and modifications to struct pages associated with the hugetlb
	 * page could be to the OLD struct pages.  Set the vmemmap optimized
	 * flag here so that it is copied to the new head page.  This keeps
	 * the old and new struct pages in sync.
	 * If there is an error during optimization, we will immediately FLUSH
	 * the TLB and clear the flag below.
	 */
	folio_set_hugetlb_vmemmap_optimized(folio);

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
	 * to the page which @vmemmap_reuse is mapped to.  Add pages previously
	 * mapping the range to vmemmap_pages list so that they can be freed by
	 * the caller.
	 */
	ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse,
				 vmemmap_pages, flags);
	if (ret) {
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
		folio_clear_hugetlb_vmemmap_optimized(folio);
	}

	return ret;
}

/**
 * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages.
 * @h:		struct hstate.
 * @folio:	the folio whose vmemmap pages will be optimized.
 *
 * This function only tries to optimize @folio's vmemmap pages and does not
 * guarantee that the optimization will succeed after it returns. The caller
 * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's
 * vmemmap pages have been optimized.
 */
void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
{
	LIST_HEAD(vmemmap_pages);

	__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 0);
	free_vmemmap_page_list(&vmemmap_pages);
}

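/*
 * Pre-split the PMDs backing @folio's vmemmap so that the later remap pass
 * only has to touch PTEs; the TLB flush is deferred to the caller.
 */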
static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio)
{
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	if (!vmemmap_should_optimize_folio(h, folio))
		return 0;

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
	 * @vmemmap_end)
	 */
	return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
}

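/*
 * Optimize the vmemmap of every folio on @folio_list.  PMD splits and PTE
 * remaps are batched with their TLB flushes deferred: all PMDs are split
 * first and flushed once, then all folios are remapped and flushed once more
 * at the end.
 */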
void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
{
	struct folio *folio;
	LIST_HEAD(vmemmap_pages);

	list_for_each_entry(folio, folio_list, lru) {
		int ret = hugetlb_vmemmap_split_folio(h, folio);

		/*
		 * Splitting the PMD requires allocating a page, so fail early
		 * once we encounter the first OOM. There is no point in
		 * retrying, as the split can be done dynamically on remap with
		 * the memory we get back from the vmemmap deduplication.
		 */
		if (ret == -ENOMEM)
			break;
	}

	flush_tlb_all();

	list_for_each_entry(folio, folio_list, lru) {
		int ret;

		ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages,
						       VMEMMAP_REMAP_NO_TLB_FLUSH);

		/*
		 * Pages to be freed may have been accumulated.  If we
		 * encounter ENOMEM, free what we have and try again.
		 * This can occur when splitting failed halfway and head page
		 * allocation also failed; freeing the accumulated pages gives
		 * memory back so that the retried
		 * __hugetlb_vmemmap_optimize_folio() can make progress.
		 */
		if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
			flush_tlb_all();
			free_vmemmap_page_list(&vmemmap_pages);
			INIT_LIST_HEAD(&vmemmap_pages);
			__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages,
							 VMEMMAP_REMAP_NO_TLB_FLUSH);
		}
	}

	flush_tlb_all();
	free_vmemmap_page_list(&vmemmap_pages);
}

static struct ctl_table hugetlb_vmemmap_sysctls[] = {
	{
		.procname	= "hugetlb_optimize_vmemmap",
		.data		= &vmemmap_optimize_enabled,
		.maxlen		= sizeof(vmemmap_optimize_enabled),
		.mode		= 0644,
		.proc_handler	= proc_dobool,
	},
	{ }
};

static int __init hugetlb_vmemmap_init(void)
{
	const struct hstate *h;

	/* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
	BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES);

	for_each_hstate(h) {
		if (hugetlb_vmemmap_optimizable(h)) {
			register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
			break;
		}
	}
	return 0;
}
late_initcall(hugetlb_vmemmap_init);