/*
 * hugetlbpage-backed filesystem.  Based on ramfs.
 *
 * Nadia Yvette Chambers, 2002
 *
 * Copyright (C) 2002 Linus Torvalds.
 * License: GPL
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/thread_info.h>
#include <asm/current.h>
#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/capability.h>
#include <linux/ctype.h>
#include <linux/backing-dev.h>
#include <linux/hugetlb.h>
#include <linux/pagevec.h>
#include <linux/fs_parser.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/dnotify.h>
#include <linux/statfs.h>
#include <linux/security.h>
#include <linux/magic.h>
#include <linux/migrate.h>
#include <linux/uio.h>

#include <linux/uaccess.h>
#include <linux/sched/mm.h>

static const struct address_space_operations hugetlbfs_aops;
const struct file_operations hugetlbfs_file_operations;
static const struct inode_operations hugetlbfs_dir_inode_operations;
static const struct inode_operations hugetlbfs_inode_operations;

enum hugetlbfs_size_type { NO_SIZE, SIZE_STD, SIZE_PERCENT };

struct hugetlbfs_fs_context {
	struct hstate		*hstate;
	unsigned long long	max_size_opt;
	unsigned long long	min_size_opt;
	long			max_hpages;
	long			nr_inodes;
	long			min_hpages;
	enum hugetlbfs_size_type max_val_type;
	enum hugetlbfs_size_type min_val_type;
	kuid_t			uid;
	kgid_t			gid;
	umode_t			mode;
};

int sysctl_hugetlb_shm_group;

enum hugetlb_param {
	Opt_gid,
	Opt_min_size,
	Opt_mode,
	Opt_nr_inodes,
	Opt_pagesize,
	Opt_size,
	Opt_uid,
};

static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
	fsparam_u32   ("gid",		Opt_gid),
	fsparam_string("min_size",	Opt_min_size),
	fsparam_u32oct("mode",		Opt_mode),
	fsparam_string("nr_inodes",	Opt_nr_inodes),
	fsparam_string("pagesize",	Opt_pagesize),
	fsparam_string("size",		Opt_size),
	fsparam_u32   ("uid",		Opt_uid),
	{}
};
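
/*
 * Illustrative only (values are examples, not defaults): a mount using
 * these options might look like
 *
 *   mount -t hugetlbfs -o pagesize=2M,size=1G,min_size=512M,nr_inodes=64,uid=1000,gid=1000,mode=0770 none /mnt/huge
 *
 * "size" and "min_size" also accept a percentage of the huge page pool,
 * e.g. size=50% (see hugetlbfs_parse_param() and
 * hugetlbfs_size_to_hpages() below).
 */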

/*
 * Mask used when checking the page offset value passed in via system
 * calls.  This value will be converted to a loff_t which is signed.
 * Therefore, we want to check the upper PAGE_SHIFT + 1 bits of the
 * value.  The extra bit (- 1 in the shift value) is to take the sign
 * bit into account.
 */
#define PGOFF_LOFFT_MAX \
	(((1UL << (PAGE_SHIFT + 1)) - 1) <<  (BITS_PER_LONG - (PAGE_SHIFT + 1)))
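
/*
 * Worked example (assuming a 64-bit kernel with 4K base pages, i.e.
 * PAGE_SHIFT == 12 and BITS_PER_LONG == 64): PGOFF_LOFFT_MAX is
 * ((1UL << 13) - 1) << 51, i.e. the top 13 bits of the value.  Any
 * vm_pgoff with one of those bits set would spill into (or past) the
 * sign bit of a loff_t once shifted left by PAGE_SHIFT.
 */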

static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct inode *inode = file_inode(file);
	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
	loff_t len, vma_len;
	int ret;
	struct hstate *h = hstate_file(file);
	vm_flags_t vm_flags;

	/*
	 * vma address alignment (but not the pgoff alignment) has
	 * already been checked by prepare_hugepage_range.  If you add
	 * any error returns here, do so after setting VM_HUGETLB, so
	 * is_vm_hugetlb_page tests below unmap_region go the right
	 * way when do_mmap unwinds (may be important on powerpc
	 * and ia64).
	 */
	vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND);
	vma->vm_ops = &hugetlb_vm_ops;

	ret = seal_check_write(info->seals, vma);
	if (ret)
		return ret;

	/*
	 * page based offset in vm_pgoff could be sufficiently large to
	 * overflow a loff_t when converted to byte offset.  This can
	 * only happen on architectures where sizeof(loff_t) ==
	 * sizeof(unsigned long).  So, only check in those instances.
	 */
	if (sizeof(unsigned long) == sizeof(loff_t)) {
		if (vma->vm_pgoff & PGOFF_LOFFT_MAX)
			return -EINVAL;
	}

	/* must be huge page aligned */
	if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
		return -EINVAL;

	vma_len = (loff_t)(vma->vm_end - vma->vm_start);
	len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	/* check for overflow */
	if (len < vma_len)
		return -EINVAL;

	inode_lock(inode);
	file_accessed(file);

	ret = -ENOMEM;

	vm_flags = vma->vm_flags;
	/*
	 * For SHM_HUGETLB, the pages are reserved in the shmget() call, so
	 * skip reserving here.  Note: the inode flag S_PRIVATE is set only
	 * for SHM hugetlbfs files.
	 */
	if (inode->i_flags & S_PRIVATE)
		vm_flags |= VM_NORESERVE;

	if (!hugetlb_reserve_pages(inode,
				vma->vm_pgoff >> huge_page_order(h),
				len >> huge_page_shift(h), vma,
				vm_flags))
		goto out;

	ret = 0;
	if (vma->vm_flags & VM_WRITE && inode->i_size < len)
		i_size_write(inode, len);
out:
	inode_unlock(inode);

	return ret;
}
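
/*
 * Minimal userspace sketch (illustrative; the path and sizes are
 * assumptions) of mapping one 2MB huge page from a file on a hugetlbfs
 * mount:
 *
 *	int fd = open("/mnt/huge/buf", O_CREAT | O_RDWR, 0600);
 *	ftruncate(fd, 2 * 1024 * 1024);
 *	void *p = mmap(NULL, 2 * 1024 * 1024, PROT_READ | PROT_WRITE,
 *		       MAP_SHARED, fd, 0);
 *
 * The length and file offset must be multiples of the huge page size,
 * or the checks above (and in prepare_hugepage_range()) fail the mmap.
 */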

/*
 * Called under mmap_write_lock(mm).
 */

static unsigned long
hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	struct hstate *h = hstate_file(file);
	struct vm_unmapped_area_info info;

	info.flags = 0;
	info.length = len;
	info.low_limit = current->mm->mmap_base;
	info.high_limit = arch_get_mmap_end(addr, len, flags);
	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
	info.align_offset = 0;
	return vm_unmapped_area(&info);
}

static unsigned long
hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	struct hstate *h = hstate_file(file);
	struct vm_unmapped_area_info info;

	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
	info.length = len;
	info.low_limit = PAGE_SIZE;
	info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
	info.align_offset = 0;
	addr = vm_unmapped_area(&info);

	/*
	 * A failed mmap() very likely causes application failure,
	 * so fall back to the bottom-up function here. This scenario
	 * can happen with large stack limits and large mmap()
	 * allocations.
	 */
	if (unlikely(offset_in_page(addr))) {
		VM_BUG_ON(addr != -ENOMEM);
		info.flags = 0;
		info.low_limit = current->mm->mmap_base;
		info.high_limit = arch_get_mmap_end(addr, len, flags);
		addr = vm_unmapped_area(&info);
	}

	return addr;
}

unsigned long
generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
				  unsigned long len, unsigned long pgoff,
				  unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	struct hstate *h = hstate_file(file);
	const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);

	if (len & ~huge_page_mask(h))
		return -EINVAL;
	if (len > TASK_SIZE)
		return -ENOMEM;

	if (flags & MAP_FIXED) {
		if (prepare_hugepage_range(file, addr, len))
			return -EINVAL;
		return addr;
	}

	if (addr) {
		addr = ALIGN(addr, huge_page_size(h));
		vma = find_vma(mm, addr);
		if (mmap_end - len >= addr &&
		    (!vma || addr + len <= vm_start_gap(vma)))
			return addr;
	}

	/*
	 * Use mm->get_unmapped_area value as a hint to use topdown routine.
	 * If architectures have special needs, they should define their own
	 * version of hugetlb_get_unmapped_area.
	 */
	if (mm->get_unmapped_area == arch_get_unmapped_area_topdown)
		return hugetlb_get_unmapped_area_topdown(file, addr, len,
				pgoff, flags);
	return hugetlb_get_unmapped_area_bottomup(file, addr, len,
			pgoff, flags);
}

#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
static unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
			  unsigned long len, unsigned long pgoff,
			  unsigned long flags)
{
	return generic_hugetlb_get_unmapped_area(file, addr, len, pgoff, flags);
}
#endif

/*
 * Someone wants to read @bytes from a HWPOISON hugetlb @page from @offset.
 * Returns the maximum number of bytes one can read without touching the 1st raw
 * HWPOISON subpage.
 *
 * The implementation borrows the iteration logic from copy_page_to_iter*.
 */
static size_t adjust_range_hwpoison(struct page *page, size_t offset, size_t bytes)
{
	size_t n = 0;
	size_t res = 0;

	/* First subpage to start the loop. */
	page = nth_page(page, offset / PAGE_SIZE);
	offset %= PAGE_SIZE;
	while (1) {
		if (is_raw_hwpoison_page_in_hugepage(page))
			break;

		/* Safe to read n bytes without touching HWPOISON subpage. */
		n = min(bytes, (size_t)PAGE_SIZE - offset);
		res += n;
		bytes -= n;
		if (!bytes || !n)
			break;
		offset += n;
		if (offset == PAGE_SIZE) {
			page = nth_page(page, 1);
			offset = 0;
		}
	}

	return res;
}
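
/*
 * Example (illustrative): with 4K subpages, offset = 5000 and
 * bytes = 16384, the loop starts at subpage 1 (offset 904 within it).
 * If subpage 2 is the first raw HWPOISON page, the function returns
 * 4096 - 904 = 3192 readable bytes.
 */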

/*
 * Support for read() - Find the page attached to f_mapping and copy out the
 * data. This provides functionality similar to filemap_read().
 */
static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct hstate *h = hstate_file(file);
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	unsigned long index = iocb->ki_pos >> huge_page_shift(h);
	unsigned long offset = iocb->ki_pos & ~huge_page_mask(h);
	unsigned long end_index;
	loff_t isize;
	ssize_t retval = 0;

	while (iov_iter_count(to)) {
		struct folio *folio;
		size_t nr, copied, want;

		/* nr is the maximum number of bytes to copy from this page */
		nr = huge_page_size(h);
		isize = i_size_read(inode);
		if (!isize)
			break;
		end_index = (isize - 1) >> huge_page_shift(h);
		if (index > end_index)
			break;
		if (index == end_index) {
			nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
			if (nr <= offset)
				break;
		}
		nr = nr - offset;

		/* Find the folio */
		folio = filemap_lock_hugetlb_folio(h, mapping, index);
		if (IS_ERR(folio)) {
			/*
			 * We have a HOLE, zero out the user-buffer for the
			 * length of the hole or request.
			 */
			copied = iov_iter_zero(nr, to);
		} else {
			folio_unlock(folio);

			if (!folio_test_hwpoison(folio))
				want = nr;
			else {
				/*
				 * Adjust how many bytes are safe to read
				 * without touching the first raw HWPOISON
				 * subpage after offset.
				 */
				want = adjust_range_hwpoison(&folio->page, offset, nr);
				if (want == 0) {
					folio_put(folio);
					retval = -EIO;
					break;
				}
			}

			/*
			 * We have the folio, copy it to user space buffer.
			 */
			copied = copy_folio_to_iter(folio, offset, want, to);
			folio_put(folio);
		}
		offset += copied;
		retval += copied;
		if (copied != nr && iov_iter_count(to)) {
			if (!retval)
				retval = -EFAULT;
			break;
		}
		index += offset >> huge_page_shift(h);
		offset &= ~huge_page_mask(h);
	}
	iocb->ki_pos = ((loff_t)index << huge_page_shift(h)) + offset;
	return retval;
}

static int hugetlbfs_write_begin(struct file *file,
			struct address_space *mapping,
			loff_t pos, unsigned len,
			struct page **pagep, void **fsdata)
{
	return -EINVAL;
}

static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	BUG();
	return -EINVAL;
}

static void hugetlb_delete_from_page_cache(struct folio *folio)
{
	folio_clear_dirty(folio);
	folio_clear_uptodate(folio);
	filemap_remove_folio(folio);
}

/*
 * Called with i_mmap_rwsem held for inode-based vma maps.  This makes
 * sure the vma (and vm_mm) will not go away.  We also hold the hugetlb
 * fault mutex for the page in the mapping, so we cannot race with the
 * page being faulted into the vma.
 */
static bool hugetlb_vma_maps_page(struct vm_area_struct *vma,
				unsigned long addr, struct page *page)
{
	pte_t *ptep, pte;

	ptep = hugetlb_walk(vma, addr, huge_page_size(hstate_vma(vma)));
	if (!ptep)
		return false;

	pte = huge_ptep_get(ptep);
	if (huge_pte_none(pte) || !pte_present(pte))
		return false;

	if (pte_page(pte) == page)
		return true;

	return false;
}

/*
 * Can vma_offset_start/vma_offset_end overflow on 32-bit arches?
 * No, because the interval tree returns us only those vmas
 * which overlap the truncated area starting at pgoff,
 * and no vma on a 32-bit arch can span beyond 4GB.
 */
static unsigned long vma_offset_start(struct vm_area_struct *vma, pgoff_t start)
{
	unsigned long offset = 0;

	if (vma->vm_pgoff < start)
		offset = (start - vma->vm_pgoff) << PAGE_SHIFT;

	return vma->vm_start + offset;
}

static unsigned long vma_offset_end(struct vm_area_struct *vma, pgoff_t end)
{
	unsigned long t_end;

	if (!end)
		return vma->vm_end;

	t_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) + vma->vm_start;
	if (t_end > vma->vm_end)
		t_end = vma->vm_end;
	return t_end;
}
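
/*
 * Example (illustrative, 4K base pages): for a vma with vm_pgoff = 512
 * covering [vm_start, vm_start + 4MB) and a truncation starting at
 * pgoff start = 1024, vma_offset_start() returns vm_start + 2MB; with
 * end = 0 (truncate to EOF) vma_offset_end() returns vm_end.
 */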

/*
 * Called with hugetlb fault mutex held.  Therefore, no more mappings to
 * this folio can be created while executing the routine.
 */
static void hugetlb_unmap_file_folio(struct hstate *h,
					struct address_space *mapping,
					struct folio *folio, pgoff_t index)
{
	struct rb_root_cached *root = &mapping->i_mmap;
	struct hugetlb_vma_lock *vma_lock;
	struct page *page = &folio->page;
	struct vm_area_struct *vma;
	unsigned long v_start;
	unsigned long v_end;
	pgoff_t start, end;

	start = index * pages_per_huge_page(h);
	end = (index + 1) * pages_per_huge_page(h);

	i_mmap_lock_write(mapping);
retry:
	vma_lock = NULL;
	vma_interval_tree_foreach(vma, root, start, end - 1) {
		v_start = vma_offset_start(vma, start);
		v_end = vma_offset_end(vma, end);

		if (!hugetlb_vma_maps_page(vma, v_start, page))
			continue;

		if (!hugetlb_vma_trylock_write(vma)) {
			vma_lock = vma->vm_private_data;
			/*
			 * If we cannot get the vma lock, we need to drop
			 * i_mmap_rwsem and take locks in order.  First,
			 * take a ref on the vma_lock structure so that
			 * we can be guaranteed it will not go away when
			 * dropping i_mmap_rwsem.
			 */
			kref_get(&vma_lock->refs);
			break;
		}

		unmap_hugepage_range(vma, v_start, v_end, NULL,
				     ZAP_FLAG_DROP_MARKER);
		hugetlb_vma_unlock_write(vma);
	}

	i_mmap_unlock_write(mapping);

	if (vma_lock) {
		/*
		 * Wait on vma_lock.  We know it is still valid as we have
		 * a reference.  We must 'open code' vma locking as we do
		 * not know if vma_lock is still attached to vma.
		 */
		down_write(&vma_lock->rw_sema);
		i_mmap_lock_write(mapping);

		vma = vma_lock->vma;
		if (!vma) {
			/*
			 * If lock is no longer attached to vma, then just
			 * unlock, drop our reference and retry looking for
			 * other vmas.
			 */
			up_write(&vma_lock->rw_sema);
			kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
			goto retry;
		}

		/*
		 * vma_lock is still attached to vma.  Check to see if vma
		 * still maps page and if so, unmap.
		 */
		v_start = vma_offset_start(vma, start);
		v_end = vma_offset_end(vma, end);
		if (hugetlb_vma_maps_page(vma, v_start, page))
			unmap_hugepage_range(vma, v_start, v_end, NULL,
					     ZAP_FLAG_DROP_MARKER);

		kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
		hugetlb_vma_unlock_write(vma);

		goto retry;
	}
}

static void
hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
		      zap_flags_t zap_flags)
{
	struct vm_area_struct *vma;

	/*
	 * end == 0 indicates that the entire range after start should be
	 * unmapped.  Note, end is exclusive, whereas the interval tree takes
	 * an inclusive "last".
	 */
	vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) {
		unsigned long v_start;
		unsigned long v_end;

		if (!hugetlb_vma_trylock_write(vma))
			continue;

		v_start = vma_offset_start(vma, start);
		v_end = vma_offset_end(vma, end);

		unmap_hugepage_range(vma, v_start, v_end, NULL, zap_flags);

		/*
		 * Note that the vma lock only exists for shared/non-private
		 * vmas.  Therefore, the lock is not held when calling
		 * unmap_hugepage_range for private vmas.
		 */
		hugetlb_vma_unlock_write(vma);
	}
}

/*
 * Called with hugetlb fault mutex held.
 * Returns true if page was actually removed, false otherwise.
 */
static bool remove_inode_single_folio(struct hstate *h, struct inode *inode,
					struct address_space *mapping,
					struct folio *folio, pgoff_t index,
					bool truncate_op)
{
	bool ret = false;

	/*
	 * If the folio is mapped, it was faulted in after being
	 * unmapped in the caller.  Unmap (again) while holding
	 * the fault mutex.  The mutex will prevent faults
	 * until we finish removing the folio.
	 */
	if (unlikely(folio_mapped(folio)))
		hugetlb_unmap_file_folio(h, mapping, folio, index);

	folio_lock(folio);
	/*
	 * We must remove the folio from the page cache before removing
	 * the region/reserve map (hugetlb_unreserve_pages).  In
	 * rare out-of-memory conditions, removal of the region/reserve
	 * map could fail.  Correspondingly, the subpool and global
	 * reserve usage counts may need to be adjusted.
	 */
	VM_BUG_ON_FOLIO(folio_test_hugetlb_restore_reserve(folio), folio);
	hugetlb_delete_from_page_cache(folio);
	ret = true;
	if (!truncate_op) {
		if (unlikely(hugetlb_unreserve_pages(inode, index,
							index + 1, 1)))
			hugetlb_fix_reserve_counts(inode);
	}

	folio_unlock(folio);
	return ret;
}

/*
 * remove_inode_hugepages handles two distinct cases: truncation and hole
 * punch.  There are subtle differences in operation for each case.
 *
 * truncation is indicated by end of range being LLONG_MAX
 *	In this case, we first scan the range and release found pages.
 *	After releasing pages, hugetlb_unreserve_pages cleans up region/reserve
 *	maps and global counts.  Page faults can race with truncation.
 *	During faults, hugetlb_no_page() checks i_size before page allocation,
 *	and again after obtaining the page table lock.  It will 'back out'
 *	allocations in the truncated range.
 * hole punch is indicated if end is not LLONG_MAX
 *	In the hole punch case we scan the range and release found pages.
 *	Only when releasing a page is the associated region/reserve map
 *	deleted.  The region/reserve maps for ranges without associated
 *	pages are not modified.  Page faults can race with hole punch.
 *	This is indicated if we find a mapped page.
 * Note: If the passed end of range value is beyond the end of file, but
 * not LLONG_MAX, this routine still performs a hole punch operation.
 */
static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
				   loff_t lend)
{
	struct hstate *h = hstate_inode(inode);
	struct address_space *mapping = &inode->i_data;
	const pgoff_t end = lend >> PAGE_SHIFT;
	struct folio_batch fbatch;
	pgoff_t next, index;
	int i, freed = 0;
	bool truncate_op = (lend == LLONG_MAX);

	folio_batch_init(&fbatch);
	next = lstart >> PAGE_SHIFT;
	while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
		for (i = 0; i < folio_batch_count(&fbatch); ++i) {
			struct folio *folio = fbatch.folios[i];
			u32 hash = 0;

			index = folio->index >> huge_page_order(h);
			hash = hugetlb_fault_mutex_hash(mapping, index);
			mutex_lock(&hugetlb_fault_mutex_table[hash]);

			/*
			 * Remove folio that was part of folio_batch.
			 */
			if (remove_inode_single_folio(h, inode, mapping, folio,
							index, truncate_op))
				freed++;

			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
		}
		folio_batch_release(&fbatch);
		cond_resched();
	}

	if (truncate_op)
		(void)hugetlb_unreserve_pages(inode,
				lstart >> huge_page_shift(h),
				LONG_MAX, freed);
}

static void hugetlbfs_evict_inode(struct inode *inode)
{
	struct resv_map *resv_map;

	remove_inode_hugepages(inode, 0, LLONG_MAX);

	/*
	 * Get the resv_map from the address space embedded in the inode.
	 * This is the address space which points to any resv_map allocated
	 * at inode creation time.  If this is a device special inode,
	 * i_mapping may not point to the original address space.
	 */
	resv_map = (struct resv_map *)(&inode->i_data)->i_private_data;
	/* Only regular and link inodes have associated reserve maps */
	if (resv_map)
		resv_map_release(&resv_map->refs);
	clear_inode(inode);
}

static void hugetlb_vmtruncate(struct inode *inode, loff_t offset)
{
	pgoff_t pgoff;
	struct address_space *mapping = inode->i_mapping;
	struct hstate *h = hstate_inode(inode);

	BUG_ON(offset & ~huge_page_mask(h));
	pgoff = offset >> PAGE_SHIFT;

	i_size_write(inode, offset);
	i_mmap_lock_write(mapping);
	if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
		hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0,
				      ZAP_FLAG_DROP_MARKER);
	i_mmap_unlock_write(mapping);
	remove_inode_hugepages(inode, offset, LLONG_MAX);
}

static void hugetlbfs_zero_partial_page(struct hstate *h,
					struct address_space *mapping,
					loff_t start,
					loff_t end)
{
	pgoff_t idx = start >> huge_page_shift(h);
	struct folio *folio;

	folio = filemap_lock_hugetlb_folio(h, mapping, idx);
	if (IS_ERR(folio))
		return;

	start = start & ~huge_page_mask(h);
	end = end & ~huge_page_mask(h);
	if (!end)
		end = huge_page_size(h);

	folio_zero_segment(folio, (size_t)start, (size_t)end);

	folio_unlock(folio);
	folio_put(folio);
}

static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
	struct address_space *mapping = inode->i_mapping;
	struct hstate *h = hstate_inode(inode);
	loff_t hpage_size = huge_page_size(h);
	loff_t hole_start, hole_end;

	/*
	 * hole_start and hole_end indicate the full pages within the hole.
	 */
	hole_start = round_up(offset, hpage_size);
	hole_end = round_down(offset + len, hpage_size);
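
	/*
	 * Example (illustrative, 2MB huge pages): offset = 1MB, len = 4MB
	 * gives hole_start = 2MB and hole_end = 4MB.  The partial ranges
	 * [1MB, 2MB) and [4MB, 5MB) are zeroed in place below, while the
	 * single full page covering [2MB, 4MB) is unmapped and removed.
	 */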

	inode_lock(inode);

	/* protected by i_rwsem */
	if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
		inode_unlock(inode);
		return -EPERM;
	}

	i_mmap_lock_write(mapping);

	/* If range starts before first full page, zero partial page. */
	if (offset < hole_start)
		hugetlbfs_zero_partial_page(h, mapping,
				offset, min(offset + len, hole_start));

	/* Unmap users of full pages in the hole. */
	if (hole_end > hole_start) {
		if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
			hugetlb_vmdelete_list(&mapping->i_mmap,
					      hole_start >> PAGE_SHIFT,
					      hole_end >> PAGE_SHIFT, 0);
	}

	/* If range extends beyond last full page, zero partial page. */
	if ((offset + len) > hole_end && (offset + len) > hole_start)
		hugetlbfs_zero_partial_page(h, mapping,
				hole_end, offset + len);

	i_mmap_unlock_write(mapping);

	/* Remove full pages from the file. */
	if (hole_end > hole_start)
		remove_inode_hugepages(inode, hole_start, hole_end);

	inode_unlock(inode);

	return 0;
}

static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
				loff_t len)
{
	struct inode *inode = file_inode(file);
	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
	struct address_space *mapping = inode->i_mapping;
	struct hstate *h = hstate_inode(inode);
	struct vm_area_struct pseudo_vma;
	struct mm_struct *mm = current->mm;
	loff_t hpage_size = huge_page_size(h);
	unsigned long hpage_shift = huge_page_shift(h);
	pgoff_t start, index, end;
	int error;
	u32 hash;

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	if (mode & FALLOC_FL_PUNCH_HOLE)
		return hugetlbfs_punch_hole(inode, offset, len);

	/*
	 * Default preallocate case.
	 * For this range, start is rounded down and end is rounded up
	 * as well as being converted to page offsets.
	 */
	start = offset >> hpage_shift;
	end = (offset + len + hpage_size - 1) >> hpage_shift;
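
	/*
	 * Example (illustrative, 2MB huge pages): offset = 3MB, len = 3MB
	 * yields start = 1 and end = 3, i.e. huge page indices 1 and 2
	 * (file range [2MB, 6MB)) are preallocated.
	 */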

	inode_lock(inode);

	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
	error = inode_newsize_ok(inode, offset + len);
	if (error)
		goto out;

	if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
		error = -EPERM;
		goto out;
	}

	/*
	 * Initialize a pseudo vma as this is required by the huge page
	 * allocation routines.
	 */
	vma_init(&pseudo_vma, mm);
	vm_flags_init(&pseudo_vma, VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
	pseudo_vma.vm_file = file;

	for (index = start; index < end; index++) {
		/*
		 * This is supposed to be the vaddr where the page is being
		 * faulted in, but we have no vaddr here.
		 */
		struct folio *folio;
		unsigned long addr;

		cond_resched();

		/*
		 * fallocate(2) manpage permits EINTR; we may have been
		 * interrupted because we are using up too much memory.
		 */
		if (signal_pending(current)) {
			error = -EINTR;
			break;
		}

		/* addr is the offset within the file (zero based) */
		addr = index * hpage_size;

		/* mutex taken here, fault path and hole punch */
		hash = hugetlb_fault_mutex_hash(mapping, index);
		mutex_lock(&hugetlb_fault_mutex_table[hash]);

		/* See if already present in mapping to avoid alloc/free */
		folio = filemap_get_folio(mapping, index << huge_page_order(h));
		if (!IS_ERR(folio)) {
			folio_put(folio);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			continue;
		}

		/*
		 * Allocate folio without setting the avoid_reserve argument.
		 * There certainly are no reserves associated with the
		 * pseudo_vma.  However, there could be shared mappings with
		 * reserves for the file at the inode level.  If we fallocate
		 * folios in these areas, we need to consume the reserves
		 * to keep reservation accounting consistent.
		 */
		folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0);
		if (IS_ERR(folio)) {
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			error = PTR_ERR(folio);
			goto out;
		}
		clear_huge_page(&folio->page, addr, pages_per_huge_page(h));
		__folio_mark_uptodate(folio);
		error = hugetlb_add_to_page_cache(folio, mapping, index);
		if (unlikely(error)) {
			restore_reserve_on_error(h, &pseudo_vma, addr, folio);
			folio_put(folio);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out;
		}

		mutex_unlock(&hugetlb_fault_mutex_table[hash]);

		folio_set_hugetlb_migratable(folio);
		/*
		 * folio_unlock because the folio was locked by
		 * hugetlb_add_to_page_cache(); folio_put to drop the
		 * reference taken by alloc_hugetlb_folio().
		 */
		folio_unlock(folio);
		folio_put(folio);
	}

	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
		i_size_write(inode, offset + len);
	inode_set_ctime_current(inode);
out:
	inode_unlock(inode);
	return error;
}

static int hugetlbfs_setattr(struct mnt_idmap *idmap,
			     struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = d_inode(dentry);
	struct hstate *h = hstate_inode(inode);
	int error;
	unsigned int ia_valid = attr->ia_valid;
	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);

	error = setattr_prepare(idmap, dentry, attr);
	if (error)
		return error;

	if (ia_valid & ATTR_SIZE) {
		loff_t oldsize = inode->i_size;
		loff_t newsize = attr->ia_size;

		if (newsize & ~huge_page_mask(h))
			return -EINVAL;
		/* protected by i_rwsem */
		if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
			return -EPERM;
		hugetlb_vmtruncate(inode, newsize);
	}

	setattr_copy(idmap, inode, attr);
	mark_inode_dirty(inode);
	return 0;
}

static struct inode *hugetlbfs_get_root(struct super_block *sb,
					struct hugetlbfs_fs_context *ctx)
{
	struct inode *inode;

	inode = new_inode(sb);
	if (inode) {
		inode->i_ino = get_next_ino();
		inode->i_mode = S_IFDIR | ctx->mode;
		inode->i_uid = ctx->uid;
		inode->i_gid = ctx->gid;
		simple_inode_init_ts(inode);
		inode->i_op = &hugetlbfs_dir_inode_operations;
		inode->i_fop = &simple_dir_operations;
		/* directory inodes start off with i_nlink == 2 (for "." entry) */
		inc_nlink(inode);
		lockdep_annotate_inode_mutex_key(inode);
	}
	return inode;
}

/*
 * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
 * be taken from reclaim -- unlike regular filesystems. This needs an
 * annotation because huge_pmd_share() does an allocation under hugetlb's
 * i_mmap_rwsem.
 */
static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;

static struct inode *hugetlbfs_get_inode(struct super_block *sb,
					struct mnt_idmap *idmap,
					struct inode *dir,
					umode_t mode, dev_t dev)
{
	struct inode *inode;
	struct resv_map *resv_map = NULL;

	/*
	 * Reserve maps are only needed for inodes that can have associated
	 * page allocations.
	 */
	if (S_ISREG(mode) || S_ISLNK(mode)) {
		resv_map = resv_map_alloc();
		if (!resv_map)
			return NULL;
	}

	inode = new_inode(sb);
	if (inode) {
		struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);

		inode->i_ino = get_next_ino();
		inode_init_owner(idmap, inode, dir, mode);
		lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
				&hugetlbfs_i_mmap_rwsem_key);
		inode->i_mapping->a_ops = &hugetlbfs_aops;
		simple_inode_init_ts(inode);
		inode->i_mapping->i_private_data = resv_map;
		info->seals = F_SEAL_SEAL;
		switch (mode & S_IFMT) {
		default:
			init_special_inode(inode, mode, dev);
			break;
		case S_IFREG:
			inode->i_op = &hugetlbfs_inode_operations;
			inode->i_fop = &hugetlbfs_file_operations;
			break;
		case S_IFDIR:
			inode->i_op = &hugetlbfs_dir_inode_operations;
			inode->i_fop = &simple_dir_operations;

			/* directory inodes start off with i_nlink == 2 (for "." entry) */
			inc_nlink(inode);
			break;
		case S_IFLNK:
			inode->i_op = &page_symlink_inode_operations;
			inode_nohighmem(inode);
			break;
		}
		lockdep_annotate_inode_mutex_key(inode);
	} else {
		if (resv_map)
			kref_put(&resv_map->refs, resv_map_release);
	}

	return inode;
}

/*
 * File creation.  Allocate an inode, and we're done.
 */
static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
			   struct dentry *dentry, umode_t mode, dev_t dev)
{
	struct inode *inode;

	inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode, dev);
	if (!inode)
		return -ENOSPC;
	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
	d_instantiate(dentry, inode);
	dget(dentry);	/* Extra count - pin the dentry in core */
	return 0;
}

static int hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
			   struct dentry *dentry, umode_t mode)
{
	int retval = hugetlbfs_mknod(idmap, dir, dentry,
				     mode | S_IFDIR, 0);
	if (!retval)
		inc_nlink(dir);
	return retval;
}

static int hugetlbfs_create(struct mnt_idmap *idmap,
			    struct inode *dir, struct dentry *dentry,
			    umode_t mode, bool excl)
{
	return hugetlbfs_mknod(idmap, dir, dentry, mode | S_IFREG, 0);
}

static int hugetlbfs_tmpfile(struct mnt_idmap *idmap,
			     struct inode *dir, struct file *file,
			     umode_t mode)
{
	struct inode *inode;

	inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode | S_IFREG, 0);
	if (!inode)
		return -ENOSPC;
	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
	d_tmpfile(file, inode);
	return finish_open_simple(file, 0);
}

static int hugetlbfs_symlink(struct mnt_idmap *idmap,
			     struct inode *dir, struct dentry *dentry,
			     const char *symname)
{
	const umode_t mode = S_IFLNK|S_IRWXUGO;
	struct inode *inode;
	int error = -ENOSPC;

	inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode, 0);
	if (inode) {
		int l = strlen(symname)+1;
		error = page_symlink(inode, symname, l);
		if (!error) {
			d_instantiate(dentry, inode);
			dget(dentry);
		} else
			iput(inode);
	}
	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));

	return error;
}

#ifdef CONFIG_MIGRATION
static int hugetlbfs_migrate_folio(struct address_space *mapping,
				struct folio *dst, struct folio *src,
				enum migrate_mode mode)
{
	int rc;

	rc = migrate_huge_page_move_mapping(mapping, dst, src);
	if (rc != MIGRATEPAGE_SUCCESS)
		return rc;

	if (hugetlb_folio_subpool(src)) {
		hugetlb_set_folio_subpool(dst,
					hugetlb_folio_subpool(src));
		hugetlb_set_folio_subpool(src, NULL);
	}

	if (mode != MIGRATE_SYNC_NO_COPY)
		folio_migrate_copy(dst, src);
	else
		folio_migrate_flags(dst, src);

	return MIGRATEPAGE_SUCCESS;
}
#else
#define hugetlbfs_migrate_folio NULL
#endif

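/*
 * Note (descriptive, not in the original): this is a deliberate no-op.
 * The HWPOISON folio is left in the page cache, and the read path above
 * (see adjust_range_hwpoison()) can still copy out the subpages that are
 * not poisoned.
 */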
static int hugetlbfs_error_remove_folio(struct address_space *mapping,
				struct folio *folio)
{
	return 0;
}

/*
 * Display the mount options in /proc/mounts.
 */
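/*
 * Example (illustrative only) of a resulting /proc/mounts entry for a
 * 2MB hstate with explicit uid/gid and a 1G size limit:
 *
 *   none /mnt/huge hugetlbfs rw,relatime,uid=1000,gid=1000,pagesize=2M,size=1073741824 0 0
 */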
static int hugetlbfs_show_options(struct seq_file *m, struct dentry *root)
{
	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(root->d_sb);
	struct hugepage_subpool *spool = sbinfo->spool;
	unsigned long hpage_size = huge_page_size(sbinfo->hstate);
	unsigned hpage_shift = huge_page_shift(sbinfo->hstate);
	char mod;

	if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
		seq_printf(m, ",uid=%u",
			   from_kuid_munged(&init_user_ns, sbinfo->uid));
	if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
		seq_printf(m, ",gid=%u",
			   from_kgid_munged(&init_user_ns, sbinfo->gid));
	if (sbinfo->mode != 0755)
		seq_printf(m, ",mode=%o", sbinfo->mode);
	if (sbinfo->max_inodes != -1)
		seq_printf(m, ",nr_inodes=%lu", sbinfo->max_inodes);

	hpage_size /= 1024;
	mod = 'K';
	if (hpage_size >= 1024) {
		hpage_size /= 1024;
		mod = 'M';
	}
	seq_printf(m, ",pagesize=%lu%c", hpage_size, mod);
	if (spool) {
		if (spool->max_hpages != -1)
			seq_printf(m, ",size=%llu",
				   (unsigned long long)spool->max_hpages << hpage_shift);
		if (spool->min_hpages != -1)
			seq_printf(m, ",min_size=%llu",
				   (unsigned long long)spool->min_hpages << hpage_shift);
	}
	return 0;
}

static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
	struct hstate *h = hstate_inode(d_inode(dentry));
	u64 id = huge_encode_dev(dentry->d_sb->s_dev);

	buf->f_fsid = u64_to_fsid(id);
	buf->f_type = HUGETLBFS_MAGIC;
	buf->f_bsize = huge_page_size(h);
	if (sbinfo) {
		spin_lock(&sbinfo->stat_lock);
		/* If no limits set, just report 0 or -1 for max/free/used
		 * blocks, like simple_statfs() */
		if (sbinfo->spool) {
			long free_pages;

			spin_lock_irq(&sbinfo->spool->lock);
			buf->f_blocks = sbinfo->spool->max_hpages;
			free_pages = sbinfo->spool->max_hpages
				- sbinfo->spool->used_hpages;
			buf->f_bavail = buf->f_bfree = free_pages;
			spin_unlock_irq(&sbinfo->spool->lock);
			buf->f_files = sbinfo->max_inodes;
			buf->f_ffree = sbinfo->free_inodes;
		}
		spin_unlock(&sbinfo->stat_lock);
	}
	buf->f_namelen = NAME_MAX;
	return 0;
}

static void hugetlbfs_put_super(struct super_block *sb)
{
	struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb);

	if (sbi) {
		sb->s_fs_info = NULL;

		if (sbi->spool)
			hugepage_put_subpool(sbi->spool);

		kfree(sbi);
	}
}

static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo)
{
	if (sbinfo->free_inodes >= 0) {
		spin_lock(&sbinfo->stat_lock);
		if (unlikely(!sbinfo->free_inodes)) {
			spin_unlock(&sbinfo->stat_lock);
			return 0;
		}
		sbinfo->free_inodes--;
		spin_unlock(&sbinfo->stat_lock);
	}

	return 1;
}

static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo)
{
	if (sbinfo->free_inodes >= 0) {
		spin_lock(&sbinfo->stat_lock);
		sbinfo->free_inodes++;
		spin_unlock(&sbinfo->stat_lock);
	}
}


static struct kmem_cache *hugetlbfs_inode_cachep;

static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
{
	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
	struct hugetlbfs_inode_info *p;

	if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
		return NULL;
	p = alloc_inode_sb(sb, hugetlbfs_inode_cachep, GFP_KERNEL);
	if (unlikely(!p)) {
		hugetlbfs_inc_free_inodes(sbinfo);
		return NULL;
	}
	return &p->vfs_inode;
}

static void hugetlbfs_free_inode(struct inode *inode)
{
	kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
}

static void hugetlbfs_destroy_inode(struct inode *inode)
{
	hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
}

static const struct address_space_operations hugetlbfs_aops = {
	.write_begin	= hugetlbfs_write_begin,
	.write_end	= hugetlbfs_write_end,
	.dirty_folio	= noop_dirty_folio,
	.migrate_folio  = hugetlbfs_migrate_folio,
	.error_remove_folio	= hugetlbfs_error_remove_folio,
};


static void init_once(void *foo)
{
	struct hugetlbfs_inode_info *ei = foo;

	inode_init_once(&ei->vfs_inode);
}

const struct file_operations hugetlbfs_file_operations = {
	.read_iter		= hugetlbfs_read_iter,
	.mmap			= hugetlbfs_file_mmap,
	.fsync			= noop_fsync,
	.get_unmapped_area	= hugetlb_get_unmapped_area,
	.llseek			= default_llseek,
	.fallocate		= hugetlbfs_fallocate,
};

static const struct inode_operations hugetlbfs_dir_inode_operations = {
	.create		= hugetlbfs_create,
	.lookup		= simple_lookup,
	.link		= simple_link,
	.unlink		= simple_unlink,
	.symlink	= hugetlbfs_symlink,
	.mkdir		= hugetlbfs_mkdir,
	.rmdir		= simple_rmdir,
	.mknod		= hugetlbfs_mknod,
	.rename		= simple_rename,
	.setattr	= hugetlbfs_setattr,
	.tmpfile	= hugetlbfs_tmpfile,
};

static const struct inode_operations hugetlbfs_inode_operations = {
	.setattr	= hugetlbfs_setattr,
};

static const struct super_operations hugetlbfs_ops = {
	.alloc_inode    = hugetlbfs_alloc_inode,
	.free_inode     = hugetlbfs_free_inode,
	.destroy_inode  = hugetlbfs_destroy_inode,
	.evict_inode	= hugetlbfs_evict_inode,
	.statfs		= hugetlbfs_statfs,
	.put_super	= hugetlbfs_put_super,
	.show_options	= hugetlbfs_show_options,
};

/*
 * Convert size option passed from command line to number of huge pages
 * in the pool specified by hstate.  Size option could be in bytes
 * (val_type == SIZE_STD) or percentage of the pool (val_type == SIZE_PERCENT).
 */
static long
hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt,
			 enum hugetlbfs_size_type val_type)
{
	if (val_type == NO_SIZE)
		return -1;

	if (val_type == SIZE_PERCENT) {
		size_opt <<= huge_page_shift(h);
		size_opt *= h->max_huge_pages;
		do_div(size_opt, 100);
	}

	size_opt >>= huge_page_shift(h);
	return size_opt;
}
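
/*
 * Example (illustrative): with 2MB huge pages and a pool of 512 pages,
 * "size=50%" arrives here as size_opt = 50 with SIZE_PERCENT and
 * converts to ((50 << 21) * 512 / 100) >> 21 = 256 huge pages, while
 * "size=1G" arrives as SIZE_STD and converts to 1G >> 21 = 512 pages.
 */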

/*
 * Parse one mount parameter.
 */
static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
	struct hugetlbfs_fs_context *ctx = fc->fs_private;
	struct fs_parse_result result;
	struct hstate *h;
	char *rest;
	unsigned long ps;
	int opt;

	opt = fs_parse(fc, hugetlb_fs_parameters, param, &result);
	if (opt < 0)
		return opt;

	switch (opt) {
	case Opt_uid:
		ctx->uid = make_kuid(current_user_ns(), result.uint_32);
		if (!uid_valid(ctx->uid))
			goto bad_val;
		return 0;

	case Opt_gid:
		ctx->gid = make_kgid(current_user_ns(), result.uint_32);
		if (!gid_valid(ctx->gid))
			goto bad_val;
		return 0;

	case Opt_mode:
		ctx->mode = result.uint_32 & 01777U;
		return 0;

	case Opt_size:
		/* memparse() will accept a K/M/G without a digit */
		if (!param->string || !isdigit(param->string[0]))
			goto bad_val;
		ctx->max_size_opt = memparse(param->string, &rest);
		ctx->max_val_type = SIZE_STD;
		if (*rest == '%')
			ctx->max_val_type = SIZE_PERCENT;
		return 0;

	case Opt_nr_inodes:
		/* memparse() will accept a K/M/G without a digit */
		if (!param->string || !isdigit(param->string[0]))
			goto bad_val;
		ctx->nr_inodes = memparse(param->string, &rest);
		return 0;

	case Opt_pagesize:
		ps = memparse(param->string, &rest);
		h = size_to_hstate(ps);
		if (!h) {
			pr_err("Unsupported page size %lu MB\n", ps / SZ_1M);
			return -EINVAL;
		}
		ctx->hstate = h;
		return 0;

	case Opt_min_size:
		/* memparse() will accept a K/M/G without a digit */
		if (!param->string || !isdigit(param->string[0]))
			goto bad_val;
		ctx->min_size_opt = memparse(param->string, &rest);
		ctx->min_val_type = SIZE_STD;
		if (*rest == '%')
			ctx->min_val_type = SIZE_PERCENT;
		return 0;

	default:
		return -EINVAL;
	}

bad_val:
	return invalfc(fc, "Bad value '%s' for mount option '%s'\n",
		      param->string, param->key);
}

/*
 * Validate the parsed options.
 */
static int hugetlbfs_validate(struct fs_context *fc)
{
	struct hugetlbfs_fs_context *ctx = fc->fs_private;

	/*
	 * Use huge page pool size (in hstate) to convert the size
	 * options to number of huge pages.  If NO_SIZE, -1 is returned.
	 */
	ctx->max_hpages = hugetlbfs_size_to_hpages(ctx->hstate,
						   ctx->max_size_opt,
						   ctx->max_val_type);
	ctx->min_hpages = hugetlbfs_size_to_hpages(ctx->hstate,
						   ctx->min_size_opt,
						   ctx->min_val_type);

	/*
	 * If max_size was specified, then min_size must not be greater.
	 */
	if (ctx->max_val_type > NO_SIZE &&
	    ctx->min_hpages > ctx->max_hpages) {
		pr_err("Minimum size can not be greater than maximum size\n");
		return -EINVAL;
	}

	return 0;
}

static int
hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
	struct hugetlbfs_fs_context *ctx = fc->fs_private;
	struct hugetlbfs_sb_info *sbinfo;

	sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL);
	if (!sbinfo)
		return -ENOMEM;
	sb->s_fs_info = sbinfo;
	spin_lock_init(&sbinfo->stat_lock);
	sbinfo->hstate		= ctx->hstate;
	sbinfo->max_inodes	= ctx->nr_inodes;
	sbinfo->free_inodes	= ctx->nr_inodes;
	sbinfo->spool		= NULL;
	sbinfo->uid		= ctx->uid;
	sbinfo->gid		= ctx->gid;
	sbinfo->mode		= ctx->mode;

	/*
	 * Allocate and initialize subpool if maximum or minimum size is
	 * specified.  Any needed reservations (for minimum size) are taken
	 * when the subpool is created.
	 */
	if (ctx->max_hpages != -1 || ctx->min_hpages != -1) {
		sbinfo->spool = hugepage_new_subpool(ctx->hstate,
						     ctx->max_hpages,
						     ctx->min_hpages);
		if (!sbinfo->spool)
			goto out_free;
	}
	sb->s_maxbytes = MAX_LFS_FILESIZE;
	sb->s_blocksize = huge_page_size(ctx->hstate);
	sb->s_blocksize_bits = huge_page_shift(ctx->hstate);
	sb->s_magic = HUGETLBFS_MAGIC;
	sb->s_op = &hugetlbfs_ops;
	sb->s_time_gran = 1;

	/*
	 * Due to the special and limited functionality of hugetlbfs, it does
	 * not work well as a stacking filesystem.
	 */
	sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
	sb->s_root = d_make_root(hugetlbfs_get_root(sb, ctx));
	if (!sb->s_root)
		goto out_free;
	return 0;
out_free:
	kfree(sbinfo->spool);
	kfree(sbinfo);
	return -ENOMEM;
}

static int hugetlbfs_get_tree(struct fs_context *fc)
{
	int err = hugetlbfs_validate(fc);
	if (err)
		return err;
	return get_tree_nodev(fc, hugetlbfs_fill_super);
}

static void hugetlbfs_fs_context_free(struct fs_context *fc)
{
	kfree(fc->fs_private);
}

static const struct fs_context_operations hugetlbfs_fs_context_ops = {
	.free		= hugetlbfs_fs_context_free,
	.parse_param	= hugetlbfs_parse_param,
	.get_tree	= hugetlbfs_get_tree,
};

static int hugetlbfs_init_fs_context(struct fs_context *fc)
{
	struct hugetlbfs_fs_context *ctx;

	ctx = kzalloc(sizeof(struct hugetlbfs_fs_context), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	ctx->max_hpages	= -1; /* No limit on size by default */
	ctx->nr_inodes	= -1; /* No limit on number of inodes by default */
	ctx->uid	= current_fsuid();
	ctx->gid	= current_fsgid();
	ctx->mode	= 0755;
	ctx->hstate	= &default_hstate;
	ctx->min_hpages	= -1; /* No default minimum size */
	ctx->max_val_type = NO_SIZE;
	ctx->min_val_type = NO_SIZE;
	fc->fs_private = ctx;
	fc->ops	= &hugetlbfs_fs_context_ops;
	return 0;
}

static struct file_system_type hugetlbfs_fs_type = {
	.name			= "hugetlbfs",
	.init_fs_context	= hugetlbfs_init_fs_context,
	.parameters		= hugetlb_fs_parameters,
	.kill_sb		= kill_litter_super,
	.fs_flags               = FS_ALLOW_IDMAP,
};

static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];

static int can_do_hugetlb_shm(void)
{
	kgid_t shm_group;
	shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group);
	return capable(CAP_IPC_LOCK) || in_group_p(shm_group);
}

static int get_hstate_idx(int page_size_log)
{
	struct hstate *h = hstate_sizelog(page_size_log);

	if (!h)
		return -1;
	return hstate_index(h);
}

/*
 * Note that size should be aligned to the proper hugepage size by the caller;
 * otherwise hugetlb_reserve_pages reserves one less hugepage than intended.
 */
struct file *hugetlb_file_setup(const char *name, size_t size,
				vm_flags_t acctflag, int creat_flags,
				int page_size_log)
{
	struct inode *inode;
	struct vfsmount *mnt;
	int hstate_idx;
	struct file *file;

	hstate_idx = get_hstate_idx(page_size_log);
	if (hstate_idx < 0)
		return ERR_PTR(-ENODEV);

	mnt = hugetlbfs_vfsmount[hstate_idx];
	if (!mnt)
		return ERR_PTR(-ENOENT);

	if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
		struct ucounts *ucounts = current_ucounts();

		if (user_shm_lock(size, ucounts)) {
			pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is obsolete\n",
				current->comm, current->pid);
			user_shm_unlock(size, ucounts);
		}
		return ERR_PTR(-EPERM);
	}

	file = ERR_PTR(-ENOSPC);
	/* hugetlbfs_vfsmount[] mounts do not use idmapped mounts.  */
	inode = hugetlbfs_get_inode(mnt->mnt_sb, &nop_mnt_idmap, NULL,
				    S_IFREG | S_IRWXUGO, 0);
	if (!inode)
		goto out;
	if (creat_flags == HUGETLB_SHMFS_INODE)
		inode->i_flags |= S_PRIVATE;

	inode->i_size = size;
	clear_nlink(inode);

	if (!hugetlb_reserve_pages(inode, 0,
			size >> huge_page_shift(hstate_inode(inode)), NULL,
			acctflag))
		file = ERR_PTR(-ENOMEM);
	else
		file = alloc_file_pseudo(inode, mnt, name, O_RDWR,
					&hugetlbfs_file_operations);
	if (!IS_ERR(file))
		return file;

	iput(inode);
out:
	return file;
}

static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h)
{
	struct fs_context *fc;
	struct vfsmount *mnt;

	fc = fs_context_for_mount(&hugetlbfs_fs_type, SB_KERNMOUNT);
	if (IS_ERR(fc)) {
		mnt = ERR_CAST(fc);
	} else {
		struct hugetlbfs_fs_context *ctx = fc->fs_private;
		ctx->hstate = h;
		mnt = fc_mount(fc);
		put_fs_context(fc);
	}
	if (IS_ERR(mnt))
		pr_err("Cannot mount internal hugetlbfs for page size %luK",
		       huge_page_size(h) / SZ_1K);
	return mnt;
}

static int __init init_hugetlbfs_fs(void)
{
	struct vfsmount *mnt;
	struct hstate *h;
	int error;
	int i;

	if (!hugepages_supported()) {
		pr_info("disabling because there are no supported hugepage sizes\n");
		return -ENOTSUPP;
	}

	error = -ENOMEM;
	hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
					sizeof(struct hugetlbfs_inode_info),
					0, SLAB_ACCOUNT, init_once);
	if (hugetlbfs_inode_cachep == NULL)
		goto out;

	error = register_filesystem(&hugetlbfs_fs_type);
	if (error)
		goto out_free;

	/* default hstate mount is required */
	mnt = mount_one_hugetlbfs(&default_hstate);
	if (IS_ERR(mnt)) {
		error = PTR_ERR(mnt);
		goto out_unreg;
	}
	hugetlbfs_vfsmount[default_hstate_idx] = mnt;

	/* other hstates are optional */
	i = 0;
	for_each_hstate(h) {
		if (i == default_hstate_idx) {
			i++;
			continue;
		}

		mnt = mount_one_hugetlbfs(h);
		if (IS_ERR(mnt))
			hugetlbfs_vfsmount[i] = NULL;
		else
			hugetlbfs_vfsmount[i] = mnt;
		i++;
	}

	return 0;

 out_unreg:
	(void)unregister_filesystem(&hugetlbfs_fs_type);
 out_free:
	kmem_cache_destroy(hugetlbfs_inode_cachep);
 out:
	return error;
}
fs_initcall(init_hugetlbfs_fs)