// SPDX-License-Identifier: GPL-2.0-only
/*
 * RAM-backed block device driver.
 *
 * Copyright (C) 2007 Nick Piggin
 * Copyright (C) 2007 Novell Inc.
 *
 * Parts derived from drivers/block/rd.c, and drivers/block/loop.c, copyright
 * of their respective owners.
 */

#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/major.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/highmem.h>
#include <linux/mutex.h>
#include <linux/pagemap.h>
#include <linux/xarray.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/debugfs.h>

#include <linux/uaccess.h>

/*
 * Each brd device has an xarray, brd_pages, holding the pages that back the
 * block device's contents. A brd page's ->index is its offset into the device
 * in PAGE_SIZE units. This is similar to, but in no way connected with, the
 * kernel's pagecache or buffer cache (which sit above our block device).
 */
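/*
 * For example (assuming 4 KiB pages, i.e. PAGE_SECTORS == 8): 512-byte
 * sector 9 lives in the brd page at index 1 (9 >> PAGE_SECTORS_SHIFT),
 * at byte offset 512 within that page.
 */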
struct brd_device {
	int			brd_number;
	struct gendisk		*brd_disk;
	struct list_head	brd_list;

	/*
	 * Backing store of pages. This is the contents of the block device.
	 */
	struct xarray		brd_pages;
	u64			brd_nr_pages;
};

/*
 * Look up and return a brd's page for a given sector.
 */
static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
{
	pgoff_t idx;
	struct page *page;

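	/*
	 * Lockless lookup: xa_load() is RCU-protected, so no xa_lock is
	 * needed here. A racing insertion is benign; we simply see either
	 * the old (empty) or the new state of the slot.
	 */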
	idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */
	page = xa_load(&brd->brd_pages, idx);

	BUG_ON(page && page->index != idx);

	return page;
}

/*
 * Insert a new page for a given sector, if one does not already exist.
 */
static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp)
{
	pgoff_t idx;
	struct page *page, *cur;
	int ret = 0;

	page = brd_lookup_page(brd, sector);
	if (page)
		return 0;

	page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM);
	if (!page)
		return -ENOMEM;

	xa_lock(&brd->brd_pages);

	idx = sector >> PAGE_SECTORS_SHIFT;
	page->index = idx;

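	/*
	 * Atomically install the new page only if the slot is still empty.
	 * __xa_cmpxchg() returns the old entry: NULL on success, the racing
	 * winner's page if someone beat us to it, or an xa_err() entry on
	 * failure.
	 */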
	cur = __xa_cmpxchg(&brd->brd_pages, idx, NULL, page, gfp);

	if (unlikely(cur)) {
		__free_page(page);
		ret = xa_err(cur);
		if (!ret && (cur->index != idx))
			ret = -EIO;
	} else {
		brd->brd_nr_pages++;
	}

	xa_unlock(&brd->brd_pages);

	return ret;
}

/*
 * Free all backing store pages and destroy the xarray. This must only be
 * called when there are no other users of the device.
 */
static void brd_free_pages(struct brd_device *brd)
{
	struct page *page;
	pgoff_t idx;

	xa_for_each(&brd->brd_pages, idx, page) {
		__free_page(page);
		cond_resched();
	}

	xa_destroy(&brd->brd_pages);
}

/*
 * copy_to_brd_setup must be called before copy_to_brd. It may sleep.
 */
static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n,
			     gfp_t gfp)
{
	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
	size_t copy;
	int ret;

	copy = min_t(size_t, n, PAGE_SIZE - offset);
	ret = brd_insert_page(brd, sector, gfp);
	if (ret)
		return ret;
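	/*
	 * n is at most PAGE_SIZE here (bio_for_each_segment() yields
	 * single-page segments), so the copy straddles at most one brd
	 * page boundary and needs at most one extra page.
	 */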
	if (copy < n) {
		sector += copy >> SECTOR_SHIFT;
		ret = brd_insert_page(brd, sector, gfp);
	}
	return ret;
}

/*
 * Copy n bytes from src to the brd starting at sector. Does not sleep.
 */
static void copy_to_brd(struct brd_device *brd, const void *src,
			sector_t sector, size_t n)
{
	struct page *page;
	void *dst;
	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
	size_t copy;

	copy = min_t(size_t, n, PAGE_SIZE - offset);
	page = brd_lookup_page(brd, sector);
	BUG_ON(!page);

	dst = kmap_atomic(page);
	memcpy(dst + offset, src, copy);
	kunmap_atomic(dst);

	if (copy < n) {
		src += copy;
		sector += copy >> SECTOR_SHIFT;
		copy = n - copy;
		page = brd_lookup_page(brd, sector);
		BUG_ON(!page);

		dst = kmap_atomic(page);
		memcpy(dst, src, copy);
		kunmap_atomic(dst);
	}
}

/*
 * Copy n bytes to dst from the brd starting at sector. Does not sleep.
 */
static void copy_from_brd(void *dst, struct brd_device *brd,
			sector_t sector, size_t n)
{
	struct page *page;
	void *src;
	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
	size_t copy;

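	/* Pages that were never written are simply absent; read them back as zeroes. */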
	copy = min_t(size_t, n, PAGE_SIZE - offset);
	page = brd_lookup_page(brd, sector);
	if (page) {
		src = kmap_atomic(page);
		memcpy(dst, src + offset, copy);
		kunmap_atomic(src);
	} else
		memset(dst, 0, copy);

	if (copy < n) {
		dst += copy;
		sector += copy >> SECTOR_SHIFT;
		copy = n - copy;
		page = brd_lookup_page(brd, sector);
		if (page) {
			src = kmap_atomic(page);
			memcpy(dst, src, copy);
			kunmap_atomic(src);
		} else
			memset(dst, 0, copy);
	}
}

/*
 * Process a single bvec of a bio.
 */
static int brd_do_bvec(struct brd_device *brd, struct page *page,
			unsigned int len, unsigned int off, blk_opf_t opf,
			sector_t sector)
{
	void *mem;
	int err = 0;

	if (op_is_write(opf)) {
		/*
		 * Must use NOIO because we don't want to recurse back into the
		 * block or filesystem layers from page reclaim.
		 */
		gfp_t gfp = opf & REQ_NOWAIT ? GFP_NOWAIT : GFP_NOIO;
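		/*
		 * REQ_NOWAIT callers must not block; on allocation failure we
		 * return -ENOMEM and brd_submit_bio() turns that into
		 * bio_wouldblock_error().
		 */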

		err = copy_to_brd_setup(brd, sector, len, gfp);
		if (err)
			goto out;
	}

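	/*
	 * flush_dcache_page() ordering: for a read we flush after filling the
	 * caller's page so user mappings see the new data; for a write we
	 * flush before copying out of it so we see the caller's latest data.
	 */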
	mem = kmap_atomic(page);
	if (!op_is_write(opf)) {
		copy_from_brd(mem + off, brd, sector, len);
		flush_dcache_page(page);
	} else {
		flush_dcache_page(page);
		copy_to_brd(brd, mem + off, sector, len);
	}
	kunmap_atomic(mem);

out:
	return err;
}

static void brd_submit_bio(struct bio *bio)
{
	struct brd_device *brd = bio->bi_bdev->bd_disk->private_data;
	sector_t sector = bio->bi_iter.bi_sector;
	struct bio_vec bvec;
	struct bvec_iter iter;

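	/*
	 * bio_for_each_segment() yields segments that never cross a source
	 * page boundary, so each brd_do_bvec() call handles at most
	 * PAGE_SIZE bytes.
	 */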
	bio_for_each_segment(bvec, bio, iter) {
		unsigned int len = bvec.bv_len;
		int err;

		/* We don't support unaligned buffers. */
		WARN_ON_ONCE((bvec.bv_offset & (SECTOR_SIZE - 1)) ||
				(len & (SECTOR_SIZE - 1)));

		err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset,
				  bio->bi_opf, sector);
		if (err) {
			if (err == -ENOMEM && bio->bi_opf & REQ_NOWAIT) {
				bio_wouldblock_error(bio);
				return;
			}
			bio_io_error(bio);
			return;
		}
		sector += len >> SECTOR_SHIFT;
	}

	bio_endio(bio);
}

static const struct block_device_operations brd_fops = {
	.owner =		THIS_MODULE,
	.submit_bio =		brd_submit_bio,
};

/*
 * And now the module code and kernel interface.
 */
static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT;
module_param(rd_nr, int, 0444);
MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");

unsigned long rd_size = CONFIG_BLK_DEV_RAM_SIZE;
module_param(rd_size, ulong, 0444);
MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");

static int max_part = 1;
module_param(max_part, int, 0444);
MODULE_PARM_DESC(max_part, "Number of minors to reserve between devices");

MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
MODULE_ALIAS("rd");

#ifndef MODULE
/* Legacy boot options - nonmodular */
static int __init ramdisk_size(char *str)
{
	rd_size = simple_strtol(str, NULL, 0);
	return 1;
}
__setup("ramdisk_size=", ramdisk_size);
#endif

/*
 * The device scheme is derived from loop.c. Keep them in sync where possible
 * (should share code eventually).
 */
static LIST_HEAD(brd_devices);
static struct dentry *brd_debugfs_dir;

static int brd_alloc(int i)
{
	struct brd_device *brd;
	struct gendisk *disk;
	char buf[DISK_NAME_LEN];
	int err = -ENOMEM;
	struct queue_limits lim = {
		/*
		 * This is so fdisk will align partitions on 4k, because the
		 * direct_access API needs 4k alignment when returning a PFN.
		 * (This is only a problem on very small devices <= 4M;
		 *  otherwise fdisk will align on 1M. Regardless, this setting
		 *  is harmless.)
		 */
		.physical_block_size	= PAGE_SIZE,
	};

	list_for_each_entry(brd, &brd_devices, brd_list)
		if (brd->brd_number == i)
			return -EEXIST;
	brd = kzalloc(sizeof(*brd), GFP_KERNEL);
	if (!brd)
		return -ENOMEM;
	brd->brd_number		= i;
	list_add_tail(&brd->brd_list, &brd_devices);

	xa_init(&brd->brd_pages);

	snprintf(buf, DISK_NAME_LEN, "ram%d", i);
	if (!IS_ERR_OR_NULL(brd_debugfs_dir))
		debugfs_create_u64(buf, 0444, brd_debugfs_dir,
				&brd->brd_nr_pages);

	disk = brd->brd_disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
	if (IS_ERR(disk)) {
		err = PTR_ERR(disk);
		goto out_free_dev;
	}
	disk->major		= RAMDISK_MAJOR;
	disk->first_minor	= i * max_part;
	disk->minors		= max_part;
	disk->fops		= &brd_fops;
	disk->private_data	= brd;
	strscpy(disk->disk_name, buf, DISK_NAME_LEN);
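	/* rd_size is in KiB; set_capacity() takes 512-byte sectors, hence * 2. */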
	set_capacity(disk, rd_size * 2);

	/* Tell the block layer that this is not a rotational device */
	blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
	blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, disk->queue);
	blk_queue_flag_set(QUEUE_FLAG_NOWAIT, disk->queue);
	err = add_disk(disk);
	if (err)
		goto out_cleanup_disk;

	return 0;

out_cleanup_disk:
	put_disk(disk);
out_free_dev:
	list_del(&brd->brd_list);
	kfree(brd);
	return err;
}

static void brd_probe(dev_t dev)
{
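	/* Each device owns max_part minors; map the minor back to a device index. */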
	brd_alloc(MINOR(dev) / max_part);
}

static void brd_cleanup(void)
{
	struct brd_device *brd, *next;

	debugfs_remove_recursive(brd_debugfs_dir);

	list_for_each_entry_safe(brd, next, &brd_devices, brd_list) {
		del_gendisk(brd->brd_disk);
		put_disk(brd->brd_disk);
		brd_free_pages(brd);
		list_del(&brd->brd_list);
		kfree(brd);
	}
}

static inline void brd_check_and_reset_par(void)
{
	if (unlikely(!max_part))
		max_part = 1;

	/*
	 * Make sure 'max_part' divides (1U << MINORBITS) exactly; otherwise
	 * it is possible to get the same dev_t when adding partitions.
	 */
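	/* fls() rounds a non-power-of-two max_part up to the next power of two. */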
	if ((1U << MINORBITS) % max_part != 0)
		max_part = 1UL << fls(max_part);

	if (max_part > DISK_MAX_PARTS) {
		pr_info("brd: max_part can't be larger than %d, reset max_part = %d.\n",
			DISK_MAX_PARTS, DISK_MAX_PARTS);
		max_part = DISK_MAX_PARTS;
	}
}

static int __init brd_init(void)
{
	int err, i;

	brd_check_and_reset_par();

	brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL);

	for (i = 0; i < rd_nr; i++) {
		err = brd_alloc(i);
		if (err)
			goto out_free;
	}

	/*
	 * The brd module can instantiate the underlying device structure
	 * on demand when its device node is accessed.
	 *
	 * (1) If rd_nr is specified, that many devices are created upfront;
	 *     it defaults to CONFIG_BLK_DEV_RAM_COUNT.
	 * (2) Users can get further brd devices by creating device nodes
	 *     themselves and having the kernel instantiate the actual device
	 *     on demand. Example:
	 *		mknod /path/devnod_name b 1 X	# 1 is the rd major
	 *		fdisk -l /path/devnod_name
	 *	If device (X / max_part) was not already created, it will be
	 *	created dynamically.
	 */

	if (__register_blkdev(RAMDISK_MAJOR, "ramdisk", brd_probe)) {
		err = -EIO;
		goto out_free;
	}

	pr_info("brd: module loaded\n");
	return 0;

out_free:
	brd_cleanup();

	pr_info("brd: module NOT loaded !!!\n");
	return err;
}

static void __exit brd_exit(void)
{
	unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
	brd_cleanup();

	pr_info("brd: module unloaded\n");
}

module_init(brd_init);
module_exit(brd_exit);