ramdisk.c revision 7656:2621e50fdf4a
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26
27/*
28 * Ramdisk device driver.
29 *
30 * There are two types of ramdisk: 'real' OBP-created ramdisks, and 'pseudo'
31 * ramdisks created at runtime with no corresponding OBP device node.  The
32 * ramdisk(7D) driver is capable of dealing with both, and with the creation
33 * and deletion of 'pseudo' ramdisks.
34 *
35 * Every ramdisk has a single 'state' structure which maintains data for
36 * that ramdisk, and is assigned a single minor number.  The bottom 10-bits
37 * of the minor number index the state structures; the top 8-bits give a
38 * 'real OBP disk' number, i.e. they are zero for 'pseudo' ramdisks.  Thus
39 * it is possible to distinguish 'real' from 'pseudo' ramdisks using the
40 * top 8-bits of the minor number.
41 *
42 * Each OBP-created ramdisk has its own node in the device tree with an
43 * "existing" property which describes the one-or-more physical address ranges
44 * assigned to the ramdisk.  All 'pseudo' ramdisks share a common devinfo
45 * structure.
46 *
47 * A single character device node is used by ramdiskadm(1M) to communicate
48 * with the ramdisk driver, with minor number 0:
49 *
50 *	/dev/ramdiskctl -> /devices/pseudo/ramdisk@0:ctl
51 *
52 * For consistent access, block and raw device nodes are created for *every*
53 * ramdisk.  For 'pseudo' ramdisks:
54 *
55 *	/dev/ramdisk/<diskname>  -> /devices/pseudo/ramdisk@0:<diskname>
56 *	/dev/rramdisk/<diskname> -> /devices/pseudo/ramdisk@0:<diskname>,raw
57 *
58 * For OBP-created ramdisks:
59 *
60 *	/dev/ramdisk/<diskname>  -> /devices/ramdisk-<diskname>:a
61 *	/dev/rramdisk/<diskname> -> /devices/ramdisk-<diskname>:a,raw
62 *
63 * This allows the transition from the standalone to the kernel to proceed
64 * when booting from a ramdisk, and for the installation to correctly identify
65 * the root device.
66 */
67
68#include <sys/types.h>
69#include <sys/param.h>
70#include <sys/sysmacros.h>
71#include <sys/errno.h>
72#include <sys/uio.h>
73#include <sys/buf.h>
74#include <sys/modctl.h>
75#include <sys/open.h>
76#include <sys/kmem.h>
77#include <sys/poll.h>
78#include <sys/conf.h>
79#include <sys/cmn_err.h>
80#include <sys/stat.h>
81#include <sys/file.h>
82#include <sys/ddi.h>
83#include <sys/sunddi.h>
84#include <sys/ramdisk.h>
85#include <vm/seg_kmem.h>
86
/*
 * An opaque handle where information about our set of ramdisk devices lives.
 * Every per-disk rd_devstate_t is looked up through this handle with
 * ddi_get_soft_state(), indexed by minor number.
 */
static void	*rd_statep;

/*
 * Pointer to devinfo for the 'pseudo' ramdisks.  Real OBP-created ramdisks
 * get their own individual devinfo.  Comparing a state's rd_dip against
 * this pointer is how the driver tells pseudo ramdisks from OBP ones.
 */
static dev_info_t *rd_dip = NULL;

/*
 * Global state lock.
 */
static kmutex_t	rd_lock;

/*
 * Maximum number of ramdisks supported by this driver.  The lookup helpers
 * scan minors 1 through rd_max_disks inclusive.
 */
static uint32_t	rd_max_disks = RD_DFLT_DISKS;

/*
 * Percentage of physical memory which can be assigned to pseudo ramdisks,
 * what that equates to in pages, and how many pages are currently assigned.
 */
static uint_t	rd_percent_physmem = RD_DEFAULT_PERCENT_PHYSMEM;
static pgcnt_t	rd_max_physmem;
static pgcnt_t	rd_tot_physmem;

/*
 * Largest b_bcount permitted for a single transfer; rd_minphys() clamps
 * larger requests down to this value.
 */
static uint_t	rd_maxphys = RD_DEFAULT_MAXPHYS;
117
118/*
119 * Is the driver busy, i.e. are there any pseudo ramdisk devices in existence?
120 */
121static int
122rd_is_busy(void)
123{
124	minor_t	minor;
125	rd_devstate_t	*rsp;
126
127	ASSERT(mutex_owned(&rd_lock));
128	for (minor = 1; minor <= rd_max_disks; ++minor) {
129		if ((rsp = ddi_get_soft_state(rd_statep, minor)) != NULL &&
130		    rsp->rd_dip == rd_dip) {
131			return (EBUSY);
132		}
133	}
134	return (0);
135}
136
137/*
138 * Find the first free minor number; returns zero if there isn't one.
139 */
140static minor_t
141rd_find_free_minor(void)
142{
143	minor_t	minor;
144
145	ASSERT(mutex_owned(&rd_lock));
146	for (minor = 1; minor <= rd_max_disks; ++minor) {
147		if (ddi_get_soft_state(rd_statep, minor) == NULL) {
148			return (minor);
149		}
150	}
151	return (0);
152}
153
154/*
155 * Locate the rd_devstate for the named ramdisk; returns NULL if not found.
156 * Each ramdisk is identified uniquely by name, i.e. an OBP-created ramdisk
157 * cannot have the same name as a pseudo ramdisk.
158 */
159static rd_devstate_t *
160rd_find_named_disk(char *name)
161{
162	minor_t		minor;
163	rd_devstate_t	*rsp;
164
165	ASSERT(mutex_owned(&rd_lock));
166	for (minor = 1; minor <= rd_max_disks; ++minor) {
167		if ((rsp = ddi_get_soft_state(rd_statep, minor)) != NULL &&
168		    strcmp(rsp->rd_name, name) == 0) {
169			return (rsp);
170		}
171	}
172	return (NULL);
173}
174
175/*
176 * Locate the rd_devstate for the real OBP-created ramdisk whose devinfo
177 * is referenced by 'dip'; returns NULL if not found (shouldn't happen).
178 */
179static rd_devstate_t *
180rd_find_dip_state(dev_info_t *dip)
181{
182	minor_t		minor;
183	rd_devstate_t	*rsp;
184
185	ASSERT(mutex_owned(&rd_lock));
186	for (minor = 1; minor <= rd_max_disks; ++minor) {
187		if ((rsp = ddi_get_soft_state(rd_statep, minor)) != NULL &&
188		    rsp->rd_dip == dip) {
189			return (rsp);
190		}
191	}
192	return (NULL);
193}
194
195/*
196 * Is the ramdisk open?
197 */
198static int
199rd_is_open(rd_devstate_t *rsp)
200{
201	ASSERT(mutex_owned(&rd_lock));
202	return (rsp->rd_chr_open || rsp->rd_blk_open || rsp->rd_lyr_open_cnt);
203}
204
205/*
206 * Mark the ramdisk open.
207 */
208static int
209rd_opened(rd_devstate_t *rsp, int otyp)
210{
211	ASSERT(mutex_owned(&rd_lock));
212	switch (otyp) {
213	case OTYP_CHR:
214		rsp->rd_chr_open = 1;
215		break;
216	case OTYP_BLK:
217		rsp->rd_blk_open = 1;
218		break;
219	case OTYP_LYR:
220		rsp->rd_lyr_open_cnt++;
221		break;
222	default:
223		return (-1);
224	}
225	return (0);
226}
227
228/*
229 * Mark the ramdisk closed.
230 */
231static void
232rd_closed(rd_devstate_t *rsp, int otyp)
233{
234	ASSERT(mutex_owned(&rd_lock));
235	switch (otyp) {
236	case OTYP_CHR:
237		rsp->rd_chr_open = 0;
238		break;
239	case OTYP_BLK:
240		rsp->rd_blk_open = 0;
241		break;
242	case OTYP_LYR:
243		rsp->rd_lyr_open_cnt--;
244		break;
245	default:
246		break;
247	}
248}
249
250static void
251rd_init_tuneables(void)
252{
253	char	*prop, *p;
254
255	/*
256	 * Ensure sanity of 'rd_max_disks', which may be tuned in ramdisk.conf.
257	 */
258	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, rd_dip, 0,
259	    "max_disks", &prop) == DDI_PROP_SUCCESS) {
260		p = prop;
261		rd_max_disks = (uint32_t)stoi(&p);
262		ddi_prop_free(prop);
263	}
264	if (rd_max_disks >= RD_MAX_DISKS) {
265		cmn_err(CE_WARN, "ramdisk: rd_max_disks (%u) too big;"
266		    " using default (%u).", rd_max_disks, RD_MAX_DISKS - 1);
267
268		rd_max_disks = RD_MAX_DISKS - 1;
269	}
270
271	/*
272	 * Ensure sanity of 'rd_percent_physmem', which may be tuned
273	 * in ramdisk.conf.
274	 */
275	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, rd_dip, 0,
276	    "percent_physmem", &prop) == DDI_PROP_SUCCESS) {
277		p = prop;
278		rd_percent_physmem = (uint_t)stoi(&p);
279		ddi_prop_free(prop);
280	}
281	if (rd_percent_physmem >= 100) {
282		cmn_err(CE_WARN, "ramdisk: rd_percent_physmem (%u) >= 100;"
283		    " using default (%u%%).", rd_percent_physmem,
284		    RD_DEFAULT_PERCENT_PHYSMEM);
285
286		rd_percent_physmem = RD_DEFAULT_PERCENT_PHYSMEM;
287	}
288
289	/*
290	 * Since availrmem_initial is a long, this won't overflow.
291	 */
292	rd_max_physmem = (availrmem_initial * rd_percent_physmem) / 100;
293}
294
295/*
296 * Allocate enough physical pages to hold "npages" pages.  Returns an
297 * array of page_t * pointers that can later be mapped in or out via
298 * rd_{un}map_window() but is otherwise opaque, or NULL on failure.
299 */
300page_t **
301rd_phys_alloc(pgcnt_t npages)
302{
303	page_t		*pp, **ppa;
304	spgcnt_t	i;
305	size_t		ppalen;
306	struct seg	kseg;
307	caddr_t		addr;		/* For coloring */
308
309	if (rd_tot_physmem + npages > rd_max_physmem)
310		return (NULL);
311
312	if (!page_resv(npages, KM_NOSLEEP))
313		return (NULL);
314
315	if (!page_create_wait(npages, 0)) {
316		page_unresv(npages);
317		return (NULL);
318	}
319
320	ppalen = npages * sizeof (struct page_t *);
321	ppa = kmem_zalloc(ppalen, KM_NOSLEEP);
322	if (ppa == NULL) {
323		page_create_putback(npages);
324		page_unresv(npages);
325		return (NULL);
326	}
327
328	kseg.s_as = &kas;
329	for (i = 0, addr = NULL; i < npages; ++i, addr += PAGESIZE) {
330		pp = page_get_freelist(&kvp, 0, &kseg, addr, PAGESIZE, 0, NULL);
331		if (pp == NULL) {
332			pp = page_get_cachelist(&kvp, 0, &kseg, addr, 0, NULL);
333			if (pp == NULL)
334				goto out;
335			if (!PP_ISAGED(pp))
336				page_hashout(pp, NULL);
337		}
338
339		PP_CLRFREE(pp);
340		PP_CLRAGED(pp);
341		ppa[i] = pp;
342	}
343
344	for (i = 0; i < npages; i++)
345		page_downgrade(ppa[i]);
346	rd_tot_physmem += npages;
347
348	return (ppa);
349
350out:
351	ASSERT(i < npages);
352	page_create_putback(npages - i);
353	while (--i >= 0)
354		page_free(ppa[i], 0);
355	kmem_free(ppa, ppalen);
356	page_unresv(npages);
357
358	return (NULL);
359}
360
361/*
362 * Free physical pages previously allocated via rd_phys_alloc(); note that
363 * this function may block as it has to wait until it can exclusively lock
364 * all the pages first.
365 */
366static void
367rd_phys_free(page_t **ppa, pgcnt_t npages)
368{
369	pgcnt_t	i;
370	size_t	ppalen = npages * sizeof (struct page_t *);
371
372	for (i = 0; i < npages; ++i) {
373		if (! page_tryupgrade(ppa[i])) {
374			page_unlock(ppa[i]);
375			while (! page_lock(ppa[i], SE_EXCL, NULL, P_RECLAIM))
376				;
377		}
378		page_free(ppa[i], 0);
379	}
380
381	kmem_free(ppa, ppalen);
382
383	page_unresv(npages);
384	rd_tot_physmem -= npages;
385}
386
387/*
388 * Remove a window mapping (if present).
389 */
390static void
391rd_unmap_window(rd_devstate_t *rsp)
392{
393	ASSERT(rsp->rd_window_obp == 0);
394	if (rsp->rd_window_base != RD_WINDOW_NOT_MAPPED) {
395		hat_unload(kas.a_hat, rsp->rd_window_virt, rsp->rd_window_size,
396		    HAT_UNLOAD_UNLOCK);
397	}
398}
399
400/*
401 * Map a portion of the ramdisk into the virtual window.
402 */
403static void
404rd_map_window(rd_devstate_t *rsp, off_t offset)
405{
406	pgcnt_t	offpgs = btop(offset);
407
408	if (rsp->rd_window_base != RD_WINDOW_NOT_MAPPED) {
409		/*
410		 * Already mapped; is offset within our window?
411		 */
412		if (offset >= rsp->rd_window_base &&
413		    offset < rsp->rd_window_base + rsp->rd_window_size) {
414			return;
415		}
416
417		/*
418		 * No, we need to re-map; toss the old mapping.
419		 */
420		rd_unmap_window(rsp);
421	}
422	rsp->rd_window_base = ptob(offpgs);
423
424	/*
425	 * Different algorithms depending on whether this is a real
426	 * OBP-created ramdisk, or a pseudo ramdisk.
427	 */
428	if (rsp->rd_dip == rd_dip) {
429		pgcnt_t	pi, lastpi;
430		caddr_t	vaddr;
431
432		/*
433		 * Find the range of pages which should be mapped.
434		 */
435		pi = offpgs;
436		lastpi = pi + btopr(rsp->rd_window_size);
437		if (lastpi > rsp->rd_npages) {
438			lastpi = rsp->rd_npages;
439		}
440
441		/*
442		 * Load the mapping.
443		 */
444		vaddr = rsp->rd_window_virt;
445		for (; pi < lastpi; ++pi) {
446			hat_memload(kas.a_hat, vaddr, rsp->rd_ppa[pi],
447			    (PROT_READ | PROT_WRITE) | HAT_NOSYNC,
448			    HAT_LOAD_LOCK);
449			vaddr += ptob(1);
450		}
451	} else {
452		uint_t	i;
453		pfn_t	pfn;
454
455		/*
456		 * Real OBP-created ramdisk: locate the physical range which
457		 * contains this offset.
458		 */
459		for (i = 0; i < rsp->rd_nexisting; ++i) {
460			if (offset < rsp->rd_existing[i].size) {
461				break;
462			}
463			offset -= rsp->rd_existing[i].size;
464		}
465		ASSERT(i < rsp->rd_nexisting);
466
467		/*
468		 * Load the mapping.
469		 */
470		pfn = btop(rsp->rd_existing[i].phys + offset);
471		hat_devload(kas.a_hat, rsp->rd_window_virt, rsp->rd_window_size,
472		    pfn, (PROT_READ | PROT_WRITE),
473		    HAT_LOAD_NOCONSIST | HAT_LOAD_LOCK);
474	}
475}
476
/*
 * Fakes up a disk geometry, and one big partition, based on the size
 * of the ramdisk (rsp->rd_size). This is needed because we allow newfs'ing
 * the device, and newfs will do several disk ioctls to figure out the
 * geometry and partition information. It uses that information to determine
 * the parameters to pass to mkfs. Geometry is pretty much irrelevant these
 * days, but we have to support it.
 *
 * Fills in rsp->rd_dkg, rsp->rd_vtoc and rsp->rd_ci; nothing is returned.
 *
 * Stolen from lofi.c - should maybe split out common code sometime.
 */
static void
rd_fake_disk_geometry(rd_devstate_t *rsp)
{
	/* dk_geom - see dkio(7I) */
	/*
	 * dkg_ncyl _could_ be set to one here (one big cylinder with gobs
	 * of sectors), but that breaks programs like fdisk which want to
	 * partition a disk by cylinder. With one cylinder, you can't create
	 * an fdisk partition and put pcfs on it for testing (hard to pick
	 * a number between one and one).
	 *
	 * The cheezy floppy test is an attempt to not have too few cylinders
	 * for a small disk, or so many on a big disk that you waste space
	 * for backup superblocks or cylinder group structures.
	 *
	 * NOTE(review): the width of dkg_ncyl/dkg_nsect is not visible here;
	 * confirm that very large ramdisks cannot overflow these fields.
	 */
	if (rsp->rd_size < (2 * 1024 * 1024)) /* floppy? */
		rsp->rd_dkg.dkg_ncyl = rsp->rd_size / (100 * 1024);
	else
		rsp->rd_dkg.dkg_ncyl = rsp->rd_size / (300 * 1024);
	/* in case the disk is < 100k; also keeps the divisor below non-zero */
	if (rsp->rd_dkg.dkg_ncyl == 0)
		rsp->rd_dkg.dkg_ncyl = 1;
	rsp->rd_dkg.dkg_acyl = 0;
	rsp->rd_dkg.dkg_bcyl = 0;
	rsp->rd_dkg.dkg_nhead = 1;
	rsp->rd_dkg.dkg_obs1 = 0;
	rsp->rd_dkg.dkg_intrlv = 0;
	rsp->rd_dkg.dkg_obs2 = 0;
	rsp->rd_dkg.dkg_obs3 = 0;
	rsp->rd_dkg.dkg_apc = 0;
	rsp->rd_dkg.dkg_rpm = 7200;
	rsp->rd_dkg.dkg_pcyl = rsp->rd_dkg.dkg_ncyl + rsp->rd_dkg.dkg_acyl;
	/* sectors per cylinder: total sectors divided by cylinder count */
	rsp->rd_dkg.dkg_nsect = rsp->rd_size /
	    (DEV_BSIZE * rsp->rd_dkg.dkg_ncyl);
	rsp->rd_dkg.dkg_write_reinstruct = 0;
	rsp->rd_dkg.dkg_read_reinstruct = 0;

	/* vtoc - see dkio(7I) */
	bzero(&rsp->rd_vtoc, sizeof (struct vtoc));
	rsp->rd_vtoc.v_sanity = VTOC_SANE;
	rsp->rd_vtoc.v_version = V_VERSION;
	/* presumably 7 == strlen(RD_DRIVER_NAME) - TODO confirm */
	bcopy(RD_DRIVER_NAME, rsp->rd_vtoc.v_volume, 7);
	rsp->rd_vtoc.v_sectorsz = DEV_BSIZE;
	rsp->rd_vtoc.v_nparts = 1;
	rsp->rd_vtoc.v_part[0].p_tag = V_UNASSIGNED;
	rsp->rd_vtoc.v_part[0].p_flag = V_UNMNT;
	rsp->rd_vtoc.v_part[0].p_start = (daddr_t)0;
	/*
	 * The partition size cannot just be the number of sectors, because
	 * that might not end on a cylinder boundary. And if that's the case,
	 * newfs/mkfs will print a scary warning. So just figure the size
	 * based on the number of cylinders and sectors/cylinder.
	 */
	rsp->rd_vtoc.v_part[0].p_size = rsp->rd_dkg.dkg_pcyl *
	    rsp->rd_dkg.dkg_nsect * rsp->rd_dkg.dkg_nhead;

	/* dk_cinfo - see dkio(7I) */
	bzero(&rsp->rd_ci, sizeof (struct dk_cinfo));
	(void) strcpy(rsp->rd_ci.dki_cname, RD_DRIVER_NAME);
	rsp->rd_ci.dki_ctype = DKC_MD;
	rsp->rd_ci.dki_flags = 0;
	rsp->rd_ci.dki_cnum = 0;
	rsp->rd_ci.dki_addr = 0;
	rsp->rd_ci.dki_space = 0;
	rsp->rd_ci.dki_prio = 0;
	rsp->rd_ci.dki_vec = 0;
	(void) strcpy(rsp->rd_ci.dki_dname, RD_DRIVER_NAME);
	rsp->rd_ci.dki_unit = 0;
	rsp->rd_ci.dki_slave = 0;
	rsp->rd_ci.dki_partition = 0;
	/*
	 * newfs uses this to set maxcontig. Must not be < 16, or it
	 * will be 0 when newfs multiplies it by DEV_BSIZE and divides
	 * it by the block size. Then tunefs doesn't work because
	 * maxcontig is 0.
	 */
	rsp->rd_ci.dki_maxtransfer = 16;
}
565
566/*
567 * Deallocate resources (virtual and physical, device nodes, structures)
568 * from a ramdisk.
569 */
570static void
571rd_dealloc_resources(rd_devstate_t *rsp)
572{
573	dev_info_t	*dip = rsp->rd_dip;
574	char		namebuf[RD_NAME_LEN + 5];
575	dev_t		fulldev;
576
577	if (rsp->rd_window_obp == 0 && rsp->rd_window_virt != NULL) {
578		if (rsp->rd_window_base != RD_WINDOW_NOT_MAPPED) {
579			rd_unmap_window(rsp);
580		}
581		vmem_free(heap_arena, rsp->rd_window_virt, rsp->rd_window_size);
582	}
583	mutex_destroy(&rsp->rd_device_lock);
584
585	if (rsp->rd_existing) {
586		ddi_prop_free(rsp->rd_existing);
587	}
588	if (rsp->rd_ppa != NULL) {
589		rd_phys_free(rsp->rd_ppa, rsp->rd_npages);
590	}
591
592	/*
593	 * Remove the block and raw device nodes.
594	 */
595	if (dip == rd_dip) {
596		(void) snprintf(namebuf, sizeof (namebuf), "%s",
597		    rsp->rd_name);
598		ddi_remove_minor_node(dip, namebuf);
599		(void) snprintf(namebuf, sizeof (namebuf), "%s,raw",
600		    rsp->rd_name);
601		ddi_remove_minor_node(dip, namebuf);
602	} else {
603		ddi_remove_minor_node(dip, "a");
604		ddi_remove_minor_node(dip, "a,raw");
605	}
606
607	/*
608	 * Remove the "Size" and "Nblocks" properties.
609	 */
610	fulldev = makedevice(ddi_driver_major(dip), rsp->rd_minor);
611	(void) ddi_prop_remove(fulldev, dip, SIZE_PROP_NAME);
612	(void) ddi_prop_remove(fulldev, dip, NBLOCKS_PROP_NAME);
613
614	if (rsp->rd_kstat) {
615		kstat_delete(rsp->rd_kstat);
616		mutex_destroy(&rsp->rd_kstat_lock);
617	}
618
619	ddi_soft_state_free(rd_statep, rsp->rd_minor);
620}
621
622/*
623 * Allocate resources (virtual and physical, device nodes, structures)
624 * to a ramdisk.
625 */
626static rd_devstate_t *
627rd_alloc_resources(char *name, uint_t addr, size_t size, dev_info_t *dip)
628{
629	minor_t		minor;
630	rd_devstate_t	*rsp;
631	char		namebuf[RD_NAME_LEN + 5];
632	dev_t		fulldev;
633	int64_t		Nblocks_prop_val;
634	int64_t		Size_prop_val;
635
636	minor = rd_find_free_minor();
637	if (ddi_soft_state_zalloc(rd_statep, minor) == DDI_FAILURE) {
638		return (NULL);
639	}
640	rsp = ddi_get_soft_state(rd_statep, minor);
641
642	(void) strcpy(rsp->rd_name, name);
643	rsp->rd_dip = dip;
644	rsp->rd_minor = minor;
645	rsp->rd_size = size;
646
647	/*
648	 * Allocate virtual window onto ramdisk.
649	 */
650	mutex_init(&rsp->rd_device_lock, NULL, MUTEX_DRIVER, NULL);
651	if (addr == 0) {
652		rsp->rd_window_obp = 0;
653		rsp->rd_window_base = RD_WINDOW_NOT_MAPPED;
654		rsp->rd_window_size = PAGESIZE;
655		rsp->rd_window_virt = vmem_alloc(heap_arena,
656		    rsp->rd_window_size, VM_SLEEP);
657		if (rsp->rd_window_virt == NULL) {
658			goto create_failed;
659		}
660	} else {
661		rsp->rd_window_obp = 1;
662		rsp->rd_window_base = 0;
663		rsp->rd_window_size = size;
664		rsp->rd_window_virt = (caddr_t)((ulong_t)addr);
665	}
666
667	/*
668	 * Allocate physical memory for non-OBP ramdisks.
669	 * Create pseudo block and raw device nodes.
670	 */
671	if (dip == rd_dip) {
672		rsp->rd_npages = btopr(size);
673		rsp->rd_ppa = rd_phys_alloc(rsp->rd_npages);
674		if (rsp->rd_ppa == NULL) {
675			goto create_failed;
676		}
677
678		/*
679		 * For non-OBP ramdisks the device nodes are:
680		 *
681		 *	/devices/pseudo/ramdisk@0:<diskname>
682		 *	/devices/pseudo/ramdisk@0:<diskname>,raw
683		 */
684		(void) snprintf(namebuf, sizeof (namebuf), "%s",
685		    rsp->rd_name);
686		if (ddi_create_minor_node(dip, namebuf, S_IFBLK, minor,
687		    DDI_PSEUDO, 0) == DDI_FAILURE) {
688			goto create_failed;
689		}
690		(void) snprintf(namebuf, sizeof (namebuf), "%s,raw",
691		    rsp->rd_name);
692		if (ddi_create_minor_node(dip, namebuf, S_IFCHR, minor,
693		    DDI_PSEUDO, 0) == DDI_FAILURE) {
694			goto create_failed;
695		}
696	} else {
697		/*
698		 * For OBP-created ramdisks the device nodes are:
699		 *
700		 *	/devices/ramdisk-<diskname>:a
701		 *	/devices/ramdisk-<diskname>:a,raw
702		 */
703		if (ddi_create_minor_node(dip, "a", S_IFBLK, minor,
704		    DDI_PSEUDO, 0) == DDI_FAILURE) {
705			goto create_failed;
706		}
707		if (ddi_create_minor_node(dip, "a,raw", S_IFCHR, minor,
708		    DDI_PSEUDO, 0) == DDI_FAILURE) {
709			goto create_failed;
710		}
711	}
712
713	/*
714	 * Create the "Size" and "Nblocks" properties.
715	 */
716	fulldev = makedevice(ddi_driver_major(dip), minor);
717	Size_prop_val = size;
718	if ((ddi_prop_update_int64(fulldev, dip,
719	    SIZE_PROP_NAME, Size_prop_val)) != DDI_PROP_SUCCESS) {
720		goto create_failed;
721	}
722	Nblocks_prop_val = size / DEV_BSIZE;
723	if ((ddi_prop_update_int64(fulldev, dip,
724	    NBLOCKS_PROP_NAME, Nblocks_prop_val)) != DDI_PROP_SUCCESS) {
725		goto create_failed;
726	}
727
728	/*
729	 * Allocate kstat stuff.
730	 */
731	rsp->rd_kstat = kstat_create(RD_DRIVER_NAME, minor, NULL,
732	    "disk", KSTAT_TYPE_IO, 1, 0);
733	if (rsp->rd_kstat) {
734		mutex_init(&rsp->rd_kstat_lock, NULL,
735		    MUTEX_DRIVER, NULL);
736		rsp->rd_kstat->ks_lock = &rsp->rd_kstat_lock;
737		kstat_install(rsp->rd_kstat);
738	}
739
740	rd_fake_disk_geometry(rsp);
741
742	return (rsp);
743
744create_failed:
745	/*
746	 * Cleanup.
747	 */
748	rd_dealloc_resources(rsp);
749
750	return (NULL);
751}
752
753/*
754 * Undo what we did in rd_attach, freeing resources and removing things which
755 * we installed.  The system framework guarantees we are not active with this
756 * devinfo node in any other entry points at this time.
757 */
758static int
759rd_common_detach(dev_info_t *dip)
760{
761	if (dip == rd_dip) {
762		/*
763		 * Pseudo node: can't detach if any pseudo ramdisks exist.
764		 */
765		if (rd_is_busy()) {
766			return (DDI_FAILURE);
767		}
768		ddi_soft_state_free(rd_statep, RD_CTL_MINOR);
769		rd_dip = NULL;
770	} else {
771		/*
772		 * A 'real' ramdisk; find the state and free resources.
773		 */
774		rd_devstate_t	*rsp;
775
776		if ((rsp = rd_find_dip_state(dip)) != NULL) {
777			rd_dealloc_resources(rsp);
778		}
779	}
780	ddi_remove_minor_node(dip, NULL);
781
782	return (DDI_SUCCESS);
783}
784
785static int
786rd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
787{
788	char		*name;
789	rd_existing_t	*ep = NULL;
790	uint_t		obpaddr = 0, nep, i;
791	size_t		size = 0;
792	rd_devstate_t	*rsp;
793
794	switch (cmd) {
795
796	case DDI_ATTACH:
797		mutex_enter(&rd_lock);
798
799		/*
800		 * For pseudo ramdisk devinfo set up state 0 and :ctl device;
801		 * else it's an OBP-created ramdisk.
802		 */
803		if (is_pseudo_device(dip)) {
804			rd_dip = dip;
805			rd_init_tuneables();
806
807			/*
808			 * The zeroth minor is reserved for the ramdisk
809			 * 'control' device.
810			 */
811			if (ddi_soft_state_zalloc(rd_statep, RD_CTL_MINOR) ==
812			    DDI_FAILURE) {
813				goto attach_failed;
814			}
815			rsp = ddi_get_soft_state(rd_statep, RD_CTL_MINOR);
816			rsp->rd_dip = dip;
817
818			if (ddi_create_minor_node(dip, RD_CTL_NODE,
819			    S_IFCHR, 0, DDI_PSEUDO, NULL) == DDI_FAILURE) {
820				goto attach_failed;
821			}
822		} else {
823			RD_STRIP_PREFIX(name, ddi_node_name(dip));
824
825			if (strlen(name) > RD_NAME_LEN) {
826				cmn_err(CE_CONT,
827				    "%s: name too long - ignoring\n", name);
828				goto attach_failed;
829			}
830
831			/*
832			 * An OBP-created ramdisk must have an 'existing'
833			 * property; get and check it.
834			 */
835			if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, dip,
836			    DDI_PROP_DONTPASS, OBP_EXISTING_PROP_NAME,
837			    (uchar_t **)&ep, &nep) == DDI_SUCCESS) {
838
839				if (nep == 0 || (nep % sizeof (*ep)) != 0) {
840					cmn_err(CE_CONT,
841					    "%s: " OBP_EXISTING_PROP_NAME
842					    " illegal size\n", name);
843					goto attach_failed;
844				}
845				nep /= sizeof (*ep);
846
847				/*
848				 * Calculate the size of the ramdisk.
849				 */
850				for (i = 0; i < nep; ++i) {
851					size += ep[i].size;
852				}
853			} else if ((obpaddr = ddi_prop_get_int(DDI_DEV_T_ANY,
854			    dip, DDI_PROP_DONTPASS, OBP_ADDRESS_PROP_NAME,
855			    0)) != 0)  {
856
857				size = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
858				    DDI_PROP_DONTPASS, OBP_SIZE_PROP_NAME, 0);
859			} else {
860				cmn_err(CE_CONT, "%s: missing OBP properties\n",
861				    name);
862				goto attach_failed;
863			}
864
865			/*
866			 * Allocate driver resources for the ramdisk.
867			 */
868			if ((rsp = rd_alloc_resources(name, obpaddr, size,
869			    dip)) == NULL) {
870				goto attach_failed;
871			}
872
873			rsp->rd_existing = ep;
874			rsp->rd_nexisting = nep;
875		}
876
877		mutex_exit(&rd_lock);
878
879		ddi_report_dev(dip);
880
881		return (DDI_SUCCESS);
882
883	case DDI_RESUME:
884		return (DDI_SUCCESS);
885
886	default:
887		return (DDI_FAILURE);
888	}
889
890attach_failed:
891	/*
892	 * Use our common detach routine to unallocate any stuff which
893	 * was allocated above.
894	 */
895	(void) rd_common_detach(dip);
896	mutex_exit(&rd_lock);
897
898	if (ep != NULL) {
899		ddi_prop_free(ep);
900	}
901	return (DDI_FAILURE);
902}
903
904static int
905rd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
906{
907	int	e;
908
909	switch (cmd) {
910
911	case DDI_DETACH:
912		mutex_enter(&rd_lock);
913		e = rd_common_detach(dip);
914		mutex_exit(&rd_lock);
915
916		return (e);
917
918	case DDI_SUSPEND:
919		return (DDI_SUCCESS);
920
921	default:
922		return (DDI_FAILURE);
923	}
924}
925
926/*ARGSUSED*/
927static int
928rd_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
929{
930	rd_devstate_t	*rsp;
931
932	switch (infocmd) {
933	case DDI_INFO_DEVT2DEVINFO:
934		if ((rsp = ddi_get_soft_state(rd_statep,
935		    getminor((dev_t)arg))) != NULL) {
936			*result = rsp->rd_dip;
937			return (DDI_SUCCESS);
938		}
939		*result = NULL;
940		return (DDI_FAILURE);
941
942	case DDI_INFO_DEVT2INSTANCE:
943		if ((rsp = ddi_get_soft_state(rd_statep,
944		    getminor((dev_t)arg))) != NULL) {
945			*result = (void *)(uintptr_t)
946			    ddi_get_instance(rsp->rd_dip);
947			return (DDI_SUCCESS);
948		}
949		*result = NULL;
950		return (DDI_FAILURE);
951
952	default:
953		return (DDI_FAILURE);
954	}
955}
956
957/*ARGSUSED3*/
958static int
959rd_open(dev_t *devp, int flag, int otyp, cred_t *credp)
960{
961	minor_t		minor;
962	rd_devstate_t	*rsp;
963
964	mutex_enter(&rd_lock);
965
966	minor = getminor(*devp);
967	if (minor == RD_CTL_MINOR) {
968		/*
969		 * Master control device; must be opened exclusively.
970		 */
971		if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
972			mutex_exit(&rd_lock);
973			return (EINVAL);
974		}
975
976		rsp = ddi_get_soft_state(rd_statep, RD_CTL_MINOR);
977		if (rsp == NULL) {
978			mutex_exit(&rd_lock);
979			return (ENXIO);
980		}
981
982		if (rd_is_open(rsp)) {
983			mutex_exit(&rd_lock);
984			return (EBUSY);
985		}
986		(void) rd_opened(rsp, OTYP_CHR);
987
988		mutex_exit(&rd_lock);
989
990		return (0);
991	}
992
993	rsp = ddi_get_soft_state(rd_statep, minor);
994	if (rsp == NULL) {
995		mutex_exit(&rd_lock);
996		return (ENXIO);
997	}
998
999	if (rd_opened(rsp, otyp) == -1) {
1000		mutex_exit(&rd_lock);
1001		return (EINVAL);
1002	}
1003
1004	mutex_exit(&rd_lock);
1005	return (0);
1006}
1007
1008/*ARGSUSED*/
1009static int
1010rd_close(dev_t dev, int flag, int otyp, struct cred *credp)
1011{
1012	minor_t		minor;
1013	rd_devstate_t	*rsp;
1014
1015	mutex_enter(&rd_lock);
1016
1017	minor = getminor(dev);
1018
1019	rsp = ddi_get_soft_state(rd_statep, minor);
1020	if (rsp == NULL) {
1021		mutex_exit(&rd_lock);
1022		return (EINVAL);
1023	}
1024
1025	rd_closed(rsp, otyp);
1026
1027	mutex_exit(&rd_lock);
1028
1029	return (0);
1030}
1031
1032static void
1033rd_minphys(struct buf *bp)
1034{
1035	if (bp->b_bcount > rd_maxphys) {
1036		bp->b_bcount = rd_maxphys;
1037	}
1038}
1039
1040static void
1041rd_rw(rd_devstate_t *rsp, struct buf *bp, offset_t offset, size_t nbytes)
1042{
1043	int	reading = bp->b_flags & B_READ;
1044	caddr_t	buf_addr;
1045
1046	bp_mapin(bp);
1047	buf_addr = bp->b_un.b_addr;
1048
1049	while (nbytes > 0) {
1050		offset_t	off_in_window;
1051		size_t		rem_in_window, copy_bytes;
1052		caddr_t		raddr;
1053
1054		mutex_enter(&rsp->rd_device_lock);
1055		rd_map_window(rsp, offset);
1056
1057		off_in_window = offset - rsp->rd_window_base;
1058		rem_in_window = rsp->rd_window_size - off_in_window;
1059
1060		raddr = rsp->rd_window_virt + off_in_window;
1061		copy_bytes = MIN(nbytes, rem_in_window);
1062
1063		if (reading) {
1064			(void) bcopy(raddr, buf_addr, copy_bytes);
1065		} else {
1066			(void) bcopy(buf_addr, raddr, copy_bytes);
1067		}
1068		mutex_exit(&rsp->rd_device_lock);
1069
1070		offset   += copy_bytes;
1071		buf_addr += copy_bytes;
1072		nbytes   -= copy_bytes;
1073	}
1074}
1075
1076static int
1077rd_strategy(struct buf *bp)
1078{
1079	rd_devstate_t	*rsp;
1080	offset_t	offset;
1081
1082	rsp = ddi_get_soft_state(rd_statep, getminor(bp->b_edev));
1083	offset = bp->b_blkno * DEV_BSIZE;
1084
1085	if (rsp == NULL) {
1086		bp->b_error = ENXIO;
1087		bp->b_flags |= B_ERROR;
1088	} else if (offset >= rsp->rd_size) {
1089		bp->b_error = EINVAL;
1090		bp->b_flags |= B_ERROR;
1091	} else {
1092		size_t	nbytes;
1093
1094		if (rsp->rd_kstat) {
1095			mutex_enter(rsp->rd_kstat->ks_lock);
1096			kstat_runq_enter(KSTAT_IO_PTR(rsp->rd_kstat));
1097			mutex_exit(rsp->rd_kstat->ks_lock);
1098		}
1099
1100		nbytes = min(bp->b_bcount, rsp->rd_size - offset);
1101
1102		rd_rw(rsp, bp, offset, nbytes);
1103
1104		bp->b_resid = bp->b_bcount - nbytes;
1105
1106		if (rsp->rd_kstat) {
1107			kstat_io_t *kioptr;
1108
1109			mutex_enter(rsp->rd_kstat->ks_lock);
1110			kioptr = KSTAT_IO_PTR(rsp->rd_kstat);
1111			if (bp->b_flags & B_READ) {
1112				kioptr->nread += nbytes;
1113				kioptr->reads++;
1114			} else {
1115				kioptr->nwritten += nbytes;
1116				kioptr->writes++;
1117			}
1118			kstat_runq_exit(kioptr);
1119			mutex_exit(rsp->rd_kstat->ks_lock);
1120		}
1121	}
1122
1123	biodone(bp);
1124	return (0);
1125}
1126
1127/*ARGSUSED*/
1128static int
1129rd_read(dev_t dev, struct uio *uiop, cred_t *credp)
1130{
1131	rd_devstate_t	*rsp;
1132
1133	rsp = ddi_get_soft_state(rd_statep, getminor(dev));
1134
1135	if (uiop->uio_offset >= rsp->rd_size)
1136		return (EINVAL);
1137
1138	return (physio(rd_strategy, NULL, dev, B_READ, rd_minphys, uiop));
1139}
1140
1141/*ARGSUSED*/
1142static int
1143rd_write(dev_t dev, register struct uio *uiop, cred_t *credp)
1144{
1145	rd_devstate_t	*rsp;
1146
1147	rsp = ddi_get_soft_state(rd_statep, getminor(dev));
1148
1149	if (uiop->uio_offset >= rsp->rd_size)
1150		return (EINVAL);
1151
1152	return (physio(rd_strategy, NULL, dev, B_WRITE, rd_minphys, uiop));
1153}
1154
1155/*ARGSUSED*/
1156static int
1157rd_create_disk(dev_t dev, struct rd_ioctl *urip, int mode, int *rvalp)
1158{
1159	struct rd_ioctl	kri;
1160	size_t		size;
1161	rd_devstate_t	*rsp;
1162
1163	if (ddi_copyin(urip, &kri, sizeof (kri), mode) == -1) {
1164		return (EFAULT);
1165	}
1166
1167	kri.ri_name[RD_NAME_LEN] = '\0';
1168
1169	size = kri.ri_size;
1170	if (size == 0) {
1171		return (EINVAL);
1172	}
1173	size = ptob(btopr(size));
1174
1175	mutex_enter(&rd_lock);
1176
1177	if (rd_find_named_disk(kri.ri_name) != NULL) {
1178		mutex_exit(&rd_lock);
1179		return (EEXIST);
1180	}
1181
1182	rsp = rd_alloc_resources(kri.ri_name, 0, size, rd_dip);
1183	if (rsp == NULL) {
1184		mutex_exit(&rd_lock);
1185		return (EAGAIN);
1186	}
1187
1188	mutex_exit(&rd_lock);
1189
1190	return (ddi_copyout(&kri, urip, sizeof (kri), mode) == -1 ? EFAULT : 0);
1191}
1192
1193/*ARGSUSED*/
1194static int
1195rd_delete_disk(dev_t dev, struct rd_ioctl *urip, int mode)
1196{
1197	struct rd_ioctl	kri;
1198	rd_devstate_t	*rsp;
1199
1200	if (ddi_copyin(urip, &kri, sizeof (kri), mode) == -1) {
1201		return (EFAULT);
1202	}
1203
1204	kri.ri_name[RD_NAME_LEN] = '\0';
1205
1206	mutex_enter(&rd_lock);
1207
1208	rsp = rd_find_named_disk(kri.ri_name);
1209	if (rsp == NULL || rsp->rd_dip != rd_dip) {
1210		mutex_exit(&rd_lock);
1211		return (EINVAL);
1212	}
1213	if (rd_is_open(rsp)) {
1214		mutex_exit(&rd_lock);
1215		return (EBUSY);
1216	}
1217
1218	rd_dealloc_resources(rsp);
1219
1220	mutex_exit(&rd_lock);
1221
1222	return (0);
1223}
1224
1225/*ARGSUSED*/
1226static int
1227rd_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
1228{
1229	minor_t		minor;
1230	int		error;
1231	enum dkio_state	dkstate;
1232	rd_devstate_t	*rsp;
1233
1234	minor = getminor(dev);
1235
1236	/*
1237	 * Ramdisk ioctls only apply to the master device.
1238	 */
1239	if (minor == RD_CTL_MINOR) {
1240		struct rd_ioctl *rip = (struct rd_ioctl *)arg;
1241
1242		/*
1243		 * The query commands only need read-access - i.e., normal
1244		 * users are allowed to do those on the controlling device
1245		 * as long as they can open it read-only.
1246		 */
1247		switch (cmd) {
1248		case RD_CREATE_DISK:
1249			if ((mode & FWRITE) == 0)
1250				return (EPERM);
1251			return (rd_create_disk(dev, rip, mode, rvalp));
1252
1253		case RD_DELETE_DISK:
1254			if ((mode & FWRITE) == 0)
1255				return (EPERM);
1256			return (rd_delete_disk(dev, rip, mode));
1257
1258		default:
1259			return (EINVAL);
1260		}
1261	}
1262
1263	rsp = ddi_get_soft_state(rd_statep, minor);
1264	if (rsp == NULL) {
1265		return (ENXIO);
1266	}
1267
1268	/*
1269	 * These are for faking out utilities like newfs.
1270	 */
1271	switch (cmd) {
1272	case DKIOCGVTOC:
1273		switch (ddi_model_convert_from(mode & FMODELS)) {
1274		case DDI_MODEL_ILP32: {
1275			struct vtoc32 vtoc32;
1276
1277			vtoctovtoc32(rsp->rd_vtoc, vtoc32);
1278			if (ddi_copyout(&vtoc32, (void *)arg,
1279			    sizeof (struct vtoc32), mode))
1280				return (EFAULT);
1281			}
1282			break;
1283
1284		case DDI_MODEL_NONE:
1285			if (ddi_copyout(&rsp->rd_vtoc, (void *)arg,
1286			    sizeof (struct vtoc), mode))
1287				return (EFAULT);
1288			break;
1289		}
1290		return (0);
1291	case DKIOCINFO:
1292		error = ddi_copyout(&rsp->rd_ci, (void *)arg,
1293		    sizeof (struct dk_cinfo), mode);
1294		if (error)
1295			return (EFAULT);
1296		return (0);
1297	case DKIOCG_VIRTGEOM:
1298	case DKIOCG_PHYGEOM:
1299	case DKIOCGGEOM:
1300		error = ddi_copyout(&rsp->rd_dkg, (void *)arg,
1301		    sizeof (struct dk_geom), mode);
1302		if (error)
1303			return (EFAULT);
1304		return (0);
1305	case DKIOCSTATE:
1306		/* the file is always there */
1307		dkstate = DKIO_INSERTED;
1308		error = ddi_copyout(&dkstate, (void *)arg,
1309		    sizeof (enum dkio_state), mode);
1310		if (error)
1311			return (EFAULT);
1312		return (0);
1313	default:
1314		return (ENOTTY);
1315	}
1316}
1317
1318
1319static struct cb_ops rd_cb_ops = {
1320	rd_open,
1321	rd_close,
1322	rd_strategy,
1323	nodev,
1324	nodev,		/* dump */
1325	rd_read,
1326	rd_write,
1327	rd_ioctl,
1328	nodev,		/* devmap */
1329	nodev,		/* mmap */
1330	nodev,		/* segmap */
1331	nochpoll,	/* poll */
1332	ddi_prop_op,
1333	NULL,
1334	D_NEW | D_MP
1335};
1336
1337static struct dev_ops rd_ops = {
1338	DEVO_REV,
1339	0,
1340	rd_getinfo,
1341	nulldev,	/* identify */
1342	nulldev,	/* probe */
1343	rd_attach,
1344	rd_detach,
1345	nodev,		/* reset */
1346	&rd_cb_ops,
1347	(struct bus_ops *)0,
1348	NULL,
1349	ddi_quiesce_not_needed,		/* quiesce */
1350};
1351
1352
1353extern struct mod_ops mod_driverops;
1354
1355static struct modldrv modldrv = {
1356	&mod_driverops,
1357	"ramdisk driver",
1358	&rd_ops
1359};
1360
1361static struct modlinkage modlinkage = {
1362	MODREV_1,
1363	&modldrv,
1364	0
1365};
1366
1367int
1368_init(void)
1369{
1370	int e;
1371
1372	if ((e = ddi_soft_state_init(&rd_statep,
1373	    sizeof (rd_devstate_t), 0)) != 0) {
1374		return (e);
1375	}
1376
1377	mutex_init(&rd_lock, NULL, MUTEX_DRIVER, NULL);
1378
1379	if ((e = mod_install(&modlinkage)) != 0)  {
1380		mutex_destroy(&rd_lock);
1381		ddi_soft_state_fini(&rd_statep);
1382	}
1383
1384	return (e);
1385}
1386
1387int
1388_fini(void)
1389{
1390	int e;
1391
1392	if ((e = mod_remove(&modlinkage)) != 0)  {
1393		return (e);
1394	}
1395
1396	ddi_soft_state_fini(&rd_statep);
1397	mutex_destroy(&rd_lock);
1398
1399	return (e);
1400}
1401
1402int
1403_info(struct modinfo *modinfop)
1404{
1405	return (mod_info(&modlinkage, modinfop));
1406}
1407