/*	$NetBSD: dk.c,v 1.171 2023/05/22 15:00:17 riastradh Exp $	*/

/*-
 * Copyright (c) 2004, 2005, 2006, 2007 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: dk.c,v 1.171 2023/05/22 15:00:17 riastradh Exp $");

#ifdef _KERNEL_OPT
#include "opt_dkwedge.h"
#endif

#include <sys/param.h>
#include <sys/types.h>

#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/callout.h>
#include <sys/conf.h>
#include <sys/device.h>
#include <sys/disk.h>
#include <sys/disklabel.h>
#include <sys/errno.h>
#include <sys/fcntl.h>
#include <sys/ioctl.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/stat.h>
#include <sys/systm.h>
#include <sys/vnode.h>

#include <miscfs/specfs/specdev.h>

MALLOC_DEFINE(M_DKWEDGE, "dkwedge", "Disk wedge structures");

typedef enum {
	DKW_STATE_LARVAL	= 0,
	DKW_STATE_RUNNING	= 1,
	DKW_STATE_DYING		= 2,
	DKW_STATE_DEAD		= 666
} dkwedge_state_t;
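
/*
 * Lifecycle: a wedge is created LARVAL in dkwedge_add(), becomes
 * RUNNING once dkwedge_attach() has completed, and is marked DYING
 * and finally DEAD by dkwedge_detach() just before its softc is freed.
 */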

/*
 * Lock order:
 *
 *	sc->sc_dk.dk_openlock
 *	=> sc->sc_parent->dk_rawlock
 *	=> sc->sc_parent->dk_openlock
 *	=> dkwedges_lock
 *	=> sc->sc_sizelock
 *
 * Locking notes:
 *
 *	W	dkwedges_lock
 *	D	device reference
 *	O	sc->sc_dk.dk_openlock
 *	P	sc->sc_parent->dk_openlock
 *	R	sc->sc_parent->dk_rawlock
 *	S	sc->sc_sizelock
 *	I	sc->sc_iolock
 *	$	stable after initialization
 *	1	used only by a single thread
 *
 * x&y means both x and y must be held to write (with a write lock if
 * one is an rwlock), and either x or y must be held to read.
 */

struct dkwedge_softc {
	device_t	sc_dev;	/* P&W: pointer to our pseudo-device */
		/* sc_dev is also stable while device is referenced */
	struct cfdata	sc_cfdata;	/* 1: our cfdata structure */
	uint8_t		sc_wname[128];	/* $: wedge name (Unicode, UTF-8) */

	dkwedge_state_t sc_state;	/* state this wedge is in */
		/* stable while device is referenced */
		/* used only in assertions when stable, and in dump in ddb */

	struct disk	*sc_parent;	/* $: parent disk */
		/* P: sc_parent->dk_openmask */
		/* P: sc_parent->dk_nwedges */
		/* P: sc_parent->dk_wedges */
		/* R: sc_parent->dk_rawopens */
		/* R: sc_parent->dk_rawvp (also stable while wedge is open) */
	daddr_t		sc_offset;	/* $: LBA offset of wedge in parent */
	krwlock_t	sc_sizelock;
	uint64_t	sc_size;	/* S: size of wedge in blocks */
	char		sc_ptype[32];	/* $: partition type */
	dev_t		sc_pdev;	/* $: cached parent's dev_t */
					/* P: link on parent's wedge list */
	LIST_ENTRY(dkwedge_softc) sc_plink;

	struct disk	sc_dk;		/* our own disk structure */
		/* O&R: sc_dk.dk_bopenmask */
		/* O&R: sc_dk.dk_copenmask */
		/* O&R: sc_dk.dk_openmask */
	struct bufq_state *sc_bufq;	/* $: buffer queue */
	struct callout	sc_restart_ch;	/* I: callout to restart I/O */

	kmutex_t	sc_iolock;
	bool		sc_iostop;	/* I: don't schedule restart */
	int		sc_mode;	/* O&R: parent open mode */
};
static int	dkwedge_match(device_t, cfdata_t, void *);
static void	dkwedge_attach(device_t, device_t, void *);
static int	dkwedge_detach(device_t, int);

static void	dk_set_geometry(struct dkwedge_softc *, struct disk *);

static void	dkstart(struct dkwedge_softc *);
static void	dkiodone(struct buf *);
static void	dkrestart(void *);
static void	dkminphys(struct buf *);

static int	dkfirstopen(struct dkwedge_softc *, int);
static void	dklastclose(struct dkwedge_softc *);
static void	dkwedge_delall1(struct disk *, bool);
static int	dkwedge_del1(struct dkwedge_info *, int);
static int	dk_open_parent(dev_t, int, struct vnode **);
static int	dk_close_parent(struct vnode *, int);

static dev_type_open(dkopen);
static dev_type_close(dkclose);
static dev_type_cancel(dkcancel);
static dev_type_read(dkread);
static dev_type_write(dkwrite);
static dev_type_ioctl(dkioctl);
static dev_type_strategy(dkstrategy);
static dev_type_dump(dkdump);
static dev_type_size(dksize);
static dev_type_discard(dkdiscard);

CFDRIVER_DECL(dk, DV_DISK, NULL);
CFATTACH_DECL3_NEW(dk, 0,
    dkwedge_match, dkwedge_attach, dkwedge_detach, NULL, NULL, NULL,
    DVF_DETACH_SHUTDOWN);

const struct bdevsw dk_bdevsw = {
	.d_open = dkopen,
	.d_close = dkclose,
	.d_cancel = dkcancel,
	.d_strategy = dkstrategy,
	.d_ioctl = dkioctl,
	.d_dump = dkdump,
	.d_psize = dksize,
	.d_discard = dkdiscard,
	.d_cfdriver = &dk_cd,
	.d_devtounit = dev_minor_unit,
	.d_flag = D_DISK | D_MPSAFE
};

const struct cdevsw dk_cdevsw = {
	.d_open = dkopen,
	.d_close = dkclose,
	.d_cancel = dkcancel,
	.d_read = dkread,
	.d_write = dkwrite,
	.d_ioctl = dkioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = dkdiscard,
	.d_cfdriver = &dk_cd,
	.d_devtounit = dev_minor_unit,
	.d_flag = D_DISK | D_MPSAFE
};
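
/*
 * Both switch tables above route to the same entry points; dk(4) is
 * tagged D_DISK | D_MPSAFE, so these functions are entered without
 * the kernel lock.
 */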

static struct dkwedge_softc **dkwedges;
static u_int ndkwedges;
static krwlock_t dkwedges_lock;

static LIST_HEAD(, dkwedge_discovery_method) dkwedge_discovery_methods;
static krwlock_t dkwedge_discovery_methods_lock;

/*
 * dkwedge_match:
 *
 *	Autoconfiguration match function for pseudo-device glue.
 */
static int
dkwedge_match(device_t parent, cfdata_t match, void *aux)
{

	/* Pseudo-device; always present. */
	return 1;
}

/*
 * dkwedge_attach:
 *
 *	Autoconfiguration attach function for pseudo-device glue.
 */
static void
dkwedge_attach(device_t parent, device_t self, void *aux)
{
	struct dkwedge_softc *sc = aux;
	struct disk *pdk = sc->sc_parent;
	int unit = device_unit(self);

	KASSERTMSG(unit >= 0, "unit=%d", unit);

	if (!pmf_device_register(self, NULL, NULL))
		aprint_error_dev(self, "couldn't establish power handler\n");

	mutex_enter(&pdk->dk_openlock);
	rw_enter(&dkwedges_lock, RW_WRITER);
	KASSERTMSG(unit < ndkwedges, "unit=%d ndkwedges=%u", unit, ndkwedges);
	KASSERTMSG(sc == dkwedges[unit], "sc=%p dkwedges[%d]=%p",
	    sc, unit, dkwedges[unit]);
	KASSERTMSG(sc->sc_dev == NULL, "sc=%p sc->sc_dev=%p", sc, sc->sc_dev);
	sc->sc_dev = self;
	rw_exit(&dkwedges_lock);
	mutex_exit(&pdk->dk_openlock);

	disk_init(&sc->sc_dk, device_xname(sc->sc_dev), NULL);
	mutex_enter(&pdk->dk_openlock);
	dk_set_geometry(sc, pdk);
	mutex_exit(&pdk->dk_openlock);
	disk_attach(&sc->sc_dk);

	/* Disk wedge is ready for use! */
	device_set_private(self, sc);
	sc->sc_state = DKW_STATE_RUNNING;
}

/*
 * dkwedge_compute_pdev:
 *
 *	Compute the parent disk's dev_t.
 */
static int
dkwedge_compute_pdev(const char *pname, dev_t *pdevp, enum vtype type)
{
	const char *name, *cp;
	devmajor_t pmaj;
	int punit;
	char devname[16];

	name = pname;
	switch (type) {
	case VBLK:
		pmaj = devsw_name2blk(name, devname, sizeof(devname));
		break;
	case VCHR:
		pmaj = devsw_name2chr(name, devname, sizeof(devname));
		break;
	default:
		pmaj = NODEVMAJOR;
		break;
	}
	if (pmaj == NODEVMAJOR)
		return ENXIO;

	name += strlen(devname);
	for (cp = name, punit = 0; *cp >= '0' && *cp <= '9'; cp++)
		punit = (punit * 10) + (*cp - '0');
	if (cp == name) {
		/* Invalid parent disk name. */
		return ENXIO;
	}

	*pdevp = MAKEDISKDEV(pmaj, punit, RAW_PART);

	return 0;
}
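
/*
 * For example (illustrative): given pname "wd0" and type VBLK,
 * devsw_name2blk() stores "wd" in devname and returns wd's block
 * major; the digit loop then parses punit = 0, so *pdevp becomes
 * MAKEDISKDEV(pmaj, 0, RAW_PART), the parent's raw partition.
 */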

/*
 * dkwedge_array_expand:
 *
 *	Expand the dkwedges array.
 *
 *	Releases and reacquires dkwedges_lock as a writer.
 */
static int
dkwedge_array_expand(void)
{

	const unsigned incr = 16;
	unsigned newcnt, oldcnt;
	struct dkwedge_softc **newarray = NULL, **oldarray = NULL;

	KASSERT(rw_write_held(&dkwedges_lock));

	oldcnt = ndkwedges;
	oldarray = dkwedges;

	if (oldcnt >= INT_MAX - incr)
		return ENFILE;	/* XXX */
	newcnt = oldcnt + incr;

	rw_exit(&dkwedges_lock);
	newarray = malloc(newcnt * sizeof(*newarray), M_DKWEDGE,
	    M_WAITOK|M_ZERO);
	rw_enter(&dkwedges_lock, RW_WRITER);

	if (ndkwedges != oldcnt || dkwedges != oldarray) {
		oldarray = NULL; /* already recycled */
		goto out;
	}

	if (oldarray != NULL)
		memcpy(newarray, dkwedges, ndkwedges * sizeof(*newarray));
	dkwedges = newarray;
	newarray = NULL;	/* transferred to dkwedges */
	ndkwedges = newcnt;

out:	rw_exit(&dkwedges_lock);
	if (oldarray != NULL)
		free(oldarray, M_DKWEDGE);
	if (newarray != NULL)
		free(newarray, M_DKWEDGE);
	rw_enter(&dkwedges_lock, RW_WRITER);
	return 0;
}
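
/*
 * Note that the lock is dropped around the M_WAITOK allocation, so
 * another thread may have grown or replaced the array in the
 * meantime; in that case the fresh allocation is simply freed and
 * the caller (the retry loop in dkwedge_add()) rescans for a slot.
 */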

static void
dkwedge_size_init(struct dkwedge_softc *sc, uint64_t size)
{

	rw_init(&sc->sc_sizelock);
	sc->sc_size = size;
}

static void
dkwedge_size_fini(struct dkwedge_softc *sc)
{

	rw_destroy(&sc->sc_sizelock);
}

static uint64_t
dkwedge_size(struct dkwedge_softc *sc)
{
	uint64_t size;

	rw_enter(&sc->sc_sizelock, RW_READER);
	size = sc->sc_size;
	rw_exit(&sc->sc_sizelock);

	return size;
}

static void
dkwedge_size_increase(struct dkwedge_softc *sc, uint64_t size)
{

	KASSERT(mutex_owned(&sc->sc_parent->dk_openlock));

	rw_enter(&sc->sc_sizelock, RW_WRITER);
	KASSERTMSG(size >= sc->sc_size,
	    "decreasing dkwedge size from %"PRIu64" to %"PRIu64,
	    sc->sc_size, size);
	sc->sc_size = size;
	rw_exit(&sc->sc_sizelock);
}

static void
dk_set_geometry(struct dkwedge_softc *sc, struct disk *pdk)
{
	struct disk *dk = &sc->sc_dk;
	struct disk_geom *dg = &dk->dk_geom;

	KASSERT(mutex_owned(&pdk->dk_openlock));

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = dkwedge_size(sc);
	dg->dg_secsize = DEV_BSIZE << pdk->dk_blkshift;

	/* fake numbers, 1 cylinder is 1 MB with default sector size */
	dg->dg_nsectors = 32;
	dg->dg_ntracks = 64;
	dg->dg_ncylinders =
	    dg->dg_secperunit / (dg->dg_nsectors * dg->dg_ntracks);

	disk_set_info(sc->sc_dev, dk, NULL);
}
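
/*
 * With the default DEV_BSIZE of 512 bytes (dk_blkshift == 0), the
 * fake geometry above works out to 32 * 64 = 2048 sectors per
 * cylinder, i.e. exactly 1 MB per cylinder as the comment promises.
 */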

/*
 * dkwedge_add:		[exported function]
 *
 *	Add a disk wedge based on the provided information.
 *
 *	The incoming dkw_devname[] is ignored, instead being
 *	filled in and returned to the caller.
 */
int
dkwedge_add(struct dkwedge_info *dkw)
{
	struct dkwedge_softc *sc, *lsc;
	struct disk *pdk;
	u_int unit;
	int error;
	dev_t pdev;
	device_t dev __diagused;

	dkw->dkw_parent[sizeof(dkw->dkw_parent) - 1] = '\0';
	pdk = disk_find(dkw->dkw_parent);
	if (pdk == NULL)
		return ENXIO;

	error = dkwedge_compute_pdev(pdk->dk_name, &pdev, VBLK);
	if (error)
		return error;

	if (dkw->dkw_offset < 0)
		return EINVAL;

	/*
	 * Check for an existing wedge at the same disk offset. Allow
	 * updating a wedge if the only change is the size, and the new
	 * size is larger than the old.
	 */
	sc = NULL;
	mutex_enter(&pdk->dk_openlock);
	LIST_FOREACH(lsc, &pdk->dk_wedges, sc_plink) {
		if (lsc->sc_offset != dkw->dkw_offset)
			continue;
		if (strcmp(lsc->sc_wname, dkw->dkw_wname) != 0)
			break;
		if (strcmp(lsc->sc_ptype, dkw->dkw_ptype) != 0)
			break;
		if (dkwedge_size(lsc) > dkw->dkw_size)
			break;
		if (lsc->sc_dev == NULL)
			break;

		sc = lsc;
		device_acquire(sc->sc_dev);
		dkwedge_size_increase(sc, dkw->dkw_size);
		dk_set_geometry(sc, pdk);

		break;
	}
	mutex_exit(&pdk->dk_openlock);

	if (sc != NULL)
		goto announce;

	sc = malloc(sizeof(*sc), M_DKWEDGE, M_WAITOK|M_ZERO);
	sc->sc_state = DKW_STATE_LARVAL;
	sc->sc_parent = pdk;
	sc->sc_pdev = pdev;
	sc->sc_offset = dkw->dkw_offset;
	dkwedge_size_init(sc, dkw->dkw_size);

	memcpy(sc->sc_wname, dkw->dkw_wname, sizeof(sc->sc_wname));
	sc->sc_wname[sizeof(sc->sc_wname) - 1] = '\0';

	memcpy(sc->sc_ptype, dkw->dkw_ptype, sizeof(sc->sc_ptype));
	sc->sc_ptype[sizeof(sc->sc_ptype) - 1] = '\0';

	bufq_alloc(&sc->sc_bufq, "fcfs", 0);

	callout_init(&sc->sc_restart_ch, 0);
	callout_setfunc(&sc->sc_restart_ch, dkrestart, sc);

	mutex_init(&sc->sc_iolock, MUTEX_DEFAULT, IPL_BIO);

	/*
	 * Wedge will be added; increment the wedge count for the parent.
	 * Only allow this to happen if RAW_PART is the only thing open.
	 */
	mutex_enter(&pdk->dk_openlock);
	if (pdk->dk_openmask & ~(1 << RAW_PART))
		error = EBUSY;
	else {
		/* Check for wedge overlap. */
		LIST_FOREACH(lsc, &pdk->dk_wedges, sc_plink) {
			/* XXX arithmetic overflow */
			uint64_t size = dkwedge_size(sc);
			uint64_t lsize = dkwedge_size(lsc);
			daddr_t lastblk = sc->sc_offset + size - 1;
			daddr_t llastblk = lsc->sc_offset + lsize - 1;

			if (sc->sc_offset >= lsc->sc_offset &&
			    sc->sc_offset <= llastblk) {
				/* Overlaps the tail of the existing wedge. */
				break;
			}
			if (lastblk >= lsc->sc_offset &&
			    lastblk <= llastblk) {
				/* Overlaps the head of the existing wedge. */
				break;
			}
		}
		if (lsc != NULL) {
			if (sc->sc_offset == lsc->sc_offset &&
			    dkwedge_size(sc) == dkwedge_size(lsc) &&
			    strcmp(sc->sc_wname, lsc->sc_wname) == 0)
				error = EEXIST;
			else
				error = EINVAL;
		} else {
			pdk->dk_nwedges++;
			LIST_INSERT_HEAD(&pdk->dk_wedges, sc, sc_plink);
		}
	}
	mutex_exit(&pdk->dk_openlock);
	if (error) {
		mutex_destroy(&sc->sc_iolock);
		bufq_free(sc->sc_bufq);
		dkwedge_size_fini(sc);
		free(sc, M_DKWEDGE);
		return error;
	}

	/* Fill in our cfdata for the pseudo-device glue. */
	sc->sc_cfdata.cf_name = dk_cd.cd_name;
	sc->sc_cfdata.cf_atname = dk_ca.ca_name;
	/* sc->sc_cfdata.cf_unit set below */
	sc->sc_cfdata.cf_fstate = FSTATE_NOTFOUND; /* use chosen cf_unit */

	/* Insert the larval wedge into the array. */
	rw_enter(&dkwedges_lock, RW_WRITER);
	for (error = 0;;) {
		struct dkwedge_softc **scpp;

		/*
		 * Check for a duplicate wname while searching for
		 * a slot.
		 */
		for (scpp = NULL, unit = 0; unit < ndkwedges; unit++) {
			if (dkwedges[unit] == NULL) {
				if (scpp == NULL) {
					scpp = &dkwedges[unit];
					sc->sc_cfdata.cf_unit = unit;
				}
			} else {
				/* XXX Unicode. */
				if (strcmp(dkwedges[unit]->sc_wname,
					sc->sc_wname) == 0) {
					error = EEXIST;
					break;
				}
			}
		}
		if (error)
			break;
		KASSERT(unit == ndkwedges);
		if (scpp == NULL) {
			error = dkwedge_array_expand();
			if (error)
				break;
		} else {
			KASSERT(scpp == &dkwedges[sc->sc_cfdata.cf_unit]);
			*scpp = sc;
			break;
		}
	}
	rw_exit(&dkwedges_lock);
	if (error) {
		mutex_enter(&pdk->dk_openlock);
		pdk->dk_nwedges--;
		LIST_REMOVE(sc, sc_plink);
		mutex_exit(&pdk->dk_openlock);

		mutex_destroy(&sc->sc_iolock);
		bufq_free(sc->sc_bufq);
		dkwedge_size_fini(sc);
		free(sc, M_DKWEDGE);
		return error;
	}

	/*
	 * Now that we know the unit #, attach a pseudo-device for
	 * this wedge instance.  This will provide us with the
	 * device_t necessary for glue to other parts of the system.
	 *
	 * This should never fail, unless we're almost totally out of
	 * memory.
	 */
	if ((dev = config_attach_pseudo_acquire(&sc->sc_cfdata, sc)) == NULL) {
		aprint_error("%s%u: unable to attach pseudo-device\n",
		    sc->sc_cfdata.cf_name, sc->sc_cfdata.cf_unit);

		rw_enter(&dkwedges_lock, RW_WRITER);
		KASSERT(dkwedges[sc->sc_cfdata.cf_unit] == sc);
		dkwedges[sc->sc_cfdata.cf_unit] = NULL;
		rw_exit(&dkwedges_lock);

		mutex_enter(&pdk->dk_openlock);
		pdk->dk_nwedges--;
		LIST_REMOVE(sc, sc_plink);
		mutex_exit(&pdk->dk_openlock);

		mutex_destroy(&sc->sc_iolock);
		bufq_free(sc->sc_bufq);
		dkwedge_size_fini(sc);
		free(sc, M_DKWEDGE);
		return ENOMEM;
	}

	KASSERT(dev == sc->sc_dev);

announce:
	/* Announce our arrival. */
	aprint_normal(
	    "%s at %s: \"%s\", %"PRIu64" blocks at %"PRId64", type: %s\n",
	    device_xname(sc->sc_dev), pdk->dk_name,
	    sc->sc_wname,	/* XXX Unicode */
	    dkwedge_size(sc), sc->sc_offset,
	    sc->sc_ptype[0] == '\0' ? "<unknown>" : sc->sc_ptype);

	/* Return the devname to the caller. */
	strlcpy(dkw->dkw_devname, device_xname(sc->sc_dev),
	    sizeof(dkw->dkw_devname));

	device_release(sc->sc_dev);
	return 0;
}
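
/*
 * Illustrative sketch of an in-kernel caller (hypothetical values;
 * the parent name, wedge name, offset, and size are made up):
 *
 *	struct dkwedge_info dkw;
 *	int error;
 *
 *	memset(&dkw, 0, sizeof(dkw));
 *	strlcpy(dkw.dkw_parent, "wd0", sizeof(dkw.dkw_parent));
 *	strlcpy((char *)dkw.dkw_wname, "scratch", sizeof(dkw.dkw_wname));
 *	strlcpy(dkw.dkw_ptype, DKW_PTYPE_FFS, sizeof(dkw.dkw_ptype));
 *	dkw.dkw_offset = 64;
 *	dkw.dkw_size = 2048;
 *	error = dkwedge_add(&dkw);
 *
 * On success, dkw.dkw_devname has been filled in with the dkN name
 * that was assigned.
 */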

/*
 * dkwedge_find_acquire:
 *
 *	Lookup a disk wedge based on the provided information.
 *	NOTE: We look up the wedge based on the wedge devname,
 *	not wname.
 *
 *	Return NULL if the wedge is not found, otherwise return
 *	the wedge's softc.  Assign the wedge's unit number to unitp
 *	if unitp is not NULL.  The wedge's sc_dev is referenced and
 *	must be released by device_release or equivalent.
 */
static struct dkwedge_softc *
dkwedge_find_acquire(struct dkwedge_info *dkw, u_int *unitp)
{
	struct dkwedge_softc *sc = NULL;
	u_int unit;

	/* Find our softc. */
	dkw->dkw_devname[sizeof(dkw->dkw_devname) - 1] = '\0';
	rw_enter(&dkwedges_lock, RW_READER);
	for (unit = 0; unit < ndkwedges; unit++) {
		if ((sc = dkwedges[unit]) != NULL &&
		    sc->sc_dev != NULL &&
		    strcmp(device_xname(sc->sc_dev), dkw->dkw_devname) == 0 &&
		    strcmp(sc->sc_parent->dk_name, dkw->dkw_parent) == 0) {
			device_acquire(sc->sc_dev);
			break;
		}
	}
	rw_exit(&dkwedges_lock);
	if (unit == ndkwedges)	/* no match; sc may be a stale pointer */
		return NULL;

	if (unitp != NULL)
		*unitp = unit;

	return sc;
}

/*
 * dkwedge_del:		[exported function]
 *
 *	Delete a disk wedge based on the provided information.
 *	NOTE: We look up the wedge based on the wedge devname,
 *	not wname.
 */
int
dkwedge_del(struct dkwedge_info *dkw)
{

	return dkwedge_del1(dkw, 0);
}

int
dkwedge_del1(struct dkwedge_info *dkw, int flags)
{
	struct dkwedge_softc *sc = NULL;

	/* Find our softc. */
	if ((sc = dkwedge_find_acquire(dkw, NULL)) == NULL)
		return ESRCH;

	return config_detach_release(sc->sc_dev, flags);
}

/*
 * dkwedge_detach:
 *
 *	Autoconfiguration detach function for pseudo-device glue.
 */
static int
dkwedge_detach(device_t self, int flags)
{
	struct dkwedge_softc *const sc = device_private(self);
	const u_int unit = device_unit(self);
	int bmaj, cmaj, error;

	error = disk_begindetach(&sc->sc_dk, /*lastclose*/NULL, self, flags);
	if (error)
		return error;

	/* Mark the wedge as dying. */
	sc->sc_state = DKW_STATE_DYING;

	pmf_device_deregister(self);

	/* Kill any pending restart. */
	mutex_enter(&sc->sc_iolock);
	sc->sc_iostop = true;
	mutex_exit(&sc->sc_iolock);
	callout_halt(&sc->sc_restart_ch, NULL);

	/* Locate the wedge major numbers. */
	bmaj = bdevsw_lookup_major(&dk_bdevsw);
	cmaj = cdevsw_lookup_major(&dk_cdevsw);

	/* Nuke the vnodes for any open instances. */
	vdevgone(bmaj, unit, unit, VBLK);
	vdevgone(cmaj, unit, unit, VCHR);

	/*
	 * At this point, all block device opens have been closed,
	 * synchronously flushing any buffered writes; and all
	 * character device I/O operations have completed
	 * synchronously, and character device opens have been closed.
	 *
	 * So there can be no more opens or queued buffers by now.
	 */
	KASSERT(sc->sc_dk.dk_openmask == 0);
	KASSERT(bufq_peek(sc->sc_bufq) == NULL);
	bufq_drain(sc->sc_bufq);

	/* Announce our departure. */
	aprint_normal("%s at %s (%s) deleted\n", device_xname(sc->sc_dev),
	    sc->sc_parent->dk_name,
	    sc->sc_wname);	/* XXX Unicode */

	mutex_enter(&sc->sc_parent->dk_openlock);
	sc->sc_parent->dk_nwedges--;
	LIST_REMOVE(sc, sc_plink);
	mutex_exit(&sc->sc_parent->dk_openlock);

	/* Delete our buffer queue. */
	bufq_free(sc->sc_bufq);

	/* Detach from the disk list. */
	disk_detach(&sc->sc_dk);
	disk_destroy(&sc->sc_dk);

	/* Poof. */
	rw_enter(&dkwedges_lock, RW_WRITER);
	KASSERT(dkwedges[unit] == sc);
	dkwedges[unit] = NULL;
	sc->sc_state = DKW_STATE_DEAD;
	rw_exit(&dkwedges_lock);

	mutex_destroy(&sc->sc_iolock);
	dkwedge_size_fini(sc);

	free(sc, M_DKWEDGE);

	return 0;
}

/*
 * dkwedge_delall:	[exported function]
 *
 *	Forcibly delete all of the wedges on the specified disk.  Used
 *	when a disk is being detached.
 */
void
dkwedge_delall(struct disk *pdk)
{

	dkwedge_delall1(pdk, /*idleonly*/false);
}

/*
 * dkwedge_delidle:	[exported function]
 *
 *	Delete all of the wedges on the specified disk if idle.  Used
 *	by ioctl(DIOCRMWEDGES).
 */
void
dkwedge_delidle(struct disk *pdk)
{

	dkwedge_delall1(pdk, /*idleonly*/true);
}

static void
dkwedge_delall1(struct disk *pdk, bool idleonly)
{
	struct dkwedge_softc *sc;
	int flags;

	flags = DETACH_QUIET;
	if (!idleonly)
		flags |= DETACH_FORCE;

	for (;;) {
		mutex_enter(&pdk->dk_rawlock); /* for sc->sc_dk.dk_openmask */
		mutex_enter(&pdk->dk_openlock);
		LIST_FOREACH(sc, &pdk->dk_wedges, sc_plink) {
			/*
			 * Wedge is not yet created.  This is a race --
			 * it may as well have been added just after we
			 * deleted all the wedges, so pretend it's not
			 * here yet.
			 */
			if (sc->sc_dev == NULL)
				continue;
			if (!idleonly || sc->sc_dk.dk_openmask == 0) {
				device_acquire(sc->sc_dev);
				break;
			}
		}
		if (sc == NULL) {
			KASSERT(idleonly || pdk->dk_nwedges == 0);
			mutex_exit(&pdk->dk_openlock);
			mutex_exit(&pdk->dk_rawlock);
			return;
		}
		mutex_exit(&pdk->dk_openlock);
		mutex_exit(&pdk->dk_rawlock);
		(void)config_detach_release(sc->sc_dev, flags);
	}
}

/*
 * dkwedge_list:	[exported function]
 *
 *	List all of the wedges on a particular disk.
 */
int
dkwedge_list(struct disk *pdk, struct dkwedge_list *dkwl, struct lwp *l)
{
	struct uio uio;
	struct iovec iov;
	struct dkwedge_softc *sc;
	struct dkwedge_info dkw;
	int error = 0;

	iov.iov_base = dkwl->dkwl_buf;
	iov.iov_len = dkwl->dkwl_bufsize;

	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = 0;
	uio.uio_resid = dkwl->dkwl_bufsize;
	uio.uio_rw = UIO_READ;
	KASSERT(l == curlwp);
	uio.uio_vmspace = l->l_proc->p_vmspace;

	dkwl->dkwl_ncopied = 0;

	mutex_enter(&pdk->dk_openlock);
	LIST_FOREACH(sc, &pdk->dk_wedges, sc_plink) {
		if (uio.uio_resid < sizeof(dkw))
			break;

		if (sc->sc_dev == NULL)
			continue;

		strlcpy(dkw.dkw_devname, device_xname(sc->sc_dev),
		    sizeof(dkw.dkw_devname));
		memcpy(dkw.dkw_wname, sc->sc_wname, sizeof(dkw.dkw_wname));
		dkw.dkw_wname[sizeof(dkw.dkw_wname) - 1] = '\0';
		strlcpy(dkw.dkw_parent, sc->sc_parent->dk_name,
		    sizeof(dkw.dkw_parent));
		dkw.dkw_offset = sc->sc_offset;
		dkw.dkw_size = dkwedge_size(sc);
		strlcpy(dkw.dkw_ptype, sc->sc_ptype, sizeof(dkw.dkw_ptype));

		/*
		 * Acquire a device reference so this wedge doesn't go
		 * away before our next iteration in LIST_FOREACH, and
		 * then release the lock for uiomove.
		 */
		device_acquire(sc->sc_dev);
		mutex_exit(&pdk->dk_openlock);
		error = uiomove(&dkw, sizeof(dkw), &uio);
		mutex_enter(&pdk->dk_openlock);
		device_release(sc->sc_dev);
		if (error)
			break;

		dkwl->dkwl_ncopied++;
	}
	dkwl->dkwl_nwedges = pdk->dk_nwedges;
	mutex_exit(&pdk->dk_openlock);

	return error;
}

static device_t
dkwedge_find_by_wname_acquire(const char *wname)
{
	device_t dv = NULL;
	struct dkwedge_softc *sc;
	int i;

	rw_enter(&dkwedges_lock, RW_READER);
	for (i = 0; i < ndkwedges; i++) {
		if ((sc = dkwedges[i]) == NULL || sc->sc_dev == NULL)
			continue;
		if (strcmp(sc->sc_wname, wname) == 0) {
			if (dv != NULL) {
				printf(
				    "WARNING: double match for wedge name %s "
				    "(%s, %s)\n", wname, device_xname(dv),
				    device_xname(sc->sc_dev));
				continue;
			}
			device_acquire(sc->sc_dev);
			dv = sc->sc_dev;
		}
	}
	rw_exit(&dkwedges_lock);
	return dv;
}

static device_t
dkwedge_find_by_parent_acquire(const char *name, size_t *i)
{

	rw_enter(&dkwedges_lock, RW_READER);
	for (; *i < (size_t)ndkwedges; (*i)++) {
		struct dkwedge_softc *sc;
		if ((sc = dkwedges[*i]) == NULL || sc->sc_dev == NULL)
			continue;
		if (strcmp(sc->sc_parent->dk_name, name) != 0)
			continue;
		device_acquire(sc->sc_dev);
		rw_exit(&dkwedges_lock);
		return sc->sc_dev;
	}
	rw_exit(&dkwedges_lock);
	return NULL;
}

/* XXX unsafe */
device_t
dkwedge_find_by_wname(const char *wname)
{
	device_t dv;

	if ((dv = dkwedge_find_by_wname_acquire(wname)) == NULL)
		return NULL;
	device_release(dv);
	return dv;
}

/* XXX unsafe */
device_t
dkwedge_find_by_parent(const char *name, size_t *i)
{
	device_t dv;

	if ((dv = dkwedge_find_by_parent_acquire(name, i)) == NULL)
		return NULL;
	device_release(dv);
	return dv;
}

void
dkwedge_print_wnames(void)
{
	struct dkwedge_softc *sc;
	int i;

	rw_enter(&dkwedges_lock, RW_READER);
	for (i = 0; i < ndkwedges; i++) {
		if ((sc = dkwedges[i]) == NULL || sc->sc_dev == NULL)
			continue;
		printf(" wedge:%s", sc->sc_wname);
	}
	rw_exit(&dkwedges_lock);
}

/*
 * We need a dummy object to stuff into the dkwedge discovery method link
 * set to ensure that there is always at least one object in the set.
 */
static struct dkwedge_discovery_method dummy_discovery_method;
__link_set_add_bss(dkwedge_methods, dummy_discovery_method);

/*
 * dkwedge_init:
 *
 *	Initialize the disk wedge subsystem.
 */
void
dkwedge_init(void)
{
	__link_set_decl(dkwedge_methods, struct dkwedge_discovery_method);
	struct dkwedge_discovery_method * const *ddmp;
	struct dkwedge_discovery_method *lddm, *ddm;

	rw_init(&dkwedges_lock);
	rw_init(&dkwedge_discovery_methods_lock);

	if (config_cfdriver_attach(&dk_cd) != 0)
		panic("dkwedge: unable to attach cfdriver");
	if (config_cfattach_attach(dk_cd.cd_name, &dk_ca) != 0)
		panic("dkwedge: unable to attach cfattach");

	rw_enter(&dkwedge_discovery_methods_lock, RW_WRITER);

	LIST_INIT(&dkwedge_discovery_methods);

	__link_set_foreach(ddmp, dkwedge_methods) {
		ddm = *ddmp;
		if (ddm == &dummy_discovery_method)
			continue;
		if (LIST_EMPTY(&dkwedge_discovery_methods)) {
			LIST_INSERT_HEAD(&dkwedge_discovery_methods,
			    ddm, ddm_list);
			continue;
		}
		LIST_FOREACH(lddm, &dkwedge_discovery_methods, ddm_list) {
			if (ddm->ddm_priority == lddm->ddm_priority) {
				aprint_error("dk-method-%s: method \"%s\" "
				    "already exists at priority %d\n",
				    ddm->ddm_name, lddm->ddm_name,
				    lddm->ddm_priority);
				/* Not inserted. */
				break;
			}
			if (ddm->ddm_priority < lddm->ddm_priority) {
				/* Higher priority; insert before. */
				LIST_INSERT_BEFORE(lddm, ddm, ddm_list);
				break;
			}
			if (LIST_NEXT(lddm, ddm_list) == NULL) {
				/* Last one; insert after. */
				KASSERT(lddm->ddm_priority < ddm->ddm_priority);
				LIST_INSERT_AFTER(lddm, ddm, ddm_list);
				break;
			}
		}
	}

	rw_exit(&dkwedge_discovery_methods_lock);
}
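
/*
 * The loop above keeps dkwedge_discovery_methods sorted by ascending
 * ddm_priority, so dkwedge_discover() tries the method with the
 * lowest priority value first and stops at the first one that
 * succeeds.
 */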

#ifdef DKWEDGE_AUTODISCOVER
int	dkwedge_autodiscover = 1;
#else
int	dkwedge_autodiscover = 0;
#endif

/*
 * dkwedge_discover:	[exported function]
 *
 *	Discover the wedges on a newly attached disk.
 *	Remove all unused wedges on the disk first.
 */
void
dkwedge_discover(struct disk *pdk)
{
	struct dkwedge_discovery_method *ddm;
	struct vnode *vp;
	int error;
	dev_t pdev;

	/*
	 * Require people playing with wedges to enable this explicitly.
	 */
	if (dkwedge_autodiscover == 0)
		return;

	rw_enter(&dkwedge_discovery_methods_lock, RW_READER);

	/*
	 * Use the character device for scanning; the block device
	 * would be busy if there are already wedges attached.
	 */
	error = dkwedge_compute_pdev(pdk->dk_name, &pdev, VCHR);
	if (error) {
		aprint_error("%s: unable to compute pdev, error = %d\n",
		    pdk->dk_name, error);
		goto out;
	}

	error = cdevvp(pdev, &vp);
	if (error) {
		aprint_error("%s: unable to find vnode for pdev, error = %d\n",
		    pdk->dk_name, error);
		goto out;
	}

	error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (error) {
		aprint_error("%s: unable to lock vnode for pdev, error = %d\n",
		    pdk->dk_name, error);
		vrele(vp);
		goto out;
	}

	error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);
	if (error) {
		if (error != ENXIO)
			aprint_error("%s: unable to open device, error = %d\n",
			    pdk->dk_name, error);
		vput(vp);
		goto out;
	}
	VOP_UNLOCK(vp);

	/*
	 * Remove unused wedges.
	 */
	dkwedge_delidle(pdk);

	/*
	 * For each supported partition map type, look to see if
	 * this map type exists.  If so, parse it and add the
	 * corresponding wedges.
	 */
	LIST_FOREACH(ddm, &dkwedge_discovery_methods, ddm_list) {
		error = (*ddm->ddm_discover)(pdk, vp);
		if (error == 0) {
			/* Successfully created wedges; we're done. */
			break;
		}
	}

	error = vn_close(vp, FREAD, NOCRED);
	if (error) {
		aprint_error("%s: unable to close device, error = %d\n",
		    pdk->dk_name, error);
		/* We'll just assume the vnode has been cleaned up. */
	}

out:
	rw_exit(&dkwedge_discovery_methods_lock);
}

/*
 * dkwedge_read:
 *
 *	Read some data from the specified disk, used for
 *	partition discovery.
 */
int
dkwedge_read(struct disk *pdk, struct vnode *vp, daddr_t blkno,
    void *tbuf, size_t len)
{
	buf_t *bp;
	int error;
	bool isopen;
	dev_t bdev;
	struct vnode *bdvp;

	/*
	 * The kernel cannot read from a character device vnode
	 * as physio() only handles user memory.
	 *
	 * If the block device has already been opened by a wedge,
	 * use that vnode and temporarily bump the open counter.
	 *
	 * Otherwise try to open the block device.
	 */

	bdev = devsw_chr2blk(vp->v_rdev);

	mutex_enter(&pdk->dk_rawlock);
	if (pdk->dk_rawopens != 0) {
		KASSERT(pdk->dk_rawvp != NULL);
		isopen = true;
		++pdk->dk_rawopens;
		bdvp = pdk->dk_rawvp;
		error = 0;
	} else {
		isopen = false;
		error = dk_open_parent(bdev, FREAD, &bdvp);
	}
	mutex_exit(&pdk->dk_rawlock);

	if (error)
		return error;

	bp = getiobuf(bdvp, true);
	bp->b_flags = B_READ;
	bp->b_cflags = BC_BUSY;
	bp->b_dev = bdev;
	bp->b_data = tbuf;
	bp->b_bufsize = bp->b_bcount = len;
	bp->b_blkno = blkno;
	bp->b_cylinder = 0;
	bp->b_error = 0;

	VOP_STRATEGY(bdvp, bp);
	error = biowait(bp);
	putiobuf(bp);

	mutex_enter(&pdk->dk_rawlock);
	if (isopen) {
		--pdk->dk_rawopens;
	} else {
		dk_close_parent(bdvp, FREAD);
	}
	mutex_exit(&pdk->dk_rawlock);

	return error;
}
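
/*
 * A discovery method typically calls this from its ddm_discover hook
 * to fetch on-disk metadata before parsing it for wedges, e.g.
 * (illustrative sketch; blkno is in DEV_BSIZE units):
 *
 *	char *buf = malloc(DEV_BSIZE, M_DKWEDGE, M_WAITOK);
 *	error = dkwedge_read(pdk, vp, 1, buf, DEV_BSIZE);
 *
 * See the individual discovery methods for real usage.
 */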

/*
 * dkwedge_lookup:
 *
 *	Look up a dkwedge_softc based on the provided dev_t.
 *
 *	Caller must guarantee the wedge is referenced.
 */
static struct dkwedge_softc *
dkwedge_lookup(dev_t dev)
{

	return device_lookup_private(&dk_cd, minor(dev));
}

static struct dkwedge_softc *
dkwedge_lookup_acquire(dev_t dev)
{
	device_t dv = device_lookup_acquire(&dk_cd, minor(dev));

	if (dv == NULL)
		return NULL;
	return device_private(dv);
}

static int
dk_open_parent(dev_t dev, int mode, struct vnode **vpp)
{
	struct vnode *vp;
	int error;

	error = bdevvp(dev, &vp);
	if (error)
		return error;

	error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (error) {
		vrele(vp);
		return error;
	}
	error = VOP_OPEN(vp, mode, NOCRED);
	if (error) {
		vput(vp);
		return error;
	}

	/* VOP_OPEN() doesn't do this for us. */
	if (mode & FWRITE) {
		mutex_enter(vp->v_interlock);
		vp->v_writecount++;
		mutex_exit(vp->v_interlock);
	}

	VOP_UNLOCK(vp);

	*vpp = vp;

	return 0;
}

static int
dk_close_parent(struct vnode *vp, int mode)
{
	int error;

	error = vn_close(vp, mode, NOCRED);
	return error;
}

/*
 * dkopen:		[devsw entry point]
 *
 *	Open a wedge.
 */
static int
dkopen(dev_t dev, int flags, int fmt, struct lwp *l)
{
	struct dkwedge_softc *sc = dkwedge_lookup(dev);
	int error = 0;

	if (sc == NULL)
		return ENXIO;
	KASSERT(sc->sc_dev != NULL);
	KASSERT(sc->sc_state == DKW_STATE_RUNNING);

	/*
	 * We go through a complicated little dance to only open the parent
	 * vnode once per wedge, no matter how many times the wedge is
	 * opened.  The reason?  We see one dkopen() per open call, but
	 * only one dkclose() on the last close.
	 */
	mutex_enter(&sc->sc_dk.dk_openlock);
	mutex_enter(&sc->sc_parent->dk_rawlock);
	if (sc->sc_dk.dk_openmask == 0) {
		error = dkfirstopen(sc, flags);
		if (error)
			goto out;
	} else if (flags & ~sc->sc_mode & FWRITE) {
		/*
		 * The parent is already open, but the previous attempt
		 * to open it read/write failed and fell back to
		 * read-only.  In that case, we assume the medium is
		 * read-only and fail to open the wedge read/write.
		 */
		error = EROFS;
		goto out;
	}
	KASSERT(sc->sc_mode != 0);
	KASSERTMSG(sc->sc_mode & FREAD, "%s: sc_mode=%x",
	    device_xname(sc->sc_dev), sc->sc_mode);
	KASSERTMSG((flags & FWRITE) ? (sc->sc_mode & FWRITE) : 1,
	    "%s: flags=%x sc_mode=%x",
	    device_xname(sc->sc_dev), flags, sc->sc_mode);
	if (fmt == S_IFCHR)
		sc->sc_dk.dk_copenmask |= 1;
	else
		sc->sc_dk.dk_bopenmask |= 1;
	sc->sc_dk.dk_openmask =
	    sc->sc_dk.dk_copenmask | sc->sc_dk.dk_bopenmask;

out:	mutex_exit(&sc->sc_parent->dk_rawlock);
	mutex_exit(&sc->sc_dk.dk_openlock);
	return error;
}

static int
dkfirstopen(struct dkwedge_softc *sc, int flags)
{
	struct dkwedge_softc *nsc;
	struct vnode *vp;
	int mode;
	int error;

	KASSERT(mutex_owned(&sc->sc_dk.dk_openlock));
	KASSERT(mutex_owned(&sc->sc_parent->dk_rawlock));

	if (sc->sc_parent->dk_rawopens == 0) {
		KASSERT(sc->sc_parent->dk_rawvp == NULL);
		/*
		 * Try to open read/write.  If this fails with EROFS
		 * and the wedge was requested read-only, retry the
		 * open read-only.
		 */
		mode = FREAD | FWRITE;
		error = dk_open_parent(sc->sc_pdev, mode, &vp);
		if (error == EROFS && (flags & FWRITE) == 0) {
			mode &= ~FWRITE;
			error = dk_open_parent(sc->sc_pdev, mode, &vp);
		}
		if (error)
			return error;
		KASSERT(vp != NULL);
		sc->sc_parent->dk_rawvp = vp;
	} else {
		/*
		 * Retrieve mode from an already opened wedge.
		 *
		 * At this point, dk_rawopens is bounded by the number
		 * of dkwedge devices in the system, which is limited
		 * by autoconf device numbering to INT_MAX.  Since
		 * dk_rawopens is unsigned, this can't overflow.
		 */
		KASSERT(sc->sc_parent->dk_rawopens < UINT_MAX);
		KASSERT(sc->sc_parent->dk_rawvp != NULL);
		mode = 0;
		mutex_enter(&sc->sc_parent->dk_openlock);
		LIST_FOREACH(nsc, &sc->sc_parent->dk_wedges, sc_plink) {
			if (nsc == sc || nsc->sc_dk.dk_openmask == 0)
				continue;
			mode = nsc->sc_mode;
			break;
		}
		mutex_exit(&sc->sc_parent->dk_openlock);
	}
	sc->sc_mode = mode;
	sc->sc_parent->dk_rawopens++;

	return 0;
}

static void
dklastclose(struct dkwedge_softc *sc)
{

	KASSERT(mutex_owned(&sc->sc_dk.dk_openlock));
	KASSERT(mutex_owned(&sc->sc_parent->dk_rawlock));
	KASSERT(sc->sc_parent->dk_rawopens > 0);
	KASSERT(sc->sc_parent->dk_rawvp != NULL);

	if (--sc->sc_parent->dk_rawopens == 0) {
		struct vnode *const vp = sc->sc_parent->dk_rawvp;
		const int mode = sc->sc_mode;

		sc->sc_parent->dk_rawvp = NULL;
		sc->sc_mode = 0;

		dk_close_parent(vp, mode);
	}
}

/*
 * dkclose:		[devsw entry point]
 *
 *	Close a wedge.
 */
static int
dkclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	struct dkwedge_softc *sc = dkwedge_lookup(dev);

	/*
	 * dkclose can be called even if dkopen didn't succeed, so we
	 * have to handle the same possibility that the wedge may not
	 * exist.
	 */
	if (sc == NULL)
		return ENXIO;
	KASSERT(sc->sc_dev != NULL);
	KASSERT(sc->sc_state != DKW_STATE_LARVAL);
	KASSERT(sc->sc_state != DKW_STATE_DEAD);

	mutex_enter(&sc->sc_dk.dk_openlock);
	mutex_enter(&sc->sc_parent->dk_rawlock);

	KASSERT(sc->sc_dk.dk_openmask != 0);

	if (fmt == S_IFCHR)
		sc->sc_dk.dk_copenmask &= ~1;
	else
		sc->sc_dk.dk_bopenmask &= ~1;
	sc->sc_dk.dk_openmask =
	    sc->sc_dk.dk_copenmask | sc->sc_dk.dk_bopenmask;

	if (sc->sc_dk.dk_openmask == 0) {
		dklastclose(sc);
	}

	mutex_exit(&sc->sc_parent->dk_rawlock);
	mutex_exit(&sc->sc_dk.dk_openlock);

	return 0;
}

/*
 * dkcancel:		[devsw entry point]
 *
 *	Cancel any pending I/O operations waiting on a wedge.
 */
static int
dkcancel(dev_t dev, int flags, int fmt, struct lwp *l)
{
	struct dkwedge_softc *sc = dkwedge_lookup(dev);

	KASSERT(sc != NULL);
	KASSERT(sc->sc_dev != NULL);
	KASSERT(sc->sc_state != DKW_STATE_LARVAL);
	KASSERT(sc->sc_state != DKW_STATE_DEAD);

	/*
	 * Disk I/O is expected to complete or fail within a reasonable
	 * timeframe -- it's storage, not communication.  Further, the
	 * character and block device interface guarantees that prior
	 * reads and writes have completed or failed by the time close
	 * returns -- we are not to cancel them here.  If the parent
	 * device's hardware is gone, the parent driver can make them
	 * fail.  Nothing for dk(4) itself to do.
	 */

	return 0;
}

/*
 * dkstrategy:		[devsw entry point]
 *
 *	Perform I/O based on the wedge I/O strategy.
 */
static void
dkstrategy(struct buf *bp)
{
	struct dkwedge_softc *sc = dkwedge_lookup(bp->b_dev);
	uint64_t p_size, p_offset;

	KASSERT(sc != NULL);
	KASSERT(sc->sc_dev != NULL);
	KASSERT(sc->sc_state != DKW_STATE_LARVAL);
	KASSERT(sc->sc_state != DKW_STATE_DEAD);
	KASSERT(sc->sc_parent->dk_rawvp != NULL);

	/* If it's an empty transfer, wake up the top half now. */
	if (bp->b_bcount == 0)
		goto done;

	p_offset = sc->sc_offset << sc->sc_parent->dk_blkshift;
	p_size = dkwedge_size(sc) << sc->sc_parent->dk_blkshift;

	/* Make sure it's in-range. */
	if (bounds_check_with_mediasize(bp, DEV_BSIZE, p_size) <= 0)
		goto done;

	/* Translate it to the parent's raw LBA. */
	bp->b_rawblkno = bp->b_blkno + p_offset;

	/* Place it in the queue and start I/O on the unit. */
	mutex_enter(&sc->sc_iolock);
	disk_wait(&sc->sc_dk);
	bufq_put(sc->sc_bufq, bp);
	mutex_exit(&sc->sc_iolock);

	dkstart(sc);
	return;

done:
	bp->b_resid = bp->b_bcount;
	biodone(bp);
}
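
/*
 * Example of the translation above (illustrative numbers): on a
 * parent with 4096-byte sectors (dk_blkshift == 3), a wedge at
 * sc_offset 34 has p_offset = 34 << 3 = 272 DEV_BSIZE units, so a
 * transfer at wedge block 0 is issued at parent raw block 272.
 */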

/*
 * dkstart:
 *
 *	Start I/O that has been enqueued on the wedge.
 */
static void
dkstart(struct dkwedge_softc *sc)
{
	struct vnode *vp;
	struct buf *bp, *nbp;

	mutex_enter(&sc->sc_iolock);

	/* Do as much work as has been enqueued. */
	while ((bp = bufq_peek(sc->sc_bufq)) != NULL) {
		if (sc->sc_iostop) {
			(void) bufq_get(sc->sc_bufq);
			mutex_exit(&sc->sc_iolock);
			bp->b_error = ENXIO;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			mutex_enter(&sc->sc_iolock);
			continue;
		}

		/* Fetch an I/O buf with sc_iolock dropped. */
		mutex_exit(&sc->sc_iolock);
		nbp = getiobuf(sc->sc_parent->dk_rawvp, false);
		mutex_enter(&sc->sc_iolock);
		if (nbp == NULL) {
			/*
			 * No resources to run this request; leave the
			 * buffer queued up, and schedule a timer to
			 * restart the queue in 1/2 a second.
			 */
			if (!sc->sc_iostop)
				callout_schedule(&sc->sc_restart_ch, hz/2);
			break;
		}

		/*
		 * Fetch the buf.  This can fail if another thread
		 * has already processed the queue; it can also
		 * return a completely different buf.
		 */
		bp = bufq_get(sc->sc_bufq);
		if (bp == NULL) {
			mutex_exit(&sc->sc_iolock);
			putiobuf(nbp);
			mutex_enter(&sc->sc_iolock);
			continue;
		}

		/* Instrumentation. */
		disk_busy(&sc->sc_dk);

		/* Release the lock for VOP_STRATEGY. */
		mutex_exit(&sc->sc_iolock);

		nbp->b_data = bp->b_data;
		nbp->b_flags = bp->b_flags;
		nbp->b_oflags = bp->b_oflags;
		nbp->b_cflags = bp->b_cflags;
		nbp->b_iodone = dkiodone;
		nbp->b_proc = bp->b_proc;
		nbp->b_blkno = bp->b_rawblkno;
		nbp->b_dev = sc->sc_parent->dk_rawvp->v_rdev;
		nbp->b_bcount = bp->b_bcount;
		nbp->b_private = bp;
		BIO_COPYPRIO(nbp, bp);

		vp = nbp->b_vp;
		if ((nbp->b_flags & B_READ) == 0) {
			mutex_enter(vp->v_interlock);
			vp->v_numoutput++;
			mutex_exit(vp->v_interlock);
		}
		VOP_STRATEGY(vp, nbp);

		mutex_enter(&sc->sc_iolock);
	}

	mutex_exit(&sc->sc_iolock);
}

/*
 * dkiodone:
 *
 *	I/O to a wedge has completed; alert the top half.
 */
static void
dkiodone(struct buf *bp)
{
	struct buf *obp = bp->b_private;
	struct dkwedge_softc *sc = dkwedge_lookup(obp->b_dev);

	KASSERT(sc != NULL);
	KASSERT(sc->sc_dev != NULL);

	if (bp->b_error != 0)
		obp->b_error = bp->b_error;
	obp->b_resid = bp->b_resid;
	putiobuf(bp);

	mutex_enter(&sc->sc_iolock);
	disk_unbusy(&sc->sc_dk, obp->b_bcount - obp->b_resid,
	    obp->b_flags & B_READ);
	mutex_exit(&sc->sc_iolock);

	biodone(obp);

	/* Kick the queue in case there is more work we can do. */
	dkstart(sc);
}

/*
 * dkrestart:
 *
 *	Restart the work queue after it was stalled due to
 *	a resource shortage.  Invoked via a callout.
 */
static void
dkrestart(void *v)
{
	struct dkwedge_softc *sc = v;

	dkstart(sc);
}

/*
 * dkminphys:
 *
 *	Call parent's minphys function.
 */
static void
dkminphys(struct buf *bp)
{
	struct dkwedge_softc *sc = dkwedge_lookup(bp->b_dev);
	dev_t dev;

	KASSERT(sc != NULL);
	KASSERT(sc->sc_dev != NULL);

	dev = bp->b_dev;
	bp->b_dev = sc->sc_pdev;
	if (sc->sc_parent->dk_driver && sc->sc_parent->dk_driver->d_minphys)
		(*sc->sc_parent->dk_driver->d_minphys)(bp);
	else
		minphys(bp);
	bp->b_dev = dev;
}

/*
 * dkread:		[devsw entry point]
 *
 *	Read from a wedge.
 */
static int
dkread(dev_t dev, struct uio *uio, int flags)
{
	struct dkwedge_softc *sc __diagused = dkwedge_lookup(dev);

	KASSERT(sc != NULL);
	KASSERT(sc->sc_dev != NULL);
	KASSERT(sc->sc_state != DKW_STATE_LARVAL);
	KASSERT(sc->sc_state != DKW_STATE_DEAD);

	return physio(dkstrategy, NULL, dev, B_READ, dkminphys, uio);
}

/*
 * dkwrite:		[devsw entry point]
 *
 *	Write to a wedge.
 */
static int
dkwrite(dev_t dev, struct uio *uio, int flags)
{
	struct dkwedge_softc *sc __diagused = dkwedge_lookup(dev);

	KASSERT(sc != NULL);
	KASSERT(sc->sc_dev != NULL);
	KASSERT(sc->sc_state != DKW_STATE_LARVAL);
	KASSERT(sc->sc_state != DKW_STATE_DEAD);

	return physio(dkstrategy, NULL, dev, B_WRITE, dkminphys, uio);
}

/*
 * dkioctl:		[devsw entry point]
 *
 *	Perform an ioctl request on a wedge.
 */
static int
dkioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	struct dkwedge_softc *sc = dkwedge_lookup(dev);
	int error = 0;

	KASSERT(sc != NULL);
	KASSERT(sc->sc_dev != NULL);
	KASSERT(sc->sc_state != DKW_STATE_LARVAL);
	KASSERT(sc->sc_state != DKW_STATE_DEAD);
	KASSERT(sc->sc_parent->dk_rawvp != NULL);

	/*
	 * We pass NODEV instead of our device to indicate we don't
	 * want to handle disklabel ioctls.
	 */
	error = disk_ioctl(&sc->sc_dk, NODEV, cmd, data, flag, l);
	if (error != EPASSTHROUGH)
		return error;

	error = 0;

	switch (cmd) {
	case DIOCGSTRATEGY:
	case DIOCGCACHE:
	case DIOCCACHESYNC:
		error = VOP_IOCTL(sc->sc_parent->dk_rawvp, cmd, data, flag,
		    l != NULL ? l->l_cred : NOCRED);
		break;
	case DIOCGWEDGEINFO: {
		struct dkwedge_info *dkw = data;

		strlcpy(dkw->dkw_devname, device_xname(sc->sc_dev),
		    sizeof(dkw->dkw_devname));
		memcpy(dkw->dkw_wname, sc->sc_wname, sizeof(dkw->dkw_wname));
		dkw->dkw_wname[sizeof(dkw->dkw_wname) - 1] = '\0';
		strlcpy(dkw->dkw_parent, sc->sc_parent->dk_name,
		    sizeof(dkw->dkw_parent));
		dkw->dkw_offset = sc->sc_offset;
		dkw->dkw_size = dkwedge_size(sc);
		strlcpy(dkw->dkw_ptype, sc->sc_ptype, sizeof(dkw->dkw_ptype));

		break;
	}
	case DIOCGSECTORALIGN: {
		struct disk_sectoralign *dsa = data;
		uint32_t r;

		error = VOP_IOCTL(sc->sc_parent->dk_rawvp, cmd, dsa, flag,
		    l != NULL ? l->l_cred : NOCRED);
		if (error)
			break;

		r = sc->sc_offset % dsa->dsa_alignment;
		if (r < dsa->dsa_firstaligned)
			dsa->dsa_firstaligned = dsa->dsa_firstaligned - r;
		else
			dsa->dsa_firstaligned = (dsa->dsa_firstaligned +
			    dsa->dsa_alignment) - r;
		break;
	}
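	/*
	 * Example for the DIOCGSECTORALIGN adjustment above
	 * (illustrative numbers): if the parent reports
	 * dsa_firstaligned = 0 and dsa_alignment = 8, a wedge at
	 * sc_offset 34 gives r = 2, so dsa_firstaligned becomes
	 * (0 + 8) - 2 = 6; wedge block 6 is parent block 40, which
	 * is 8-aligned.
	 */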
	default:
		error = ENOTTY;
	}

	return error;
}

/*
 * dkdiscard:		[devsw entry point]
 *
 *	Perform a discard-range request on a wedge.
 */
static int
dkdiscard(dev_t dev, off_t pos, off_t len)
{
	struct dkwedge_softc *sc = dkwedge_lookup(dev);
	uint64_t size = dkwedge_size(sc);
	unsigned shift;
	off_t offset, maxlen;
	int error;

	KASSERT(sc != NULL);
	KASSERT(sc->sc_dev != NULL);
	KASSERT(sc->sc_state != DKW_STATE_LARVAL);
	KASSERT(sc->sc_state != DKW_STATE_DEAD);
	KASSERT(sc->sc_parent->dk_rawvp != NULL);

	/* XXX check bounds on size/offset up front */
	shift = (sc->sc_parent->dk_blkshift + DEV_BSHIFT);
	KASSERT(__type_fit(off_t, size));
	KASSERT(__type_fit(off_t, sc->sc_offset));
	KASSERT(0 <= sc->sc_offset);
	KASSERT(size <= (__type_max(off_t) >> shift));
	KASSERT(sc->sc_offset <= ((__type_max(off_t) >> shift) - size));
	offset = ((off_t)sc->sc_offset << shift);
	maxlen = ((off_t)size << shift);

	if (len > maxlen)
		return EINVAL;
	if (pos > (maxlen - len))
		return EINVAL;

	pos += offset;

	vn_lock(sc->sc_parent->dk_rawvp, LK_EXCLUSIVE | LK_RETRY);
	error = VOP_FDISCARD(sc->sc_parent->dk_rawvp, pos, len);
	VOP_UNLOCK(sc->sc_parent->dk_rawvp);

	return error;
}
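
/*
 * Example of the byte conversion above (illustrative numbers): with
 * 512-byte parent sectors (dk_blkshift == 0), shift is DEV_BSHIFT = 9,
 * so a wedge at sc_offset 64 spanning 1024 blocks yields
 * offset = 32768 bytes and maxlen = 524288 bytes for VOP_FDISCARD().
 */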

/*
 * dksize:		[devsw entry point]
 *
 *	Query the size of a wedge for the purpose of performing a dump
 *	or for swapping to.
 */
static int
dksize(dev_t dev)
{
	/*
	 * Don't bother taking a reference because this is only used
	 * either (a) while the device is open (for swap), or (b) while
	 * any multiprocessing is quiescent (for crash dumps).
	 */
	struct dkwedge_softc *sc = dkwedge_lookup(dev);
	uint64_t p_size;
	int rv = -1;

	if (sc == NULL)
		return -1;
	if (sc->sc_state != DKW_STATE_RUNNING)
		return -1;

	/* Our content type is static, no need to open the device. */

	p_size = dkwedge_size(sc) << sc->sc_parent->dk_blkshift;
	if (strcmp(sc->sc_ptype, DKW_PTYPE_SWAP) == 0) {
		/* Saturate if we are larger than INT_MAX. */
		if (p_size > INT_MAX)
			rv = INT_MAX;
		else
			rv = (int)p_size;
	}

	return rv;
}

/*
 * dkdump:		[devsw entry point]
 *
 *	Perform a crash dump to a wedge.
 */
static int
dkdump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	/*
	 * Don't bother taking a reference because this is only used
	 * while any multiprocessing is quiescent.
	 */
	struct dkwedge_softc *sc = dkwedge_lookup(dev);
	const struct bdevsw *bdev;
	uint64_t p_size, p_offset;

	if (sc == NULL)
		return ENXIO;
	if (sc->sc_state != DKW_STATE_RUNNING)
		return ENXIO;

	/* Our content type is static, no need to open the device. */

	if (strcmp(sc->sc_ptype, DKW_PTYPE_SWAP) != 0 &&
	    strcmp(sc->sc_ptype, DKW_PTYPE_RAID) != 0 &&
	    strcmp(sc->sc_ptype, DKW_PTYPE_CGD) != 0)
		return ENXIO;
	if (size % DEV_BSIZE != 0)
		return EINVAL;

	p_offset = sc->sc_offset << sc->sc_parent->dk_blkshift;
	p_size = dkwedge_size(sc) << sc->sc_parent->dk_blkshift;

	if (blkno < 0 || blkno + size/DEV_BSIZE > p_size) {
		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
		    "p_size (%" PRIu64 ")\n", __func__, blkno,
		    size/DEV_BSIZE, p_size);
		return EINVAL;
	}

	bdev = bdevsw_lookup(sc->sc_pdev);
	return (*bdev->d_dump)(sc->sc_pdev, blkno + p_offset, va, size);
}

/*
 * config glue
 */

/*
 * dkwedge_find_partition:
 *
 *	Find the wedge corresponding to the specified parent name
 *	and offset/length.
 */
static device_t
dkwedge_find_partition_acquire(device_t parent, daddr_t startblk,
    uint64_t nblks)
{
	struct dkwedge_softc *sc;
	int i;
	device_t wedge = NULL;

	rw_enter(&dkwedges_lock, RW_READER);
	for (i = 0; i < ndkwedges; i++) {
		if ((sc = dkwedges[i]) == NULL || sc->sc_dev == NULL)
			continue;
		if (strcmp(sc->sc_parent->dk_name, device_xname(parent)) == 0 &&
		    sc->sc_offset == startblk &&
		    dkwedge_size(sc) == nblks) {
			if (wedge) {
				printf("WARNING: double match for boot wedge "
				    "(%s, %s)\n",
				    device_xname(wedge),
				    device_xname(sc->sc_dev));
				continue;
			}
			wedge = sc->sc_dev;
			device_acquire(wedge);
		}
	}
	rw_exit(&dkwedges_lock);

	return wedge;
}

/* XXX unsafe */
device_t
dkwedge_find_partition(device_t parent, daddr_t startblk,
    uint64_t nblks)
{
	device_t dv;

	if ((dv = dkwedge_find_partition_acquire(parent, startblk, nblks))
	    == NULL)
		return NULL;
	device_release(dv);
	return dv;
}

const char *
dkwedge_get_parent_name(dev_t dev)
{
	/* XXX: perhaps do this in lookup? */
	int bmaj = bdevsw_lookup_major(&dk_bdevsw);
	int cmaj = cdevsw_lookup_major(&dk_cdevsw);

	if (major(dev) != bmaj && major(dev) != cmaj)
		return NULL;

	struct dkwedge_softc *const sc = dkwedge_lookup_acquire(dev);
	if (sc == NULL)
		return NULL;
	const char *const name = sc->sc_parent->dk_name;
	device_release(sc->sc_dev);
	return name;
}