geom_bsd.c revision 112988
1/*-
2 * Copyright (c) 2002 Poul-Henning Kamp
3 * Copyright (c) 2002 Networks Associates Technology, Inc.
4 * All rights reserved.
5 *
6 * This software was developed for the FreeBSD Project by Poul-Henning Kamp
7 * and NAI Labs, the Security Research Division of Network Associates, Inc.
8 * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
9 * DARPA CHATS research program.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 * 3. The names of the authors may not be used to endorse or promote
20 *    products derived from this software without specific prior written
21 *    permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 *
35 * $FreeBSD: head/sys/geom/geom_bsd.c 112988 2003-04-02 20:41:18Z phk $
36 *
37 * This is the method for dealing with BSD disklabels.  It has been
38 * extensively (by my standards at least) commented, in the vain hope that
39 * it will serve as the source in future copy&paste operations.
40 */
41
42#include <sys/param.h>
43#ifndef _KERNEL
44#include <stdio.h>
45#include <string.h>
46#include <stdlib.h>
47#include <signal.h>
48#include <err.h>
49#else
50#include <sys/systm.h>
51#include <sys/kernel.h>
52#include <sys/conf.h>
53#include <sys/bio.h>
54#include <sys/malloc.h>
55#include <sys/lock.h>
56#include <sys/mutex.h>
57#endif
58#include <sys/md5.h>
59#include <sys/errno.h>
60#include <sys/disklabel.h>
61#include <geom/geom.h>
62#include <geom/geom_slice.h>
63
64#define	BSD_CLASS_NAME "BSD"
65
66#define ALPHA_LABEL_OFFSET	64
67
68/*
69 * Our private data about one instance.  All the rest is handled by the
70 * slice code and stored in its softc, so this is just the stuff
71 * specific to BSD disklabels.
72 */
73struct g_bsd_softc {
74	off_t	labeloffset;
75	off_t	mbroffset;
76	off_t	rawoffset;
77	struct disklabel ondisk;
78	struct disklabel inram;
79	u_char	labelsum[16];
80};
81
/*
 * The next 4 functions isolate us from how the compiler lays out and pads
 * "struct disklabel".  We treat what we read from disk as a bytestream and
 * explicitly convert it into a struct disklabel.  This makes us compiler-,
 * endianness- and wordsize-agnostic.
 * For now we only have little-endian formats to deal with.
 */
89
90static void
91g_bsd_ledec_partition(u_char *ptr, struct partition *d)
92{
93	d->p_size = g_dec_le4(ptr + 0);
94	d->p_offset = g_dec_le4(ptr + 4);
95	d->p_fsize = g_dec_le4(ptr + 8);
96	d->p_fstype = ptr[12];
97	d->p_frag = ptr[13];
98	d->p_cpg = g_dec_le2(ptr + 14);
99}
100
101static void
102g_bsd_ledec_disklabel(u_char *ptr, struct disklabel *d)
103{
104	int i;
105
106	d->d_magic = g_dec_le4(ptr + 0);
107	d->d_type = g_dec_le2(ptr + 4);
108	d->d_subtype = g_dec_le2(ptr + 6);
109	bcopy(ptr + 8, d->d_typename, 16);
110	bcopy(ptr + 24, d->d_packname, 16);
111	d->d_secsize = g_dec_le4(ptr + 40);
112	d->d_nsectors = g_dec_le4(ptr + 44);
113	d->d_ntracks = g_dec_le4(ptr + 48);
114	d->d_ncylinders = g_dec_le4(ptr + 52);
115	d->d_secpercyl = g_dec_le4(ptr + 56);
116	d->d_secperunit = g_dec_le4(ptr + 60);
117	d->d_sparespertrack = g_dec_le2(ptr + 64);
118	d->d_sparespercyl = g_dec_le2(ptr + 66);
119	d->d_acylinders = g_dec_le4(ptr + 68);
120	d->d_rpm = g_dec_le2(ptr + 72);
121	d->d_interleave = g_dec_le2(ptr + 74);
122	d->d_trackskew = g_dec_le2(ptr + 76);
123	d->d_cylskew = g_dec_le2(ptr + 78);
124	d->d_headswitch = g_dec_le4(ptr + 80);
125	d->d_trkseek = g_dec_le4(ptr + 84);
126	d->d_flags = g_dec_le4(ptr + 88);
127	d->d_drivedata[0] = g_dec_le4(ptr + 92);
128	d->d_drivedata[1] = g_dec_le4(ptr + 96);
129	d->d_drivedata[2] = g_dec_le4(ptr + 100);
130	d->d_drivedata[3] = g_dec_le4(ptr + 104);
131	d->d_drivedata[4] = g_dec_le4(ptr + 108);
132	d->d_spare[0] = g_dec_le4(ptr + 112);
133	d->d_spare[1] = g_dec_le4(ptr + 116);
134	d->d_spare[2] = g_dec_le4(ptr + 120);
135	d->d_spare[3] = g_dec_le4(ptr + 124);
136	d->d_spare[4] = g_dec_le4(ptr + 128);
137	d->d_magic2 = g_dec_le4(ptr + 132);
138	d->d_checksum = g_dec_le2(ptr + 136);
139	d->d_npartitions = g_dec_le2(ptr + 138);
140	d->d_bbsize = g_dec_le4(ptr + 140);
141	d->d_sbsize = g_dec_le4(ptr + 144);
142	for (i = 0; i < MAXPARTITIONS; i++)
143		g_bsd_ledec_partition(ptr + 148 + 16 * i, &d->d_partitions[i]);
144}
145
146static void
147g_bsd_leenc_partition(u_char *ptr, struct partition *d)
148{
149	g_enc_le4(ptr + 0, d->p_size);
150	g_enc_le4(ptr + 4, d->p_offset);
151	g_enc_le4(ptr + 8, d->p_fsize);
152	ptr[12] = d->p_fstype;
153	ptr[13] = d->p_frag;
154	g_enc_le2(ptr + 14, d->p_cpg);
155}
156
157static void
158g_bsd_leenc_disklabel(u_char *ptr, struct disklabel *d)
159{
160	int i;
161
162	g_enc_le4(ptr + 0, d->d_magic);
163	g_enc_le2(ptr + 4, d->d_type);
164	g_enc_le2(ptr + 6, d->d_subtype);
165	bcopy(d->d_typename, ptr + 8, 16);
166	bcopy(d->d_packname, ptr + 24, 16);
167	g_enc_le4(ptr + 40, d->d_secsize);
168	g_enc_le4(ptr + 44, d->d_nsectors);
169	g_enc_le4(ptr + 48, d->d_ntracks);
170	g_enc_le4(ptr + 52, d->d_ncylinders);
171	g_enc_le4(ptr + 56, d->d_secpercyl);
172	g_enc_le4(ptr + 60, d->d_secperunit);
173	g_enc_le2(ptr + 64, d->d_sparespertrack);
174	g_enc_le2(ptr + 66, d->d_sparespercyl);
175	g_enc_le4(ptr + 68, d->d_acylinders);
176	g_enc_le2(ptr + 72, d->d_rpm);
177	g_enc_le2(ptr + 74, d->d_interleave);
178	g_enc_le2(ptr + 76, d->d_trackskew);
179	g_enc_le2(ptr + 78, d->d_cylskew);
180	g_enc_le4(ptr + 80, d->d_headswitch);
181	g_enc_le4(ptr + 84, d->d_trkseek);
182	g_enc_le4(ptr + 88, d->d_flags);
183	g_enc_le4(ptr + 92, d->d_drivedata[0]);
184	g_enc_le4(ptr + 96, d->d_drivedata[1]);
185	g_enc_le4(ptr + 100, d->d_drivedata[2]);
186	g_enc_le4(ptr + 104, d->d_drivedata[3]);
187	g_enc_le4(ptr + 108, d->d_drivedata[4]);
188	g_enc_le4(ptr + 112, d->d_spare[0]);
189	g_enc_le4(ptr + 116, d->d_spare[1]);
190	g_enc_le4(ptr + 120, d->d_spare[2]);
191	g_enc_le4(ptr + 124, d->d_spare[3]);
192	g_enc_le4(ptr + 128, d->d_spare[4]);
193	g_enc_le4(ptr + 132, d->d_magic2);
194	g_enc_le2(ptr + 136, d->d_checksum);
195	g_enc_le2(ptr + 138, d->d_npartitions);
196	g_enc_le4(ptr + 140, d->d_bbsize);
197	g_enc_le4(ptr + 144, d->d_sbsize);
198	for (i = 0; i < MAXPARTITIONS; i++)
199		g_bsd_leenc_partition(ptr + 148 + 16 * i, &d->d_partitions[i]);
200}
201
202static int
203g_bsd_ondisk_size(void)
204{
205	return (148 + 16 * MAXPARTITIONS);
206}
207
/*
 * For reasons which were valid and just in their days, FreeBSD/i386 uses
 * absolute disk-addresses in disklabels.  The way it works is that the
 * p_offset fields of all partitions have the first sector number of the
 * disk slice added to them.  This was hidden kernel-magic, userland did
 * not see these offsets.  These two functions subtract and add them
 * while converting from the "ondisk" to the "inram" labels and vice
 * versa.
 */
217static void
218ondisk2inram(struct g_bsd_softc *sc)
219{
220	struct partition *ppp;
221	struct disklabel *dl;
222	int i;
223
224	sc->inram = sc->ondisk;
225	dl = &sc->inram;
226
227	/* Basic sanity-check needed to avoid mistakes. */
228	if (dl->d_magic != DISKMAGIC || dl->d_magic2 != DISKMAGIC)
229		return;
230	if (dl->d_npartitions > MAXPARTITIONS)
231		return;
232
233	sc->rawoffset = dl->d_partitions[RAW_PART].p_offset;
234	for (i = 0; i < dl->d_npartitions; i++) {
235		ppp = &dl->d_partitions[i];
236		if (ppp->p_size != 0 && ppp->p_offset < sc->rawoffset)
237			sc->rawoffset = 0;
238	}
239	if (sc->rawoffset > 0) {
240		for (i = 0; i < dl->d_npartitions; i++) {
241			ppp = &dl->d_partitions[i];
242			if (ppp->p_offset != 0)
243				ppp->p_offset -= sc->rawoffset;
244		}
245	}
246	dl->d_checksum = 0;
247	dl->d_checksum = dkcksum(&sc->inram);
248}
249
250static void
251inram2ondisk(struct g_bsd_softc *sc)
252{
253	struct partition *ppp;
254	int i;
255
256	sc->ondisk = sc->inram;
257	if (sc->mbroffset != 0)
258		sc->rawoffset = sc->mbroffset / sc->inram.d_secsize;
259	if (sc->rawoffset != 0) {
260		for (i = 0; i < sc->inram.d_npartitions; i++) {
261			ppp = &sc->ondisk.d_partitions[i];
262			if (ppp->p_size > 0)
263				ppp->p_offset += sc->rawoffset;
264			else
265				ppp->p_offset = 0;
266		}
267	}
268	sc->ondisk.d_checksum = 0;
269	sc->ondisk.d_checksum = dkcksum(&sc->ondisk);
270}
271
272/*
273 * Check that this looks like a valid disklabel, but be prepared
274 * to get any kind of junk.  The checksum must be checked only
275 * after this function returns success to prevent a bogus d_npartitions
276 * value from tripping us up.
277 */
278static int
279g_bsd_checklabel(struct disklabel *dl)
280{
281	struct partition *ppp;
282	int i;
283
284	if (dl->d_magic != DISKMAGIC || dl->d_magic2 != DISKMAGIC)
285		return (EINVAL);
286	/*
287	 * If the label specifies more partitions than we can handle
288	 * we have to reject it:  If people updated the label they would
289	 * trash it, and that would break the checksum.
290	 */
291	if (dl->d_npartitions > MAXPARTITIONS)
292		return (EINVAL);
293
294	for (i = 0; i < dl->d_npartitions; i++) {
295		ppp = &dl->d_partitions[i];
296		/* Cannot extend past unit. */
297		if (ppp->p_size != 0 &&
298		     ppp->p_offset + ppp->p_size > dl->d_secperunit) {
299			return (EINVAL);
300		}
301	}
302	return (0);
303}
304
305/*
306 * Modify our slicer to match proposed disklabel, if possible.
307 * First carry out all the simple checks, then lock topology
308 * and check that no open providers are affected negatively
309 * then carry out all the changes.
310 *
311 * NB: Returns with topology held only if successful return.
312 */
313static int
314g_bsd_modify(struct g_geom *gp, struct disklabel *dl)
315{
316	int i, error;
317	struct partition *ppp;
318	struct g_slicer *gsp;
319	struct g_consumer *cp;
320	u_int secsize, u;
321	off_t mediasize;
322
323	/* Basic check that this is indeed a disklabel. */
324	error = g_bsd_checklabel(dl);
325	if (error)
326		return (error);
327
328	/* Make sure the checksum is OK. */
329	if (dkcksum(dl) != 0)
330		return (EINVAL);
331
332	/* Get dimensions of our device. */
333	cp = LIST_FIRST(&gp->consumer);
334	secsize = cp->provider->sectorsize;
335	mediasize = cp->provider->mediasize;
336
337#ifdef nolonger
338	/*
339	 * The raw-partition must start at zero.  We do not check that the
340	 * size == mediasize because this is overly restrictive.  We have
341	 * already tested in g_bsd_checklabel() that it is not longer.
342	 * XXX: RAW_PART is archaic anyway, and we should drop it.
343	 */
344	if (dl->d_partitions[RAW_PART].p_offset != 0)
345		return (EINVAL);
346#endif
347
348#ifdef notyet
349	/*
350	 * Indications are that the d_secperunit is not correctly
351	 * initialized in many cases, and since we don't need it
352	 * for anything, we dont strictly need this test.
353	 * Preemptive action to avoid confusing people in disklabel(8)
354	 * may be in order.
355	 */
356	/* The label cannot claim a larger size than the media. */
357	if ((off_t)dl->d_secperunit * dl->d_secsize > mediasize)
358		return (EINVAL);
359#endif
360
361
362	/* ... or a smaller sector size. */
363	if (dl->d_secsize < secsize)
364		return (EINVAL);
365
366	/* ... or a non-multiple sector size. */
367	if (dl->d_secsize % secsize != 0)
368		return (EINVAL);
369
370	g_topology_lock();
371
372	/* Don't munge open partitions. */
373	gsp = gp->softc;
374	for (i = 0; i < dl->d_npartitions; i++) {
375		ppp = &dl->d_partitions[i];
376
377		error = g_slice_config(gp, i, G_SLICE_CONFIG_CHECK,
378		    (off_t)ppp->p_offset * dl->d_secsize,
379		    (off_t)ppp->p_size * dl->d_secsize,
380		     dl->d_secsize,
381		    "%s%c", gp->name, 'a' + i);
382		if (error) {
383			g_topology_unlock();
384			return (error);
385		}
386	}
387
388	/* Look good, go for it... */
389	for (u = 0; u < gsp->nslice; u++) {
390		ppp = &dl->d_partitions[u];
391		g_slice_config(gp, u, G_SLICE_CONFIG_SET,
392		    (off_t)ppp->p_offset * dl->d_secsize,
393		    (off_t)ppp->p_size * dl->d_secsize,
394		     dl->d_secsize,
395		    "%s%c", gp->name, 'a' + u);
396	}
397	return (0);
398}
399
400/*
401 * Calculate a disklabel checksum for a little-endian byte-stream.
402 * We need access to the decoded disklabel because the checksum only
403 * covers the partition data for the first d_npartitions.
404 */
405static int
406g_bsd_lesum(struct disklabel *dl, u_char *p)
407{
408	u_char *pe;
409	uint16_t sum;
410
411	pe = p + 148 + 16 * dl->d_npartitions;
412	sum = 0;
413	while (p < pe) {
414		sum ^= g_dec_le2(p);
415		p += 2;
416	}
417	return (sum);
418}
419
420/*
421 * This is an internal helper function, called multiple times from the taste
422 * function to try to locate a disklabel on the disk.  More civilized formats
423 * will not need this, as there is only one possible place on disk to look
424 * for the magic spot.
425 */
426
427static int
428g_bsd_try(struct g_geom *gp, struct g_slicer *gsp, struct g_consumer *cp, int secsize, struct g_bsd_softc *ms, off_t offset)
429{
430	int error;
431	u_char *buf;
432	struct disklabel *dl;
433	off_t secoff;
434
435	/*
436	 * We need to read entire aligned sectors, and we assume that the
437	 * disklabel does not span sectors, so one sector is enough.
438	 */
439	error = 0;
440	secoff = offset % secsize;
441	buf = g_read_data(cp, offset - secoff, secsize, &error);
442	if (buf == NULL || error != 0)
443		return (ENOENT);
444
445	/* Decode into our native format. */
446	dl = &ms->ondisk;
447	g_bsd_ledec_disklabel(buf + secoff, dl);
448
449	ondisk2inram(ms);
450
451	dl = &ms->inram;
452	/* Does it look like a label at all? */
453	if (g_bsd_checklabel(dl))
454		error = ENOENT;
455	/* ... and does the raw data have a good checksum? */
456	if (error == 0 && g_bsd_lesum(dl, buf + secoff) != 0)
457		error = ENOENT;
458
459	/* Remember to free the buffer g_read_data() gave us. */
460	g_free(buf);
461
462	/* If we had a label, record it properly. */
463	if (error == 0) {
464		gsp->frontstuff = 16 * secsize;	/* XXX */
465		ms->labeloffset = offset;
466		g_topology_lock();
467		g_slice_conf_hot(gp, 0, offset, g_bsd_ondisk_size());
468		g_topology_unlock();
469	}
470	return (error);
471}
472
473/*
474 * Implement certain ioctls to modify disklabels with.  This function
475 * is called by the event handler thread with topology locked as result
476 * of the g_call_me() in g_bsd_start().  It is not necessary to keep
477 * topology locked all the time but make sure to return with topology
478 * locked as well.
479 */
480
481static void
482g_bsd_ioctl(void *arg, int flag __unused)
483{
484	struct bio *bp;
485	struct g_geom *gp;
486	struct g_slicer *gsp;
487	struct g_bsd_softc *ms;
488	struct disklabel *dl;
489	struct g_ioctl *gio;
490	struct g_consumer *cp;
491	u_char *buf;
492	off_t secoff;
493	u_int secsize;
494	int error, i;
495	uint64_t sum;
496
497	/* We don't need topology for now. */
498	g_topology_unlock();
499
500	/* Get hold of the interesting bits from the bio. */
501	bp = arg;
502	gp = bp->bio_to->geom;
503	gsp = gp->softc;
504	ms = gsp->softc;
505	gio = (struct g_ioctl *)bp->bio_data;
506
507	/* The disklabel to set is the ioctl argument. */
508	dl = gio->data;
509
510	/* Validate and modify our slice instance to match. */
511	error = g_bsd_modify(gp, dl);	/* Picks up topology lock on success. */
512	if (error) {
513		g_topology_lock();
514		g_io_deliver(bp, error);
515		return;
516	}
517	/* Update our copy of the disklabel. */
518	ms->inram = *dl;
519	inram2ondisk(ms);
520
521	if (gio->cmd == DIOCSDINFO) {
522		g_io_deliver(bp, 0);
523		return;
524	}
525	KASSERT(gio->cmd == DIOCWDINFO, ("Unknown ioctl in g_bsd_ioctl"));
526	cp = LIST_FIRST(&gp->consumer);
527	/* Get sector size, we need it to read data. */
528	secsize = cp->provider->sectorsize;
529	secoff = ms->labeloffset % secsize;
530	buf = g_read_data(cp, ms->labeloffset - secoff, secsize, &error);
531	if (buf == NULL || error != 0) {
532		g_io_deliver(bp, error);
533		return;
534	}
535	dl = &ms->ondisk;
536	g_bsd_leenc_disklabel(buf + secoff, dl);
537	if (ms->labeloffset == ALPHA_LABEL_OFFSET) {
538		sum = 0;
539		for (i = 0; i < 63; i++)
540			sum += g_dec_le8(buf + i * 8);
541		g_enc_le8(buf + 504, sum);
542	}
543	error = g_write_data(cp, ms->labeloffset - secoff, buf, secsize);
544	g_free(buf);
545	g_io_deliver(bp, error);
546}
547
548/*
549 * Rewrite the bootblock, which is BBSIZE bytes from the start of the disk.
550 * We punch down the disklabel where we expect it to be before writing.
551 */
552static int
553g_bsd_diocbsdbb(dev_t dev, u_long cmd __unused, caddr_t data, int fflag __unused, struct thread *td __unused)
554{
555	struct g_geom *gp;
556	struct g_slicer *gsp;
557	struct g_bsd_softc *ms;
558	struct disklabel *dl;
559	struct g_consumer *cp;
560	u_char *buf;
561	void *p;
562	u_int secsize;
563	int error, i;
564	uint64_t sum;
565
566	/* Get hold of the interesting bits from the bio. */
567	gp = (void *)dev;
568	gsp = gp->softc;
569	ms = gsp->softc;
570
571	/* The disklabel to set is the ioctl argument. */
572	buf = g_malloc(BBSIZE, M_WAITOK);
573	p = *(void **)data;
574	error = copyin(p, buf, BBSIZE);
575	if (error) {
576		g_free(buf);
577		return (error);
578	}
579	/* The disklabel to set is the ioctl argument. */
580	dl = (void *)(buf + ms->labeloffset);
581
582	DROP_GIANT();
583
584	/* Validate and modify our slice instance to match. */
585	error = g_bsd_modify(gp, dl);	/* Picks up topology lock on success. */
586	if (!error) {
587		cp = LIST_FIRST(&gp->consumer);
588		secsize = cp->provider->sectorsize;
589		dl = &ms->ondisk;
590		g_bsd_leenc_disklabel(buf + ms->labeloffset, dl);
591		if (ms->labeloffset == ALPHA_LABEL_OFFSET) {
592			sum = 0;
593			for (i = 0; i < 63; i++)
594				sum += g_dec_le8(buf + i * 8);
595			g_enc_le8(buf + 504, sum);
596		}
597		error = g_write_data(cp, 0, buf, BBSIZE);
598		g_topology_unlock();
599	}
600	g_free(buf);
601	PICKUP_GIANT();
602	return (error);
603}
604
605/*
606 * If the user tries to overwrite our disklabel through an open partition
607 * or via a magicwrite config call, we end up here and try to prevent
608 * footshooting as best we can.
609 */
610static void
611g_bsd_hotwrite(void *arg, int flag __unused)
612{
613	struct bio *bp;
614	struct g_geom *gp;
615	struct g_slicer *gsp;
616	struct g_slice *gsl;
617	struct g_bsd_softc *ms;
618	struct g_bsd_softc fake;
619	u_char *p;
620	int error;
621
622	bp = arg;
623	gp = bp->bio_to->geom;
624	gsp = gp->softc;
625	ms = gsp->softc;
626	gsl = &gsp->slices[bp->bio_to->index];
627	p = (u_char*)bp->bio_data + ms->labeloffset
628	    - (bp->bio_offset + gsl->offset);
629	g_bsd_ledec_disklabel(p, &fake.ondisk);
630
631	ondisk2inram(&fake);
632	if (g_bsd_checklabel(&fake.inram)) {
633		g_io_deliver(bp, EPERM);
634		return;
635	}
636	if (g_bsd_lesum(&fake.ondisk, p) != 0) {
637		g_io_deliver(bp, EPERM);
638		return;
639	}
640	g_topology_unlock();
641	error = g_bsd_modify(gp, &fake.inram);	/* May pick up topology. */
642	if (error) {
643		g_io_deliver(bp, EPERM);
644		g_topology_lock();
645		return;
646	}
647	/* Update our copy of the disklabel. */
648	ms->inram = fake.inram;
649	inram2ondisk(ms);
650	g_bsd_leenc_disklabel(p, &ms->ondisk);
651	g_slice_finish_hot(bp);
652}
653
654/*-
655 * This start routine is only called for non-trivial requests, all the
656 * trivial ones are handled autonomously by the slice code.
657 * For requests we handle here, we must call the g_io_deliver() on the
658 * bio, and return non-zero to indicate to the slice code that we did so.
659 * This code executes in the "DOWN" I/O path, this means:
660 *    * No sleeping.
661 *    * Don't grab the topology lock.
662 *    * Don't call biowait, g_getattr(), g_setattr() or g_read_data()
663 */
664
665static int
666g_bsd_start(struct bio *bp)
667{
668	struct g_geom *gp;
669	struct g_bsd_softc *ms;
670	struct g_slicer *gsp;
671	struct g_ioctl *gio;
672	int error;
673
674	gp = bp->bio_to->geom;
675	gsp = gp->softc;
676	ms = gsp->softc;
677	switch(bp->bio_cmd) {
678	case BIO_READ:
679		/* We allow reading of our hot spots */
680		return (0);
681	case BIO_DELETE:
682		/* We do not allow deleting our hot spots */
683		return (EPERM);
684	case BIO_WRITE:
685		g_call_me(g_bsd_hotwrite, bp, gp, NULL);
686		return (EJUSTRETURN);
687	case BIO_GETATTR:
688		if (g_handleattr(bp, "BSD::labelsum", ms->labelsum,
689		    sizeof(ms->labelsum)))
690			return (1);
691		break;
692	case BIO_SETATTR:
693		break;
694	default:
695		KASSERT(0 == 1, ("Unknown bio_cmd in g_bsd_start (%d)",
696		    bp->bio_cmd));
697	}
698
699	/* We only handle ioctl(2) requests of the right format. */
700	if (strcmp(bp->bio_attribute, "GEOM::ioctl"))
701		return (0);
702	else if (bp->bio_length != sizeof(*gio))
703		return (0);
704
705	/* Get hold of the ioctl parameters. */
706	gio = (struct g_ioctl *)bp->bio_data;
707
708	switch (gio->cmd) {
709	case DIOCGDINFO:
710		/* Return a copy of the disklabel to userland. */
711		bcopy(&ms->inram, gio->data, sizeof(ms->inram));
712		g_io_deliver(bp, 0);
713		return (1);
714	case DIOCBSDBB:
715		gio->func = g_bsd_diocbsdbb;
716		gio->dev = (void *)gp;
717		g_io_deliver(bp, EDIRIOCTL);
718		return (1);
719	case DIOCSDINFO:
720	case DIOCWDINFO:
721		/*
722		 * These we cannot do without the topology lock and some
723		 * some I/O requests.  Ask the event-handler to schedule
724		 * us in a less restricted environment.
725		 */
726		error = g_call_me(g_bsd_ioctl, bp, gp, NULL);
727		if (error)
728			g_io_deliver(bp, error);
729		/*
730		 * We must return non-zero to indicate that we will deal
731		 * with this bio, even though we have not done so yet.
732		 */
733		return (1);
734	default:
735		return (0);
736	}
737}
738
739/*
740 * Dump configuration information in XML format.
741 * Notice that the function is called once for the geom and once for each
742 * consumer and provider.  We let g_slice_dumpconf() do most of the work.
743 */
744static void
745g_bsd_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp)
746{
747	struct g_bsd_softc *ms;
748	struct g_slicer *gsp;
749
750	gsp = gp->softc;
751	ms = gsp->softc;
752	g_slice_dumpconf(sb, indent, gp, cp, pp);
753	if (indent != NULL && pp == NULL && cp == NULL) {
754		sbuf_printf(sb, "%s<labeloffset>%jd</labeloffset>\n",
755		    indent, (intmax_t)ms->labeloffset);
756		sbuf_printf(sb, "%s<rawoffset>%jd</rawoffset>\n",
757		    indent, (intmax_t)ms->rawoffset);
758		sbuf_printf(sb, "%s<mbroffset>%jd</mbroffset>\n",
759		    indent, (intmax_t)ms->mbroffset);
760	} else if (pp != NULL) {
761		if (indent == NULL)
762			sbuf_printf(sb, " ty %d",
763			    ms->inram.d_partitions[pp->index].p_fstype);
764		else
765			sbuf_printf(sb, "%s<type>%d</type>\n", indent,
766			    ms->inram.d_partitions[pp->index].p_fstype);
767	}
768}
769
770/*
771 * The taste function is called from the event-handler, with the topology
772 * lock already held and a provider to examine.  The flags are unused.
773 *
774 * If flags == G_TF_NORMAL, the idea is to take a bite of the provider and
775 * if we find valid, consistent magic on it, build a geom on it.
776 * any magic bits which indicate that we should automatically put a BSD
777 * geom on it.
778 *
779 * There may be cases where the operator would like to put a BSD-geom on
780 * providers which do not meet all of the requirements.  This can be done
781 * by instead passing the G_TF_INSIST flag, which will override these
782 * checks.
783 *
784 * The final flags value is G_TF_TRANSPARENT, which instructs the method
785 * to put a geom on top of the provider and configure it to be as transparent
786 * as possible.  This is not really relevant to the BSD method and therefore
787 * not implemented here.
788 */
789
790static struct g_geom *
791g_bsd_taste(struct g_class *mp, struct g_provider *pp, int flags)
792{
793	struct g_geom *gp;
794	struct g_consumer *cp;
795	int error, i;
796	struct g_bsd_softc *ms;
797	struct disklabel *dl;
798	u_int secsize;
799	struct g_slicer *gsp;
800	MD5_CTX md5sum;
801	u_char hash[16];
802
803	g_trace(G_T_TOPOLOGY, "bsd_taste(%s,%s)", mp->name, pp->name);
804	g_topology_assert();
805
806	/* We don't implement transparent inserts. */
807	if (flags == G_TF_TRANSPARENT)
808		return (NULL);
809
810	/*
811	 * BSD labels are a subclass of the general "slicing" topology so
812	 * a lot of the work can be done by the common "slice" code.
813	 * Create a geom with space for MAXPARTITIONS providers, one consumer
814	 * and a softc structure for us.  Specify the provider to attach
815	 * the consumer to and our "start" routine for special requests.
816	 * The provider is opened with mode (1,0,0) so we can do reads
817	 * from it.
818	 */
819	gp = g_slice_new(mp, MAXPARTITIONS, pp, &cp, &ms,
820	     sizeof(*ms), g_bsd_start);
821	if (gp == NULL)
822		return (NULL);
823
824	/*
825	 * Now that we have attached to and opened our provider, we do
826	 * not need the topology lock until we change the topology again
827	 * next time.
828	 */
829	g_topology_unlock();
830
831	/*
832	 * Fill in the optional details, in our case we have a dumpconf
833	 * routine which the "slice" code should call at the right time
834	 */
835	gp->dumpconf = g_bsd_dumpconf;
836
837	/* Get the geom_slicer softc from the geom. */
838	gsp = gp->softc;
839
840	/*
841	 * The do...while loop here allows us to have multiple escapes
842	 * using a simple "break".  This improves code clarity without
843	 * ending up in deep nesting and without using goto or come from.
844	 */
845	do {
846		/*
847		 * If the provider is an MBR we will only auto attach
848		 * to type 165 slices in the G_TF_NORMAL case.  We will
849		 * attach to any other type.
850		 */
851		error = g_getattr("MBR::type", cp, &i);
852		if (!error) {
853			if (i != 165 && flags == G_TF_NORMAL)
854				break;
855			error = g_getattr("MBR::offset", cp, &ms->mbroffset);
856			if (error)
857				break;
858		}
859
860		/* Same thing if we are inside a PC98 */
861		error = g_getattr("PC98::type", cp, &i);
862		if (!error) {
863			if (i != 0xc494 && flags == G_TF_NORMAL)
864				break;
865			error = g_getattr("PC98::offset", cp, &ms->mbroffset);
866			if (error)
867				break;
868		}
869
870		/* Get sector size, we need it to read data. */
871		secsize = cp->provider->sectorsize;
872		if (secsize < 512)
873			break;
874
875		/* First look for a label at the start of the second sector. */
876		error = g_bsd_try(gp, gsp, cp, secsize, ms, secsize);
877
878		/* Next, look for alpha labels */
879		if (error)
880			error = g_bsd_try(gp, gsp, cp, secsize, ms,
881			    ALPHA_LABEL_OFFSET);
882
883		/* If we didn't find a label, punt. */
884		if (error)
885			break;
886
887		/*
888		 * In order to avoid recursively attaching to the same
889		 * on-disk label (it's usually visible through the 'c'
890		 * partition) we calculate an MD5 and ask if other BSD's
891		 * below us love that label.  If they do, we don't.
892		 */
893
894		dl = &ms->inram;
895		MD5Init(&md5sum);
896		MD5Update(&md5sum, (u_char *)dl, sizeof(dl));
897		MD5Final(ms->labelsum, &md5sum);
898
899		error = g_getattr("BSD::labelsum", cp, &hash);
900		if (!error && !strncmp(ms->labelsum, hash, sizeof(hash)))
901			break;
902
903		/*
904		 * Process the found disklabel, and modify our "slice"
905		 * instance to match it, if possible.
906		 */
907		error = g_bsd_modify(gp, dl);	/* Picks up topology lock. */
908		if (!error)
909			g_topology_unlock();
910		break;
911	} while (0);
912
913	/* Success or failure, we can close our provider now. */
914	g_topology_lock();
915	error = g_access_rel(cp, -1, 0, 0);
916
917	/* If we have configured any providers, return the new geom. */
918	if (gsp->nprovider > 0)
919		return (gp);
920	/*
921	 * ...else push the "self-destruct" button, by spoiling our own
922	 * consumer.  This triggers a call to g_std_spoiled which will
923	 * dismantle what was setup.
924	 */
925	g_std_spoiled(cp);
926	return (NULL);
927}
928
929/* Finally, register with GEOM infrastructure. */
930static struct g_class g_bsd_class = {
931	.name = BSD_CLASS_NAME,
932	.taste = g_bsd_taste,
933	G_CLASS_INITIALIZER
934};
935
936DECLARE_GEOM_CLASS(g_bsd_class, g_bsd);
937