geom_bsd.c revision 109900
1163516Simp/*-
2163516Simp * Copyright (c) 2002 Poul-Henning Kamp
3163516Simp * Copyright (c) 2002 Networks Associates Technology, Inc.
4163516Simp * All rights reserved.
5163516Simp *
6163516Simp * This software was developed for the FreeBSD Project by Poul-Henning Kamp
7163516Simp * and NAI Labs, the Security Research Division of Network Associates, Inc.
8163516Simp * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
9163516Simp * DARPA CHATS research program.
10163516Simp *
11163516Simp * Redistribution and use in source and binary forms, with or without
12163516Simp * modification, are permitted provided that the following conditions
13163516Simp * are met:
14163516Simp * 1. Redistributions of source code must retain the above copyright
15163516Simp *    notice, this list of conditions and the following disclaimer.
16163516Simp * 2. Redistributions in binary form must reproduce the above copyright
17163516Simp *    notice, this list of conditions and the following disclaimer in the
18163516Simp *    documentation and/or other materials provided with the distribution.
19163516Simp * 3. The names of the authors may not be used to endorse or promote
20163516Simp *    products derived from this software without specific prior written
21163516Simp *    permission.
22163516Simp *
23163516Simp * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
24170002Simp * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25170002Simp * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26170002Simp * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
27170002Simp * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28170002Simp * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29170002Simp * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30170002Simp * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31170002Simp * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32170002Simp * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33170002Simp * SUCH DAMAGE.
34170002Simp *
35170002Simp * $FreeBSD: head/sys/geom/geom_bsd.c 109900 2003-01-26 21:54:36Z phk $
36170002Simp *
37170002Simp * This is the method for dealing with BSD disklabels.  It has been
38170002Simp * extensively (by my standards at least) commented, in the vain hope that
39170002Simp * it will serve as the source in future copy&paste operations.
40170002Simp */
41170002Simp
42170002Simp#include <sys/param.h>
43170002Simp#ifndef _KERNEL
44170002Simp#include <stdio.h>
45170002Simp#include <string.h>
46170002Simp#include <stdlib.h>
47170002Simp#include <signal.h>
48170002Simp#include <err.h>
49170002Simp#else
50170002Simp#include <sys/systm.h>
51163516Simp#include <sys/kernel.h>
52163516Simp#include <sys/conf.h>
53163516Simp#include <sys/bio.h>
54163516Simp#include <sys/malloc.h>
55163516Simp#include <sys/lock.h>
56163516Simp#include <sys/mutex.h>
57163516Simp#endif
58163516Simp#include <sys/stdint.h>
59163516Simp#include <sys/md5.h>
60163516Simp#include <sys/errno.h>
61163516Simp#include <sys/disklabel.h>
62163516Simp#include <geom/geom.h>
63163516Simp#include <geom/geom_slice.h>
64163516Simp
65163516Simp#define	BSD_CLASS_NAME "BSD"
66163516Simp
67163516Simp#define ALPHA_LABEL_OFFSET	64
68163516Simp
69163516Simp/*
70163516Simp * Our private data about one instance.  All the rest is handled by the
71163516Simp * slice code and stored in its softc, so this is just the stuff
72163516Simp * specific to BSD disklabels.
73163516Simp */
74163516Simpstruct g_bsd_softc {
75163516Simp	off_t	labeloffset;
76163516Simp	off_t	mbroffset;
77163516Simp	off_t	rawoffset;
78163516Simp	struct disklabel ondisk;
79163516Simp	struct disklabel inram;
80169567Simp	u_char	labelsum[16];
81163516Simp};
82163516Simp
/*
 * The next 4 functions isolate us from how the compiler lays out and pads
 * "struct disklabel".  We treat what we read from disk as a bytestream and
 * explicitly convert it into a struct disklabel.  This makes us compiler-,
 * endianness- and wordsize-agnostic.
 * For now we only have little-endian formats to deal with.
 */
90163516Simp
91163516Simpstatic void
92163516Simpg_bsd_ledec_partition(u_char *ptr, struct partition *d)
93163516Simp{
94163516Simp	d->p_size = g_dec_le4(ptr + 0);
95163516Simp	d->p_offset = g_dec_le4(ptr + 4);
96183774Simp	d->p_fsize = g_dec_le4(ptr + 8);
97183774Simp	d->p_fstype = ptr[12];
98183774Simp	d->p_frag = ptr[13];
99163516Simp	d->p_cpg = g_dec_le2(ptr + 14);
100163516Simp}
101163516Simp
102163516Simpstatic void
103163516Simpg_bsd_ledec_disklabel(u_char *ptr, struct disklabel *d)
104163516Simp{
105163516Simp	int i;
106163516Simp
107163516Simp	d->d_magic = g_dec_le4(ptr + 0);
108163516Simp	d->d_type = g_dec_le2(ptr + 4);
109163516Simp	d->d_subtype = g_dec_le2(ptr + 6);
110163516Simp	bcopy(ptr + 8, d->d_typename, 16);
111163516Simp	bcopy(ptr + 24, d->d_packname, 16);
112183704Smav	d->d_secsize = g_dec_le4(ptr + 40);
113183480Simp	d->d_nsectors = g_dec_le4(ptr + 44);
114163516Simp	d->d_ntracks = g_dec_le4(ptr + 48);
115163516Simp	d->d_ncylinders = g_dec_le4(ptr + 52);
116163516Simp	d->d_secpercyl = g_dec_le4(ptr + 56);
117163516Simp	d->d_secperunit = g_dec_le4(ptr + 60);
118163516Simp	d->d_sparespertrack = g_dec_le2(ptr + 64);
119163516Simp	d->d_sparespercyl = g_dec_le2(ptr + 66);
120163516Simp	d->d_acylinders = g_dec_le4(ptr + 68);
121183774Simp	d->d_rpm = g_dec_le2(ptr + 72);
122183774Simp	d->d_interleave = g_dec_le2(ptr + 74);
123183774Simp	d->d_trackskew = g_dec_le2(ptr + 76);
124163516Simp	d->d_cylskew = g_dec_le2(ptr + 78);
125163516Simp	d->d_headswitch = g_dec_le4(ptr + 80);
126163516Simp	d->d_trkseek = g_dec_le4(ptr + 84);
127163516Simp	d->d_flags = g_dec_le4(ptr + 88);
128163516Simp	d->d_drivedata[0] = g_dec_le4(ptr + 92);
129183774Simp	d->d_drivedata[1] = g_dec_le4(ptr + 96);
130183774Simp	d->d_drivedata[2] = g_dec_le4(ptr + 100);
131183774Simp	d->d_drivedata[3] = g_dec_le4(ptr + 104);
132183774Simp	d->d_drivedata[4] = g_dec_le4(ptr + 108);
133183774Simp	d->d_spare[0] = g_dec_le4(ptr + 112);
134183774Simp	d->d_spare[1] = g_dec_le4(ptr + 116);
135183774Simp	d->d_spare[2] = g_dec_le4(ptr + 120);
136184033Smav	d->d_spare[3] = g_dec_le4(ptr + 124);
137183774Simp	d->d_spare[4] = g_dec_le4(ptr + 128);
138183774Simp	d->d_magic2 = g_dec_le4(ptr + 132);
139183774Simp	d->d_checksum = g_dec_le2(ptr + 136);
140184033Smav	d->d_npartitions = g_dec_le2(ptr + 138);
141183774Simp	d->d_bbsize = g_dec_le4(ptr + 140);
142183774Simp	d->d_sbsize = g_dec_le4(ptr + 144);
143183774Simp	for (i = 0; i < MAXPARTITIONS; i++)
144183774Simp		g_bsd_ledec_partition(ptr + 148 + 16 * i, &d->d_partitions[i]);
145183774Simp}
146183774Simp
147183774Simpstatic void
148183774Simpg_bsd_leenc_partition(u_char *ptr, struct partition *d)
149183774Simp{
150183774Simp	g_enc_le4(ptr + 0, d->p_size);
151183774Simp	g_enc_le4(ptr + 4, d->p_offset);
152183805Smav	g_enc_le4(ptr + 8, d->p_fsize);
153183774Simp	ptr[12] = d->p_fstype;
154183774Simp	ptr[13] = d->p_frag;
155183774Simp	g_enc_le2(ptr + 14, d->p_cpg);
156183774Simp}
157183774Simp
158183774Simpstatic void
159183774Simpg_bsd_leenc_disklabel(u_char *ptr, struct disklabel *d)
160183774Simp{
161183774Simp	int i;
162163516Simp
163169567Simp	g_enc_le4(ptr + 0, d->d_magic);
164169567Simp	g_enc_le2(ptr + 4, d->d_type);
165172836Sjulian	g_enc_le2(ptr + 6, d->d_subtype);
166163516Simp	bcopy(d->d_typename, ptr + 8, 16);
167163516Simp	bcopy(d->d_packname, ptr + 24, 16);
168163516Simp	g_enc_le4(ptr + 40, d->d_secsize);
169163516Simp	g_enc_le4(ptr + 44, d->d_nsectors);
170163516Simp	g_enc_le4(ptr + 48, d->d_ntracks);
171163516Simp	g_enc_le4(ptr + 52, d->d_ncylinders);
172163516Simp	g_enc_le4(ptr + 56, d->d_secpercyl);
173169567Simp	g_enc_le4(ptr + 60, d->d_secperunit);
174169567Simp	g_enc_le2(ptr + 64, d->d_sparespertrack);
175169567Simp	g_enc_le2(ptr + 66, d->d_sparespercyl);
176169567Simp	g_enc_le4(ptr + 68, d->d_acylinders);
177169567Simp	g_enc_le2(ptr + 72, d->d_rpm);
178169567Simp	g_enc_le2(ptr + 74, d->d_interleave);
179169567Simp	g_enc_le2(ptr + 76, d->d_trackskew);
180169567Simp	g_enc_le2(ptr + 78, d->d_cylskew);
181169567Simp	g_enc_le4(ptr + 80, d->d_headswitch);
182169567Simp	g_enc_le4(ptr + 84, d->d_trkseek);
183169567Simp	g_enc_le4(ptr + 88, d->d_flags);
184169567Simp	g_enc_le4(ptr + 92, d->d_drivedata[0]);
185169567Simp	g_enc_le4(ptr + 96, d->d_drivedata[1]);
186169567Simp	g_enc_le4(ptr + 100, d->d_drivedata[2]);
187169567Simp	g_enc_le4(ptr + 104, d->d_drivedata[3]);
188169567Simp	g_enc_le4(ptr + 108, d->d_drivedata[4]);
189169567Simp	g_enc_le4(ptr + 112, d->d_spare[0]);
190169567Simp	g_enc_le4(ptr + 116, d->d_spare[1]);
191169567Simp	g_enc_le4(ptr + 120, d->d_spare[2]);
192169567Simp	g_enc_le4(ptr + 124, d->d_spare[3]);
193183467Simp	g_enc_le4(ptr + 128, d->d_spare[4]);
194163516Simp	g_enc_le4(ptr + 132, d->d_magic2);
195163516Simp	g_enc_le2(ptr + 136, d->d_checksum);
196163516Simp	g_enc_le2(ptr + 138, d->d_npartitions);
197163516Simp	g_enc_le4(ptr + 140, d->d_bbsize);
198163516Simp	g_enc_le4(ptr + 144, d->d_sbsize);
199183467Simp	for (i = 0; i < MAXPARTITIONS; i++)
200163516Simp		g_bsd_leenc_partition(ptr + 148 + 16 * i, &d->d_partitions[i]);
201163516Simp}
202163516Simp
203163516Simpstatic int
204163516Simpg_bsd_ondisk_size(void)
205183467Simp{
206163516Simp	return (148 + 16 * MAXPARTITIONS);
207163516Simp}
208163516Simp
209163516Simp/*
210163516Simp * For reasons which were valid and just in their days, FreeBSD/i386 uses
211163516Simp * absolute disk-addresses in disklabels.  The way it works is that the
212163516Simp * p_offset field of all partitions have the first sector number of the
213163516Simp * disk slice added to them.  This was hidden kernel-magic, userland did
214163516Simp * not see these offsets.  These two functions subtract and add them
215163516Simp * while converting from the "ondisk" to the "inram" labels and vice
216163516Simp * versa.
217163516Simp */
218163516Simpstatic void
219163516Simpondisk2inram(struct g_bsd_softc *sc)
220184033Smav{
221184033Smav	struct partition *ppp;
222184033Smav	struct disklabel *dl;
223184033Smav	int i;
224184033Smav
225184033Smav	sc->inram = sc->ondisk;
226184033Smav	dl = &sc->inram;
227184033Smav
228184033Smav	/* Basic sanity-check needed to avoid mistakes. */
229184033Smav	if (dl->d_magic != DISKMAGIC || dl->d_magic2 != DISKMAGIC)
230184033Smav		return;
231184033Smav	if (dl->d_npartitions > MAXPARTITIONS)
232184033Smav		return;
233184033Smav
234184033Smav	sc->rawoffset = dl->d_partitions[RAW_PART].p_offset;
235184033Smav	for (i = 0; i < dl->d_npartitions; i++) {
236184033Smav		ppp = &dl->d_partitions[i];
237184033Smav		if (ppp->p_size != 0 && ppp->p_offset < sc->rawoffset)
238184033Smav			sc->rawoffset = 0;
239184033Smav	}
240184033Smav	if (sc->rawoffset > 0) {
241184033Smav		for (i = 0; i < dl->d_npartitions; i++) {
242184033Smav			ppp = &dl->d_partitions[i];
243184033Smav			if (ppp->p_offset != 0)
244184033Smav				ppp->p_offset -= sc->rawoffset;
245184033Smav		}
246184033Smav	}
247184033Smav	dl->d_checksum = 0;
248184033Smav	dl->d_checksum = dkcksum(&sc->inram);
249184033Smav}
250184033Smav
251184033Smavstatic void
252184033Smavinram2ondisk(struct g_bsd_softc *sc)
253184033Smav{
254184033Smav	struct partition *ppp;
255184033Smav	int i;
256184033Smav
257184033Smav	sc->ondisk = sc->inram;
258184033Smav	if (sc->mbroffset != 0)
259184033Smav		sc->rawoffset = sc->mbroffset / sc->inram.d_secsize;
260184033Smav	if (sc->rawoffset != 0) {
261184033Smav		for (i = 0; i < sc->inram.d_npartitions; i++) {
262184033Smav			ppp = &sc->ondisk.d_partitions[i];
263184033Smav			if (ppp->p_size > 0)
264184033Smav				ppp->p_offset += sc->rawoffset;
265184033Smav			else
266184033Smav				ppp->p_offset = 0;
267184033Smav		}
268184033Smav	}
269184033Smav	sc->ondisk.d_checksum = 0;
270184033Smav	sc->ondisk.d_checksum = dkcksum(&sc->ondisk);
271184033Smav}
272184033Smav
273184033Smav/*
274184033Smav * Check that this looks like a valid disklabel, but be prepared
275184033Smav * to get any kind of junk.  The checksum must be checked only
276184033Smav * after this function returns success to prevent a bogus d_npartitions
277184033Smav * value from tripping us up.
278184033Smav */
279184033Smavstatic int
280184033Smavg_bsd_checklabel(struct disklabel *dl)
281184033Smav{
282184033Smav	struct partition *ppp;
283184033Smav	int i;
284184033Smav
285184033Smav	if (dl->d_magic != DISKMAGIC || dl->d_magic2 != DISKMAGIC)
286184033Smav		return (EINVAL);
287184033Smav	/*
288184033Smav	 * If the label specifies more partitions than we can handle
289184033Smav	 * we have to reject it:  If people updated the label they would
290184033Smav	 * trash it, and that would break the checksum.
291184033Smav	 */
292184033Smav	if (dl->d_npartitions > MAXPARTITIONS)
293184033Smav		return (EINVAL);
294184033Smav
295184033Smav	for (i = 0; i < dl->d_npartitions; i++) {
296184033Smav		ppp = &dl->d_partitions[i];
297184033Smav		/* Cannot extend past unit. */
298184033Smav		if (ppp->p_size != 0 &&
299184033Smav		     ppp->p_offset + ppp->p_size > dl->d_secperunit) {
300184033Smav			return (EINVAL);
301184033Smav		}
302184033Smav	}
303184033Smav	return (0);
304184033Smav}
305184033Smav
306184033Smav/*
307184033Smav * Modify our slicer to match proposed disklabel, if possible.
308184033Smav * First carry out all the simple checks, then lock topology
309184033Smav * and check that no open providers are affected negatively
310184033Smav * then carry out all the changes.
311184033Smav *
312184033Smav * NB: Returns with topology held only if successful return.
313184033Smav */
314184033Smavstatic int
315184033Smavg_bsd_modify(struct g_geom *gp, struct disklabel *dl)
316184033Smav{
317184033Smav	int i, error;
318184033Smav	struct partition *ppp;
319184033Smav	struct g_slicer *gsp;
320184033Smav	struct g_consumer *cp;
321184033Smav	u_int secsize, u;
322184033Smav	off_t mediasize;
323184033Smav
324184033Smav	/* Basic check that this is indeed a disklabel. */
325184033Smav	error = g_bsd_checklabel(dl);
326184033Smav	if (error)
327184033Smav		return (error);
328184033Smav
329184033Smav	/* Make sure the checksum is OK. */
330184033Smav	if (dkcksum(dl) != 0)
331184033Smav		return (EINVAL);
332184033Smav
333184033Smav	/* Get dimensions of our device. */
334184033Smav	cp = LIST_FIRST(&gp->consumer);
335184033Smav	secsize = cp->provider->sectorsize;
336184033Smav	mediasize = cp->provider->mediasize;
337184033Smav
338184033Smav#ifdef nolonger
339184033Smav	/*
340184033Smav	 * The raw-partition must start at zero.  We do not check that the
341184033Smav	 * size == mediasize because this is overly restrictive.  We have
342184033Smav	 * already tested in g_bsd_checklabel() that it is not longer.
343184033Smav	 * XXX: RAW_PART is archaic anyway, and we should drop it.
344184033Smav	 */
345184033Smav	if (dl->d_partitions[RAW_PART].p_offset != 0)
346184033Smav		return (EINVAL);
347184033Smav#endif
348184033Smav
349184033Smav#ifdef notyet
350184033Smav	/*
351184033Smav	 * Indications are that the d_secperunit is not correctly
352184033Smav	 * initialized in many cases, and since we don't need it
353184033Smav	 * for anything, we dont strictly need this test.
354184033Smav	 * Preemptive action to avoid confusing people in disklabel(8)
355184033Smav	 * may be in order.
356184033Smav	 */
357184033Smav	/* The label cannot claim a larger size than the media. */
358184033Smav	if ((off_t)dl->d_secperunit * dl->d_secsize > mediasize)
359184033Smav		return (EINVAL);
360184033Smav#endif
361184033Smav
362184033Smav
363184033Smav	/* ... or a smaller sector size. */
364184033Smav	if (dl->d_secsize < secsize)
365163516Simp		return (EINVAL);
366163516Simp
367163516Simp	/* ... or a non-multiple sector size. */
368163516Simp	if (dl->d_secsize % secsize != 0)
369163516Simp		return (EINVAL);
370163516Simp
371163516Simp	g_topology_lock();
372163516Simp
373163516Simp	/* Don't munge open partitions. */
374163516Simp	gsp = gp->softc;
375169567Simp	for (i = 0; i < dl->d_npartitions; i++) {
376163516Simp		ppp = &dl->d_partitions[i];
377163516Simp
378163516Simp		error = g_slice_config(gp, i, G_SLICE_CONFIG_CHECK,
379163516Simp		    (off_t)ppp->p_offset * dl->d_secsize,
380163516Simp		    (off_t)ppp->p_size * dl->d_secsize,
381169567Simp		     dl->d_secsize,
382169567Simp		    "%s%c", gp->name, 'a' + i);
383169567Simp		if (error) {
384163516Simp			g_topology_unlock();
385169567Simp			return (error);
386169567Simp		}
387183448Simp	}
388183448Simp
389183448Simp	/* Look good, go for it... */
390183448Simp	for (u = 0; u < gsp->nslice; u++) {
391183448Simp		ppp = &dl->d_partitions[u];
392183448Simp		g_slice_config(gp, u, G_SLICE_CONFIG_SET,
393183448Simp		    (off_t)ppp->p_offset * dl->d_secsize,
394163516Simp		    (off_t)ppp->p_size * dl->d_secsize,
395163516Simp		     dl->d_secsize,
396184033Smav		    "%s%c", gp->name, 'a' + u);
397163516Simp	}
398184033Smav	return (0);
399184033Smav}
400184033Smav
401184033Smav/*
402184033Smav * Calculate a disklabel checksum for a little-endian byte-stream.
403184033Smav * We need access to the decoded disklabel because the checksum only
404184033Smav * covers the partition data for the first d_npartitions.
405163516Simp */
406163516Simpstatic int
407183480Simpg_bsd_lesum(struct disklabel *dl, u_char *p)
408183480Simp{
409183480Simp	u_char *pe;
410183480Simp	uint16_t sum;
411183480Simp
412163516Simp	pe = p + 148 + 16 * dl->d_npartitions;
413163516Simp	sum = 0;
414169567Simp	while (p < pe) {
415169567Simp		sum ^= g_dec_le2(p);
416169567Simp		p += 2;
417169567Simp	}
418169567Simp	return (sum);
419169567Simp}
420169567Simp
421172836Sjulian/*
422163516Simp * This is an internal helper function, called multiple times from the taste
423163516Simp * function to try to locate a disklabel on the disk.  More civilized formats
424183774Simp * will not need this, as there is only one possible place on disk to look
425183774Simp * for the magic spot.
426183774Simp */
427183774Simp
428183774Simpstatic int
429183774Simpg_bsd_try(struct g_geom *gp, struct g_slicer *gsp, struct g_consumer *cp, int secsize, struct g_bsd_softc *ms, off_t offset)
430183774Simp{
431183774Simp	int error;
432183774Simp	u_char *buf;
433183774Simp	struct disklabel *dl;
434183774Simp	off_t secoff;
435183774Simp
436183774Simp	/*
437183774Simp	 * We need to read entire aligned sectors, and we assume that the
438183774Simp	 * disklabel does not span sectors, so one sector is enough.
439183774Simp	 */
440183774Simp	error = 0;
441183774Simp	secoff = offset % secsize;
442183774Simp	buf = g_read_data(cp, offset - secoff, secsize, &error);
443183774Simp	if (buf == NULL || error != 0)
444163516Simp		return (ENOENT);
445163516Simp
446163516Simp	/* Decode into our native format. */
447163516Simp	dl = &ms->ondisk;
448163516Simp	g_bsd_ledec_disklabel(buf + secoff, dl);
449163516Simp
450163516Simp	ondisk2inram(ms);
451163516Simp
452163516Simp	dl = &ms->inram;
453163516Simp	/* Does it look like a label at all? */
454163516Simp	if (g_bsd_checklabel(dl))
455163516Simp		error = ENOENT;
456163516Simp	/* ... and does the raw data have a good checksum? */
457163516Simp	if (error == 0 && g_bsd_lesum(dl, buf + secoff) != 0)
458163516Simp		error = ENOENT;
459
460	/* Remember to free the buffer g_read_data() gave us. */
461	g_free(buf);
462
463	/* If we had a label, record it properly. */
464	if (error == 0) {
465		gsp->frontstuff = 16 * secsize;	/* XXX */
466		ms->labeloffset = offset;
467		g_topology_lock();
468		g_slice_conf_hot(gp, 0, offset, g_bsd_ondisk_size());
469		g_topology_unlock();
470	}
471	return (error);
472}
473
474/*
475 * Implement certain ioctls to modify disklabels with.  This function
476 * is called by the event handler thread with topology locked as result
477 * of the g_call_me() in g_bsd_start().  It is not necessary to keep
478 * topology locked all the time but make sure to return with topology
479 * locked as well.
480 */
481
482static void
483g_bsd_ioctl(void *arg)
484{
485	struct bio *bp;
486	struct g_geom *gp;
487	struct g_slicer *gsp;
488	struct g_bsd_softc *ms;
489	struct disklabel *dl;
490	struct g_ioctl *gio;
491	struct g_consumer *cp;
492	u_char *buf;
493	off_t secoff;
494	u_int secsize;
495	int error, i;
496	uint64_t sum;
497
498	/* We don't need topology for now. */
499	g_topology_unlock();
500
501	/* Get hold of the interesting bits from the bio. */
502	bp = arg;
503	gp = bp->bio_to->geom;
504	gsp = gp->softc;
505	ms = gsp->softc;
506	gio = (struct g_ioctl *)bp->bio_data;
507
508	/* The disklabel to set is the ioctl argument. */
509	dl = gio->data;
510
511	/* Validate and modify our slice instance to match. */
512	error = g_bsd_modify(gp, dl);	/* Picks up topology lock on success. */
513	if (error) {
514		g_topology_lock();
515		g_io_deliver(bp, error);
516		return;
517	}
518	/* Update our copy of the disklabel. */
519	ms->inram = *dl;
520	inram2ondisk(ms);
521
522	if (gio->cmd == DIOCSDINFO) {
523		g_io_deliver(bp, 0);
524		return;
525	}
526	KASSERT(gio->cmd == DIOCWDINFO, ("Unknown ioctl in g_bsd_ioctl"));
527	cp = LIST_FIRST(&gp->consumer);
528	/* Get sector size, we need it to read data. */
529	secsize = cp->provider->sectorsize;
530	secoff = ms->labeloffset % secsize;
531	buf = g_read_data(cp, ms->labeloffset - secoff, secsize, &error);
532	if (buf == NULL || error != 0) {
533		g_io_deliver(bp, error);
534		return;
535	}
536	dl = &ms->ondisk;
537	g_bsd_leenc_disklabel(buf + secoff, dl);
538	if (ms->labeloffset == ALPHA_LABEL_OFFSET) {
539		sum = 0;
540		for (i = 0; i < 63; i++)
541			sum += g_dec_le8(buf + i * 8);
542		g_enc_le8(buf + 504, sum);
543	}
544	error = g_write_data(cp, ms->labeloffset - secoff, buf, secsize);
545	g_free(buf);
546	g_io_deliver(bp, error);
547}
548
549/*
550 * Rewrite the bootblock, which is BBSIZE bytes from the start of the disk.
551 * We punch down the disklabel where we expect it to be before writing.
552 */
553static int
554g_bsd_diocbsdbb(dev_t dev, u_long cmd, caddr_t data, int fflag, struct thread *td)
555{
556	struct g_geom *gp;
557	struct g_slicer *gsp;
558	struct g_bsd_softc *ms;
559	struct disklabel *dl;
560	struct g_consumer *cp;
561	u_char *buf;
562	void *p;
563	u_int secsize;
564	int error, i;
565	uint64_t sum;
566
567	/* Get hold of the interesting bits from the bio. */
568	gp = (void *)dev;
569	gsp = gp->softc;
570	ms = gsp->softc;
571
572	/* The disklabel to set is the ioctl argument. */
573	buf = g_malloc(BBSIZE, 0);
574	p = *(void **)data;
575	error = copyin(p, buf, BBSIZE);
576	if (error) {
577		g_free(buf);
578		return (error);
579	}
580	/* The disklabel to set is the ioctl argument. */
581	dl = (void *)(buf + ms->labeloffset);
582
583	DROP_GIANT();
584
585	/* Validate and modify our slice instance to match. */
586	error = g_bsd_modify(gp, dl);	/* Picks up topology lock on success. */
587	if (!error) {
588		cp = LIST_FIRST(&gp->consumer);
589		secsize = cp->provider->sectorsize;
590		dl = &ms->ondisk;
591		g_bsd_leenc_disklabel(buf + ms->labeloffset, dl);
592		if (ms->labeloffset == ALPHA_LABEL_OFFSET) {
593			sum = 0;
594			for (i = 0; i < 63; i++)
595				sum += g_dec_le8(buf + i * 8);
596			g_enc_le8(buf + 504, sum);
597		}
598		error = g_write_data(cp, 0, buf, BBSIZE);
599		g_topology_unlock();
600	}
601	g_free(buf);
602	PICKUP_GIANT();
603	return (error);
604}
605
606/*
607 * If the user tries to overwrite our disklabel through an open partition
608 * or via a magicwrite config call, we end up here and try to prevent
609 * footshooting as best we can.
610 */
611static void
612g_bsd_hotwrite(void *arg)
613{
614	struct bio *bp;
615	struct g_geom *gp;
616	struct g_slicer *gsp;
617	struct g_slice *gsl;
618	struct g_bsd_softc *ms;
619	struct g_bsd_softc fake;
620	u_char *p;
621	int error;
622
623	bp = arg;
624	gp = bp->bio_to->geom;
625	gsp = gp->softc;
626	ms = gsp->softc;
627	gsl = &gsp->slices[bp->bio_to->index];
628	p = (u_char*)bp->bio_data + ms->labeloffset
629	    - (bp->bio_offset + gsl->offset);
630	g_bsd_ledec_disklabel(p, &fake.ondisk);
631
632	ondisk2inram(&fake);
633	if (g_bsd_checklabel(&fake.inram)) {
634		g_io_deliver(bp, EPERM);
635		return;
636	}
637	if (g_bsd_lesum(&fake.ondisk, p) != 0) {
638		g_io_deliver(bp, EPERM);
639		return;
640	}
641	g_topology_unlock();
642	error = g_bsd_modify(gp, &fake.inram);	/* May pick up topology. */
643	if (error) {
644		g_io_deliver(bp, EPERM);
645		g_topology_lock();
646		return;
647	}
648	/* Update our copy of the disklabel. */
649	ms->inram = fake.inram;
650	inram2ondisk(ms);
651	g_bsd_leenc_disklabel(p, &ms->ondisk);
652	g_slice_finish_hot(bp);
653}
654
655/*-
656 * This start routine is only called for non-trivial requests, all the
657 * trivial ones are handled autonomously by the slice code.
658 * For requests we handle here, we must call the g_io_deliver() on the
659 * bio, and return non-zero to indicate to the slice code that we did so.
660 * This code executes in the "DOWN" I/O path, this means:
661 *    * No sleeping.
662 *    * Don't grab the topology lock.
663 *    * Don't call biowait, g_getattr(), g_setattr() or g_read_data()
664 */
665
666static int
667g_bsd_start(struct bio *bp)
668{
669	struct g_geom *gp;
670	struct g_bsd_softc *ms;
671	struct g_slicer *gsp;
672	struct g_ioctl *gio;
673	int error;
674
675	gp = bp->bio_to->geom;
676	gsp = gp->softc;
677	ms = gsp->softc;
678	switch(bp->bio_cmd) {
679	case BIO_READ:
680		/* We allow reading of our hot spots */
681		return (0);
682	case BIO_DELETE:
683		/* We do not allow deleting our hot spots */
684		return (EPERM);
685	case BIO_WRITE:
686		g_call_me(g_bsd_hotwrite, bp);
687		return (EJUSTRETURN);
688	case BIO_GETATTR:
689		if (g_handleattr(bp, "BSD::labelsum", ms->labelsum,
690		    sizeof(ms->labelsum)))
691			return (1);
692		break;
693	case BIO_SETATTR:
694		break;
695	default:
696		KASSERT(0 == 1, ("Unknown bio_cmd in g_bsd_start (%d)",
697		    bp->bio_cmd));
698	}
699
700	/* We only handle ioctl(2) requests of the right format. */
701	if (strcmp(bp->bio_attribute, "GEOM::ioctl"))
702		return (0);
703	else if (bp->bio_length != sizeof(*gio))
704		return (0);
705
706	/* Get hold of the ioctl parameters. */
707	gio = (struct g_ioctl *)bp->bio_data;
708
709	switch (gio->cmd) {
710	case DIOCGDINFO:
711		/* Return a copy of the disklabel to userland. */
712		bcopy(&ms->inram, gio->data, sizeof(ms->inram));
713		g_io_deliver(bp, 0);
714		return (1);
715	case DIOCBSDBB:
716		gio->func = g_bsd_diocbsdbb;
717		gio->dev = (void *)gp;
718		g_io_deliver(bp, EDIRIOCTL);
719		return (1);
720	case DIOCSDINFO:
721	case DIOCWDINFO:
722		/*
723		 * These we cannot do without the topology lock and some
724		 * some I/O requests.  Ask the event-handler to schedule
725		 * us in a less restricted environment.
726		 */
727		error = g_call_me(g_bsd_ioctl, bp);
728		if (error)
729			g_io_deliver(bp, error);
730		/*
731		 * We must return non-zero to indicate that we will deal
732		 * with this bio, even though we have not done so yet.
733		 */
734		return (1);
735	default:
736		return (0);
737	}
738}
739
740/*
741 * Dump configuration information in XML format.
742 * Notice that the function is called once for the geom and once for each
743 * consumer and provider.  We let g_slice_dumpconf() do most of the work.
744 */
745static void
746g_bsd_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp)
747{
748	struct g_bsd_softc *ms;
749	struct g_slicer *gsp;
750
751	gsp = gp->softc;
752	ms = gsp->softc;
753	g_slice_dumpconf(sb, indent, gp, cp, pp);
754	if (indent != NULL && pp == NULL && cp == NULL) {
755		sbuf_printf(sb, "%s<labeloffset>%jd</labeloffset>\n",
756		    indent, (intmax_t)ms->labeloffset);
757		sbuf_printf(sb, "%s<rawoffset>%jd</rawoffset>\n",
758		    indent, (intmax_t)ms->rawoffset);
759		sbuf_printf(sb, "%s<mbroffset>%jd</mbroffset>\n",
760		    indent, (intmax_t)ms->mbroffset);
761	} else if (pp != NULL) {
762		if (indent == NULL)
763			sbuf_printf(sb, " ty %d",
764			    ms->inram.d_partitions[pp->index].p_fstype);
765		else
766			sbuf_printf(sb, "%s<type>%d</type>\n", indent,
767			    ms->inram.d_partitions[pp->index].p_fstype);
768	}
769}
770
771/*
772 * The taste function is called from the event-handler, with the topology
773 * lock already held and a provider to examine.  The flags are unused.
774 *
775 * If flags == G_TF_NORMAL, the idea is to take a bite of the provider and
776 * if we find valid, consistent magic on it, build a geom on it.
777 * any magic bits which indicate that we should automatically put a BSD
778 * geom on it.
779 *
780 * There may be cases where the operator would like to put a BSD-geom on
781 * providers which do not meet all of the requirements.  This can be done
782 * by instead passing the G_TF_INSIST flag, which will override these
783 * checks.
784 *
785 * The final flags value is G_TF_TRANSPARENT, which instructs the method
786 * to put a geom on top of the provider and configure it to be as transparent
787 * as possible.  This is not really relevant to the BSD method and therefore
788 * not implemented here.
789 */
790
791static struct g_geom *
792g_bsd_taste(struct g_class *mp, struct g_provider *pp, int flags)
793{
794	struct g_geom *gp;
795	struct g_consumer *cp;
796	int error, i;
797	struct g_bsd_softc *ms;
798	struct disklabel *dl;
799	u_int secsize;
800	struct g_slicer *gsp;
801	MD5_CTX md5sum;
802	u_char hash[16];
803
804	g_trace(G_T_TOPOLOGY, "bsd_taste(%s,%s)", mp->name, pp->name);
805	g_topology_assert();
806
807	/* We don't implement transparent inserts. */
808	if (flags == G_TF_TRANSPARENT)
809		return (NULL);
810
811	/*
812	 * BSD labels are a subclass of the general "slicing" topology so
813	 * a lot of the work can be done by the common "slice" code.
814	 * Create a geom with space for MAXPARTITIONS providers, one consumer
815	 * and a softc structure for us.  Specify the provider to attach
816	 * the consumer to and our "start" routine for special requests.
817	 * The provider is opened with mode (1,0,0) so we can do reads
818	 * from it.
819	 */
820	gp = g_slice_new(mp, MAXPARTITIONS, pp, &cp, &ms,
821	     sizeof(*ms), g_bsd_start);
822	if (gp == NULL)
823		return (NULL);
824
825	/*
826	 * Now that we have attached to and opened our provider, we do
827	 * not need the topology lock until we change the topology again
828	 * next time.
829	 */
830	g_topology_unlock();
831
832	/*
833	 * Fill in the optional details, in our case we have a dumpconf
834	 * routine which the "slice" code should call at the right time
835	 */
836	gp->dumpconf = g_bsd_dumpconf;
837
838	/* Get the geom_slicer softc from the geom. */
839	gsp = gp->softc;
840
841	/*
842	 * The do...while loop here allows us to have multiple escapes
843	 * using a simple "break".  This improves code clarity without
844	 * ending up in deep nesting and without using goto or come from.
845	 */
846	do {
847		/*
848		 * If the provider is an MBR we will only auto attach
849		 * to type 165 slices in the G_TF_NORMAL case.  We will
850		 * attach to any other type.
851		 */
852		error = g_getattr("MBR::type", cp, &i);
853		if (!error) {
854			if (i != 165 && flags == G_TF_NORMAL)
855				break;
856			error = g_getattr("MBR::offset", cp, &ms->mbroffset);
857			if (error)
858				break;
859		}
860
861		/* Same thing if we are inside a PC98 */
862		error = g_getattr("PC98::type", cp, &i);
863		if (!error) {
864			if (i != 0xc494 && flags == G_TF_NORMAL)
865				break;
866			error = g_getattr("PC98::offset", cp, &ms->mbroffset);
867			if (error)
868				break;
869		}
870
871		/* Get sector size, we need it to read data. */
872		secsize = cp->provider->sectorsize;
873		if (secsize < 512)
874			break;
875
876		/* First look for a label at the start of the second sector. */
877		error = g_bsd_try(gp, gsp, cp, secsize, ms, secsize);
878
879		/* Next, look for alpha labels */
880		if (error)
881			error = g_bsd_try(gp, gsp, cp, secsize, ms,
882			    ALPHA_LABEL_OFFSET);
883
884		/* If we didn't find a label, punt. */
885		if (error)
886			break;
887
888		/*
889		 * In order to avoid recursively attaching to the same
890		 * on-disk label (it's usually visible through the 'c'
891		 * partition) we calculate an MD5 and ask if other BSD's
892		 * below us love that label.  If they do, we don't.
893		 */
894
895		dl = &ms->inram;
896		MD5Init(&md5sum);
897		MD5Update(&md5sum, (u_char *)dl, sizeof(dl));
898		MD5Final(ms->labelsum, &md5sum);
899
900		error = g_getattr("BSD::labelsum", cp, &hash);
901		if (!error && !strncmp(ms->labelsum, hash, sizeof(hash)))
902			break;
903
904		/*
905		 * Process the found disklabel, and modify our "slice"
906		 * instance to match it, if possible.
907		 */
908		error = g_bsd_modify(gp, dl);	/* Picks up topology lock. */
909		if (!error)
910			g_topology_unlock();
911		break;
912	} while (0);
913
914	/* Success or failure, we can close our provider now. */
915	g_topology_lock();
916	error = g_access_rel(cp, -1, 0, 0);
917
918	/* If we have configured any providers, return the new geom. */
919	if (gsp->nprovider > 0)
920		return (gp);
921	/*
922	 * ...else push the "self-destruct" button, by spoiling our own
923	 * consumer.  This triggers a call to g_std_spoiled which will
924	 * dismantle what was setup.
925	 */
926	g_std_spoiled(cp);
927	return (NULL);
928}
929
930/* Finally, register with GEOM infrastructure. */
931static struct g_class g_bsd_class = {
932	BSD_CLASS_NAME,
933	g_bsd_taste,
934	NULL,
935	G_CLASS_INITIALIZER
936};
937
938DECLARE_GEOM_CLASS(g_bsd_class, g_bsd);
939