subr_disk.c revision 105365
1/*
2 * ----------------------------------------------------------------------------
3 * "THE BEER-WARE LICENSE" (Revision 42):
4 * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
5 * can do whatever you want with this stuff. If we meet some day, and you think
6 * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
7 * ----------------------------------------------------------------------------
8 *
9 * $FreeBSD: head/sys/kern/subr_disk.c 105365 2002-10-17 23:48:29Z sobomax $
10 *
11 */
12
13#include "opt_geom.h"
14
15#include <sys/param.h>
16#include <sys/systm.h>
17#include <sys/stdint.h>
18#include <sys/bio.h>
19#include <sys/conf.h>
20#include <sys/disk.h>
21#include <sys/diskslice.h>
22#include <sys/disklabel.h>
23#ifdef NO_GEOM
24#include <sys/kernel.h>
25#include <sys/sysctl.h>
26#include <sys/malloc.h>
27#include <sys/sysctl.h>
28#include <machine/md_var.h>
29#include <sys/ctype.h>
30
31static MALLOC_DEFINE(M_DISK, "disk", "disk data");
32
33static d_strategy_t diskstrategy;
34static d_open_t diskopen;
35static d_close_t diskclose;
36static d_ioctl_t diskioctl;
37static d_psize_t diskpsize;
38
39static LIST_HEAD(, disk) disklist = LIST_HEAD_INITIALIZER(&disklist);
40
void disk_dev_synth(dev_t dev);

/*
 * Synthesize the named dev_t for a slice/partition of a managed disk,
 * given only its major/minor encoding.  NOTE(review): presumably called
 * when a device node is referenced by number before ever being opened
 * by name — confirm against callers.  The new dev_t is made dependent
 * on the whole-disk device so it is destroyed along with it.
 */
void
disk_dev_synth(dev_t dev)
{
	struct disk *dp;
	int u, s, p;	/* unit, slice, partition decoded from the minor */
	dev_t pdev;	/* whole-disk ("raw") device of the same unit */

	/* A minor with spare bits set can never name a disk. */
	if (dksparebits(dev))
		return;
	LIST_FOREACH(dp, &disklist, d_list) {
		if (major(dev) != dp->d_devsw->d_maj)
			continue;
		u = dkunit(dev);
		p = RAW_PART;
		s = WHOLE_DISK_SLICE;
		pdev = makedev(dp->d_devsw->d_maj, dkmakeminor(u, s, p));
		if (pdev->si_devsw == NULL)
			return;		/* Probably a unit we don't have */
		s = dkslice(dev);
		p = dkpart(dev);
		if (s == WHOLE_DISK_SLICE && p == RAW_PART) {
			/* XXX: actually should not happen */
			dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
			    UID_ROOT, GID_OPERATOR, 0640, "%s%d",
				dp->d_devsw->d_name, u);
			dev_depends(pdev, dev);
			return;
		}
		if (s == COMPATIBILITY_SLICE) {
			/* Old-style unsliced name, e.g. "ad0a". */
			dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
			    UID_ROOT, GID_OPERATOR, 0640, "%s%d%c",
				dp->d_devsw->d_name, u, 'a' + p);
			dev_depends(pdev, dev);
			return;
		}
		if (p != RAW_PART) {
			/* Sliced partition, e.g. "ad0s1a". */
			dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
			    UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d%c",
				dp->d_devsw->d_name, u, s - BASE_SLICE + 1,
				'a' + p);
		} else {
			/* Raw slice "ad0s1"; alias the traditional "ad0s1c". */
			dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
			    UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d",
				dp->d_devsw->d_name, u, s - BASE_SLICE + 1);
			make_dev_alias(dev, "%s%ds%dc",
			    dp->d_devsw->d_name, u, s - BASE_SLICE + 1);
		}
		dev_depends(pdev, dev);
		return;
	}
}
94
/*
 * dev_clone event handler: resolve a looked-up device name of the form
 * "<driver><unit>", "<driver><unit><part>", "<driver><unit>s<slice>" or
 * "<driver><unit>s<slice><part>" to a newly created dev_t, if the name
 * belongs to a registered disk.  Leaves *dev untouched when the name
 * does not match (or was already resolved by an earlier handler).
 */
static void
disk_clone(void *arg, char *name, int namelen, dev_t *dev)
{
	struct disk *dp;
	char const *d;	/* driver name currently being matched */
	char *e;	/* unparsed tail of the requested name */
	int j, u, s, p;
	dev_t pdev;

	if (*dev != NODEV)
		return;

	LIST_FOREACH(dp, &disklist, d_list) {
		d = dp->d_devsw->d_name;
		/* Match "<driver><unit>"; j is the number of fields parsed. */
		j = dev_stdclone(name, &e, d, &u);
		if (j == 0)
			continue;
		if (u > DKMAXUNIT)
			continue;
		p = RAW_PART;
		s = WHOLE_DISK_SLICE;
		/* Only clone for units that actually have a disk attached. */
		pdev = makedev(dp->d_devsw->d_maj, dkmakeminor(u, s, p));
		if (pdev->si_disk == NULL)
			continue;
		if (*e != '\0') {
			/* Optional "s<n>" slice, then optional 'a'..'h'. */
			j = dev_stdclone(e, &e, "s", &s);
			if (j == 0)
				s = COMPATIBILITY_SLICE;
			else if (j == 1 || j == 2)
				s += BASE_SLICE - 1;
			if (!*e)
				;		/* ad0s1 case */
			else if (e[1] != '\0')
				return;		/* can never be a disk name */
			else if (*e < 'a' || *e > 'h')
				return;		/* can never be a disk name */
			else
				p = *e - 'a';
		}
		if (s == WHOLE_DISK_SLICE && p == RAW_PART) {
			/* The raw device already exists; nothing to clone. */
			return;
		} else if (s >= BASE_SLICE && p != RAW_PART) {
			*dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
			    UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d%c",
			    pdev->si_devsw->d_name, u, s - BASE_SLICE + 1,
			    p + 'a');
		} else if (s >= BASE_SLICE) {
			*dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
			    UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d",
			    pdev->si_devsw->d_name, u, s - BASE_SLICE + 1);
			make_dev_alias(*dev, "%s%ds%dc",
			    pdev->si_devsw->d_name, u, s - BASE_SLICE + 1);
		} else {
			*dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
			    UID_ROOT, GID_OPERATOR, 0640, "%s%d%c",
			    pdev->si_devsw->d_name, u, p + 'a');
		}
		dev_depends(pdev, *dev);
		return;
	}
}
156
157static void
158inherit_raw(dev_t pdev, dev_t dev)
159{
160	dev->si_disk = pdev->si_disk;
161	dev->si_drv1 = pdev->si_drv1;
162	dev->si_drv2 = pdev->si_drv2;
163	dev->si_iosize_max = pdev->si_iosize_max;
164	dev->si_bsize_phys = pdev->si_bsize_phys;
165	dev->si_bsize_best = pdev->si_bsize_best;
166}
167
/*
 * Register a disk with the disk layer and create its whole-disk dev_t.
 * dp is caller-supplied storage which is zeroed here; proto is a
 * driver-supplied cdevsw that is rewritten (once per driver) to route
 * through the disk layer entry points below.  Returns the new dev_t.
 */
dev_t
disk_create(int unit, struct disk *dp, int flags, struct cdevsw *cdevsw, struct cdevsw *proto)
{
	static int once;
	dev_t dev;

	/* Hook dev_clone the first time any disk is created. */
	if (!once) {
		EVENTHANDLER_REGISTER(dev_clone, disk_clone, 0, 1000);
		once++;
	}

	/* Caller may hand us recycled storage; start from a clean slate. */
	bzero(dp, sizeof(*dp));
	dp->d_label = malloc(sizeof *dp->d_label, M_DEVBUF, M_WAITOK|M_ZERO);

	/*
	 * Initialize the shadow cdevsw once per driver: disk layer entry
	 * points in front, everything else copied from the driver's own.
	 */
	if (proto->d_open != diskopen) {
		*proto = *cdevsw;
		proto->d_open = diskopen;
		proto->d_close = diskclose;
		proto->d_ioctl = diskioctl;
		proto->d_strategy = diskstrategy;
		proto->d_psize = diskpsize;
	}

	if (bootverbose)
		printf("Creating DISK %s%d\n", cdevsw->d_name, unit);
	dev = make_dev(proto, dkmakeminor(unit, WHOLE_DISK_SLICE, RAW_PART),
	    UID_ROOT, GID_OPERATOR, 0640, "%s%d", cdevsw->d_name, unit);

	dev->si_disk = dp;
	dp->d_dev = dev;
	dp->d_dsflags = flags;
	dp->d_devsw = cdevsw;
	LIST_INSERT_HEAD(&disklist, dp, d_list);

	return (dev);
}
204
/*
 * Configure (onoff != 0) or tear down (onoff == 0) kernel crash dumps
 * to the partition named by dev.  Returns 0 or an errno.
 */
static int
diskdumpconf(u_int onoff, dev_t dev, struct disk *dp)
{
	struct dumperinfo di;
	struct disklabel *dl;

	if (!onoff)
		return(set_dumper(NULL));
	/* Dumping needs a label to locate the partition. */
	dl = dsgetlabel(dev, dp->d_slice);
	if (!dl)
		return (ENXIO);
	bzero(&di, sizeof di);
	di.dumper = (dumper_t *)dp->d_devsw->d_dump;
	di.priv = dp->d_dev;
	di.blocksize = dl->d_secsize;
	/* Byte offset of the partition: slice offset + partition offset. */
	di.mediaoffset = (off_t)(dl->d_partitions[dkpart(dev)].p_offset +
	    dp->d_slice->dss_slices[dkslice(dev)].ds_offset) * DEV_BSIZE;
	di.mediasize =
	    (off_t)(dl->d_partitions[dkpart(dev)].p_size) * DEV_BSIZE;
	/* Refuse a zero-length dump partition. */
	if (di.mediasize == 0)
		return (EINVAL);
	return(set_dumper(&di));
}
228
229void
230disk_invalidate (struct disk *disk)
231{
232	if (disk->d_slice)
233		dsgone(&disk->d_slice);
234}
235
236void
237disk_destroy(dev_t dev)
238{
239	LIST_REMOVE(dev->si_disk, d_list);
240	free(dev->si_disk->d_label, M_DEVBUF);
241	bzero(dev->si_disk, sizeof(*dev->si_disk));
242    	dev->si_disk = NULL;
243	destroy_dev(dev);
244	return;
245}
246
247struct disk *
248disk_enumerate(struct disk *disk)
249{
250	if (!disk)
251		return (LIST_FIRST(&disklist));
252	else
253		return (LIST_NEXT(disk, d_list));
254}
255
256static int
257sysctl_disks(SYSCTL_HANDLER_ARGS)
258{
259	struct disk *disk;
260	int error, first;
261
262	disk = NULL;
263	first = 1;
264
265	while ((disk = disk_enumerate(disk))) {
266		if (!first) {
267			error = SYSCTL_OUT(req, " ", 1);
268			if (error)
269				return error;
270		} else {
271			first = 0;
272		}
273		error = SYSCTL_OUT(req, disk->d_dev->si_name, strlen(disk->d_dev->si_name));
274		if (error)
275			return error;
276	}
277	error = SYSCTL_OUT(req, "", 1);
278	return error;
279}
280
281SYSCTL_PROC(_kern, OID_AUTO, disks, CTLTYPE_STRING | CTLFLAG_RD, 0, 0,
282    sysctl_disks, "A", "names of available disks");
283
284/*
285 * The cdevsw functions
286 */
287
/*
 * Open a slice/partition of a disk.  Serializes open/close on the unit
 * with the DISKFLAG_LOCK/DISKFLAG_WANTED tsleep protocol, calls the
 * driver's d_open on the first open of the unit, fills the in-core
 * label from the geometry the driver reported, then hands the rest to
 * the slice layer.  Returns 0 or an errno.
 */
static int
diskopen(dev_t dev, int oflags, int devtype, struct thread *td)
{
	dev_t pdev;
	struct disk *dp;
	int error;

	error = 0;
	/* All state lives on the whole-disk ("raw") dev_t of the unit. */
	pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);

	dp = pdev->si_disk;
	if (!dp)
		return (ENXIO);

	/* Wait until no other open/close is in progress for this unit. */
	while (dp->d_flags & DISKFLAG_LOCK) {
		dp->d_flags |= DISKFLAG_WANTED;
		error = tsleep(dp, PRIBIO | PCATCH, "diskopen", hz);
		if (error)
			return (error);
	}
	dp->d_flags |= DISKFLAG_LOCK;

	if (!dsisopen(dp->d_slice)) {
		/* First open of the unit: open the driver... */
		if (!pdev->si_iosize_max)
			pdev->si_iosize_max = dev->si_iosize_max;
		error = dp->d_devsw->d_open(pdev, oflags, devtype, td);
		/* ...and build a label from the reported geometry. */
		dp->d_label->d_secsize = dp->d_sectorsize;
		dp->d_label->d_secperunit = dp->d_mediasize / dp->d_sectorsize;
		dp->d_label->d_nsectors = dp->d_fwsectors;
		dp->d_label->d_ntracks = dp->d_fwheads;
	}

	/* Inherit properties from the whole/raw dev_t */
	inherit_raw(pdev, dev);

	if (error)
		goto out;

	error = dsopen(dev, devtype, dp->d_dsflags, &dp->d_slice, dp->d_label);

	/* If nothing ended up open, undo the driver open above. */
	if (!dsisopen(dp->d_slice))
		dp->d_devsw->d_close(pdev, oflags, devtype, td);
out:
	/* Release the unit and wake any queued openers/closers. */
	dp->d_flags &= ~DISKFLAG_LOCK;
	if (dp->d_flags & DISKFLAG_WANTED) {
		dp->d_flags &= ~DISKFLAG_WANTED;
		wakeup(dp);
	}

	return(error);
}
339
340static int
341diskclose(dev_t dev, int fflag, int devtype, struct thread *td)
342{
343	struct disk *dp;
344	int error;
345	dev_t pdev;
346
347	error = 0;
348	pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);
349	dp = pdev->si_disk;
350	if (!dp)
351		return (ENXIO);
352	dsclose(dev, devtype, dp->d_slice);
353	if (!dsisopen(dp->d_slice))
354		error = dp->d_devsw->d_close(dp->d_dev, fflag, devtype, td);
355	return (error);
356}
357
/*
 * Strategy entry point: let the slice layer validate/translate the
 * request, then pass whatever remains down to the driver.
 */
static void
diskstrategy(struct bio *bp)
{
	dev_t pdev;
	struct disk *dp;

	pdev = dkmodpart(dkmodslice(bp->bio_dev, WHOLE_DISK_SLICE), RAW_PART);
	dp = pdev->si_disk;
	bp->bio_resid = bp->bio_bcount;
	/* Make sure this minor carries the raw device's properties. */
	if (dp != bp->bio_dev->si_disk)
		inherit_raw(pdev, bp->bio_dev);

	if (!dp) {
		biofinish(bp, NULL, ENXIO);
		return;
	}

	/*
	 * NOTE(review): dscheck() <= 0 presumably means the request was
	 * rejected or already completed by the slice code — confirm with
	 * dscheck(9); either way it is finished here.
	 */
	if (dscheck(bp, dp->d_slice) <= 0) {
		biodone(bp);
		return;
	}

	/* Zero-length transfers complete trivially. */
	if (bp->bio_bcount == 0) {
		biodone(bp);
		return;
	}

	KASSERT(dp->d_devsw != NULL, ("NULL devsw"));
	KASSERT(dp->d_devsw->d_strategy != NULL, ("NULL d_strategy"));
	dp->d_devsw->d_strategy(bp);
	return;

}
391
/*
 * Ioctl entry point: handle the disk-layer ioctls (kernel dump
 * configuration and the front-stuff query) here, offer everything else
 * to the slice code first and fall back to the driver's own d_ioctl.
 */
static int
diskioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct thread *td)
{
	struct disk *dp;
	int error;
	u_int u;
	dev_t pdev;

	pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);
	dp = pdev->si_disk;
	if (!dp)
		return (ENXIO);
	if (cmd == DIOCSKERNELDUMP) {
		u = *(u_int *)data;
		return (diskdumpconf(u, dev, dp));
	}
	if (cmd == DIOCGFRONTSTUFF) {
		*(off_t *)data = 8192;	/* XXX: crude but enough */
		return (0);
	}
	error = dsioctl(dev, cmd, data, fflag, &dp->d_slice);
	/* ENOIOCTL from the slice code means "not mine"; ask the driver. */
	if (error == ENOIOCTL)
		error = dp->d_devsw->d_ioctl(dev, cmd, data, fflag, td);
	return (error);
}
417
418static int
419diskpsize(dev_t dev)
420{
421	struct disk *dp;
422	dev_t pdev;
423
424	pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);
425	dp = pdev->si_disk;
426	if (!dp)
427		return (-1);
428	if (dp != dev->si_disk) {
429		dev->si_drv1 = pdev->si_drv1;
430		dev->si_drv2 = pdev->si_drv2;
431		/* XXX: don't set bp->b_dev->si_disk (?) */
432	}
433	return (dssize(dev, &dp->d_slice));
434}
435
436SYSCTL_INT(_debug_sizeof, OID_AUTO, disklabel, CTLFLAG_RD,
437    0, sizeof(struct disklabel), "sizeof(struct disklabel)");
438
439SYSCTL_INT(_debug_sizeof, OID_AUTO, diskslices, CTLFLAG_RD,
440    0, sizeof(struct diskslices), "sizeof(struct diskslices)");
441
442SYSCTL_INT(_debug_sizeof, OID_AUTO, disk, CTLFLAG_RD,
443    0, sizeof(struct disk), "sizeof(struct disk)");
444
445#endif /* NO_GEOM */
446
447/*-
448 * Disk error is the preface to plaintive error messages
449 * about failing disk transfers.  It prints messages of the form
450 * 	"hp0g: BLABLABLA cmd=read fsbn 12345 of 12344-12347"
451 * blkdone should be -1 if the position of the error is unknown.
452 * The message is printed with printf.
453 */
454void
455disk_err(struct bio *bp, const char *what, int blkdone, int nl)
456{
457	daddr_t sn;
458
459	printf("%s: %s ", devtoname(bp->bio_dev), what);
460	switch(bp->bio_cmd) {
461	case BIO_READ:		printf("cmd=read "); break;
462	case BIO_WRITE:		printf("cmd=write "); break;
463	case BIO_DELETE:	printf("cmd=delete "); break;
464	case BIO_GETATTR:	printf("cmd=getattr "); break;
465	case BIO_SETATTR:	printf("cmd=setattr "); break;
466	default:		printf("cmd=%x ", bp->bio_cmd); break;
467	}
468	sn = bp->bio_blkno;
469	if (bp->bio_bcount <= DEV_BSIZE) {
470		printf("fsbn %jd%s", (intmax_t)sn, nl ? "\n" : "");
471		return;
472	}
473	if (blkdone >= 0) {
474		sn += blkdone;
475		printf("fsbn %jd of ", (intmax_t)sn);
476	}
477	printf("%jd-%jd", (intmax_t)bp->bio_blkno,
478	    (intmax_t)(bp->bio_blkno + (bp->bio_bcount - 1) / DEV_BSIZE));
479	if (nl)
480		printf("\n");
481}
482
483#ifdef notquite
484/*
485 * Mutex to use when delaying niced I/O bound processes in bioq_disksort().
486 */
487static struct mtx dksort_mtx;
488static void
489dksort_init(void)
490{
491
492	mtx_init(&dksort_mtx, "dksort", NULL, MTX_DEF);
493}
494SYSINIT(dksort, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, dksort_init, NULL)
495#endif
496
497/*
498 * Seek sort for disks.
499 *
500 * The buf_queue keep two queues, sorted in ascending block order.  The first
501 * queue holds those requests which are positioned after the current block
502 * (in the first request); the second, which starts at queue->switch_point,
503 * holds requests which came in after their block number was passed.  Thus
504 * we implement a one way scan, retracting after reaching the end of the drive
505 * to the first request on the second queue, at which time it becomes the
506 * first queue.
507 *
508 * A one-way scan is natural because of the way UNIX read-ahead blocks are
509 * allocated.
510 */
511
void
bioq_disksort(bioq, bp)
	struct bio_queue_head *bioq;
	struct bio *bp;
{
	struct bio *bq;	/* insertion cursor */
	struct bio *bn;	/* lookahead during the insertion sort */
	struct bio *be;	/* logical end of the first (ascending) queue */

#ifdef notquite
	struct thread *td = curthread;

	/* Throttle niced processes by sleeping before queueing their I/O. */
	if (td && td->td_ksegrp->kg_nice > 0) {
		TAILQ_FOREACH(bn, &bioq->queue, bio_queue)
			if (BIOTOBUF(bp)->b_vp != BIOTOBUF(bn)->b_vp)
				break;
		if (bn != NULL) {
			mtx_lock(&dksort_mtx);
			msleep(&dksort_mtx, &dksort_mtx,
			    PPAUSE | PCATCH | PDROP, "ioslow",
			    td->td_ksegrp->kg_nice);
		}
	}
#endif
	/* Crude reentrancy guard; the queue is not otherwise locked here. */
	if (!atomic_cmpset_int(&bioq->busy, 0, 1))
		panic("Recursing in bioq_disksort()");
	be = TAILQ_LAST(&bioq->queue, bio_queue);
	/*
	 * If the queue is empty or we are an
	 * ordered transaction, then it's easy.
	 */
	if ((bq = bioq_first(bioq)) == NULL) {
		bioq_insert_tail(bioq, bp);
		bioq->busy = 0;
		return;
	} else if (bioq->insert_point != NULL) {

		/*
		 * A certain portion of the list is
		 * "locked" to preserve ordering, so
		 * we can only insert after the insert
		 * point.
		 */
		bq = bioq->insert_point;
	} else {

		/*
		 * If we lie before the last removed (currently active)
		 * request, and are not inserting ourselves into the
		 * "locked" portion of the list, then we must add ourselves
		 * to the second request list.
		 */
		if (bp->bio_pblkno < bioq->last_pblkno) {

			bq = bioq->switch_point;
			/*
			 * If we are starting a new secondary list,
			 * then it's easy.
			 */
			if (bq == NULL) {
				bioq->switch_point = bp;
				bioq_insert_tail(bioq, bp);
				bioq->busy = 0;
				return;
			}
			/*
			 * If we lie ahead of the current switch point,
			 * insert us before the switch point and move
			 * the switch point.
			 */
			if (bp->bio_pblkno < bq->bio_pblkno) {
				bioq->switch_point = bp;
				TAILQ_INSERT_BEFORE(bq, bp, bio_queue);
				bioq->busy = 0;
				return;
			}
		} else {
			/* Stay in the first queue; it ends where the
			 * second queue begins, not at the tail. */
			if (bioq->switch_point != NULL)
				be = TAILQ_PREV(bioq->switch_point,
						bio_queue, bio_queue);
			/*
			 * If we lie between last_pblkno and bq,
			 * insert before bq.
			 */
			if (bp->bio_pblkno < bq->bio_pblkno) {
				TAILQ_INSERT_BEFORE(bq, bp, bio_queue);
				bioq->busy = 0;
				return;
			}
		}
	}

	/*
	 * Request is at/after our current position in the list.
	 * Optimize for sequential I/O by seeing if we go at the tail.
	 */
	if (bp->bio_pblkno > be->bio_pblkno) {
		TAILQ_INSERT_AFTER(&bioq->queue, be, bp, bio_queue);
		bioq->busy = 0;
		return;
	}

	/* Otherwise, insertion sort */
	while ((bn = TAILQ_NEXT(bq, bio_queue)) != NULL) {

		/*
		 * We want to go after the current request if it is the end
		 * of the first request list, or if the next request is a
		 * larger cylinder than our request.
		 */
		if (bn == bioq->switch_point
		 || bp->bio_pblkno < bn->bio_pblkno)
			break;
		bq = bn;
	}
	TAILQ_INSERT_AFTER(&bioq->queue, bq, bp, bio_queue);
	bioq->busy = 0;
}
630
631
632