subr_disk.c revision 105667
1/*
2 * ----------------------------------------------------------------------------
3 * "THE BEER-WARE LICENSE" (Revision 42):
4 * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
5 * can do whatever you want with this stuff. If we meet some day, and you think
6 * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
7 * ----------------------------------------------------------------------------
8 *
9 * $FreeBSD: head/sys/kern/subr_disk.c 105667 2002-10-22 00:59:49Z mckusick $
10 *
11 */
12
13#include "opt_geom.h"
14
15#include <sys/param.h>
16#include <sys/systm.h>
17#include <sys/stdint.h>
18#include <sys/bio.h>
19#include <sys/conf.h>
20#include <sys/disk.h>
21#include <sys/diskslice.h>
22#include <sys/disklabel.h>
23#ifdef NO_GEOM
24#include <sys/kernel.h>
25#include <sys/malloc.h>
26#include <sys/sysctl.h>
27#include <machine/md_var.h>
28#include <sys/ctype.h>
29
30static MALLOC_DEFINE(M_DISK, "disk", "disk data");
31
32static d_strategy_t diskstrategy;
33static d_open_t diskopen;
34static d_close_t diskclose;
35static d_ioctl_t diskioctl;
36static d_psize_t diskpsize;
37
38static LIST_HEAD(, disk) disklist = LIST_HEAD_INITIALIZER(&disklist);
39
40void disk_dev_synth(dev_t dev);
41
42void
43disk_dev_synth(dev_t dev)
44{
45	struct disk *dp;
46	int u, s, p;
47	dev_t pdev;
48
49	if (dksparebits(dev))
50		return;
51	LIST_FOREACH(dp, &disklist, d_list) {
52		if (major(dev) != dp->d_devsw->d_maj)
53			continue;
54		u = dkunit(dev);
55		p = RAW_PART;
56		s = WHOLE_DISK_SLICE;
57		pdev = makedev(dp->d_devsw->d_maj, dkmakeminor(u, s, p));
58		if (pdev->si_devsw == NULL)
59			return;		/* Probably a unit we don't have */
60		s = dkslice(dev);
61		p = dkpart(dev);
62		if (s == WHOLE_DISK_SLICE && p == RAW_PART) {
63			/* XXX: actually should not happen */
64			dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
65			    UID_ROOT, GID_OPERATOR, 0640, "%s%d",
66				dp->d_devsw->d_name, u);
67			dev_depends(pdev, dev);
68			return;
69		}
70		if (s == COMPATIBILITY_SLICE) {
71			dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
72			    UID_ROOT, GID_OPERATOR, 0640, "%s%d%c",
73				dp->d_devsw->d_name, u, 'a' + p);
74			dev_depends(pdev, dev);
75			return;
76		}
77		if (p != RAW_PART) {
78			dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
79			    UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d%c",
80				dp->d_devsw->d_name, u, s - BASE_SLICE + 1,
81				'a' + p);
82		} else {
83			dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
84			    UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d",
85				dp->d_devsw->d_name, u, s - BASE_SLICE + 1);
86			make_dev_alias(dev, "%s%ds%dc",
87			    dp->d_devsw->d_name, u, s - BASE_SLICE + 1);
88		}
89		dev_depends(pdev, dev);
90		return;
91	}
92}
93
94static void
95disk_clone(void *arg, char *name, int namelen, dev_t *dev)
96{
97	struct disk *dp;
98	char const *d;
99	char *e;
100	int j, u, s, p;
101	dev_t pdev;
102
103	if (*dev != NODEV)
104		return;
105
106	LIST_FOREACH(dp, &disklist, d_list) {
107		d = dp->d_devsw->d_name;
108		j = dev_stdclone(name, &e, d, &u);
109		if (j == 0)
110			continue;
111		if (u > DKMAXUNIT)
112			continue;
113		p = RAW_PART;
114		s = WHOLE_DISK_SLICE;
115		pdev = makedev(dp->d_devsw->d_maj, dkmakeminor(u, s, p));
116		if (pdev->si_disk == NULL)
117			continue;
118		if (*e != '\0') {
119			j = dev_stdclone(e, &e, "s", &s);
120			if (j == 0)
121				s = COMPATIBILITY_SLICE;
122			else if (j == 1 || j == 2)
123				s += BASE_SLICE - 1;
124			if (!*e)
125				;		/* ad0s1 case */
126			else if (e[1] != '\0')
127				return;		/* can never be a disk name */
128			else if (*e < 'a' || *e > 'h')
129				return;		/* can never be a disk name */
130			else
131				p = *e - 'a';
132		}
133		if (s == WHOLE_DISK_SLICE && p == RAW_PART) {
134			return;
135		} else if (s >= BASE_SLICE && p != RAW_PART) {
136			*dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
137			    UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d%c",
138			    pdev->si_devsw->d_name, u, s - BASE_SLICE + 1,
139			    p + 'a');
140		} else if (s >= BASE_SLICE) {
141			*dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
142			    UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d",
143			    pdev->si_devsw->d_name, u, s - BASE_SLICE + 1);
144			make_dev_alias(*dev, "%s%ds%dc",
145			    pdev->si_devsw->d_name, u, s - BASE_SLICE + 1);
146		} else {
147			*dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
148			    UID_ROOT, GID_OPERATOR, 0640, "%s%d%c",
149			    pdev->si_devsw->d_name, u, p + 'a');
150		}
151		dev_depends(pdev, *dev);
152		return;
153	}
154}
155
156static void
157inherit_raw(dev_t pdev, dev_t dev)
158{
159	dev->si_disk = pdev->si_disk;
160	dev->si_drv1 = pdev->si_drv1;
161	dev->si_drv2 = pdev->si_drv2;
162	dev->si_iosize_max = pdev->si_iosize_max;
163	dev->si_bsize_phys = pdev->si_bsize_phys;
164	dev->si_bsize_best = pdev->si_bsize_best;
165}
166
167dev_t
168disk_create(int unit, struct disk *dp, int flags, struct cdevsw *cdevsw, struct cdevsw *proto)
169{
170	static int once;
171	dev_t dev;
172
173	if (!once) {
174		EVENTHANDLER_REGISTER(dev_clone, disk_clone, 0, 1000);
175		once++;
176	}
177
178	bzero(dp, sizeof(*dp));
179	dp->d_label = malloc(sizeof *dp->d_label, M_DEVBUF, M_WAITOK|M_ZERO);
180
181	if (proto->d_open != diskopen) {
182		*proto = *cdevsw;
183		proto->d_open = diskopen;
184		proto->d_close = diskclose;
185		proto->d_ioctl = diskioctl;
186		proto->d_strategy = diskstrategy;
187		proto->d_psize = diskpsize;
188	}
189
190	if (bootverbose)
191		printf("Creating DISK %s%d\n", cdevsw->d_name, unit);
192	dev = make_dev(proto, dkmakeminor(unit, WHOLE_DISK_SLICE, RAW_PART),
193	    UID_ROOT, GID_OPERATOR, 0640, "%s%d", cdevsw->d_name, unit);
194
195	dev->si_disk = dp;
196	dp->d_dev = dev;
197	dp->d_dsflags = flags;
198	dp->d_devsw = cdevsw;
199	LIST_INSERT_HEAD(&disklist, dp, d_list);
200
201	return (dev);
202}
203
204static int
205diskdumpconf(u_int onoff, dev_t dev, struct disk *dp)
206{
207	struct dumperinfo di;
208	struct disklabel *dl;
209
210	if (!onoff)
211		return(set_dumper(NULL));
212	dl = dsgetlabel(dev, dp->d_slice);
213	if (!dl)
214		return (ENXIO);
215	bzero(&di, sizeof di);
216	di.dumper = (dumper_t *)dp->d_devsw->d_dump;
217	di.priv = dp->d_dev;
218	di.blocksize = dl->d_secsize;
219	di.mediaoffset = (off_t)(dl->d_partitions[dkpart(dev)].p_offset +
220	    dp->d_slice->dss_slices[dkslice(dev)].ds_offset) * DEV_BSIZE;
221	di.mediasize =
222	    (off_t)(dl->d_partitions[dkpart(dev)].p_size) * DEV_BSIZE;
223	if (di.mediasize == 0)
224		return (EINVAL);
225	return(set_dumper(&di));
226}
227
228void
229disk_invalidate (struct disk *disk)
230{
231	if (disk->d_slice)
232		dsgone(&disk->d_slice);
233}
234
235void
236disk_destroy(dev_t dev)
237{
238	LIST_REMOVE(dev->si_disk, d_list);
239	free(dev->si_disk->d_label, M_DEVBUF);
240	bzero(dev->si_disk, sizeof(*dev->si_disk));
241    	dev->si_disk = NULL;
242	destroy_dev(dev);
243	return;
244}
245
246struct disk *
247disk_enumerate(struct disk *disk)
248{
249	if (!disk)
250		return (LIST_FIRST(&disklist));
251	else
252		return (LIST_NEXT(disk, d_list));
253}
254
255static int
256sysctl_disks(SYSCTL_HANDLER_ARGS)
257{
258	struct disk *disk;
259	int error, first;
260
261	disk = NULL;
262	first = 1;
263
264	while ((disk = disk_enumerate(disk))) {
265		if (!first) {
266			error = SYSCTL_OUT(req, " ", 1);
267			if (error)
268				return error;
269		} else {
270			first = 0;
271		}
272		error = SYSCTL_OUT(req, disk->d_dev->si_name, strlen(disk->d_dev->si_name));
273		if (error)
274			return error;
275	}
276	error = SYSCTL_OUT(req, "", 1);
277	return error;
278}
279
280SYSCTL_PROC(_kern, OID_AUTO, disks, CTLTYPE_STRING | CTLFLAG_RD, 0, 0,
281    sysctl_disks, "A", "names of available disks");
282
283/*
284 * The cdevsw functions
285 */
286
287static int
288diskopen(dev_t dev, int oflags, int devtype, struct thread *td)
289{
290	dev_t pdev;
291	struct disk *dp;
292	int error;
293
294	error = 0;
295	pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);
296
297	dp = pdev->si_disk;
298	if (!dp)
299		return (ENXIO);
300
301	while (dp->d_flags & DISKFLAG_LOCK) {
302		dp->d_flags |= DISKFLAG_WANTED;
303		error = tsleep(dp, PRIBIO | PCATCH, "diskopen", hz);
304		if (error)
305			return (error);
306	}
307	dp->d_flags |= DISKFLAG_LOCK;
308
309	if (!dsisopen(dp->d_slice)) {
310		if (!pdev->si_iosize_max)
311			pdev->si_iosize_max = dev->si_iosize_max;
312		error = dp->d_devsw->d_open(pdev, oflags, devtype, td);
313		dp->d_label->d_secsize = dp->d_sectorsize;
314		dp->d_label->d_secperunit = dp->d_mediasize / dp->d_sectorsize;
315		dp->d_label->d_nsectors = dp->d_fwsectors;
316		dp->d_label->d_ntracks = dp->d_fwheads;
317	}
318
319	/* Inherit properties from the whole/raw dev_t */
320	inherit_raw(pdev, dev);
321
322	if (error)
323		goto out;
324
325	error = dsopen(dev, devtype, dp->d_dsflags, &dp->d_slice, dp->d_label);
326
327	if (!dsisopen(dp->d_slice))
328		dp->d_devsw->d_close(pdev, oflags, devtype, td);
329out:
330	dp->d_flags &= ~DISKFLAG_LOCK;
331	if (dp->d_flags & DISKFLAG_WANTED) {
332		dp->d_flags &= ~DISKFLAG_WANTED;
333		wakeup(dp);
334	}
335
336	return(error);
337}
338
339static int
340diskclose(dev_t dev, int fflag, int devtype, struct thread *td)
341{
342	struct disk *dp;
343	int error;
344	dev_t pdev;
345
346	error = 0;
347	pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);
348	dp = pdev->si_disk;
349	if (!dp)
350		return (ENXIO);
351	dsclose(dev, devtype, dp->d_slice);
352	if (!dsisopen(dp->d_slice))
353		error = dp->d_devsw->d_close(dp->d_dev, fflag, devtype, td);
354	return (error);
355}
356
357static void
358diskstrategy(struct bio *bp)
359{
360	dev_t pdev;
361	struct disk *dp;
362
363	pdev = dkmodpart(dkmodslice(bp->bio_dev, WHOLE_DISK_SLICE), RAW_PART);
364	dp = pdev->si_disk;
365	bp->bio_resid = bp->bio_bcount;
366	if (dp != bp->bio_dev->si_disk)
367		inherit_raw(pdev, bp->bio_dev);
368
369	if (!dp) {
370		biofinish(bp, NULL, ENXIO);
371		return;
372	}
373
374	if (dscheck(bp, dp->d_slice) <= 0) {
375		biodone(bp);
376		return;
377	}
378
379	if (bp->bio_bcount == 0) {
380		biodone(bp);
381		return;
382	}
383
384	KASSERT(dp->d_devsw != NULL, ("NULL devsw"));
385	KASSERT(dp->d_devsw->d_strategy != NULL, ("NULL d_strategy"));
386	dp->d_devsw->d_strategy(bp);
387	return;
388
389}
390
391static int
392diskioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct thread *td)
393{
394	struct disk *dp;
395	int error;
396	u_int u;
397	dev_t pdev;
398
399	pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);
400	dp = pdev->si_disk;
401	if (!dp)
402		return (ENXIO);
403	if (cmd == DIOCSKERNELDUMP) {
404		u = *(u_int *)data;
405		return (diskdumpconf(u, dev, dp));
406	}
407	if (cmd == DIOCGFRONTSTUFF) {
408		*(off_t *)data = 8192;	/* XXX: crude but enough) */
409		return (0);
410	}
411	error = dsioctl(dev, cmd, data, fflag, &dp->d_slice);
412	if (error == ENOIOCTL)
413		error = dp->d_devsw->d_ioctl(dev, cmd, data, fflag, td);
414	return (error);
415}
416
417static int
418diskpsize(dev_t dev)
419{
420	struct disk *dp;
421	dev_t pdev;
422
423	pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);
424	dp = pdev->si_disk;
425	if (!dp)
426		return (-1);
427	if (dp != dev->si_disk) {
428		dev->si_drv1 = pdev->si_drv1;
429		dev->si_drv2 = pdev->si_drv2;
430		/* XXX: don't set bp->b_dev->si_disk (?) */
431	}
432	return (dssize(dev, &dp->d_slice));
433}
434
435SYSCTL_INT(_debug_sizeof, OID_AUTO, disklabel, CTLFLAG_RD,
436    0, sizeof(struct disklabel), "sizeof(struct disklabel)");
437
438SYSCTL_INT(_debug_sizeof, OID_AUTO, diskslices, CTLFLAG_RD,
439    0, sizeof(struct diskslices), "sizeof(struct diskslices)");
440
441SYSCTL_INT(_debug_sizeof, OID_AUTO, disk, CTLFLAG_RD,
442    0, sizeof(struct disk), "sizeof(struct disk)");
443
444#endif /* NO_GEOM */
445
446/*-
447 * Disk error is the preface to plaintive error messages
448 * about failing disk transfers.  It prints messages of the form
449 * 	"hp0g: BLABLABLA cmd=read fsbn 12345 of 12344-12347"
450 * blkdone should be -1 if the position of the error is unknown.
451 * The message is printed with printf.
452 */
453void
454disk_err(struct bio *bp, const char *what, int blkdone, int nl)
455{
456	daddr_t sn;
457
458	printf("%s: %s ", devtoname(bp->bio_dev), what);
459	switch(bp->bio_cmd) {
460	case BIO_READ:		printf("cmd=read "); break;
461	case BIO_WRITE:		printf("cmd=write "); break;
462	case BIO_DELETE:	printf("cmd=delete "); break;
463	case BIO_GETATTR:	printf("cmd=getattr "); break;
464	case BIO_SETATTR:	printf("cmd=setattr "); break;
465	default:		printf("cmd=%x ", bp->bio_cmd); break;
466	}
467	sn = bp->bio_blkno;
468	if (bp->bio_bcount <= DEV_BSIZE) {
469		printf("fsbn %jd%s", (intmax_t)sn, nl ? "\n" : "");
470		return;
471	}
472	if (blkdone >= 0) {
473		sn += blkdone;
474		printf("fsbn %jd of ", (intmax_t)sn);
475	}
476	printf("%jd-%jd", (intmax_t)bp->bio_blkno,
477	    (intmax_t)(bp->bio_blkno + (bp->bio_bcount - 1) / DEV_BSIZE));
478	if (nl)
479		printf("\n");
480}
481
482/*
483 * Seek sort for disks.
484 *
485 * The buf_queue keep two queues, sorted in ascending block order.  The first
486 * queue holds those requests which are positioned after the current block
487 * (in the first request); the second, which starts at queue->switch_point,
488 * holds requests which came in after their block number was passed.  Thus
489 * we implement a one way scan, retracting after reaching the end of the drive
490 * to the first request on the second queue, at which time it becomes the
491 * first queue.
492 *
493 * A one-way scan is natural because of the way UNIX read-ahead blocks are
494 * allocated.
495 */
496
497void
498bioq_disksort(bioq, bp)
499	struct bio_queue_head *bioq;
500	struct bio *bp;
501{
502	struct bio *bq;
503	struct bio *bn;
504	struct bio *be;
505
506	if (!atomic_cmpset_int(&bioq->busy, 0, 1))
507		panic("Recursing in bioq_disksort()");
508	be = TAILQ_LAST(&bioq->queue, bio_queue);
509	/*
510	 * If the queue is empty or we are an
511	 * ordered transaction, then it's easy.
512	 */
513	if ((bq = bioq_first(bioq)) == NULL) {
514		bioq_insert_tail(bioq, bp);
515		bioq->busy = 0;
516		return;
517	} else if (bioq->insert_point != NULL) {
518
519		/*
520		 * A certain portion of the list is
521		 * "locked" to preserve ordering, so
522		 * we can only insert after the insert
523		 * point.
524		 */
525		bq = bioq->insert_point;
526	} else {
527
528		/*
529		 * If we lie before the last removed (currently active)
530		 * request, and are not inserting ourselves into the
531		 * "locked" portion of the list, then we must add ourselves
532		 * to the second request list.
533		 */
534		if (bp->bio_pblkno < bioq->last_pblkno) {
535
536			bq = bioq->switch_point;
537			/*
538			 * If we are starting a new secondary list,
539			 * then it's easy.
540			 */
541			if (bq == NULL) {
542				bioq->switch_point = bp;
543				bioq_insert_tail(bioq, bp);
544				bioq->busy = 0;
545				return;
546			}
547			/*
548			 * If we lie ahead of the current switch point,
549			 * insert us before the switch point and move
550			 * the switch point.
551			 */
552			if (bp->bio_pblkno < bq->bio_pblkno) {
553				bioq->switch_point = bp;
554				TAILQ_INSERT_BEFORE(bq, bp, bio_queue);
555				bioq->busy = 0;
556				return;
557			}
558		} else {
559			if (bioq->switch_point != NULL)
560				be = TAILQ_PREV(bioq->switch_point,
561						bio_queue, bio_queue);
562			/*
563			 * If we lie between last_pblkno and bq,
564			 * insert before bq.
565			 */
566			if (bp->bio_pblkno < bq->bio_pblkno) {
567				TAILQ_INSERT_BEFORE(bq, bp, bio_queue);
568				bioq->busy = 0;
569				return;
570			}
571		}
572	}
573
574	/*
575	 * Request is at/after our current position in the list.
576	 * Optimize for sequential I/O by seeing if we go at the tail.
577	 */
578	if (bp->bio_pblkno > be->bio_pblkno) {
579		TAILQ_INSERT_AFTER(&bioq->queue, be, bp, bio_queue);
580		bioq->busy = 0;
581		return;
582	}
583
584	/* Otherwise, insertion sort */
585	while ((bn = TAILQ_NEXT(bq, bio_queue)) != NULL) {
586
587		/*
588		 * We want to go after the current request if it is the end
589		 * of the first request list, or if the next request is a
590		 * larger cylinder than our request.
591		 */
592		if (bn == bioq->switch_point
593		 || bp->bio_pblkno < bn->bio_pblkno)
594			break;
595		bq = bn;
596	}
597	TAILQ_INSERT_AFTER(&bioq->queue, bq, bp, bio_queue);
598	bioq->busy = 0;
599}
600
601
602