subr_disk.c revision 103714
/*
 * ----------------------------------------------------------------------------
 * "THE BEER-WARE LICENSE" (Revision 42):
 * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
 * can do whatever you want with this stuff. If we meet some day, and you think
 * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
 * ----------------------------------------------------------------------------
 *
 * $FreeBSD: head/sys/kern/subr_disk.c 103714 2002-09-20 19:36:05Z phk $
 *
 */
1250565Sphk
1392074Sphk#include "opt_geom.h"
1492074Sphk
1550565Sphk#include <sys/param.h>
1650565Sphk#include <sys/systm.h>
17103675Sphk#include <sys/stdint.h>
1860041Sphk#include <sys/bio.h>
1950565Sphk#include <sys/conf.h>
2050565Sphk#include <sys/disk.h>
21103714Sphk#include <sys/diskslice.h>
22103714Sphk#include <sys/disklabel.h>
23103675Sphk#ifndef GEOM
24103675Sphk#include <sys/kernel.h>
25103675Sphk#include <sys/sysctl.h>
2650565Sphk#include <sys/malloc.h>
2761953Snbm#include <sys/sysctl.h>
2850728Sphk#include <machine/md_var.h>
2964880Sphk#include <sys/ctype.h>
3064880Sphk
3169774Sphkstatic MALLOC_DEFINE(M_DISK, "disk", "disk data");
3250565Sphk
3350565Sphkstatic d_strategy_t diskstrategy;
3450565Sphkstatic d_open_t diskopen;
3550565Sphkstatic d_close_t diskclose;
3650565Sphkstatic d_ioctl_t diskioctl;
3750565Sphkstatic d_psize_t diskpsize;
3861717Sphk
3961717Sphkstatic LIST_HEAD(, disk) disklist = LIST_HEAD_INITIALIZER(&disklist);
4064880Sphk
4185603Sphkvoid disk_dev_synth(dev_t dev);
4285603Sphk
4385603Sphkvoid
4485603Sphkdisk_dev_synth(dev_t dev)
4585603Sphk{
4685603Sphk	struct disk *dp;
4785603Sphk	int u, s, p;
4885603Sphk	dev_t pdev;
4985603Sphk
5086012Sphk	if (dksparebits(dev))
5185996Sphk		return;
5285603Sphk	LIST_FOREACH(dp, &disklist, d_list) {
5385603Sphk		if (major(dev) != dp->d_devsw->d_maj)
5485603Sphk			continue;
5585603Sphk		u = dkunit(dev);
5685603Sphk		p = RAW_PART;
5785603Sphk		s = WHOLE_DISK_SLICE;
5885603Sphk		pdev = makedev(dp->d_devsw->d_maj, dkmakeminor(u, s, p));
5985624Sphk		if (pdev->si_devsw == NULL)
6085624Sphk			return;		/* Probably a unit we don't have */
6185603Sphk		s = dkslice(dev);
6285603Sphk		p = dkpart(dev);
6385603Sphk		if (s == WHOLE_DISK_SLICE && p == RAW_PART) {
6485603Sphk			/* XXX: actually should not happen */
6585603Sphk			dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
6685603Sphk			    UID_ROOT, GID_OPERATOR, 0640, "%s%d",
6785603Sphk				dp->d_devsw->d_name, u);
6885603Sphk			dev_depends(pdev, dev);
6985603Sphk			return;
7085603Sphk		}
7185603Sphk		if (s == COMPATIBILITY_SLICE) {
7285603Sphk			dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
7385603Sphk			    UID_ROOT, GID_OPERATOR, 0640, "%s%d%c",
7485603Sphk				dp->d_devsw->d_name, u, 'a' + p);
7585603Sphk			dev_depends(pdev, dev);
7685603Sphk			return;
7785603Sphk		}
7885858Sphk		if (p != RAW_PART) {
7985858Sphk			dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
8085858Sphk			    UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d%c",
8185858Sphk				dp->d_devsw->d_name, u, s - BASE_SLICE + 1,
8285858Sphk				'a' + p);
8385858Sphk		} else {
8485858Sphk			dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
8585858Sphk			    UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d",
8685858Sphk				dp->d_devsw->d_name, u, s - BASE_SLICE + 1);
8785858Sphk			make_dev_alias(dev, "%s%ds%dc",
8885858Sphk			    dp->d_devsw->d_name, u, s - BASE_SLICE + 1);
8985858Sphk		}
9085603Sphk		dev_depends(pdev, dev);
9185603Sphk		return;
9285603Sphk	}
9385603Sphk}
9485603Sphk
9562617Simpstatic void
9664880Sphkdisk_clone(void *arg, char *name, int namelen, dev_t *dev)
9764880Sphk{
9864880Sphk	struct disk *dp;
9964880Sphk	char const *d;
10092074Sphk	char *e;
10192074Sphk	int j, u, s, p;
10264880Sphk	dev_t pdev;
10364880Sphk
10464880Sphk	if (*dev != NODEV)
10564880Sphk		return;
10664880Sphk
10764880Sphk	LIST_FOREACH(dp, &disklist, d_list) {
10864880Sphk		d = dp->d_devsw->d_name;
10992074Sphk		j = dev_stdclone(name, &e, d, &u);
11092074Sphk		if (j == 0)
11164880Sphk			continue;
11270058Sphk		if (u > DKMAXUNIT)
11370058Sphk			continue;
11464880Sphk		p = RAW_PART;
11564880Sphk		s = WHOLE_DISK_SLICE;
11664880Sphk		pdev = makedev(dp->d_devsw->d_maj, dkmakeminor(u, s, p));
11764880Sphk		if (pdev->si_disk == NULL)
11864880Sphk			continue;
11992074Sphk		if (*e != '\0') {
12092074Sphk			j = dev_stdclone(e, &e, "s", &s);
12192074Sphk			if (j == 0)
12292074Sphk				s = COMPATIBILITY_SLICE;
12392074Sphk			else if (j == 1 || j == 2)
12464880Sphk				s += BASE_SLICE - 1;
12592074Sphk			if (!*e)
12692074Sphk				;		/* ad0s1 case */
12792074Sphk			else if (e[1] != '\0')
12892074Sphk				return;		/* can never be a disk name */
12992074Sphk			else if (*e < 'a' || *e > 'h')
13092074Sphk				return;		/* can never be a disk name */
13164880Sphk			else
13292074Sphk				p = *e - 'a';
13364880Sphk		}
13492074Sphk		if (s == WHOLE_DISK_SLICE && p == RAW_PART) {
13592074Sphk			return;
13692074Sphk		} else if (s >= BASE_SLICE && p != RAW_PART) {
13785603Sphk			*dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
13885603Sphk			    UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d%c",
13985858Sphk			    pdev->si_devsw->d_name, u, s - BASE_SLICE + 1,
14085858Sphk			    p + 'a');
14185858Sphk		} else if (s >= BASE_SLICE) {
14285603Sphk			*dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
14385858Sphk			    UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d",
14485858Sphk			    pdev->si_devsw->d_name, u, s - BASE_SLICE + 1);
14585858Sphk			make_dev_alias(*dev, "%s%ds%dc",
14685858Sphk			    pdev->si_devsw->d_name, u, s - BASE_SLICE + 1);
14785858Sphk		} else {
14885858Sphk			*dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
14992074Sphk			    UID_ROOT, GID_OPERATOR, 0640, "%s%d%c",
15092074Sphk			    pdev->si_devsw->d_name, u, p + 'a');
15185858Sphk		}
15277215Sphk		dev_depends(pdev, *dev);
15364880Sphk		return;
15464880Sphk	}
15564880Sphk}
15664880Sphk
15764880Sphkstatic void
15862617Simpinherit_raw(dev_t pdev, dev_t dev)
15962617Simp{
16062617Simp	dev->si_disk = pdev->si_disk;
16162617Simp	dev->si_drv1 = pdev->si_drv1;
16262617Simp	dev->si_drv2 = pdev->si_drv2;
16362617Simp	dev->si_iosize_max = pdev->si_iosize_max;
16462617Simp	dev->si_bsize_phys = pdev->si_bsize_phys;
16562617Simp	dev->si_bsize_best = pdev->si_bsize_best;
16662617Simp}
16762617Simp
16850565Sphkdev_t
16951215Sphkdisk_create(int unit, struct disk *dp, int flags, struct cdevsw *cdevsw, struct cdevsw *proto)
17050565Sphk{
17164880Sphk	static int once;
17277408Sphk	dev_t dev;
17350565Sphk
17477215Sphk	if (!once) {
17577215Sphk		EVENTHANDLER_REGISTER(dev_clone, disk_clone, 0, 1000);
17677215Sphk		once++;
17777215Sphk	}
17877215Sphk
17951198Sphk	bzero(dp, sizeof(*dp));
180103714Sphk	dp->d_label = malloc(sizeof *dp->d_label, M_DEVBUF, M_WAITOK|M_ZERO);
18151198Sphk
18277147Sphk	if (proto->d_open != diskopen) {
18351215Sphk		*proto = *cdevsw;
18451215Sphk		proto->d_open = diskopen;
18551215Sphk		proto->d_close = diskclose;
18651215Sphk		proto->d_ioctl = diskioctl;
18751215Sphk		proto->d_strategy = diskstrategy;
18851215Sphk		proto->d_psize = diskpsize;
18950565Sphk	}
19050565Sphk
19153437Sjkh	if (bootverbose)
19253437Sjkh		printf("Creating DISK %s%d\n", cdevsw->d_name, unit);
19351243Sphk	dev = make_dev(proto, dkmakeminor(unit, WHOLE_DISK_SLICE, RAW_PART),
19464880Sphk	    UID_ROOT, GID_OPERATOR, 0640, "%s%d", cdevsw->d_name, unit);
19550565Sphk
19650565Sphk	dev->si_disk = dp;
19750565Sphk	dp->d_dev = dev;
19852917Sphk	dp->d_dsflags = flags;
19951215Sphk	dp->d_devsw = cdevsw;
20061717Sphk	LIST_INSERT_HEAD(&disklist, dp, d_list);
20177215Sphk
20250565Sphk	return (dev);
20350565Sphk}
20450565Sphk
20593496Sphkstatic int
20693496Sphkdiskdumpconf(u_int onoff, dev_t dev, struct disk *dp)
20750728Sphk{
20893496Sphk	struct dumperinfo di;
20950728Sphk	struct disklabel *dl;
21050728Sphk
21193496Sphk	if (!onoff)
21293496Sphk		return(set_dumper(NULL));
21350728Sphk	dl = dsgetlabel(dev, dp->d_slice);
21450728Sphk	if (!dl)
21550728Sphk		return (ENXIO);
21693496Sphk	bzero(&di, sizeof di);
21793496Sphk	di.dumper = (dumper_t *)dp->d_devsw->d_dump;
21893496Sphk	di.priv = dp->d_dev;
21993496Sphk	di.blocksize = dl->d_secsize;
22093496Sphk	di.mediaoffset = (off_t)(dl->d_partitions[dkpart(dev)].p_offset +
22193496Sphk	    dp->d_slice->dss_slices[dkslice(dev)].ds_offset) * DEV_BSIZE;
22293496Sphk	di.mediasize =
22393496Sphk	    (off_t)(dl->d_partitions[dkpart(dev)].p_size) * DEV_BSIZE;
22493496Sphk	return(set_dumper(&di));
22550728Sphk}
22650728Sphk
22750728Sphkvoid
22850728Sphkdisk_invalidate (struct disk *disk)
22950728Sphk{
23057325Ssos	if (disk->d_slice)
23157325Ssos		dsgone(&disk->d_slice);
23250728Sphk}
23350728Sphk
23450565Sphkvoid
23556767Sphkdisk_destroy(dev_t dev)
23650565Sphk{
23761717Sphk	LIST_REMOVE(dev->si_disk, d_list);
238103714Sphk	free(dev->si_disk->d_label, M_DEVBUF);
23961717Sphk	bzero(dev->si_disk, sizeof(*dev->si_disk));
24057325Ssos    	dev->si_disk = NULL;
24157325Ssos	destroy_dev(dev);
24250565Sphk	return;
24350565Sphk}
24450565Sphk
24561717Sphkstruct disk *
24661717Sphkdisk_enumerate(struct disk *disk)
24761717Sphk{
24861717Sphk	if (!disk)
24961717Sphk		return (LIST_FIRST(&disklist));
25061717Sphk	else
25161717Sphk		return (LIST_NEXT(disk, d_list));
25261717Sphk}
25361717Sphk
25461953Snbmstatic int
25562573Sphksysctl_disks(SYSCTL_HANDLER_ARGS)
25661953Snbm{
25761953Snbm	struct disk *disk;
25861953Snbm	int error, first;
25961953Snbm
26061953Snbm	disk = NULL;
26161953Snbm	first = 1;
26261953Snbm
26361953Snbm	while ((disk = disk_enumerate(disk))) {
26461953Snbm		if (!first) {
26561953Snbm			error = SYSCTL_OUT(req, " ", 1);
26661953Snbm			if (error)
26761953Snbm				return error;
26861953Snbm		} else {
26961953Snbm			first = 0;
27061953Snbm		}
27161953Snbm		error = SYSCTL_OUT(req, disk->d_dev->si_name, strlen(disk->d_dev->si_name));
27261953Snbm		if (error)
27361953Snbm			return error;
27461953Snbm	}
27561953Snbm	error = SYSCTL_OUT(req, "", 1);
27661953Snbm	return error;
27761953Snbm}
27861953Snbm
279102241SarchieSYSCTL_PROC(_kern, OID_AUTO, disks, CTLTYPE_STRING | CTLFLAG_RD, 0, 0,
28061953Snbm    sysctl_disks, "A", "names of available disks");
28161953Snbm
/*
 * The cdevsw functions: the entry points that disk_create() installs into
 * the prototype cdevsw.
 */
28550728Sphk
28650565Sphkstatic int
28783366Sjuliandiskopen(dev_t dev, int oflags, int devtype, struct thread *td)
28850565Sphk{
28950565Sphk	dev_t pdev;
29050565Sphk	struct disk *dp;
29150565Sphk	int error;
29250565Sphk
29350728Sphk	error = 0;
29450565Sphk	pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);
29550728Sphk
29650565Sphk	dp = pdev->si_disk;
29750565Sphk	if (!dp)
29850565Sphk		return (ENXIO);
29950728Sphk
30052917Sphk	while (dp->d_flags & DISKFLAG_LOCK) {
30152917Sphk		dp->d_flags |= DISKFLAG_WANTED;
30254815Sphk		error = tsleep(dp, PRIBIO | PCATCH, "diskopen", hz);
30354815Sphk		if (error)
30454815Sphk			return (error);
30552917Sphk	}
30652917Sphk	dp->d_flags |= DISKFLAG_LOCK;
30752917Sphk
30851860Sphk	if (!dsisopen(dp->d_slice)) {
30951878Ssos		if (!pdev->si_iosize_max)
31051878Ssos			pdev->si_iosize_max = dev->si_iosize_max;
31183366Sjulian		error = dp->d_devsw->d_open(pdev, oflags, devtype, td);
312103714Sphk		dp->d_label->d_secsize = dp->d_sectorsize;
313103714Sphk		dp->d_label->d_secperunit = dp->d_mediasize / dp->d_sectorsize;
31451860Sphk	}
31551826Sphk
31651826Sphk	/* Inherit properties from the whole/raw dev_t */
31762617Simp	inherit_raw(pdev, dev);
31850728Sphk
31950728Sphk	if (error)
32052917Sphk		goto out;
32150728Sphk
322103714Sphk	error = dsopen(dev, devtype, dp->d_dsflags, &dp->d_slice, dp->d_label);
32350565Sphk
32450728Sphk	if (!dsisopen(dp->d_slice))
32583366Sjulian		dp->d_devsw->d_close(pdev, oflags, devtype, td);
32652917Sphkout:
32752917Sphk	dp->d_flags &= ~DISKFLAG_LOCK;
32852917Sphk	if (dp->d_flags & DISKFLAG_WANTED) {
32952917Sphk		dp->d_flags &= ~DISKFLAG_WANTED;
33052917Sphk		wakeup(dp);
33152917Sphk	}
33250728Sphk
33350565Sphk	return(error);
33450565Sphk}
33550565Sphk
33650565Sphkstatic int
33783366Sjuliandiskclose(dev_t dev, int fflag, int devtype, struct thread *td)
33850565Sphk{
33950565Sphk	struct disk *dp;
34050565Sphk	int error;
34162617Simp	dev_t pdev;
34250565Sphk
34350565Sphk	error = 0;
34462617Simp	pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);
34562617Simp	dp = pdev->si_disk;
34674206Ssos	if (!dp)
34774206Ssos		return (ENXIO);
34851822Sphk	dsclose(dev, devtype, dp->d_slice);
34974206Ssos	if (!dsisopen(dp->d_slice))
35083366Sjulian		error = dp->d_devsw->d_close(dp->d_dev, fflag, devtype, td);
35150565Sphk	return (error);
35250565Sphk}
35350565Sphk
35450565Sphkstatic void
35559249Sphkdiskstrategy(struct bio *bp)
35650565Sphk{
35750565Sphk	dev_t pdev;
35850565Sphk	struct disk *dp;
35950565Sphk
36062617Simp	pdev = dkmodpart(dkmodslice(bp->bio_dev, WHOLE_DISK_SLICE), RAW_PART);
36162617Simp	dp = pdev->si_disk;
36276361Sphk	bp->bio_resid = bp->bio_bcount;
36362617Simp	if (dp != bp->bio_dev->si_disk)
36462617Simp		inherit_raw(pdev, bp->bio_dev);
36550565Sphk
36650565Sphk	if (!dp) {
36776322Sphk		biofinish(bp, NULL, ENXIO);
36850565Sphk		return;
36950565Sphk	}
37050565Sphk
37155763Sphk	if (dscheck(bp, dp->d_slice) <= 0) {
37250565Sphk		biodone(bp);
37350565Sphk		return;
37450565Sphk	}
37550565Sphk
37676324Sphk	if (bp->bio_bcount == 0) {
37776324Sphk		biodone(bp);
37876324Sphk		return;
37976324Sphk	}
38076324Sphk
38159623Sphk	KASSERT(dp->d_devsw != NULL, ("NULL devsw"));
38259623Sphk	KASSERT(dp->d_devsw->d_strategy != NULL, ("NULL d_strategy"));
38351215Sphk	dp->d_devsw->d_strategy(bp);
38450565Sphk	return;
38550565Sphk
38650565Sphk}
38750565Sphk
38850565Sphkstatic int
38983366Sjuliandiskioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct thread *td)
39050565Sphk{
39150565Sphk	struct disk *dp;
39250565Sphk	int error;
39393496Sphk	u_int u;
39462617Simp	dev_t pdev;
39550565Sphk
39662617Simp	pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);
39762617Simp	dp = pdev->si_disk;
39874206Ssos	if (!dp)
39974206Ssos		return (ENXIO);
40094272Sphk	if (cmd == DIOCSKERNELDUMP) {
40193496Sphk		u = *(u_int *)data;
40293496Sphk		return (diskdumpconf(u, dev, dp));
40393496Sphk	}
40494287Sphk	if (cmd == DIOCGFRONTSTUFF) {
40594287Sphk		*(off_t *)data = 8192;	/* XXX: crude but enough) */
40694287Sphk		return (0);
40794287Sphk	}
40850565Sphk	error = dsioctl(dev, cmd, data, fflag, &dp->d_slice);
40950565Sphk	if (error == ENOIOCTL)
41083366Sjulian		error = dp->d_devsw->d_ioctl(dev, cmd, data, fflag, td);
41150565Sphk	return (error);
41250565Sphk}
41350565Sphk
41450565Sphkstatic int
41550565Sphkdiskpsize(dev_t dev)
41650565Sphk{
41750565Sphk	struct disk *dp;
41850728Sphk	dev_t pdev;
41950565Sphk
42062617Simp	pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);
42162617Simp	dp = pdev->si_disk;
42262617Simp	if (!dp)
42362617Simp		return (-1);
42462617Simp	if (dp != dev->si_disk) {
42550728Sphk		dev->si_drv1 = pdev->si_drv1;
42650728Sphk		dev->si_drv2 = pdev->si_drv2;
42750728Sphk		/* XXX: don't set bp->b_dev->si_disk (?) */
42850728Sphk	}
42950565Sphk	return (dssize(dev, &dp->d_slice));
43050565Sphk}
43151111Sjulian
43251111SjulianSYSCTL_INT(_debug_sizeof, OID_AUTO, disklabel, CTLFLAG_RD,
43351111Sjulian    0, sizeof(struct disklabel), "sizeof(struct disklabel)");
43451111Sjulian
43551111SjulianSYSCTL_INT(_debug_sizeof, OID_AUTO, diskslices, CTLFLAG_RD,
43651111Sjulian    0, sizeof(struct diskslices), "sizeof(struct diskslices)");
43751111Sjulian
43851111SjulianSYSCTL_INT(_debug_sizeof, OID_AUTO, disk, CTLFLAG_RD,
43951111Sjulian    0, sizeof(struct disk), "sizeof(struct disk)");
44092074Sphk
44192074Sphk#endif
442103675Sphk
443103675Sphk/*-
444103675Sphk * Disk error is the preface to plaintive error messages
445103675Sphk * about failing disk transfers.  It prints messages of the form
446103675Sphk * 	"hp0g: BLABLABLA cmd=read fsbn 12345 of 12344-12347"
447103675Sphk * blkdone should be -1 if the position of the error is unknown.
448103675Sphk * The message is printed with printf.
449103675Sphk */
450103675Sphkvoid
451103675Sphkdisk_err(struct bio *bp, const char *what, int blkdone, int nl)
452103675Sphk{
453103675Sphk	daddr_t sn;
454103675Sphk
455103675Sphk	printf("%s: %s", devtoname(bp->bio_dev), what);
456103675Sphk	switch(bp->bio_cmd) {
457103675Sphk	case BIO_READ:		printf("cmd=read"); break;
458103675Sphk	case BIO_WRITE:		printf("cmd=write"); break;
459103675Sphk	case BIO_DELETE:	printf("cmd=delete"); break;
460103675Sphk	case BIO_GETATTR:	printf("cmd=getattr"); break;
461103675Sphk	case BIO_SETATTR:	printf("cmd=setattr"); break;
462103675Sphk	default:		printf("cmd=%x", bp->bio_cmd); break;
463103675Sphk	}
464103675Sphk	sn = bp->bio_blkno;
465103675Sphk	if (bp->bio_bcount <= DEV_BSIZE) {
466103675Sphk		printf("fsbn %jd%s", (intmax_t)sn, nl ? "\n" : "");
467103675Sphk		return;
468103675Sphk	}
469103675Sphk	if (blkdone >= 0) {
470103675Sphk		sn += blkdone;
471103675Sphk		printf("fsbn %jd of ", (intmax_t)sn);
472103675Sphk	}
473103675Sphk	printf("%jd-%jd", (intmax_t)bp->bio_blkno,
474103675Sphk	    (intmax_t)(bp->bio_blkno + (bp->bio_bcount - 1) / DEV_BSIZE));
475103675Sphk	if (nl)
476103675Sphk		printf("\n");
477103675Sphk}
478103683Sphk
#ifdef notquite
/*
 * Mutex to use when delaying niced I/O bound processes in bioq_disksort().
 */
static struct mtx dksort_mtx;

/*
 * SYSINIT hook that initializes dksort_mtx.  SYSINIT callbacks are invoked
 * with one pointer argument, so the function takes (and ignores) it instead
 * of being declared with an empty parameter list.
 */
static void
dksort_init(void *dummy __unused)
{

	mtx_init(&dksort_mtx, "dksort", NULL, MTX_DEF);
}
SYSINIT(dksort, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, dksort_init, NULL)
#endif
492103683Sphk
493103683Sphk/*
494103683Sphk * Seek sort for disks.
495103683Sphk *
496103683Sphk * The buf_queue keep two queues, sorted in ascending block order.  The first
497103683Sphk * queue holds those requests which are positioned after the current block
498103683Sphk * (in the first request); the second, which starts at queue->switch_point,
499103683Sphk * holds requests which came in after their block number was passed.  Thus
500103683Sphk * we implement a one way scan, retracting after reaching the end of the drive
501103683Sphk * to the first request on the second queue, at which time it becomes the
502103683Sphk * first queue.
503103683Sphk *
504103683Sphk * A one-way scan is natural because of the way UNIX read-ahead blocks are
505103683Sphk * allocated.
506103683Sphk */
507103683Sphk
508103683Sphkvoid
509103683Sphkbioq_disksort(bioq, bp)
510103683Sphk	struct bio_queue_head *bioq;
511103683Sphk	struct bio *bp;
512103683Sphk{
513103683Sphk	struct bio *bq;
514103683Sphk	struct bio *bn;
515103683Sphk	struct bio *be;
516103683Sphk
517103683Sphk#ifdef notquite
518103683Sphk	struct thread *td = curthread;
519103683Sphk
520103683Sphk	if (td && td->td_ksegrp->kg_nice > 0) {
521103683Sphk		TAILQ_FOREACH(bn, &bioq->queue, bio_queue)
522103683Sphk			if (BIOTOBUF(bp)->b_vp != BIOTOBUF(bn)->b_vp)
523103683Sphk				break;
524103683Sphk		if (bn != NULL) {
525103683Sphk			mtx_lock(&dksort_mtx);
526103683Sphk			msleep(&dksort_mtx, &dksort_mtx,
527103683Sphk			    PPAUSE | PCATCH | PDROP, "ioslow",
528103683Sphk			    td->td_ksegrp->kg_nice);
529103683Sphk		}
530103683Sphk	}
531103683Sphk#endif
532103683Sphk	if (!atomic_cmpset_int(&bioq->busy, 0, 1))
533103683Sphk		panic("Recursing in bioq_disksort()");
534103683Sphk	be = TAILQ_LAST(&bioq->queue, bio_queue);
535103683Sphk	/*
536103683Sphk	 * If the queue is empty or we are an
537103683Sphk	 * ordered transaction, then it's easy.
538103683Sphk	 */
539103683Sphk	if ((bq = bioq_first(bioq)) == NULL) {
540103683Sphk		bioq_insert_tail(bioq, bp);
541103683Sphk		bioq->busy = 0;
542103683Sphk		return;
543103683Sphk	} else if (bioq->insert_point != NULL) {
544103683Sphk
545103683Sphk		/*
546103683Sphk		 * A certain portion of the list is
547103683Sphk		 * "locked" to preserve ordering, so
548103683Sphk		 * we can only insert after the insert
549103683Sphk		 * point.
550103683Sphk		 */
551103683Sphk		bq = bioq->insert_point;
552103683Sphk	} else {
553103683Sphk
554103683Sphk		/*
555103683Sphk		 * If we lie before the last removed (currently active)
556103683Sphk		 * request, and are not inserting ourselves into the
557103683Sphk		 * "locked" portion of the list, then we must add ourselves
558103683Sphk		 * to the second request list.
559103683Sphk		 */
560103683Sphk		if (bp->bio_pblkno < bioq->last_pblkno) {
561103683Sphk
562103683Sphk			bq = bioq->switch_point;
563103683Sphk			/*
564103683Sphk			 * If we are starting a new secondary list,
565103683Sphk			 * then it's easy.
566103683Sphk			 */
567103683Sphk			if (bq == NULL) {
568103683Sphk				bioq->switch_point = bp;
569103683Sphk				bioq_insert_tail(bioq, bp);
570103683Sphk				bioq->busy = 0;
571103683Sphk				return;
572103683Sphk			}
573103683Sphk			/*
574103683Sphk			 * If we lie ahead of the current switch point,
575103683Sphk			 * insert us before the switch point and move
576103683Sphk			 * the switch point.
577103683Sphk			 */
578103683Sphk			if (bp->bio_pblkno < bq->bio_pblkno) {
579103683Sphk				bioq->switch_point = bp;
580103683Sphk				TAILQ_INSERT_BEFORE(bq, bp, bio_queue);
581103683Sphk				bioq->busy = 0;
582103683Sphk				return;
583103683Sphk			}
584103683Sphk		} else {
585103683Sphk			if (bioq->switch_point != NULL)
586103683Sphk				be = TAILQ_PREV(bioq->switch_point,
587103683Sphk						bio_queue, bio_queue);
588103683Sphk			/*
589103683Sphk			 * If we lie between last_pblkno and bq,
590103683Sphk			 * insert before bq.
591103683Sphk			 */
592103683Sphk			if (bp->bio_pblkno < bq->bio_pblkno) {
593103683Sphk				TAILQ_INSERT_BEFORE(bq, bp, bio_queue);
594103683Sphk				bioq->busy = 0;
595103683Sphk				return;
596103683Sphk			}
597103683Sphk		}
598103683Sphk	}
599103683Sphk
600103683Sphk	/*
601103683Sphk	 * Request is at/after our current position in the list.
602103683Sphk	 * Optimize for sequential I/O by seeing if we go at the tail.
603103683Sphk	 */
604103683Sphk	if (bp->bio_pblkno > be->bio_pblkno) {
605103683Sphk		TAILQ_INSERT_AFTER(&bioq->queue, be, bp, bio_queue);
606103683Sphk		bioq->busy = 0;
607103683Sphk		return;
608103683Sphk	}
609103683Sphk
610103683Sphk	/* Otherwise, insertion sort */
611103683Sphk	while ((bn = TAILQ_NEXT(bq, bio_queue)) != NULL) {
612103683Sphk
613103683Sphk		/*
614103683Sphk		 * We want to go after the current request if it is the end
615103683Sphk		 * of the first request list, or if the next request is a
616103683Sphk		 * larger cylinder than our request.
617103683Sphk		 */
618103683Sphk		if (bn == bioq->switch_point
619103683Sphk		 || bp->bio_pblkno < bn->bio_pblkno)
620103683Sphk			break;
621103683Sphk		bq = bn;
622103683Sphk	}
623103683Sphk	TAILQ_INSERT_AFTER(&bioq->queue, bq, bp, bio_queue);
624103683Sphk	bioq->busy = 0;
625103683Sphk}
626103683Sphk
627103683Sphk
628