1/*- 2 * Copyright (c) 1997, 1998, 1999 Kenneth D. Merry. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. The name of the author may not be used to endorse or promote products 14 * derived from this software without specific prior written permission. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_kdtrace.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/devicestat.h>
#include <sys/sdt.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/conf.h>
#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/atomic.h>

/*
 * DTrace "io" provider.  The start/done probes fire on every transaction;
 * the bio-flavored macros below pass the struct bio * as the first probe
 * argument, the others pass NULL.
 */
SDT_PROVIDER_DEFINE(io);

SDT_PROBE_DEFINE2(io, , , start, "struct bio *", "struct devstat *");
SDT_PROBE_DEFINE2(io, , , done, "struct bio *", "struct devstat *");
SDT_PROBE_DEFINE2(io, , , wait__start, "struct bio *",
    "struct devstat *");
SDT_PROBE_DEFINE2(io, , , wait__done, "struct bio *",
    "struct devstat *");

#define	DTRACE_DEVSTAT_START()		SDT_PROBE2(io, , , start, NULL, ds)
#define	DTRACE_DEVSTAT_BIO_START()	SDT_PROBE2(io, , , start, bp, ds)
#define	DTRACE_DEVSTAT_DONE()		SDT_PROBE2(io, , , done, NULL, ds)
#define	DTRACE_DEVSTAT_BIO_DONE()	SDT_PROBE2(io, , , done, bp, ds)
#define	DTRACE_DEVSTAT_WAIT_START()	SDT_PROBE2(io, , , wait__start, NULL, ds)
#define	DTRACE_DEVSTAT_WAIT_DONE()	SDT_PROBE2(io, , , wait__done, NULL, ds)

/* Number of entries currently linked onto device_statq. */
static int devstat_num_devs;
/* Bumped (under devstat_mutex) on every list/entry change; lets userland
 * detect that a snapshot taken via the sysctl below went stale. */
static long devstat_generation = 1;
static int devstat_version = DEVSTAT_VERSION;
/* Monotonic device number handed to each new list entry. */
static int devstat_current_devnumber;
static struct mtx devstat_mutex;
MTX_SYSINIT(devstat_mutex, &devstat_mutex, "devstat", MTX_DEF);

static struct devstatlist device_statq = STAILQ_HEAD_INITIALIZER(device_statq);
static struct devstat *devstat_alloc(void);
static void devstat_free(struct devstat *);
static void devstat_add_entry(struct devstat *ds, const void *dev_name,
	    int unit_number, uint32_t block_size,
	    devstat_support_flags flags,
	    devstat_type_flags device_type,
	    devstat_priority priority);

/*
 * Allocate a devstat and initialize it.
 *
 * A unit_number of -1 selects an "anonymous" entry: it is stamped with the
 * dev_name pointer as its id and is NOT linked onto device_statq (compare
 * devstat_remove_entry(), which only unlinks entries whose id is NULL).
 * All other entries are priority-sorted onto the list by
 * devstat_add_entry().
 *
 * Returns the new entry; devstat_alloc() sleeps (M_WAITOK) rather than
 * fail, so the result is never NULL.
 */
struct devstat *
devstat_new_entry(const void *dev_name,
		  int unit_number, uint32_t block_size,
		  devstat_support_flags flags,
		  devstat_type_flags device_type,
		  devstat_priority priority)
{
	struct devstat *ds;

	mtx_assert(&devstat_mutex, MA_NOTOWNED);

	/* May sleep; must therefore happen before we take the mutex. */
	ds = devstat_alloc();
	mtx_lock(&devstat_mutex);
	if (unit_number == -1) {
		ds->id = dev_name;
		binuptime(&ds->creation_time);
		devstat_generation++;
	} else {
		devstat_add_entry(ds, dev_name, unit_number, block_size,
				  flags, device_type, priority);
	}
	mtx_unlock(&devstat_mutex);
	return (ds);
}

/*
 * Take a malloced and zeroed devstat structure given to us, fill it in
 * and add it to the queue of devices.
 */
static void
devstat_add_entry(struct devstat *ds, const void *dev_name,
		  int unit_number, uint32_t block_size,
		  devstat_support_flags flags,
		  devstat_type_flags device_type,
		  devstat_priority priority)
{
	struct devstatlist *devstat_head;
	struct devstat *ds_tmp;

	mtx_assert(&devstat_mutex, MA_OWNED);
	devstat_num_devs++;

	devstat_head = &device_statq;

	/*
	 * Priority sort.  Each driver passes in its priority when it adds
	 * its devstat entry.  Drivers are sorted first by priority, and
	 * then by probe order.
	 *
	 * For the first device, we just insert it, since the priority
	 * doesn't really matter yet.  Subsequent devices are inserted into
	 * the list using the order outlined above.
	 */
	if (devstat_num_devs == 1)
		STAILQ_INSERT_TAIL(devstat_head, ds, dev_links);
	else {
		STAILQ_FOREACH(ds_tmp, devstat_head, dev_links) {
			struct devstat *ds_next;

			ds_next = STAILQ_NEXT(ds_tmp, dev_links);

			/*
			 * If we find a break between higher and lower
			 * priority items, and if this item fits in the
			 * break, insert it.  This also applies if the
			 * "lower priority item" is the end of the list.
			 */
			if ((priority <= ds_tmp->priority)
			 && ((ds_next == NULL)
			   || (priority > ds_next->priority))) {
				STAILQ_INSERT_AFTER(devstat_head, ds_tmp, ds,
						    dev_links);
				break;
			} else if (priority > ds_tmp->priority) {
				/*
				 * If this is the case, we should be able
				 * to insert ourselves at the head of the
				 * list.  If we can't, something is wrong.
				 */
				if (ds_tmp == STAILQ_FIRST(devstat_head)) {
					STAILQ_INSERT_HEAD(devstat_head,
							   ds, dev_links);
					break;
				} else {
					STAILQ_INSERT_TAIL(devstat_head,
							   ds, dev_links);
					printf("devstat_add_entry: HELP! "
					       "sorting problem detected "
					       "for name %p unit %d\n",
					       dev_name, unit_number);
					break;
				}
			}
		}
	}

	ds->device_number = devstat_current_devnumber++;
	ds->unit_number = unit_number;
	strlcpy(ds->device_name, dev_name, DEVSTAT_NAME_LEN);
	ds->block_size = block_size;
	ds->flags = flags;
	ds->device_type = device_type;
	ds->priority = priority;
	binuptime(&ds->creation_time);
	devstat_generation++;
}

/*
 * Remove a devstat structure from the list of devices.
 *
 * Safe to call with ds == NULL (no-op).  Only entries with a NULL id are
 * actually on device_statq (see devstat_new_entry()), so only those adjust
 * devstat_num_devs and get unlinked here.
 */
void
devstat_remove_entry(struct devstat *ds)
{
	struct devstatlist *devstat_head;

	mtx_assert(&devstat_mutex, MA_NOTOWNED);
	if (ds == NULL)
		return;

	mtx_lock(&devstat_mutex);

	devstat_head = &device_statq;

	/*
	 * Remove this entry from the devstat queue.  Bumping sequence1
	 * first marks the structure as inconsistent for any mmap(2)
	 * readers while it is being torn down (see the protocol comment
	 * above devstat_end_transaction()).
	 */
	atomic_add_acq_int(&ds->sequence1, 1);
	if (ds->id == NULL) {
		devstat_num_devs--;
		STAILQ_REMOVE(devstat_head, ds, devstat, dev_links);
	}
	devstat_free(ds);
	devstat_generation++;
	mtx_unlock(&devstat_mutex);
}

/*
 * Record a transaction start.
 *
 * See comments for devstat_end_transaction().  Ordering is very important
 * here.
 */
void
devstat_start_transaction(struct devstat *ds, struct bintime *now)
{

	mtx_assert(&devstat_mutex, MA_NOTOWNED);

	/* sanity check */
	if (ds == NULL)
		return;

	/* Open the "inconsistent" window for mmap(2) readers. */
	atomic_add_acq_int(&ds->sequence1, 1);
	/*
	 * We only want to set the start time when we are going from idle
	 * to busy.  The start time is really the start of the latest busy
	 * period.
	 */
	if (ds->start_count == ds->end_count) {
		if (now != NULL)
			ds->busy_from = *now;
		else
			binuptime(&ds->busy_from);
	}
	ds->start_count++;
	/* Close the window; release barrier pairs with the acquire above. */
	atomic_add_rel_int(&ds->sequence0, 1);
	DTRACE_DEVSTAT_START();
}

/*
 * Record a transaction start for a bio.  Stamps bp->bio_t0 with the
 * current uptime so that the bio completion path can later compute the
 * per-transaction duration from it.
 */
void
devstat_start_transaction_bio(struct devstat *ds, struct bio *bp)
{

	mtx_assert(&devstat_mutex, MA_NOTOWNED);

	/* sanity check */
	if (ds == NULL)
		return;

	binuptime(&bp->bio_t0);
	devstat_start_transaction(ds, &bp->bio_t0);
	DTRACE_DEVSTAT_BIO_START();
}

/*
 * Record the ending of a transaction, and increment the various counters.
 *
 * Ordering in this function, and in devstat_start_transaction() is VERY
 * important.  The idea here is to run without locks, so we are very
 * careful to only modify some fields on the way "down" (i.e. at
 * transaction start) and some fields on the way "up" (i.e. at transaction
 * completion).  One exception is busy_from, which we only modify in
 * devstat_start_transaction() when there are no outstanding transactions,
 * and thus it can't be modified in devstat_end_transaction()
 * simultaneously.
 *
 * The sequence0 and sequence1 fields are provided to enable an application
 * spying on the structures with mmap(2) to tell when a structure is in a
 * consistent state or not.
 *
 * For this to work 100% reliably, it is important that the two fields
 * are at opposite ends of the structure and that they are incremented
 * in the opposite order of how a memcpy(3) in userland would copy them.
 * We assume that the copying happens front to back, but there is actually
 * no way short of writing your own memcpy(3) replacement to guarantee
 * this will be the case.
 *
 * In addition to this, being a kind of locks, they must be updated with
 * atomic instructions using appropriate memory barriers.
 */
void
devstat_end_transaction(struct devstat *ds, uint32_t bytes,
			devstat_tag_type tag_type, devstat_trans_flags flags,
			struct bintime *now, struct bintime *then)
{
	struct bintime dt, lnow;

	/* sanity check */
	if (ds == NULL)
		return;

	/* If the caller didn't supply a completion time, sample one. */
	if (now == NULL) {
		now = &lnow;
		binuptime(now);
	}

	atomic_add_acq_int(&ds->sequence1, 1);
	/* Update byte and operations counts */
	ds->bytes[flags] += bytes;
	ds->operations[flags]++;

	/*
	 * Keep a count of the various tag types sent.
	 */
	if ((ds->flags & DEVSTAT_NO_ORDERED_TAGS) == 0 &&
	    tag_type != DEVSTAT_TAG_NONE)
		ds->tag_types[tag_type]++;

	/* "then" is the transaction's start time; NULL means unknown. */
	if (then != NULL) {
		/* Update duration of operations */
		dt = *now;
		bintime_sub(&dt, then);
		bintime_add(&ds->duration[flags], &dt);
	}

	/* Accumulate busy time */
	dt = *now;
	bintime_sub(&dt, &ds->busy_from);
	bintime_add(&ds->busy_time, &dt);
	ds->busy_from = *now;

	ds->end_count++;
	atomic_add_rel_int(&ds->sequence0, 1);
	DTRACE_DEVSTAT_DONE();
}

/* Convenience wrapper: complete a bio with the completion time sampled now. */
void
devstat_end_transaction_bio(struct devstat *ds, struct bio *bp)
{

	devstat_end_transaction_bio_bt(ds, bp, NULL);
}

/*
 * Complete a bio transaction with an optional caller-supplied completion
 * time.  Maps the bio command to a devstat transaction flag; anything that
 * is not a delete/read/write is counted as DEVSTAT_NO_DATA.  bp->bio_t0
 * (stamped by devstat_start_transaction_bio()) supplies the start time.
 */
void
devstat_end_transaction_bio_bt(struct devstat *ds, struct bio *bp,
    struct bintime *now)
{
	devstat_trans_flags flg;

	/* sanity check */
	if (ds == NULL)
		return;

	if (bp->bio_cmd == BIO_DELETE)
		flg = DEVSTAT_FREE;
	else if (bp->bio_cmd == BIO_READ)
		flg = DEVSTAT_READ;
	else if (bp->bio_cmd == BIO_WRITE)
		flg = DEVSTAT_WRITE;
	else
		flg = DEVSTAT_NO_DATA;

	/* bio_bcount - bio_resid is the number of bytes actually moved. */
	devstat_end_transaction(ds, bp->bio_bcount - bp->bio_resid,
	    DEVSTAT_TAG_SIMPLE, flg, now, &bp->bio_t0);
	DTRACE_DEVSTAT_BIO_DONE();
}

/*
 * This is the sysctl handler for the devstat package.  The data pushed out
 * on the kern.devstat.all sysctl variable consists of the current devstat
 * generation number, and then an array of devstat structures, one for each
 * device in the system.
 *
 * This is more cryptic than obvious, but basically we neither can nor
 * want to hold the devstat_mutex for any amount of time, so we grab it
 * only when we need to and keep an eye on devstat_generation all the time.
 */
static int
sysctl_devstat(SYSCTL_HANDLER_ARGS)
{
	int error;
	long mygen;
	struct devstat *nds;

	mtx_assert(&devstat_mutex, MA_NOTOWNED);

	/*
	 * XXX devstat_generation should really be "volatile" but that
	 * XXX freaks out the sysctl macro below.  The places where we
	 * XXX change it and inspect it are bracketed in the mutex which
	 * XXX guarantees us proper write barriers.  I don't believe the
	 * XXX compiler is allowed to optimize mygen away across calls
	 * XXX to other functions, so the following is believed to be safe.
	 */
	mygen = devstat_generation;

	error = SYSCTL_OUT(req, &mygen, sizeof(mygen));

	/*
	 * NOTE(review): with an empty list we return 0 even if the
	 * SYSCTL_OUT of the generation above failed, swallowing that
	 * error.  Existing userland appears to tolerate this ordering —
	 * confirm before reordering these two checks.
	 */
	if (devstat_num_devs == 0)
		return(0);

	if (error != 0)
		return (error);

	/*
	 * Snapshot the list head under the lock and verify the generation
	 * hasn't moved since we copied it out; EBUSY tells userland to
	 * retry with a fresh generation.
	 */
	mtx_lock(&devstat_mutex);
	nds = STAILQ_FIRST(&device_statq);
	if (mygen != devstat_generation)
		error = EBUSY;
	mtx_unlock(&devstat_mutex);

	if (error != 0)
		return (error);

	/*
	 * Walk the list, copying out one entry at a time.  The mutex is
	 * dropped across each SYSCTL_OUT (which may sleep), so the
	 * generation is rechecked before advancing to the next entry.
	 */
	for (;nds != NULL;) {
		error = SYSCTL_OUT(req, nds, sizeof(struct devstat));
		if (error != 0)
			return (error);
		mtx_lock(&devstat_mutex);
		if (mygen != devstat_generation)
			error = EBUSY;
		else
			nds = STAILQ_NEXT(nds, dev_links);
		mtx_unlock(&devstat_mutex);
		if (error != 0)
			return (error);
	}
	return(error);
}

/*
 * Sysctl entries for devstat.  The first one is a node that all the rest
 * hang off of.
 */
static SYSCTL_NODE(_kern, OID_AUTO, devstat, CTLFLAG_RD, NULL,
    "Device Statistics");

SYSCTL_PROC(_kern_devstat, OID_AUTO, all, CTLFLAG_RD|CTLTYPE_OPAQUE,
    NULL, 0, sysctl_devstat, "S,devstat", "All devices in the devstat list");
/*
 * Export the number of devices in the system so that userland utilities
 * can determine how much memory to allocate to hold all the devices.
 */
SYSCTL_INT(_kern_devstat, OID_AUTO, numdevs, CTLFLAG_RD,
    &devstat_num_devs, 0, "Number of devices in the devstat list");
SYSCTL_LONG(_kern_devstat, OID_AUTO, generation, CTLFLAG_RD,
    &devstat_generation, 0, "Devstat list generation");
SYSCTL_INT(_kern_devstat, OID_AUTO, version, CTLFLAG_RD,
    &devstat_version, 0, "Devstat list version number");

/*
 * Allocator for struct devstat structures.  We sub-allocate these from pages
 * which we get from malloc.  These pages are exported for mmap(2)'ing through
 * a miniature device driver.
 */

/* Number of devstat slots that fit in one sub-allocation page. */
#define statsperpage (PAGE_SIZE / sizeof(struct devstat))

static d_mmap_t devstat_mmap;

static struct cdevsw devstat_cdevsw = {
	.d_version =	D_VERSION,
	.d_flags =	D_NEEDGIANT,
	.d_mmap =	devstat_mmap,
	.d_name =	"devstat",
};

/* One page's worth of devstat slots plus its free-slot count. */
struct statspage {
	TAILQ_ENTRY(statspage)	list;
	struct devstat		*stat;
	u_int			nfree;
};

/*
 * List order here defines the page order seen by mmap(2) consumers, so
 * pages are only ever appended (see devstat_alloc()).
 */
static TAILQ_HEAD(, statspage)	pagelist = TAILQ_HEAD_INITIALIZER(pagelist);
static MALLOC_DEFINE(M_DEVSTAT, "devstat", "Device statistics");

/*
 * d_mmap handler: translate a page-aligned offset into the physical
 * address of the corresponding statistics page.  Read-only mappings only;
 * returns -1 for write requests or offsets past the last page.
 */
static int
devstat_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
    int nprot, vm_memattr_t *memattr)
{
	struct statspage *spp;

	if (nprot != VM_PROT_READ)
		return (-1);
	/* The Nth list entry backs the Nth page of the mapping. */
	TAILQ_FOREACH(spp, &pagelist, list) {
		if (offset == 0) {
			*paddr = vtophys(spp->stat);
			return (0);
		}
		offset -= PAGE_SIZE;
	}
	return (-1);
}

/*
 * Grab a free devstat slot, growing the page list if every existing page
 * is full.  May sleep (M_WAITOK), so the devstat_mutex must not be held
 * on entry; it is dropped around the malloc calls and the search is
 * re-done afterwards in case another thread freed or added slots while
 * we slept.
 */
static struct devstat *
devstat_alloc(void)
{
	struct devstat *dsp;
	struct statspage *spp, *spp2;
	u_int u;
	static int once;

	mtx_assert(&devstat_mutex, MA_NOTOWNED);
	/* Lazily create the /dev node used for mmap(2) export. */
	if (!once) {
		make_dev_credf(MAKEDEV_ETERNAL | MAKEDEV_CHECKNAME,
		    &devstat_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0400,
		    DEVSTAT_DEVICE_NAME);
		once = 1;
	}
	spp2 = NULL;
	mtx_lock(&devstat_mutex);
	for (;;) {
		/* Look for a page with at least one free slot. */
		TAILQ_FOREACH(spp, &pagelist, list) {
			if (spp->nfree > 0)
				break;
		}
		if (spp != NULL)
			break;
		/* None free: drop the lock to allocate a fresh page. */
		mtx_unlock(&devstat_mutex);
		spp2 = malloc(sizeof *spp, M_DEVSTAT, M_ZERO | M_WAITOK);
		spp2->stat = malloc(PAGE_SIZE, M_DEVSTAT, M_ZERO | M_WAITOK);
		spp2->nfree = statsperpage;

		/*
		 * If free statspages were added while the lock was released
		 * just reuse them.
		 */
		mtx_lock(&devstat_mutex);
		TAILQ_FOREACH(spp, &pagelist, list)
			if (spp->nfree > 0)
				break;
		if (spp == NULL) {
			spp = spp2;

			/*
			 * It would make more sense to add the new page at the
			 * head but the order on the list determines the
			 * sequence of the mapping so we can't do that.
			 */
			TAILQ_INSERT_TAIL(&pagelist, spp, list);
		} else
			break;
	}
	/* Scan the chosen page for the first unallocated slot. */
	dsp = spp->stat;
	for (u = 0; u < statsperpage; u++) {
		if (dsp->allocated == 0)
			break;
		dsp++;
	}
	spp->nfree--;
	dsp->allocated = 1;
	mtx_unlock(&devstat_mutex);
	/* If our freshly-malloced page went unused, give it back. */
	if (spp2 != NULL && spp2 != spp) {
		free(spp2->stat, M_DEVSTAT);
		free(spp2, M_DEVSTAT);
	}
	return (dsp);
}

/*
 * Return a devstat slot to its page: zero the slot (clearing `allocated`)
 * and bump the owning page's free count.  Pages themselves are never
 * freed, since they may be mmap(2)'ed by userland.
 */
static void
devstat_free(struct devstat *dsp)
{
	struct statspage *spp;

	mtx_assert(&devstat_mutex, MA_OWNED);
	bzero(dsp, sizeof *dsp);
	/* Find the page whose slot range contains dsp. */
	TAILQ_FOREACH(spp, &pagelist, list) {
		if (dsp >= spp->stat && dsp < (spp->stat + statsperpage)) {
			spp->nfree++;
			return;
		}
	}
}

SYSCTL_INT(_debug_sizeof, OID_AUTO, devstat, CTLFLAG_RD,
    NULL, sizeof(struct devstat), "sizeof(struct devstat)");