/*
 * Copyright (C) 2003 Sistina Software
 * Copyright (C) 2006 Red Hat GmbH
 *
 * This file is released under the GPL.
 */

#include "dm.h"

#include <linux/device-mapper.h>

#include <linux/bio.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/dm-io.h>

#define DM_MSG_PREFIX "io"

#define DM_IO_MAX_REGIONS	BITS_PER_LONG

struct dm_io_client {
	mempool_t *pool;
	struct bio_set *bios;
};

/*
 * Aligning 'struct io' reduces the number of bits required to store
 * its address.  Refer to store_io_and_region_in_bio() below.
 */
struct io {
	unsigned long error_bits;
	unsigned long eopnotsupp_bits;
	atomic_t count;
	struct task_struct *sleeper;
	struct dm_io_client *client;
	io_notify_fn callback;
	void *context;
} __attribute__((aligned(DM_IO_MAX_REGIONS)));

static struct kmem_cache *_dm_io_cache;

static unsigned int pages_to_ios(unsigned int pages)
{
	return 4 * pages;	/* too many ? */
}

/*
 * Create a client with mempool and bioset.
 */
struct dm_io_client *dm_io_client_create(unsigned num_pages)
{
	unsigned ios = pages_to_ios(num_pages);
	struct dm_io_client *client;

	client = kmalloc(sizeof(*client), GFP_KERNEL);
	if (!client)
		return ERR_PTR(-ENOMEM);

	client->pool = mempool_create_slab_pool(ios, _dm_io_cache);
	if (!client->pool)
		goto bad;

	client->bios = bioset_create(16, 0);
	if (!client->bios)
		goto bad;

	return client;

 bad:
	if (client->pool)
		mempool_destroy(client->pool);
	kfree(client);
	return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(dm_io_client_create);

int dm_io_client_resize(unsigned num_pages, struct dm_io_client *client)
{
	return mempool_resize(client->pool, pages_to_ios(num_pages),
			      GFP_KERNEL);
}
EXPORT_SYMBOL(dm_io_client_resize);

void dm_io_client_destroy(struct dm_io_client *client)
{
	mempool_destroy(client->pool);
	bioset_free(client->bios);
	kfree(client);
}
EXPORT_SYMBOL(dm_io_client_destroy);

/*-----------------------------------------------------------------
 * We need to keep track of which region a bio is doing io for.
 * To avoid a memory allocation to store just 5 or 6 bits, we
 * ensure the 'struct io' pointer is aligned so enough low bits are
 * always zero and then combine it with the region number directly in
 * bi_private.
 *---------------------------------------------------------------*/
static void store_io_and_region_in_bio(struct bio *bio, struct io *io,
				       unsigned region)
{
	if (unlikely(!IS_ALIGNED((unsigned long)io, DM_IO_MAX_REGIONS))) {
		DMCRIT("Unaligned struct io pointer %p", io);
		BUG();
	}

	bio->bi_private = (void *)((unsigned long)io | region);
}

static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io,
					    unsigned *region)
{
	unsigned long val = (unsigned long)bio->bi_private;

	*io = (void *)(val & -(unsigned long)DM_IO_MAX_REGIONS);
	*region = val & (DM_IO_MAX_REGIONS - 1);
}
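/*
 * A worked illustration of the packing above (a sketch; exact widths depend
 * on BITS_PER_LONG): on a 64-bit build DM_IO_MAX_REGIONS is 64, so
 * 'struct io' is 64-byte aligned and the low six bits of its address are
 * always zero.  Storing region 5 therefore gives
 *
 *	bio->bi_private = (void *)((unsigned long)io | 5);
 *
 * and, because DM_IO_MAX_REGIONS is a power of two,
 * 'val & -(unsigned long)DM_IO_MAX_REGIONS' clears those low bits again to
 * recover the pointer, while 'val & (DM_IO_MAX_REGIONS - 1)' recovers the
 * region number.
 */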
/*-----------------------------------------------------------------
 * We need an io object to keep track of the number of bios that
 * have been dispatched for a particular io.
 *---------------------------------------------------------------*/
static void dec_count(struct io *io, unsigned int region, int error)
{
	if (error) {
		set_bit(region, &io->error_bits);
		if (error == -EOPNOTSUPP)
			set_bit(region, &io->eopnotsupp_bits);
	}

	if (atomic_dec_and_test(&io->count)) {
		if (io->sleeper)
			wake_up_process(io->sleeper);

		else {
			unsigned long r = io->error_bits;
			io_notify_fn fn = io->callback;
			void *context = io->context;

			mempool_free(io, io->client->pool);
			fn(r, context);
		}
	}
}

static void endio(struct bio *bio, int error)
{
	struct io *io;
	unsigned region;

	if (error && bio_data_dir(bio) == READ)
		zero_fill_bio(bio);

	/*
	 * The bio destructor in bio_put() may use the io object.
	 */
	retrieve_io_and_region_from_bio(bio, &io, &region);

	bio_put(bio);

	dec_count(io, region, error);
}

/*-----------------------------------------------------------------
 * These little objects provide an abstraction for getting a new
 * destination page for io.
 *---------------------------------------------------------------*/
struct dpages {
	void (*get_page)(struct dpages *dp,
			 struct page **p, unsigned long *len, unsigned *offset);
	void (*next_page)(struct dpages *dp);

	unsigned context_u;
	void *context_ptr;
};

/*
 * Functions for getting the pages from a list.
 */
static void list_get_page(struct dpages *dp,
			  struct page **p, unsigned long *len, unsigned *offset)
{
	unsigned o = dp->context_u;
	struct page_list *pl = (struct page_list *) dp->context_ptr;

	*p = pl->page;
	*len = PAGE_SIZE - o;
	*offset = o;
}

static void list_next_page(struct dpages *dp)
{
	struct page_list *pl = (struct page_list *) dp->context_ptr;
	dp->context_ptr = pl->next;
	dp->context_u = 0;
}

static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offset)
{
	dp->get_page = list_get_page;
	dp->next_page = list_next_page;
	dp->context_u = offset;
	dp->context_ptr = pl;
}

/*
 * Functions for getting the pages from a bvec.
 */
static void bvec_get_page(struct dpages *dp,
			  struct page **p, unsigned long *len, unsigned *offset)
{
	struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr;
	*p = bvec->bv_page;
	*len = bvec->bv_len;
	*offset = bvec->bv_offset;
}

static void bvec_next_page(struct dpages *dp)
{
	struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr;
	dp->context_ptr = bvec + 1;
}

static void bvec_dp_init(struct dpages *dp, struct bio_vec *bvec)
{
	dp->get_page = bvec_get_page;
	dp->next_page = bvec_next_page;
	dp->context_ptr = bvec;
}
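/*
 * Consumption sketch (assuming a caller-supplied struct page_list 'pl'): a
 * user of this abstraction alternates get_page() and next_page(), much as
 * do_region() does further down.  Starting a list at byte offset 512:
 *
 *	struct dpages dp;
 *	struct page *page;
 *	unsigned long len;
 *	unsigned offset;
 *
 *	list_dp_init(&dp, pl, 512);
 *	dp.get_page(&dp, &page, &len, &offset);
 *		first page: offset == 512, len == PAGE_SIZE - 512
 *	dp.next_page(&dp);
 *		moves to pl->next and resets the offset to 0
 */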
/*
 * Functions for getting the pages from a VMA.
 */
static void vm_get_page(struct dpages *dp,
			struct page **p, unsigned long *len, unsigned *offset)
{
	*p = vmalloc_to_page(dp->context_ptr);
	*offset = dp->context_u;
	*len = PAGE_SIZE - dp->context_u;
}

static void vm_next_page(struct dpages *dp)
{
	dp->context_ptr += PAGE_SIZE - dp->context_u;
	dp->context_u = 0;
}

static void vm_dp_init(struct dpages *dp, void *data)
{
	dp->get_page = vm_get_page;
	dp->next_page = vm_next_page;
	dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1);
	dp->context_ptr = data;
}

static void dm_bio_destructor(struct bio *bio)
{
	unsigned region;
	struct io *io;

	retrieve_io_and_region_from_bio(bio, &io, &region);

	bio_free(bio, io->client->bios);
}

/*
 * Functions for getting the pages from kernel memory.
 */
static void km_get_page(struct dpages *dp, struct page **p, unsigned long *len,
			unsigned *offset)
{
	*p = virt_to_page(dp->context_ptr);
	*offset = dp->context_u;
	*len = PAGE_SIZE - dp->context_u;
}

static void km_next_page(struct dpages *dp)
{
	dp->context_ptr += PAGE_SIZE - dp->context_u;
	dp->context_u = 0;
}

static void km_dp_init(struct dpages *dp, void *data)
{
	dp->get_page = km_get_page;
	dp->next_page = km_next_page;
	dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1);
	dp->context_ptr = data;
}

/*-----------------------------------------------------------------
 * IO routines that accept a list of pages.
 *---------------------------------------------------------------*/
static void do_region(int rw, unsigned region, struct dm_io_region *where,
		      struct dpages *dp, struct io *io)
{
	struct bio *bio;
	struct page *page;
	unsigned long len;
	unsigned offset;
	unsigned num_bvecs;
	sector_t remaining = where->count;

	/*
	 * where->count may be zero if rw holds a write barrier and we
	 * need to send a zero-sized barrier.
	 */
	do {
		/*
		 * Allocate a suitably sized bio.
		 */
		num_bvecs = dm_sector_div_up(remaining,
					     (PAGE_SIZE >> SECTOR_SHIFT));
		num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), num_bvecs);
		bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
		bio->bi_sector = where->sector + (where->count - remaining);
		bio->bi_bdev = where->bdev;
		bio->bi_end_io = endio;
		bio->bi_destructor = dm_bio_destructor;
		store_io_and_region_in_bio(bio, io, region);

		/*
		 * Try and add as many pages as possible.
		 */
		while (remaining) {
			dp->get_page(dp, &page, &len, &offset);
			len = min(len, to_bytes(remaining));
			if (!bio_add_page(bio, page, len, offset))
				break;

			offset = 0;
			remaining -= to_sector(len);
			dp->next_page(dp);
		}

		atomic_inc(&io->count);
		submit_bio(rw, bio);
	} while (remaining);
}
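/*
 * Reference-counting sketch of the dispatch path: io->count starts at 1 in
 * the caller (see sync_io()/async_io() below), do_region() takes one extra
 * reference per bio it submits, and dispatch_io() drops the initial
 * reference once every region has been dispatched.  With two bios in flight
 * the count therefore goes 1 -> 2 -> 3 -> 2 -> 1 -> 0, and only the final
 * dec_count() wakes the sleeper or fires the notify callback.
 */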
static void dispatch_io(int rw, unsigned int num_regions,
			struct dm_io_region *where, struct dpages *dp,
			struct io *io, int sync)
{
	int i;
	struct dpages old_pages = *dp;

	BUG_ON(num_regions > DM_IO_MAX_REGIONS);

	if (sync)
		rw |= REQ_SYNC | REQ_UNPLUG;

	/*
	 * For multiple regions we need to be careful to rewind
	 * the dp object for each call to do_region.
	 */
	for (i = 0; i < num_regions; i++) {
		*dp = old_pages;
		if (where[i].count || (rw & REQ_HARDBARRIER))
			do_region(rw, i, where + i, dp, io);
	}

	/*
	 * Drop the extra reference that we were holding to avoid
	 * the io being completed too early.
	 */
	dec_count(io, 0, 0);
}

static int sync_io(struct dm_io_client *client, unsigned int num_regions,
		   struct dm_io_region *where, int rw, struct dpages *dp,
		   unsigned long *error_bits)
{
	/*
	 * gcc <= 4.3 can't do the alignment for stack variables, so we must
	 * align it on our own.
	 * volatile prevents the optimizer from removing or reusing
	 * "io_" field from the stack frame (allowed in ANSI C).
	 */
	volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1];
	struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io));

	if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
		WARN_ON(1);
		return -EIO;
	}

retry:
	io->error_bits = 0;
	io->eopnotsupp_bits = 0;
	atomic_set(&io->count, 1); /* see dispatch_io() */
	io->sleeper = current;
	io->client = client;

	dispatch_io(rw, num_regions, where, dp, io, 1);

	while (1) {
		set_current_state(TASK_UNINTERRUPTIBLE);

		if (!atomic_read(&io->count))
			break;

		io_schedule();
	}
	set_current_state(TASK_RUNNING);

	if (io->eopnotsupp_bits && (rw & REQ_HARDBARRIER)) {
		rw &= ~REQ_HARDBARRIER;
		goto retry;
	}

	if (error_bits)
		*error_bits = io->error_bits;

	return io->error_bits ? -EIO : 0;
}

static int async_io(struct dm_io_client *client, unsigned int num_regions,
		    struct dm_io_region *where, int rw, struct dpages *dp,
		    io_notify_fn fn, void *context)
{
	struct io *io;

	if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
		WARN_ON(1);
		fn(1, context);
		return -EIO;
	}

	io = mempool_alloc(client->pool, GFP_NOIO);
	io->error_bits = 0;
	io->eopnotsupp_bits = 0;
	atomic_set(&io->count, 1); /* see dispatch_io() */
	io->sleeper = NULL;
	io->client = client;
	io->callback = fn;
	io->context = context;

	dispatch_io(rw, num_regions, where, dp, io, 0);
	return 0;
}

static int dp_init(struct dm_io_request *io_req, struct dpages *dp)
{
	/* Set up dpages based on memory type */
	switch (io_req->mem.type) {
	case DM_IO_PAGE_LIST:
		list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset);
		break;

	case DM_IO_BVEC:
		bvec_dp_init(dp, io_req->mem.ptr.bvec);
		break;

	case DM_IO_VMA:
		vm_dp_init(dp, io_req->mem.ptr.vma);
		break;

	case DM_IO_KMEM:
		km_dp_init(dp, io_req->mem.ptr.addr);
		break;

	default:
		return -EINVAL;
	}

	return 0;
}
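/*
 * Usage sketch for dm_io() below: a synchronous read of eight sectors into
 * a kernel buffer.  'client', 'bdev' and 'data' are assumed to be set up by
 * the caller; leaving notify.fn NULL makes dm_io() take the sync_io() path.
 *
 *	struct dm_io_request io_req = {
 *		.bi_rw = READ,
 *		.mem.type = DM_IO_KMEM,
 *		.mem.ptr.addr = data,
 *		.notify.fn = NULL,
 *		.client = client,
 *	};
 *	struct dm_io_region region = {
 *		.bdev = bdev,
 *		.sector = 0,
 *		.count = 8,
 *	};
 *	unsigned long error_bits;
 *
 *	int r = dm_io(&io_req, 1, &region, &error_bits);
 */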
/*
 * New collapsed (a)synchronous interface.
 *
 * If the IO is asynchronous (i.e. it has notify.fn), you must either unplug
 * the queue with blk_unplug() some time later or set REQ_SYNC in
 * io_req->bi_rw. If you fail to do one of these, the IO will be submitted to
 * the disk after q->unplug_delay, which defaults to 3ms in blk-settings.c.
 */
int dm_io(struct dm_io_request *io_req, unsigned num_regions,
	  struct dm_io_region *where, unsigned long *sync_error_bits)
{
	int r;
	struct dpages dp;

	r = dp_init(io_req, &dp);
	if (r)
		return r;

	if (!io_req->notify.fn)
		return sync_io(io_req->client, num_regions, where,
			       io_req->bi_rw, &dp, sync_error_bits);

	return async_io(io_req->client, num_regions, where, io_req->bi_rw,
			&dp, io_req->notify.fn, io_req->notify.context);
}
EXPORT_SYMBOL(dm_io);

int __init dm_io_init(void)
{
	_dm_io_cache = KMEM_CACHE(io, 0);
	if (!_dm_io_cache)
		return -ENOMEM;

	return 0;
}

void dm_io_exit(void)
{
	kmem_cache_destroy(_dm_io_cache);
	_dm_io_cache = NULL;
}