1 2#include "ceph_debug.h" 3 4#include <linux/slab.h> 5#include <asm/div64.h> 6 7#include "super.h" 8#include "osdmap.h" 9#include "crush/hash.h" 10#include "crush/mapper.h" 11#include "decode.h" 12 13char *ceph_osdmap_state_str(char *str, int len, int state) 14{ 15 int flag = 0; 16 17 if (!len) 18 goto done; 19 20 *str = '\0'; 21 if (state) { 22 if (state & CEPH_OSD_EXISTS) { 23 snprintf(str, len, "exists"); 24 flag = 1; 25 } 26 if (state & CEPH_OSD_UP) { 27 snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""), 28 "up"); 29 flag = 1; 30 } 31 } else { 32 snprintf(str, len, "doesn't exist"); 33 } 34done: 35 return str; 36} 37 38/* maps */ 39 40static int calc_bits_of(unsigned t) 41{ 42 int b = 0; 43 while (t) { 44 t = t >> 1; 45 b++; 46 } 47 return b; 48} 49 50/* 51 * the foo_mask is the smallest value 2^n-1 that is >= foo. 52 */ 53static void calc_pg_masks(struct ceph_pg_pool_info *pi) 54{ 55 pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1; 56 pi->pgp_num_mask = 57 (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1; 58 pi->lpg_num_mask = 59 (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1; 60 pi->lpgp_num_mask = 61 (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1; 62} 63 64/* 65 * decode crush map 66 */ 67static int crush_decode_uniform_bucket(void **p, void *end, 68 struct crush_bucket_uniform *b) 69{ 70 dout("crush_decode_uniform_bucket %p to %p\n", *p, end); 71 ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad); 72 b->item_weight = ceph_decode_32(p); 73 return 0; 74bad: 75 return -EINVAL; 76} 77 78static int crush_decode_list_bucket(void **p, void *end, 79 struct crush_bucket_list *b) 80{ 81 int j; 82 dout("crush_decode_list_bucket %p to %p\n", *p, end); 83 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); 84 if (b->item_weights == NULL) 85 return -ENOMEM; 86 b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); 87 if (b->sum_weights == NULL) 88 return -ENOMEM; 89 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad); 90 for (j = 0; j < b->h.size; j++) { 91 b->item_weights[j] = ceph_decode_32(p); 92 b->sum_weights[j] = ceph_decode_32(p); 93 } 94 return 0; 95bad: 96 return -EINVAL; 97} 98 99static int crush_decode_tree_bucket(void **p, void *end, 100 struct crush_bucket_tree *b) 101{ 102 int j; 103 dout("crush_decode_tree_bucket %p to %p\n", *p, end); 104 ceph_decode_32_safe(p, end, b->num_nodes, bad); 105 b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS); 106 if (b->node_weights == NULL) 107 return -ENOMEM; 108 ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad); 109 for (j = 0; j < b->num_nodes; j++) 110 b->node_weights[j] = ceph_decode_32(p); 111 return 0; 112bad: 113 return -EINVAL; 114} 115 116static int crush_decode_straw_bucket(void **p, void *end, 117 struct crush_bucket_straw *b) 118{ 119 int j; 120 dout("crush_decode_straw_bucket %p to %p\n", *p, end); 121 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); 122 if (b->item_weights == NULL) 123 return -ENOMEM; 124 b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); 125 if (b->straws == NULL) 126 return -ENOMEM; 127 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad); 128 for (j = 0; j < b->h.size; j++) { 129 b->item_weights[j] = ceph_decode_32(p); 130 b->straws[j] = ceph_decode_32(p); 131 } 132 return 0; 133bad: 134 return -EINVAL; 135} 136 137static struct crush_map *crush_decode(void *pbyval, void *end) 138{ 139 struct crush_map *c; 140 int err = -EINVAL; 141 int i, j; 142 void **p = &pbyval; 143 void *start = pbyval; 144 u32 magic; 145 146 dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p)); 147 148 c = kzalloc(sizeof(*c), GFP_NOFS); 149 if (c == NULL) 150 return ERR_PTR(-ENOMEM); 151 152 ceph_decode_need(p, end, 4*sizeof(u32), bad); 153 magic = ceph_decode_32(p); 154 if (magic != CRUSH_MAGIC) { 155 pr_err("crush_decode magic %x != current %x\n", 156 (unsigned)magic, (unsigned)CRUSH_MAGIC); 157 goto bad; 158 } 159 c->max_buckets = ceph_decode_32(p); 160 c->max_rules = ceph_decode_32(p); 161 c->max_devices = ceph_decode_32(p); 162 163 c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS); 164 if (c->device_parents == NULL) 165 goto badmem; 166 c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS); 167 if (c->bucket_parents == NULL) 168 goto badmem; 169 170 c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS); 171 if (c->buckets == NULL) 172 goto badmem; 173 c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS); 174 if (c->rules == NULL) 175 goto badmem; 176 177 /* buckets */ 178 for (i = 0; i < c->max_buckets; i++) { 179 int size = 0; 180 u32 alg; 181 struct crush_bucket *b; 182 183 ceph_decode_32_safe(p, end, alg, bad); 184 if (alg == 0) { 185 c->buckets[i] = NULL; 186 continue; 187 } 188 dout("crush_decode bucket %d off %x %p to %p\n", 189 i, (int)(*p-start), *p, end); 190 191 switch (alg) { 192 case CRUSH_BUCKET_UNIFORM: 193 size = sizeof(struct crush_bucket_uniform); 194 break; 195 case CRUSH_BUCKET_LIST: 196 size = sizeof(struct crush_bucket_list); 197 break; 198 case CRUSH_BUCKET_TREE: 199 size = sizeof(struct crush_bucket_tree); 200 break; 201 case CRUSH_BUCKET_STRAW: 202 size = sizeof(struct crush_bucket_straw); 203 break; 204 default: 205 err = -EINVAL; 206 goto bad; 207 } 208 BUG_ON(size == 0); 209 b = c->buckets[i] = kzalloc(size, GFP_NOFS); 210 if (b == NULL) 211 goto badmem; 212 213 ceph_decode_need(p, end, 4*sizeof(u32), bad); 214 b->id = ceph_decode_32(p); 215 b->type = ceph_decode_16(p); 216 b->alg = ceph_decode_8(p); 217 b->hash = ceph_decode_8(p); 218 b->weight = ceph_decode_32(p); 219 b->size = ceph_decode_32(p); 220 221 dout("crush_decode bucket size %d off %x %p to %p\n", 222 b->size, (int)(*p-start), *p, end); 223 224 b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS); 225 if (b->items == NULL) 226 goto badmem; 227 b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS); 228 if (b->perm == NULL) 229 goto badmem; 230 b->perm_n = 0; 231 232 ceph_decode_need(p, end, b->size*sizeof(u32), bad); 233 for (j = 0; j < b->size; j++) 234 b->items[j] = ceph_decode_32(p); 235 236 switch (b->alg) { 237 case CRUSH_BUCKET_UNIFORM: 238 err = crush_decode_uniform_bucket(p, end, 239 (struct crush_bucket_uniform *)b); 240 if (err < 0) 241 goto bad; 242 break; 243 case CRUSH_BUCKET_LIST: 244 err = crush_decode_list_bucket(p, end, 245 (struct crush_bucket_list *)b); 246 if (err < 0) 247 goto bad; 248 break; 249 case CRUSH_BUCKET_TREE: 250 err = crush_decode_tree_bucket(p, end, 251 (struct crush_bucket_tree *)b); 252 if (err < 0) 253 goto bad; 254 break; 255 case CRUSH_BUCKET_STRAW: 256 err = crush_decode_straw_bucket(p, end, 257 (struct crush_bucket_straw *)b); 258 if (err < 0) 259 goto bad; 260 break; 261 } 262 } 263 264 /* rules */ 265 dout("rule vec is %p\n", c->rules); 266 for (i = 0; i < c->max_rules; i++) { 267 u32 yes; 268 struct crush_rule *r; 269 270 ceph_decode_32_safe(p, end, yes, bad); 271 if (!yes) { 272 dout("crush_decode NO rule %d off %x %p to %p\n", 273 i, (int)(*p-start), *p, end); 274 c->rules[i] = NULL; 275 continue; 276 } 277 278 dout("crush_decode rule %d off %x %p to %p\n", 279 i, (int)(*p-start), *p, end); 280 281 /* len */ 282 ceph_decode_32_safe(p, end, yes, bad); 283#if BITS_PER_LONG == 32 284 err = -EINVAL; 285 if (yes > ULONG_MAX / sizeof(struct crush_rule_step)) 286 goto bad; 287#endif 288 r = c->rules[i] = kmalloc(sizeof(*r) + 289 yes*sizeof(struct crush_rule_step), 290 GFP_NOFS); 291 if (r == NULL) 292 goto badmem; 293 dout(" rule %d is at %p\n", i, r); 294 r->len = yes; 295 ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */ 296 ceph_decode_need(p, end, r->len*3*sizeof(u32), bad); 297 for (j = 0; j < r->len; j++) { 298 r->steps[j].op = ceph_decode_32(p); 299 r->steps[j].arg1 = ceph_decode_32(p); 300 r->steps[j].arg2 = ceph_decode_32(p); 301 } 302 } 303 304 /* ignore trailing name maps. */ 305 306 dout("crush_decode success\n"); 307 return c; 308 309badmem: 310 err = -ENOMEM; 311bad: 312 dout("crush_decode fail %d\n", err); 313 crush_destroy(c); 314 return ERR_PTR(err); 315} 316 317/* 318 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid 319 * to a set of osds) 320 */ 321static int pgid_cmp(struct ceph_pg l, struct ceph_pg r) 322{ 323 u64 a = *(u64 *)&l; 324 u64 b = *(u64 *)&r; 325 326 if (a < b) 327 return -1; 328 if (a > b) 329 return 1; 330 return 0; 331} 332 333static int __insert_pg_mapping(struct ceph_pg_mapping *new, 334 struct rb_root *root) 335{ 336 struct rb_node **p = &root->rb_node; 337 struct rb_node *parent = NULL; 338 struct ceph_pg_mapping *pg = NULL; 339 int c; 340 341 while (*p) { 342 parent = *p; 343 pg = rb_entry(parent, struct ceph_pg_mapping, node); 344 c = pgid_cmp(new->pgid, pg->pgid); 345 if (c < 0) 346 p = &(*p)->rb_left; 347 else if (c > 0) 348 p = &(*p)->rb_right; 349 else 350 return -EEXIST; 351 } 352 353 rb_link_node(&new->node, parent, p); 354 rb_insert_color(&new->node, root); 355 return 0; 356} 357 358static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root, 359 struct ceph_pg pgid) 360{ 361 struct rb_node *n = root->rb_node; 362 struct ceph_pg_mapping *pg; 363 int c; 364 365 while (n) { 366 pg = rb_entry(n, struct ceph_pg_mapping, node); 367 c = pgid_cmp(pgid, pg->pgid); 368 if (c < 0) 369 n = n->rb_left; 370 else if (c > 0) 371 n = n->rb_right; 372 else 373 return pg; 374 } 375 return NULL; 376} 377 378/* 379 * rbtree of pg pool info 380 */ 381static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new) 382{ 383 struct rb_node **p = &root->rb_node; 384 struct rb_node *parent = NULL; 385 struct ceph_pg_pool_info *pi = NULL; 386 387 while (*p) { 388 parent = *p; 389 pi = rb_entry(parent, struct ceph_pg_pool_info, node); 390 if (new->id < pi->id) 391 p = &(*p)->rb_left; 392 else if (new->id > pi->id) 393 p = &(*p)->rb_right; 394 else 395 return -EEXIST; 396 } 397 398 rb_link_node(&new->node, parent, p); 399 rb_insert_color(&new->node, root); 400 return 0; 401} 402 403static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id) 404{ 405 struct ceph_pg_pool_info *pi; 406 struct rb_node *n = root->rb_node; 407 408 while (n) { 409 pi = rb_entry(n, struct ceph_pg_pool_info, node); 410 if (id < pi->id) 411 n = n->rb_left; 412 else if (id > pi->id) 413 n = n->rb_right; 414 else 415 return pi; 416 } 417 return NULL; 418} 419 420static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) 421{ 422 rb_erase(&pi->node, root); 423 kfree(pi->name); 424 kfree(pi); 425} 426 427static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) 428{ 429 unsigned n, m; 430 431 ceph_decode_copy(p, &pi->v, sizeof(pi->v)); 432 calc_pg_masks(pi); 433 434 /* num_snaps * snap_info_t */ 435 n = le32_to_cpu(pi->v.num_snaps); 436 while (n--) { 437 ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) + 438 sizeof(struct ceph_timespec), bad); 439 *p += sizeof(u64) + /* key */ 440 1 + sizeof(u64) + /* u8, snapid */ 441 sizeof(struct ceph_timespec); 442 m = ceph_decode_32(p); /* snap name */ 443 *p += m; 444 } 445 446 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2; 447 return 0; 448 449bad: 450 return -EINVAL; 451} 452 453static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) 454{ 455 struct ceph_pg_pool_info *pi; 456 u32 num, len, pool; 457 458 ceph_decode_32_safe(p, end, num, bad); 459 dout(" %d pool names\n", num); 460 while (num--) { 461 ceph_decode_32_safe(p, end, pool, bad); 462 ceph_decode_32_safe(p, end, len, bad); 463 dout(" pool %d len %d\n", pool, len); 464 pi = __lookup_pg_pool(&map->pg_pools, pool); 465 if (pi) { 466 kfree(pi->name); 467 pi->name = kmalloc(len + 1, GFP_NOFS); 468 if (pi->name) { 469 memcpy(pi->name, *p, len); 470 pi->name[len] = '\0'; 471 dout(" name is %s\n", pi->name); 472 } 473 } 474 *p += len; 475 } 476 return 0; 477 478bad: 479 return -EINVAL; 480} 481 482/* 483 * osd map 484 */ 485void ceph_osdmap_destroy(struct ceph_osdmap *map) 486{ 487 dout("osdmap_destroy %p\n", map); 488 if (map->crush) 489 crush_destroy(map->crush); 490 while (!RB_EMPTY_ROOT(&map->pg_temp)) { 491 struct ceph_pg_mapping *pg = 492 rb_entry(rb_first(&map->pg_temp), 493 struct ceph_pg_mapping, node); 494 rb_erase(&pg->node, &map->pg_temp); 495 kfree(pg); 496 } 497 while (!RB_EMPTY_ROOT(&map->pg_pools)) { 498 struct ceph_pg_pool_info *pi = 499 rb_entry(rb_first(&map->pg_pools), 500 struct ceph_pg_pool_info, node); 501 __remove_pg_pool(&map->pg_pools, pi); 502 } 503 kfree(map->osd_state); 504 kfree(map->osd_weight); 505 kfree(map->osd_addr); 506 kfree(map); 507} 508 509/* 510 * adjust max osd value. reallocate arrays. 511 */ 512static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) 513{ 514 u8 *state; 515 struct ceph_entity_addr *addr; 516 u32 *weight; 517 518 state = kcalloc(max, sizeof(*state), GFP_NOFS); 519 addr = kcalloc(max, sizeof(*addr), GFP_NOFS); 520 weight = kcalloc(max, sizeof(*weight), GFP_NOFS); 521 if (state == NULL || addr == NULL || weight == NULL) { 522 kfree(state); 523 kfree(addr); 524 kfree(weight); 525 return -ENOMEM; 526 } 527 528 /* copy old? */ 529 if (map->osd_state) { 530 memcpy(state, map->osd_state, map->max_osd*sizeof(*state)); 531 memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr)); 532 memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight)); 533 kfree(map->osd_state); 534 kfree(map->osd_addr); 535 kfree(map->osd_weight); 536 } 537 538 map->osd_state = state; 539 map->osd_weight = weight; 540 map->osd_addr = addr; 541 map->max_osd = max; 542 return 0; 543} 544 545/* 546 * decode a full map. 547 */ 548struct ceph_osdmap *osdmap_decode(void **p, void *end) 549{ 550 struct ceph_osdmap *map; 551 u16 version; 552 u32 len, max, i; 553 u8 ev; 554 int err = -EINVAL; 555 void *start = *p; 556 struct ceph_pg_pool_info *pi; 557 558 dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p)); 559 560 map = kzalloc(sizeof(*map), GFP_NOFS); 561 if (map == NULL) 562 return ERR_PTR(-ENOMEM); 563 map->pg_temp = RB_ROOT; 564 565 ceph_decode_16_safe(p, end, version, bad); 566 if (version > CEPH_OSDMAP_VERSION) { 567 pr_warning("got unknown v %d > %d of osdmap\n", version, 568 CEPH_OSDMAP_VERSION); 569 goto bad; 570 } 571 572 ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad); 573 ceph_decode_copy(p, &map->fsid, sizeof(map->fsid)); 574 map->epoch = ceph_decode_32(p); 575 ceph_decode_copy(p, &map->created, sizeof(map->created)); 576 ceph_decode_copy(p, &map->modified, sizeof(map->modified)); 577 578 ceph_decode_32_safe(p, end, max, bad); 579 while (max--) { 580 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad); 581 pi = kzalloc(sizeof(*pi), GFP_NOFS); 582 if (!pi) 583 goto bad; 584 pi->id = ceph_decode_32(p); 585 ev = ceph_decode_8(p); /* encoding version */ 586 if (ev > CEPH_PG_POOL_VERSION) { 587 pr_warning("got unknown v %d > %d of ceph_pg_pool\n", 588 ev, CEPH_PG_POOL_VERSION); 589 kfree(pi); 590 goto bad; 591 } 592 err = __decode_pool(p, end, pi); 593 if (err < 0) 594 goto bad; 595 __insert_pg_pool(&map->pg_pools, pi); 596 } 597 598 if (version >= 5 && __decode_pool_names(p, end, map) < 0) 599 goto bad; 600 601 ceph_decode_32_safe(p, end, map->pool_max, bad); 602 603 ceph_decode_32_safe(p, end, map->flags, bad); 604 605 max = ceph_decode_32(p); 606 607 /* (re)alloc osd arrays */ 608 err = osdmap_set_max_osd(map, max); 609 if (err < 0) 610 goto bad; 611 dout("osdmap_decode max_osd = %d\n", map->max_osd); 612 613 /* osds */ 614 err = -EINVAL; 615 ceph_decode_need(p, end, 3*sizeof(u32) + 616 map->max_osd*(1 + sizeof(*map->osd_weight) + 617 sizeof(*map->osd_addr)), bad); 618 *p += 4; /* skip length field (should match max) */ 619 ceph_decode_copy(p, map->osd_state, map->max_osd); 620 621 *p += 4; /* skip length field (should match max) */ 622 for (i = 0; i < map->max_osd; i++) 623 map->osd_weight[i] = ceph_decode_32(p); 624 625 *p += 4; /* skip length field (should match max) */ 626 ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr)); 627 for (i = 0; i < map->max_osd; i++) 628 ceph_decode_addr(&map->osd_addr[i]); 629 630 /* pg_temp */ 631 ceph_decode_32_safe(p, end, len, bad); 632 for (i = 0; i < len; i++) { 633 int n, j; 634 struct ceph_pg pgid; 635 struct ceph_pg_mapping *pg; 636 637 ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad); 638 ceph_decode_copy(p, &pgid, sizeof(pgid)); 639 n = ceph_decode_32(p); 640 ceph_decode_need(p, end, n * sizeof(u32), bad); 641 err = -ENOMEM; 642 pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS); 643 if (!pg) 644 goto bad; 645 pg->pgid = pgid; 646 pg->len = n; 647 for (j = 0; j < n; j++) 648 pg->osds[j] = ceph_decode_32(p); 649 650 err = __insert_pg_mapping(pg, &map->pg_temp); 651 if (err) 652 goto bad; 653 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len); 654 } 655 656 /* crush */ 657 ceph_decode_32_safe(p, end, len, bad); 658 dout("osdmap_decode crush len %d from off 0x%x\n", len, 659 (int)(*p - start)); 660 ceph_decode_need(p, end, len, bad); 661 map->crush = crush_decode(*p, end); 662 *p += len; 663 if (IS_ERR(map->crush)) { 664 err = PTR_ERR(map->crush); 665 map->crush = NULL; 666 goto bad; 667 } 668 669 /* ignore the rest of the map */ 670 *p = end; 671 672 dout("osdmap_decode done %p %p\n", *p, end); 673 return map; 674 675bad: 676 dout("osdmap_decode fail\n"); 677 ceph_osdmap_destroy(map); 678 return ERR_PTR(err); 679} 680 681/* 682 * decode and apply an incremental map update. 683 */ 684struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, 685 struct ceph_osdmap *map, 686 struct ceph_messenger *msgr) 687{ 688 struct crush_map *newcrush = NULL; 689 struct ceph_fsid fsid; 690 u32 epoch = 0; 691 struct ceph_timespec modified; 692 u32 len, pool; 693 __s32 new_pool_max, new_flags, max; 694 void *start = *p; 695 int err = -EINVAL; 696 u16 version; 697 struct rb_node *rbp; 698 699 ceph_decode_16_safe(p, end, version, bad); 700 if (version > CEPH_OSDMAP_INC_VERSION) { 701 pr_warning("got unknown v %d > %d of inc osdmap\n", version, 702 CEPH_OSDMAP_INC_VERSION); 703 goto bad; 704 } 705 706 ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32), 707 bad); 708 ceph_decode_copy(p, &fsid, sizeof(fsid)); 709 epoch = ceph_decode_32(p); 710 BUG_ON(epoch != map->epoch+1); 711 ceph_decode_copy(p, &modified, sizeof(modified)); 712 new_pool_max = ceph_decode_32(p); 713 new_flags = ceph_decode_32(p); 714 715 /* full map? */ 716 ceph_decode_32_safe(p, end, len, bad); 717 if (len > 0) { 718 dout("apply_incremental full map len %d, %p to %p\n", 719 len, *p, end); 720 return osdmap_decode(p, min(*p+len, end)); 721 } 722 723 /* new crush? */ 724 ceph_decode_32_safe(p, end, len, bad); 725 if (len > 0) { 726 dout("apply_incremental new crush map len %d, %p to %p\n", 727 len, *p, end); 728 newcrush = crush_decode(*p, min(*p+len, end)); 729 if (IS_ERR(newcrush)) 730 return ERR_CAST(newcrush); 731 *p += len; 732 } 733 734 /* new flags? */ 735 if (new_flags >= 0) 736 map->flags = new_flags; 737 if (new_pool_max >= 0) 738 map->pool_max = new_pool_max; 739 740 ceph_decode_need(p, end, 5*sizeof(u32), bad); 741 742 /* new max? */ 743 max = ceph_decode_32(p); 744 if (max >= 0) { 745 err = osdmap_set_max_osd(map, max); 746 if (err < 0) 747 goto bad; 748 } 749 750 map->epoch++; 751 map->modified = map->modified; 752 if (newcrush) { 753 if (map->crush) 754 crush_destroy(map->crush); 755 map->crush = newcrush; 756 newcrush = NULL; 757 } 758 759 /* new_pool */ 760 ceph_decode_32_safe(p, end, len, bad); 761 while (len--) { 762 __u8 ev; 763 struct ceph_pg_pool_info *pi; 764 765 ceph_decode_32_safe(p, end, pool, bad); 766 ceph_decode_need(p, end, 1 + sizeof(pi->v), bad); 767 ev = ceph_decode_8(p); /* encoding version */ 768 if (ev > CEPH_PG_POOL_VERSION) { 769 pr_warning("got unknown v %d > %d of ceph_pg_pool\n", 770 ev, CEPH_PG_POOL_VERSION); 771 goto bad; 772 } 773 pi = __lookup_pg_pool(&map->pg_pools, pool); 774 if (!pi) { 775 pi = kzalloc(sizeof(*pi), GFP_NOFS); 776 if (!pi) { 777 err = -ENOMEM; 778 goto bad; 779 } 780 pi->id = pool; 781 __insert_pg_pool(&map->pg_pools, pi); 782 } 783 err = __decode_pool(p, end, pi); 784 if (err < 0) 785 goto bad; 786 } 787 if (version >= 5 && __decode_pool_names(p, end, map) < 0) 788 goto bad; 789 790 /* old_pool */ 791 ceph_decode_32_safe(p, end, len, bad); 792 while (len--) { 793 struct ceph_pg_pool_info *pi; 794 795 ceph_decode_32_safe(p, end, pool, bad); 796 pi = __lookup_pg_pool(&map->pg_pools, pool); 797 if (pi) 798 __remove_pg_pool(&map->pg_pools, pi); 799 } 800 801 /* new_up */ 802 err = -EINVAL; 803 ceph_decode_32_safe(p, end, len, bad); 804 while (len--) { 805 u32 osd; 806 struct ceph_entity_addr addr; 807 ceph_decode_32_safe(p, end, osd, bad); 808 ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad); 809 ceph_decode_addr(&addr); 810 pr_info("osd%d up\n", osd); 811 BUG_ON(osd >= map->max_osd); 812 map->osd_state[osd] |= CEPH_OSD_UP; 813 map->osd_addr[osd] = addr; 814 } 815 816 /* new_down */ 817 ceph_decode_32_safe(p, end, len, bad); 818 while (len--) { 819 u32 osd; 820 ceph_decode_32_safe(p, end, osd, bad); 821 (*p)++; /* clean flag */ 822 pr_info("osd%d down\n", osd); 823 if (osd < map->max_osd) 824 map->osd_state[osd] &= ~CEPH_OSD_UP; 825 } 826 827 /* new_weight */ 828 ceph_decode_32_safe(p, end, len, bad); 829 while (len--) { 830 u32 osd, off; 831 ceph_decode_need(p, end, sizeof(u32)*2, bad); 832 osd = ceph_decode_32(p); 833 off = ceph_decode_32(p); 834 pr_info("osd%d weight 0x%x %s\n", osd, off, 835 off == CEPH_OSD_IN ? "(in)" : 836 (off == CEPH_OSD_OUT ? "(out)" : "")); 837 if (osd < map->max_osd) 838 map->osd_weight[osd] = off; 839 } 840 841 /* new_pg_temp */ 842 rbp = rb_first(&map->pg_temp); 843 ceph_decode_32_safe(p, end, len, bad); 844 while (len--) { 845 struct ceph_pg_mapping *pg; 846 int j; 847 struct ceph_pg pgid; 848 u32 pglen; 849 ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad); 850 ceph_decode_copy(p, &pgid, sizeof(pgid)); 851 pglen = ceph_decode_32(p); 852 853 /* remove any? */ 854 while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping, 855 node)->pgid, pgid) <= 0) { 856 struct ceph_pg_mapping *cur = 857 rb_entry(rbp, struct ceph_pg_mapping, node); 858 859 rbp = rb_next(rbp); 860 dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid); 861 rb_erase(&cur->node, &map->pg_temp); 862 kfree(cur); 863 } 864 865 if (pglen) { 866 /* insert */ 867 ceph_decode_need(p, end, pglen*sizeof(u32), bad); 868 pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS); 869 if (!pg) { 870 err = -ENOMEM; 871 goto bad; 872 } 873 pg->pgid = pgid; 874 pg->len = pglen; 875 for (j = 0; j < pglen; j++) 876 pg->osds[j] = ceph_decode_32(p); 877 err = __insert_pg_mapping(pg, &map->pg_temp); 878 if (err) { 879 kfree(pg); 880 goto bad; 881 } 882 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, 883 pglen); 884 } 885 } 886 while (rbp) { 887 struct ceph_pg_mapping *cur = 888 rb_entry(rbp, struct ceph_pg_mapping, node); 889 890 rbp = rb_next(rbp); 891 dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid); 892 rb_erase(&cur->node, &map->pg_temp); 893 kfree(cur); 894 } 895 896 /* ignore the rest */ 897 *p = end; 898 return map; 899 900bad: 901 pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n", 902 epoch, (int)(*p - start), *p, start, end); 903 print_hex_dump(KERN_DEBUG, "osdmap: ", 904 DUMP_PREFIX_OFFSET, 16, 1, 905 start, end - start, true); 906 if (newcrush) 907 crush_destroy(newcrush); 908 return ERR_PTR(err); 909} 910 911 912 913 914/* 915 * calculate file layout from given offset, length. 916 * fill in correct oid, logical length, and object extent 917 * offset, length. 918 * 919 * for now, we write only a single su, until we can 920 * pass a stride back to the caller. 921 */ 922void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, 923 u64 off, u64 *plen, 924 u64 *ono, 925 u64 *oxoff, u64 *oxlen) 926{ 927 u32 osize = le32_to_cpu(layout->fl_object_size); 928 u32 su = le32_to_cpu(layout->fl_stripe_unit); 929 u32 sc = le32_to_cpu(layout->fl_stripe_count); 930 u32 bl, stripeno, stripepos, objsetno; 931 u32 su_per_object; 932 u64 t, su_offset; 933 934 dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen, 935 osize, su); 936 su_per_object = osize / su; 937 dout("osize %u / su %u = su_per_object %u\n", osize, su, 938 su_per_object); 939 940 BUG_ON((su & ~PAGE_MASK) != 0); 941 /* bl = *off / su; */ 942 t = off; 943 do_div(t, su); 944 bl = t; 945 dout("off %llu / su %u = bl %u\n", off, su, bl); 946 947 stripeno = bl / sc; 948 stripepos = bl % sc; 949 objsetno = stripeno / su_per_object; 950 951 *ono = objsetno * sc + stripepos; 952 dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono); 953 954 /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */ 955 t = off; 956 su_offset = do_div(t, su); 957 *oxoff = su_offset + (stripeno % su_per_object) * su; 958 959 /* 960 * Calculate the length of the extent being written to the selected 961 * object. This is the minimum of the full length requested (plen) or 962 * the remainder of the current stripe being written to. 963 */ 964 *oxlen = min_t(u64, *plen, su - su_offset); 965 *plen = *oxlen; 966 967 dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); 968} 969 970/* 971 * calculate an object layout (i.e. pgid) from an oid, 972 * file_layout, and osdmap 973 */ 974int ceph_calc_object_layout(struct ceph_object_layout *ol, 975 const char *oid, 976 struct ceph_file_layout *fl, 977 struct ceph_osdmap *osdmap) 978{ 979 unsigned num, num_mask; 980 struct ceph_pg pgid; 981 s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred); 982 int poolid = le32_to_cpu(fl->fl_pg_pool); 983 struct ceph_pg_pool_info *pool; 984 unsigned ps; 985 986 BUG_ON(!osdmap); 987 988 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); 989 if (!pool) 990 return -EIO; 991 ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid)); 992 if (preferred >= 0) { 993 ps += preferred; 994 num = le32_to_cpu(pool->v.lpg_num); 995 num_mask = pool->lpg_num_mask; 996 } else { 997 num = le32_to_cpu(pool->v.pg_num); 998 num_mask = pool->pg_num_mask; 999 } 1000 1001 pgid.ps = cpu_to_le16(ps); 1002 pgid.preferred = cpu_to_le16(preferred); 1003 pgid.pool = fl->fl_pg_pool; 1004 if (preferred >= 0) 1005 dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps, 1006 (int)preferred); 1007 else 1008 dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps); 1009 1010 ol->ol_pgid = pgid; 1011 ol->ol_stripe_unit = fl->fl_object_stripe_unit; 1012 return 0; 1013} 1014 1015/* 1016 * Calculate raw osd vector for the given pgid. Return pointer to osd 1017 * array, or NULL on failure. 1018 */ 1019static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, 1020 int *osds, int *num) 1021{ 1022 struct ceph_pg_mapping *pg; 1023 struct ceph_pg_pool_info *pool; 1024 int ruleno; 1025 unsigned poolid, ps, pps; 1026 int preferred; 1027 1028 /* pg_temp? */ 1029 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); 1030 if (pg) { 1031 *num = pg->len; 1032 return pg->osds; 1033 } 1034 1035 /* crush */ 1036 poolid = le32_to_cpu(pgid.pool); 1037 ps = le16_to_cpu(pgid.ps); 1038 preferred = (s16)le16_to_cpu(pgid.preferred); 1039 1040 /* don't forcefeed bad device ids to crush */ 1041 if (preferred >= osdmap->max_osd || 1042 preferred >= osdmap->crush->max_devices) 1043 preferred = -1; 1044 1045 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); 1046 if (!pool) 1047 return NULL; 1048 ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset, 1049 pool->v.type, pool->v.size); 1050 if (ruleno < 0) { 1051 pr_err("no crush rule pool %d ruleset %d type %d size %d\n", 1052 poolid, pool->v.crush_ruleset, pool->v.type, 1053 pool->v.size); 1054 return NULL; 1055 } 1056 1057 if (preferred >= 0) 1058 pps = ceph_stable_mod(ps, 1059 le32_to_cpu(pool->v.lpgp_num), 1060 pool->lpgp_num_mask); 1061 else 1062 pps = ceph_stable_mod(ps, 1063 le32_to_cpu(pool->v.pgp_num), 1064 pool->pgp_num_mask); 1065 pps += poolid; 1066 *num = crush_do_rule(osdmap->crush, ruleno, pps, osds, 1067 min_t(int, pool->v.size, *num), 1068 preferred, osdmap->osd_weight); 1069 return osds; 1070} 1071 1072/* 1073 * Return acting set for given pgid. 1074 */ 1075int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, 1076 int *acting) 1077{ 1078 int rawosds[CEPH_PG_MAX_SIZE], *osds; 1079 int i, o, num = CEPH_PG_MAX_SIZE; 1080 1081 osds = calc_pg_raw(osdmap, pgid, rawosds, &num); 1082 if (!osds) 1083 return -1; 1084 1085 /* primary is first up osd */ 1086 o = 0; 1087 for (i = 0; i < num; i++) 1088 if (ceph_osd_is_up(osdmap, osds[i])) 1089 acting[o++] = osds[i]; 1090 return o; 1091} 1092 1093/* 1094 * Return primary osd for given pgid, or -1 if none. 1095 */ 1096int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) 1097{ 1098 int rawosds[CEPH_PG_MAX_SIZE], *osds; 1099 int i, num = CEPH_PG_MAX_SIZE; 1100 1101 osds = calc_pg_raw(osdmap, pgid, rawosds, &num); 1102 if (!osds) 1103 return -1; 1104 1105 /* primary is first up osd */ 1106 for (i = 0; i < num; i++) 1107 if (ceph_osd_is_up(osdmap, osds[i])) 1108 return osds[i]; 1109 return -1; 1110} 1111