1/* 2 * ntfs_index.c - NTFS kernel index handling. 3 * 4 * Copyright (c) 2006-2008 Anton Altaparmakov. All Rights Reserved. 5 * Portions Copyright (c) 2006-2008 Apple Inc. All Rights Reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright notice, 13 * this list of conditions and the following disclaimer in the documentation 14 * and/or other materials provided with the distribution. 15 * 3. Neither the name of Apple Inc. ("Apple") nor the names of its 16 * contributors may be used to endorse or promote products derived from this 17 * software without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY 20 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 21 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY 23 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 24 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 25 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 26 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 * 30 * ALTERNATIVELY, provided that this notice and licensing terms are retained in 31 * full, this file may be redistributed and/or modified under the terms of the 32 * GNU General Public License (GPL) Version 2, in which case the provisions of 33 * that version of the GPL will apply to you instead of the license terms 34 * above. You can obtain a copy of the GPL Version 2 at 35 * http://developer.apple.com/opensource/licenses/gpl-2.txt. 36 */ 37 38#include <sys/errno.h> 39 40#include <string.h> 41 42#include <libkern/libkern.h> 43#include <libkern/OSMalloc.h> 44 45#include <kern/debug.h> 46#include <kern/locks.h> 47 48#include "ntfs.h" 49#include "ntfs_attr.h" 50#include "ntfs_attr_list.h" 51#include "ntfs_bitmap.h" 52#include "ntfs_collate.h" 53#include "ntfs_debug.h" 54#include "ntfs_dir.h" 55#include "ntfs_endian.h" 56#include "ntfs_index.h" 57#include "ntfs_inode.h" 58#include "ntfs_layout.h" 59#include "ntfs_lcnalloc.h" 60#include "ntfs_mft.h" 61#include "ntfs_page.h" 62#include "ntfs_runlist.h" 63#include "ntfs_types.h" 64#include "ntfs_unistr.h" 65#include "ntfs_volume.h" 66 67/** 68 * ntfs_index_ctx_unlock - unlock an index context 69 * @ictx: index context to unlock 70 * 71 * Unlock the index context @ictx. We also unmap the mft record (in index root 72 * case) or the page (in index allocation block case) thus all pointers into 73 * the index node need to be revalidated when the mft record or page is mapped 74 * again in ntfs_index_ctx_relock(). 75 * 76 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode. 77 * - The index context @ictx must be locked. 
 */
void ntfs_index_ctx_unlock(ntfs_index_context *ictx)
{
	if (!ictx)
		panic("%s(): !ictx\n", __FUNCTION__);
	if (!ictx->is_locked)
		panic("%s(): !ictx->is_locked\n", __FUNCTION__);
	if (ictx->is_root) {
		/*
		 * Index root node: drop the attribute search context and the
		 * mft record mapping.  All pointers into the index root are
		 * stale from here on and must be revalidated by
		 * ntfs_index_ctx_relock() before reuse.
		 */
		if (ictx->actx) {
			ntfs_attr_search_ctx_put(ictx->actx);
			if (!ictx->base_ni)
				panic("%s(): !ictx->base_ni\n", __FUNCTION__);
			ntfs_mft_record_unmap(ictx->base_ni);
			ictx->actx = NULL;
		}
	} else {
		/*
		 * Index allocation block: release the mapped page, writing it
		 * back first if it was marked dirty.
		 */
		if (ictx->upl) {
			ntfs_page_unmap(ictx->idx_ni, ictx->upl, ictx->pl,
					ictx->is_dirty);
			ictx->upl = NULL;
			ictx->is_dirty = 0;
		}
	}
	ictx->is_locked = 0;
}

/**
 * ntfs_index_ctx_path_unlock - unlock an entire B+tree index context path
 * @index_ctx:	index context whose whole path to unlock
 *
 * Unlock all index contexts attached to the B+tree path to which @index_ctx
 * belongs.
 *
 * This function is only called in error handling to ensure nothing is held
 * busy so the error handling code cannot deadlock.
 *
 * Locking: Caller must hold @index_ctx->idx_ni->lock on the index inode.
 */
static void ntfs_index_ctx_path_unlock(ntfs_index_context *index_ctx)
{
	ntfs_index_context *ictx;

	ictx = index_ctx;
	if (!ictx)
		panic("%s(): !ictx\n", __FUNCTION__);
	/*
	 * Note we traverse the tree path backwards (upwards) because @up is
	 * the first element in the index_context structure thus doing things
	 * this way generates faster/better machine code.
	 *
	 * The path is a circular list, so the walk terminates when we arrive
	 * back at the context we started from.
	 */
	do {
		if (ictx->is_locked)
			ntfs_index_ctx_unlock(ictx);
	} while ((ictx = ictx->up) != index_ctx);
}

/**
 * ntfs_index_ctx_relock - relock an index context that was unlocked earlier
 * @ictx:	index context to relock
 *
 * Relock the index context @ictx after it was unlocked with
 * ntfs_index_ctx_unlock().  We also remap the mft record (in index root case)
 * or the page (in index allocation block case) after which we revalidate all
 * pointers into the index node because the page may have been mapped into a
 * different virtual address and the mft record may have been changed with the
 * result that the index root attribute is moved within the mft record or even
 * to a completely different mft record.
 *
 * Note the check whether to revalidate or not is very simple because the index
 * node content cannot have changed thus all points change by a fixed offset
 * delta which once determined can be applied to all pointers.
 *
 * In the index root case, there is also a non-pointer index context field that
 * can have changed (and it does so irrespective of the index root position).
 * This is @ictx->bytes_free as that is dependent on the other attributes in
 * the mft record which can have changed legitimately under our feet which of
 * course is the reason why the index root can have moved about, too.
 *
 * Return 0 on success and errno on error.
 *
 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode.
 *	    - The index context @ictx must not be locked.
 */
errno_t ntfs_index_ctx_relock(ntfs_index_context *ictx)
{
	long delta;
	errno_t err;

	if (!ictx)
		panic("%s(): !ictx\n", __FUNCTION__);
	if (ictx->is_locked)
		panic("%s(): ictx->is_locked\n", __FUNCTION__);
	if (ictx->is_root) {
		MFT_RECORD *m;
		ntfs_attr_search_ctx *actx;

		if (ictx->actx)
			panic("%s(): ictx->actx\n", __FUNCTION__);
		if (!ictx->base_ni)
			panic("%s(): !ictx->base_ni\n", __FUNCTION__);
		/* Remap the mft record and look the index root up again. */
		err = ntfs_mft_record_map(ictx->base_ni, &m);
		if (err) {
			ntfs_error(ictx->idx_ni->vol->mp, "Failed to lock "
					"index root because "
					"ntfs_mft_record_map() failed (error "
					"%d).", err);
			return err;
		}
		ictx->actx = actx = ntfs_attr_search_ctx_get(ictx->base_ni, m);
		if (!actx) {
			ntfs_error(ictx->idx_ni->vol->mp, "Failed to allocate "
					"attribute search context.");
			err = ENOMEM;
			goto actx_err;
		}
		err = ntfs_attr_lookup(AT_INDEX_ROOT, ictx->idx_ni->name,
				ictx->idx_ni->name_len, 0, NULL, 0, actx);
		if (err) {
			/*
			 * The index root existed when we unlocked, so it
			 * cannot legitimately have vanished.
			 */
			if (err == ENOENT)
				panic("%s(): err == ENOENT\n", __FUNCTION__);
			ntfs_error(ictx->idx_ni->vol->mp, "Failed to look up "
					"index root attribute (error %d).",
					err);
			goto actx_err;
		}
		/*
		 * Recompute @bytes_free: other attributes in the mft record
		 * may have grown or shrunk while we were unlocked.
		 */
		ictx->bytes_free = le32_to_cpu(actx->m->bytes_allocated) -
				le32_to_cpu(actx->m->bytes_in_use);
		/* Get to the index root value. */
		ictx->ir = (INDEX_ROOT*)((u8*)actx->a +
				le16_to_cpu(actx->a->value_offset));
		/* Offset by which the index root moved while unlocked. */
		delta = (u8*)&ictx->ir->index - (u8*)ictx->index;
		ictx->index = &ictx->ir->index;
	} else {
		u8 *addr;

		if (ictx->upl)
			panic("%s(): ictx->upl\n", __FUNCTION__);
		if (ictx->is_dirty)
			panic("%s(): ictx->is_dirty\n", __FUNCTION__);
		err = ntfs_page_map(ictx->idx_ni, ictx->upl_ofs, &ictx->upl,
				&ictx->pl, &addr, TRUE);
		if (err) {
			ntfs_error(ictx->idx_ni->vol->mp, "Failed to map "
					"index page (error %d).", err);
			ictx->upl = NULL;
			return err;
		}
		/* Get to the index allocation block. */
		delta = addr - ictx->addr;
		if (delta) {
			/* Page mapped at a new virtual address; rebase. */
			ictx->addr = addr;
			ictx->ia = (INDEX_ALLOCATION*)((u8*)ictx->ia + delta);
			ictx->index = &ictx->ia->index;
		}
	}
	if (delta) {
		INDEX_ENTRY **entries;
		unsigned u;

		/*
		 * The index node has moved thus we have to update all stored
		 * pointers so they point into the new memory location.
		 */
		ictx->entry = (INDEX_ENTRY*)((u8*)ictx->entry + delta);
		entries = ictx->entries;
		for (u = 0; u < ictx->nr_entries; u++)
			entries[u] = (INDEX_ENTRY*)((u8*)entries[u] + delta);
	}
	ictx->is_locked = 1;
	return 0;
actx_err:
	/* Undo the mapping done above before propagating the error. */
	if (ictx->actx) {
		ntfs_attr_search_ctx_put(ictx->actx);
		ictx->actx = NULL;
	}
	ntfs_mft_record_unmap(ictx->base_ni);
	return err;
}

/**
 * ntfs_index_ctx_get - allocate and initialize a new index context
 * @idx_ni:	ntfs index inode with which to initialize the context
 *
 * Allocate a new index context, initialize it with @idx_ni and return it.
 *
 * Return NULL if the allocation failed.
 *
 * Locking: Caller must hold @idx_ni->lock on the index inode.
266 */ 267ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni) 268{ 269 ntfs_index_context *ictx; 270 271 ictx = ntfs_index_ctx_alloc(); 272 if (ictx) 273 ntfs_index_ctx_init(ictx, idx_ni); 274 return ictx; 275} 276 277/** 278 * ntfs_index_ctx_put_reuse_single - release an index context but do not free it 279 * @ictx: index context to release 280 * 281 * Release the index context @ictx, releasing all associated resources but keep 282 * the index context itself allocated so it can be reused with a call to 283 * ntfs_index_ctx_init(). 284 * 285 * This function ignores the tree path which this entry may be a part of and is 286 * only a helper function for ntfs_index_ctx_put_reuse(). 287 * 288 * Locking: Caller must hold @ictx->idx_ni->lock on the index inode. 289 */ 290void ntfs_index_ctx_put_reuse_single(ntfs_index_context *ictx) 291{ 292 if (ictx->entry && ictx->is_locked) 293 ntfs_index_ctx_unlock(ictx); 294 if (ictx->entries) 295 OSFree(ictx->entries, ictx->max_entries * sizeof(INDEX_ENTRY*), 296 ntfs_malloc_tag); 297} 298 299/** 300 * ntfs_index_ctx_put_reuse - release an index context but do not free it 301 * @index_ctx: index context to release 302 * 303 * Release the index context @index_ctx, releasing all associated resources but 304 * keep the index context itself allocated so it can be reused with a call to 305 * ntfs_index_ctx_init(). 306 * 307 * Locking: Caller must hold @index_ctx->idx_ni->lock on the index inode. 308 */ 309void ntfs_index_ctx_put_reuse(ntfs_index_context *index_ctx) 310{ 311 ntfs_index_context *ictx, *next; 312 313 /* 314 * Destroy all tree path components except @index_ctx itself. We need 315 * the temporary index context pointer @next because we deallocate the 316 * current index context in each iteration of the loop thus we would no 317 * longer have any means of finding the next index context in the tree 318 * path if we had not already stored a pointer to it in @next. 
Note we 319 * actually traverse the tree path backwards (upwards) because @up is 320 * the first element in the index_context structure thus doing things 321 * this way generates faster/better machine code. 322 */ 323 for (ictx = index_ctx->up, next = ictx->up; ictx != index_ctx; 324 ictx = next, next = next->up) { 325 /* 326 * Disconnect the current index context from the tree and 327 * release it and all its resources. 328 */ 329 ntfs_index_ctx_put_single(ictx); 330 } 331 /* Reuse the only remaining, bottom entry. */ 332 ntfs_index_ctx_put_reuse_single(index_ctx); 333} 334 335/** 336 * ntfs_index_get_entries - get the entries for the index node into the context 337 * @ictx: index context for which to get the entries 338 * 339 * Loop through the entries in the index node described by @ictx and gather all 340 * the index entries into the @ictx->entries array which is also allocated by 341 * this function. 342 * 343 * Return 0 on success and errno on error. 344 * 345 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode. 346 * - The index context @ictx must be locked. 347 */ 348static errno_t ntfs_index_get_entries(ntfs_index_context *ictx) 349{ 350 u8 *index_end; 351 INDEX_HEADER *index; 352 INDEX_ENTRY *ie, **entries; 353 unsigned nr_entries, max_entries; 354 errno_t err; 355 BOOL is_view; 356 357 ntfs_debug("Entering."); 358 if (!ictx->is_locked) 359 panic("%s(): !ictx->is_locked\n", __FUNCTION__); 360 is_view = FALSE; 361 if (ictx->idx_ni->name != I30) 362 is_view = TRUE; 363 nr_entries = 0; 364 max_entries = ictx->max_entries; 365 /* Allocate memory for the index entry pointers in the index node. */ 366 entries = OSMalloc(max_entries * sizeof(INDEX_ENTRY*), ntfs_malloc_tag); 367 if (!entries) { 368 ntfs_error(ictx->idx_ni->vol->mp, "Failed to allocate index " 369 "entry pointer array."); 370 return ENOMEM; 371 } 372 index = ictx->index; 373 index_end = (u8*)index + le32_to_cpu(index->index_length); 374 /* The first index entry. 
*/ 375 ie = (INDEX_ENTRY*)((u8*)index + le32_to_cpu(index->entries_offset)); 376 /* 377 * Loop until we exceed valid memory (corruption case) or until we 378 * reach the last entry and for each entry place a pointer to it into 379 * our array of entry pointers. 380 */ 381 for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { 382 /* Bounds checks. */ 383 if ((u8*)ie < (u8*)index || (u8*)ie + 384 sizeof(INDEX_ENTRY_HEADER) > index_end || 385 (u8*)ie + le16_to_cpu(ie->length) > index_end || 386 (u32)sizeof(INDEX_ENTRY_HEADER) + 387 le16_to_cpu(ie->key_length) > 388 le16_to_cpu(ie->length)) 389 goto err; 390 /* Add this entry to the array of entry pointers. */ 391 if (nr_entries >= max_entries) 392 panic("%s(): nr_entries >= max_entries\n", 393 __FUNCTION__); 394 entries[nr_entries++] = ie; 395 if (ie->flags & INDEX_ENTRY_END) 396 break; 397 /* Further bounds checks for view indexes. */ 398 if (is_view && ((u32)sizeof(INDEX_ENTRY_HEADER) + 399 le16_to_cpu(ie->key_length) > 400 le16_to_cpu(ie->data_offset) || 401 (u32)le16_to_cpu(ie->data_offset) + 402 le16_to_cpu(ie->data_length) > 403 le16_to_cpu(ie->length))) 404 goto err; 405 } 406 /* Except for the index root, leaf nodes are not allowed to be empty. */ 407 if (nr_entries < 2 && !ictx->is_root && !(index->flags & INDEX_NODE)) { 408 ntfs_error(ictx->idx_ni->vol->mp, "Illegal empty leaf node " 409 "found in index."); 410 err = EIO; 411 goto err; 412 } 413 ictx->entries = entries; 414 ictx->nr_entries = nr_entries; 415 ntfs_debug("Done."); 416 return 0; 417err: 418 OSFree(entries, max_entries * sizeof(INDEX_ENTRY*), ntfs_malloc_tag); 419 ntfs_error(ictx->idx_ni->vol->mp, "Corrupt index in inode 0x%llx. 
" 420 "Run chkdsk.", 421 (unsigned long long)ictx->idx_ni->mft_no); 422 NVolSetErrors(ictx->idx_ni->vol); 423 return EIO; 424} 425 426/** 427 * ntfs_index_lookup_init - prepare an index context for a lookup 428 * @ictx: [IN] index context to prepare 429 * @key_len: [IN] size of the key ntfs_index_lookup() is called with in bytes 430 * 431 * Prepare the index context @ictx for a call to ntfs_index_lookup() or 432 * ntfs_index_lookup_by_position(). 433 * 434 * @key_len is the length of the key ntfs_index_lookup() will be called with. 435 * If the index @ictx is a directory index this is ignored. 436 * 437 * Return 0 on success and errno on error. 438 */ 439static errno_t ntfs_index_lookup_init(ntfs_index_context *ictx, 440 const int key_len) 441{ 442 ntfs_inode *idx_ni; 443 MFT_RECORD *m; 444 ntfs_attr_search_ctx *actx; 445 INDEX_ROOT *ir; 446 unsigned min_entry_size, max_entries; 447 errno_t err; 448 449 ntfs_debug("Entering."); 450 if (!ictx->base_ni) 451 panic("%s(): !ictx->base_ni\n", __FUNCTION__); 452 idx_ni = ictx->idx_ni; 453 if (idx_ni->type != AT_INDEX_ALLOCATION) 454 panic("%s(): idx_ni->type != AT_INDEX_ALLOCATION\n", 455 __FUNCTION__); 456 /* 457 * Ensure the index context is still uninitialized, i.e. it is not 458 * legal to call ntfs_index_lookup*() with a search context that has 459 * been used already. 460 */ 461 if (ictx->up != ictx || ictx->max_entries) 462 panic("%s(): Called for already used index context.\n", 463 __FUNCTION__); 464 if (!ntfs_is_collation_rule_supported(idx_ni->collation_rule)) { 465 ntfs_error(idx_ni->vol->mp, "Index uses unsupported collation " 466 "rule 0x%x. Aborting.", 467 le32_to_cpu(idx_ni->collation_rule)); 468 return ENOTSUP; 469 } 470 /* Get hold of the mft record for the index inode. 
*/ 471 err = ntfs_mft_record_map(ictx->base_ni, &m); 472 if (err) { 473 ntfs_error(idx_ni->vol->mp, "Failed to map mft record (error " 474 "%d.", err); 475 return err; 476 } 477 actx = ntfs_attr_search_ctx_get(ictx->base_ni, m); 478 if (!actx) { 479 err = ENOMEM; 480 goto err; 481 } 482 /* Find the index root attribute in the mft record. */ 483 err = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name, idx_ni->name_len, 484 0, NULL, 0, actx); 485 if (err) { 486 if (err == ENOENT) { 487 ntfs_error(idx_ni->vol->mp, "Index root attribute " 488 "missing in inode 0x%llx. Inode is " 489 "corrupt. Run chkdsk.", 490 (unsigned long long)idx_ni->mft_no); 491 /* 492 * This will cause the returned error to be EIO and the 493 * volume to be marked as containing errors. 494 */ 495 err = 0; 496 } 497 goto err; 498 } 499 /* 500 * The entry size is made up of the index entry header plus the index 501 * key plus the index data plus optionally the sub-node vcn pointer. 502 * 503 * For most index types the index key is constant size and is simply 504 * given by the caller supplied @key_len. 505 * 506 * The only collation types with variable sized keys are 507 * COLLATION_FILENAME and COLLATION_NTOFS_SID. 508 * 509 * Determining the index data size is more complicated than that so we 510 * simply ignore it. This means we waste some memory but it is not too 511 * bad as only directory indexes appear more than once on a volume thus 512 * only they can have more than one lookup in progress at any one time. 513 */ 514 min_entry_size = sizeof(INDEX_ENTRY_HEADER) + sizeof(FILENAME_ATTR); 515 if (idx_ni->collation_rule != COLLATION_FILENAME) { 516 if (idx_ni->collation_rule == COLLATION_NTOFS_SID) 517 min_entry_size = sizeof(INDEX_ENTRY_HEADER) + 518 sizeof(SID); 519 else 520 min_entry_size = sizeof(INDEX_ENTRY_HEADER) + key_len; 521 } 522 /* 523 * Work out the absolute maximum number of entries there can be in an 524 * index allocation block. 
We add one for the end entry which does not 525 * contain a key. 526 */ 527 max_entries = 1 + ((idx_ni->block_size - sizeof(INDEX_BLOCK) - 528 sizeof(INDEX_ENTRY_HEADER)) / min_entry_size); 529 /* 530 * Should the mft record size exceed the size of an index block (this 531 * should never really happen) then calculate the maximum number of 532 * entries there can be in the index root and if they are more than the 533 * ones in the index block use that as the maximum value. 534 */ 535 if (idx_ni->vol->mft_record_size > idx_ni->block_size) { 536 unsigned max_root_entries; 537 538 max_root_entries = 1 + ((idx_ni->vol->mft_record_size - 539 le16_to_cpu(m->attrs_offset) - 540 offsetof(ATTR_RECORD, reservedR) - 541 sizeof(((ATTR_RECORD*)NULL)->reservedR) - 542 sizeof(INDEX_ROOT) - 543 sizeof(INDEX_ENTRY_HEADER)) / min_entry_size); 544 if (max_root_entries > max_entries) 545 max_entries = max_root_entries; 546 } 547 ictx->max_entries = max_entries; 548 /* 549 * Get to the index root value (it has been verified when the inode was 550 * read in ntfs_index_inode_read()). 551 */ 552 ir = (INDEX_ROOT*)((u8*)actx->a + le16_to_cpu(actx->a->value_offset)); 553 ictx->index = &ir->index; 554 /* 555 * Gather the index entry pointers and finish setting up the index 556 * context. 
557 */ 558 ictx->is_root = 1; 559 ictx->is_locked = 1; 560 err = ntfs_index_get_entries(ictx); 561 if (err) { 562 ictx->is_locked = 0; 563 goto err; 564 } 565 ictx->ir = ir; 566 ictx->actx = actx; 567 ictx->bytes_free = le32_to_cpu(actx->m->bytes_allocated) - 568 le32_to_cpu(actx->m->bytes_in_use); 569 ntfs_debug("Done."); 570 return 0; 571err: 572 if (actx) 573 ntfs_attr_search_ctx_put(actx); 574 ntfs_mft_record_unmap(ictx->base_ni); 575 ntfs_error(idx_ni->vol->mp, "Failed (error %d).", err); 576 return err; 577} 578 579/** 580 * ntfs_index_descend_into_child_node - index node whose child node to return 581 * @index_ctx: pointer to index context whose child node to return 582 * 583 * Descend into the child node pointed to by (*@index_ctx)->entry and return 584 * its fully set up index context in *@index_ctx. 585 * 586 * Return 0 on success and errno on error. 587 * 588 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode. 589 * - The index context @ictx must be locked. 590 */ 591static errno_t ntfs_index_descend_into_child_node( 592 ntfs_index_context **index_ctx) 593{ 594 VCN vcn; 595 s64 ofs; 596 ntfs_index_context *ictx, *new_ictx; 597 ntfs_inode *idx_ni; 598 upl_t upl; 599 upl_page_info_array_t pl; 600 u8 *addr; 601 INDEX_ALLOCATION *ia; 602 errno_t err = 0; 603 int is_dirty = 0; 604 static const char es[] = "%s. Inode 0x%llx is corrupt. Run chkdsk."; 605 606 ntfs_debug("Entering."); 607 if (!index_ctx) 608 panic("%s(): !index_ctx\n", __FUNCTION__); 609 ictx = *index_ctx; 610 if (!ictx) 611 panic("%s(): !ictx\n", __FUNCTION__); 612 idx_ni = ictx->idx_ni; 613 if (!ictx->is_locked) 614 panic("%s(): !ictx->is_locked\n", __FUNCTION__); 615 if (!(ictx->entry->flags & INDEX_ENTRY_NODE)) 616 panic("%s(): !(ictx->entry->flags & INDEX_ENTRY_NODE)\n", 617 __FUNCTION__); 618 /* 619 * Since INDEX_NODE == LARGE_INDEX this check is ok for the index root 620 * as well. 
621 */ 622 if (!(ictx->index->flags & INDEX_NODE)) { 623 ntfs_error(idx_ni->vol->mp, es, "Index entry with child node " 624 "found in a leaf node", 625 (unsigned long long)idx_ni->mft_no); 626 goto err; 627 } 628 /* Get the starting vcn of the child index block to descend into. */ 629 vcn = sle64_to_cpup((sle64*)((u8*)ictx->entry + 630 le16_to_cpu(ictx->entry->length) - sizeof(VCN))); 631 if (vcn < 0) { 632 ntfs_error(idx_ni->vol->mp, es, "Negative child node VCN", 633 (unsigned long long)idx_ni->mft_no); 634 goto err; 635 } 636 /* Determine the offset of the page containing the child index block. */ 637 ofs = (vcn << idx_ni->vcn_size_shift) & ~PAGE_MASK_64; 638 /* 639 * If the entry whose sub-node we are descending into is in the index 640 * root, release the index root unlocking its node or we can deadlock 641 * with ntfs_page_map(). 642 */ 643 if (ictx->is_root) { 644 /* 645 * As a sanity check verify that the index allocation attribute 646 * exists. 647 */ 648 if (!NInoIndexAllocPresent(idx_ni)) { 649 ntfs_error(idx_ni->vol->mp, "No index allocation " 650 "attribute but index root entry " 651 "requires one. Inode 0x%llx is " 652 "corrupt. Run chkdsk.", 653 (unsigned long long)idx_ni->mft_no); 654 goto err; 655 } 656 ntfs_index_ctx_unlock(ictx); 657 } else /* if (!ictx->is_root) */ { 658 /* 659 * If @vcn is in the same VM page as the existing page we reuse 660 * the mapped page, otherwise we release the page so we can get 661 * the new one. 662 */ 663 upl = ictx->upl; 664 pl = ictx->pl; 665 addr = ictx->addr; 666 is_dirty = ictx->is_dirty; 667 ictx->upl = NULL; 668 ictx->is_dirty = 0; 669 ictx->is_locked = 0; 670 if (ofs == ictx->upl_ofs) 671 goto have_page; 672 ntfs_page_unmap(idx_ni, upl, pl, is_dirty); 673 is_dirty = 0; 674 } 675 /* We did not reuse the old page, get the new page. 
*/ 676 err = ntfs_page_map(idx_ni, ofs, &upl, &pl, &addr, TRUE); 677 if (err) { 678 ntfs_error(idx_ni->vol->mp, "Failed to map index page (error " 679 "%d).", err); 680 goto err; 681 } 682have_page: 683 /* Get to the index allocation block. */ 684 ia = (INDEX_ALLOCATION*)(addr + ((unsigned)(vcn << 685 idx_ni->vcn_size_shift) & PAGE_MASK)); 686 /* Bounds checks. */ 687 if ((u8*)ia < addr || (u8*)ia > addr + PAGE_SIZE) { 688 ntfs_error(idx_ni->vol->mp, es, "Out of bounds check failed", 689 (unsigned long long)idx_ni->mft_no); 690 goto unm_err; 691 } 692 /* Catch multi sector transfer fixup errors. */ 693 if (!ntfs_is_indx_record(ia->magic)) { 694 ntfs_error(idx_ni->vol->mp, "Index record with VCN 0x%llx is " 695 "corrupt. Inode 0x%llx is corrupt. Run " 696 "chkdsk.", (unsigned long long)vcn, 697 (unsigned long long)idx_ni->mft_no); 698 goto unm_err; 699 } 700 if (sle64_to_cpu(ia->index_block_vcn) != vcn) { 701 ntfs_error(idx_ni->vol->mp, "Actual VCN (0x%llx) of index " 702 "buffer is different from expected VCN " 703 "(0x%llx). Inode 0x%llx is corrupt. Run " 704 "chkdsk.", (unsigned long long) 705 sle64_to_cpu(ia->index_block_vcn), 706 (unsigned long long)vcn, 707 (unsigned long long)idx_ni->mft_no); 708 goto unm_err; 709 } 710 if (offsetof(INDEX_BLOCK, index) + le32_to_cpu( 711 ia->index.allocated_size) != idx_ni->block_size) { 712 ntfs_error(idx_ni->vol->mp, "Index buffer (VCN 0x%llx) of " 713 "inode 0x%llx has a size (%u) differing from " 714 "the index specified size (%u). Inode is " 715 "corrupt. 
Run chkdsk.", 716 (unsigned long long)vcn, 717 (unsigned long long)idx_ni->mft_no, 718 (unsigned)offsetof(INDEX_BLOCK, index) + 719 le32_to_cpu(ia->index.allocated_size), 720 (unsigned)idx_ni->block_size); 721 goto unm_err; 722 } 723 if ((u8*)ia + idx_ni->block_size > addr + PAGE_SIZE) 724 panic("%s(): (u8*)ia + idx_ni->block_size > kaddr + " 725 "PAGE_SIZE\n", __FUNCTION__); 726 if ((u8*)&ia->index + le32_to_cpu(ia->index.index_length) > 727 (u8*)ia + idx_ni->block_size) { 728 ntfs_error(idx_ni->vol->mp, "Size of index buffer (VCN " 729 "0x%llx) of inode 0x%llx exceeds maximum " 730 "size.", (unsigned long long)vcn, 731 (unsigned long long)idx_ni->mft_no); 732 goto unm_err; 733 } 734 /* Allocate a new index context. */ 735 new_ictx = ntfs_index_ctx_alloc(); 736 if (!new_ictx) { 737 err = ENOMEM; 738 ntfs_error(idx_ni->vol->mp, "Failed to allocate index " 739 "context."); 740 goto unm_err; 741 } 742 /* 743 * Attach the new index context between the current index context 744 * (which is the bottom of the tree) and the index context below it 745 * (which is the top of the tree), i.e. place the new index context at 746 * the bottom of the tree. 747 * 748 * Gcc is broken so it fails to see both the members of the anonymous 749 * union(s) and the bitfield inside the C99 initializer. Work around 750 * this by setting @is_locked, @is_dirty, @ia, and @vcn afterwards. 
751 */ 752 *new_ictx = (ntfs_index_context) { 753 .up = ictx, 754 .down = ictx->down, 755 .idx_ni = idx_ni, 756 .base_ni = ictx->base_ni, 757 .index = &ia->index, 758 .bytes_free = le32_to_cpu(ia->index.allocated_size) - 759 le32_to_cpu(ia->index.index_length), 760 .max_entries = ictx->max_entries, 761 }; 762 new_ictx->is_locked = 1; 763 new_ictx->is_dirty = is_dirty; 764 new_ictx->ia = ia; 765 new_ictx->vcn = vcn; 766 new_ictx->upl_ofs = ofs; 767 new_ictx->upl = upl; 768 new_ictx->pl = pl; 769 new_ictx->addr = addr; 770 ictx->down->up = new_ictx; 771 ictx->down = new_ictx; 772 /* 773 * Gather the index entry pointers and finish setting up the new index 774 * context. 775 */ 776 err = ntfs_index_get_entries(new_ictx); 777 if (!err) { 778 *index_ctx = new_ictx; 779 ntfs_debug("Done."); 780 return 0; 781 } 782 new_ictx->is_locked = 0; 783 new_ictx->is_dirty = 0; 784 new_ictx->upl = NULL; 785unm_err: 786 ntfs_page_unmap(idx_ni, upl, pl, is_dirty); 787err: 788 if (!err) { 789 NVolSetErrors(idx_ni->vol); 790 err = EIO; 791 } 792 return err; 793} 794 795/** 796 * ntfs_index_lookup_in_node - search for an entry in an index node 797 * @ictx: index context in which to search for the entry 798 * @match_key: index entry key data to search for 799 * @match_key_len: length of @match_key in bytes 800 * @key: index entry key to search for 801 * @key_len: length of @key in bytes 802 * 803 * Perform a binary search through the index entries in the index node 804 * described by @ictx looking for the correct entry or if not found for the 805 * index entry whose sub-node pointer to descend into. 806 * 807 * @key and @key_len is the complete key to search for. This is used when 808 * doing the full blown collation. 809 * 810 * When doing exact matching for filenames we need to compare the actual 811 * filenames rather than the filename attributes whereas for view indexes we 812 * need to compare the whole key. 
 * To make this function simpler we let the
 * caller specify the appropriate data to use for exact matching in @match_key
 * and @match_key_len. For view indexes @match_key and @match_key_len are the
 * same as @key and @key_len respectively.
 *
 * Return 0 on success and errno on error.
 *
 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode.
 *	    - The index context @ictx must be locked.
 */
static errno_t ntfs_index_lookup_in_node(ntfs_index_context *ictx,
		const void *match_key, const int match_key_len,
		const void *key, const int key_len)
{
	ntfs_inode *idx_ni;
	INDEX_ENTRY *ie, **entries;
	unsigned min_left, max_right, cur_entry;
	int rc;
	BOOL is_view;

	ntfs_debug("Entering.");
	idx_ni = ictx->idx_ni;
	/* Any index other than a directory ($I30) index is a view index. */
	is_view = FALSE;
	if (idx_ni->name != I30)
		is_view = TRUE;
	entries = ictx->entries;
	/*
	 * If there is only one entry, i.e. the index node is empty, we need to
	 * descend into the sub-node pointer of the end entry if present thus
	 * return the end entry.
	 */
	if (ictx->nr_entries == 1) {
		cur_entry = 0;
		goto not_found;
	}
	/*
	 * Now do a binary search through the index entries looking for the
	 * correct entry or if not found for the index entry whose sub-node
	 * pointer to descend into.
	 *
	 * Note we exclude the end entry from the search as it does not include
	 * a key we can compare. That makes the search algorithm simpler and
	 * more efficient.
	 */
	min_left = 0;
	max_right = ictx->nr_entries - 2;
	cur_entry = max_right / 2;
	do {
		void *ie_match_key;
		int ie_match_key_len;

		ie = entries[cur_entry];
		if (!is_view) {
			/*
			 * Directory index: compare the raw filename (in NTFS
			 * characters) rather than the whole FILENAME_ATTR.
			 */
			ie_match_key_len = ie->key.filename.filename_length <<
					NTFSCHAR_SIZE_SHIFT;
			ie_match_key = &ie->key.filename.filename;
		} else {
			ie_match_key_len = le16_to_cpu(ie->key_length);
			ie_match_key = &ie->key;
		}
		/*
		 * If the keys match perfectly, we setup @ictx to point to the
		 * matching entry and return.
		 */
		if ((match_key_len == ie_match_key_len) &&
				!bcmp(match_key, ie_match_key,
				ie_match_key_len)) {
found:
			ictx->entry = ie;
			ictx->entry_nr = cur_entry;
			ictx->is_match = 1;
			ntfs_debug("Done (found).");
			return 0;
		}
		/*
		 * Not a perfect match, need to do full blown collation so we
		 * know which way in the B+tree we have to go.
		 */
		rc = ntfs_collate(idx_ni->vol, idx_ni->collation_rule, key,
				key_len, &ie->key, le16_to_cpu(ie->key_length));
		/*
		 * If @key collates before the key of the current entry, need
		 * to search on the left.
		 */
		if (rc == -1) {
			/*
			 * If this is the left-most entry we need to descend
			 * into it if it has a sub-node.
			 */
			if (cur_entry == min_left)
				break;
			/* Exclude the wrong half or remaining entries. */
			max_right = cur_entry - 1;
			/* Find the middle of the remaining entries. */
			cur_entry = (min_left + max_right) / 2;
			continue;
		}
		/*
		 * A match should never happen as the bcmp() call should have
		 * caught it, but we still treat it correctly.
		 */
		if (!rc)
			goto found;
		/*
		 * @key collates after the key of the current entry, need to
		 * search on the right.
		 *
		 * If this is the right-most entry we need to descend into the
		 * end entry on the right if it has a sub-node.
		 */
		if (cur_entry == max_right) {
			cur_entry++;
			break;
		}
		/* Exclude the wrong half or remaining entries. */
		min_left = cur_entry + 1;
		/* Find the middle of the remaining entries. */
		cur_entry = (min_left + max_right) / 2;
	} while (1);
not_found:
	/* Record the entry whose sub-node (if any) the caller must follow. */
	ictx->follow_entry = entries[cur_entry];
	ictx->entry_nr = cur_entry;
	ntfs_debug("Done (not found).");
	return ENOENT;
}

/**
 * ntfs_index_lookup - find a key in an index and return its index entry
 * @key:	[IN] key for which to search in the index
 * @key_len:	[IN] length of @key in bytes
 * @index_ctx:	[IN/OUT] context describing the index and the returned entry
 *
 * Before calling ntfs_index_lookup(), *@index_ctx must have been obtained
 * from a call to ntfs_index_ctx_get().
 *
 * Look for the @key in the index specified by the index lookup context
 * @index_ctx. ntfs_index_lookup() walks the contents of the index looking for
 * the @key. As it does so, it records the path taken through the B+tree
 * together with other useful information it obtains along the way. This is
 * needed if the lookup is going to be followed by a complex index tree
 * operation such as an index entry addition requiring one or more index block
 * splits for example.
 *
 * If the @key is found in the index, 0 is returned and *@index_ctx is set up
 * to describe the index entry containing the matching @key. The matching
 * entry is pointed to by *@index_ctx->entry.
 *
 * If the @key is not found in the index, ENOENT is returned and *@index_ctx is
 * setup to describe the index entry whose key collates immediately after the
 * search @key, i.e. this is the position in the index at which an index entry
 * with a key of @key would need to be inserted.
 *
 * If an error occurs return the error code.
In this case *@index_ctx is
 * undefined and must be freed via a call to ntfs_index_ctx_put() or
 * reinitialized via a call to ntfs_index_ctx_put_reuse().
 *
 * When finished with the entry and its data, call ntfs_index_ctx_put() to free
 * the context and other associated resources.
 *
 * If the index entry was modified, call ntfs_index_entry_mark_dirty() before
 * the call to ntfs_index_ctx_put() to ensure that the changes are written to
 * disk.
 *
 * Locking: - Caller must hold @index_ctx->idx_ni->lock on the index inode.
 *	    - Caller must hold an iocount reference on the index inode.
 *
 * TODO: An optimization would be to take two new parameters, say @add and
 * @del, which allow our caller to tell us if the search is for purposes of
 * adding an entry (@add is true) or for removing an entry (@del is true) or if
 * it is a simple lookup (read-only or overwrite without change in length, both
 * @add and @del are false).  For the lookup case we do not need to record the
 * path so we can just use one single index context and only one array of index
 * entry pointers and we keep reusing both instead of getting new ones each
 * time we descend into a sub-node.  Alternatively take a single parameter
 * @reason or @intent or something and define some constants like NTFS_LOOKUP,
 * NTFS_ADD, and NTFS_DEL or something to go with it to serve the same purpose
 * as above.
 */
errno_t ntfs_index_lookup(const void *key, const int key_len,
		ntfs_index_context **index_ctx)
{
	ntfs_index_context *ictx;
	const void *match_key;
	int match_key_len;
	errno_t err;

	ntfs_debug("Entering.");
	if (!key)
		panic("%s(): !key\n", __FUNCTION__);
	if (key_len <= 0)
		panic("%s(): key_len <= 0\n", __FUNCTION__);
	ictx = *index_ctx;
	/*
	 * When doing exact matching for filenames we need to compare the
	 * actual filenames rather than the filename attributes whereas for
	 * view indexes we need to compare the whole key.
	 */
	if (ictx->idx_ni->name == I30) {
		match_key_len = ((FILENAME_ATTR*)key)->filename_length <<
				NTFSCHAR_SIZE_SHIFT;
		match_key = &((FILENAME_ATTR*)key)->filename;
	} else {
		match_key_len = key_len;
		match_key = key;
	}
	/* Prepare the search context for its first lookup. */
	err = ntfs_index_lookup_init(ictx, key_len);
	if (err)
		goto err;
	do {
		/*
		 * Look for the @key in the current index node.  If found, the
		 * index context points to the found entry so we are done.  If
		 * not found, the index context points to the entry whose
		 * sub-node pointer we need to descend into if it is present
		 * and if not we have failed to find the entry and are also
		 * done.
		 */
		err = ntfs_index_lookup_in_node(ictx, match_key, match_key_len,
				key, key_len);
		if (err && err != ENOENT)
			panic("%s(): err && err != ENOENT\n", __FUNCTION__);
		if (!err || !(ictx->entry->flags & INDEX_ENTRY_NODE))
			break;
		/* Not found but child node present, descend into it. */
		err = ntfs_index_descend_into_child_node(&ictx);
		if (err)
			goto err;
		/*
		 * Replace the caller's index context with the new one so the
		 * caller always gets the bottom-most index context.
		 */
		*index_ctx = ictx;
	} while (1);
	ntfs_debug("Done (%s in index %s).",
			!err ? "found matching entry" : "entry not found",
			ictx->is_root ? "root" : "allocation block");
	return err;
err:
	ntfs_error(ictx->idx_ni->vol->mp, "Failed (error %d).", err);
	return err;
}

/**
 * ntfs_index_lookup_by_position - find an entry by its position in the B+tree
 * @pos:	[IN] position of index entry to find in the B+tree
 * @key_len:	[IN] size of the key ntfs_index_lookup() is called with in bytes
 * @index_ctx:	[IN/OUT] context describing the index and the returned entry
 *
 * Before calling ntfs_index_lookup_by_position(), *@index_ctx must have been
 * obtained from a call to ntfs_index_ctx_get().
 *
 * Start searching at the beginning of the B+tree, counting each of the entries
 * and when the entry with position number @pos has been reached return that in
 * *@index_ctx.
 *
 * @key_len is the length of the key ntfs_index_lookup() would be called with.
 * If the index *@index_ctx is a directory index this is ignored.
 *
 * As the search progresses, the path taken through the B+tree is recorded
 * together with other useful information obtained along the way.  This is
 * needed so the lookup can proceed to the next entry if/when desired, for
 * example during a ntfs_readdir() call.
 *
 * If the position @pos is found in the index, 0 is returned and *@index_ctx is
 * set up to describe the index entry at position @pos.  The matching entry is
 * pointed to by *@index_ctx->entry.
 *
 * If the position @pos is not found in the index, ENOENT is returned.
 *
 * If an error occurs the error code is returned (cannot be ENOENT).
 *
 * If any values other than 0 are returned, *@index_ctx is undefined and must
 * be freed via a call to ntfs_index_ctx_put() or reinitialized via a call to
 * ntfs_index_ctx_put_reuse().
 *
 * When finished with the entry and its data, call ntfs_index_ctx_put() to free
 * the context and other associated resources.
 *
 * If the index entry was modified, call ntfs_index_entry_mark_dirty() before
 * the call to ntfs_index_ctx_put() to ensure that the changes are written to
 * disk.
 *
 * Locking: - Caller must hold @index_ctx->idx_ni->lock on the index inode.
 *	    - Caller must hold an iocount reference on the index inode.
 */
errno_t ntfs_index_lookup_by_position(const s64 pos, const int key_len,
		ntfs_index_context **index_ctx)
{
	s64 current_pos;
	ntfs_index_context *ictx;
	errno_t err;

	ntfs_debug("Entering for position 0x%llx.", (unsigned long long)pos);
	if (pos < 0)
		panic("%s(): pos < 0\n", __FUNCTION__);
	ictx = *index_ctx;
	/* Prepare the search context for its first lookup. */
	err = ntfs_index_lookup_init(ictx, key_len);
	if (err)
		goto err;
	/* Start at the first index entry in the index root. */
	ictx->entry = ictx->entries[0];
	ictx->entry_nr = 0;
	current_pos = 0;
	/*
	 * Iterate over the entries in the B+tree counting them up until we
	 * reach entry position @pos in which case we are done.
	 */
	do {
		/*
		 * Keep descending into the sub-node pointers until a leaf node
		 * is found.
		 */
		while (ictx->entry->flags & INDEX_ENTRY_NODE) {
			/* Child node present, descend into it. */
			err = ntfs_index_descend_into_child_node(&ictx);
			if (err)
				goto err;
			/* Start at the first index entry in the index node. */
			ictx->entry = ictx->entries[0];
			ictx->entry_nr = 0;
		}
		/*
		 * We have reached the next leaf node.  If @pos is in this node
		 * skip to the correct entry and we are done.
		 *
		 * Note we need the -1 because @ictx->nr_entries counts the end
		 * entry which does not contain any data thus we do not count
		 * it as a real entry.
		 */
		if (current_pos + ictx->nr_entries - 1 > pos) {
			/*
			 * No need to skip any entries if the current entry is
			 * the one matching @pos.
			 */
			if (current_pos < pos) {
				s64 nr;

				nr = ictx->entry_nr + (pos - current_pos);
				if (nr >= ictx->nr_entries - 1)
					panic("%s(): nr >= ictx->nr_entries - "
							"1\n", __FUNCTION__);
				ictx->entry_nr = nr;
				ictx->entry = ictx->entries[nr];
				current_pos = pos;
			} else if (current_pos > pos)
				panic("%s(): current_pos > pos\n",
						__FUNCTION__);
			break;
		}
		/*
		 * @pos is not in this node.  Skip the whole node, i.e. skip
		 * @ictx->nr_entries - 1 real index entries.
		 */
		current_pos += ictx->nr_entries - 1;
		/*
		 * Keep moving up the B+tree until we find an index node that
		 * we have not finished yet.
		 */
		do {
			ntfs_index_context *itmp;

			/* If we are in the index root, we are done. */
			if (ictx->is_root)
				goto not_found;
			/* Save the current index context so we can free it. */
			itmp = ictx;
			/* Move up to the parent node. */
			ictx = ictx->up;
			/*
			 * Disconnect the old index context from its path and
			 * free it.
			 */
			ntfs_index_ctx_disconnect(itmp);
			ntfs_index_ctx_put_reuse_single(itmp);
			ntfs_index_ctx_free(itmp);
		} while (ictx->entry_nr == ictx->nr_entries - 1);
		/*
		 * We have reached a node which we have not finished with yet.
		 * Lock it so we can work on it.
		 */
		err = ntfs_index_ctx_relock(ictx);
		if (err)
			goto err;
		/*
		 * Check if the current entry is the entry matching @pos and if
		 * so we are done.
		 */
		if (current_pos == pos)
			break;
		/*
		 * Move to the next entry in this node and continue descending
		 * into the sub-node pointers until a leaf node is found.  The
		 * first entry in the leaf node will be the next entry thus
		 * increment @current_pos now.
		 */
		ictx->entry = ictx->entries[++ictx->entry_nr];
		current_pos++;
	} while (1);
	/*
	 * Indicate that the current entry is the one the caller was looking
	 * for and return the index context containing it to the caller.
	 */
	ictx->is_match = 1;
	*index_ctx = ictx;
	ntfs_debug("Done (entry with position 0x%llx found in index %s).",
			(unsigned long long)pos,
			ictx->is_root ? "root" : "allocation block");
	return 0;
not_found:
	ntfs_debug("Done (entry with position 0x%llx not found in index, "
			"returning ENOENT).", (unsigned long long)pos);
	/* Ensure we return a valid index context. */
	*index_ctx = ictx;
	return ENOENT;
err:
	ntfs_error(ictx->idx_ni->vol->mp, "Failed (error %d).", err);
	/* Ensure we return a valid index context. */
	*index_ctx = ictx;
	return err;
}

/**
 * ntfs_index_lookup_next - find the next entry in the B+tree
 * @index_ctx:	[IN/OUT] context describing the index and the returned entry
 *
 * Before calling ntfs_index_lookup_next(), *@index_ctx must have been obtained
 * from a call to ntfs_index_lookup() or ntfs_index_lookup_by_position().
 *
 * If the next entry is found in the index, 0 is returned and *@index_ctx is
 * set up to describe the next index entry.  The matching entry is pointed to
 * by *@index_ctx->entry.
 *
 * If there is no next entry in the index, ENOENT is returned.
 *
 * If an error occurs the error code is returned (cannot be ENOENT).
 *
 * If any values other than 0 are returned, *@index_ctx is undefined and must
 * be freed via a call to ntfs_index_ctx_put() or reinitialized via a call to
 * ntfs_index_ctx_put_reuse().
 *
 * When finished with the entry and its data, call ntfs_index_ctx_put() to free
 * the context and other associated resources.
 *
 * If the index entry was modified, call ntfs_index_entry_mark_dirty() before
 * the call to ntfs_index_ctx_put() to ensure that the changes are written to
 * disk.
 *
 * Locking: - Caller must hold @index_ctx->idx_ni->lock on the index inode.
 *	    - Caller must hold an iocount reference on the index inode.
 */
errno_t ntfs_index_lookup_next(ntfs_index_context **index_ctx)
{
	ntfs_index_context *ictx;
	errno_t err;

	ntfs_debug("Entering.");
	ictx = *index_ctx;
	if (!ictx->base_ni)
		panic("%s(): !ictx->base_ni\n", __FUNCTION__);
	/*
	 * Ensure the index context is initialized, i.e. it is not legal to
	 * call ntfs_index_lookup_next() with a search context that has not
	 * been used for a call to ntfs_index_lookup_by_position() or
	 * ntfs_index_lookup() yet.
	 */
	if (!ictx->max_entries)
		panic("%s(): Called for uninitialized index context.\n",
				__FUNCTION__);
	/*
	 * @ictx must currently point to real entry thus @ictx->nr_entries must
	 * be greater or equal to 2 as it includes the end entry which cannot
	 * contain any data.
	 */
	if (ictx->nr_entries < 2)
		panic("%s(): ictx->nr_entries < 2\n", __FUNCTION__);
	if (!(ictx->entry->flags & INDEX_ENTRY_NODE)) {
		/*
		 * The current entry is in a leaf node.
		 *
		 * If it is not the last entry in the node return the next
		 * entry in the node.
		 */
		if (ictx->entry_nr < ictx->nr_entries - 2) {
			ictx->entry = ictx->entries[++ictx->entry_nr];
			ntfs_debug("Done (same leaf node).");
			return 0;
		}
		/*
		 * The current entry is in a leaf node and it is the last entry
		 * in the node.  The next entry is the first real entry above
		 * the current node thus keep moving up the B+tree until we
		 * find a real entry.
		 */
		do {
			ntfs_index_context *itmp;

			/* If we are in the index root, we are done. */
			if (ictx->is_root) {
				ntfs_debug("Done (was at last entry already, "
						"returning ENOENT).");
				/* Ensure we return a valid index context. */
				*index_ctx = ictx;
				return ENOENT;
			}
			/* Save the current index context so we can free it. */
			itmp = ictx;
			/* Move up to the parent node. */
			ictx = ictx->up;
			/*
			 * Disconnect the old index context from its path and
			 * free it.
			 */
			ntfs_index_ctx_disconnect(itmp);
			ntfs_index_ctx_put_reuse_single(itmp);
			ntfs_index_ctx_free(itmp);
		} while (ictx->entry_nr == ictx->nr_entries - 1);
		/*
		 * We have reached a node with a real index entry.  Lock it so
		 * we can work on it.
		 */
		err = ntfs_index_ctx_relock(ictx);
		if (err)
			goto err;
		/*
		 * Indicate that the current entry is the next entry and return
		 * the index context containing it to the caller.
		 */
		ictx->is_match = 1;
		*index_ctx = ictx;
		ntfs_debug("Done (upper index node).");
		return 0;
	}
	/*
	 * The current entry is in an index node.
	 *
	 * To find the next entry we need to switch to the next entry in this
	 * node and then descend into the sub-node pointers until a leaf node
	 * is found.  The first entry of that leaf node is the next entry.
	 *
	 * First, unmark this node as being the matching one and switch to the
	 * next entry in this node.
	 */
	ictx->is_match = 0;
	ictx->entry = ictx->entries[++ictx->entry_nr];
	/*
	 * Keep descending into the sub-node pointers until a leaf node is
	 * found.
	 */
	while (ictx->entry->flags & INDEX_ENTRY_NODE) {
		/* Child node present, descend into it. */
		err = ntfs_index_descend_into_child_node(&ictx);
		if (err)
			goto err;
		/* Start at the first index entry in the index node. */
		ictx->entry = ictx->entries[0];
		ictx->entry_nr = 0;
	}
	/*
	 * We have reached the next leaf node.  The next entry is the first
	 * entry in this node which we have already set up to be the current
	 * entry.
	 *
	 * Indicate that the current entry is the one the caller was looking
	 * for and return the index context containing it to the caller.
	 */
	ictx->is_match = 1;
	*index_ctx = ictx;
	ntfs_debug("Done (next leaf node).");
	return 0;
err:
	ntfs_error(ictx->idx_ni->vol->mp, "Failed (error %d).", err);
	/* Ensure we return a valid index context. */
	*index_ctx = ictx;
	return err;
}

/**
 * ntfs_index_entry_mark_dirty - mark an index entry dirty
 * @ictx:	ntfs index context describing the index entry
 *
 * Mark the index entry described by the index entry context @ictx dirty.
 *
 * If the index entry is in the index root attribute, simply mark the mft
 * record containing the index root attribute dirty.  This ensures the mft
 * record, and hence the index root attribute, will be written out to disk
 * later.
 *
 * If the index entry is in an index block belonging to the index allocation
 * attribute, mark the page the index block is in dirty.
 *
 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
 *	      writing.
 *	    - The index context @ictx must be locked.
1407 */ 1408void ntfs_index_entry_mark_dirty(ntfs_index_context *ictx) 1409{ 1410 if (!ictx->is_locked) 1411 panic("%s(): Index context is not locked.\n", __FUNCTION__); 1412 if (ictx->is_root) { 1413 if (!ictx->actx) 1414 panic("%s(): Attribute context is missing from index " 1415 "context.\n", __FUNCTION__); 1416 NInoSetMrecNeedsDirtying(ictx->actx->ni); 1417 } else { 1418 if (!ictx->upl) 1419 panic("%s(): Page list is missing from index " 1420 "context.\n", __FUNCTION__); 1421 ictx->is_dirty = 1; 1422 } 1423} 1424 1425/** 1426 * ntfs_insert_index_allocation_attribute - add the index allocation attribute 1427 * @ni: base ntfs inode to which the attribute is being added 1428 * @ctx: search context describing where to insert the attribute 1429 * @idx_ni: index inode for which to add the index allocation attribute 1430 * 1431 * Insert a new index allocation attribute in the base ntfs inode @ni at the 1432 * position indicated by the attribute search context @ctx and add an attribute 1433 * list attribute entry for it if the inode uses the attribute list attribute. 1434 * 1435 * The new attribute type and name are given by the index inode @idx_ni. 1436 * 1437 * If @idx_ni contains a runlist, it is used to create the mapping pairs array 1438 * for the non-resident index allocation attribute. 1439 * 1440 * Return 0 on success and errno on error. 1441 * 1442 * WARNING: Regardless of whether success or failure is returned, you need to 1443 * check @ctx->is_error and if true the @ctx is no longer valid, i.e. 1444 * you need to either call ntfs_attr_search_ctx_reinit() or 1445 * ntfs_attr_search_ctx_put() on it. In that case @ctx->error will 1446 * give the error code for why the mapping of the inode failed. 1447 * 1448 * Locking: Caller must hold @idx_ni->lock on the index inode for writing. 
 */
static errno_t ntfs_insert_index_allocation_attribute(ntfs_inode *ni,
		ntfs_attr_search_ctx *ctx, ntfs_inode *idx_ni)
{
	ntfs_volume *vol;
	MFT_RECORD *base_m, *m;
	ATTR_RECORD *a;
	ATTR_LIST_ENTRY *al_entry;
	unsigned mp_ofs, mp_size, al_entry_used, al_entry_len;
	unsigned new_al_size, new_al_alloc;
	errno_t err;
	BOOL al_entry_added;

	ntfs_debug("Entering for mft_no 0x%llx, name_len 0x%x.",
			(unsigned long long)ni->mft_no, idx_ni->name_len);
	vol = idx_ni->vol;
	/*
	 * Calculate the offset into the new attribute at which the mapping
	 * pairs array begins.  The mapping pairs array is placed after the
	 * name aligned to an 8-byte boundary which in turn is placed
	 * immediately after the non-resident attribute record itself.
	 */
	mp_ofs = offsetof(ATTR_RECORD, compressed_size) +
			(((idx_ni->name_len << NTFSCHAR_SIZE_SHIFT) + 7) & ~7);
	/* Work out the size for the mapping pairs array. */
	err = ntfs_get_size_for_mapping_pairs(vol,
			idx_ni->rl.elements ? idx_ni->rl.rl : NULL, 0, -1,
			&mp_size);
	if (err)
		panic("%s(): err\n", __FUNCTION__);
	/*
	 * Work out the size for the attribute record.  We simply take the
	 * offset to the mapping pairs array we worked out above and add the
	 * size of the mapping pairs array in bytes aligned to an 8-byte
	 * boundary.  Note we do not need to do the alignment as
	 * ntfs_attr_record_make_space() does it anyway.
	 *
	 * The current implementation of ntfs_attr_lookup() will always return
	 * pointing into the base mft record when an attribute is not found.
	 */
	base_m = ctx->m;
retry:
	if (ni != ctx->ni)
		panic("%s(): ni != ctx->ni\n", __FUNCTION__);
	m = ctx->m;
	a = ctx->a;
	err = ntfs_attr_record_make_space(m, a, mp_ofs + mp_size);
	if (err) {
		ntfs_inode *eni;

		if (err != ENOSPC)
			panic("%s(): err != ENOSPC\n", __FUNCTION__);
		/*
		 * There was not enough space in the mft record to insert the
		 * new attribute record which means we will need to insert it
		 * into an extent mft record.
		 *
		 * To avoid bugs and impossible situations, check that the
		 * attribute is not already the only attribute in the mft
		 * record otherwise moving it would not give us anything.
		 */
		if (ntfs_attr_record_is_only_one(m, a))
			panic("%s(): ntfs_attr_record_is_only_one()\n",
					__FUNCTION__);
		/*
		 * Before we can allocate an extent mft record, we need to
		 * ensure that the inode has an attribute list attribute.
		 */
		if (!NInoAttrList(ni)) {
			err = ntfs_attr_list_add(ni, m, NULL);
			if (err) {
				ntfs_error(vol->mp, "Failed to add attribute "
						"list attribute to mft_no "
						"0x%llx (error %d).",
						(unsigned long long)ni->mft_no,
						err);
				return err;
			}
			/*
			 * Adding the attribute list attribute may have
			 * generated enough space in the base mft record to
			 * fit the attribute so try again.
			 */
			ntfs_attr_search_ctx_reinit(ctx);
			err = ntfs_attr_lookup(AT_INDEX_ALLOCATION,
					idx_ni->name, idx_ni->name_len, 0,
					NULL, 0, ctx);
			if (err == ENOENT) {
				/*
				 * The current implementation of
				 * ntfs_attr_lookup() will always return
				 * pointing into the base mft record when an
				 * attribute is not found.
				 */
				if (m != ctx->m)
					panic("%s(): m != ctx->m\n",
							__FUNCTION__);
				goto retry;
			}
			/*
			 * We cannot have found the attribute as we have
			 * exclusive access and know that it does not exist
			 * already.
			 */
			if (!err)
				panic("%s() !err\n", __FUNCTION__);
			/*
			 * Something has gone wrong.  Note we have to bail out
			 * as a failing attribute lookup indicates corruption
			 * and/or disk failure and/or not enough memory all of
			 * which would prevent us from rolling back the
			 * attribute list attribute addition.
			 */
			ntfs_error(vol->mp, "Failed to add index allocation "
					"attribute to mft_no 0x%llx because "
					"looking up the attribute failed "
					"(error %d).",
					(unsigned long long)ni->mft_no, err);
			return err;
		}
		/*
		 * We now need to allocate a new extent mft record, attach it
		 * to the base ntfs inode and set up the search context to
		 * point to it, then insert the new attribute into it.
		 */
		err = ntfs_mft_record_alloc(vol, NULL, NULL, ni, &eni, &m, &a);
		if (err) {
			ntfs_error(vol->mp, "Failed to add index allocation "
					"attribute to mft_no 0x%llx because "
					"allocating a new extent mft record "
					"failed (error %d).",
					(unsigned long long)ni->mft_no, err);
			/*
			 * TODO: If we added the attribute list attribute above
			 * we could now remove it again but this may require
			 * moving attributes back into the base mft record so
			 * is not a trivial amount of work and in the end it
			 * does not really matter if we leave an inode with an
			 * attribute list attribute that does not really need
			 * it.  Especially so since this is error handling and
			 * it is not an error to have an attribute list
			 * attribute when not strictly required.
			 */
			return err;
		}
		ctx->ni = eni;
		ctx->m = m;
		ctx->a = a;
		/*
		 * Make space for the new attribute.  This cannot fail as we
		 * now have an empty mft record which by definition can hold
		 * a maximum size resident attribute record.
		 */
		err = ntfs_attr_record_make_space(m, a, mp_ofs + mp_size);
		if (err)
			panic("%s(): err (ntfs_attr_record_make_space())\n",
					__FUNCTION__);
	}
	/*
	 * Now setup the new attribute record.  The entire attribute has been
	 * zeroed and the length of the attribute record has been set up by
	 * ntfs_attr_record_make_space().
	 */
	a->type = AT_INDEX_ALLOCATION;
	a->non_resident = 1;
	a->name_length = idx_ni->name_len;
	a->name_offset = const_cpu_to_le16(offsetof(ATTR_RECORD,
			compressed_size));
	a->instance = m->next_attr_instance;
	/*
	 * Increment the next attribute instance number in the mft record as we
	 * consumed the old one.
	 */
	m->next_attr_instance = cpu_to_le16(
			(le16_to_cpu(m->next_attr_instance) + 1) & 0xffff);
	a->highest_vcn = cpu_to_sle64((idx_ni->allocated_size >>
			vol->cluster_size_shift) - 1);
	a->mapping_pairs_offset = cpu_to_le16(mp_ofs);
	/* The sizes are protected by the size lock of the index inode. */
	lck_spin_lock(&idx_ni->size_lock);
	a->allocated_size = cpu_to_sle64(idx_ni->allocated_size);
	a->data_size = cpu_to_sle64(idx_ni->data_size);
	a->initialized_size = cpu_to_sle64(idx_ni->initialized_size);
	lck_spin_unlock(&idx_ni->size_lock);
	/* Copy the attribute name into place. */
	if (idx_ni->name_len)
		memcpy((u8*)a + offsetof(ATTR_RECORD, compressed_size),
				idx_ni->name, idx_ni->name_len <<
				NTFSCHAR_SIZE_SHIFT);
	/* Generate the mapping pairs array into place. */
	err = ntfs_mapping_pairs_build(vol, (s8*)a + mp_ofs, mp_size,
			idx_ni->rl.elements ? idx_ni->rl.rl : NULL, 0,
			-1, NULL);
	if (err)
		panic("%s(): err (ntfs_mapping_pairs_build())\n", __FUNCTION__);
	/*
	 * If the inode does not use the attribute list attribute we are done.
	 *
	 * If the inode uses the attribute list attribute (including the case
	 * where we just created it), we need to add an attribute list
	 * attribute entry for the attribute.
	 */
	if (!NInoAttrList(ni))
		goto done;
	/* Add an attribute list attribute entry for the inserted attribute. */
	al_entry = ctx->al_entry;
	al_entry_used = offsetof(ATTR_LIST_ENTRY, name) +
			(idx_ni->name_len << NTFSCHAR_SIZE_SHIFT);
	/* Attribute list entries are 8-byte aligned on disk. */
	al_entry_len = (al_entry_used + 7) & ~7;
	new_al_size = ni->attr_list_size + al_entry_len;
	/* Out of bounds checks. */
	if ((u8*)al_entry < ni->attr_list ||
			(u8*)al_entry > ni->attr_list + new_al_size ||
			(u8*)al_entry + al_entry_len >
			ni->attr_list + new_al_size) {
		/* Inode is corrupt. */
		ntfs_error(vol->mp, "Mft_no 0x%llx is corrupt. Run chkdsk.",
				(unsigned long long)ni->mft_no);
		err = EIO;
		goto undo;
	}
	err = ntfs_attr_size_bounds_check(vol, AT_ATTRIBUTE_LIST, new_al_size);
	if (err) {
		if (err == ERANGE) {
			ntfs_error(vol->mp, "Cannot insert attribute into "
					"mft_no 0x%llx because the attribute "
					"list attribute would become too "
					"large. You need to defragment your "
					"volume and then try again.",
					(unsigned long long)ni->mft_no);
			err = ENOSPC;
		} else {
			ntfs_error(vol->mp, "Attribute list attribute is "
					"unknown on the volume. The volume "
					"is corrupt. Run chkdsk.");
			NVolSetErrors(vol);
			err = EIO;
		}
		goto undo;
	}
	new_al_alloc = (new_al_size + NTFS_ALLOC_BLOCK - 1) &
			~(NTFS_ALLOC_BLOCK - 1);
	/*
	 * Reallocate the memory buffer if needed and create space for the new
	 * entry.
	 */
	if (new_al_alloc > ni->attr_list_alloc) {
		u8 *tmp, *al, *al_end;
		unsigned al_entry_ofs;

		tmp = OSMalloc(new_al_alloc, ntfs_malloc_tag);
		if (!tmp) {
			ntfs_error(vol->mp, "Not enough memory to extend "
					"attribute list attribute of mft_no "
					"0x%llx.",
					(unsigned long long)ni->mft_no);
			err = ENOMEM;
			goto undo;
		}
		al = ni->attr_list;
		al_entry_ofs = (u8*)al_entry - al;
		al_end = al + ni->attr_list_size;
		/* Copy the existing cache, leaving a gap for the new entry. */
		memcpy(tmp, al, al_entry_ofs);
		if ((u8*)al_entry < al_end)
			memcpy(tmp + al_entry_ofs + al_entry_len,
					al + al_entry_ofs,
					ni->attr_list_size - al_entry_ofs);
		al_entry = ctx->al_entry = (ATTR_LIST_ENTRY*)(tmp +
				al_entry_ofs);
		OSFree(ni->attr_list, ni->attr_list_alloc, ntfs_malloc_tag);
		ni->attr_list_alloc = new_al_alloc;
		ni->attr_list = tmp;
	} else if ((u8*)al_entry < ni->attr_list + ni->attr_list_size)
		memmove((u8*)al_entry + al_entry_len, al_entry,
				ni->attr_list_size - ((u8*)al_entry -
				ni->attr_list));
	ni->attr_list_size = new_al_size;
	/* Set up the attribute list entry. */
	al_entry->type = AT_INDEX_ALLOCATION;
	al_entry->length = cpu_to_le16(al_entry_len);
	al_entry->name_length = idx_ni->name_len;
	al_entry->name_offset = offsetof(ATTR_LIST_ENTRY, name);
	al_entry->lowest_vcn = 0;
	al_entry->mft_reference = MK_LE_MREF(ctx->ni->mft_no, ctx->ni->seq_no);
	al_entry->instance = a->instance;
	/* Copy the attribute name into place. */
	if (idx_ni->name_len)
		memcpy((u8*)&al_entry->name, idx_ni->name,
				idx_ni->name_len << NTFSCHAR_SIZE_SHIFT);
	/* For tidiness, zero any unused space. */
	if (al_entry_len != al_entry_used) {
		if (al_entry_len < al_entry_used)
			panic("%s(): al_entry_len < al_entry_used\n",
					__FUNCTION__);
		memset((u8*)al_entry + al_entry_used, 0,
				al_entry_len - al_entry_used);
	}
	/*
	 * Extend the attribute list attribute and copy in the modified
	 * value from the cache.
	 */
	err = ntfs_attr_list_sync_extend(ni, base_m,
			(u8*)al_entry - ni->attr_list, ctx);
	if (err) {
		ntfs_error(vol->mp, "Failed to extend attribute list "
				"attribute of mft_no 0x%llx (error %d).",
				(unsigned long long)ni->mft_no, err);
		al_entry_added = TRUE;
		goto undo_al;
	}
done:
	ntfs_debug("Done.");
	return 0;
undo:
	al_entry_added = FALSE;
undo_al:
	/*
	 * Need to remove the attribute again or free the extent mft record if
	 * there are no attributes remaining in it.
	 */
	if (m == base_m || !ntfs_attr_record_is_only_one(m, a)) {
		ntfs_attr_record_delete_internal(m, a);
		/*
		 * If the attribute was not in the base mft record mark the
		 * extent mft record dirty so it gets written out later.  If
		 * the attribute was in the base mft record it will be marked
		 * dirty later.
		 *
		 * We also unmap the extent mft record and we set @ctx->ni to
		 * equal the base inode @ni so that the search context is
		 * initialized from scratch or simply freed if the caller
		 * reinitializes or releases the search context respectively.
		 */
		if (m != base_m) {
			NInoSetMrecNeedsDirtying(ctx->ni);
			ntfs_extent_mft_record_unmap(ctx->ni);
			ctx->ni = ni;
		}
	} else {
		errno_t err2;
		BOOL al_needed;

		err2 = ntfs_extent_mft_record_free(ni, ctx->ni, m);
		if (err2) {
			/*
			 * Ignore the error as we just end up with an unused
			 * mft record that is marked in use.
			 */
			ntfs_error(vol->mp, "Failed to free extent mft_no "
					"0x%llx (error %d). Unmount and run "
					"chkdsk to recover the lost inode.",
					(unsigned long long)ctx->ni->mft_no,
					err2);
			NVolSetErrors(vol);
			/*
			 * Release the extent mft record after dirtying it thus
			 * simulating the effect of freeing it.
			 */
			NInoSetMrecNeedsDirtying(ctx->ni);
			ntfs_extent_mft_record_unmap(ctx->ni);
		}
		/*
		 * The attribute search context still points to the no longer
		 * mapped extent inode thus we need to change it to point to
		 * the base inode instead so the context can be reinitialized
		 * or released safely.
		 */
		ctx->ni = ni;
		/*
		 * Check the attribute list attribute.  If there are no other
		 * attribute list attribute entries referencing extent mft
		 * records delete the attribute list attribute altogether.
		 *
		 * If this fails it does not matter as we simply retain the
		 * attribute list attribute so we ignore the error and go on to
		 * delete the attribute list attribute entry instead.
		 *
		 * If there are other attribute list attribute entries
		 * referencing extent mft records we still need the attribute
		 * list attribute thus we go on to delete the attribute list
		 * entry corresponding to the attribute record we just deleted
		 * by freeing its extent mft record.
		 */
		err2 = ntfs_attr_list_is_needed(ni,
				al_entry_added ? al_entry : NULL, &al_needed);
		if (err2)
			ntfs_warning(vol->mp, "Failed to determine if "
					"attribute list attribute of mft_no "
					"0x%llx if still needed (error %d). "
					"Assuming it is still needed and "
					"continuing.",
					(unsigned long long)ni->mft_no, err2);
		else if (!al_needed) {
			/*
			 * No more extent mft records are in use.  Delete the
			 * attribute list attribute.
			 */
			ntfs_attr_search_ctx_reinit(ctx);
			err2 = ntfs_attr_list_delete(ni, ctx);
			if (!err2) {
				/*
				 * We deleted the attribute list attribute and
				 * this will have updated the base inode
				 * appropriately thus we have restored
				 * everything as it was before.
				 */
				return err;
			}
			ntfs_warning(vol->mp, "Failed to delete attribute "
					"list attribute of mft_no 0x%llx "
					"(error %d). Continuing using "
					"alternative error recovery method.",
					(unsigned long long)ni->mft_no, err2);
		}
	}
	/*
	 * Both @ctx and @ni are now invalid and cannot be used any more which
	 * is fine as we have finished dealing with the attribute record.
	 *
	 * We now need to delete the corresponding attribute list attribute
	 * entry if we created it.
	 *
	 * Then we need to rewrite the attribute list attribute again because
	 * ntfs_attr_list_sync_extend() may have left it in an indeterminate
	 * state.
	 */
	if (al_entry_added) {
		errno_t err2;

		ntfs_attr_list_entry_delete(ni, al_entry);
		ntfs_attr_search_ctx_reinit(ctx);
		err2 = ntfs_attr_list_sync_shrink(ni, 0, ctx);
		if (err2) {
			ntfs_error(vol->mp, "Failed to restore attribute list "
					"attribute in base mft_no 0x%llx "
					"(error %d). Leaving inconsistent "
					"metadata. Unmount and run chkdsk.",
					(unsigned long long)ni->mft_no, err2);
			NVolSetErrors(vol);
		}
	}
	/* Make sure any changes are written out. */
	NInoSetMrecNeedsDirtying(ni);
	return err;
}

/**
 * ntfs_index_block_lay_out - lay out an index block into a memory buffer
 * @idx_ni:	index inode to which the index block will belong
 * @vcn:	vcn of index block
 * @ia:		destination buffer of size >= @idx_ni->block_size bytes
 *
 * Lay out an empty index allocation block with the @vcn into the buffer @ia.
1901 * The index inode @idx_ni is needed because we need to know the size of an 1902 * index block and the @vcn is needed because we need to record it in the index 1903 * block. 1904 * 1905 * Locking: Caller must hold @idx_ni->lock on the index inode for writing. 1906 */ 1907static void ntfs_index_block_lay_out(ntfs_inode *idx_ni, VCN vcn, 1908 INDEX_BLOCK *ia) 1909{ 1910 INDEX_HEADER *ih; 1911 INDEX_ENTRY *ie; 1912 u32 ie_ofs; 1913 1914 bzero(ia, idx_ni->block_size); 1915 ia->magic = magic_INDX; 1916 ia->usa_ofs = const_cpu_to_le16(sizeof(INDEX_BLOCK)); 1917 ia->usa_count = cpu_to_le16(1 + (idx_ni->block_size / NTFS_BLOCK_SIZE)); 1918 /* Set the update sequence number to 1. */ 1919 *(le16*)((u8*)ia + le16_to_cpu(ia->usa_ofs)) = cpu_to_le16(1); 1920 ia->index_block_vcn = cpu_to_sle64(vcn); 1921 ih = &ia->index; 1922 ie_ofs = (sizeof(INDEX_HEADER) + 1923 (le16_to_cpu(ia->usa_count) << 1) + 7) & ~7; 1924 ih->entries_offset = cpu_to_le32(ie_ofs); 1925 ih->index_length = cpu_to_le32(ie_ofs + sizeof(INDEX_ENTRY_HEADER)); 1926 ih->allocated_size = cpu_to_le32(idx_ni->block_size - 1927 offsetof(INDEX_BLOCK, index)); 1928 ih->flags = LEAF_NODE; 1929 ie = (INDEX_ENTRY*)((u8*)ih + ie_ofs); 1930 ie->length = const_cpu_to_le16(sizeof(INDEX_ENTRY_HEADER)); 1931 ie->flags = INDEX_ENTRY_END; 1932} 1933 1934/** 1935 * ntfs_index_block_alloc - allocate and return an index allocation block 1936 * @ictx: index context of the index for which to allocate a block 1937 * @dst_vcn: pointer in which to return the VCN of the allocated index block 1938 * @dst_ia: pointer in which to return the allocated index block 1939 * @dst_upl_ofs: pointer in which to return the mapped address of the page data 1940 * @dst_upl: pointer in which to return the page list containing @ia 1941 * @dst_pl: pointer in which to return the array of pages containing @ia 1942 * @dst_addr: pointer in which to return the mapped address 1943 * 1944 * Allocate an index allocation block for the index described by the index 
 * context @ictx and return it in *@dst_ia. *@dst_vcn, *@dst_upl_ofs,
 * *@dst_upl, *@dst_pl, and *@dst_addr are set to the VCN of the allocated
 * index block and the mapped page list and array of pages containing the
 * returned index block respectively.
 *
 * Return 0 on success and errno on error. On error *@dst_vcn, *@dst_ia,
 * *@dst_upl_ofs, *@dst_upl, *@dst_pl, and *@dst_addr are left untouched.
 *
 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
 *	      writing.
 *	    - All of the index contexts in the index must be unlocked (this
 *	      includes @ictx, i.e. @ictx must not be locked).
 */
static errno_t ntfs_index_block_alloc(ntfs_index_context *ictx, VCN *dst_vcn,
		INDEX_BLOCK **dst_ia, s64 *dst_upl_ofs, upl_t *dst_upl,
		upl_page_info_array_t *dst_pl, u8 **dst_addr)
{
	/*
	 * @bmp_pos is reused across three phases: first as a byte offset into
	 * the index bitmap attribute, then (after "allocated_bit") as the
	 * allocated bit number, and finally as the VCN of the index block.
	 */
	s64 bmp_pos, end_pos, init_size, upl_ofs;
	ntfs_inode *bmp_ni, *idx_ni = ictx->idx_ni;
	upl_t upl;
	upl_page_info_array_t pl;
	u8 *bmp, *bmp_end;
	INDEX_ALLOCATION *ia;
	errno_t err, err2;
	lck_rw_type_t lock;
	le16 usn;
	BOOL have_resized;
	u8 bit;

	ntfs_debug("Entering for inode 0x%llx.",
			(unsigned long long)idx_ni->mft_no);
	/*
	 * Get the index bitmap inode.
	 *
	 * Note we do not lock the bitmap inode as the caller holds an
	 * exclusive lock on the index inode thus the bitmap cannot change
	 * under us as the only places the index bitmap is modified is as a
	 * side effect of a locked index inode.
	 *
	 * The inode will be locked later if/when we are going to modify its
	 * size to ensure any relevant VM activity that may be happening at the
	 * same time is excluded.
	 *
	 * We need to take the ntfs inode lock on the bitmap inode for writing.
	 * This causes a complication because there is a valid case in which we
	 * can recurse (once) back into ntfs_index_block_alloc() by calling:
	 *	ntfs_attr_resize(bmp_ni) ->
	 *	ntfs_index_move_root_to_allocation_block() ->
	 *	ntfs_index_block_alloc()
	 * In this case we have to not take the lock again or we will deadlock
	 * (or with lock debugging enabled the kernel will panic()). Thus we
	 * set @ictx->bmp_is_locked so that when we get back here we know not
	 * to take the lock again.
	 */
	lock = 0;
	if (!ictx->bmp_is_locked) {
		lock = LCK_RW_TYPE_EXCLUSIVE;
		ictx->bmp_is_locked = 1;
	}
	/* @lock != 0 asks ntfs_attr_inode_get() to take the inode lock. */
	err = ntfs_attr_inode_get(ictx->base_ni, AT_BITMAP, idx_ni->name,
			idx_ni->name_len, FALSE, lock, &bmp_ni);
	if (err) {
		ntfs_error(idx_ni->vol->mp, "Failed to get index bitmap inode "
				"(error %d).", err);
		if (lock)
			ictx->bmp_is_locked = 0;
		goto err;
	}
	/* Find the first zero bit in the index bitmap. */
	bmp_pos = 0;
	lck_spin_lock(&bmp_ni->size_lock);
	end_pos = bmp_ni->initialized_size;
	lck_spin_unlock(&bmp_ni->size_lock);
	/* Walk the initialized part of the bitmap one page at a time. */
	for (bmp_pos = 0; bmp_pos < end_pos;) {
		err = ntfs_page_map(bmp_ni, bmp_pos, &upl, &pl, &bmp, TRUE);
		if (err) {
			ntfs_error(idx_ni->vol->mp, "Failed to read index "
					"bitmap (error %d).", err);
			goto put_err;
		}
		bmp_end = bmp + PAGE_SIZE;
		if (bmp_pos + PAGE_SIZE > end_pos)
			bmp_end = bmp + (end_pos - bmp_pos);
		/* Check the next bit(s).  @bmp_pos tracks the byte offset. */
		for (; bmp < bmp_end; bmp++, bmp_pos++) {
			if (*bmp == 0xff)
				continue;
			/*
			 * TODO: There does not appear to be a ffz() function
			 * in the kernel. )-: If/when the kernel has an ffz()
			 * function, switch the below code to use it.
			 *
			 * So emulate "ffz(x)" using "ffs(~x) - 1" which gives
			 * the same result but incurs extra CPU overhead.
			 */
			bit = ffs(~(unsigned long)*bmp) - 1;
			/* *bmp != 0xff so a zero bit must exist below bit 8. */
			if (bit < 8)
				goto allocated_bit;
		}
		ntfs_page_unmap(bmp_ni, upl, pl, FALSE);
	}
	/*
	 * There are no zero bits in the initialized part of the bitmap. Thus
	 * we extend it by 8 bytes and allocate the first bit of the extension.
	 */
	bmp_pos = end_pos;
	lck_spin_lock(&bmp_ni->size_lock);
	end_pos = bmp_ni->data_size;
	lck_spin_unlock(&bmp_ni->size_lock);
	/* If we are exceeding the bitmap size need to extend it. */
	have_resized = FALSE;
	if (bmp_pos + 8 >= end_pos) {
		ntfs_debug("Extending index bitmap, old size 0x%llx, "
				"requested size 0x%llx.",
				(unsigned long long)end_pos,
				(unsigned long long)end_pos + 8);
		err = ntfs_attr_resize(bmp_ni, end_pos + 8, 0, ictx);
		if (err) {
			ntfs_error(idx_ni->vol->mp, "Failed to extend index "
					"bitmap (error %d).", err);
			goto put_err;
		}
		have_resized = TRUE;
	}
	/*
	 * Get the page containing the bit we are allocating. Note this has to
	 * happen before we call ntfs_attr_set_initialized_size() as the latter
	 * only sets the initialized size without zeroing the area between the
	 * old initialized size and the new one thus we need to map the page
	 * now so that its tail is zeroed due to the old value of the
	 * initialized size.
	 */
	err = ntfs_page_map(bmp_ni, bmp_pos & ~PAGE_MASK_64, &upl, &pl, &bmp,
			TRUE);
	if (err) {
		ntfs_error(idx_ni->vol->mp, "Failed to read index bitmap "
				"(error %d).", err);
		/*
		 * There is no need to resize the bitmap back to its old size.
		 * It does not change the metadata consistency to have a bigger
		 * bitmap.
		 */
		goto put_err;
	}
	/* Point @bmp at the byte containing the bit within the mapped page. */
	bmp += (unsigned)bmp_pos & PAGE_MASK;
	/*
	 * Extend the initialized size if the bitmap is non-resident. If it is
	 * resident this was already done by the ntfs_attr_resize() call.
	 */
	if (NInoNonResident(bmp_ni)) {
#ifdef DEBUG
		lck_spin_lock(&bmp_ni->size_lock);
		init_size = bmp_ni->initialized_size;
		lck_spin_unlock(&bmp_ni->size_lock);
		ntfs_debug("Setting initialized size of index bitmap, old "
				"size 0x%llx, requested size 0x%llx.",
				(unsigned long long)init_size,
				(unsigned long long)end_pos + 8);
#endif
		err = ntfs_attr_set_initialized_size(bmp_ni, end_pos + 8);
		if (err) {
			ntfs_error(idx_ni->vol->mp, "Failed to update "
					"initialized size of index bitmap "
					"(error %d).", err);
			ntfs_page_unmap(bmp_ni, upl, pl, FALSE);
			goto put_err;
		}
	}
	/* Finally, allocate the bit in the page. */
	bit = 0;
	/*
	 * If we used ntfs_attr_resize() to extend the bitmap attribute it is
	 * possible that this caused the index root node to be moved out to an
	 * index allocation block which very likely will have allocated the
	 * very bit we want to allocate so we test for this case and if it has
	 * happened we allocate the next bit along which must be free or
	 * something has gone wrong.
	 */
	if (have_resized && *bmp & 1) {
		if (*bmp & 2)
			panic("%s(): *bmp & 2\n", __FUNCTION__);
		bit++;
	}
allocated_bit:
	*bmp |= 1 << bit;
	ntfs_page_unmap(bmp_ni, upl, pl, TRUE);
	/* Set @bmp_pos to the allocated index bitmap bit. */
	bmp_pos = (bmp_pos << 3) + bit;
	/*
	 * If we are caching the last set bit in the bitmap in the index inode
	 * and we allocated beyond the last set bit, update the cached value.
	 */
	if (idx_ni->last_set_bit >= 0 && bmp_pos > idx_ni->last_set_bit)
		idx_ni->last_set_bit = bmp_pos;
	ntfs_debug("Allocated index bitmap bit 0x%llx.", bmp_pos);
	/*
	 * We are done with the bitmap and have the index allocation to go.
	 *
	 * If the allocated bit is outside the data size need to extend it.
	 */
	lck_spin_lock(&idx_ni->size_lock);
	end_pos = idx_ni->data_size;
	lck_spin_unlock(&idx_ni->size_lock);
	if (bmp_pos >= end_pos >> idx_ni->block_size_shift) {
		ntfs_debug("Extending index allocation, old size 0x%llx, "
				"requested size 0x%llx.", (unsigned long long)
				end_pos, (unsigned long long)(bmp_pos + 1) <<
				idx_ni->block_size_shift);
		err = ntfs_attr_resize(idx_ni, (bmp_pos + 1) <<
				idx_ni->block_size_shift, 0, ictx);
		if (err) {
			ntfs_error(idx_ni->vol->mp, "Failed to extend index "
					"allocation (error %d).", err);
			goto alloc_err;
		}
	}
	/*
	 * Map the page containing the allocated index block. As above for the
	 * bitmap we need to get the page before we set the initialized size so
	 * that the tail of the page is zeroed for us.
	 */
	upl_ofs = (bmp_pos << idx_ni->block_size_shift) & ~PAGE_MASK_64;
	err = ntfs_page_map(idx_ni, upl_ofs, &upl, &pl, &bmp, TRUE);
	if (err) {
		ntfs_error(idx_ni->vol->mp, "Failed to read index allocation "
				"block (error %d).", err);
		/*
		 * There is no need to truncate the index allocation back to
		 * its old size. It does not change the metadata consistency
		 * to have a bigger index allocation.
		 */
		goto alloc_err;
	}
	/* Extend the initialized size if needed.
	 */
	lck_spin_lock(&idx_ni->size_lock);
	init_size = idx_ni->initialized_size;
	lck_spin_unlock(&idx_ni->size_lock);
	if (bmp_pos >= init_size >> idx_ni->block_size_shift) {
		ntfs_debug("Setting initialized size of index allocation, old "
				"size 0x%llx, requested size 0x%llx.",
				(unsigned long long)init_size,
				(unsigned long long)(bmp_pos + 1) <<
				idx_ni->block_size_shift);
		err = ntfs_attr_set_initialized_size(idx_ni,
				(bmp_pos + 1) << idx_ni->block_size_shift);
		if (err) {
			ntfs_error(idx_ni->vol->mp, "Failed to update "
					"initialized size of index allocation "
					"(error %d).", err);
			goto unm_err;
		}
	}
	/* Drop the bitmap inode lock if we were the ones who took it. */
	if (lock) {
		ictx->bmp_is_locked = 0;
		lck_rw_unlock_exclusive(&bmp_ni->lock);
	}
	(void)vnode_put(bmp_ni->vn);
	/* Lay out an empty index block into the allocated space. */
	ia = (INDEX_ALLOCATION*)(bmp + (((unsigned)bmp_pos <<
			idx_ni->block_size_shift) & PAGE_MASK));
	/* Preserve the update sequence number across the layout. */
	usn = 0;
	if (le16_to_cpu(ia->usa_ofs) < NTFS_BLOCK_SIZE - sizeof(u16))
		usn = *(le16*)((u8*)ia + le16_to_cpu(ia->usa_ofs));
	/* Calculate the index block vcn from the index block number. */
	*dst_vcn = bmp_pos = bmp_pos << idx_ni->block_size_shift >>
			idx_ni->vcn_size_shift;
	ntfs_index_block_lay_out(idx_ni, bmp_pos, ia);
	/* 0 and 0xffff are not valid update sequence numbers. */
	if (usn && usn != const_cpu_to_le16(0xffff))
		*(le16*)((u8*)ia + le16_to_cpu(ia->usa_ofs)) = usn;
	*dst_ia = ia;
	*dst_upl_ofs = upl_ofs;
	*dst_upl = upl;
	*dst_pl = pl;
	*dst_addr = bmp;
	ntfs_debug("Done (allocated index block with vcn 0x%llx.",
			(unsigned long long)bmp_pos);
	return 0;
unm_err:
	ntfs_page_unmap(idx_ni, upl, pl, FALSE);
alloc_err:
	/* Free the index bitmap bit that we allocated.
	 */
	err2 = ntfs_bitmap_clear_bit(bmp_ni, bmp_pos);
	if (err2) {
		ntfs_error(idx_ni->vol->mp, "Failed to undo index block "
				"allocation in index bitmap (error %d). "
				"Leaving inconsistent metadata. Run chkdsk.",
				err2);
		NVolSetErrors(idx_ni->vol);
	}
put_err:
	if (lock) {
		ictx->bmp_is_locked = 0;
		lck_rw_unlock_exclusive(&bmp_ni->lock);
	}
	(void)vnode_put(bmp_ni->vn);
err:
	ntfs_error(idx_ni->vol->mp, "Failed (error %d).", err);
	return err;
}

/**
 * ntfs_index_ctx_disconnect_reinit - disconnect and reinit an index context
 * @ictx:	index context to disconnect
 *
 * Disconnect the index context @ictx from its tree path and reinitialize its
 * @up and @down pointers to point to itself.
 *
 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode.
 */
static inline void ntfs_index_ctx_disconnect_reinit(ntfs_index_context *ictx)
{
	ntfs_index_ctx_disconnect(ictx);
	ictx->down = ictx->up = ictx;
}

/**
 * ntfs_index_move_root_to_allocation_block - move index root to allocation
 * @ictx:	index lookup context describing index root to move
 *
 * Move the index root to an index allocation block in the process switching
 * the index from a small to a large index if necessary.
 *
 * If no index allocation and/or bitmap attributes exist they are created.
 *
 * An index block is then allocated and if necessary the index bitmap and/or
 * the allocation attributes are extended in the process.
 *
 * Finally all index entries contained in the index root attribute are moved
 * into the allocated index block in the index allocation attribute.
 *
 * We need to potentially create the index allocation and/or bitmap attributes
 * so we can move the entries from the index root attribute to the index
 * allocation attribute and then shrink the index root attribute.
However, 2279 * there is not enough space in the mft record to do this. Also we already 2280 * have the index root attribute looked up so it makes sense to deal with it 2281 * first. 2282 * 2283 * Thus, if there is no index allocation attribute, we allocate the space to be 2284 * used by the index allocation attribute and setup the directory inode for a 2285 * large index including the allocated space but leaving the initialized size 2286 * to zero. We then map and lock the first page containing the now allocated 2287 * first index block and move the index entries from the index root into it. 2288 * We then shrink the index root and set it up to point to the allocated index 2289 * block. 2290 * 2291 * Having shrunk the index root attribute there is now hopefully enough space 2292 * in the mft record to create the index allocation attribute and the index 2293 * bitmap attribute in the mft record and the conversion is complete. 2294 * 2295 * If there is not enough space we create the index allocation and/or index 2296 * bitmap attributes in an extent mft record. TODO: This is not implemented 2297 * yet. 2298 * 2299 * If there is an index allocation attribute already, we allocate a temporary 2300 * buffer to hold the index block and then copy from there into the allocated 2301 * index block later. 2302 * 2303 * Return 0 on success and errno on error. On error the index context is no 2304 * longer usable and must be released or reinitialized. 2305 * 2306 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for 2307 * writing. 2308 * - The index context @ictx must be locked. 
2309 */ 2310errno_t ntfs_index_move_root_to_allocation_block(ntfs_index_context *ictx) 2311{ 2312 VCN vcn; 2313 ntfs_inode *base_ni, *idx_ni; 2314 ntfs_volume *vol; 2315 ntfs_index_context *ir_ictx; 2316 upl_t upl; 2317 upl_page_info_array_t pl; 2318 INDEX_ALLOCATION *ia; 2319 INDEX_HEADER *ih; 2320 INDEX_ROOT *ir; 2321 INDEX_ENTRY *ie; 2322 ntfs_attr_search_ctx *actx; 2323 MFT_RECORD *m; 2324 u32 u, ia_ie_ofs, ir_ie_ofs, clusters = 0; 2325 errno_t err, err2; 2326 struct { 2327 unsigned is_large_index:1; 2328 } old; 2329 BOOL need_ubc_setsize = FALSE; 2330 2331 ntfs_debug("Entering."); 2332 if (!ictx->is_locked) 2333 panic("%s(): !ictx->is_locked\n", __FUNCTION__); 2334 base_ni = ictx->base_ni; 2335 idx_ni = ictx->idx_ni; 2336 vol = base_ni->vol; 2337 /* 2338 * The current index context is going to turn from describing the index 2339 * root to describing the newly allocated index block. Thus, allocate 2340 * a new index context for the emptied index root. 2341 */ 2342 ir_ictx = ntfs_index_ctx_get(idx_ni); 2343 if (!ir_ictx) 2344 return ENOMEM; 2345 ir_ictx->entries = OSMalloc(ictx->max_entries * sizeof(INDEX_ENTRY*), 2346 ntfs_malloc_tag); 2347 if (!ir_ictx->entries) { 2348 err = ENOMEM; 2349 goto err; 2350 } 2351 /* 2352 * If there is no index allocation attribute, we need to allocate an 2353 * index block worth of clusters. Thus if the cluster size is less 2354 * than the index block size the number of clusters we need to allocate 2355 * is the index block size divided by the cluster size and if the 2356 * cluster size is greater than or equal to the index block size we 2357 * simply allocate one cluster. 2358 * 2359 * Otherwise we allocate a temporary buffer for the index block. 2360 */ 2361 if (!NInoIndexAllocPresent(idx_ni)) { 2362 clusters = idx_ni->block_size >> vol->cluster_size_shift; 2363 if (!clusters) 2364 clusters = 1; 2365 /* 2366 * We cannot lock the runlist as we are holding the mft record 2367 * lock. 
That is fortunately ok as we also have @idx_ni->lock 2368 * on the index inode and there is no index allocation 2369 * attribute at present so no-one can be using the runlist yet. 2370 */ 2371 err = ntfs_cluster_alloc(vol, 0, clusters, -1, DATA_ZONE, TRUE, 2372 &idx_ni->rl); 2373 if (err) { 2374 ntfs_error(vol->mp, "Failed to allocate %u clusters " 2375 "for the index allocation block " 2376 "(error %d).", (unsigned)clusters, 2377 err); 2378 goto err; 2379 } 2380 /* Allocate/get the first page and zero it out. */ 2381 ntfs_page_grab(idx_ni, 0, &upl, &pl, (u8**)&ia, TRUE); 2382 if (err) { 2383 ntfs_error(vol->mp, "Failed to grab first page."); 2384 goto page_err; 2385 } 2386 bzero(ia, PAGE_SIZE); 2387 } else { 2388 /* Allocate a temporary buffer and zero it out. */ 2389 ia = OSMalloc(idx_ni->block_size, ntfs_malloc_tag); 2390 if (!ia) { 2391 err = ENOMEM; 2392 goto err; 2393 } 2394 bzero(ia, idx_ni->block_size); 2395 upl = NULL; 2396 pl = NULL; 2397 } 2398 /* 2399 * Set up the index block and copy the index entries from the index 2400 * root. 2401 */ 2402 ia->magic = magic_INDX; 2403 ia->usa_ofs = const_cpu_to_le16(sizeof(INDEX_BLOCK)); 2404 ia->usa_count = cpu_to_le16(1 + (idx_ni->block_size / NTFS_BLOCK_SIZE)); 2405 /* Set the update sequence number to 1. */ 2406 *(le16*)((u8*)ia + le16_to_cpu(ia->usa_ofs)) = cpu_to_le16(1); 2407 ih = &ia->index; 2408 ia_ie_ofs = (sizeof(INDEX_HEADER) + 2409 (le16_to_cpu(ia->usa_count) << 1) + 7) & ~7; 2410 ih->entries_offset = cpu_to_le32(ia_ie_ofs); 2411 /* Work out the size of the index entries in the index root. */ 2412 ir = ictx->ir; 2413 ir_ie_ofs = le32_to_cpu(ir->index.entries_offset); 2414 /* 2415 * FIXME: Should we scan through the index root looking for the last 2416 * entry and use that to determine the size instead? Then we could fix 2417 * any space being wasted due to a too long index for its entries. 
2418 */ 2419 u = le32_to_cpu(ir->index.index_length) - ir_ie_ofs; 2420 ih->index_length = cpu_to_le32(ia_ie_ofs + u); 2421 ih->allocated_size = cpu_to_le32(idx_ni->block_size - 2422 offsetof(INDEX_BLOCK, index)); 2423 ih->flags = LEAF_NODE; 2424 ie = (INDEX_ENTRY*)((u8*)&ir->index + ir_ie_ofs); 2425 memcpy((u8*)ih + ia_ie_ofs, ie, u); 2426 /* 2427 * If the index root is a large index, then the index block is an index 2428 * node, i.e. not a leaf node. 2429 */ 2430 if (ir->index.flags & LARGE_INDEX) { 2431 old.is_large_index = 1; 2432 ih->flags |= INDEX_NODE; 2433 } 2434 /* 2435 * Set up the index context to point into the index allocation block 2436 * instead of into the index root. 2437 */ 2438 ictx->entry = (INDEX_ENTRY*)((u8*)ih + ia_ie_ofs + 2439 ((u8*)ictx->entry - (u8*)ie)); 2440 ictx->is_root = 0; 2441 actx = ictx->actx; 2442 ictx->ia = ia; 2443 ictx->vcn = 0; 2444 ictx->index = ih; 2445 ictx->upl_ofs = 0; 2446 ictx->upl = upl; 2447 ictx->pl = pl; 2448 ictx->addr = (u8*)ia; 2449 ictx->bytes_free = le32_to_cpu(ia->index.allocated_size) - 2450 le32_to_cpu(ia->index.index_length); 2451 /* 2452 * We have copied the index entries and switched the index context so 2453 * we can go ahead and shrink the index root by moving the end entry on 2454 * top of the first entry. We only need to do the move if the first 2455 * index root entry is not the end entry, i.e. the index is not empty. 2456 */ 2457 ih = &ir->index; 2458 if (!(ie->flags & INDEX_ENTRY_END)) { 2459 u8 *index_end; 2460 INDEX_ENTRY *end_ie; 2461 2462 index_end = (u8*)&ir->index + le32_to_cpu(ih->index_length); 2463 for (end_ie = ie;; end_ie = (INDEX_ENTRY*)((u8*)end_ie + 2464 le16_to_cpu(end_ie->length))) { 2465 /* Bounds checks. 
*/ 2466 if ((u8*)end_ie < (u8*)ie || (u8*)end_ie + 2467 sizeof(INDEX_ENTRY_HEADER) > 2468 index_end || (u8*)end_ie + 2469 le16_to_cpu(end_ie->length) > 2470 index_end || 2471 (u32)sizeof(INDEX_ENTRY_HEADER) + 2472 le16_to_cpu(end_ie->key_length) > 2473 le16_to_cpu(end_ie->length)) { 2474 ntfs_error(vol->mp, "Corrupt index. Run " 2475 "chkdsk."); 2476 NVolSetErrors(vol); 2477 err = EIO; 2478 goto ictx_err; 2479 } 2480 /* Stop when we have found the last entry. */ 2481 if (end_ie->flags & INDEX_ENTRY_END) 2482 break; 2483 } 2484 memmove(ie, end_ie, le16_to_cpu(end_ie->length)); 2485 } 2486 /* 2487 * If the end entry is a leaf we need to extend it by 8 bytes in order 2488 * to turn it into a node entry. To do this we need to extend the 2489 * index root attribute itself in the case that the index root was 2490 * empty to start with or we need to do nothing if the index root was 2491 * not empty as we have not shrunk the index root attribute yet and we 2492 * moved the end index entry forward by at least one index entry which 2493 * is much bigger than 8 bytes. 2494 * 2495 * We do the index root attribute resize unconditionally because it 2496 * takes care of the size increase needed in the empty index root case 2497 * as well as of the size decrease needed in the non-empty index root 2498 * case. 2499 */ 2500retry_resize: 2501 u = le16_to_cpu(ie->length); 2502 if (!(ie->flags & INDEX_ENTRY_NODE)) 2503 u += sizeof(VCN); 2504 err = ntfs_resident_attr_value_resize(actx->m, actx->a, 2505 offsetof(INDEX_ROOT, index) + 2506 le32_to_cpu(ih->entries_offset) + u); 2507 if (err) { 2508 leMFT_REF mref; 2509 ATTR_RECORD *a; 2510 ntfschar *a_name; 2511 ATTR_LIST_ENTRY *al_entry; 2512 u8 *al_end; 2513 ntfs_attr_search_ctx ctx; 2514 2515 /* 2516 * This can only happen when the first entry is the last entry, 2517 * i.e. 
this is an empty directory and thus the above memmove() 2518 * has not been done, in which case we only need eight bytes 2519 * (sizeof(VCN)) more space which means that if we manage to 2520 * move out a single attribute out of the mft record we would 2521 * gain that much space. In the worst case scenario we can 2522 * always move out the index root attribute itself in which 2523 * case we are guaranteed to have enough space as an empty 2524 * index root is much smaller than an mft record. The only 2525 * complication is when there is no attribute list attribute as 2526 * we have to add it first. On the bright side that in itself 2527 * can cause some space to be freed up an we may have enough to 2528 * extend the index root by eight bytes. 2529 * 2530 * Add an attribute list attribute if it is not present. 2531 */ 2532 if (!NInoAttrList(base_ni)) { 2533 /* 2534 * Take a copy of the current attribute record pointer 2535 * so we can update all our pointers below. 2536 */ 2537 a = actx->a; 2538 err = ntfs_attr_list_add(base_ni, actx->m, actx); 2539 if (err || actx->is_error) { 2540 if (!err) 2541 err = actx->error; 2542 ntfs_error(vol->mp, "Failed to add attribute " 2543 "list attribute to mft_no " 2544 "0x%llx (error %d).", 2545 (unsigned long long) 2546 base_ni->mft_no, err); 2547 goto ictx_err; 2548 } 2549update_and_retry_resize: 2550 /* 2551 * Need to update our cached pointers as the index root 2552 * attribute is likely to have moved. 2553 */ 2554 ir = (INDEX_ROOT*)((u8*)actx->a + 2555 le16_to_cpu(actx->a->value_offset)); 2556 ih = &ir->index; 2557 ir_ie_ofs = le32_to_cpu(ir->index.entries_offset); 2558 ie = (INDEX_ENTRY*)((u8*)&ir->index + ir_ie_ofs); 2559 /* 2560 * Retry the resize in case we freed eight bytes in the 2561 * mft record which is all we need. 2562 */ 2563 goto retry_resize; 2564 } 2565 /* 2566 * If this is the only attribute record in the mft record we 2567 * cannot gain anything by moving it or anything else. 
This 2568 * really cannot happen as we have emptied the index root thus 2569 * have made the attribute as small as possible thus it must 2570 * fit just fine in an otherwise empty mft record thus we must 2571 * have enough space to do the resize thus we cannot have 2572 * gotten here. 2573 */ 2574 if (ntfs_attr_record_is_only_one(actx->m, actx->a)) 2575 panic("%s(): ntfs_attr_record_is_only_one()\n", 2576 __FUNCTION__); 2577 /* 2578 * We now know we have an attribute list attribute and that we 2579 * still do not have enough space to extend the index root 2580 * attribute by eight bytes. 2581 * 2582 * First, if the index root attribute is already in an extent 2583 * mft record move it into a new, empty extent mft record. 2584 * This is guaranteed to give us the needed eight bytes. 2585 * 2586 * Second, look through the mft record to see if there are any 2587 * attribute records we can move out into an extent mft record 2588 * and if yes move one. That will also definitely give us the 2589 * needed eight bytes of space. 2590 * 2591 * Finally, if no attributes are available for moving then move 2592 * the index root attribute itself. 2593 */ 2594 if (actx->ni != base_ni) { 2595move_idx_root: 2596 lck_rw_lock_shared(&base_ni->attr_list_rl.lock); 2597 err = ntfs_attr_record_move(actx); 2598 lck_rw_unlock_shared(&base_ni->attr_list_rl.lock); 2599 if (err) { 2600 ntfs_error(vol->mp, "Failed to move index " 2601 "root attribute from mft " 2602 "record 0x%llx to an extent " 2603 "mft record (error %d).", 2604 (unsigned long long) 2605 actx->ni->mft_no, err); 2606 goto ictx_err; 2607 } 2608 /* 2609 * Need to update our cached pointers as the index root 2610 * attribute has now moved after which we need to retry 2611 * the resize which will now succeed. 2612 */ 2613 goto update_and_retry_resize; 2614 } 2615 /* 2616 * We know the index root is in the base mft record. 
2617 * 2618 * Look through the base mft record to see if there are any 2619 * attribute records we can move out into an extent mft record 2620 * and if yes move one. That will also definitely give us the 2621 * needed eight bytes of space. 2622 */ 2623 ntfs_attr_search_ctx_init(&ctx, base_ni, actx->m); 2624 ctx.is_iteration = 1; 2625 do { 2626 err = ntfs_attr_find_in_mft_record(AT_UNUSED, NULL, 0, 2627 NULL, 0, &ctx); 2628 if (err) { 2629 if (err == ENOENT) { 2630 /* 2631 * No attributes are available for 2632 * moving thus move the index root 2633 * attribute out of the base mft record 2634 * into a new extent. 2635 * 2636 * TODO: Need to get this case 2637 * triggered and then need to run 2638 * chkdsk to check for validity of 2639 * moving the index root attribute out 2640 * of the base mft record. 2641 */ 2642 goto move_idx_root; 2643 } 2644 ntfs_error(vol->mp, "Failed to iterate over " 2645 "attribute records in base " 2646 "mft record 0x%llx (error %d).", 2647 (unsigned long long) 2648 base_ni->mft_no, err); 2649 goto ictx_err; 2650 } 2651 /* 2652 * Skip the standard information attribute, the 2653 * attribute list attribute, and the index root 2654 * attribute as we are looking for lower priority 2655 * attributes to move out and the attribute list 2656 * attribute is of course not movable. 2657 */ 2658 a = ctx.a; 2659 } while (a->type == AT_STANDARD_INFORMATION || 2660 a->type == AT_ATTRIBUTE_LIST || 2661 a->type == AT_INDEX_ROOT); 2662 /* 2663 * Move the found attribute out to an extent mft record and 2664 * update its attribute list entry. 2665 * 2666 * But first find the attribute list entry matching the 2667 * attribute record so it can be updated. 
2668 */ 2669 a_name = (ntfschar*)((u8*)a + le16_to_cpu(a->name_offset)); 2670 al_entry = (ATTR_LIST_ENTRY*)base_ni->attr_list; 2671 al_end = (u8*)al_entry + base_ni->attr_list_size; 2672 mref = MK_LE_MREF(base_ni->mft_no, base_ni->seq_no); 2673 do { 2674 if ((u8*)al_entry >= al_end || !al_entry->length) { 2675 ntfs_error(vol->mp, "Attribute list attribute " 2676 "in mft_no 0x%llx is " 2677 "corrupt. Run chkdsk.", 2678 (unsigned long long) 2679 base_ni->mft_no); 2680 err = EIO; 2681 goto ictx_err; 2682 } 2683 if (al_entry->mft_reference == mref && 2684 al_entry->instance == a->instance) { 2685 /* 2686 * We found the entry, stop looking but first 2687 * perform a quick sanity check that we really 2688 * do have the correct attribute record. 2689 */ 2690 if (al_entry->type == a->type && 2691 ntfs_are_names_equal( 2692 (ntfschar*)((u8*)al_entry + 2693 al_entry->name_offset), 2694 al_entry->name_length, a_name, 2695 a->name_length, TRUE, 2696 vol->upcase, vol->upcase_len)) 2697 break; 2698 ntfs_error(vol->mp, "Found corrupt attribute " 2699 "list attribute when looking " 2700 "for attribute type 0x%x in " 2701 "attribute list attribute of " 2702 "base mft record 0x%llx. Run " 2703 "chkdsk.", 2704 (unsigned)le32_to_cpu(a->type), 2705 (unsigned long long) 2706 base_ni->mft_no); 2707 NVolSetErrors(vol); 2708 err = EIO; 2709 goto ictx_err; 2710 } 2711 /* Go to the next attribute list entry. */ 2712 al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry + 2713 le16_to_cpu(al_entry->length)); 2714 } while (1); 2715 /* Finally, move the attribute to an extent record. */ 2716 err = ntfs_attr_record_move_for_attr_list_attribute(&ctx, 2717 al_entry, actx, NULL); 2718 if (err) { 2719 ntfs_error(vol->mp, "Failed to move attribute type " 2720 "0x%x out of base mft record 0x%llx " 2721 "and into an extent mft record (error " 2722 "%d). 
Run chkdsk.", 2723 (unsigned)le32_to_cpu(a->type), 2724 (unsigned long long)base_ni->mft_no, 2725 err); 2726 NVolSetErrors(vol); 2727 goto ictx_err; 2728 } 2729 /* 2730 * Sync the modified attribute list attribute value to 2731 * metadata/disk. 2732 */ 2733 ntfs_attr_search_ctx_reinit(&ctx); 2734 err = ntfs_attr_list_sync(base_ni, (u8*)al_entry - 2735 base_ni->attr_list, &ctx); 2736 if (!err) { 2737 /* 2738 * Need to update our cached pointers as the index root 2739 * attribute has likely moved after which we need to 2740 * retry the resize which will now succeed. 2741 */ 2742 goto update_and_retry_resize; 2743 } 2744 /* 2745 * FIXME: We could try and revert the move of the attribute and 2746 * the attribute list attribute value change but that is a lot 2747 * of work and the only real errors that could happen at this 2748 * stage are either that we have run out of memory or that the 2749 * disk has gone bad or has been disconnected or there must be 2750 * bad memory corruption. In all those cases we are extremely 2751 * unlikely to be able to undo what we have done so far due to 2752 * further errors so at least for now we do not bother trying. 2753 */ 2754 ntfs_error(vol->mp, "Failed to sync attribute list attribute " 2755 "in mft_no 0x%llx (error %d). Leaving " 2756 "corrupt metadata. Run chkdsk.", 2757 (unsigned long long)base_ni->mft_no, err); 2758 /* 2759 * Need to update our cached pointers as the index root 2760 * attribute has likely moved. 2761 */ 2762 ir = (INDEX_ROOT*)((u8*)actx->a + 2763 le16_to_cpu(actx->a->value_offset)); 2764 ih = &ir->index; 2765 ir_ie_ofs = le32_to_cpu(ir->index.entries_offset); 2766 ie = (INDEX_ENTRY*)((u8*)&ir->index + ir_ie_ofs); 2767 goto ictx_err; 2768 } 2769 /* 2770 * Update the end index entry and the index header to reflect the 2771 * changes in size. 
2772 */ 2773 ie->length = cpu_to_le16(u); 2774 ih->allocated_size = ih->index_length = cpu_to_le32( 2775 le32_to_cpu(ih->entries_offset) + u); 2776 /* 2777 * Update the index root and end index entry to reflect the fact that 2778 * the directory now is a large index and that the end index entry 2779 * points to a sub-node and set the sub-node pointer to vcn 0. 2780 */ 2781 ih->flags |= LARGE_INDEX; 2782 ie->flags |= INDEX_ENTRY_NODE; 2783 *(leVCN*)((u8*)ie + le16_to_cpu(ie->length) - sizeof(VCN)) = 0; 2784 /* 2785 * Now setup the new index context for the emptied index root and 2786 * attach it at the start of the tree path. 2787 */ 2788 ir_ictx->entries[0] = ir_ictx->entry = ie; 2789 ir_ictx->is_root = 1; 2790 ir_ictx->is_locked = 1; 2791 ir_ictx->ir = ir; 2792 ir_ictx->index = ih; 2793 ir_ictx->actx = actx; 2794 ir_ictx->bytes_free = le32_to_cpu(actx->m->bytes_allocated) - 2795 le32_to_cpu(actx->m->bytes_in_use); 2796 ir_ictx->max_entries = ictx->max_entries; 2797 ir_ictx->nr_entries = 1; 2798 /* Ensure the modified index root attribute is written to disk. */ 2799 ntfs_index_entry_mark_dirty(ir_ictx); 2800 /* 2801 * Attach the new index context between the current index context 2802 * (which is the top of the tree) and the index context above it (which 2803 * is the bottom of the tree), i.e. make the new index context the top 2804 * of the tree. 2805 */ 2806 ictx->up->down = ir_ictx; 2807 ir_ictx->down = ictx; 2808 ir_ictx->up = ictx->up; 2809 ictx->up = ir_ictx; 2810 /* 2811 * If the index allocation attribute is not present yet, we need to 2812 * create it and the index bitmap attribute. 2813 */ 2814 if (upl) { 2815 /* 2816 * The page is now done, mark the index context as dirty so it 2817 * gets written out later. 2818 */ 2819 ntfs_index_entry_mark_dirty(ictx); 2820 /* 2821 * Set up the index inode to reflect that it has an index 2822 * allocation attribute. 
2823 */ 2824 NInoSetIndexAllocPresent(idx_ni); 2825 lck_spin_lock(&idx_ni->size_lock); 2826 idx_ni->allocated_size = (s64)clusters << 2827 vol->cluster_size_shift; 2828 idx_ni->initialized_size = idx_ni->data_size = 2829 idx_ni->block_size; 2830 lck_spin_unlock(&idx_ni->size_lock); 2831 if (idx_ni->name == I30) { 2832 lck_spin_lock(&base_ni->size_lock); 2833 base_ni->allocated_size = idx_ni->allocated_size; 2834 base_ni->initialized_size = base_ni->data_size = 2835 idx_ni->block_size; 2836 lck_spin_unlock(&base_ni->size_lock); 2837 } 2838 if (!ubc_setsize(idx_ni->vn, idx_ni->data_size)) 2839 panic("%s(): ubc_setsize() failed.\n", __FUNCTION__); 2840 /* 2841 * Find the position at which we need to insert the index 2842 * allocation attribute. 2843 */ 2844 err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, idx_ni->name, 2845 idx_ni->name_len, 0, NULL, 0, actx); 2846 if (err != ENOENT) { 2847 if (!err) { 2848 ntfs_error(vol->mp, "Index allocation " 2849 "attribute already present " 2850 "when it should not be. " 2851 "Corrupt index or driver " 2852 "bug. Run chkdsk."); 2853 NVolSetErrors(vol); 2854 err = EIO; 2855 } else 2856 ntfs_error(vol->mp, "Failed to look up " 2857 "position at which to insert " 2858 "the index allocation " 2859 "attribute (error %d).", err); 2860 goto ia_err; 2861 } 2862 /* Insert the index allocation attribute. */ 2863 err = ntfs_insert_index_allocation_attribute(base_ni, actx, 2864 idx_ni); 2865 if (err || actx->is_error) { 2866 ntfs_error(vol->mp, "Failed to %s mft_no 0x%llx " 2867 "(error %d).", actx->is_error ? 2868 "remap extent mft record of" : 2869 "insert index allocation attribute in ", 2870 (unsigned long long)base_ni->mft_no, 2871 err); 2872 goto ia_err; 2873 } 2874 /* 2875 * Ensure the created index allocation attribute is written to 2876 * disk. 2877 */ 2878 NInoSetMrecNeedsDirtying(actx->ni); 2879 /* 2880 * Find the position at which we need to insert the index 2881 * bitmap attribute. 
2882 */ 2883 ntfs_attr_search_ctx_reinit(actx); 2884 err = ntfs_attr_lookup(AT_BITMAP, idx_ni->name, 2885 idx_ni->name_len, 0, NULL, 0, actx); 2886 if (err != ENOENT) { 2887 if (!err) { 2888 ntfs_error(vol->mp, "Index bitmap attribute " 2889 "attribute already present " 2890 "when it should not be. " 2891 "Corrupt index or driver " 2892 "bug. Run chkdsk."); 2893 NVolSetErrors(vol); 2894 err = EIO; 2895 } else 2896 ntfs_error(vol->mp, "Failed to look up " 2897 "position at which to insert " 2898 "the index bitmap attribute " 2899 "(error %d).", err); 2900 goto bmp_err; 2901 } 2902 /* 2903 * Insert the index bitmap attribute as a resident attribute 2904 * with a value length sufficient to cover the number of 2905 * allocated index blocks rounded up to a multiple of 8 bytes 2906 * which are initialized to zero. We then set the first bit in 2907 * the bitmap to reflect the fact that the first index 2908 * allocation block is in use. 2909 */ 2910 err = ntfs_resident_attr_record_insert(base_ni, actx, 2911 AT_BITMAP, idx_ni->name, idx_ni->name_len, 2912 NULL, (((idx_ni->allocated_size >> 2913 idx_ni->block_size_shift) + 63) >> 3) & 2914 ~(s64)7); 2915 if (err || actx->is_error) { 2916 if (!err) 2917 err = actx->error; 2918 ntfs_error(vol->mp, "Failed to %s mft_no 0x%llx " 2919 "(error %d).", actx->is_error ? 2920 "remap extent mft record of" : 2921 "insert index bitmap attribute in", 2922 (unsigned long long)base_ni->mft_no, 2923 err); 2924 goto bmp_err; 2925 } 2926 *((u8*)actx->a + le16_to_cpu(actx->a->value_offset)) = 1; 2927 /* 2928 * Ensure the created index bitmap attribute is written to 2929 * disk. 2930 */ 2931 NInoSetMrecNeedsDirtying(actx->ni); 2932 /* 2933 * We successfully created the index allocation and bitmap 2934 * attributes thus we can set up our cache of the last set bit 2935 * in the bitmap in the index inode to 0 to reflect the fact 2936 * that we just allocated the first index block. 
2937 */ 2938 idx_ni->last_set_bit = 0; 2939 } 2940 /* 2941 * We are either completely done and can release the attribute search 2942 * context and the mft record or we now need to call functions which 2943 * will deadlock with us if we do not release the attribute search 2944 * context and the mft record so we have to release them now thus we 2945 * now unlock the index root context. 2946 */ 2947 ntfs_index_ctx_unlock(ir_ictx); 2948 if (upl) { 2949 INDEX_ENTRY **entries; 2950 long delta; 2951 unsigned i; 2952 2953update_ie_pointers: 2954 /* 2955 * Note we still hold the locked and referenced index 2956 * allocation page which is now attached to the index context 2957 * and will be released later when the index context is 2958 * released. 2959 * 2960 * We need to update the index entry pointers in the array to 2961 * point into the index block instead of the old index root. 2962 */ 2963 entries = ictx->entries; 2964 delta = (u8*)ictx->entry - (u8*)entries[ictx->entry_nr]; 2965 for (i = 0; i < ictx->nr_entries; i++) 2966 entries[i] = (INDEX_ENTRY*)((u8*)entries[i] + delta); 2967 if (ictx->entry != entries[ictx->entry_nr]) 2968 panic("%s(): ictx->entry != entries[ictx->entry_nr]\n", 2969 __FUNCTION__); 2970 ntfs_debug("Done."); 2971 return 0; 2972 } 2973 /* 2974 * We have an index allocation attribute already thus we need to 2975 * allocate an index block from it for us to transfer the temporary 2976 * index block to. 2977 * 2978 * We need to set @is_locked to zero because we are using the temporary 2979 * buffer for @ia and hence @upl and @pl are zero thus as far as the 2980 * index context code is concerned the context is not actually locked 2981 * at all. Not doing this can lead to a panic() in 2982 * ntfs_index_block_alloc() under some circumstances due to an 2983 * assertion check that verifies that @is_locked is zero in 2984 * ntfs_attr_resize(). 
2985 */ 2986 ictx->is_locked = 0; 2987 err = ntfs_index_block_alloc(ictx, &vcn, &ia, &ictx->upl_ofs, &upl, 2988 &ictx->pl, &ictx->addr); 2989 if (err) { 2990 ntfs_error(vol->mp, "Failed to allocate index allocation " 2991 "block (error %d).", err); 2992 goto alloc_err; 2993 } 2994 /* Copy the update sequence number into our temporary buffer. */ 2995 *(le16*)((u8*)ictx->ia + le16_to_cpu(ictx->ia->usa_ofs)) = 2996 *(le16*)((u8*)ia + le16_to_cpu(ia->usa_ofs)); 2997 /* 2998 * Copy our temporary buffer into the allocated index block, free the 2999 * temporary buffer and setup the index context to point to the 3000 * allocated index block instead of the temporary one. 3001 */ 3002 memcpy(ia, ictx->ia, idx_ni->block_size); 3003 OSFree(ictx->ia, idx_ni->block_size, ntfs_malloc_tag); 3004 ictx->entry = ie = (INDEX_ENTRY*)((u8*)ia + 3005 ((u8*)ictx->entry - (u8*)ictx->ia)); 3006 ictx->ia = ia; 3007 ictx->index = &ia->index; 3008 ictx->upl = upl; 3009 ictx->is_locked = 1; 3010 /* 3011 * If the vcn of the allocated index block is not zero, we need to 3012 * update the vcn in the index block itself as well as the sub-node 3013 * vcn pointer in the index root attribute. 3014 */ 3015 if (vcn) { 3016 ictx->vcn = vcn; 3017 ia->index_block_vcn = cpu_to_sle64(vcn); 3018 ictx->upl_ofs = (vcn << idx_ni->vcn_size_shift) & 3019 ~PAGE_MASK_64; 3020 /* Get hold of the mft record for the index inode. */ 3021 err = ntfs_mft_record_map(base_ni, &m); 3022 if (err) { 3023 /* 3024 * The only thing that is now incorrect is the vcn 3025 * sub-node pointer in the empty index root attribute 3026 * but we cannot correct it as we are failing to map 3027 * the mft record which we need to be able to rollback. 3028 * We leave it to chkdsk to sort this out later. 3029 */ 3030 ntfs_error(vol->mp, "Cannot rollback partial index " 3031 "upgrade because " 3032 "ntfs_mft_record_map() failed (error " 3033 "%d). Leaving inconsistent " 3034 "metadata. 
Run chkdsk.", err); 3035 goto map_err; 3036 } 3037 actx = ntfs_attr_search_ctx_get(base_ni, m); 3038 if (!actx) { 3039 err = ENOMEM; 3040 ntfs_error(vol->mp, "Cannot rollback partial index " 3041 "upgrade because there was not enough " 3042 "memory to obtain an attribute search " 3043 "context. Leaving inconsistent " 3044 "metadata. Run chkdsk."); 3045 goto actx_err; 3046 } 3047 /* Find the index root attribute in the mft record. */ 3048 err = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name, 3049 idx_ni->name_len, 0, NULL, 0, actx); 3050 if (err) 3051 goto lookup_err; 3052 /* Get to the index root value. */ 3053 ir = (INDEX_ROOT*)((u8*)actx->a + 3054 le16_to_cpu(actx->a->value_offset)); 3055 /* The first index entry which is also the last one. */ 3056 ie = (INDEX_ENTRY*)((u8*)&ir->index + 3057 le32_to_cpu(ir->index.entries_offset)); 3058 if (!(ie->flags & INDEX_ENTRY_END)) 3059 panic("%s(): !(ie->flags & INDEX_ENTRY_END)\n", 3060 __FUNCTION__); 3061 if (!(ie->flags & INDEX_ENTRY_NODE)) 3062 panic("%s(): !(ie->flags & INDEX_ENTRY_NODE)\n", 3063 __FUNCTION__); 3064 /* Finally, update the vcn pointer of the index entry. */ 3065 *(leVCN*)((u8*)ie + le16_to_cpu(ie->length) - sizeof(VCN)) = 3066 cpu_to_sle64(vcn); 3067 /* 3068 * Ensure the updated index entry is written to disk and 3069 * release the attribute search context and the mft record. 3070 */ 3071 NInoSetMrecNeedsDirtying(actx->ni); 3072 ntfs_attr_search_ctx_put(actx); 3073 ntfs_mft_record_unmap(base_ni); 3074 } 3075 /* 3076 * The page is now done, mark the index context as dirty so it gets 3077 * written out later. 3078 */ 3079 ntfs_index_entry_mark_dirty(ictx); 3080 goto update_ie_pointers; 3081lookup_err: 3082 ntfs_error(vol->mp, "Cannot rollback partial index upgrade because we " 3083 "failed to look up the index root attribute (error " 3084 "%d). Leaving inconsistent metadata. 
Run chkdsk.", 3085 err); 3086 if (err == ENOENT) 3087 err = EIO; 3088 ntfs_attr_search_ctx_put(actx); 3089actx_err: 3090 ntfs_mft_record_unmap(base_ni); 3091map_err: 3092 /* 3093 * The page is now done, mark the index context as dirty so it gets 3094 * written out later. 3095 */ 3096 ntfs_index_entry_mark_dirty(ictx); 3097 goto err; 3098alloc_err: 3099 /* 3100 * We need to get hold of the mft record and get a new search context 3101 * so we can restore the index root attribute. 3102 */ 3103 err2 = ntfs_mft_record_map(base_ni, &m); 3104 if (err2) { 3105 ntfs_error(vol->mp, "Cannot rollback partial index upgrade " 3106 "because ntfs_mft_record_map() failed (error " 3107 "%d). Leaving inconsistent metadata. Run " 3108 "chkdsk.", err2); 3109undo_alloc_err2: 3110 /* 3111 * Mark the index context invalid given it neither has an index 3112 * block and page nor an index root and mft record attached. 3113 */ 3114 ictx->entry = NULL; 3115 goto undo_alloc_err; 3116 } 3117 actx = ntfs_attr_search_ctx_get(base_ni, m); 3118 if (!actx) { 3119 ntfs_error(vol->mp, "Cannot rollback partial index upgrade " 3120 "because there was not enough memory to " 3121 "obtain an attribute search context. Leaving " 3122 "inconsistent metadata. Run chkdsk."); 3123 ntfs_mft_record_unmap(base_ni); 3124 goto undo_alloc_err2; 3125 } 3126 goto restore_ir; 3127bmp_err: 3128 ntfs_attr_search_ctx_reinit(actx); 3129 err2 = ntfs_attr_lookup(AT_INDEX_ALLOCATION, idx_ni->name, 3130 idx_ni->name_len, 0, NULL, 0, actx); 3131 if (err2) { 3132 ntfs_error(vol->mp, "Cannot rollback partial index upgrade " 3133 "because looking up the index allocation " 3134 "attribute failed (error %d). Leaving " 3135 "inconsistent metadata. Run chkdsk.", err2); 3136bmp_err_err: 3137 NVolSetErrors(vol); 3138 /* 3139 * Everything is actually consistent except for the fact that 3140 * the index bitmap attribute is missing so it is better if we 3141 * do not do any kind of rollback. 
Hopefully, chkdsk will fix 3142 * this by adding the missing index bitmap attribute. 3143 */ 3144 goto err; 3145 } 3146 /* Remove the added index allocation attribute. */ 3147 err2 = ntfs_attr_record_delete(base_ni, actx); 3148 if (err2) { 3149 ntfs_error(vol->mp, "Cannot rollback partial index upgrade " 3150 "because deleting the index allocation " 3151 "attribute failed (error %d). Leaving " 3152 "inconsistent metadata. Run chkdsk.", err2); 3153 goto bmp_err_err; 3154 } 3155ia_err: 3156 /* Reset the inode. */ 3157 lck_spin_lock(&idx_ni->size_lock); 3158 idx_ni->initialized_size = idx_ni->data_size = idx_ni->allocated_size = 3159 0; 3160 lck_spin_unlock(&idx_ni->size_lock); 3161 if (idx_ni->name == I30) { 3162 lck_spin_lock(&base_ni->size_lock); 3163 base_ni->initialized_size = base_ni->data_size = 3164 base_ni->allocated_size = 0; 3165 lck_spin_unlock(&base_ni->size_lock); 3166 } 3167 NInoClearIndexAllocPresent(idx_ni); 3168 /* 3169 * Cannot call ubc_setsize() because we have the page pinned so delay 3170 * until we dump the page later on. 3171 */ 3172 need_ubc_setsize = TRUE; 3173 /* 3174 * Pretend the @ir_ictx is not locked as we are going to transfer the 3175 * index root back to @ictx. 3176 */ 3177 ir_ictx->is_locked = 0; 3178 ir_ictx->actx = NULL; 3179 /* Restore the index root attribute. */ 3180 ntfs_attr_search_ctx_reinit(actx); 3181restore_ir: 3182 err2 = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name, idx_ni->name_len, 3183 0, NULL, 0, actx); 3184 if (err2) { 3185 ntfs_error(vol->mp, "Cannot rollback partial index upgrade " 3186 "because looking up the index root attribute " 3187 "failed (error %d). Leaving inconsistent " 3188 "metadata. 
Run chkdsk.", err2); 3189 NVolSetErrors(vol); 3190 goto ictx_err; 3191 } 3192 ir = (INDEX_ROOT*)((u8*)actx->a + le16_to_cpu(actx->a->value_offset)); 3193 ih = &ir->index; 3194 ie = (INDEX_ENTRY*)((u8*)ih + ir_ie_ofs); 3195 u = ir_ie_ofs + (le32_to_cpu(ia->index.index_length) - ia_ie_ofs); 3196 err2 = ntfs_resident_attr_value_resize(actx->m, actx->a, 3197 offsetof(INDEX_ROOT, index) + u); 3198 if (err2) { 3199 ntfs_error(vol->mp, "Cannot rollback partial index upgrade " 3200 "because resizing the index root attribute " 3201 "failed (error %d). Leaving inconsistent " 3202 "metadata. Run chkdsk.", err2); 3203 NVolSetErrors(vol); 3204 goto ictx_err; 3205 } 3206 if (!old.is_large_index) 3207 ih->flags &= ~LARGE_INDEX; 3208 ih->allocated_size = ih->index_length = cpu_to_le32(u); 3209 memcpy(ie, (u8*)&ia->index + ia_ie_ofs, 3210 le32_to_cpu(ia->index.index_length) - ia_ie_ofs); 3211 /* Ensure the restored index root attribute is written to disk. */ 3212 NInoSetMrecNeedsDirtying(actx->ni); 3213ictx_err: 3214 /* 3215 * Reset the index context. We may be setting @ictx->entry to a bogus 3216 * value but it does not matter because we are returning an error code 3217 * thus the caller must not use the index context and while the value 3218 * may be bogus it is correctly non-NULL thus the index context will be 3219 * cleaned up correctly when the caller releases it. 3220 */ 3221 ictx->entry = ictx->entries[ictx->entry_nr]; 3222 ictx->is_root = 1; 3223 ictx->is_locked = 1; 3224 ictx->ir = ir; 3225 ictx->index = ih; 3226 ictx->actx = actx; 3227 if (!actx->is_error) 3228 ir_ictx->bytes_free = le32_to_cpu(actx->m->bytes_allocated) - 3229 le32_to_cpu(actx->m->bytes_in_use); 3230 if (!upl) { 3231undo_alloc_err: 3232 OSFree(ia, idx_ni->block_size, ntfs_malloc_tag); 3233 } else { 3234 /* Destroy the page. 
*/ 3235 ntfs_page_dump(idx_ni, upl, pl); 3236 if (need_ubc_setsize && !ubc_setsize(idx_ni->vn, 0)) 3237 panic("%s(): ubc_setsize() failed.\n", __FUNCTION__); 3238page_err: 3239 err2 = ntfs_cluster_free_from_rl(vol, idx_ni->rl.rl, 0, -1, 3240 NULL); 3241 if (err2) { 3242 ntfs_error(vol->mp, "Failed to rollback cluster " 3243 "allocation (error %d). Run chkdsk " 3244 "to recover the lost space.", err2); 3245 NVolSetErrors(vol); 3246 } 3247 err2 = ntfs_rl_truncate_nolock(vol, &idx_ni->rl, 0); 3248 if (err2) 3249 panic("%s(): err2\n", __FUNCTION__); 3250 } 3251err: 3252 /* 3253 * Dissociate the allocated index root context from the tree path and 3254 * throw it away. We do this here unconditionally as it works both for 3255 * the case where we have not attached it to the tree path yet and for 3256 * the case where we have already attached it to the tree path. We 3257 * have to deal with the former case here so cannot defer the cleanup 3258 * to the ntfs_index_ctx_put() of the index context @ictx that will be 3259 * done by the caller. 3260 */ 3261 ntfs_index_ctx_disconnect_reinit(ir_ictx); 3262 ntfs_index_ctx_put(ir_ictx); 3263 ntfs_error(vol->mp, "Failed (error %d).", err); 3264 return err; 3265} 3266 3267/** 3268 * ntfs_index_root_make_space - make space for an index entry in the index root 3269 * @ictx: index entry in front of which to make space 3270 * @ie_size: size of the index entry to make space for 3271 * 3272 * Return 0 on success and ENOSPC if there is not enough space in the mft 3273 * record to insert an index entry of size @ie_size in the index root 3274 * attribute. 3275 * 3276 * Note that we do not update the array of index entry pointers nor the number 3277 * of entries in the array because on success it means that the index entry 3278 * being added will be created and the function is then done, i.e. 
the array of 3279 * index entry pointers will not be used any more and on error we have not done 3280 * anything at all so there is no need to update any of the pointers. 3281 * 3282 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for 3283 * writing. 3284 * - The index context @ictx must be locked. 3285 */ 3286static errno_t ntfs_index_root_make_space(ntfs_index_context *ictx, 3287 const u32 ie_size) 3288{ 3289 MFT_RECORD *m = ictx->actx->m; 3290 const u32 muse = le32_to_cpu(m->bytes_in_use); 3291 const u32 new_muse = muse + ie_size; 3292 3293 ntfs_debug("Entering."); 3294 if (new_muse <= le32_to_cpu(m->bytes_allocated)) { 3295 INDEX_ENTRY *ie = ictx->entry; 3296 ATTR_RECORD *a; 3297 INDEX_HEADER *ih; 3298 3299 /* 3300 * Extend the index root attribute so it has enough space for 3301 * the new index entry. As an optimization we combine the 3302 * resizing of the index root attribute and the moving of index 3303 * entries within the attribute into a single operation. 3304 */ 3305 memmove((u8*)ie + ie_size, ie, muse - ((u8*)ie - (u8*)m)); 3306 /* Adjust the mft record to reflect the change in used space. */ 3307 m->bytes_in_use = cpu_to_le32(new_muse); 3308 /* 3309 * Adjust the attribute record to reflect the changes in the 3310 * size of the attribute record and in the size of the 3311 * attribute value. 3312 */ 3313 a = ictx->actx->a; 3314 a->length = cpu_to_le32(le32_to_cpu(a->length) + ie_size); 3315 a->value_length = cpu_to_le32(le32_to_cpu(a->value_length) + 3316 ie_size); 3317 /* Adjust the index header to reflect the change in length. 
*/ 3318 ih = ictx->index; 3319 ih->allocated_size = ih->index_length = cpu_to_le32( 3320 le32_to_cpu(ih->index_length) + ie_size); 3321 ntfs_debug("Done."); 3322 return 0; 3323 } 3324 ntfs_debug("Failed (not enough space in mft record to insert index " 3325 "entry into index root attribute)."); 3326 return ENOSPC; 3327} 3328 3329/** 3330 * ntfs_index_block_make_space - make space for an index entry in an index block 3331 * @ictx: index entry in front of which to make space 3332 * @ie_size: size of the index entry to make space for 3333 * 3334 * Return 0 on success and ENOSPC if there is not enough space in the index 3335 * block to insert an index entry of size @ie_size. 3336 * 3337 * Note that we do not update the array of index entry pointers nor the number 3338 * of entries in the array because on success it means that the index entry 3339 * being added will be created and the function is then done, i.e. the array of 3340 * index entry pointers will not be used any more and on error we have not done 3341 * anything at all so there is no need to update any of the pointers. 3342 * 3343 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for 3344 * writing. 3345 * - The index context @ictx must be locked. 3346 */ 3347static errno_t ntfs_index_block_make_space(ntfs_index_context *ictx, 3348 const u32 ie_size) 3349{ 3350 INDEX_HEADER *ih = ictx->index; 3351 const u32 ilen = le32_to_cpu(ih->index_length); 3352 const u32 new_ilen = ilen + ie_size; 3353 3354 ntfs_debug("Entering."); 3355 if (new_ilen <= le32_to_cpu(ih->allocated_size)) { 3356 INDEX_ENTRY *ie = ictx->entry; 3357 3358 /* Move the index entries to make space for the new entry. */ 3359 memmove((u8*)ie + ie_size, ie, ilen - ((u8*)ie - (u8*)ih)); 3360 /* Adjust the index header to reflect the change in length. 
*/ 3361 ih->index_length = cpu_to_le32(new_ilen); 3362 ntfs_debug("Done."); 3363 return 0; 3364 } 3365 ntfs_debug("Failed (not enough space in index block to insert index " 3366 "entry)."); 3367 return ENOSPC; 3368} 3369 3370/** 3371 * ntfs_index_node_make_space - make space for an index entry in an index node 3372 * @ictx: index entry in front of which to make space 3373 * @ie_size: size of the index entry to make space for 3374 * 3375 * Return 0 on success and ENOSPC if there is not enough space in the index 3376 * node to insert an index entry of size @ie_size. 3377 * 3378 * Note that we do not update the array of index entry pointers nor the number 3379 * of entries in the array because on success it means that the index entry 3380 * being added will be created and the function is then done, i.e. the array of 3381 * index entry pointers will not be used any more and on error we have not done 3382 * anything at all so there is no need to update any of the pointers. 3383 * 3384 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for 3385 * writing. 3386 * - The index context @ictx must be locked. 3387 */ 3388static errno_t ntfs_index_node_make_space(ntfs_index_context *ictx, 3389 const u32 ie_size) 3390{ 3391 errno_t err; 3392 3393 if (ictx->is_root) 3394 err = ntfs_index_root_make_space(ictx, ie_size); 3395 else 3396 err = ntfs_index_block_make_space(ictx, ie_size); 3397 return err; 3398} 3399 3400/** 3401 * ntfs_index_ctx_revalidate - revalidate the pointers in an index context 3402 * @new: index context with the new mapped virtual address 3403 * @old: index context to revalidate 3404 * 3405 * Revalidate all pointers in the index context @old and adjust them for the 3406 * changed mapped virtual address in the index context @new. 3407 * 3408 * Note both @new and @old must be in the same VM page. 
 *
 * We need to revalidate all the pointers in @old just as if we were locking it
 * because @new may have been mapped into a different virtual address than @old
 * was last time thus the pointers in @old may be stale.
 *
 * To check if revalidation is needed is done by comparing @old->addr and
 * @new->addr.  If they match all is ok and if not then need to revalidate @old
 * with the new address of @new->addr.
 *
 * Locking: - Caller must hold @new->idx_ni->lock on the index inode.
 *	    - The index context @new must be locked.
 *	    - The index context @old must not be locked.
 */
static void ntfs_index_ctx_revalidate(ntfs_index_context *new,
		ntfs_index_context *old)
{
	long delta;
	INDEX_ENTRY **entries;
	unsigned u;

	/*
	 * Enforce the contract: @new is a locked index block context and @old
	 * is an unlocked index block context (neither may be the index root).
	 */
	if (!new->is_locked)
		panic("%s(): !new->is_locked\n", __FUNCTION__);
	if (new->is_root)
		panic("%s(): new->is_root\n", __FUNCTION__);
	if (old->is_locked)
		panic("%s(): old->is_locked\n", __FUNCTION__);
	if (old->is_root)
		panic("%s(): old->is_root\n", __FUNCTION__);
	/* If the mapped address has not changed the pointers are all valid. */
	delta = new->addr - old->addr;
	if (!delta)
		return;
	/* Revalidation is needed.  Rebase every cached pointer by @delta. */
	old->entry = (INDEX_ENTRY*)((u8*)old->entry + delta);
	old->ia = (INDEX_ALLOCATION*)((u8*)old->ia + delta);
	old->index = &old->ia->index;
	old->addr = new->addr;
	entries = old->entries;
	for (u = 0; u < old->nr_entries; u++)
		entries[u] = (INDEX_ENTRY*)((u8*)entries[u] + delta);
}

/**
 * ntfs_index_ctx_lock_two - lock two index contexts in a deadlock-free fashion
 * @a:	first index context to lock (this may be locked)
 * @b:	second index context to lock (this must not be locked)
 *
 * Lock the two index contexts @a and @b.  @a may already be locked and it may
 * need to be unlocked if it has to be locked after @b is locked.
 *
 * The following lock ordering rules are applied:
 *
 * - An index block node must be locked before an index root node.
 *
 * - Two index block nodes must be locked in ascending page offset order.
 *
 * - Two index block nodes that are physically in the same VM page must share
 *   the lock.  The way this is implemented is that @a is really locked and @b
 *   is not actually locked but all its pointers are revalidated as if it were
 *   locked.  This is ok because @a is locked and therefore the VM page is
 *   mapped, locked, and pinned and will not go anywhere until we are done.
 *   The reason we need to do the revalidation is because @b when it was
 *   mapped previously may have been mapped at a different virtual address
 *   than the one @a is mapped at now.  This means that when you are unlocking
 *   @b you must check if @b is locked and only unlock it if so.
 *
 * Return 0 on success or errno on error.  On error @a and @b may be both left
 * unlocked or one can be left locked and one unlocked.
 */
static errno_t ntfs_index_ctx_lock_two(ntfs_index_context *a,
		ntfs_index_context *b)
{
	errno_t err;

	if (b->is_locked)
		panic("%s(): b->is_locked\n", __FUNCTION__);
	if (a->is_root) {
		/*
		 * @a is the index root so it has to be locked second.
		 *
		 * Unlock @a if it is already locked.
		 */
		if (a->is_locked)
			ntfs_index_ctx_unlock(a);
	} else if (b->is_root) {
		/* @b is the index root.  If @a is not locked, lock it. */
		if (!a->is_locked) {
			err = ntfs_index_ctx_relock(a);
			if (err)
				return err;
		}
	} else {
		/*
		 * Both @a and @b are index blocks.
		 *
		 * Do we need to share the lock because both index nodes are
		 * in the same VM page?
		 */
		if (a->upl_ofs == b->upl_ofs) {
			/* Same page: really lock @a, only revalidate @b. */
			if (!a->is_locked) {
				err = ntfs_index_ctx_relock(a);
				if (err)
					return err;
			}
			ntfs_index_ctx_revalidate(a, b);
			return 0;
		}
		if (a->is_locked) {
			/* Do we have to unlock @a before locking @b? */
			if (a->upl_ofs > b->upl_ofs)
				ntfs_index_ctx_unlock(a);
		} else {
			/* Do we need to lock @a first? */
			if (a->upl_ofs < b->upl_ofs) {
				err = ntfs_index_ctx_relock(a);
				if (err)
					return err;
			}
		}
	}
	/* Lock @b. */
	err = ntfs_index_ctx_relock(b);
	/* If @a is currently locked or there was an error we are done. */
	if (a->is_locked || err)
		return err;
	/*
	 * We unlocked @a so we could lock @b or @a was not locked.
	 *
	 * Lock @a and we are done.
	 */
	return ntfs_index_ctx_relock(a);
}

/**
 * ntfs_index_entry_add_or_node_split - add a key to an index
 * @ictx:	index context specifying the node to split/position to add at
 * @split_only:	if true do not insert, only split the index node
 * @entry_size:	size of the entry that will be added after the split
 * @key:	key to add to the directory or view index
 * @key_len:	size of key @key in bytes
 * @data:	data to associate with the key @key if a view index
 * @data_len:	size of data @data in bytes if a view index
 *
 * If @split_only is true, split the index node pointed to by @ictx.  @ictx
 * also points to the entry in the index node at which an entry will be
 * inserted after the split is completed.  @entry_size is the size of the
 * entry that will be added at that position.  These are used so that the
 * split is performed with consideration with the insertion that is likely to
 * come.  And if the insertion does not happen it does not matter much, it
 * just means the split was not quite on the median entry but off by one.
 *
 * In this case @key, @key_len, @data, and @data_len are ignored.
 *
 * If @split_only is false @entry_size is ignored and @key, @key_len, @data,
 * and @data_len are used.
 *
 * In this case, if @ictx belongs to a directory index, insert the filename
 * attribute @key of length @key_len bytes in the directory index at the
 * position specified by the index context @ictx and point the inserted index
 * entry at the mft reference *@data which is the mft reference of the inode to
 * which the filename @key belongs.  @data_len must be zero in this case.
 *
 * If @ictx belongs to a view index, insert the key @key of length @key_len
 * bytes in the view index at the position specified by the index context @ictx
 * and associate the data @data of size @data_len bytes with the key @key.
 *
 * Return 0 on success and errno on error.  On error the index context is no
 * longer usable and must be released or reinitialized.
 *
 * Note that we do not update the array of index entry pointers nor the number
 * of entries in the array because on success it means that the index node has
 * been split and the function is done, i.e. the array of index entry pointers
 * will not be used any more and on error the index context becomes invalid so
 * there is no need to update any of the pointers.  The caller is expected to
 * restart its operations by doing a new index lookup if it wants to continue
 * working on the index for that reason.
 *
 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
 *	      writing.
 *	    - The index context @ictx must be locked.
3588 */ 3589errno_t ntfs_index_entry_add_or_node_split(ntfs_index_context *ictx, 3590 const BOOL split_only, u32 entry_size, const void *key, 3591 const u32 key_len, const void *data, const u32 data_len) 3592{ 3593 VCN vcn = 0; 3594 ntfs_index_context *cur_ictx, *child_ictx; 3595 ntfs_inode *bmp_ni, *idx_ni = ictx->idx_ni; 3596 u32 data_ofs = 0; 3597 errno_t err, err2; 3598 const BOOL is_view = (idx_ni->name != I30); 3599 3600 ntfs_debug("Entering."); 3601 if (!ictx->is_locked) 3602 panic("%s(): !ictx->is_locked\n", __FUNCTION__); 3603 /* 3604 * If this is only a split @entry_size contains the size of the entry 3605 * and the @key and @data are not set (and not needed/used). 3606 * 3607 * If this is an addition, @entry_size is not set and we need to work 3608 * it out from the supplied @key and @data. 3609 */ 3610 if (!split_only) { 3611 /* 3612 * Calculate the size for the index entry to be added. For a 3613 * directory index, the mft reference @data is embedded inside 3614 * the directory index entry thus we only need space for the 3615 * index entry header and the filename attribute @key. 3616 * 3617 * For a view index, the data follows the key aligned to a 3618 * 4-byte boundary. 3619 * 3620 * As an additional complication, the $SDH index in the $Secure 3621 * system file contains an extra magic which is not counted in 3622 * the data length @data_len so we need to add it by hand here. 3623 */ 3624 entry_size = sizeof(INDEX_ENTRY_HEADER) + key_len; 3625 if (is_view) { 3626 data_ofs = (entry_size + 4) & ~4; 3627 entry_size = data_ofs + data_len; 3628 if (idx_ni == idx_ni->vol->secure_sdh_ni) 3629 entry_size += sizeof(((SDH_INDEX_DATA*)NULL)-> 3630 magic); 3631 } 3632 /* 3633 * Align the index entry size to an 8-byte boundary and add 3634 * another 8 bytes to the entry size if the insertion is to 3635 * happen in an index node. 3636 */ 3637 entry_size = ((entry_size + 7) & ~7) + 3638 ((ictx->index->flags & INDEX_NODE) ? 
3639 sizeof(leVCN): 0); 3640 } 3641 /* Set the current entry to be the entry to be added to the index. */ 3642 cur_ictx = ictx; 3643 cur_ictx->promote_inserted = 0; 3644 cur_ictx->insert_to_add = 1; 3645 cur_ictx->insert_entry_size = entry_size; 3646 /* 3647 * We need to loop over each index node on the tree path starting at 3648 * the bottom. For each traversed node, we check if the current entry 3649 * to be inserted fits and if it does not we need to split that node. 3650 * We make the arrangements to be able to do so and then move onto the 3651 * next node up the tree and so on until we find a node which does have 3652 * enough space to fit the index entry to be inserted. We can then 3653 * break out of the loop and begin work on the actual addition of the 3654 * entry and the splitting of the so marked nodes. 3655 */ 3656 while (cur_ictx->insert_entry_size > cur_ictx->bytes_free) { 3657 ntfs_index_context *insert_ictx; 3658 unsigned median_entry_nr, insert_entry_nr, entry_nr; 3659 u32 insert_entry_size; 3660 BOOL insert_to_add; 3661 3662 /* 3663 * The entry to be inserted into this node @cur_ictx is 3664 * specified by its @insert_entry and @insert_entry_size. 3665 */ 3666 if (!cur_ictx->is_locked) 3667 panic("%s(): !cur_ictx->is_locked\n", __FUNCTION__); 3668 /* 3669 * We do not have enough space in the current node to insert 3670 * the entry to be inserted. 3671 * 3672 * If the current node is the index root we need to move the 3673 * index root to an index block and if successful restart the 3674 * loop to determine if we now have enough space to insert the 3675 * entry. 3676 * 3677 * This causes the B+tree to grow in height by one and is in 3678 * fact the only operation that can cause the tree to grow in 3679 * height. All other operations can only cause the tree to 3680 * grow in width. 
3681 */ 3682 if (cur_ictx->is_root) { 3683 ntfs_debug("Moving index root into index allocation " 3684 "block."); 3685 /* 3686 * If we speculatively locked the index node containing 3687 * the index entry to insert into the index root, 3688 * unlock it now so we can move the index root to an 3689 * index block. 3690 */ 3691 insert_ictx = cur_ictx->insert_ictx; 3692 if (insert_ictx && insert_ictx->is_locked) { 3693 if (insert_ictx->is_root) 3694 panic("%s(): insert_ictx->is_root\n", 3695 __FUNCTION__); 3696 ntfs_index_ctx_unlock(insert_ictx); 3697 } 3698 err = ntfs_index_move_root_to_allocation_block( 3699 cur_ictx); 3700 if (!err) 3701 continue; 3702 ntfs_error(idx_ni->vol->mp, "Failed to move index " 3703 "root to index allocation block " 3704 "(error %d).", err); 3705 goto err; 3706 } 3707 ntfs_debug("Need to split index block with VCN 0x%llx.", 3708 (unsigned long long)cur_ictx->vcn); 3709 /* 3710 * We do not have enough space in the current node which we now 3711 * know is an index allocation block. We have to split the 3712 * node and promote the median entry into the parent node which 3713 * may in turn involve a split. We do not perform the split 3714 * but instead work out what needs to be done and allocate any 3715 * needed index blocks in advance so we cannot run out of disk 3716 * space and/or memory half-way through and only then do we do 3717 * the actual work on the index nodes. This preparation 3718 * involves the following steps: 3719 * 3720 * 1. Work out the median entry to be promoted into the parent 3721 * node of the current node and unlock the current node. 3722 * 2. If the entry to be promoted is not the entry to be 3723 * inserted, make a note of the fact that the entry to be 3724 * inserted is to be inserted into the current node in the 3725 * index context. 3726 * 3. Allocate a new index block and make a note of it in the 3727 * current index context. 3728 * 4. Set the parent node as the current node. 3729 * 5. 
Set the median entry up as the current entry to be 3730 * inserted. 3731 * 3732 * Then go round the loop again checking whether there is 3733 * enough space to insert the now current entry into the now 3734 * current node... 3735 * 3736 * First of all, mark the node as needing to be split. 3737 */ 3738 cur_ictx->split_node = 1; 3739 /* 3740 * 1. Determine the median index entry. 3741 * 3742 * Note we exclude the end entry from the median calculation 3743 * because it is not eligible for promotion thus we should use 3744 * @ictx->nr_entries - 1 but we need to take into account the 3745 * not yet inserted entry for which we are doing the split in 3746 * the first place so that is a + 1 and we also start counting 3747 * entries at 0 and not 1 thus we apply a - 1 for that. Thus 3748 * we have - 1 + 1 - 1 = -1. 3749 */ 3750 median_entry_nr = (cur_ictx->nr_entries - 1) / 2; 3751 /* 3752 * @entry_nr is the index into the array of index entry 3753 * pointers of the index entry in front of which the entry to 3754 * be inserted @cur_ictx->insert_entry needs to be inserted. 3755 */ 3756 entry_nr = cur_ictx->entry_nr; 3757 /* 3758 * If the position at which to insert the entry to be inserted 3759 * is the median promote the entry to be inserted. If the 3760 * number of entries is even there are two possible medians. 3761 * We choose the first one for simplicity. 3762 * 3763 * If the entry to be inserted is before the median, subtract 1 3764 * from the median. 3765 * 3766 * Otherwise promote the median entry. 3767 * 3768 * The only exception is when @split_only is true and the 3769 * current node is the node we are meant to split in which 3770 * case there is no entry to be inserted and thus it cannot be 3771 * promoted. In this case we recalculate the median ignoring 3772 * the entry to be inserted and promote that. 
3773 */ 3774 if (entry_nr == median_entry_nr && (!split_only || 3775 cur_ictx != ictx)) { 3776 insert_to_add = cur_ictx->insert_to_add; 3777 insert_entry_nr = cur_ictx->insert_entry_nr; 3778 insert_entry_size = cur_ictx->insert_entry_size; 3779 insert_ictx = cur_ictx->insert_ictx; 3780 /* 3781 * 2. The entry to be promoted is the entry to be 3782 * inserted, make a note of that fact. 3783 */ 3784 cur_ictx->promote_inserted = 1; 3785 } else { 3786 if (entry_nr < median_entry_nr) 3787 median_entry_nr--; 3788 else if (entry_nr == median_entry_nr) { 3789 if (!split_only) 3790 panic("%s(): !split_only\n", 3791 __FUNCTION__); 3792 if (cur_ictx != ictx) 3793 panic("%s(): cur_ictx != ictx\n", 3794 __FUNCTION__); 3795 /* 3796 * We must have at least one real entry or 3797 * there is nothing to promote. This cannot 3798 * happen unless something has gone wrong. 3799 */ 3800 if (cur_ictx->nr_entries < 2) 3801 panic("%s(): cur_ictx->nr_entries < " 3802 "2\n", __FUNCTION__); 3803 median_entry_nr = (cur_ictx->nr_entries - 2) / 3804 2; 3805 } 3806 insert_to_add = FALSE; 3807 insert_entry_nr = median_entry_nr; 3808 insert_entry_size = le16_to_cpu(cur_ictx-> 3809 entries[median_entry_nr]->length); 3810 insert_ictx = cur_ictx; 3811 } 3812 /* 3813 * If this is the very first promotion and we are promoting a 3814 * leaf entry we need to add 8 bytes to the size of the entry 3815 * being promoted to allow for the VCN sub-node pointer that 3816 * will be added at the end of the entry when it is inserted 3817 * into the parent index node. 3818 */ 3819 if (cur_ictx == ictx && 3820 (!(cur_ictx->index->flags & INDEX_NODE) ? 3821 sizeof(leVCN): 0)) 3822 insert_entry_size += sizeof(VCN); 3823 /* 3824 * Record which entry is being promoted, i.e. where we need to 3825 * perform the split of the node. 
3826 */ 3827 cur_ictx->promote_entry_nr = median_entry_nr; 3828 // TODO: Possible optimization: Allow ntfs_index_block_alloc() 3829 // to do the unlocking and thus to consume the lock if the new 3830 // index block is in the same page as the current index block. 3831 if (!cur_ictx->is_locked) 3832 panic("%s(): !cur_ictx->is_locked\n", __FUNCTION__); 3833 ntfs_index_ctx_unlock(cur_ictx); 3834 /* 3835 * 3. Allocate a new index block and make a note of it in the 3836 * current index context. 3837 * 3838 * The call may cause the index root attribute to have its 3839 * entries moved out to an index block. That is fine as we 3840 * have not looked at any of our parent entries yet and the 3841 * index root must be above us given we are a child node. 3842 */ 3843 err = ntfs_index_block_alloc(cur_ictx, &cur_ictx->right_vcn, 3844 &cur_ictx->right_ia, &cur_ictx->right_upl_ofs, 3845 &cur_ictx->right_upl, &cur_ictx->right_pl, 3846 &cur_ictx->right_addr); 3847 if (err) { 3848 ntfs_error(idx_ni->vol->mp, "Failed to allocate index " 3849 "allocation block (error %d).", err); 3850 goto err; 3851 } 3852 ntfs_debug("Allocated index block for right-hand node with " 3853 "VCN 0x%llx.", (unsigned long long) 3854 cur_ictx->right_vcn); 3855 /* We need to unmap the newly allocated index block. */ 3856 ntfs_page_unmap(idx_ni, cur_ictx->right_upl, 3857 cur_ictx->right_pl, TRUE); 3858 cur_ictx->right_upl = NULL; 3859 /* 4. Set the parent node as the current node and lock it. */ 3860 cur_ictx = cur_ictx->up; 3861 if (cur_ictx->is_locked) 3862 panic("%s(): cur_ictx->is_locked\n", __FUNCTION__); 3863 if (cur_ictx->is_root) { 3864 /* 3865 * We have reached the index root. We speculatively 3866 * lock the index node containing the index entry to 3867 * insert into the index root, as we cannot lock it 3868 * once we have locked the mft record (which happens 3869 * when we lock the current index context below) as 3870 * that would cause lock reversal and thus deadlocks. 
3871 */ 3872 if (insert_ictx) { 3873 if (insert_ictx->is_root) 3874 panic("%s(): insert_ictx->is_root\n", 3875 __FUNCTION__); 3876 if (insert_ictx->is_locked) 3877 panic("%s(): insert_ictx->is_locked\n", 3878 __FUNCTION__); 3879 err = ntfs_index_ctx_relock(insert_ictx); 3880 if (err) 3881 goto err; 3882 } 3883 } 3884 /* Lock the current index context. */ 3885 err = ntfs_index_ctx_relock(cur_ictx); 3886 if (err) 3887 goto err; 3888 /* 3889 * 5. Set the median entry up as the current entry to be 3890 * inserted. 3891 */ 3892 cur_ictx->insert_to_add = insert_to_add; 3893 cur_ictx->insert_entry_nr = insert_entry_nr; 3894 cur_ictx->insert_entry_size = insert_entry_size; 3895 cur_ictx->insert_ictx = insert_ictx; 3896 } 3897 /* 3898 * Get the child node index context if we are not at the bottom of the 3899 * tree already or rather at the node for which we were called (which 3900 * may not actually be at the bottom of the tree). 3901 */ 3902 child_ictx = NULL; 3903 if (cur_ictx != ictx) { 3904 child_ictx = cur_ictx->down; 3905 if (child_ictx->is_root) 3906 panic("%s(): child_ictx->is_root\n", __FUNCTION__); 3907 } 3908 /* 3909 * If the node we were called for was the index root then it will have 3910 * been moved to an index allocation block which will likely have 3911 * created enough space to fit the entry to be inserted thus if this is 3912 * a split only call nothing further needs to be done. 3913 */ 3914 if (cur_ictx == ictx && split_only) { 3915 ntfs_debug("Done (index root was upgraded to index " 3916 "allocation, no further split needed)."); 3917 return 0; 3918 } 3919 /* 3920 * We now have prepared everything so we are guaranteed not to run out 3921 * of disk space and can begin doing the work. 3922 * 3923 * TODO: We can still fail if relocking of an index context fails but 3924 * that can really only happen under extreme memory pressure and there 3925 * is not much we can do about that without <rdar://problem/4992358> 3926 * being implemented. 
Thus for now we simply bomb out with an error 3927 * message and leave the index potentially badly corrupted and also we 3928 * potentially leave some allocated index blocks actually unused. This 3929 * is highly unsatisfactory but rolling back once the modifications 3930 * have begun is pretty much impossible without keeping a lot more 3931 * state so we do not implement rollback. Once the aforementioned 3932 * radar is implemented we will no longer suffer from this problem and 3933 * it will no longer be possible to fail once we get here. 3934 * 3935 * We start with the current node which is the top-most node we need to 3936 * deal with. We know from above it has enough space and does not need 3937 * to be split. 3938 */ 3939 do { 3940 ntfs_index_context *insert_ictx; 3941 INDEX_ENTRY *entry, *right_entry, *end_entry; 3942 INDEX_HEADER *right_index; 3943 unsigned u; 3944 3945 if (!cur_ictx->is_locked) 3946 panic("%s(): !cur_ictx->is_locked\n", __FUNCTION__); 3947 /* 3948 * The current node is either the top-most node we need to deal 3949 * with thus we know it has enough space or we have just split 3950 * it and thus also know that it must have enough space. 3951 * 3952 * Thus the insertion is now a matter of doing a simple insert 3953 * into the node unless the current node has had its entry to 3954 * be inserted promoted into the parent node in which case 3955 * there is nothing to insert into this node. 3956 * 3957 * Note: Use goto to reduce indentation. 3958 */ 3959 insert_ictx = cur_ictx->insert_ictx; 3960 if (cur_ictx->promote_inserted) 3961 goto skip_insert; 3962 /* 3963 * We now need to begin to do the insertion but there is one 3964 * complication we need to deal with first and that is that if 3965 * we are inserting a promoted entry, we have to lock the index 3966 * node it is in also. 
3967 * 3968 * If the current node is an index block we may need to unlock 3969 * it so that we can ensure to always take the node lock in 3970 * ascending page offset order. Also we have to consider the 3971 * case where the two index nodes are in the same page in which 3972 * case we need to share the index lock. 3973 * 3974 * If the current node is the index root and we are inserting a 3975 * promoted entry, we speculatively locked the index node 3976 * containing the entry in the above loop thus we do not need 3977 * to do anything here. 3978 */ 3979 if (!cur_ictx->insert_to_add) { 3980 if (!insert_ictx) 3981 panic("%s(): !insert_ictx\n", __FUNCTION__); 3982 if (cur_ictx->is_root) { 3983 if (!insert_ictx->is_locked) 3984 panic("%s(): !insert_ictx->" 3985 "is_locked\n", 3986 __FUNCTION__); 3987 } else { 3988 err = ntfs_index_ctx_lock_two(cur_ictx, 3989 insert_ictx); 3990 if (err) 3991 goto relock_err; 3992 } 3993 } 3994 entry_size = cur_ictx->insert_entry_size; 3995 /* 3996 * Everything we need is locked and we know there is enough 3997 * space in the index node thus make space for the entry to be 3998 * inserted at the appropriate place and then insert the index 3999 * entry by either copying it from the appropriate, locked 4000 * child node or by creating it in place. The latter happens 4001 * when the entry being inserted is the entry being added with 4002 * this call to ntfs_index_entry_add(), i.e. it is the reason 4003 * we have been called in the first place and thus there is no 4004 * index entry to copy from. 4005 */ 4006 err = ntfs_index_node_make_space(cur_ictx, entry_size); 4007 if (err) 4008 panic("%s(): err\n", __FUNCTION__); 4009 /* 4010 * We have modified the index node. Make sure it is written 4011 * out later. 4012 */ 4013 ntfs_index_entry_mark_dirty(cur_ictx); 4014 if (!cur_ictx->insert_to_add) { 4015 entry = insert_ictx->entries[ 4016 cur_ictx->insert_entry_nr]; 4017 /* Copy the index entry into the created space. 
*/ 4018 memcpy(cur_ictx->entry, entry, 4019 le16_to_cpu(entry->length)); 4020 entry = cur_ictx->entry; 4021 } else { 4022 if (split_only) 4023 panic("%s(): split_only\n", __FUNCTION__); 4024 entry = cur_ictx->entry; 4025 /* 4026 * Clear the created space so we start with a clean 4027 * slate and do not need to worry about initializing 4028 * all the zero fields. 4029 */ 4030 bzero(entry, entry_size); 4031 /* Create the index entry in the created space. */ 4032 if (!is_view) 4033 entry->indexed_file = *(leMFT_REF*)data; 4034 else { 4035 u8 *new_data; 4036 4037 new_data = (u8*)entry + data_ofs; 4038 entry->data_offset = cpu_to_le16(data_ofs); 4039 entry->data_length = cpu_to_le16(data_len); 4040 if (data_len) 4041 memcpy(new_data, data, data_len); 4042 /* 4043 * In the case of $Secure/$SDH we leave the 4044 * extra magic to zero rather than setting it 4045 * to "II" in Unicode. This could easily be 4046 * changed if deemed better and/or necessary by 4047 * uncommenting the below code. 4048 */ 4049#if 0 4050 if (idx_ni == idx_ni->vol->secure_sdh_ni) { 4051 static const ntfschar SDH_magic[2] = { 4052 const_cpu_to_le16('I'), 4053 const_cpu_to_le16('I') 4054 }; 4055 4056 memcpy(((SDH_INDEX_DATA*)data)->magic, 4057 SDH_magic, 4058 sizeof(SDH_magic)); 4059 } 4060#endif 4061 } 4062 entry->key_length = cpu_to_le16(key_len); 4063 memcpy(&entry->key, key, key_len); 4064 } 4065 /* 4066 * If the copied entry is a leaf entry and it is being inserted 4067 * into a non-leaf node, we need to rewrite its size with the 4068 * new size which includes the VCN sub-node pointer. 4069 * 4070 * We just overwrite the length unconditionally and use it to 4071 * set the length of the just created index entry, too. 4072 * 4073 * There is no harm in doing this as we are either updating the 4074 * size which we must do or we are overwriting the size with 4075 * the same value it already has which has no effect. 
4076 */ 4077 entry->length = cpu_to_le16(entry_size); 4078 /* 4079 * If the current node is not a leaf node, we have to fix up 4080 * the VCN sub-node pointers both in the entry we just inserted 4081 * and in the entry in front of which we inserted. 4082 * 4083 * The inserted index entry needs to point to what will be the 4084 * left-hand index node after our child node is split. 4085 * 4086 * The index entry in front of which we inserted needs to point 4087 * to what will be the right-hand index node after our child 4088 * node is split. 4089 * 4090 * The INDEX_NODE check is fine for the index root, too, 4091 * because as it happens LARGE_INDEX == INDEX_NODE. 4092 */ 4093 if (cur_ictx->index->flags & INDEX_NODE) { 4094 if (!child_ictx) 4095 panic("%s(): !child_ictx\n", __FUNCTION__); 4096 if (entry->flags & INDEX_ENTRY_END) 4097 panic("%s(): entry->flags & INDEX_ENTRY_END\n", 4098 __FUNCTION__); 4099 entry->flags |= INDEX_ENTRY_NODE; 4100 *(leVCN*)((u8*)entry + entry_size - sizeof(VCN)) = 4101 cpu_to_sle64(child_ictx->vcn); 4102 ntfs_debug("Setting VCN sub-node pointer of inserted " 4103 "index entry to 0x%llx.", 4104 (unsigned long long)child_ictx->vcn); 4105 entry = (INDEX_ENTRY*)((u8*)entry + entry_size); 4106 if (!(entry->flags & INDEX_ENTRY_NODE)) 4107 panic("%s(): !(entry->flags & " 4108 "INDEX_ENTRY_NODE)\n", 4109 __FUNCTION__); 4110 *(leVCN*)((u8*)entry + le16_to_cpu(entry->length) - 4111 sizeof(VCN)) = cpu_to_sle64( 4112 child_ictx->right_vcn); 4113 ntfs_debug("Setting VCN sub-node pointer of index " 4114 "entry after inserted entry to " 4115 "0x%llx.", (unsigned long long) 4116 child_ictx->right_vcn); 4117 } 4118skip_insert: 4119 /* 4120 * If there are no child nodes left we are done. We will dirty 4121 * the index node once we have broken out of the loop. When 4122 * the index context is released later all locked nodes will be 4123 * unlocked so no need to do it now. 
4124 */ 4125 if (!child_ictx) 4126 break; 4127 if (!child_ictx->split_node) 4128 panic("%s(): !child_ictx->split_node\n", __FUNCTION__); 4129 /* 4130 * TODO: @child_ictx->right_upl and @child_ictx->right_pl are 4131 * currently not valid as @child_ictx is not locked. Once 4132 * <rdar://problem/4992358> is implemented we can re-enable 4133 * this check and change the code to leave the right_upl mapped 4134 * at all times. 4135 */ 4136#if 0 4137 if (!child_ictx->right_upl || !child_ictx->right_pl) 4138 panic("%s(): !child_ictx->right_upl || 4139 !child_ictx->right_pl\n", 4140 __FUNCTION__); 4141#endif 4142 /* 4143 * We are done with the current node and have a child node. 4144 * Switch to the child node, and sort out the needed locks. 4145 * 4146 * First, unlock the @insert_ictx node if it exists and is 4147 * locked. 4148 * 4149 * Note we do not bother with trying to transfer the lock from 4150 * @insert_ictx onto @child_ictx or @child_ictx->right_* 4151 * because index blocks are 4096 bytes in size in majority of 4152 * cases and the PAGE_SIZE is 4096 bytes both on x86 and PPC 4153 * thus in majority of cases each page will contain a separate 4154 * index block thus no sharing will be possible and there is no 4155 * point in adding extra complexity for an extremely unlikely 4156 * event. 4157 */ 4158 if (insert_ictx && insert_ictx->is_locked) 4159 ntfs_index_ctx_unlock(insert_ictx); 4160 /* 4161 * Unlock the current node. Again we do not bother with trying 4162 * to share the lock with @child_ictx or @child_ictx->right_*. 4163 */ 4164 ntfs_index_ctx_unlock(cur_ictx); 4165 /* Set the child node to be the current node. */ 4166 cur_ictx = child_ictx; 4167 /* 4168 * We need to ensure both the current node and its right-hand 4169 * node are locked. Both are currently unlocked. 4170 * 4171 * If both nodes share the same page, lock the current node and 4172 * share the lock with the right one. 
4173 */ 4174 if (cur_ictx->is_locked) 4175 panic("%s(): cur_ictx->is_locked\n", __FUNCTION__); 4176 if (cur_ictx->right_is_locked) 4177 panic("%s(): cur_ictx->right_is_locked\n", 4178 __FUNCTION__); 4179 if (cur_ictx->upl_ofs <= cur_ictx->right_upl_ofs) { 4180 err = ntfs_index_ctx_relock(cur_ictx); 4181 if (err) 4182 goto relock_err; 4183 } 4184 if (cur_ictx->upl_ofs == cur_ictx->right_upl_ofs) { 4185 cur_ictx->right_ia = (INDEX_ALLOCATION*)( 4186 (u8*)cur_ictx->right_ia + 4187 (cur_ictx->addr - 4188 cur_ictx->right_addr)); 4189 cur_ictx->right_addr = cur_ictx->addr; 4190 } else { 4191 u8 *addr; 4192 4193 err = ntfs_page_map(idx_ni, cur_ictx->right_upl_ofs, 4194 &cur_ictx->right_upl, 4195 &cur_ictx->right_pl, &addr, TRUE); 4196 if (err) { 4197 ntfs_error(idx_ni->vol->mp, "Failed to map " 4198 "index page (error %d).", err); 4199 cur_ictx->right_upl = NULL; 4200 goto relock_err; 4201 } 4202 cur_ictx->right_is_locked = 1; 4203 cur_ictx->right_ia = (INDEX_ALLOCATION*)( 4204 (u8*)cur_ictx->right_ia + (addr - 4205 cur_ictx->right_addr)); 4206 cur_ictx->right_addr = addr; 4207 } 4208 if (!cur_ictx->is_locked) { 4209 err = ntfs_index_ctx_relock(cur_ictx); 4210 if (err) { 4211 if (cur_ictx->right_is_locked) { 4212 ntfs_page_unmap(idx_ni, 4213 cur_ictx->right_upl, 4214 cur_ictx->right_pl, 4215 FALSE); 4216 cur_ictx->right_upl = NULL; 4217 cur_ictx->right_is_locked = 0; 4218 } 4219 goto relock_err; 4220 } 4221 } 4222 /* 4223 * Having obtained the needed locks, we can now split the 4224 * current node. 4225 * 4226 * We have recorded the split point in @promote_entry_nr and 4227 * @promote_inserted tells us whether to remove the entry 4228 * specified by @promote_entry_nr and move all entries after it 4229 * to the right-hand node (@promote_inserted is 0) or whether 4230 * to move the entry specified by @promote_entry_nr and all 4231 * entries after it to the right-hand node (@promote_inserted 4232 * is 1). 
4233 * 4234 * The split results in the creation of a new end entry in the 4235 * left-hand node as its old end entry is moved to the right 4236 * hand node. This means we need to determine what we need to 4237 * set the VCN sub-node pointer to if this is an index node. 4238 * 4239 * If @promote_iserted is 0, i.e. we are promoting an existing 4240 * entry from this node, we need to use the VCN sub-node 4241 * pointer of the entry we are about to promote. Because we 4242 * are promoting the entry it is going to disappear altogether 4243 * from this node thus we need to make a note of its sub-node 4244 * VCN pointer if it has one now before doing the actual split. 4245 * 4246 * In this case we do not need to modify the VCN sub-node 4247 * pointer of the first entry in the (new) right-hand node as 4248 * it does not change. 4249 * 4250 * If @promote_inserted is 1, i.e. we are promoting the entry 4251 * that was going to be inserted into this node, we need to use 4252 * the VCN of our child node which is the VCN of the entry in 4253 * front of which we are inserting and splitting, i.e. the VCN 4254 * of the first entry we are about to move to the right-hand 4255 * node. 4256 * 4257 * In this case we also need to modify the VCN sub-node pointer 4258 * of the first entry in the (new) right-hand node to point to 4259 * the (new) right-hand child node. We do not know what that 4260 * is yet so we determine it later once we have obtained our 4261 * child node index context containing the needed information. 4262 * 4263 * First, copy the appropriate entries to the right-hand node 4264 * and switch the node to be an index, i.e. non-leaf, node if 4265 * the node being split is an index node. 
4266 */ 4267 right_index = &cur_ictx->right_ia->index; 4268 right_entry = (INDEX_ENTRY*)((u8*)right_index + 4269 le32_to_cpu(right_index->entries_offset)); 4270 u = cur_ictx->promote_entry_nr; 4271 entry = cur_ictx->entries[u]; 4272 if (!cur_ictx->promote_inserted) { 4273 if (entry->flags & INDEX_ENTRY_NODE) { 4274 if (!(cur_ictx->index->flags & INDEX_NODE)) 4275 panic("%s(): !(cur_ictx->index->flags " 4276 "& INDEX_NODE)\n", 4277 __FUNCTION__); 4278 vcn = sle64_to_cpu(*(leVCN*)((u8*)entry + 4279 le16_to_cpu(entry->length) - 4280 sizeof(VCN))); 4281 } 4282 u++; 4283 entry = cur_ictx->entries[u]; 4284 } 4285 end_entry = cur_ictx->entries[cur_ictx->nr_entries - 1]; 4286 if (!(end_entry->flags & INDEX_ENTRY_END)) 4287 panic("%s(): !(end_entry->flags & INDEX_ENTRY_END)\n", 4288 __FUNCTION__); 4289 u = (u8*)end_entry - (u8*)entry + 4290 le16_to_cpu(end_entry->length); 4291 memcpy(right_entry, entry, u); 4292 right_index->index_length = cpu_to_le32( 4293 le32_to_cpu(right_index->entries_offset) + u); 4294 right_index->flags = cur_ictx->index->flags; 4295 /* 4296 * Move the end entry of the left-hand node forward thus 4297 * truncating the left-hand node. 4298 */ 4299 if (!cur_ictx->promote_inserted) 4300 entry = cur_ictx->entries[cur_ictx->promote_entry_nr]; 4301 if (entry != end_entry) { 4302 u = le16_to_cpu(end_entry->length); 4303 memmove(entry, end_entry, u); 4304 u += (u8*)entry - (u8*)cur_ictx->index; 4305 cur_ictx->index->index_length = cpu_to_le32(u); 4306 } 4307 /* 4308 * If the current, left-hand node is not a leaf node, we have 4309 * to replace the VCN sub-node pointer in its end entry. 4310 * 4311 * If @promote_iserted is 0, we use the VCN we made a note of 4312 * above. 4313 * 4314 * If @promote_inserted is 1, we take the VCN of the left-hand 4315 * node of our child node. 
4316 * 4317 * A side effect of getting the child node in loop scope here 4318 * is that we do not need to reget it when we go round the loop 4319 * again which is when we need to know the child node in order 4320 * to be able to insert the appropriate entry into the current 4321 * node. 4322 */ 4323 child_ictx = NULL; 4324 if (entry->flags & INDEX_ENTRY_NODE) { 4325 if (cur_ictx != ictx) { 4326 child_ictx = cur_ictx->down; 4327 if (child_ictx->is_root) 4328 panic("%s(): child_ictx->is_root\n", 4329 __FUNCTION__); 4330 } 4331 if (cur_ictx->promote_inserted) { 4332 if (!child_ictx) 4333 panic("%s(): !child_ictx\n", 4334 __FUNCTION__); 4335 /* 4336 * As described take the VCN of the left-hand 4337 * node of our child node index context for the 4338 * new end entry. 4339 */ 4340 vcn = child_ictx->vcn; 4341 /* 4342 * Again, as described, update the VCN of the 4343 * first entry we just moved to the (new) right 4344 * hand node to the right-hand node of our 4345 * child node index context. 4346 */ 4347 *(leVCN*)((u8*)right_entry + le16_to_cpu( 4348 right_entry->length) - 4349 sizeof(VCN)) = cpu_to_sle64( 4350 child_ictx->right_vcn); 4351 ntfs_debug("Setting VCN sub-node pointer of " 4352 "first index entry of " 4353 "right-hand index block node " 4354 "after splitting it off from " 4355 "the left-hand node to " 4356 "0x%llx.", (unsigned long long) 4357 child_ictx->right_vcn); 4358 } 4359 if (!(cur_ictx->index->flags & INDEX_NODE)) 4360 panic("%s(): !(cur_ictx->index->flags & " 4361 "INDEX_NODE)\n", __FUNCTION__); 4362 *(leVCN*)((u8*)entry + le16_to_cpu(entry->length) - 4363 sizeof(VCN)) = cpu_to_sle64(vcn); 4364 ntfs_debug("Setting VCN sub-node pointer of end index " 4365 "entry of left-hand index block node " 4366 "after splitting off the right-hand " 4367 "node to 0x%llx.", (unsigned long long) 4368 vcn); 4369 } 4370 /* 4371 * The index context still describes a single node thus if we 4372 * are going to insert an entry into either of the two split 4373 * nodes we have 
to update the index context appropriately. 4374 * 4375 * If @cur_ictx->entry_nr <= @cur_ictx->promote_entry_nr, we 4376 * have to insert the entry into the left-hand node and if 4377 * @cur_ictx->entry_nr > @cur_ictx->promote_entry_nr we have to 4378 * insert the entry into the right-hand node. 4379 * 4380 * If inserting into the left-hand node we do not need to do 4381 * anything as the insertion process does not make use of 4382 * anything in the index context that has changed. 4383 * 4384 * If inserting into the right-hand node we have to switch the 4385 * index context to describe the right-hand node and place the 4386 * left-hand node into the place of the right-hand node, i.e. 4387 * swap the left- and right-hand nodes in the index context. 4388 * 4389 * If we are switching the left- and right-hand nodes, we also 4390 * have to update the index entry pointer to point to the 4391 * correct insertion location in the now current page. 4392 * 4393 * Note we do not bother to update the array of index entry 4394 * pointers as that is no longer used. 
4395 */ 4396 if (!cur_ictx->promote_inserted && cur_ictx->entry_nr > 4397 cur_ictx->promote_entry_nr) { 4398 union { 4399 VCN vcn; 4400 INDEX_BLOCK *ia; 4401 s64 upl_ofs; 4402 upl_t upl; 4403 upl_page_info_array_t pl; 4404 u8 *addr; 4405 } tmp; 4406 4407 tmp.vcn = cur_ictx->right_vcn; 4408 cur_ictx->right_vcn = cur_ictx->vcn; 4409 cur_ictx->vcn = tmp.vcn; 4410 tmp.ia = cur_ictx->right_ia; 4411 cur_ictx->ia = tmp.ia; 4412 cur_ictx->right_ia = cur_ictx->ia; 4413 cur_ictx->index = right_index = &tmp.ia->index; 4414 if (cur_ictx->right_is_locked) { 4415 tmp.upl_ofs = cur_ictx->right_upl_ofs; 4416 cur_ictx->right_upl_ofs = cur_ictx->upl_ofs; 4417 cur_ictx->upl_ofs = tmp.upl_ofs; 4418 tmp.upl = cur_ictx->right_upl; 4419 cur_ictx->right_upl = cur_ictx->upl; 4420 cur_ictx->upl = tmp.upl; 4421 tmp.pl = cur_ictx->right_pl; 4422 cur_ictx->right_pl = cur_ictx->pl; 4423 cur_ictx->pl = tmp.pl; 4424 tmp.addr = cur_ictx->right_addr; 4425 cur_ictx->right_addr = cur_ictx->addr; 4426 cur_ictx->addr = tmp.addr; 4427 } 4428 /* 4429 * Get the location in the left-hand page of the first 4430 * entry that was moved to the right-hand page. 4431 */ 4432 entry = cur_ictx->entries[cur_ictx->promote_entry_nr + 4433 1]; 4434 cur_ictx->entry = (INDEX_ENTRY*)((u8*)right_index + 4435 le32_to_cpu(right_index-> 4436 entries_offset) + 4437 ((u8*)cur_ictx->entry - (u8*)entry)); 4438 } 4439 /* 4440 * We are done with the node that is now set up as the 4441 * right-hand node. Unless it is sharing the lock with the 4442 * left-hand node, unmap/release it marking it dirty so it gets 4443 * written out later. 4444 */ 4445 if (cur_ictx->right_is_locked) { 4446 ntfs_page_unmap(idx_ni, cur_ictx->right_upl, 4447 cur_ictx->right_pl, TRUE); 4448 cur_ictx->right_upl = NULL; 4449 cur_ictx->right_is_locked = 0; 4450 } 4451 /* 4452 * We may be done with the current node. Mark it dirty so it 4453 * gets written out later. 
4454 */ 4455 ntfs_index_entry_mark_dirty(cur_ictx); 4456 /* 4457 * If we have reached the original node (@child_ictx will be 4458 * NULL) and we are only splitting it there is nothing to 4459 * insert and hence nothing at all left to do so we break out 4460 * of the loop. 4461 */ 4462 } while (child_ictx || !split_only); 4463 ntfs_debug("Done (%s).", cur_ictx->split_node ? 4464 "insert with split" : "simple insert"); 4465 return 0; 4466err: 4467 if (!NInoIndexAllocPresent(idx_ni)) 4468 goto err_out; 4469 /* 4470 * Unlock all index contexts in the B+tree path otherwise the call to 4471 * ntfs_attr_inode_get() can deadlock. 4472 */ 4473 ntfs_index_ctx_path_unlock(ictx); 4474 /* Get the index bitmap inode. */ 4475 err2 = ntfs_attr_inode_get(ictx->base_ni, AT_BITMAP, idx_ni->name, 4476 idx_ni->name_len, FALSE, LCK_RW_TYPE_SHARED, &bmp_ni); 4477 if (err2) { 4478 ntfs_error(idx_ni->vol->mp, "Failed to get index bitmap inode " 4479 "(error %d). Cannot undo index block " 4480 "allocation. Leaving inconsistent metadata. " 4481 "Run chkdsk.", err2); 4482 NVolSetErrors(idx_ni->vol); 4483 goto err_out; 4484 } 4485 /* Free all the index block allocations we have done. */ 4486 do { 4487 if (cur_ictx->right_addr) { 4488 if (!cur_ictx->right_ia) 4489 panic("%s(): !cur_ictx->right_ia\n", 4490 __FUNCTION__); 4491 if (cur_ictx->right_vcn < 0) 4492 panic("%s(): cur_ictx->right_vcn < 0\n", 4493 __FUNCTION__); 4494 err2 = ntfs_bitmap_clear_bit(bmp_ni, 4495 cur_ictx->right_vcn << 4496 idx_ni->vcn_size_shift >> 4497 idx_ni->block_size_shift); 4498 if (err2) { 4499 ntfs_error(idx_ni->vol->mp, "Failed to undo " 4500 "index block allocation in " 4501 "(error %d). Leaving " 4502 "inconsistent metadata. Run " 4503 "chkdsk.", err2); 4504 NVolSetErrors(idx_ni->vol); 4505 } 4506 } 4507 /* When we have dealt with the bottom entry we are done. 
*/ 4508 if (cur_ictx == ictx) 4509 break; 4510 cur_ictx = cur_ictx->down; 4511 } while (1); 4512 lck_rw_unlock_shared(&bmp_ni->lock); 4513 (void)vnode_put(bmp_ni->vn); 4514err_out: 4515 ntfs_error(idx_ni->vol->mp, "Failed (error %d).", err); 4516 return err; 4517relock_err: 4518 ntfs_error(idx_ni->vol->mp, "Failed to relock index context (error " 4519 "%d). Leaving corrupt index. Run chkdsk.", err); 4520 NVolSetErrors(idx_ni->vol); 4521 return err; 4522} 4523 4524/** 4525 * ntfs_index_node_split - split an index node 4526 * @ictx: index context specifying the node to split 4527 * @entry_size: size of the entry that will be added after the split 4528 * 4529 * Split the index node pointed to by @ictx. @ictx also points to the entry 4530 * in the index node at which an entry will be inserted after the split is 4531 * completed. @entry_size is the size of the entry that will be added at that 4532 * position. These are used so that the split is performed with consideration 4533 * with the insertion that is likely to come. And if the insertion does not 4534 * happen it does not matter much, it just means the split was not quite on the 4535 * median entry but off by one. 4536 * 4537 * Return 0 on success and errno on error. On error the index context is no 4538 * longer usable and must be released or reinitialized. 4539 * 4540 * Note that we do not update the array of index entry pointers nor the number 4541 * of entries in the array because on success it means that the index node has 4542 * been split and the function is done, i.e. the array of index entry pointers 4543 * will not be used any more and on error the index context becomes invalid so 4544 * there is no need to update any of the pointers. The caller is expected to 4545 * restart its operations by doing a new index lookup for that reason. 4546 * 4547 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for 4548 * writing. 4549 * - The index context @ictx must be locked. 
4550 */ 4551static inline errno_t ntfs_index_node_split(ntfs_index_context *ictx, 4552 const u32 entry_size) 4553{ 4554 return ntfs_index_entry_add_or_node_split(ictx, TRUE, entry_size, 4555 NULL, 0, NULL, 0); 4556} 4557 4558/** 4559 * ntfs_index_lookup_predecessor - index node whose predecessor node to return 4560 * @ictx: index context whose predecessor node to return 4561 * @pred_ictx: pointer in which to return the found predecessor index context 4562 * 4563 * Descend into the child node pointed to by @ictx->entry and then keep 4564 * descending into the child node of the child node pointed to by the end entry 4565 * of the child node until we reach the bottom of the B+tree. 4566 * 4567 * On success return the predecessor entry, i.e. the last real (non-end) entry 4568 * in the found leaf index node in *@pred_ictx. 4569 * 4570 * Return 0 on success and errno on error. 4571 * 4572 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode. 4573 * - The index context @ictx must be locked. 4574 */ 4575static errno_t ntfs_index_lookup_predecessor(ntfs_index_context *ictx, 4576 ntfs_index_context **pred_ictx) 4577{ 4578 unsigned entry_nr; 4579 errno_t err; 4580 4581 ntfs_debug("Entering."); 4582 if (!pred_ictx) 4583 panic("%s(): !pred_ictx\n", __FUNCTION__); 4584 if (!(ictx->entry->flags & INDEX_ENTRY_NODE)) 4585 panic("%s(): !(ictx->entry->flags & INDEX_ENTRY_NODE)\n", 4586 __FUNCTION__); 4587 /* 4588 * We must be at the bottom of the current tree path or things will get 4589 * very confused. 4590 */ 4591 if (!ictx->down->is_root) 4592 panic("%s(): !ictx->down->is_root\n", __FUNCTION__); 4593 do { 4594 err = ntfs_index_descend_into_child_node(&ictx); 4595 if (err) 4596 goto err; 4597 /* If this child node is a leaf node we are done. */ 4598 if (!(ictx->index->flags & INDEX_NODE)) 4599 break; 4600 /* 4601 * This child node is an index node, descend into its end 4602 * entry. 
4603 */ 4604 ictx->entry_nr = entry_nr = ictx->nr_entries - 1; 4605 ictx->follow_entry = ictx->entries[entry_nr]; 4606 } while (1); 4607 /* 4608 * We found the leaf node thus the predecessor entry we are looking for 4609 * is the last entry before the end entry. 4610 */ 4611 if (ictx->nr_entries < 2) 4612 panic("%s(): ictx->nr_entries < 2\n", __FUNCTION__); 4613 ictx->entry_nr = entry_nr = ictx->nr_entries - 2; 4614 ictx->entry = ictx->entries[entry_nr]; 4615 ictx->is_match = 1; 4616 *pred_ictx = ictx; 4617 ntfs_debug("Done (found)."); 4618 return 0; 4619err: 4620 ntfs_error(ictx->idx_ni->vol->mp, "Failed to descend into child node " 4621 "(error %d).", err); 4622 return err; 4623} 4624 4625/** 4626 * ntfs_index_ctx_move - move an index context from its tree path to another one 4627 * @ictx: index context to move 4628 * @dst: destination index context below which to insert @ictx 4629 * 4630 * Disconnect the index context @ictx from its tree path and insert it into the 4631 * tree path to which @dst belongs positioning it immediately below the index 4632 * context @dst. 4633 * 4634 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode. 4635 */ 4636static inline void ntfs_index_ctx_move(ntfs_index_context *ictx, 4637 ntfs_index_context *dst) 4638{ 4639 ntfs_index_ctx_disconnect(ictx); 4640 dst->down->up = ictx; 4641 ictx->down = dst->down; 4642 ictx->up = dst; 4643 dst->down = ictx; 4644} 4645 4646/** 4647 * ntfs_index_root_prepare_replace - prepare an index entry to be replaced 4648 * @ictx: existing index entry that is going to be replaced 4649 * @new_ie_size: size in bytes of the new index entry 4650 * 4651 * Resize the existing index entry to the size of the new index entry so that 4652 * the index root is all set up and ready to receive the new entry. 4653 * 4654 * Return 0 on success and ENOSPC if there is not enough space in the index mft 4655 * record for the new entry. 
 *
 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
 *	    writing.
 *	    - The index context @ictx must be locked.
 */
static errno_t ntfs_index_root_prepare_replace(ntfs_index_context *ictx,
		const unsigned new_ie_size)
{
	MFT_RECORD *m = ictx->actx->m;
	INDEX_ENTRY *ie = ictx->entry;
	const u32 muse = le32_to_cpu(m->bytes_in_use);
	const unsigned ie_size = le16_to_cpu(ie->length);
	/* Note @size_change is negative when the new entry is smaller. */
	const int size_change = (int)new_ie_size - (int)ie_size;
	const u32 new_muse = muse + size_change;

	ntfs_debug("Entering.");
	if (new_muse <= le32_to_cpu(m->bytes_allocated)) {
		u8 *vcn_addr;
		ATTR_RECORD *a;
		INDEX_HEADER *ih;

		/*
		 * Resize the index root attribute so it has the appropriate
		 * space for the new index entry to replace the existing entry.
		 * As an optimization we combine the resizing of the index root
		 * attribute and the moving of index entries within the
		 * attribute into a single operation.
		 *
		 * The move shifts everything from the VCN sub-node pointer at
		 * the tail of the existing entry up to the end of the in-use
		 * part of the mft record.
		 */
		vcn_addr = (u8*)ie + ie_size - sizeof(VCN);
		memmove((u8*)ie + new_ie_size - sizeof(VCN), vcn_addr,
				muse - (vcn_addr - (u8*)m));
		/* Adjust the mft record to reflect the change in used space. */
		m->bytes_in_use = cpu_to_le32(new_muse);
		/*
		 * Adjust the attribute record to reflect the changes in the
		 * size of the attribute record and in the size of the
		 * attribute value.
		 */
		a = ictx->actx->a;
		a->length = cpu_to_le32(le32_to_cpu(a->length) + size_change);
		a->value_length = cpu_to_le32(le32_to_cpu(a->value_length) +
				size_change);
		/* Adjust the index header to reflect the change in length. */
		ih = ictx->index;
		ih->allocated_size = ih->index_length = cpu_to_le32(
				le32_to_cpu(ih->index_length) + size_change);
		ntfs_debug("Done.");
		return 0;
	}
	ntfs_debug("Failed (not enough space in mft record to replace index "
			"entry in index root attribute).");
	return ENOSPC;
}

/**
 * ntfs_index_block_prepare_replace - prepare an index entry to be replaced
 * @ictx:	existing index entry that is going to be replaced
 * @new_ie_size: size in bytes of the new index entry
 *
 * Resize the existing index entry to the size of the new index entry so that
 * the index node is all set up and ready to receive the new entry.
 *
 * Return 0 on success and ENOSPC if there is not enough space in the index
 * node for the new entry.
 *
 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
 *	    writing.
 *	    - The index context @ictx must be locked.
 */
static errno_t ntfs_index_block_prepare_replace(ntfs_index_context *ictx,
		const unsigned new_ie_size)
{
	INDEX_HEADER *ih = ictx->index;
	INDEX_ENTRY *ie = ictx->entry;
	const u32 ilen = le32_to_cpu(ih->index_length);
	const unsigned ie_size = le16_to_cpu(ie->length);
	const u32 new_ilen = ilen + new_ie_size - ie_size;

	ntfs_debug("Entering.");
	if (new_ilen <= le32_to_cpu(ih->allocated_size)) {
		u8 *vcn_addr;

		/*
		 * Move the VCN of the index entry to be replaced and
		 * everything that follows it to adapt the space for the new
		 * entry.
		 */
		vcn_addr = (u8*)ie + ie_size - sizeof(VCN);
		memmove((u8*)ie + new_ie_size - sizeof(VCN), vcn_addr,
				ilen - (vcn_addr - (u8*)ih));
		/* Adjust the index header to reflect the change in length. */
		ih->index_length = cpu_to_le32(new_ilen);
		ntfs_debug("Done.");
		return 0;
	}
	ntfs_debug("Failed (not enough space in index block to replace index "
			"entry).");
	return ENOSPC;
}

/**
 * ntfs_index_entry_replace - replace an existing index entry with a new one
 * @ictx:	existing index entry to replace
 * @new_ictx:	new index entry to replace the existing entry with
 *
 * Replace the existing node index entry @ictx->entry with the leaf index entry
 * @new_ictx->entry.
 *
 * Return 0 on success and ENOSPC if there is not enough space for the new
 * entry.
 *
 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
 *	    writing.
 *	    - The index context @ictx must be locked.
 *	    - The index context @new_ictx must be locked.
 */
static errno_t ntfs_index_entry_replace(ntfs_index_context *ictx,
		ntfs_index_context *new_ictx)
{
	INDEX_ENTRY *new_ie = new_ictx->entry;
	INDEX_ENTRY *ie = ictx->entry;
	/* The replacement grows by a trailing VCN sub-node pointer. */
	const unsigned new_ie_size = le16_to_cpu(new_ie->length) + sizeof(VCN);
	errno_t err;

	if (!ictx->is_match || !new_ictx->is_match)
		panic("%s(): !ictx->is_match || !new_ictx->is_match\n",
				__FUNCTION__);
	/* The destination entry is meant to be a node entry. */
	if (!(ictx->entry->flags & INDEX_ENTRY_NODE))
		panic("%s(): !(ictx->entry->flags & INDEX_ENTRY_NODE)\n",
				__FUNCTION__);
	/* The new entry is meant to be a leaf entry. */
	if (new_ictx->entry->flags & INDEX_ENTRY_NODE)
		panic("%s(): new_ictx->entry->flags & INDEX_ENTRY_NODE\n",
				__FUNCTION__);
	if (ictx->is_root)
		err = ntfs_index_root_prepare_replace(ictx, new_ie_size);
	else
		err = ntfs_index_block_prepare_replace(ictx, new_ie_size);
	if (!err) {
		/* Copy the new index entry into the adapted space. */
		memcpy(ie, new_ie, new_ie_size - sizeof(VCN));
		/*
		 * Update the copied index entry to reflect the fact that it is
		 * now an index node entry and has the VCN sub-node pointer at
		 * its tail.  (The memcpy above overwrote the flags with the
		 * leaf entry's flags, hence the re-or of INDEX_ENTRY_NODE.)
		 */
		ie->length = cpu_to_le16(new_ie_size);
		ie->flags |= INDEX_ENTRY_NODE;
		/* Ensure the updates are written to disk. */
		ntfs_index_entry_mark_dirty(ictx);
	}
	return err;
}

/**
 * ntfs_index_block_free - free an index allocation block
 * @ictx:	index context of the index block to deallocate
 *
 * Deallocate the index allocation block for the index described by the index
 * context @ictx and invalidate the context so the caller can safely release
 * it.
 *
 * We also check if the index allocation attribute can be shrunk as a
 * consequence of the deallocation of the index allocation block and if so and
 * that would actually change the on-disk size of the attribute we shrink it
 * now.
 *
 * If we shrunk the index allocation attribute and the index bitmap attribute
 * is non-resident we shrink the index bitmap attribute also but again only if
 * it would actually change the on-disk size of the attribute.
 *
 * Return 0 on success and errno on error.
 *
 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
 *	    writing.
 *	    - All of the index contexts in the index must be unlocked (this
 *	      includes @ictx, i.e. @ictx must not be locked).
 */
static errno_t ntfs_index_block_free(ntfs_index_context *ictx)
{
	s64 target_pos, bmp_pos, alloc_size;
	ntfs_inode *bmp_ni, *idx_ni = ictx->idx_ni;
	ntfs_volume *vol = idx_ni->vol;
	unsigned page_ofs;
	errno_t err;

	ntfs_debug("Entering.");
	if (ictx->is_locked)
		panic("%s(): ictx->is_locked\n", __FUNCTION__);
	/* Get the index bitmap inode. */
	err = ntfs_attr_inode_get(ictx->base_ni, AT_BITMAP, idx_ni->name,
			idx_ni->name_len, FALSE, LCK_RW_TYPE_EXCLUSIVE,
			&bmp_ni);
	if (err) {
		ntfs_error(vol->mp, "Failed to get index bitmap inode (error "
				"%d).", err);
		return err;
	}
	/*
	 * Zero the bit in the index bitmap corresponding to the index block
	 * being deallocated.  The VCN is converted via the shifts to the
	 * index block number which is the bit position in the bitmap.
	 */
	target_pos = ictx->vcn << idx_ni->vcn_size_shift >>
			idx_ni->block_size_shift;
	err = ntfs_bitmap_clear_bit(bmp_ni, target_pos);
	if (err) {
		ntfs_error(vol->mp, "Failed to deallocate index block in "
				"index bitmap (error %d).", err);
		lck_rw_unlock_exclusive(&bmp_ni->lock);
		(void)vnode_put(bmp_ni->vn);
		return err;
	}
	/*
	 * If this is not the last set bit, we are done.  Note the @done and
	 * @out labels inside this if-block are also jump targets for the
	 * bitmap scan below.
	 */
	if (target_pos < idx_ni->last_set_bit) {
done:
		ntfs_debug("Done (index records are allocated beyond the "
				"deallocated one).");
out:
		lck_rw_unlock_exclusive(&bmp_ni->lock);
		(void)vnode_put(bmp_ni->vn);
		return 0;
	}
	/*
	 * Scan backwards through the entire bitmap looking for the last set
	 * bit.  If we do not know the old last set bit (@idx_ni->last_set_bit
	 * is -1), start at the end of the bitmap and if we do know it but we
	 * cleared it just now, start at the old last set bit.
	 *
	 * Note we ignore any errors as the truncation is just a disk saving
	 * optimization and is not actually required.  However if an error
	 * occurs we invalidate the last set bit stored in the inode.
	 */
	if (idx_ni->last_set_bit >= 0)
		bmp_pos = idx_ni->last_set_bit >> 3;
	else {
		lck_spin_lock(&bmp_ni->size_lock);
		bmp_pos = bmp_ni->initialized_size - 1;
		lck_spin_unlock(&bmp_ni->size_lock);
	}
	/* @bmp_pos is a byte offset into the bitmap attribute from here on. */
	do {
		upl_t upl;
		upl_page_info_array_t pl;
		u8 *bmp_start, *bmp;

		err = ntfs_page_map(bmp_ni, bmp_pos & ~PAGE_MASK_64, &upl,
				&pl, &bmp_start, FALSE);
		if (err) {
			ntfs_debug("Failed to read index bitmap (error %d).",
					err);
			idx_ni->last_set_bit = -1;
			goto out;
		}
		page_ofs = (unsigned)bmp_pos & PAGE_MASK;
		bmp = bmp_start + page_ofs;
		/* Scan backwards through the page. */
		do {
			unsigned bit, byte = *bmp;
			/*
			 * If this byte is zero skip it.  The continue re-tests
			 * the loop condition which also decrements @bmp.
			 */
			if (!byte)
				continue;
			/*
			 * Determine the last set bit in the byte.
			 *
			 * TODO: There does not appear to be a fls() function
			 * in the kernel. )-: If/when the kernel has an fls()
			 * function, switch the below code to use it.
			 *
			 * So we do the "bit = fls(byte) - 1" by hand which is
			 * not very efficient but works.
			 */
			bit = 0;
			if (byte & 0xf0) {
				byte >>= 4;
				bit += 4;
			}
			if (byte & 0x0c) {
				byte >>= 2;
				bit += 2;
			}
			if (byte & 0x02)
				bit++;
			/*
			 * Unmap before the arithmetic below; only pointer
			 * differences are used after this, no dereferences.
			 */
			ntfs_page_unmap(bmp_ni, upl, pl, FALSE);
			/*
			 * @bit now contains the last set bit in the byte thus
			 * we can determine the last set bit in the bitmap.
			 */
			idx_ni->last_set_bit = (((bmp_pos & ~PAGE_MASK_64) +
					(bmp - bmp_start)) << 3) + bit;
			if (target_pos < idx_ni->last_set_bit)
				goto done;
			goto was_last_set_bit;
		} while (--bmp >= bmp_start);
		ntfs_page_unmap(bmp_ni, upl, pl, FALSE);
	} while ((bmp_pos -= page_ofs + 1) >= 0);
	/*
	 * We scanned the entire bitmap and it was all zero.  We do not do
	 * anything because truncation of indexes that become empty is done
	 * elsewhere.
	 */
	idx_ni->last_set_bit = -1;
	ntfs_debug("Done (index bitmap has no set bits left).");
	goto out;
was_last_set_bit:
	/*
	 * This was the last set bit.  Check if we would save disk space by
	 * truncating the index allocation attribute and if so do it.  To do
	 * this determine which the first unused cluster is and compare it
	 * against the currently allocated last cluster.
	 *
	 * Note we ignore any errors because it is not essential to resize the
	 * index allocation attribute.  In fact Windows and chkdsk are
	 * perfectly happy with it remaining allocated.  It just means the
	 * index is wasting space on disk and that will be reclaimed when the
	 * index is deleted or when the index is filled again with entries.
	 */
	target_pos = (((idx_ni->last_set_bit + 1) <<
			idx_ni->block_size_shift) + vol->cluster_size_mask) &
			~(s64)vol->cluster_size_mask;
	lck_spin_lock(&idx_ni->size_lock);
	alloc_size = idx_ni->allocated_size;
	lck_spin_unlock(&idx_ni->size_lock);
	if (target_pos >= alloc_size) {
		ntfs_debug("Done (no space would be freed on disk by "
				"truncating the index allocation attribute).");
		goto out;
	}
	err = ntfs_attr_resize(idx_ni, target_pos, 0, NULL);
	if (err) {
		ntfs_debug("Failed to truncate index allocation attribute "
				"(error %d) thus this index will be wasting "
				"space on disk until it is deleted or "
				"repopulated with entries.", err);
		goto out;
	}
	ntfs_debug("Truncated index allocation attribute to reclaim 0x%llx "
			"bytes of disk space.",
			(unsigned long long)(alloc_size - target_pos));
	/*
	 * If the bitmap attribute is non-resident check if we would save disk
	 * space by truncating it, too, and if so do it.  Again we ignore any
	 * errors as it is ok for the bitmap attribute to be left as it is.
	 */
	if (NInoNonResident(bmp_ni)) {
		/* Bytes needed for the bitmap, rounded up to whole clusters. */
		target_pos = ((((target_pos >> idx_ni->block_size_shift) +
				7) >> 3) + vol->cluster_size_mask) &
				~(s64)vol->cluster_size_mask;
		lck_spin_lock(&bmp_ni->size_lock);
		alloc_size = bmp_ni->allocated_size;
		lck_spin_unlock(&bmp_ni->size_lock);
		if (target_pos >= alloc_size) {
			ntfs_debug("Done (truncated index allocation to free "
					"space on disk but not truncating "
					"index bitmap as no space would be "
					"freed on disk by doing so).");
			goto out;
		}
		err = ntfs_attr_resize(bmp_ni, target_pos, 0, NULL);
		if (err) {
			ntfs_debug("Failed to truncate index bitmap attribute "
					"(error %d) thus this index will be "
					"wasting space on disk until it is "
					"deleted or repopulated with entries.",
					err);
			goto out;
		}
		ntfs_debug("Truncated index bitmap attribute to reclaim "
				"0x%llx bytes of disk space.",
				(unsigned long long)(alloc_size - target_pos));
	}
	ntfs_debug("Done.");
	goto out;
}

/**
 * ntfs_index_make_empty - make an index empty discarding all entries in it
 * @ictx:	index context describing the index to empty
 *
 * Empty the index described by the index context @ictx.  On failure, use the
 * tree described by @ictx to re-allocate the freed index blocks if any have
 * been freed at the point of failure.
 *
 * This is called when the last index entry in an index is being deleted.
 *
 * We need to remove the sub-node from the end entry of the index root and
 * switch the entry to be a leaf entry.
 *
 * We also need to deallocate all index blocks and if possible shrink the index
 * allocation attribute to zero size as well as the index bitmap attribute if
 * it is non-resident.
 *
 * Return 0 on success and errno on error.
 *
 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
 *	    writing.
 *	    - All of the index contexts in the index must be unlocked (this
 *	      includes @ictx, i.e. @ictx must not be locked).
 */
static errno_t ntfs_index_make_empty(ntfs_index_context *ictx)
{
	s64 data_size;
	ntfs_inode *bmp_ni, *idx_ni = ictx->idx_ni;
	MFT_RECORD *m;
	ntfs_attr_search_ctx *actx;
	INDEX_ROOT *ir;
	INDEX_HEADER *ih;
	INDEX_ENTRY *ie;
	ntfs_index_context *start_ictx;
	u32 new_ilen;
	errno_t err;

	ntfs_debug("Entering.");
	if (ictx->is_locked)
		panic("%s(): ictx->is_locked\n", __FUNCTION__);
	/*
	 * Start by zeroing the index bitmap bits corresponding to the index
	 * blocks being deallocated.  For simplicity, we just zero the entire
	 * index bitmap attribute.
	 */
	err = ntfs_attr_inode_get(ictx->base_ni, AT_BITMAP, idx_ni->name,
			idx_ni->name_len, FALSE, LCK_RW_TYPE_EXCLUSIVE,
			&bmp_ni);
	if (err) {
		ntfs_error(idx_ni->vol->mp, "Failed to get index bitmap inode "
				"(error %d).", err);
		return err;
	}
	/* Sample the bitmap size under its size lock before zeroing it all. */
	lck_spin_lock(&bmp_ni->size_lock);
	data_size = bmp_ni->data_size;
	lck_spin_unlock(&bmp_ni->size_lock);
	err = ntfs_attr_set(bmp_ni, 0, data_size, 0);
	if (err) {
		ntfs_error(idx_ni->vol->mp, "Failed to deallocate index "
				"block(s) in index bitmap.");
		goto err;
	}
	/*
	 * We need to get hold of the index root attribute in order to convert
	 * its end entry to a leaf node.
	 */
	err = ntfs_mft_record_map(ictx->base_ni, &m);
	if (err) {
		ntfs_error(idx_ni->vol->mp, "ntfs_mft_record_map() failed "
				"(error %d).", err);
		goto err;
	}
	actx = ntfs_attr_search_ctx_get(ictx->base_ni, m);
	if (!actx) {
		/* Allocation of the search context failed. */
		err = ENOMEM;
		goto unm_err;
	}
	/* Find the index root attribute in the mft record. */
	err = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name, idx_ni->name_len,
			0, NULL, 0, actx);
	if (err) {
		if (err == ENOENT) {
			/* A missing index root is on-disk corruption. */
			ntfs_error(idx_ni->vol->mp, "Index root attribute "
					"missing in inode 0x%llx.  Run "
					"chkdsk.",
					(unsigned long long)idx_ni->mft_no);
			err = EIO;
			NVolSetErrors(idx_ni->vol);
		}
		goto put_err;
	}
	/* Get to the index root value. */
	ir = (INDEX_ROOT*)((u8*)actx->a + le16_to_cpu(actx->a->value_offset));
	ih = (INDEX_HEADER*)&ir->index;
	/*
	 * The first and last index entry.  In an index that is being emptied
	 * the root must contain only the end entry and it must point to a
	 * sub-node, hence the two sanity checks.
	 */
	ie = (INDEX_ENTRY*)((u8*)ih + le32_to_cpu(ih->entries_offset));
	if (!(ie->flags & INDEX_ENTRY_END))
		panic("%s(): !(ie->flags & INDEX_ENTRY_END)\n", __FUNCTION__);
	if (!(ie->flags & INDEX_ENTRY_NODE))
		panic("%s(): !(ie->flags & INDEX_ENTRY_NODE)\n", __FUNCTION__);
	/*
	 * Remove the sub-node pointer from the index entry and shrink the
	 * index root attribute appropriately.
	 */
	ie->length = cpu_to_le16(le16_to_cpu(ie->length) - sizeof(VCN));
	ie->flags &= ~INDEX_ENTRY_NODE;
	new_ilen = le32_to_cpu(ih->index_length) - sizeof(VCN);
	ih->allocated_size = ih->index_length = cpu_to_le32(new_ilen);
	/* The index no longer has any allocation, so it is a small index. */
	ih->flags &= ~LARGE_INDEX;
	err = ntfs_resident_attr_value_resize(actx->m, actx->a,
			offsetof(INDEX_ROOT, index) + new_ilen);
	/* We are shrinking the index root so the resize cannot fail. */
	if (err)
		panic("%s(): err\n", __FUNCTION__);
	/* Ensure the changes are written to disk. */
	NInoSetMrecNeedsDirtying(actx->ni);
	ntfs_attr_search_ctx_put(actx);
	ntfs_mft_record_unmap(ictx->base_ni);
	/*
	 * Now deal with the index allocation attribute by truncating it to
	 * zero length.
	 *
	 * Note we ignore any errors because it is not essential to resize the
	 * index allocation attribute.  In fact Windows and chkdsk are
	 * perfectly happy with it remaining allocated.  It just means the
	 * index is wasting space on disk and that will be reclaimed when the
	 * index is deleted or when the index is filled again with entries.
	 */
	err = ntfs_attr_resize(idx_ni, 0, 0, ictx);
	if (err)
		ntfs_debug("Failed to truncate index allocation attribute to "
				"zero size thus this index will be wasting "
				"space on disk until it is deleted or "
				"repopulated with entries.");
	/*
	 * Finally, if the index bitmap attribute is non-resident truncate it
	 * to zero length, too.  Again we ignore any errors as it is ok for
	 * the bitmap attribute to be left as it is.  Note this is only
	 * attempted when the index allocation truncate above succeeded.
	 */
	if (!err && NInoNonResident(bmp_ni)) {
		err = ntfs_attr_resize(bmp_ni, 0, 0, ictx);
		if (err)
			ntfs_debug("Failed to truncate index bitmap attribute "
					"to zero size thus this index will be "
					"wasting space on disk until it is "
					"deleted or repopulated with "
					"entries.");
	}
	lck_rw_unlock_exclusive(&bmp_ni->lock);
	(void)vnode_put(bmp_ni->vn);
	/*
	 * We no longer have any index blocks allocated so invalidate our cache
	 * of the last set bit.
	 */
	idx_ni->last_set_bit = -1;
	ntfs_debug("Done.");
	return 0;
put_err:
	ntfs_attr_search_ctx_put(actx);
unm_err:
	ntfs_mft_record_unmap(ictx->base_ni);
err:
	/*
	 * Re-allocate the deallocated index block(s).  This is safe because
	 * the index inode mutex is held throughout.
	 *
	 * Walk the tree path contexts via the @up links until we come back
	 * round to the starting context, setting the bitmap bit for every
	 * non-root context (the root has no bit in the bitmap).
	 */
	start_ictx = ictx;
	do {
		int err2;

		/*
		 * Skip the index root as it does not have a bit in the
		 * bitmap.
		 */
		if (ictx->is_root)
			continue;
		err2 = ntfs_bitmap_set_bit(bmp_ni, ictx->vcn <<
				idx_ni->vcn_size_shift >>
				idx_ni->block_size_shift);
		if (err2) {
			ntfs_error(idx_ni->vol->mp, "Failed to undo "
					"deallocation of index block in index "
					"bitmap (error %d) of inode 0x%llx.  "
					"Leaving inconsistent metadata.  Run "
					"chkdsk.", err2,
					(unsigned long long)idx_ni->mft_no);
			NVolSetErrors(idx_ni->vol);
		}
	} while ((ictx = ictx->up) != start_ictx);
	lck_rw_unlock_exclusive(&bmp_ni->lock);
	(void)vnode_put(bmp_ni->vn);
	ntfs_error(idx_ni->vol->mp, "Failed (error %d).", err);
	return err;
}

/**
 * ntfs_index_entry_delete - delete an index entry
 * @ictx:	index context describing the index entry to delete
 *
 * Delete the index entry described by the index context @ictx from the index.
 *
 * Return 0 on success and errno on error.  A special case is a return code of
 * -EAGAIN (negative, not EAGAIN) which means that the B+tree was rearranged
 * into a different consistent state to make the deletion possible but now the
 * lookup and delete has to be repeated as the index entry to be deleted may
 * have changed its position in the tree thus a new lookup is required to be
 * able to delete it.  Doing this is not terribly efficient but we are only
 * talking about a handful of cases in a single delete of tens of thousands of
 * files so it does not matter if that is inefficient.  On the plus side doing
 * things this way means we do not need to keep track of the entry to be
 * deleted when rearranging the tree which saves time and makes the code much
 * simpler.
 *
 * Note that we do not update the array of index entry pointers nor the number
 * of entries in the array because on success it means that the index entry has
 * been removed and the function is done, i.e. the array of index entry
 * pointers will not be used any more and on error the index context becomes
 * invalid so there is no need to update any of the pointers.
 *
 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
 *	    writing.
 *	    - The index context @ictx must be locked.
5252 */ 5253int ntfs_index_entry_delete(ntfs_index_context *ictx) 5254{ 5255 leVCN vcn; 5256 ntfs_inode *idx_ni = ictx->idx_ni; 5257 ntfs_index_context *pred_ictx, *parent_ictx, *suc_ictx, *put_ictx; 5258 ntfs_index_context *deallocate_ictx; 5259 INDEX_HEADER *ih; 5260 INDEX_ENTRY *ie, *next_ie, *pull_down_entry; 5261 MFT_RECORD *m; 5262 unsigned ie_size, pred_ie_size, pull_down_entry_nr, pull_down_ie_size; 5263 unsigned old_parent_entry_nr, old_parent_ie_size, move_ie_size; 5264 errno_t err; 5265 u32 new_ilen; 5266 BOOL is_root = ictx->is_root; 5267 5268 ntfs_debug("Entering."); 5269 if (!ictx->is_locked) 5270 panic("%s(): !ictx->is_locked\n", __FUNCTION__); 5271 deallocate_ictx = parent_ictx = put_ictx = NULL; 5272 ih = ictx->index; 5273 ie = ictx->entry; 5274 /* We cannot delete end entries as they do not contain any key/data. */ 5275 if (ie->flags & INDEX_ENTRY_END) 5276 panic("%s(): ie->flags & INDEX_ENTRY_END\n", __FUNCTION__); 5277 ie_size = le16_to_cpu(ie->length); 5278 next_ie = (INDEX_ENTRY*)((u8*)ie + ie_size); 5279 /* 5280 * There are two types of index entry: Either it is a leaf entry, i.e. 5281 * it is in a leaf node and thus has no children or it is in an index 5282 * node entry, i.e. it is in an index block node and thus has children 5283 * which we need to deal with. If it is a leaf entry, skip onto leaf 5284 * entry deletion. 5285 */ 5286 if (!(ie->flags & INDEX_ENTRY_NODE)) 5287 goto delete_leaf_entry; 5288 /* 5289 * The node the entry to be deleted is in has sub-node(s), i.e. it is 5290 * not a leaf node, and the entry to be deleted is not a leaf entry. 5291 * We have to replace it with its predecessor (which by definition is a 5292 * leaf entry). 
There are three cases of increasing complexity: 5293 * 5294 * 1) The simplest case is when the predecessor is not the only entry 5295 * in its (leaf) node thus it can be simply removed without any tree 5296 * rebalancing needing to be done on its node and in addition the 5297 * predecessor entry can replace the entry to be deleted without 5298 * overflowing the node, i.e. there is enough space in the node we are 5299 * deleting the entry from to fit its predecessor entry thus it is a 5300 * matter of a simple replace without the need to change anything else 5301 * in the tree. 5302 * 5303 * 2) The slightly more complicated case is the same as above case 1) 5304 * except that there is not enough space in the node the entry to be 5305 * deleted is in to fit its predecessor entry, thus we need to split 5306 * the node the entry is being deleted from and promote the median 5307 * entry to the parent node which may then overflow the parent node and 5308 * so on up to the root node. A particular annoyance here is that 5309 * depending on the implementation details it is possible for the entry 5310 * to be deleted to be the median entry and thus be promoted to its 5311 * parent or alternatively it is possible for its predecessor entry 5312 * that is to replace the entry to be deleted to have to be promoted. 5313 * A further pitfall is that if the entry to be deleted is behind the 5314 * median entry, i.e. it is on the right of the median entry, then it 5315 * will be moved to a different node (the new right-hand node to the 5316 * old node being split) thus the replace needs to happen in a 5317 * different place. If we use ntfs_index_entry_add() as it is to take 5318 * care of the split&promote on insertion this last case would cause us 5319 * the problem that the pointers would be all out of date. We would 5320 * need ntfs_index_entry_add() to update the pointers and to switch the 5321 * right and left nodes in that case. 
Perhaps we need an 5322 * ntfs_index_entry_replace() that can share a lot of code with 5323 * ntfs_index_entry_add() perhaps even just a flag to 5324 * ntfs_index_entry_add() to indicate replace? Then we would not need 5325 * to worry about the pointers becoming out of date as the replace 5326 * would be done already for us. 5327 * 5328 * 3) The more complicated case is when the predecessor entry is the 5329 * last entry in its (leaf) node thus we cannot use it to replace the 5330 * entry to be deleted without rebalancing the tree. In this case we 5331 * have to rebalance the tree so that the predecessor entry is no 5332 * longer the last entry in its (leaf) node. Once that is successfully 5333 * done we have reduced the problem to either case 1) or case 2) above 5334 * or in fact to a completely different case (see below). The benefit 5335 * of doing the rebalancing first without regard to the delete and 5336 * replace that are to take place is that the tree ends up in a 5337 * consistent state, just a different one to what it was before, thus 5338 * we do not need to rollback to the original state any more thus error 5339 * handling is greatly simplified. The pitfall here is that doing the 5340 * balancing may profoundly change the tree to the extent that the 5341 * entry to be deleted may be moved to somewhere completely different 5342 * which we somehow need to be able to cope with. This can have 5343 * positive side effects, too, as it for example can lead to the entry 5344 * to be deleted being turned into a leaf entry thus its deletion turns 5345 * into a delete of a leaf entry thus the planned replace does not need 5346 * to happen at all. This is the "completely different case" mentioned 5347 * above. 
Usually this case will have been caused by a merge of two 5348 * neighbouring nodes with pulldown of the parent entry (which happens 5349 * to be the entry to be deleted) thus the entry to be deleted will not 5350 * only be a leaf entry but it will also not be the last entry in the 5351 * leaf node thus it can be deleted with a simple delete. Otherwise it 5352 * is not a disaster and we just need to rebalance the tree again as we 5353 * did just now but this time making sure that the entry to be deleted 5354 * is not the only entry in the node (compare to above where we were 5355 * making sure that the leaf predecessor entry is not the only entry in 5356 * its leaf node) and then we can proceed with a simple delete of the 5357 * leaf node. Because the tree is balanced at all steps we do not 5358 * actually care about handling the delete of the entry in the case 5359 * that it turned into a leaf entry and we just leave it to the leaf 5360 * entry deletion code to deal with. 5361 * 5362 * That is quite enough description now, let the games begin. First, 5363 * investigate the index entry to be deleted and its predecessor to 5364 * determine which of the above categories the deletion falls into. 5365 * 5366 * Locate the predecessor entry. Because the predecessor may be the 5367 * only entry in its (leaf) node in which case it would cause the tree 5368 * to be rebalanced, we record the path taken to the predecessor entry 5369 * by simply extending the existing tree path that points to the entry 5370 * to be deleted. 5371 */ 5372 err = ntfs_index_lookup_predecessor(ictx, &pred_ictx); 5373 if (err) { 5374 ntfs_error(idx_ni->vol->mp, "Failed to look up predecessor of " 5375 "index node entry to be deleted (error %d).", 5376 err); 5377 return err; 5378 } 5379 /* 5380 * If the predecessor is not the only entry other than the end entry in 5381 * its node we can simply remove it from its node and use it to replace 5382 * the entry to be deleted. 
5383 */ 5384 if (pred_ictx->nr_entries > 2) 5385 goto simple_replace; 5386 if (pred_ictx->nr_entries < 2) 5387 panic("%s(): pred_ictx->nr_entries < 2\n", __FUNCTION__); 5388 /* 5389 * The predecessor is the only entry (other than the end entry) in its 5390 * node thus we cannot simply remove it from its node as that would 5391 * cause the B+tree to become imbalanced. 5392 * 5393 * There are two different cases we need to consider for which we need 5394 * to look up the predecessor of the current predecessor leaf entry. 5395 * To do this we ascend the tree looking for a node with entries in it. 5396 * The first node containing any entries is the node containing the 5397 * predecessor entry (which is not a leaf entry). 5398 * 5399 * 1) If we reach the original node containing the entry to be deleted 5400 * without finding any non-empty nodes on the way the whole sub-tree 5401 * below the entry to be deleted will become empty after using the 5402 * predecessor leaf entry to replace the entry to be deleted. Thus we 5403 * can deallocate all nodes in this sub-tree and instead of replacing 5404 * the entry to be deleted we remove it altogether and we then move the 5405 * predecessor leaf entry that is now homeless in front of its new 5406 * successor leaf entry, thus effectively merging the node containing 5407 * the predecessor with its right-hand sibling. 5408 * 5409 * Note that the insertion of the predecessor leaf entry into the node 5410 * containing its successor leaf entry may require a split of the node 5411 * we are inserting into. 5412 * 5413 * 2) If we find a predecessor for the predecessor entry before we 5414 * reach the original node containing the entry to be deleted we break 5415 * what we need to do into two operations. First, we rebalance the 5416 * tree so that the predecessor entry of the entry to be deleted is no 5417 * longer the only entry in its leaf node. 
This simplifies the 5418 * deletion of the entry to a case we have already solved as we can now 5419 * simply remove the predecessor entry from its node and use it to 5420 * replace the entry to be deleted. Thus, second, we use the already 5421 * existing code to do a simple replace. Should that fail it does not 5422 * matter as we are leaving a fully consistent B+tree tree behind. 5423 * Breaking this case up into two means we incur a little bit of 5424 * overhead for the extra locking of the predecessor node and for the 5425 * extra memmove()s and memcpy() of the predecessor entry. But this 5426 * simplifies error handling and makes the code so much simpler that it 5427 * is definitely worth paying the price in overhead. 5428 * 5429 * Enough discussion, time to do it. Start by going up the tree 5430 * looking for a non-empty node or the original node containing the 5431 * entry to be deleted so we can determine if we have the above case 1 5432 * or 2. 5433 * 5434 * We must be at the bottom of the current tree path or things will get 5435 * very confused. 5436 */ 5437 if (!pred_ictx->down->is_root) 5438 panic("%s(): !pred_ictx->down->is_root\n", __FUNCTION__); 5439 parent_ictx = pred_ictx->up; 5440 put_ictx = deallocate_ictx = pred_ictx; 5441 ntfs_index_ctx_disconnect_reinit(pred_ictx); 5442 /* 5443 * Make a note of the size of the predecessor entry that we are going 5444 * to move and unlock it for now. 5445 */ 5446 move_ie_size = le16_to_cpu(pred_ictx->entry->length); 5447 ntfs_index_ctx_unlock(pred_ictx); 5448 /* Now ascend the tree until we find a non-empty node. */ 5449 while (parent_ictx->nr_entries <= 1) { 5450 if (parent_ictx->nr_entries != 1) 5451 panic("%s(): parent_ictx->nr_entries != 1\n", 5452 __FUNCTION__); 5453 /* 5454 * Empty index node. Move it to the list of index nodes to be 5455 * deallocated and proceed to its parent. 
5456 */ 5457 pred_ictx = parent_ictx; 5458 parent_ictx = parent_ictx->up; 5459 ntfs_index_ctx_move(pred_ictx, deallocate_ictx); 5460 } 5461 /* 5462 * We have a non-empty parent node. Check whether this is the node 5463 * containing the entry to be deleted (case 1) or whether it is an 5464 * intervening node (case 2). In the latter case we need to rearrange 5465 * the tree. 5466 */ 5467 if (parent_ictx != ictx) 5468 goto rearrange_tree; 5469 /* 5470 * Now that we know we have case 1, we need to find the leaf node 5471 * containing the successor entry of the entry to be deleted. To do 5472 * this we repoint the tree path to the entry on the right-hand side of 5473 * the entry to be deleted and descend down into the first entry of 5474 * each node until we reach the leaf-node. 5475 * 5476 * We need to lock the node containing the entry to be deleted and make 5477 * a note of it as we are going to delete it later. 5478 */ 5479 err = ntfs_index_ctx_relock(parent_ictx); 5480 if (err) { 5481 ntfs_error(idx_ni->vol->mp, "Failed to relock the parent " 5482 "index node (error %d).", err); 5483 goto put_err; 5484 } 5485 /* 5486 * NOTE: pull_down_* == to be deleted (we are just reusing existing 5487 * code and variables hence the (mis)naming... 5488 */ 5489 pull_down_entry = parent_ictx->follow_entry; 5490 pull_down_entry_nr = parent_ictx->entry_nr; 5491 /* 5492 * Repoint the path to the entry on the right-hand side of the entry to 5493 * be deleted so we can descend to the leaf node containing the 5494 * successor entry. 5495 */ 5496 parent_ictx->follow_entry = (INDEX_ENTRY*)((u8*)pull_down_entry + 5497 le16_to_cpu(pull_down_entry->length)); 5498 parent_ictx->entry_nr++; 5499 /* Finally descend to the leaf node containing the successor entry. 
*/ 5500 suc_ictx = parent_ictx; 5501 do { 5502 err = ntfs_index_descend_into_child_node(&suc_ictx); 5503 if (err) { 5504 ntfs_error(idx_ni->vol->mp, "Failed to descend to " 5505 "node containing successor entry " 5506 "(error %d).", err); 5507 goto put_err; 5508 } 5509 if (!suc_ictx->is_locked) 5510 panic("%s(): !suc_ictx->is_locked\n", __FUNCTION__); 5511 suc_ictx->follow_entry = suc_ictx->entries[0]; 5512 } while (suc_ictx->index->flags & INDEX_NODE); 5513 if (suc_ictx->follow_entry->flags & INDEX_ENTRY_NODE) 5514 panic("%s(): suc_ictx->follow_entry->flags & " 5515 "INDEX_ENTRY_NODE\n", __FUNCTION__); 5516 /* 5517 * Can the predecessor entry simply be inserted in front of the 5518 * successor leaf entry without overflowing the node? If not we need 5519 * to split the node and then restart the delete. 5520 */ 5521 if (move_ie_size > suc_ictx->bytes_free) { 5522split_and_restart_case_1: 5523 ntfs_debug("Splitting index node before add on delete (case " 5524 "1)."); 5525 ie_size = move_ie_size; 5526 goto split_and_restart; 5527 } 5528 /* 5529 * The predecessor entry can simply be inserted in front of the 5530 * successor leaf entry. 5531 * 5532 * We need to have locked both nodes to be able to do the insert thus 5533 * we may need to unlock the locked successor node to ensure that the 5534 * node lock is always taken in ascending page offset order so we avoid 5535 * deadlocks. Also we have to consider the case where the two index 5536 * nodes are in the same page in which case we need to share the index 5537 * lock. 5538 */ 5539 if (deallocate_ictx->is_locked) 5540 panic("%s(): deallocate_ictx->is_locked\n", __FUNCTION__); 5541 err = ntfs_index_ctx_lock_two(suc_ictx, deallocate_ictx); 5542 if (err) 5543 goto put_err; 5544 /* 5545 * We need to re-check the amount of free space as it can have changed 5546 * whilst we dropped the lock on @suc_ictx if that was necessary. 
When 5547 * this happens simply drop the lock on @deallocate_ictx if we took it 5548 * and go back to the previous code dealing with the not enough space 5549 * case. 5550 */ 5551 if (move_ie_size > suc_ictx->bytes_free) { 5552 if (deallocate_ictx->is_locked) 5553 ntfs_index_ctx_unlock(deallocate_ictx); 5554 goto split_and_restart_case_1; 5555 } 5556 /* 5557 * Both nodes are locked and this is a simple insert, so go ahead and 5558 * do it. We have already checked that there is enough space so it 5559 * cannot fail. 5560 * 5561 * Move all the entries out of the way to make space for the 5562 * predecessor entry to be copied in. 5563 */ 5564 err = ntfs_index_node_make_space(suc_ictx, move_ie_size); 5565 if (err) 5566 panic("%s(): err\n", __FUNCTION__); 5567 /* 5568 * Copy the predecessor index entry into the created space in front of 5569 * the successor entry. 5570 */ 5571 memcpy(suc_ictx->follow_entry, deallocate_ictx->entry, move_ie_size); 5572 /* Ensure the updates are written to disk. */ 5573 ntfs_index_entry_mark_dirty(suc_ictx); 5574 /* 5575 * We are done both with the predecessor and successor nodes so release 5576 * their locks. 5577 */ 5578 if (deallocate_ictx->is_locked) 5579 ntfs_index_ctx_unlock(deallocate_ictx); 5580 if (!suc_ictx->is_locked) 5581 panic("%s(): !suc_ictx->is_locked\n", __FUNCTION__); 5582 ntfs_index_ctx_unlock(suc_ictx); 5583 /* 5584 * We now have to lock the original node containing the entry to be 5585 * deleted and we need to switch it to be the current index context and 5586 * to point to the entry to be deleted. Note this is a simple delete 5587 * just like for a leaf entry as we have taken care of its child leaf 5588 * node(s) already and will deallocate it(them) later. 
5589 */ 5590 if (parent_ictx->is_locked) 5591 panic("%s(): parent_ictx->is_locked\n", __FUNCTION__); 5592 err = ntfs_index_ctx_relock(parent_ictx); 5593 if (err) { 5594 ntfs_error(idx_ni->vol->mp, "Failed to relock the parent " 5595 "index node (error %d).", err); 5596 /* 5597 * Undo the insertion of the predecessor entry in front of the 5598 * successor entry by moving the successor and all following 5599 * entries back into their old place and resetting the index 5600 * length to the old size. 5601 */ 5602 err = ntfs_index_ctx_relock(suc_ictx); 5603 if (err) { 5604 ntfs_error(idx_ni->vol->mp, "Failed to relock index " 5605 "context (error %d). Leaving corrupt " 5606 "index. Run chkdsk.", err); 5607 NVolSetErrors(idx_ni->vol); 5608 } else { 5609 /* 5610 * @suc_ictx cannot be the index root or the below 5611 * code would be wrong. 5612 */ 5613 if (suc_ictx->is_root) 5614 panic("%s(): suc_ctx->is_root\n", __FUNCTION__); 5615 ie = suc_ictx->follow_entry; 5616 ie_size = le16_to_cpu(ie->length); 5617 next_ie = (INDEX_ENTRY*)((u8*)ie + ie_size); 5618 ih = suc_ictx->index; 5619 new_ilen = le32_to_cpu(ih->index_length); 5620 memmove(ie, next_ie, new_ilen - ((u8*)next_ie - 5621 (u8*)ih)); 5622 ih->index_length = cpu_to_le32(new_ilen - ie_size); 5623 ntfs_index_entry_mark_dirty(suc_ictx); 5624 } 5625 goto put_err; 5626 } 5627 ictx = parent_ictx; 5628 parent_ictx->entry = parent_ictx->entries[pull_down_entry_nr]; 5629 parent_ictx->entry_nr = pull_down_entry_nr; 5630 goto prep_simple_delete; 5631rearrange_tree: 5632 /* 5633 * We now know we have case 2, i.e. we have a non-empty, intervening 5634 * node containing the predecessor (non-leaf) entry of the predecessor 5635 * entry of the entry to be deleted and we need to use it to rearrange 5636 * the tree such that the (leaf) predecessor entry of the entry to be 5637 * deleted no longer is the only entry in its (leaf) node. 
5638 * 5639 * To do this we merge the (leaf) node containing the predecessor entry 5640 * of the entry to be deleted with its left-hand sibling (leaf) node. 5641 * This involves pulling the (non-leaf) predecessor entry of the (leaf) 5642 * predecessor entry of the entry to be deleted down to behind its 5643 * (leaf) predecessor entry. 5644 * 5645 * Thus, we need to now locate the leaf node containing the predecessor 5646 * entry behind which we need to insert the two predecessor entries. 5647 * 5648 * To be able to do this we need to lock the node containing the (non- 5649 * leaf) predecessor of the predecessor of the entry to be deleted and 5650 * make a note of it as we are going to delete it later. We also need 5651 * to repoint the tree path to descend into the (non-leaf) predecessor 5652 * entry. 5653 */ 5654 if (parent_ictx->is_locked) 5655 panic("%s(): parent_ictx->is_locked\n", __FUNCTION__); 5656 if (parent_ictx->is_root) 5657 panic("%s(): parent_ictx->is_root\n", __FUNCTION__); 5658 err = ntfs_index_ctx_relock(parent_ictx); 5659 if (err) 5660 goto put_err; 5661 parent_ictx->entry_nr--; 5662 parent_ictx->follow_entry = parent_ictx->entries[parent_ictx->entry_nr]; 5663 pull_down_ie_size = le16_to_cpu(parent_ictx->follow_entry->length) - 5664 sizeof(VCN); 5665 /* Now descend to the leaf predecessor entry. 
*/ 5666 suc_ictx = parent_ictx; 5667 do { 5668 err = ntfs_index_descend_into_child_node(&suc_ictx); 5669 if (err) { 5670 ntfs_error(idx_ni->vol->mp, "Failed to descend to " 5671 "node containing leaf predecessor " 5672 "entry of non-leaf predecessor entry " 5673 "of leaf predecessor entry of entry " 5674 "to be deleted (error %d).", err); 5675 goto put_err; 5676 } 5677 if (!suc_ictx->is_locked) 5678 panic("%s(): !suc_ictx->is_locked\n", __FUNCTION__); 5679 suc_ictx->entry_nr = suc_ictx->nr_entries - 1; 5680 suc_ictx->follow_entry = suc_ictx->entries[suc_ictx->entry_nr]; 5681 } while (suc_ictx->index->flags & INDEX_NODE); 5682 if (suc_ictx->follow_entry->flags & INDEX_ENTRY_NODE) 5683 panic("%s(): suc_ictx->follow_entry->flags & " 5684 "INDEX_ENTRY_NODE\n", __FUNCTION__); 5685 /* 5686 * Can the entry to be pulled down and the original predecessor entry 5687 * to be merged in simply be inserted after the located predecessor 5688 * entry without overflowing the node? If not we need to split the 5689 * node and then restart the delete. 5690 */ 5691 if (pull_down_ie_size + move_ie_size > suc_ictx->bytes_free) { 5692split_and_restart_case_2: 5693 ntfs_debug("Splitting index node before add on delete (case " 5694 "2)."); 5695 ie_size = pull_down_ie_size + move_ie_size; 5696 goto split_and_restart; 5697 } 5698 /* 5699 * We know we have enough space so we cannot fail. Need to lock the 5700 * non-leaf predecessor that we want to pull down. 5701 */ 5702 if (parent_ictx->is_locked) 5703 panic("%s(): parent_ictx->is_locked\n", __FUNCTION__); 5704 err = ntfs_index_ctx_lock_two(suc_ictx, parent_ictx); 5705 if (err) 5706 goto put_err; 5707 /* 5708 * We need to re-check the amount of free space as it can have changed 5709 * whilst we dropped the lock on @suc_ictx if that was necessary. When 5710 * this happens simply drop the lock on @parent_ictx if we took it and 5711 * go back to the previous code dealing with the not enough space case. 
5712 */ 5713 if (pull_down_ie_size + move_ie_size > suc_ictx->bytes_free) { 5714 if (parent_ictx->is_locked) 5715 ntfs_index_ctx_unlock(parent_ictx); 5716 goto split_and_restart_case_2; 5717 } 5718 /* 5719 * Make enough space in the destination leaf node to accommodate both 5720 * the entries. 5721 */ 5722 err = ntfs_index_node_make_space(suc_ictx, pull_down_ie_size + 5723 move_ie_size); 5724 if (err) 5725 panic("%s(): err\n", __FUNCTION__); 5726 /* 5727 * Copy the non-leaf predecessor entry into place and convert it to a 5728 * leaf entry. 5729 */ 5730 ie = suc_ictx->follow_entry; 5731 pull_down_entry = parent_ictx->follow_entry; 5732 memcpy(ie, pull_down_entry, pull_down_ie_size); 5733 ie->length = cpu_to_le16(pull_down_ie_size); 5734 ie->flags &= ~INDEX_ENTRY_NODE; 5735 /* 5736 * Now delete the entry from its original node and transfer its VCN 5737 * sub-node pointer to the next entry (which is the end entry). 5738 */ 5739 vcn = *(leVCN*)((u8*)pull_down_entry + pull_down_ie_size); 5740 ih = parent_ictx->index; 5741 new_ilen = le32_to_cpu(ih->index_length) - (pull_down_ie_size + 5742 sizeof(VCN)); 5743 memmove(pull_down_entry, (u8*)pull_down_entry + pull_down_ie_size + 5744 sizeof(VCN), new_ilen - ((u8*)pull_down_entry - 5745 (u8*)ih)); 5746 *(leVCN*)((u8*)pull_down_entry + le16_to_cpu(pull_down_entry->length) - 5747 sizeof(VCN)) = vcn; 5748 /* Update the index size, too. */ 5749 ih->index_length = cpu_to_le32(new_ilen); 5750 /* Ensure the updates are written to disk. */ 5751 ntfs_index_entry_mark_dirty(parent_ictx); 5752 /* Update the index context as well. */ 5753 parent_ictx->bytes_free += pull_down_ie_size + sizeof(VCN); 5754 parent_ictx->nr_entries--; 5755 /* We are done with the non-leaf predecessor node so unlock it. */ 5756 if (parent_ictx->is_locked) 5757 ntfs_index_ctx_unlock(parent_ictx); 5758 /* 5759 * Now need to move over the original leaf predecessor entry. Lock its 5760 * node again. 
5761 */ 5762 if (deallocate_ictx->is_locked) 5763 panic("%s(): deallocate_ictx->is_locked\n", __FUNCTION__); 5764 err = ntfs_index_ctx_lock_two(suc_ictx, deallocate_ictx); 5765 if (err) 5766 goto put_err; 5767 /* Copy the original leaf predecessor entry into place. */ 5768 memcpy((u8*)suc_ictx->follow_entry + pull_down_ie_size, 5769 deallocate_ictx->follow_entry, move_ie_size); 5770 /* Ensure the updates are written to disk. */ 5771 ntfs_index_entry_mark_dirty(suc_ictx); 5772 /* We are finished rearranging the tree so unlock both nodes. */ 5773 ntfs_index_ctx_unlock(suc_ictx); 5774 if (deallocate_ictx->is_locked) 5775 ntfs_index_ctx_unlock(deallocate_ictx); 5776 /* 5777 * The last thing to do is to deallocate the disconnected index node(s) 5778 * and then we can restart the delete operation which now involves a 5779 * simple replace to complete the delete. 5780 */ 5781 parent_ictx = deallocate_ictx; 5782 do { 5783 if (parent_ictx->is_root) 5784 panic("%s(): parent_ictx->is_root\n", __FUNCTION__); 5785 err = ntfs_index_block_free(parent_ictx); 5786 if (err) { 5787 ntfs_error(idx_ni->vol->mp, "Failed to deallocate no " 5788 "longer used index block vcn 0x%llx " 5789 "(error %d). Run chkdsk to recover " 5790 "the lost index block.", 5791 (unsigned long long)parent_ictx->vcn, 5792 err); 5793 NVolSetErrors(idx_ni->vol); 5794 } 5795 } while ((parent_ictx = parent_ictx->up) != deallocate_ictx); 5796 /* 5797 * Release the sub-tree paths that the caller no longer has a reference 5798 * to. 5799 */ 5800 ntfs_index_ctx_put(put_ictx); 5801 /* Reset the state machine to original values. */ 5802 put_ictx = deallocate_ictx = NULL; 5803 /* 5804 * The tree is now fully consistent, no nodes are locked, and @suc_ictx 5805 * is the leaf node containing the predecessor entry of the entry to 5806 * be deleted and the predecessor entry no longer is the only entry 5807 * other than the end entry thus it can simply be used to replace the 5808 * entry to be deleted. 
5809 * 5810 * Setup everything so we can jump straight into the simple replace 5811 * code. Note we lock the original entry to be deleted as well so that 5812 * the simple replace code does not have to drop the lock on the 5813 * predecessor node in order to lock the original node which it may 5814 * have to do otherwise. 5815 * 5816 * The simple replace code requires the predecessor to be in 5817 * @pred_ictx. 5818 */ 5819 pred_ictx = suc_ictx; 5820 err = ntfs_index_ctx_lock_two(suc_ictx, ictx); 5821 if (err) 5822 return err; 5823 /* 5824 * Repoint the predecessor context to point to the correct entry and 5825 * update it to reflect the addition of the two index entries. 5826 */ 5827 pred_ictx->entry = (INDEX_ENTRY*)((u8*)pred_ictx->entry + 5828 pull_down_ie_size); 5829 pred_ictx->entry_nr++; 5830 pred_ictx->is_match = 1; 5831 pred_ictx->bytes_free -= pull_down_ie_size + move_ie_size; 5832 pred_ictx->nr_entries += 2; 5833 if (pred_ictx->nr_entries > pred_ictx->max_entries) 5834 panic("%s(): pred_ictx->nr_entries > pred_ictx->max_entries\n", 5835 __FUNCTION__); 5836 pred_ictx->entries[pred_ictx->entry_nr] = pred_ictx->entry; 5837 pred_ictx->entries[pred_ictx->entry_nr + 1] = (INDEX_ENTRY*)( 5838 (u8*)pred_ictx->entry + move_ie_size); 5839 /* Everything is set up. Do the simple replace. */ 5840 /* goto simple_replace; */ 5841simple_replace: 5842 /* 5843 * The predecessor can simply be removed from its node. 5844 * 5845 * We need to have locked both nodes to be able to do the replace thus 5846 * we may need to unlock the locked predecessor node to ensure that 5847 * page locks are only taken in ascending page offset order so we avoid 5848 * deadlocks. 
5849 */ 5850 if (!pred_ictx->is_locked) 5851 panic("%s(): !pred_ictx->is_locked\n", __FUNCTION__); 5852 if (!ictx->is_locked) { 5853 err = ntfs_index_ctx_lock_two(pred_ictx, ictx); 5854 if (err) 5855 return err; 5856 } 5857 /* 5858 * Can the predecessor simply replace the entry to be deleted without 5859 * overflowing the node containing the entry to be deleted? If not we 5860 * need to split the node and then restart the delete. 5861 */ 5862 pred_ie_size = le16_to_cpu(pred_ictx->entry->length); 5863 ie_size = le16_to_cpu(ictx->entry->length) - sizeof(VCN); 5864 if ((int)pred_ie_size - (int)ie_size > (int)ictx->bytes_free) { 5865 ntfs_debug("Splitting index node before add on delete (case " 5866 "3)."); 5867 /* 5868 * Drop the lock on the predecessor or transfer it to the node 5869 * containing the entry to be deleted if they share the same 5870 * page (in which case @ictx is not marked locked). 5871 */ 5872 if (!pred_ictx->is_locked) 5873 panic("%s(): !pred_ictx->is_locked\n", __FUNCTION__); 5874 if (ictx->is_locked) { 5875 /* 5876 * @ictx is locked thus it cannot be sharing the lock 5877 * with @pred_ictx. Unlock @pred_ictx. 5878 */ 5879 ntfs_index_ctx_unlock(pred_ictx); 5880 } else { 5881 /* 5882 * @ictx is not locked thus it must be sharing the lock 5883 * with @pred_ictx. Transfer the lock across. 5884 */ 5885 if (ictx->is_root) 5886 panic("%s(): ictx->is_root\n", __FUNCTION__); 5887 if (ictx->upl_ofs != pred_ictx->upl_ofs) 5888 panic("%s(): ictx->upl_ofs != " 5889 "pred_ictx->upl_ofs\n", 5890 __FUNCTION__); 5891 ictx->is_locked = 1; 5892 pred_ictx->is_locked = 0; 5893 } 5894 if (!ictx->is_locked) 5895 panic("%s(): !ictx->is_locked\n", __FUNCTION__); 5896 suc_ictx = ictx; 5897 ie_size = pred_ie_size - ie_size; 5898 goto split_and_restart; 5899 } 5900 /* 5901 * Both nodes are locked and this is a simple replace, so go ahead and 5902 * do it. We have already checked that there is enough space so it 5903 * cannot fail. 
5904 */ 5905 err = ntfs_index_entry_replace(ictx, pred_ictx); 5906 if (err) 5907 panic("%s(): err\n", __FUNCTION__); 5908 /* We are done with the current entry so release it. */ 5909 if (ictx->is_locked) 5910 ntfs_index_ctx_unlock(ictx); 5911 /* 5912 * We have now replaced the entry to be deleted but at the moment we 5913 * have two copies of the predecessor entry, so delete the original 5914 * copy from its (leaf) node. We need to switch the predecessor index 5915 * context to be the current context so we can just pretend it is the 5916 * entry to be deleted. We only need to setup the fields relevant to 5917 * index block node deletion as the predecessor cannot be in the index 5918 * root node. 5919 */ 5920 ictx = pred_ictx; 5921 goto prep_simple_delete; 5922delete_leaf_entry: 5923 /* 5924 * If the entry is in the index root or it is not the only entry (other 5925 * than the end entry) in the leaf node it can simply be removed. 5926 */ 5927 if (ictx->nr_entries > 2 || is_root) 5928 goto simple_delete; 5929 /* 5930 * The entry to be deleted is the only entry and it is not in the index 5931 * root. We have to rebalance the tree so that the entry to be deleted 5932 * is no longer the only entry in its node. There are several cases we 5933 * need to consider: 5934 * 5935 * 1) If the parent entry is not the end entry of its index node then 5936 * simply insert our parent entry at the front of the leaf node on 5937 * our right, i.e. the child node of the entry following our parent 5938 * entry. Then remove our parent entry from our parent node and 5939 * free the leaf node the entry to be deleted is in (thus deleting 5940 * the entry). 5941 * 5942 * What the above describes is effectively merging the leaf node 5943 * containing the entry to be deleted with its right-hand sibling 5944 * leaf node and in the process pulling down our parent entry into 5945 * the merged node and placing it in front of its successor entry. 
5946 * 5947 * Our parent node is of course turned from an index node entry to a 5948 * leaf entry as it is pulled down. 5949 * 5950 * Note how if our parent entry is the only entry other than the end 5951 * entry in its node this results in our parent node being empty, 5952 * i.e. containing only the end entry. This is fine for NTFS 5953 * B+trees, i.e. index nodes may be empty. It is only leaf nodes 5954 * that may not be empty. 5955 * 5956 * 2) As point 1) above but the parent entry of the entry to be removed 5957 * does not fit in the leaf node containing its successor thus that 5958 * node needs to be split and the split may need to propagate up the 5959 * tree which is of course the job of ntfs_index_entry_add(). 5960 * 5961 * 3) If the parent entry is the end entry of its index node, i.e. 5962 * there is no right-hand sibling leaf node then simply insert the 5963 * entry on the left of our parent entry at the end of its own child 5964 * node, i.e. the leaf node on our left. Then remove the entry on 5965 * the left of our parent entry that we inserted into its child node 5966 * from our parent node and change the VCN sub-node pointer of our 5967 * parent node to point to our left-hand sibling. Finally free the 5968 * leaf node the entry to be deleted is in which is no longer 5969 * referenced from anywhere (thus deleting the entry). 5970 * 5971 * What the above describes is effectively merging the leaf node 5972 * containing the entry to be deleted with its left-hand sibling 5973 * leaf node and in the process pulling down the entry on the 5974 * left-hand side of our parent into the merged node and placing it 5975 * after its predecessor entry. 5976 * 5977 * The left-hand sibling of our parent node is of course turned from 5978 * an index node entry to a leaf entry as it is pulled down. 
5979 * 5980 * Note how if the left-hand sibling of our parent entry is the only 5981 * entry other than the end entry, which is also our parent entry, 5982 * in its node this results in our parent node being empty, i.e. 5983 * containing only the end entry. This is fine for NTFS B+trees, 5984 * i.e. index nodes may be empty. It is only leaf nodes that may 5985 * not be empty. 5986 * 5987 * 4) As point 3) above but the left-hand sibling of the parent entry 5988 * of the entry to be removed does not fit in the leaf node 5989 * containing its predecessor thus that node needs to be split and 5990 * the split may need to propagate up the tree which is of course 5991 * the job of ntfs_index_entry_add(). 5992 * 5993 * 5) If the parent entry is the end entry of its index node, i.e. 5994 * there is no right-hand sibling leaf node, and there is no entry 5995 * on the left of our parent entry, i.e. there is no left-hand 5996 * sibling leaf node, i.e. the parent node is completely empty, we 5997 * cannot merge with a sibling as there is none thus we need to 5998 * record the current state and repeat a very similar procedure as 5999 * above points 1-4 except applied to the parent node which is not 6000 * a leaf node so we need to be careful with VCN sub-node pointers. 6001 * The aim of doing this is to merge the parent node with one of its 6002 * siblings so that it is no longer empty so that our leaf node 6003 * containing the entry to be deleted has at least one sibling so 6004 * that we can merge the leaf node with one of its siblings. If the 6005 * parent node of the parent node has no siblings either we need to 6006 * push the recorded state down into the stack of recorded states, 6007 * record the new current state, and then go to the next parent and 6008 * try to merge that with its sibling and so on until we find a 6009 * parent with a sibling that can be merged. 
There are two possible 6010 * outcomes (not counting errors): 6011 * a) The going up and merging will eventually succeed in which case 6012 * we go backwards through the stack of recorded states merging 6013 * nodes as we go along until we reach the last recorded state at 6014 * which point we have reduced the problem to one of the above 6015 * points 1-4 thus we repeat them to finish. 6016 * b) We reach the index root which by definition cannot have any 6017 * siblings. Also because we reached the index root it means 6018 * that the index root must be empty and the end entry is 6019 * pointing to the VCN sub-node we came from and because the 6020 * entry to be deleted is the last entry in its leaf-node and all 6021 * parent nodes including the index root are empty the entry to 6022 * be deleted is the very last entry of the directory thus it can 6023 * simply be removed by truncating the index allocation attribute 6024 * to zero size and if the index bitmap is non-resident 6025 * truncating it to zero size, too, and if the index bitmap is 6026 * resident zeroing its contents instead of truncating it and 6027 * finally the index root is switched to be a small index, which 6028 * is reflected in its index flags and in the removal of the VCN 6029 * sub-node pointer in the end entry of the index root. 6030 * NOTE: Instead of recursing, we do it in one go by simply freeing 6031 * all the index node blocks in the tree path until we find a 6032 * non-empty node, i.e. a node that can be merged, and we then 6033 * perform the merge by pulling down the appropriate index entry but 6034 * instead of pulling it down one level we pull it down to before its 6035 * successor (if merging to the right) or to after its predecessor (if 6036 * merging on the left) which by definition will be in a leaf node. 6037 * That give the same results as the recursion but it is much less 6038 * work to do and it affects less index nodes. 
6039 * NOTE2: After implementing NOTE above, we ditch the code handling 6040 * cases 1-4 above as they are just special cases of case 5 when 6041 * implemented like NOTE. 6042 * 6043 * That is quite enough description now, let the games begin. First, 6044 * investigate the parent index entry of the leaf entry to be deleted 6045 * to see whether it is the end entry or not to determine which of the 6046 * above categories the deletion falls into. 6047 * 6048 * We must be at the bottom of the current tree path or things will get 6049 * very confused. 6050 */ 6051 if (!ictx->down->is_root) 6052 panic("%s(): !ictx->down->is_root\n", __FUNCTION__); 6053 /* 6054 * We are going to disconnect @ictx thus the tree starting at the index 6055 * root is no longer referenced by the caller so we need to free it 6056 * later. 6057 */ 6058 put_ictx = ictx->down; 6059 /* 6060 * Obtain the parent node, unlock the current node, disconnect it from 6061 * the tree, and mark it for deallocation later. 6062 */ 6063 parent_ictx = ictx->up; 6064 ntfs_index_ctx_unlock(ictx); 6065 ntfs_index_ctx_disconnect_reinit(ictx); 6066 deallocate_ictx = ictx; 6067 /* 6068 * Investigate the parent node. If it is not empty we can immediately 6069 * proceed with the merge to the right or left. If it is empty then we 6070 * need to ascend the tree until we find a non-empty node or until we 6071 * reach an empty index root in which case the index is becoming empty 6072 * so we delete its contents by resetting it to zero size. 6073 */ 6074 while (parent_ictx->nr_entries <= 1) { 6075 if (parent_ictx->nr_entries != 1) 6076 panic("%s(): parent_ictx->nr_entries != 1\n", 6077 __FUNCTION__); 6078 if (parent_ictx->is_root) { 6079 /* 6080 * The index is becoming empty. To simplify things 6081 * merge the two sub-trees back together and then empty 6082 * the index. 
6083 */ 6084 deallocate_ictx->up->down = parent_ictx->down; 6085 parent_ictx->down->up = deallocate_ictx->up; 6086 deallocate_ictx->up = parent_ictx; 6087 parent_ictx->down = deallocate_ictx; 6088 return ntfs_index_make_empty(deallocate_ictx); 6089 } 6090 /* 6091 * Empty index node. Move it to the list of index nodes to be 6092 * deallocated and proceed to its parent. 6093 */ 6094 ictx = parent_ictx; 6095 parent_ictx = parent_ictx->up; 6096 ntfs_index_ctx_move(ictx, deallocate_ictx); 6097 } 6098 /* 6099 * We have a non-empty parent node which we need to lock as we need to 6100 * access its contents. 6101 */ 6102 if (parent_ictx->is_locked) 6103 panic("%s(): parent_ictx->is_locked\n", __FUNCTION__); 6104 err = ntfs_index_ctx_relock(parent_ictx); 6105 if (err) { 6106 ntfs_error(idx_ni->vol->mp, "Failed to relock the parent " 6107 "index node (error %d).", err); 6108 goto put_err; 6109 } 6110 /* INDEX_NODE == LARGE_INDEX */ 6111 if (!(parent_ictx->follow_entry->flags & INDEX_ENTRY_NODE) || 6112 !(parent_ictx->index->flags & INDEX_NODE)) 6113 panic("%s(): !(parent_ictx->follow_entry->flags & " 6114 "INDEX_ENTRY_NODE) || " 6115 "!(parent_ictx->index->flags & INDEX_NODE)\n", 6116 __FUNCTION__); 6117 /* 6118 * If the parent entry is the end entry in its node we cannot merge to 6119 * the right so merge to the left instead. 6120 */ 6121 if (parent_ictx->follow_entry->flags & INDEX_ENTRY_END) 6122 goto merge_left; 6123 /* 6124 * The parent entry is not the end entry in its node so merge on the 6125 * right. To do this we need to find the successor (leaf) entry for 6126 * which we need to repoint the path so we descend into the sub-node of 6127 * the entry on the right-hand side of the parent entry. 6128 * 6129 * We also make a note of the parent entry as we are going to pull it 6130 * down in front of its successor entry and we then have to delete it 6131 * from this node later. 
6132 */ 6133 pull_down_entry = parent_ictx->follow_entry; 6134 pull_down_entry_nr = parent_ictx->entry_nr; 6135 pull_down_ie_size = le16_to_cpu(pull_down_entry->length); 6136 parent_ictx->follow_entry = (INDEX_ENTRY*)((u8*)pull_down_entry + 6137 pull_down_ie_size); 6138 pull_down_ie_size -= sizeof(VCN); 6139 parent_ictx->entry_nr++; 6140 suc_ictx = parent_ictx; 6141 do { 6142 err = ntfs_index_descend_into_child_node(&suc_ictx); 6143 if (err) { 6144 ntfs_error(idx_ni->vol->mp, "Failed to descend to " 6145 "node containing successor entry " 6146 "(error %d).", err); 6147 goto put_err; 6148 } 6149 if (!suc_ictx->is_locked) 6150 panic("%s(): !suc_ictx->is_locked\n", __FUNCTION__); 6151 suc_ictx->follow_entry = suc_ictx->entries[0]; 6152 } while (suc_ictx->index->flags & INDEX_NODE); 6153 if (suc_ictx->follow_entry->flags & INDEX_ENTRY_NODE) 6154 panic("%s(): suc_ictx->follow_entry->flags & " 6155 "INDEX_ENTRY_NODE\n", __FUNCTION__); 6156 /* 6157 * Can the parent entry simply be inserted in front of its successor 6158 * leaf entry without overflowing the node? If not we need to split 6159 * the node and then restart the delete. 6160 */ 6161 if (pull_down_ie_size > suc_ictx->bytes_free) { 6162split_and_restart_case_4: 6163 ntfs_debug("Splitting index node before add on delete (case " 6164 "4)."); 6165 ie_size = pull_down_ie_size; 6166 goto split_and_restart; 6167 } 6168 /* 6169 * The parent entry can simply be inserted in front of its successor 6170 * leaf entry. 6171 * 6172 * We need to have locked both nodes to be able to do the insert thus 6173 * we may need to unlock the locked successor node. 6174 */ 6175 if (parent_ictx->is_locked) 6176 panic("%s(): parent_ictx->is_locked\n", __FUNCTION__); 6177 err = ntfs_index_ctx_lock_two(suc_ictx, parent_ictx); 6178 if (err) 6179 goto put_err; 6180 /* 6181 * We need to re-check the amount of free space as it can have changed 6182 * whilst we dropped the lock on @suc_ictx if that was necessary. 
When 6183 * this happens simply drop the lock on @parent_ictx if we took it and 6184 * go back to the previous code dealing with the not enough space case. 6185 */ 6186 if (pull_down_ie_size > suc_ictx->bytes_free) { 6187 if (parent_ictx->is_locked) 6188 ntfs_index_ctx_unlock(parent_ictx); 6189 goto split_and_restart_case_4; 6190 } 6191 /* 6192 * Both nodes are locked and this is a simple insert, so go ahead and 6193 * do it. We have already checked that there is enough space so it 6194 * cannot fail. 6195 * 6196 * Move all the entries out of the way to make space for the parent 6197 * entry to be copied in. 6198 */ 6199 err = ntfs_index_node_make_space(suc_ictx, pull_down_ie_size); 6200 if (err) 6201 panic("%s(): err\n", __FUNCTION__); 6202 /* 6203 * Copy the parent index entry into the created space in front of its 6204 * successor and switch the inserted entry to being a leaf entry. 6205 */ 6206 ie = suc_ictx->follow_entry; 6207 pull_down_entry = parent_ictx->entries[pull_down_entry_nr]; 6208 memcpy(ie, pull_down_entry, pull_down_ie_size); 6209 ie->length = cpu_to_le16(pull_down_ie_size); 6210 ie->flags &= ~INDEX_ENTRY_NODE; 6211 /* Ensure the updates are written to disk. */ 6212 ntfs_index_entry_mark_dirty(suc_ictx); 6213 /* 6214 * We are done with the successor node so release it or transfer it to 6215 * the parent node if they share the same page (in which case 6216 * @parent_ictx is not marked locked). 6217 */ 6218 if (!suc_ictx->is_locked) 6219 panic("%s(): !suc_ictx->is_locked\n", __FUNCTION__); 6220 if (parent_ictx->is_locked) { 6221 /* 6222 * @parent_ictx is locked thus it cannot be sharing the lock 6223 * with @suc_ictx. Unlock @suc_ictx. 6224 */ 6225 ntfs_index_ctx_unlock(suc_ictx); 6226 } else { 6227 /* 6228 * @parent_ictx is not locked thus it must be sharing the lock 6229 * with @suc_ictx. Transfer the lock across. 
6230 */ 6231 if (parent_ictx->is_root) 6232 panic("%s(): parent_ictx->is_root\n", __FUNCTION__); 6233 if (parent_ictx->upl_ofs != suc_ictx->upl_ofs) 6234 panic("%s(): parent_ictx->upl_ofs != " 6235 "suc_ictx->upl_ofs\n", __FUNCTION__); 6236 parent_ictx->is_locked = 1; 6237 suc_ictx->is_locked = 0; 6238 } 6239 if (!parent_ictx->is_locked) 6240 panic("%s(): parent_ictx->is_locked\n", __FUNCTION__); 6241 /* 6242 * We have now inserted the parent entry in front of its successor but 6243 * at the moment we have two copies of the parent entry, so delete the 6244 * original copy from the parent node. We need to switch the parent 6245 * index context to be the current context and to point to the parent 6246 * entry we pulled down so we can just pretend it is the entry to be 6247 * deleted. Note this is a simple delete just like for a leaf entry as 6248 * we have taken care of its child leaf node(s) already and will 6249 * deallocate it(them) later. 6250 */ 6251 ictx = parent_ictx; 6252 parent_ictx->entry = pull_down_entry; 6253 parent_ictx->entry_nr = pull_down_entry_nr; 6254 goto prep_simple_delete; 6255merge_left: 6256 /* 6257 * The parent entry is the end entry in its node so merge on the left. 6258 * To do this we need to find the predecessor (leaf) entry for which we 6259 * need to repoint the path so we descend into the sub-node of the 6260 * entry on the left-hand side of the parent entry. Once found we need 6261 * to pull down the new parent entry, i.e. the entry on the left of the 6262 * current parent entry, behind its predecessor entry. 6263 * 6264 * We also make a note of the original parent entry as we have to 6265 * change its VCN sub-node pointer later. 
6266 */ 6267 old_parent_entry_nr = parent_ictx->entry_nr; 6268 old_parent_ie_size = le16_to_cpu(parent_ictx->follow_entry->length) - 6269 sizeof(VCN); 6270 parent_ictx->entry_nr--; 6271 parent_ictx->follow_entry = 6272 parent_ictx->entries[parent_ictx->entry_nr]; 6273 pull_down_ie_size = le16_to_cpu(parent_ictx->follow_entry->length) - 6274 sizeof(VCN); 6275 suc_ictx = parent_ictx; 6276 do { 6277 err = ntfs_index_descend_into_child_node(&suc_ictx); 6278 if (err) { 6279 ntfs_error(idx_ni->vol->mp, "Failed to descend to " 6280 "node containing predecessor entry of " 6281 "left-hand sibling entry (error %d).", 6282 err); 6283 goto put_err; 6284 } 6285 if (!suc_ictx->is_locked) 6286 panic("%s(): !suc_ictx->is_locked\n", __FUNCTION__); 6287 suc_ictx->entry_nr = suc_ictx->nr_entries - 1; 6288 suc_ictx->follow_entry = suc_ictx->entries[suc_ictx->entry_nr]; 6289 } while (suc_ictx->index->flags & INDEX_NODE); 6290 if (suc_ictx->follow_entry->flags & INDEX_ENTRY_NODE) 6291 panic("%s(): suc_ictx->follow_entry->flags & " 6292 "INDEX_ENTRY_NODE\n", __FUNCTION__); 6293 /* 6294 * Can the entry on the left-hand side of the parent entry simply be 6295 * inserted in after its predecessor leaf entry, i.e. before the end 6296 * entry in the leaf node containing the predecessor leaf entry, 6297 * without overflowing the node? If not we need to split the node and 6298 * then restart the delete. 6299 */ 6300 if (pull_down_ie_size > suc_ictx->bytes_free) { 6301split_and_restart_case_5: 6302 ntfs_debug("Splitting index node before add on delete (case " 6303 "5)."); 6304 ie_size = pull_down_ie_size; 6305 goto split_and_restart; 6306 } 6307 /* 6308 * The entry on the left-hand side of the parent entry can simply be 6309 * inserted in front of the end entry of the leaf node containing its 6310 * predecessor. 
6311 * 6312 * We need to have locked both nodes to be able to do the insert thus 6313 * we may need to unlock the locked predecessor node to ensure that 6314 * page locks are only taken in ascending page index order so we avoid 6315 * deadlocks. 6316 */ 6317 if (parent_ictx->is_locked) 6318 panic("%s(): parent_ictx->is_locked\n", __FUNCTION__); 6319 err = ntfs_index_ctx_lock_two(suc_ictx, parent_ictx); 6320 if (err) 6321 goto put_err; 6322 /* 6323 * We need to re-check the amount of free space as it can have changed 6324 * whilst we dropped the lock on @suc_ictx if that was necessary. When 6325 * this happens simply drop the lock on @parent_ictx if we took it and 6326 * go back to the previous code dealing with the not enough space case. 6327 */ 6328 if (pull_down_ie_size > suc_ictx->bytes_free) { 6329 if (parent_ictx->is_locked) 6330 ntfs_index_ctx_unlock(parent_ictx); 6331 goto split_and_restart_case_5; 6332 } 6333 /* 6334 * Both nodes are locked and this is a simple insert, so go ahead and 6335 * do it. We have already checked that there is enough space so it 6336 * cannot fail. 6337 * 6338 * Move the end entry out of the way to make space for the entry to be 6339 * copied in. 6340 */ 6341 err = ntfs_index_node_make_space(suc_ictx, pull_down_ie_size); 6342 if (err) 6343 panic("%s(): err\n", __FUNCTION__); 6344 /* 6345 * Copy the parent index entry into the created space in front of its 6346 * predecessor and switch the inserted entry to being a leaf entry. 6347 */ 6348 ie = suc_ictx->follow_entry; 6349 memcpy(ie, parent_ictx->follow_entry, pull_down_ie_size); 6350 ie->length = cpu_to_le16(pull_down_ie_size); 6351 ie->flags &= ~INDEX_ENTRY_NODE; 6352 /* Ensure the updates are written to disk. */ 6353 ntfs_index_entry_mark_dirty(suc_ictx); 6354 /* 6355 * We are done with the predecessor node so release it or transfer it 6356 * to the parent node if they share the same page (in which case 6357 * @parent_ictx is not marked locked). 
6358 */ 6359 if (!suc_ictx->is_locked) 6360 panic("%s(): !suc_ictx->is_locked\n", __FUNCTION__); 6361 if (parent_ictx->is_locked) { 6362 /* 6363 * @parent_ictx is locked thus it cannot be sharing the lock 6364 * with @suc_ictx. Unlock @suc_ictx. 6365 */ 6366 ntfs_index_ctx_unlock(suc_ictx); 6367 } else { 6368 /* 6369 * @parent_ictx is not locked thus it must be sharing the lock 6370 * with @suc_ictx. Transfer the lock across. 6371 */ 6372 if (parent_ictx->is_root) 6373 panic("%s(): parent_ictx->is_root\n", __FUNCTION__); 6374 if (parent_ictx->upl_ofs != suc_ictx->upl_ofs) 6375 panic("%s(): parent_ictx->upl_ofs != " 6376 "suc_ictx->upl_ofs\n", __FUNCTION__); 6377 parent_ictx->is_locked = 1; 6378 suc_ictx->is_locked = 0; 6379 } 6380 if (!parent_ictx->is_locked) 6381 panic("%s(): parent_ictx->is_locked\n", __FUNCTION__); 6382 /* 6383 * We have now inserted the left-hand sibling entry of the parent entry 6384 * in front of the end entry of the leaf node containing its 6385 * predecessor entry but at the moment we have two copies of the entry, 6386 * so delete the original copy from its index node. 6387 * 6388 * Before we do that, update the VCN sub-node pointer of the original 6389 * parent entry to point to the index node that is about to become 6390 * parentless. Note we do not mark the index node dirty as that will 6391 * happen when the original copy of the entry we pulled down is 6392 * deleted. 6393 */ 6394 *(leVCN*)((u8*)parent_ictx->entries[old_parent_entry_nr] + 6395 old_parent_ie_size) = *(leVCN*)( 6396 (u8*)parent_ictx->follow_entry + pull_down_ie_size); 6397 /* 6398 * All that is left now is to delete the original copy of the entry we 6399 * pulled down. We need to switch the parent index context to be the 6400 * current context as it already points to the entry we pulled down 6401 * thus we can just pretend it is the entry to be deleted. 
Note this 6402 * is a simple delete just like for a leaf entry as we have taken care 6403 * of transfering its child leaf node(s) to the original parent entry 6404 * already and will deallocate the origial parent entry's child leaf 6405 * node(s) later. 6406 */ 6407 ictx = parent_ictx; 6408prep_simple_delete: 6409 ie = ictx->entry; 6410 is_root = ictx->is_root; 6411 ictx->is_match = 1; 6412 ih = ictx->index; 6413 ie_size = le16_to_cpu(ie->length); 6414 next_ie = (INDEX_ENTRY*)((u8*)ie + ie_size); 6415 /* goto simple_delete; */ 6416simple_delete: 6417 new_ilen = le32_to_cpu(ih->index_length) - ie_size; 6418 if (is_root) { 6419 ATTR_RECORD *a; 6420 u32 muse; 6421 6422 m = ictx->actx->m; 6423 muse = le32_to_cpu(m->bytes_in_use); 6424 /* 6425 * Move the index entries following @ie into @ie's position. 6426 * As an optimization we combine the moving of the index 6427 * entries and the resizing of the index root attribute into a 6428 * single operation. 6429 */ 6430 memmove(ie, next_ie, muse - ((u8*)next_ie - (u8*)m)); 6431 /* Adjust the mft record to reflect the change in used space. */ 6432 m->bytes_in_use = cpu_to_le32(muse - ie_size); 6433 /* 6434 * Adjust the attribute record to reflect the change in 6435 * attribute value and attribute record size. 6436 */ 6437 a = ictx->actx->a; 6438 a->length = cpu_to_le32(le32_to_cpu(a->length) - ie_size); 6439 a->value_length = cpu_to_le32(le32_to_cpu(a->value_length) - 6440 ie_size); 6441 /* Adjust the index header to reflect the change in length. */ 6442 ih->allocated_size = ih->index_length = cpu_to_le32(new_ilen); 6443 } else { 6444 /* Move index entries following @ie into @ie's position. */ 6445 memmove(ie, next_ie, new_ilen - ((u8*)ie - (u8*)ih)); 6446 /* Adjust the index header to reflect the change in length. */ 6447 ih->index_length = cpu_to_le32(new_ilen); 6448 } 6449 /* Ensure the updates are written to disk. 
*/ 6450 ntfs_index_entry_mark_dirty(ictx); 6451 /* 6452 * If we have scheduled any index block nodes to be deallocated do it 6453 * now but first unlock the node we just deleted an entry from so we do 6454 * not deadlock. 6455 */ 6456 if (deallocate_ictx) { 6457 ntfs_index_ctx_unlock(ictx); 6458 ictx = deallocate_ictx; 6459 do { 6460 if (ictx->is_root) 6461 panic("%s(): ictx->is_root\n", __FUNCTION__); 6462 err = ntfs_index_block_free(ictx); 6463 if (err) { 6464 ntfs_error(idx_ni->vol->mp, "Failed to " 6465 "deallocate no longer used " 6466 "index block VCN 0x%llx " 6467 "(error %d). Run chkdsk to " 6468 "recover the lost index " 6469 "block.", 6470 (unsigned long long)ictx->vcn, 6471 err); 6472 NVolSetErrors(idx_ni->vol); 6473 } 6474 } while ((ictx = ictx->up) != deallocate_ictx); 6475 } 6476 /* 6477 * Release any sub-tree paths that the caller no longer has a reference 6478 * to. 6479 */ 6480 if (put_ictx) 6481 ntfs_index_ctx_put(put_ictx); 6482 ntfs_debug("Done."); 6483 return 0; 6484split_and_restart: 6485 /* 6486 * Split the index node described by @suc_ictx taking into account the 6487 * fact that it is very likely that an entry of @ie_size will be added 6488 * to the index in front of the position pointed to by @suc_ictx. 6489 */ 6490 err = ntfs_index_node_split(suc_ictx, ie_size); 6491 if (!err) { 6492 /* 6493 * Signal to the caller that they need to restart the delete 6494 * from scratch. 6495 */ 6496 err = -EAGAIN; 6497 } 6498put_err: 6499 if (put_ictx) 6500 ntfs_index_ctx_put(put_ictx); 6501 return err; 6502} 6503