stats.c revision 299742
1/* stats.c -- implements the svn_fs_fs__get_stats private API. 2 * 3 * ==================================================================== 4 * Licensed to the Apache Software Foundation (ASF) under one 5 * or more contributor license agreements. See the NOTICE file 6 * distributed with this work for additional information 7 * regarding copyright ownership. The ASF licenses this file 8 * to you under the Apache License, Version 2.0 (the 9 * "License"); you may not use this file except in compliance 10 * with the License. You may obtain a copy of the License at 11 * 12 * http://www.apache.org/licenses/LICENSE-2.0 13 * 14 * Unless required by applicable law or agreed to in writing, 15 * software distributed under the License is distributed on an 16 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 17 * KIND, either express or implied. See the License for the 18 * specific language governing permissions and limitations 19 * under the License. 20 * ==================================================================== 21 */ 22 23#include "svn_dirent_uri.h" 24#include "svn_fs.h" 25#include "svn_pools.h" 26#include "svn_sorts.h" 27 28#include "private/svn_cache.h" 29#include "private/svn_sorts_private.h" 30#include "private/svn_string_private.h" 31#include "private/svn_fs_fs_private.h" 32 33#include "index.h" 34#include "pack.h" 35#include "rev_file.h" 36#include "util.h" 37#include "fs_fs.h" 38#include "cached_data.h" 39#include "low_level.h" 40 41#include "../libsvn_fs/fs-loader.h" 42 43#include "svn_private_config.h" 44 45/* We group representations into 2x2 different kinds plus one default: 46 * [dir / file] x [text / prop]. The assignment is done by the first node 47 * that references the respective representation. 48 */ 49typedef enum rep_kind_t 50{ 51 /* The representation is not used _directly_, i.e. not referenced by any 52 * noderev. However, some other representation may use it as delta base. 53 * Null value. Should not occur in real-word repositories. */ 54 unused_rep, 55 56 /* a properties on directory rep */ 57 dir_property_rep, 58 59 /* a properties on file rep */ 60 file_property_rep, 61 62 /* a directory rep */ 63 dir_rep, 64 65 /* a file rep */ 66 file_rep 67} rep_kind_t; 68 69/* A representation fragment. 70 */ 71typedef struct rep_stats_t 72{ 73 /* absolute offset in the file */ 74 apr_off_t offset; 75 76 /* item length in bytes */ 77 apr_uint64_t size; 78 79 /* item length after de-deltification */ 80 apr_uint64_t expanded_size; 81 82 /* revision that contains this representation 83 * (may be referenced by other revisions, though) */ 84 svn_revnum_t revision; 85 86 /* number of nodes that reference this representation */ 87 apr_uint32_t ref_count; 88 89 /* length of the PLAIN / DELTA line in the source file in bytes */ 90 apr_uint16_t header_size; 91 92 /* classification of the representation. values of rep_kind_t */ 93 char kind; 94 95} rep_stats_t; 96 97/* Represents a single revision. 98 * There will be only one instance per revision. */ 99typedef struct revision_info_t 100{ 101 /* number of this revision */ 102 svn_revnum_t revision; 103 104 /* pack file offset (manifest value), 0 for non-packed files */ 105 apr_off_t offset; 106 107 /* length of the changes list on bytes */ 108 apr_uint64_t changes_len; 109 110 /* offset of the changes list relative to OFFSET */ 111 apr_uint64_t change_count; 112 113 /* first offset behind the revision data in the pack file (file length 114 * for non-packed revs) */ 115 apr_off_t end; 116 117 /* number of directory noderevs in this revision */ 118 apr_uint64_t dir_noderev_count; 119 120 /* number of file noderevs in this revision */ 121 apr_uint64_t file_noderev_count; 122 123 /* total size of directory noderevs (i.e. the structs - not the rep) */ 124 apr_uint64_t dir_noderev_size; 125 126 /* total size of file noderevs (i.e. the structs - not the rep) */ 127 apr_uint64_t file_noderev_size; 128 129 /* all rep_stats_t of this revision (in no particular order), 130 * i.e. those that point back to this struct */ 131 apr_array_header_t *representations; 132 133 /* Temporary rev / pack file access object, used in phys. addressing 134 * mode only. NULL when done reading this revision. */ 135 svn_fs_fs__revision_file_t *rev_file; 136} revision_info_t; 137 138/* Root data structure containing all information about a given repository. 139 * We use it as a wrapper around svn_fs_t and pass it around where we would 140 * otherwise just use a svn_fs_t. 141 */ 142typedef struct query_t 143{ 144 /* FS API object*/ 145 svn_fs_t *fs; 146 147 /* The HEAD revision. */ 148 svn_revnum_t head; 149 150 /* Number of revs per shard; 0 for non-sharded repos. */ 151 int shard_size; 152 153 /* First non-packed revision. */ 154 svn_revnum_t min_unpacked_rev; 155 156 /* all revisions */ 157 apr_array_header_t *revisions; 158 159 /* empty representation. 160 * Used as a dummy base for DELTA reps without base. */ 161 rep_stats_t *null_base; 162 163 /* collected statistics */ 164 svn_fs_fs__stats_t *stats; 165 166 /* Progress notification callback to call after each shard. May be NULL. */ 167 svn_fs_progress_notify_func_t progress_func; 168 169 /* Baton for PROGRESS_FUNC. */ 170 void *progress_baton; 171 172 /* Cancellation support callback to call once in a while. May be NULL. */ 173 svn_cancel_func_t cancel_func; 174 175 /* Baton for CANCEL_FUNC. */ 176 void *cancel_baton; 177} query_t; 178 179/* Return the length of REV_FILE in *FILE_SIZE. 180 * Use SCRATCH_POOL for temporary allocations. 181 */ 182static svn_error_t * 183get_file_size(apr_off_t *file_size, 184 svn_fs_fs__revision_file_t *rev_file, 185 apr_pool_t *scratch_pool) 186{ 187 apr_finfo_t finfo; 188 189 SVN_ERR(svn_io_file_info_get(&finfo, APR_FINFO_SIZE, rev_file->file, 190 scratch_pool)); 191 192 *file_size = finfo.size; 193 return SVN_NO_ERROR; 194} 195 196/* Initialize the LARGEST_CHANGES member in STATS with a capacity of COUNT 197 * entries. Allocate the result in RESULT_POOL. 198 */ 199static void 200initialize_largest_changes(svn_fs_fs__stats_t *stats, 201 apr_size_t count, 202 apr_pool_t *result_pool) 203{ 204 apr_size_t i; 205 206 stats->largest_changes = apr_pcalloc(result_pool, 207 sizeof(*stats->largest_changes)); 208 stats->largest_changes->count = count; 209 stats->largest_changes->min_size = 1; 210 stats->largest_changes->changes 211 = apr_palloc(result_pool, count * sizeof(*stats->largest_changes->changes)); 212 213 /* allocate *all* entries before the path stringbufs. This increases 214 * cache locality and enhances performance significantly. */ 215 for (i = 0; i < count; ++i) 216 stats->largest_changes->changes[i] 217 = apr_palloc(result_pool, sizeof(**stats->largest_changes->changes)); 218 219 /* now initialize them and allocate the stringbufs */ 220 for (i = 0; i < count; ++i) 221 { 222 stats->largest_changes->changes[i]->size = 0; 223 stats->largest_changes->changes[i]->revision = SVN_INVALID_REVNUM; 224 stats->largest_changes->changes[i]->path 225 = svn_stringbuf_create_ensure(1024, result_pool); 226 } 227} 228 229/* Add entry for SIZE to HISTOGRAM. 230 */ 231static void 232add_to_histogram(svn_fs_fs__histogram_t *histogram, 233 apr_int64_t size) 234{ 235 apr_int64_t shift = 0; 236 237 while (((apr_int64_t)(1) << shift) <= size) 238 shift++; 239 240 histogram->total.count++; 241 histogram->total.sum += size; 242 histogram->lines[(apr_size_t)shift].count++; 243 histogram->lines[(apr_size_t)shift].sum += size; 244} 245 246/* Update data aggregators in STATS with this representation of type KIND, 247 * on-disk REP_SIZE and expanded node size EXPANDED_SIZE for PATH in REVSION. 248 * PLAIN_ADDED indicates whether the node has a deltification predecessor. 249 */ 250static void 251add_change(svn_fs_fs__stats_t *stats, 252 apr_uint64_t rep_size, 253 apr_uint64_t expanded_size, 254 svn_revnum_t revision, 255 const char *path, 256 rep_kind_t kind, 257 svn_boolean_t plain_added) 258{ 259 /* identify largest reps */ 260 if (rep_size >= stats->largest_changes->min_size) 261 { 262 apr_size_t i; 263 svn_fs_fs__largest_changes_t *largest_changes = stats->largest_changes; 264 svn_fs_fs__large_change_info_t *info 265 = largest_changes->changes[largest_changes->count - 1]; 266 info->size = rep_size; 267 info->revision = revision; 268 svn_stringbuf_set(info->path, path); 269 270 /* linear insertion but not too bad since count is low and insertions 271 * near the end are more likely than close to front */ 272 for (i = largest_changes->count - 1; i > 0; --i) 273 if (largest_changes->changes[i-1]->size >= rep_size) 274 break; 275 else 276 largest_changes->changes[i] = largest_changes->changes[i-1]; 277 278 largest_changes->changes[i] = info; 279 largest_changes->min_size 280 = largest_changes->changes[largest_changes->count-1]->size; 281 } 282 283 /* global histograms */ 284 add_to_histogram(&stats->rep_size_histogram, rep_size); 285 add_to_histogram(&stats->node_size_histogram, expanded_size); 286 287 if (plain_added) 288 { 289 add_to_histogram(&stats->added_rep_size_histogram, rep_size); 290 add_to_histogram(&stats->added_node_size_histogram, expanded_size); 291 } 292 293 /* specific histograms by type */ 294 switch (kind) 295 { 296 case unused_rep: 297 add_to_histogram(&stats->unused_rep_histogram, rep_size); 298 break; 299 case dir_property_rep: 300 add_to_histogram(&stats->dir_prop_rep_histogram, rep_size); 301 add_to_histogram(&stats->dir_prop_histogram, expanded_size); 302 break; 303 case file_property_rep: 304 add_to_histogram(&stats->file_prop_rep_histogram, rep_size); 305 add_to_histogram(&stats->file_prop_histogram, expanded_size); 306 break; 307 case dir_rep: 308 add_to_histogram(&stats->dir_rep_histogram, rep_size); 309 add_to_histogram(&stats->dir_histogram, expanded_size); 310 break; 311 case file_rep: 312 add_to_histogram(&stats->file_rep_histogram, rep_size); 313 add_to_histogram(&stats->file_histogram, expanded_size); 314 break; 315 } 316 317 /* by extension */ 318 if (kind == file_rep) 319 { 320 /* determine extension */ 321 svn_fs_fs__extension_info_t *info; 322 const char * file_name = strrchr(path, '/'); 323 const char * extension = file_name ? strrchr(file_name, '.') : NULL; 324 325 if (extension == NULL || extension == file_name + 1) 326 extension = "(none)"; 327 328 /* get / auto-insert entry for this extension */ 329 info = apr_hash_get(stats->by_extension, extension, APR_HASH_KEY_STRING); 330 if (info == NULL) 331 { 332 apr_pool_t *pool = apr_hash_pool_get(stats->by_extension); 333 info = apr_pcalloc(pool, sizeof(*info)); 334 info->extension = apr_pstrdup(pool, extension); 335 336 apr_hash_set(stats->by_extension, info->extension, 337 APR_HASH_KEY_STRING, info); 338 } 339 340 /* update per-extension histogram */ 341 add_to_histogram(&info->node_histogram, expanded_size); 342 add_to_histogram(&info->rep_histogram, rep_size); 343 } 344} 345 346/* Comparator used for binary search comparing the absolute file offset 347 * of a representation to some other offset. DATA is a *rep_stats_t, 348 * KEY is a pointer to an apr_off_t. 349 */ 350static int 351compare_representation_offsets(const void *data, const void *key) 352{ 353 apr_off_t lhs = (*(const rep_stats_t *const *)data)->offset; 354 apr_off_t rhs = *(const apr_off_t *)key; 355 356 if (lhs < rhs) 357 return -1; 358 return (lhs > rhs ? 1 : 0); 359} 360 361/* Find the revision_info_t object to the given REVISION in QUERY and 362 * return it in *REVISION_INFO. For performance reasons, we skip the 363 * lookup if the info is already provided. 364 * 365 * In that revision, look for the rep_stats_t object for offset OFFSET. 366 * If it already exists, set *IDX to its index in *REVISION_INFO's 367 * representations list and return the representation object. Otherwise, 368 * set the index to where it must be inserted and return NULL. 369 */ 370static rep_stats_t * 371find_representation(int *idx, 372 query_t *query, 373 revision_info_t **revision_info, 374 svn_revnum_t revision, 375 apr_off_t offset) 376{ 377 revision_info_t *info; 378 *idx = -1; 379 380 /* first let's find the revision */ 381 info = revision_info ? *revision_info : NULL; 382 if (info == NULL || info->revision != revision) 383 { 384 info = APR_ARRAY_IDX(query->revisions, revision, revision_info_t*); 385 if (revision_info) 386 *revision_info = info; 387 } 388 389 /* not found -> no result */ 390 if (info == NULL) 391 return NULL; 392 393 /* look for the representation */ 394 *idx = svn_sort__bsearch_lower_bound(info->representations, 395 &offset, 396 compare_representation_offsets); 397 if (*idx < info->representations->nelts) 398 { 399 /* return the representation, if this is the one we were looking for */ 400 rep_stats_t *result 401 = APR_ARRAY_IDX(info->representations, *idx, rep_stats_t *); 402 if (result->offset == offset) 403 return result; 404 } 405 406 /* not parsed, yet */ 407 return NULL; 408} 409 410/* Find / auto-construct the representation stats for REP in QUERY and 411 * return it in *REPRESENTATION. 412 * 413 * If necessary, allocate the result in RESULT_POOL; use SCRATCH_POOL for 414 * temporary allocations. 415 */ 416static svn_error_t * 417parse_representation(rep_stats_t **representation, 418 query_t *query, 419 representation_t *rep, 420 revision_info_t *revision_info, 421 apr_pool_t *result_pool, 422 apr_pool_t *scratch_pool) 423{ 424 rep_stats_t *result; 425 int idx; 426 427 /* read location (revision, offset) and size */ 428 429 /* look it up */ 430 result = find_representation(&idx, query, &revision_info, rep->revision, 431 (apr_off_t)rep->item_index); 432 if (!result) 433 { 434 /* not parsed, yet (probably a rep in the same revision). 435 * Create a new rep object and determine its base rep as well. 436 */ 437 result = apr_pcalloc(result_pool, sizeof(*result)); 438 result->revision = rep->revision; 439 result->expanded_size = (rep->expanded_size ? rep->expanded_size 440 : rep->size); 441 result->offset = (apr_off_t)rep->item_index; 442 result->size = rep->size; 443 444 /* In phys. addressing mode, follow link to the actual representation. 445 * In log. addressing mode, we will find it already as part of our 446 * linear walk through the whole file. */ 447 if (!svn_fs_fs__use_log_addressing(query->fs)) 448 { 449 svn_fs_fs__rep_header_t *header; 450 apr_off_t offset = revision_info->offset + result->offset; 451 452 SVN_ERR_ASSERT(revision_info->rev_file); 453 SVN_ERR(svn_io_file_seek(revision_info->rev_file->file, APR_SET, 454 &offset, scratch_pool)); 455 SVN_ERR(svn_fs_fs__read_rep_header(&header, 456 revision_info->rev_file->stream, 457 scratch_pool, scratch_pool)); 458 459 result->header_size = header->header_size; 460 } 461 462 svn_sort__array_insert(revision_info->representations, &result, idx); 463 } 464 465 *representation = result; 466 467 return SVN_NO_ERROR; 468} 469 470 471/* forward declaration */ 472static svn_error_t * 473read_noderev(query_t *query, 474 svn_stringbuf_t *noderev_str, 475 revision_info_t *revision_info, 476 apr_pool_t *result_pool, 477 apr_pool_t *scratch_pool); 478 479/* Read the noderev item at OFFSET in REVISION_INFO from the filesystem 480 * provided by QUERY. Return it in *NODEREV, allocated in RESULT_POOL. 481 * Use SCRATCH_POOL for temporary allocations. 482 * 483 * The textual representation of the noderev will be used to determine 484 * the on-disk size of the noderev. Only called in phys. addressing mode. 485 */ 486static svn_error_t * 487read_phsy_noderev(svn_stringbuf_t **noderev, 488 query_t *query, 489 apr_off_t offset, 490 revision_info_t *revision_info, 491 apr_pool_t *result_pool, 492 apr_pool_t *scratch_pool) 493{ 494 svn_stringbuf_t *noderev_str = svn_stringbuf_create_empty(result_pool); 495 svn_stringbuf_t *line; 496 svn_boolean_t eof; 497 498 apr_pool_t *iterpool = svn_pool_create(scratch_pool); 499 500 /* Navigate the file stream to the start of noderev. */ 501 SVN_ERR_ASSERT(revision_info->rev_file); 502 503 offset += revision_info->offset; 504 SVN_ERR(svn_io_file_seek(revision_info->rev_file->file, APR_SET, 505 &offset, scratch_pool)); 506 507 /* Read it (terminated by an empty line) */ 508 do 509 { 510 svn_pool_clear(iterpool); 511 512 SVN_ERR(svn_stream_readline(revision_info->rev_file->stream, &line, 513 "\n", &eof, iterpool)); 514 svn_stringbuf_appendstr(noderev_str, line); 515 svn_stringbuf_appendbyte(noderev_str, '\n'); 516 } 517 while (line->len > 0 && !eof); 518 519 /* Return the result. */ 520 *noderev = noderev_str; 521 522 svn_pool_destroy(iterpool); 523 524 return SVN_NO_ERROR; 525} 526 527/* Starting at the directory in NODEREV's text, read all DAG nodes, 528 * directories and representations linked in that tree structure. 529 * Store them in QUERY and REVISION_INFO. Also, read them only once. 530 * 531 * Use RESULT_POOL for persistent allocations and SCRATCH_POOL for 532 * temporaries. 533 */ 534static svn_error_t * 535parse_dir(query_t *query, 536 node_revision_t *noderev, 537 revision_info_t *revision_info, 538 apr_pool_t *result_pool, 539 apr_pool_t *scratch_pool) 540{ 541 apr_pool_t *iterpool = svn_pool_create(scratch_pool); 542 543 int i; 544 apr_array_header_t *entries; 545 SVN_ERR(svn_fs_fs__rep_contents_dir(&entries, query->fs, noderev, 546 scratch_pool, scratch_pool)); 547 548 for (i = 0; i < entries->nelts; ++i) 549 { 550 svn_fs_dirent_t *dirent = APR_ARRAY_IDX(entries, i, svn_fs_dirent_t *); 551 552 if (svn_fs_fs__id_rev(dirent->id) == revision_info->revision) 553 { 554 svn_stringbuf_t *noderev_str; 555 svn_pool_clear(iterpool); 556 557 SVN_ERR(read_phsy_noderev(&noderev_str, query, 558 svn_fs_fs__id_item(dirent->id), 559 revision_info, iterpool, iterpool)); 560 SVN_ERR(read_noderev(query, noderev_str, revision_info, 561 result_pool, iterpool)); 562 } 563 } 564 565 svn_pool_destroy(iterpool); 566 567 return SVN_NO_ERROR; 568} 569 570/* Parse the noderev given as NODEREV_STR and store the info in QUERY and 571 * REVISION_INFO. In phys. addressing mode, continue reading all DAG nodes, 572 * directories and representations linked in that tree structure. 573 * 574 * Use RESULT_POOL for persistent allocations and SCRATCH_POOL for 575 * temporaries. 576 */ 577static svn_error_t * 578read_noderev(query_t *query, 579 svn_stringbuf_t *noderev_str, 580 revision_info_t *revision_info, 581 apr_pool_t *result_pool, 582 apr_pool_t *scratch_pool) 583{ 584 rep_stats_t *text = NULL; 585 rep_stats_t *props = NULL; 586 node_revision_t *noderev; 587 588 svn_stream_t *stream = svn_stream_from_stringbuf(noderev_str, scratch_pool); 589 SVN_ERR(svn_fs_fs__read_noderev(&noderev, stream, scratch_pool, 590 scratch_pool)); 591 592 if (noderev->data_rep) 593 { 594 SVN_ERR(parse_representation(&text, query, 595 noderev->data_rep, revision_info, 596 result_pool, scratch_pool)); 597 598 /* if we are the first to use this rep, mark it as "text rep" */ 599 if (++text->ref_count == 1) 600 text->kind = noderev->kind == svn_node_dir ? dir_rep : file_rep; 601 } 602 603 if (noderev->prop_rep) 604 { 605 SVN_ERR(parse_representation(&props, query, 606 noderev->prop_rep, revision_info, 607 result_pool, scratch_pool)); 608 609 /* if we are the first to use this rep, mark it as "prop rep" */ 610 if (++props->ref_count == 1) 611 props->kind = noderev->kind == svn_node_dir ? dir_property_rep 612 : file_property_rep; 613 } 614 615 /* record largest changes */ 616 if (text && text->ref_count == 1) 617 add_change(query->stats, text->size, text->expanded_size, text->revision, 618 noderev->created_path, text->kind, !noderev->predecessor_id); 619 if (props && props->ref_count == 1) 620 add_change(query->stats, props->size, props->expanded_size, 621 props->revision, noderev->created_path, props->kind, 622 !noderev->predecessor_id); 623 624 /* if this is a directory and has not been processed, yet, read and 625 * process it recursively */ 626 if ( noderev->kind == svn_node_dir && text && text->ref_count == 1 627 && !svn_fs_fs__use_log_addressing(query->fs)) 628 SVN_ERR(parse_dir(query, noderev, revision_info, result_pool, 629 scratch_pool)); 630 631 /* update stats */ 632 if (noderev->kind == svn_node_dir) 633 { 634 revision_info->dir_noderev_size += noderev_str->len; 635 revision_info->dir_noderev_count++; 636 } 637 else 638 { 639 revision_info->file_noderev_size += noderev_str->len; 640 revision_info->file_noderev_count++; 641 } 642 643 return SVN_NO_ERROR; 644} 645 646/* For the revision given as REVISION_INFO within QUERY, determine the number 647 * of entries in its changed paths list and store that info in REVISION_INFO. 648 * Use SCRATCH_POOL for temporary allocations. 649 */ 650static svn_error_t * 651get_phys_change_count(query_t *query, 652 revision_info_t *revision_info, 653 apr_pool_t *scratch_pool) 654{ 655 /* We are going to use our own sub-pool here because the changes object 656 * may well be >100MB and SCRATCH_POOL may not get cleared until all other 657 * info has been read by read_phys_revision(). Therefore, tidy up early. 658 */ 659 apr_pool_t *subpool = svn_pool_create(scratch_pool); 660 apr_array_header_t *changes; 661 662 SVN_ERR(svn_fs_fs__get_changes(&changes, query->fs, 663 revision_info->revision, subpool)); 664 revision_info->change_count = changes->nelts; 665 666 /* Release potentially tons of memory. */ 667 svn_pool_destroy(subpool); 668 669 return SVN_NO_ERROR; 670} 671 672/* Read header information for the revision stored in FILE_CONTENT (one 673 * whole revision). Return the offsets within FILE_CONTENT for the 674 * *ROOT_NODEREV, the list of *CHANGES and its len in *CHANGES_LEN. 675 * Use POOL for temporary allocations. */ 676static svn_error_t * 677read_phys_revision(query_t *query, 678 revision_info_t *info, 679 apr_pool_t *result_pool, 680 apr_pool_t *scratch_pool) 681{ 682 char buf[64]; 683 apr_off_t root_node_offset; 684 apr_off_t changes_offset; 685 svn_stringbuf_t *trailer; 686 svn_stringbuf_t *noderev_str; 687 688 /* Read the last 64 bytes of the revision (if long enough). */ 689 apr_off_t start = MAX(info->offset, info->end - sizeof(buf)); 690 apr_size_t len = (apr_size_t)(info->end - start); 691 SVN_ERR(svn_io_file_seek(info->rev_file->file, APR_SET, &start, 692 scratch_pool)); 693 SVN_ERR(svn_io_file_read_full2(info->rev_file->file, buf, len, NULL, NULL, 694 scratch_pool)); 695 trailer = svn_stringbuf_ncreate(buf, len, scratch_pool); 696 697 /* Parse that trailer. */ 698 SVN_ERR(svn_fs_fs__parse_revision_trailer(&root_node_offset, 699 &changes_offset, trailer, 700 info->revision)); 701 SVN_ERR(get_phys_change_count(query, info, scratch_pool)); 702 703 /* Calculate the length of the changes list. */ 704 trailer = svn_fs_fs__unparse_revision_trailer(root_node_offset, 705 changes_offset, 706 scratch_pool); 707 info->changes_len = info->end - info->offset - changes_offset 708 - trailer->len; 709 710 /* Recursively read nodes added in this rev. */ 711 SVN_ERR(read_phsy_noderev(&noderev_str, query, root_node_offset, info, 712 scratch_pool, scratch_pool)); 713 SVN_ERR(read_noderev(query, noderev_str, info, result_pool, scratch_pool)); 714 715 return SVN_NO_ERROR; 716} 717 718/* Read the content of the pack file staring at revision BASE physical 719 * addressing mode and store it in QUERY. 720 * 721 * Use RESULT_POOL for persistent allocations and SCRATCH_POOL for 722 * temporaries. 723 */ 724static svn_error_t * 725read_phys_pack_file(query_t *query, 726 svn_revnum_t base, 727 apr_pool_t *result_pool, 728 apr_pool_t *scratch_pool) 729{ 730 apr_pool_t *iterpool = svn_pool_create(scratch_pool); 731 int i; 732 apr_off_t file_size = 0; 733 svn_fs_fs__revision_file_t *rev_file; 734 735 SVN_ERR(svn_fs_fs__open_pack_or_rev_file(&rev_file, query->fs, base, 736 scratch_pool, scratch_pool)); 737 SVN_ERR(get_file_size(&file_size, rev_file, scratch_pool)); 738 739 /* process each revision in the pack file */ 740 for (i = 0; i < query->shard_size; ++i) 741 { 742 revision_info_t *info; 743 744 /* cancellation support */ 745 if (query->cancel_func) 746 SVN_ERR(query->cancel_func(query->cancel_baton)); 747 748 /* create the revision info for the current rev */ 749 info = apr_pcalloc(result_pool, sizeof(*info)); 750 info->representations = apr_array_make(result_pool, 4, 751 sizeof(rep_stats_t*)); 752 info->rev_file = rev_file; 753 754 info->revision = base + i; 755 SVN_ERR(svn_fs_fs__get_packed_offset(&info->offset, query->fs, base + i, 756 iterpool)); 757 if (i + 1 == query->shard_size) 758 info->end = file_size; 759 else 760 SVN_ERR(svn_fs_fs__get_packed_offset(&info->end, query->fs, 761 base + i + 1, iterpool)); 762 763 SVN_ERR(read_phys_revision(query, info, result_pool, iterpool)); 764 765 info->representations = apr_array_copy(result_pool, 766 info->representations); 767 768 /* Done with this revision. */ 769 info->rev_file = NULL; 770 771 /* put it into our container */ 772 APR_ARRAY_PUSH(query->revisions, revision_info_t*) = info; 773 774 /* destroy temps */ 775 svn_pool_clear(iterpool); 776 } 777 778 /* Done with this pack file. */ 779 SVN_ERR(svn_fs_fs__close_revision_file(rev_file)); 780 781 /* one more pack file processed */ 782 if (query->progress_func) 783 query->progress_func(base, query->progress_baton, scratch_pool); 784 785 return SVN_NO_ERROR; 786} 787 788/* Read the content of the file for REVISION in physical addressing mode 789 * and store its contents in QUERY. 790 * 791 * Use RESULT_POOL for persistent allocations and SCRATCH_POOL for 792 * temporaries. 793 */ 794static svn_error_t * 795read_phys_revision_file(query_t *query, 796 svn_revnum_t revision, 797 apr_pool_t *result_pool, 798 apr_pool_t *scratch_pool) 799{ 800 revision_info_t *info = apr_pcalloc(result_pool, sizeof(*info)); 801 apr_off_t file_size = 0; 802 svn_fs_fs__revision_file_t *rev_file; 803 804 /* cancellation support */ 805 if (query->cancel_func) 806 SVN_ERR(query->cancel_func(query->cancel_baton)); 807 808 /* read the whole pack file into memory */ 809 SVN_ERR(svn_fs_fs__open_pack_or_rev_file(&rev_file, query->fs, revision, 810 scratch_pool, scratch_pool)); 811 SVN_ERR(get_file_size(&file_size, rev_file, scratch_pool)); 812 813 /* create the revision info for the current rev */ 814 info->representations = apr_array_make(result_pool, 4, sizeof(rep_stats_t*)); 815 816 info->rev_file = rev_file; 817 info->revision = revision; 818 info->offset = 0; 819 info->end = file_size; 820 821 SVN_ERR(read_phys_revision(query, info, result_pool, scratch_pool)); 822 823 /* Done with this revision. */ 824 SVN_ERR(svn_fs_fs__close_revision_file(rev_file)); 825 info->rev_file = NULL; 826 827 /* put it into our container */ 828 APR_ARRAY_PUSH(query->revisions, revision_info_t*) = info; 829 830 /* show progress every 1000 revs or so */ 831 if (query->progress_func) 832 { 833 if (query->shard_size && (revision % query->shard_size == 0)) 834 query->progress_func(revision, query->progress_baton, scratch_pool); 835 if (!query->shard_size && (revision % 1000 == 0)) 836 query->progress_func(revision, query->progress_baton, scratch_pool); 837 } 838 839 return SVN_NO_ERROR; 840} 841 842/* Given the unparsed changes list in CHANGES with LEN chars, return the 843 * number of changed paths encoded in it. Only used in log. addressing 844 * mode. 845 */ 846static apr_uint64_t 847get_log_change_count(const char *changes, 848 apr_size_t len) 849{ 850 apr_size_t lines = 0; 851 const char *end = changes + len; 852 853 /* line count */ 854 for (; changes < end; ++changes) 855 if (*changes == '\n') 856 ++lines; 857 858 /* two lines per change */ 859 return lines / 2; 860} 861 862/* Read the item described by ENTRY from the REV_FILE and return the 863 * respective byte sequence in *CONTENTS, allocated in RESULT_POOL. 864 * Use SCRATCH_POOL for temporary allocations 865 */ 866static svn_error_t * 867read_item(svn_stringbuf_t **contents, 868 svn_fs_fs__revision_file_t *rev_file, 869 svn_fs_fs__p2l_entry_t *entry, 870 apr_pool_t *result_pool, 871 apr_pool_t *scratch_pool) 872{ 873 svn_stringbuf_t *item = svn_stringbuf_create_ensure(entry->size, 874 result_pool); 875 item->len = entry->size; 876 item->data[item->len] = 0; 877 878 SVN_ERR(svn_io_file_aligned_seek(rev_file->file, rev_file->block_size, 879 NULL, entry->offset, scratch_pool)); 880 SVN_ERR(svn_io_file_read_full2(rev_file->file, item->data, item->len, 881 NULL, NULL, scratch_pool)); 882 883 *contents = item; 884 885 return SVN_NO_ERROR; 886} 887 888/* Process the logically addressed revision contents of revisions BASE to 889 * BASE + COUNT - 1 in QUERY. 890 * 891 * Use RESULT_POOL for persistent allocations and SCRATCH_POOL for 892 * temporaries. 893 */ 894static svn_error_t * 895read_log_rev_or_packfile(query_t *query, 896 svn_revnum_t base, 897 int count, 898 apr_pool_t *result_pool, 899 apr_pool_t *scratch_pool) 900{ 901 fs_fs_data_t *ffd = query->fs->fsap_data; 902 apr_pool_t *iterpool = svn_pool_create(scratch_pool); 903 apr_off_t max_offset; 904 apr_off_t offset = 0; 905 int i; 906 svn_fs_fs__revision_file_t *rev_file; 907 908 /* we will process every revision in the rev / pack file */ 909 for (i = 0; i < count; ++i) 910 { 911 /* create the revision info for the current rev */ 912 revision_info_t *info = apr_pcalloc(result_pool, sizeof(*info)); 913 info->representations = apr_array_make(result_pool, 4, 914 sizeof(rep_stats_t*)); 915 info->revision = base + i; 916 917 APR_ARRAY_PUSH(query->revisions, revision_info_t*) = info; 918 } 919 920 /* open the pack / rev file that is covered by the p2l index */ 921 SVN_ERR(svn_fs_fs__open_pack_or_rev_file(&rev_file, query->fs, base, 922 scratch_pool, iterpool)); 923 SVN_ERR(svn_fs_fs__p2l_get_max_offset(&max_offset, query->fs, rev_file, 924 base, scratch_pool)); 925 926 /* record the whole pack size in the first rev so the total sum will 927 still be correct */ 928 APR_ARRAY_IDX(query->revisions, base, revision_info_t*)->end = max_offset; 929 930 /* for all offsets in the file, get the P2L index entries and process 931 the interesting items (change lists, noderevs) */ 932 for (offset = 0; offset < max_offset; ) 933 { 934 apr_array_header_t *entries; 935 936 svn_pool_clear(iterpool); 937 938 /* cancellation support */ 939 if (query->cancel_func) 940 SVN_ERR(query->cancel_func(query->cancel_baton)); 941 942 /* get all entries for the current block */ 943 SVN_ERR(svn_fs_fs__p2l_index_lookup(&entries, query->fs, rev_file, base, 944 offset, ffd->p2l_page_size, 945 iterpool, iterpool)); 946 947 /* process all entries (and later continue with the next block) */ 948 for (i = 0; i < entries->nelts; ++i) 949 { 950 svn_fs_fs__p2l_entry_t *entry 951 = &APR_ARRAY_IDX(entries, i, svn_fs_fs__p2l_entry_t); 952 953 /* skip bits we previously processed */ 954 if (i == 0 && entry->offset < offset) 955 continue; 956 957 /* skip zero-sized entries */ 958 if (entry->size == 0) 959 continue; 960 961 /* read and process interesting items */ 962 if (entry->type == SVN_FS_FS__ITEM_TYPE_NODEREV) 963 { 964 svn_stringbuf_t *item; 965 revision_info_t *info = APR_ARRAY_IDX(query->revisions, 966 entry->item.revision, 967 revision_info_t*); 968 SVN_ERR(read_item(&item, rev_file, entry, iterpool, iterpool)); 969 SVN_ERR(read_noderev(query, item, info, result_pool, iterpool)); 970 } 971 else if (entry->type == SVN_FS_FS__ITEM_TYPE_CHANGES) 972 { 973 svn_stringbuf_t *item; 974 revision_info_t *info = APR_ARRAY_IDX(query->revisions, 975 entry->item.revision, 976 revision_info_t*); 977 SVN_ERR(read_item(&item, rev_file, entry, iterpool, iterpool)); 978 info->change_count 979 = get_log_change_count(item->data + 0, item->len); 980 info->changes_len += entry->size; 981 } 982 983 /* advance offset */ 984 offset += entry->size; 985 } 986 } 987 988 /* clean up and close file handles */ 989 svn_pool_destroy(iterpool); 990 991 return SVN_NO_ERROR; 992} 993 994/* Read the content of the pack file staring at revision BASE logical 995 * addressing mode and store it in QUERY. 996 * 997 * Use RESULT_POOL for persistent allocations and SCRATCH_POOL for 998 * temporaries. 999 */ 1000static svn_error_t * 1001read_log_pack_file(query_t *query, 1002 svn_revnum_t base, 1003 apr_pool_t *result_pool, 1004 apr_pool_t *scratch_pool) 1005{ 1006 SVN_ERR(read_log_rev_or_packfile(query, base, query->shard_size, 1007 result_pool, scratch_pool)); 1008 1009 /* one more pack file processed */ 1010 if (query->progress_func) 1011 query->progress_func(base, query->progress_baton, scratch_pool); 1012 1013 return SVN_NO_ERROR; 1014} 1015 1016/* Read the content of the file for REVISION in logical addressing mode 1017 * and store its contents in QUERY. 1018 * 1019 * Use RESULT_POOL for persistent allocations and SCRATCH_POOL for 1020 * temporaries. 1021 */ 1022static svn_error_t * 1023read_log_revision_file(query_t *query, 1024 svn_revnum_t revision, 1025 apr_pool_t *result_pool, 1026 apr_pool_t *scratch_pool) 1027{ 1028 SVN_ERR(read_log_rev_or_packfile(query, revision, 1, 1029 result_pool, scratch_pool)); 1030 1031 /* show progress every 1000 revs or so */ 1032 if (query->progress_func) 1033 { 1034 if (query->shard_size && (revision % query->shard_size == 0)) 1035 query->progress_func(revision, query->progress_baton, scratch_pool); 1036 if (!query->shard_size && (revision % 1000 == 0)) 1037 query->progress_func(revision, query->progress_baton, scratch_pool); 1038 } 1039 1040 return SVN_NO_ERROR; 1041} 1042 1043/* Read the repository and collect the stats info in QUERY. 1044 * 1045 * Use RESULT_POOL for persistent allocations and SCRATCH_POOL for 1046 * temporaries. 1047 */ 1048static svn_error_t * 1049read_revisions(query_t *query, 1050 apr_pool_t *result_pool, 1051 apr_pool_t *scratch_pool) 1052{ 1053 apr_pool_t *iterpool = svn_pool_create(scratch_pool); 1054 svn_revnum_t revision; 1055 1056 /* read all packed revs */ 1057 for ( revision = 0 1058 ; revision < query->min_unpacked_rev 1059 ; revision += query->shard_size) 1060 { 1061 svn_pool_clear(iterpool); 1062 1063 if (svn_fs_fs__use_log_addressing(query->fs)) 1064 SVN_ERR(read_log_pack_file(query, revision, result_pool, iterpool)); 1065 else 1066 SVN_ERR(read_phys_pack_file(query, revision, result_pool, iterpool)); 1067 } 1068 1069 /* read non-packed revs */ 1070 for ( ; revision <= query->head; ++revision) 1071 { 1072 svn_pool_clear(iterpool); 1073 1074 if (svn_fs_fs__use_log_addressing(query->fs)) 1075 SVN_ERR(read_log_revision_file(query, revision, result_pool, 1076 iterpool)); 1077 else 1078 SVN_ERR(read_phys_revision_file(query, revision, result_pool, 1079 iterpool)); 1080 } 1081 1082 svn_pool_destroy(iterpool); 1083 1084 return SVN_NO_ERROR; 1085} 1086 1087/* Accumulate stats of REP in STATS. 1088 */ 1089static void 1090add_rep_pack_stats(svn_fs_fs__rep_pack_stats_t *stats, 1091 rep_stats_t *rep) 1092{ 1093 stats->count++; 1094 1095 stats->packed_size += rep->size; 1096 stats->expanded_size += rep->expanded_size; 1097 stats->overhead_size += rep->header_size + 7 /* ENDREP\n */; 1098} 1099 1100/* Accumulate stats of REP in STATS. 1101 */ 1102static void 1103add_rep_stats(svn_fs_fs__representation_stats_t *stats, 1104 rep_stats_t *rep) 1105{ 1106 add_rep_pack_stats(&stats->total, rep); 1107 if (rep->ref_count == 1) 1108 add_rep_pack_stats(&stats->uniques, rep); 1109 else 1110 add_rep_pack_stats(&stats->shared, rep); 1111 1112 stats->references += rep->ref_count; 1113 stats->expanded_size += rep->ref_count * rep->expanded_size; 1114} 1115 1116/* Aggregate the info the in revision_info_t * array REVISIONS into the 1117 * respectve fields of STATS. 1118 */ 1119static void 1120aggregate_stats(const apr_array_header_t *revisions, 1121 svn_fs_fs__stats_t *stats) 1122{ 1123 int i, k; 1124 1125 /* aggregate info from all revisions */ 1126 stats->revision_count = revisions->nelts; 1127 for (i = 0; i < revisions->nelts; ++i) 1128 { 1129 revision_info_t *revision = APR_ARRAY_IDX(revisions, i, 1130 revision_info_t *); 1131 1132 /* data gathered on a revision level */ 1133 stats->change_count += revision->change_count; 1134 stats->change_len += revision->changes_len; 1135 stats->total_size += revision->end - revision->offset; 1136 1137 stats->dir_node_stats.count += revision->dir_noderev_count; 1138 stats->dir_node_stats.size += revision->dir_noderev_size; 1139 stats->file_node_stats.count += revision->file_noderev_count; 1140 stats->file_node_stats.size += revision->file_noderev_size; 1141 stats->total_node_stats.count += revision->dir_noderev_count 1142 + revision->file_noderev_count; 1143 stats->total_node_stats.size += revision->dir_noderev_size 1144 + revision->file_noderev_size; 1145 1146 /* process representations */ 1147 for (k = 0; k < revision->representations->nelts; ++k) 1148 { 1149 rep_stats_t *rep = APR_ARRAY_IDX(revision->representations, k, 1150 rep_stats_t *); 1151 1152 /* accumulate in the right bucket */ 1153 switch(rep->kind) 1154 { 1155 case file_rep: 1156 add_rep_stats(&stats->file_rep_stats, rep); 1157 break; 1158 case dir_rep: 1159 add_rep_stats(&stats->dir_rep_stats, rep); 1160 break; 1161 case file_property_rep: 1162 add_rep_stats(&stats->file_prop_rep_stats, rep); 1163 break; 1164 case dir_property_rep: 1165 add_rep_stats(&stats->dir_prop_rep_stats, rep); 1166 break; 1167 default: 1168 break; 1169 } 1170 1171 add_rep_stats(&stats->total_rep_stats, rep); 1172 } 1173 } 1174} 1175 1176/* Return a new svn_fs_fs__stats_t instance, allocated in RESULT_POOL. 1177 */ 1178static svn_fs_fs__stats_t * 1179create_stats(apr_pool_t *result_pool) 1180{ 1181 svn_fs_fs__stats_t *stats = apr_pcalloc(result_pool, sizeof(*stats)); 1182 1183 initialize_largest_changes(stats, 64, result_pool); 1184 stats->by_extension = apr_hash_make(result_pool); 1185 1186 return stats; 1187} 1188 1189/* Create a *QUERY, allocated in RESULT_POOL, reading filesystem FS and 1190 * collecting results in STATS. Store the optional PROCESS_FUNC and 1191 * PROGRESS_BATON as well as CANCEL_FUNC and CANCEL_BATON in *QUERY, too. 1192 * Use SCRATCH_POOL for temporary allocations. 1193 */ 1194static svn_error_t * 1195create_query(query_t **query, 1196 svn_fs_t *fs, 1197 svn_fs_fs__stats_t *stats, 1198 svn_fs_progress_notify_func_t progress_func, 1199 void *progress_baton, 1200 svn_cancel_func_t cancel_func, 1201 void *cancel_baton, 1202 apr_pool_t *result_pool, 1203 apr_pool_t *scratch_pool) 1204{ 1205 *query = apr_pcalloc(result_pool, sizeof(**query)); 1206 1207 /* Read repository dimensions. */ 1208 (*query)->shard_size = svn_fs_fs__shard_size(fs); 1209 SVN_ERR(svn_fs_fs__youngest_rev(&(*query)->head, fs, scratch_pool)); 1210 SVN_ERR(svn_fs_fs__min_unpacked_rev(&(*query)->min_unpacked_rev, fs, 1211 scratch_pool)); 1212 1213 /* create data containers and caches 1214 * Note: this assumes that int is at least 32-bits and that we only support 1215 * 32-bit wide revision numbers (actually 31-bits due to the signedness 1216 * of both the nelts field of the array and our revision numbers). This 1217 * means this code will fail on platforms where int is less than 32-bits 1218 * and the repository has more revisions than int can hold. */ 1219 (*query)->revisions = apr_array_make(result_pool, (int) (*query)->head + 1, 1220 sizeof(revision_info_t *)); 1221 (*query)->null_base = apr_pcalloc(result_pool, 1222 sizeof(*(*query)->null_base)); 1223 1224 /* Store other parameters */ 1225 (*query)->fs = fs; 1226 (*query)->stats = stats; 1227 (*query)->progress_func = progress_func; 1228 (*query)->progress_baton = progress_baton; 1229 (*query)->cancel_func = cancel_func; 1230 (*query)->cancel_baton = cancel_baton; 1231 1232 return SVN_NO_ERROR; 1233} 1234 1235svn_error_t * 1236svn_fs_fs__get_stats(svn_fs_fs__stats_t **stats, 1237 svn_fs_t *fs, 1238 svn_fs_progress_notify_func_t progress_func, 1239 void *progress_baton, 1240 svn_cancel_func_t cancel_func, 1241 void *cancel_baton, 1242 apr_pool_t *result_pool, 1243 apr_pool_t *scratch_pool) 1244{ 1245 query_t *query; 1246 1247 *stats = create_stats(result_pool); 1248 SVN_ERR(create_query(&query, fs, *stats, progress_func, progress_baton, 1249 cancel_func, cancel_baton, scratch_pool, 1250 scratch_pool)); 1251 SVN_ERR(read_revisions(query, scratch_pool, scratch_pool)); 1252 aggregate_stats(query->revisions, *stats); 1253 1254 return SVN_NO_ERROR; 1255} 1256