stats.c revision 299742
1/* stats.c -- implements the svn_fs_fs__get_stats private API.
2 *
3 * ====================================================================
4 *    Licensed to the Apache Software Foundation (ASF) under one
5 *    or more contributor license agreements.  See the NOTICE file
6 *    distributed with this work for additional information
7 *    regarding copyright ownership.  The ASF licenses this file
8 *    to you under the Apache License, Version 2.0 (the
9 *    "License"); you may not use this file except in compliance
10 *    with the License.  You may obtain a copy of the License at
11 *
12 *      http://www.apache.org/licenses/LICENSE-2.0
13 *
14 *    Unless required by applicable law or agreed to in writing,
15 *    software distributed under the License is distributed on an
16 *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17 *    KIND, either express or implied.  See the License for the
18 *    specific language governing permissions and limitations
19 *    under the License.
20 * ====================================================================
21 */
22
23#include "svn_dirent_uri.h"
24#include "svn_fs.h"
25#include "svn_pools.h"
26#include "svn_sorts.h"
27
28#include "private/svn_cache.h"
29#include "private/svn_sorts_private.h"
30#include "private/svn_string_private.h"
31#include "private/svn_fs_fs_private.h"
32
33#include "index.h"
34#include "pack.h"
35#include "rev_file.h"
36#include "util.h"
37#include "fs_fs.h"
38#include "cached_data.h"
39#include "low_level.h"
40
41#include "../libsvn_fs/fs-loader.h"
42
43#include "svn_private_config.h"
44
45/* We group representations into 2x2 different kinds plus one default:
46 * [dir / file] x [text / prop]. The assignment is done by the first node
47 * that references the respective representation.
48 */
49typedef enum rep_kind_t
50{
51  /* The representation is not used _directly_, i.e. not referenced by any
52   * noderev. However, some other representation may use it as delta base.
53   * Null value. Should not occur in real-word repositories. */
54  unused_rep,
55
56  /* a properties on directory rep  */
57  dir_property_rep,
58
59  /* a properties on file rep  */
60  file_property_rep,
61
62  /* a directory rep  */
63  dir_rep,
64
65  /* a file rep  */
66  file_rep
67} rep_kind_t;
68
69/* A representation fragment.
70 */
71typedef struct rep_stats_t
72{
73  /* absolute offset in the file */
74  apr_off_t offset;
75
76  /* item length in bytes */
77  apr_uint64_t size;
78
79  /* item length after de-deltification */
80  apr_uint64_t expanded_size;
81
82  /* revision that contains this representation
83   * (may be referenced by other revisions, though) */
84  svn_revnum_t revision;
85
86  /* number of nodes that reference this representation */
87  apr_uint32_t ref_count;
88
89  /* length of the PLAIN / DELTA line in the source file in bytes */
90  apr_uint16_t header_size;
91
92  /* classification of the representation. values of rep_kind_t */
93  char kind;
94
95} rep_stats_t;
96
97/* Represents a single revision.
98 * There will be only one instance per revision. */
99typedef struct revision_info_t
100{
101  /* number of this revision */
102  svn_revnum_t revision;
103
104  /* pack file offset (manifest value), 0 for non-packed files */
105  apr_off_t offset;
106
107  /* length of the changes list on bytes */
108  apr_uint64_t changes_len;
109
110  /* offset of the changes list relative to OFFSET */
111  apr_uint64_t change_count;
112
113  /* first offset behind the revision data in the pack file (file length
114   * for non-packed revs) */
115  apr_off_t end;
116
117  /* number of directory noderevs in this revision */
118  apr_uint64_t dir_noderev_count;
119
120  /* number of file noderevs in this revision */
121  apr_uint64_t file_noderev_count;
122
123  /* total size of directory noderevs (i.e. the structs - not the rep) */
124  apr_uint64_t dir_noderev_size;
125
126  /* total size of file noderevs (i.e. the structs - not the rep) */
127  apr_uint64_t file_noderev_size;
128
129  /* all rep_stats_t of this revision (in no particular order),
130   * i.e. those that point back to this struct */
131  apr_array_header_t *representations;
132
133  /* Temporary rev / pack file access object, used in phys. addressing
134   * mode only.  NULL when done reading this revision. */
135  svn_fs_fs__revision_file_t *rev_file;
136} revision_info_t;
137
138/* Root data structure containing all information about a given repository.
139 * We use it as a wrapper around svn_fs_t and pass it around where we would
140 * otherwise just use a svn_fs_t.
141 */
142typedef struct query_t
143{
144  /* FS API object*/
145  svn_fs_t *fs;
146
147  /* The HEAD revision. */
148  svn_revnum_t head;
149
150  /* Number of revs per shard; 0 for non-sharded repos. */
151  int shard_size;
152
153  /* First non-packed revision. */
154  svn_revnum_t min_unpacked_rev;
155
156  /* all revisions */
157  apr_array_header_t *revisions;
158
159  /* empty representation.
160   * Used as a dummy base for DELTA reps without base. */
161  rep_stats_t *null_base;
162
163  /* collected statistics */
164  svn_fs_fs__stats_t *stats;
165
166  /* Progress notification callback to call after each shard.  May be NULL. */
167  svn_fs_progress_notify_func_t progress_func;
168
169  /* Baton for PROGRESS_FUNC. */
170  void *progress_baton;
171
172  /* Cancellation support callback to call once in a while.  May be NULL. */
173  svn_cancel_func_t cancel_func;
174
175  /* Baton for CANCEL_FUNC. */
176  void *cancel_baton;
177} query_t;
178
179/* Return the length of REV_FILE in *FILE_SIZE.
180 * Use SCRATCH_POOL for temporary allocations.
181 */
182static svn_error_t *
183get_file_size(apr_off_t *file_size,
184              svn_fs_fs__revision_file_t *rev_file,
185              apr_pool_t *scratch_pool)
186{
187  apr_finfo_t finfo;
188
189  SVN_ERR(svn_io_file_info_get(&finfo, APR_FINFO_SIZE, rev_file->file,
190                               scratch_pool));
191
192  *file_size = finfo.size;
193  return SVN_NO_ERROR;
194}
195
196/* Initialize the LARGEST_CHANGES member in STATS with a capacity of COUNT
197 * entries.  Allocate the result in RESULT_POOL.
198 */
199static void
200initialize_largest_changes(svn_fs_fs__stats_t *stats,
201                           apr_size_t count,
202                           apr_pool_t *result_pool)
203{
204  apr_size_t i;
205
206  stats->largest_changes = apr_pcalloc(result_pool,
207                                       sizeof(*stats->largest_changes));
208  stats->largest_changes->count = count;
209  stats->largest_changes->min_size = 1;
210  stats->largest_changes->changes
211    = apr_palloc(result_pool, count * sizeof(*stats->largest_changes->changes));
212
213  /* allocate *all* entries before the path stringbufs.  This increases
214   * cache locality and enhances performance significantly. */
215  for (i = 0; i < count; ++i)
216    stats->largest_changes->changes[i]
217      = apr_palloc(result_pool, sizeof(**stats->largest_changes->changes));
218
219  /* now initialize them and allocate the stringbufs */
220  for (i = 0; i < count; ++i)
221    {
222      stats->largest_changes->changes[i]->size = 0;
223      stats->largest_changes->changes[i]->revision = SVN_INVALID_REVNUM;
224      stats->largest_changes->changes[i]->path
225        = svn_stringbuf_create_ensure(1024, result_pool);
226    }
227}
228
229/* Add entry for SIZE to HISTOGRAM.
230 */
231static void
232add_to_histogram(svn_fs_fs__histogram_t *histogram,
233                 apr_int64_t size)
234{
235  apr_int64_t shift = 0;
236
237  while (((apr_int64_t)(1) << shift) <= size)
238    shift++;
239
240  histogram->total.count++;
241  histogram->total.sum += size;
242  histogram->lines[(apr_size_t)shift].count++;
243  histogram->lines[(apr_size_t)shift].sum += size;
244}
245
246/* Update data aggregators in STATS with this representation of type KIND,
247 * on-disk REP_SIZE and expanded node size EXPANDED_SIZE for PATH in REVSION.
248 * PLAIN_ADDED indicates whether the node has a deltification predecessor.
249 */
250static void
251add_change(svn_fs_fs__stats_t *stats,
252           apr_uint64_t rep_size,
253           apr_uint64_t expanded_size,
254           svn_revnum_t revision,
255           const char *path,
256           rep_kind_t kind,
257           svn_boolean_t plain_added)
258{
259  /* identify largest reps */
260  if (rep_size >= stats->largest_changes->min_size)
261    {
262      apr_size_t i;
263      svn_fs_fs__largest_changes_t *largest_changes = stats->largest_changes;
264      svn_fs_fs__large_change_info_t *info
265        = largest_changes->changes[largest_changes->count - 1];
266      info->size = rep_size;
267      info->revision = revision;
268      svn_stringbuf_set(info->path, path);
269
270      /* linear insertion but not too bad since count is low and insertions
271       * near the end are more likely than close to front */
272      for (i = largest_changes->count - 1; i > 0; --i)
273        if (largest_changes->changes[i-1]->size >= rep_size)
274          break;
275        else
276          largest_changes->changes[i] = largest_changes->changes[i-1];
277
278      largest_changes->changes[i] = info;
279      largest_changes->min_size
280        = largest_changes->changes[largest_changes->count-1]->size;
281    }
282
283  /* global histograms */
284  add_to_histogram(&stats->rep_size_histogram, rep_size);
285  add_to_histogram(&stats->node_size_histogram, expanded_size);
286
287  if (plain_added)
288    {
289      add_to_histogram(&stats->added_rep_size_histogram, rep_size);
290      add_to_histogram(&stats->added_node_size_histogram, expanded_size);
291    }
292
293  /* specific histograms by type */
294  switch (kind)
295    {
296      case unused_rep:
297        add_to_histogram(&stats->unused_rep_histogram, rep_size);
298        break;
299      case dir_property_rep:
300        add_to_histogram(&stats->dir_prop_rep_histogram, rep_size);
301        add_to_histogram(&stats->dir_prop_histogram, expanded_size);
302        break;
303      case file_property_rep:
304        add_to_histogram(&stats->file_prop_rep_histogram, rep_size);
305        add_to_histogram(&stats->file_prop_histogram, expanded_size);
306        break;
307      case dir_rep:
308        add_to_histogram(&stats->dir_rep_histogram, rep_size);
309        add_to_histogram(&stats->dir_histogram, expanded_size);
310        break;
311      case file_rep:
312        add_to_histogram(&stats->file_rep_histogram, rep_size);
313        add_to_histogram(&stats->file_histogram, expanded_size);
314        break;
315    }
316
317  /* by extension */
318  if (kind == file_rep)
319    {
320      /* determine extension */
321      svn_fs_fs__extension_info_t *info;
322      const char * file_name = strrchr(path, '/');
323      const char * extension = file_name ? strrchr(file_name, '.') : NULL;
324
325      if (extension == NULL || extension == file_name + 1)
326        extension = "(none)";
327
328      /* get / auto-insert entry for this extension */
329      info = apr_hash_get(stats->by_extension, extension, APR_HASH_KEY_STRING);
330      if (info == NULL)
331        {
332          apr_pool_t *pool = apr_hash_pool_get(stats->by_extension);
333          info = apr_pcalloc(pool, sizeof(*info));
334          info->extension = apr_pstrdup(pool, extension);
335
336          apr_hash_set(stats->by_extension, info->extension,
337                       APR_HASH_KEY_STRING, info);
338        }
339
340      /* update per-extension histogram */
341      add_to_histogram(&info->node_histogram, expanded_size);
342      add_to_histogram(&info->rep_histogram, rep_size);
343    }
344}
345
346/* Comparator used for binary search comparing the absolute file offset
347 * of a representation to some other offset. DATA is a *rep_stats_t,
348 * KEY is a pointer to an apr_off_t.
349 */
350static int
351compare_representation_offsets(const void *data, const void *key)
352{
353  apr_off_t lhs = (*(const rep_stats_t *const *)data)->offset;
354  apr_off_t rhs = *(const apr_off_t *)key;
355
356  if (lhs < rhs)
357    return -1;
358  return (lhs > rhs ? 1 : 0);
359}
360
361/* Find the revision_info_t object to the given REVISION in QUERY and
362 * return it in *REVISION_INFO. For performance reasons, we skip the
363 * lookup if the info is already provided.
364 *
365 * In that revision, look for the rep_stats_t object for offset OFFSET.
366 * If it already exists, set *IDX to its index in *REVISION_INFO's
367 * representations list and return the representation object. Otherwise,
368 * set the index to where it must be inserted and return NULL.
369 */
370static rep_stats_t *
371find_representation(int *idx,
372                    query_t *query,
373                    revision_info_t **revision_info,
374                    svn_revnum_t revision,
375                    apr_off_t offset)
376{
377  revision_info_t *info;
378  *idx = -1;
379
380  /* first let's find the revision */
381  info = revision_info ? *revision_info : NULL;
382  if (info == NULL || info->revision != revision)
383    {
384      info = APR_ARRAY_IDX(query->revisions, revision, revision_info_t*);
385      if (revision_info)
386        *revision_info = info;
387    }
388
389  /* not found -> no result */
390  if (info == NULL)
391    return NULL;
392
393  /* look for the representation */
394  *idx = svn_sort__bsearch_lower_bound(info->representations,
395                                       &offset,
396                                       compare_representation_offsets);
397  if (*idx < info->representations->nelts)
398    {
399      /* return the representation, if this is the one we were looking for */
400      rep_stats_t *result
401        = APR_ARRAY_IDX(info->representations, *idx, rep_stats_t *);
402      if (result->offset == offset)
403        return result;
404    }
405
406  /* not parsed, yet */
407  return NULL;
408}
409
410/* Find / auto-construct the representation stats for REP in QUERY and
411 * return it in *REPRESENTATION.
412 *
413 * If necessary, allocate the result in RESULT_POOL; use SCRATCH_POOL for
414 * temporary allocations.
415 */
416static svn_error_t *
417parse_representation(rep_stats_t **representation,
418                     query_t *query,
419                     representation_t *rep,
420                     revision_info_t *revision_info,
421                     apr_pool_t *result_pool,
422                     apr_pool_t *scratch_pool)
423{
424  rep_stats_t *result;
425  int idx;
426
427  /* read location (revision, offset) and size */
428
429  /* look it up */
430  result = find_representation(&idx, query, &revision_info, rep->revision,
431                               (apr_off_t)rep->item_index);
432  if (!result)
433    {
434      /* not parsed, yet (probably a rep in the same revision).
435       * Create a new rep object and determine its base rep as well.
436       */
437      result = apr_pcalloc(result_pool, sizeof(*result));
438      result->revision = rep->revision;
439      result->expanded_size = (rep->expanded_size ? rep->expanded_size
440                                                  : rep->size);
441      result->offset = (apr_off_t)rep->item_index;
442      result->size = rep->size;
443
444      /* In phys. addressing mode, follow link to the actual representation.
445       * In log. addressing mode, we will find it already as part of our
446       * linear walk through the whole file. */
447      if (!svn_fs_fs__use_log_addressing(query->fs))
448        {
449          svn_fs_fs__rep_header_t *header;
450          apr_off_t offset = revision_info->offset + result->offset;
451
452          SVN_ERR_ASSERT(revision_info->rev_file);
453          SVN_ERR(svn_io_file_seek(revision_info->rev_file->file, APR_SET,
454                                   &offset, scratch_pool));
455          SVN_ERR(svn_fs_fs__read_rep_header(&header,
456                                             revision_info->rev_file->stream,
457                                             scratch_pool, scratch_pool));
458
459          result->header_size = header->header_size;
460        }
461
462      svn_sort__array_insert(revision_info->representations, &result, idx);
463    }
464
465  *representation = result;
466
467  return SVN_NO_ERROR;
468}
469
470
471/* forward declaration */
472static svn_error_t *
473read_noderev(query_t *query,
474             svn_stringbuf_t *noderev_str,
475             revision_info_t *revision_info,
476             apr_pool_t *result_pool,
477             apr_pool_t *scratch_pool);
478
479/* Read the noderev item at OFFSET in REVISION_INFO from the filesystem
480 * provided by QUERY.  Return it in *NODEREV, allocated in RESULT_POOL.
481 * Use SCRATCH_POOL for temporary allocations.
482 *
483 * The textual representation of the noderev will be used to determine
484 * the on-disk size of the noderev.  Only called in phys. addressing mode.
485 */
486static svn_error_t *
487read_phsy_noderev(svn_stringbuf_t **noderev,
488                  query_t *query,
489                  apr_off_t offset,
490                  revision_info_t *revision_info,
491                  apr_pool_t *result_pool,
492                  apr_pool_t *scratch_pool)
493{
494  svn_stringbuf_t *noderev_str = svn_stringbuf_create_empty(result_pool);
495  svn_stringbuf_t *line;
496  svn_boolean_t eof;
497
498  apr_pool_t *iterpool = svn_pool_create(scratch_pool);
499
500  /* Navigate the file stream to the start of noderev. */
501  SVN_ERR_ASSERT(revision_info->rev_file);
502
503  offset += revision_info->offset;
504  SVN_ERR(svn_io_file_seek(revision_info->rev_file->file, APR_SET,
505                           &offset, scratch_pool));
506
507  /* Read it (terminated by an empty line) */
508  do
509    {
510      svn_pool_clear(iterpool);
511
512      SVN_ERR(svn_stream_readline(revision_info->rev_file->stream, &line,
513                                  "\n", &eof, iterpool));
514      svn_stringbuf_appendstr(noderev_str, line);
515      svn_stringbuf_appendbyte(noderev_str, '\n');
516    }
517  while (line->len > 0 && !eof);
518
519  /* Return the result. */
520  *noderev = noderev_str;
521
522  svn_pool_destroy(iterpool);
523
524  return SVN_NO_ERROR;
525}
526
527/* Starting at the directory in NODEREV's text, read all DAG nodes,
528 * directories and representations linked in that tree structure.
529 * Store them in QUERY and REVISION_INFO.  Also, read them only once.
530 *
531 * Use RESULT_POOL for persistent allocations and SCRATCH_POOL for
532 * temporaries.
533 */
534static svn_error_t *
535parse_dir(query_t *query,
536          node_revision_t *noderev,
537          revision_info_t *revision_info,
538          apr_pool_t *result_pool,
539          apr_pool_t *scratch_pool)
540{
541  apr_pool_t *iterpool = svn_pool_create(scratch_pool);
542
543  int i;
544  apr_array_header_t *entries;
545  SVN_ERR(svn_fs_fs__rep_contents_dir(&entries, query->fs, noderev,
546                                      scratch_pool, scratch_pool));
547
548  for (i = 0; i < entries->nelts; ++i)
549    {
550      svn_fs_dirent_t *dirent = APR_ARRAY_IDX(entries, i, svn_fs_dirent_t *);
551
552      if (svn_fs_fs__id_rev(dirent->id) == revision_info->revision)
553        {
554          svn_stringbuf_t *noderev_str;
555          svn_pool_clear(iterpool);
556
557          SVN_ERR(read_phsy_noderev(&noderev_str, query,
558                                    svn_fs_fs__id_item(dirent->id),
559                                    revision_info, iterpool, iterpool));
560          SVN_ERR(read_noderev(query, noderev_str, revision_info,
561                               result_pool, iterpool));
562        }
563    }
564
565  svn_pool_destroy(iterpool);
566
567  return SVN_NO_ERROR;
568}
569
570/* Parse the noderev given as NODEREV_STR and store the info in QUERY and
571 * REVISION_INFO.  In phys. addressing mode, continue reading all DAG nodes,
572 * directories and representations linked in that tree structure.
573 *
574 * Use RESULT_POOL for persistent allocations and SCRATCH_POOL for
575 * temporaries.
576 */
577static svn_error_t *
578read_noderev(query_t *query,
579             svn_stringbuf_t *noderev_str,
580             revision_info_t *revision_info,
581             apr_pool_t *result_pool,
582             apr_pool_t *scratch_pool)
583{
584  rep_stats_t *text = NULL;
585  rep_stats_t *props = NULL;
586  node_revision_t *noderev;
587
588  svn_stream_t *stream = svn_stream_from_stringbuf(noderev_str, scratch_pool);
589  SVN_ERR(svn_fs_fs__read_noderev(&noderev, stream, scratch_pool,
590                                  scratch_pool));
591
592  if (noderev->data_rep)
593    {
594      SVN_ERR(parse_representation(&text, query,
595                                   noderev->data_rep, revision_info,
596                                   result_pool, scratch_pool));
597
598      /* if we are the first to use this rep, mark it as "text rep" */
599      if (++text->ref_count == 1)
600        text->kind = noderev->kind == svn_node_dir ? dir_rep : file_rep;
601    }
602
603  if (noderev->prop_rep)
604    {
605      SVN_ERR(parse_representation(&props, query,
606                                   noderev->prop_rep, revision_info,
607                                   result_pool, scratch_pool));
608
609      /* if we are the first to use this rep, mark it as "prop rep" */
610      if (++props->ref_count == 1)
611        props->kind = noderev->kind == svn_node_dir ? dir_property_rep
612                                                    : file_property_rep;
613    }
614
615  /* record largest changes */
616  if (text && text->ref_count == 1)
617    add_change(query->stats, text->size, text->expanded_size, text->revision,
618               noderev->created_path, text->kind, !noderev->predecessor_id);
619  if (props && props->ref_count == 1)
620    add_change(query->stats, props->size, props->expanded_size,
621               props->revision, noderev->created_path, props->kind,
622               !noderev->predecessor_id);
623
624  /* if this is a directory and has not been processed, yet, read and
625   * process it recursively */
626  if (   noderev->kind == svn_node_dir && text && text->ref_count == 1
627      && !svn_fs_fs__use_log_addressing(query->fs))
628    SVN_ERR(parse_dir(query, noderev, revision_info, result_pool,
629                      scratch_pool));
630
631  /* update stats */
632  if (noderev->kind == svn_node_dir)
633    {
634      revision_info->dir_noderev_size += noderev_str->len;
635      revision_info->dir_noderev_count++;
636    }
637  else
638    {
639      revision_info->file_noderev_size += noderev_str->len;
640      revision_info->file_noderev_count++;
641    }
642
643  return SVN_NO_ERROR;
644}
645
646/* For the revision given as REVISION_INFO within QUERY, determine the number
647 * of entries in its changed paths list and store that info in REVISION_INFO.
648 * Use SCRATCH_POOL for temporary allocations.
649 */
650static svn_error_t *
651get_phys_change_count(query_t *query,
652                      revision_info_t *revision_info,
653                      apr_pool_t *scratch_pool)
654{
655  /* We are going to use our own sub-pool here because the changes object
656   * may well be >100MB and SCRATCH_POOL may not get cleared until all other
657   * info has been read by read_phys_revision().  Therefore, tidy up early.
658   */
659  apr_pool_t *subpool = svn_pool_create(scratch_pool);
660  apr_array_header_t *changes;
661
662  SVN_ERR(svn_fs_fs__get_changes(&changes, query->fs,
663                                 revision_info->revision, subpool));
664  revision_info->change_count = changes->nelts;
665
666  /* Release potentially tons of memory. */
667  svn_pool_destroy(subpool);
668
669  return SVN_NO_ERROR;
670}
671
672/* Read header information for the revision stored in FILE_CONTENT (one
673 * whole revision).  Return the offsets within FILE_CONTENT for the
674 * *ROOT_NODEREV, the list of *CHANGES and its len in *CHANGES_LEN.
675 * Use POOL for temporary allocations. */
676static svn_error_t *
677read_phys_revision(query_t *query,
678                   revision_info_t *info,
679                   apr_pool_t *result_pool,
680                   apr_pool_t *scratch_pool)
681{
682  char buf[64];
683  apr_off_t root_node_offset;
684  apr_off_t changes_offset;
685  svn_stringbuf_t *trailer;
686  svn_stringbuf_t *noderev_str;
687
688  /* Read the last 64 bytes of the revision (if long enough). */
689  apr_off_t start = MAX(info->offset, info->end - sizeof(buf));
690  apr_size_t len = (apr_size_t)(info->end - start);
691  SVN_ERR(svn_io_file_seek(info->rev_file->file, APR_SET, &start,
692                           scratch_pool));
693  SVN_ERR(svn_io_file_read_full2(info->rev_file->file, buf, len, NULL, NULL,
694                                 scratch_pool));
695  trailer = svn_stringbuf_ncreate(buf, len, scratch_pool);
696
697  /* Parse that trailer. */
698  SVN_ERR(svn_fs_fs__parse_revision_trailer(&root_node_offset,
699                                            &changes_offset, trailer,
700                                            info->revision));
701  SVN_ERR(get_phys_change_count(query, info, scratch_pool));
702
703  /* Calculate the length of the changes list. */
704  trailer = svn_fs_fs__unparse_revision_trailer(root_node_offset,
705                                                changes_offset,
706                                                scratch_pool);
707  info->changes_len = info->end - info->offset - changes_offset
708                    - trailer->len;
709
710  /* Recursively read nodes added in this rev. */
711  SVN_ERR(read_phsy_noderev(&noderev_str, query, root_node_offset, info,
712                            scratch_pool, scratch_pool));
713  SVN_ERR(read_noderev(query, noderev_str, info, result_pool, scratch_pool));
714
715  return SVN_NO_ERROR;
716}
717
718/* Read the content of the pack file staring at revision BASE physical
719 * addressing mode and store it in QUERY.
720 *
721 * Use RESULT_POOL for persistent allocations and SCRATCH_POOL for
722 * temporaries.
723 */
724static svn_error_t *
725read_phys_pack_file(query_t *query,
726                    svn_revnum_t base,
727                    apr_pool_t *result_pool,
728                    apr_pool_t *scratch_pool)
729{
730  apr_pool_t *iterpool = svn_pool_create(scratch_pool);
731  int i;
732  apr_off_t file_size = 0;
733  svn_fs_fs__revision_file_t *rev_file;
734
735  SVN_ERR(svn_fs_fs__open_pack_or_rev_file(&rev_file, query->fs, base,
736                                           scratch_pool, scratch_pool));
737  SVN_ERR(get_file_size(&file_size, rev_file, scratch_pool));
738
739  /* process each revision in the pack file */
740  for (i = 0; i < query->shard_size; ++i)
741    {
742      revision_info_t *info;
743
744      /* cancellation support */
745      if (query->cancel_func)
746        SVN_ERR(query->cancel_func(query->cancel_baton));
747
748      /* create the revision info for the current rev */
749      info = apr_pcalloc(result_pool, sizeof(*info));
750      info->representations = apr_array_make(result_pool, 4,
751                                             sizeof(rep_stats_t*));
752      info->rev_file = rev_file;
753
754      info->revision = base + i;
755      SVN_ERR(svn_fs_fs__get_packed_offset(&info->offset, query->fs, base + i,
756                                           iterpool));
757      if (i + 1 == query->shard_size)
758        info->end = file_size;
759      else
760        SVN_ERR(svn_fs_fs__get_packed_offset(&info->end, query->fs,
761                                             base + i + 1, iterpool));
762
763      SVN_ERR(read_phys_revision(query, info, result_pool, iterpool));
764
765      info->representations = apr_array_copy(result_pool,
766                                             info->representations);
767
768      /* Done with this revision. */
769      info->rev_file = NULL;
770
771      /* put it into our container */
772      APR_ARRAY_PUSH(query->revisions, revision_info_t*) = info;
773
774      /* destroy temps */
775      svn_pool_clear(iterpool);
776    }
777
778  /* Done with this pack file. */
779  SVN_ERR(svn_fs_fs__close_revision_file(rev_file));
780
781  /* one more pack file processed */
782  if (query->progress_func)
783    query->progress_func(base, query->progress_baton, scratch_pool);
784
785  return SVN_NO_ERROR;
786}
787
788/* Read the content of the file for REVISION in physical addressing mode
789 * and store its contents in QUERY.
790 *
791 * Use RESULT_POOL for persistent allocations and SCRATCH_POOL for
792 * temporaries.
793 */
794static svn_error_t *
795read_phys_revision_file(query_t *query,
796                        svn_revnum_t revision,
797                        apr_pool_t *result_pool,
798                        apr_pool_t *scratch_pool)
799{
800  revision_info_t *info = apr_pcalloc(result_pool, sizeof(*info));
801  apr_off_t file_size = 0;
802  svn_fs_fs__revision_file_t *rev_file;
803
804  /* cancellation support */
805  if (query->cancel_func)
806    SVN_ERR(query->cancel_func(query->cancel_baton));
807
808  /* read the whole pack file into memory */
809  SVN_ERR(svn_fs_fs__open_pack_or_rev_file(&rev_file, query->fs, revision,
810                                           scratch_pool, scratch_pool));
811  SVN_ERR(get_file_size(&file_size, rev_file, scratch_pool));
812
813  /* create the revision info for the current rev */
814  info->representations = apr_array_make(result_pool, 4, sizeof(rep_stats_t*));
815
816  info->rev_file = rev_file;
817  info->revision = revision;
818  info->offset = 0;
819  info->end = file_size;
820
821  SVN_ERR(read_phys_revision(query, info, result_pool, scratch_pool));
822
823  /* Done with this revision. */
824  SVN_ERR(svn_fs_fs__close_revision_file(rev_file));
825  info->rev_file = NULL;
826
827  /* put it into our container */
828  APR_ARRAY_PUSH(query->revisions, revision_info_t*) = info;
829
830  /* show progress every 1000 revs or so */
831  if (query->progress_func)
832    {
833      if (query->shard_size && (revision % query->shard_size == 0))
834        query->progress_func(revision, query->progress_baton, scratch_pool);
835      if (!query->shard_size && (revision % 1000 == 0))
836        query->progress_func(revision, query->progress_baton, scratch_pool);
837    }
838
839  return SVN_NO_ERROR;
840}
841
842/* Given the unparsed changes list in CHANGES with LEN chars, return the
843 * number of changed paths encoded in it.  Only used in log. addressing
844 * mode.
845 */
846static apr_uint64_t
847get_log_change_count(const char *changes,
848                     apr_size_t len)
849{
850  apr_size_t lines = 0;
851  const char *end = changes + len;
852
853  /* line count */
854  for (; changes < end; ++changes)
855    if (*changes == '\n')
856      ++lines;
857
858  /* two lines per change */
859  return lines / 2;
860}
861
862/* Read the item described by ENTRY from the REV_FILE and return the
863 * respective byte sequence in *CONTENTS, allocated in RESULT_POOL.
864 * Use SCRATCH_POOL for temporary allocations
865 */
866static svn_error_t *
867read_item(svn_stringbuf_t **contents,
868          svn_fs_fs__revision_file_t *rev_file,
869          svn_fs_fs__p2l_entry_t *entry,
870          apr_pool_t *result_pool,
871          apr_pool_t *scratch_pool)
872{
873  svn_stringbuf_t *item = svn_stringbuf_create_ensure(entry->size,
874                                                      result_pool);
875  item->len = entry->size;
876  item->data[item->len] = 0;
877
878  SVN_ERR(svn_io_file_aligned_seek(rev_file->file, rev_file->block_size,
879                                   NULL, entry->offset, scratch_pool));
880  SVN_ERR(svn_io_file_read_full2(rev_file->file, item->data, item->len,
881                                 NULL, NULL, scratch_pool));
882
883  *contents = item;
884
885  return SVN_NO_ERROR;
886}
887
888/* Process the logically addressed revision contents of revisions BASE to
889 * BASE + COUNT - 1 in QUERY.
890 *
891 * Use RESULT_POOL for persistent allocations and SCRATCH_POOL for
892 * temporaries.
893 */
894static svn_error_t *
895read_log_rev_or_packfile(query_t *query,
896                         svn_revnum_t base,
897                         int count,
898                         apr_pool_t *result_pool,
899                         apr_pool_t *scratch_pool)
900{
901  fs_fs_data_t *ffd = query->fs->fsap_data;
902  apr_pool_t *iterpool = svn_pool_create(scratch_pool);
903  apr_off_t max_offset;
904  apr_off_t offset = 0;
905  int i;
906  svn_fs_fs__revision_file_t *rev_file;
907
908  /* we will process every revision in the rev / pack file */
909  for (i = 0; i < count; ++i)
910    {
911      /* create the revision info for the current rev */
912      revision_info_t *info = apr_pcalloc(result_pool, sizeof(*info));
913      info->representations = apr_array_make(result_pool, 4,
914                                             sizeof(rep_stats_t*));
915      info->revision = base + i;
916
917      APR_ARRAY_PUSH(query->revisions, revision_info_t*) = info;
918    }
919
920  /* open the pack / rev file that is covered by the p2l index */
921  SVN_ERR(svn_fs_fs__open_pack_or_rev_file(&rev_file, query->fs, base,
922                                           scratch_pool, iterpool));
923  SVN_ERR(svn_fs_fs__p2l_get_max_offset(&max_offset, query->fs, rev_file,
924                                        base, scratch_pool));
925
926  /* record the whole pack size in the first rev so the total sum will
927     still be correct */
928  APR_ARRAY_IDX(query->revisions, base, revision_info_t*)->end = max_offset;
929
930  /* for all offsets in the file, get the P2L index entries and process
931     the interesting items (change lists, noderevs) */
932  for (offset = 0; offset < max_offset; )
933    {
934      apr_array_header_t *entries;
935
936      svn_pool_clear(iterpool);
937
938      /* cancellation support */
939      if (query->cancel_func)
940        SVN_ERR(query->cancel_func(query->cancel_baton));
941
942      /* get all entries for the current block */
943      SVN_ERR(svn_fs_fs__p2l_index_lookup(&entries, query->fs, rev_file, base,
944                                          offset, ffd->p2l_page_size,
945                                          iterpool, iterpool));
946
947      /* process all entries (and later continue with the next block) */
948      for (i = 0; i < entries->nelts; ++i)
949        {
950          svn_fs_fs__p2l_entry_t *entry
951            = &APR_ARRAY_IDX(entries, i, svn_fs_fs__p2l_entry_t);
952
953          /* skip bits we previously processed */
954          if (i == 0 && entry->offset < offset)
955            continue;
956
957          /* skip zero-sized entries */
958          if (entry->size == 0)
959            continue;
960
961          /* read and process interesting items */
962          if (entry->type == SVN_FS_FS__ITEM_TYPE_NODEREV)
963            {
964              svn_stringbuf_t *item;
965              revision_info_t *info = APR_ARRAY_IDX(query->revisions,
966                                                    entry->item.revision,
967                                                    revision_info_t*);
968              SVN_ERR(read_item(&item, rev_file, entry, iterpool, iterpool));
969              SVN_ERR(read_noderev(query, item, info, result_pool, iterpool));
970            }
971          else if (entry->type == SVN_FS_FS__ITEM_TYPE_CHANGES)
972            {
973              svn_stringbuf_t *item;
974              revision_info_t *info = APR_ARRAY_IDX(query->revisions,
975                                                    entry->item.revision,
976                                                    revision_info_t*);
977              SVN_ERR(read_item(&item, rev_file, entry, iterpool, iterpool));
978              info->change_count
979                = get_log_change_count(item->data + 0, item->len);
980              info->changes_len += entry->size;
981            }
982
983          /* advance offset */
984          offset += entry->size;
985        }
986    }
987
988  /* clean up and close file handles */
989  svn_pool_destroy(iterpool);
990
991  return SVN_NO_ERROR;
992}
993
994/* Read the content of the pack file staring at revision BASE logical
995 * addressing mode and store it in QUERY.
996 *
997 * Use RESULT_POOL for persistent allocations and SCRATCH_POOL for
998 * temporaries.
999 */
1000static svn_error_t *
1001read_log_pack_file(query_t *query,
1002                   svn_revnum_t base,
1003                   apr_pool_t *result_pool,
1004                   apr_pool_t *scratch_pool)
1005{
1006  SVN_ERR(read_log_rev_or_packfile(query, base, query->shard_size,
1007                                   result_pool, scratch_pool));
1008
1009  /* one more pack file processed */
1010  if (query->progress_func)
1011    query->progress_func(base, query->progress_baton, scratch_pool);
1012
1013  return SVN_NO_ERROR;
1014}
1015
1016/* Read the content of the file for REVISION in logical addressing mode
1017 * and store its contents in QUERY.
1018 *
1019 * Use RESULT_POOL for persistent allocations and SCRATCH_POOL for
1020 * temporaries.
1021 */
1022static svn_error_t *
1023read_log_revision_file(query_t *query,
1024                       svn_revnum_t revision,
1025                       apr_pool_t *result_pool,
1026                       apr_pool_t *scratch_pool)
1027{
1028  SVN_ERR(read_log_rev_or_packfile(query, revision, 1,
1029                                   result_pool, scratch_pool));
1030
1031  /* show progress every 1000 revs or so */
1032  if (query->progress_func)
1033    {
1034      if (query->shard_size && (revision % query->shard_size == 0))
1035        query->progress_func(revision, query->progress_baton, scratch_pool);
1036      if (!query->shard_size && (revision % 1000 == 0))
1037        query->progress_func(revision, query->progress_baton, scratch_pool);
1038    }
1039
1040  return SVN_NO_ERROR;
1041}
1042
1043/* Read the repository and collect the stats info in QUERY.
1044 *
1045 * Use RESULT_POOL for persistent allocations and SCRATCH_POOL for
1046 * temporaries.
1047 */
1048static svn_error_t *
1049read_revisions(query_t *query,
1050               apr_pool_t *result_pool,
1051               apr_pool_t *scratch_pool)
1052{
1053  apr_pool_t *iterpool = svn_pool_create(scratch_pool);
1054  svn_revnum_t revision;
1055
1056  /* read all packed revs */
1057  for ( revision = 0
1058      ; revision < query->min_unpacked_rev
1059      ; revision += query->shard_size)
1060    {
1061      svn_pool_clear(iterpool);
1062
1063      if (svn_fs_fs__use_log_addressing(query->fs))
1064        SVN_ERR(read_log_pack_file(query, revision, result_pool, iterpool));
1065      else
1066        SVN_ERR(read_phys_pack_file(query, revision, result_pool, iterpool));
1067    }
1068
1069  /* read non-packed revs */
1070  for ( ; revision <= query->head; ++revision)
1071    {
1072      svn_pool_clear(iterpool);
1073
1074      if (svn_fs_fs__use_log_addressing(query->fs))
1075        SVN_ERR(read_log_revision_file(query, revision, result_pool,
1076                                       iterpool));
1077      else
1078        SVN_ERR(read_phys_revision_file(query, revision, result_pool,
1079                                        iterpool));
1080    }
1081
1082  svn_pool_destroy(iterpool);
1083
1084  return SVN_NO_ERROR;
1085}
1086
1087/* Accumulate stats of REP in STATS.
1088 */
1089static void
1090add_rep_pack_stats(svn_fs_fs__rep_pack_stats_t *stats,
1091                   rep_stats_t *rep)
1092{
1093  stats->count++;
1094
1095  stats->packed_size += rep->size;
1096  stats->expanded_size += rep->expanded_size;
1097  stats->overhead_size += rep->header_size + 7 /* ENDREP\n */;
1098}
1099
1100/* Accumulate stats of REP in STATS.
1101 */
1102static void
1103add_rep_stats(svn_fs_fs__representation_stats_t *stats,
1104              rep_stats_t *rep)
1105{
1106  add_rep_pack_stats(&stats->total, rep);
1107  if (rep->ref_count == 1)
1108    add_rep_pack_stats(&stats->uniques, rep);
1109  else
1110    add_rep_pack_stats(&stats->shared, rep);
1111
1112  stats->references += rep->ref_count;
1113  stats->expanded_size += rep->ref_count * rep->expanded_size;
1114}
1115
1116/* Aggregate the info the in revision_info_t * array REVISIONS into the
1117 * respectve fields of STATS.
1118 */
1119static void
1120aggregate_stats(const apr_array_header_t *revisions,
1121                svn_fs_fs__stats_t *stats)
1122{
1123  int i, k;
1124
1125  /* aggregate info from all revisions */
1126  stats->revision_count = revisions->nelts;
1127  for (i = 0; i < revisions->nelts; ++i)
1128    {
1129      revision_info_t *revision = APR_ARRAY_IDX(revisions, i,
1130                                                revision_info_t *);
1131
1132      /* data gathered on a revision level */
1133      stats->change_count += revision->change_count;
1134      stats->change_len += revision->changes_len;
1135      stats->total_size += revision->end - revision->offset;
1136
1137      stats->dir_node_stats.count += revision->dir_noderev_count;
1138      stats->dir_node_stats.size += revision->dir_noderev_size;
1139      stats->file_node_stats.count += revision->file_noderev_count;
1140      stats->file_node_stats.size += revision->file_noderev_size;
1141      stats->total_node_stats.count += revision->dir_noderev_count
1142                                    + revision->file_noderev_count;
1143      stats->total_node_stats.size += revision->dir_noderev_size
1144                                   + revision->file_noderev_size;
1145
1146      /* process representations */
1147      for (k = 0; k < revision->representations->nelts; ++k)
1148        {
1149          rep_stats_t *rep = APR_ARRAY_IDX(revision->representations, k,
1150                                           rep_stats_t *);
1151
1152          /* accumulate in the right bucket */
1153          switch(rep->kind)
1154            {
1155              case file_rep:
1156                add_rep_stats(&stats->file_rep_stats, rep);
1157                break;
1158              case dir_rep:
1159                add_rep_stats(&stats->dir_rep_stats, rep);
1160                break;
1161              case file_property_rep:
1162                add_rep_stats(&stats->file_prop_rep_stats, rep);
1163                break;
1164              case dir_property_rep:
1165                add_rep_stats(&stats->dir_prop_rep_stats, rep);
1166                break;
1167              default:
1168                break;
1169            }
1170
1171          add_rep_stats(&stats->total_rep_stats, rep);
1172        }
1173    }
1174}
1175
1176/* Return a new svn_fs_fs__stats_t instance, allocated in RESULT_POOL.
1177 */
1178static svn_fs_fs__stats_t *
1179create_stats(apr_pool_t *result_pool)
1180{
1181  svn_fs_fs__stats_t *stats = apr_pcalloc(result_pool, sizeof(*stats));
1182
1183  initialize_largest_changes(stats, 64, result_pool);
1184  stats->by_extension = apr_hash_make(result_pool);
1185
1186  return stats;
1187}
1188
1189/* Create a *QUERY, allocated in RESULT_POOL, reading filesystem FS and
1190 * collecting results in STATS.  Store the optional PROCESS_FUNC and
1191 * PROGRESS_BATON as well as CANCEL_FUNC and CANCEL_BATON in *QUERY, too.
1192 * Use SCRATCH_POOL for temporary allocations.
1193 */
1194static svn_error_t *
1195create_query(query_t **query,
1196             svn_fs_t *fs,
1197             svn_fs_fs__stats_t *stats,
1198             svn_fs_progress_notify_func_t progress_func,
1199             void *progress_baton,
1200             svn_cancel_func_t cancel_func,
1201             void *cancel_baton,
1202             apr_pool_t *result_pool,
1203             apr_pool_t *scratch_pool)
1204{
1205  *query = apr_pcalloc(result_pool, sizeof(**query));
1206
1207  /* Read repository dimensions. */
1208  (*query)->shard_size = svn_fs_fs__shard_size(fs);
1209  SVN_ERR(svn_fs_fs__youngest_rev(&(*query)->head, fs, scratch_pool));
1210  SVN_ERR(svn_fs_fs__min_unpacked_rev(&(*query)->min_unpacked_rev, fs,
1211                                      scratch_pool));
1212
1213  /* create data containers and caches
1214   * Note: this assumes that int is at least 32-bits and that we only support
1215   * 32-bit wide revision numbers (actually 31-bits due to the signedness
1216   * of both the nelts field of the array and our revision numbers). This
1217   * means this code will fail on platforms where int is less than 32-bits
1218   * and the repository has more revisions than int can hold. */
1219  (*query)->revisions = apr_array_make(result_pool, (int) (*query)->head + 1,
1220                                       sizeof(revision_info_t *));
1221  (*query)->null_base = apr_pcalloc(result_pool,
1222                                    sizeof(*(*query)->null_base));
1223
1224  /* Store other parameters */
1225  (*query)->fs = fs;
1226  (*query)->stats = stats;
1227  (*query)->progress_func = progress_func;
1228  (*query)->progress_baton = progress_baton;
1229  (*query)->cancel_func = cancel_func;
1230  (*query)->cancel_baton = cancel_baton;
1231
1232  return SVN_NO_ERROR;
1233}
1234
1235svn_error_t *
1236svn_fs_fs__get_stats(svn_fs_fs__stats_t **stats,
1237                     svn_fs_t *fs,
1238                     svn_fs_progress_notify_func_t progress_func,
1239                     void *progress_baton,
1240                     svn_cancel_func_t cancel_func,
1241                     void *cancel_baton,
1242                     apr_pool_t *result_pool,
1243                     apr_pool_t *scratch_pool)
1244{
1245  query_t *query;
1246
1247  *stats = create_stats(result_pool);
1248  SVN_ERR(create_query(&query, fs, *stats, progress_func, progress_baton,
1249                       cancel_func, cancel_baton, scratch_pool,
1250                       scratch_pool));
1251  SVN_ERR(read_revisions(query, scratch_pool, scratch_pool));
1252  aggregate_stats(query->revisions, *stats);
1253
1254  return SVN_NO_ERROR;
1255}
1256