1289177Speter/* reps.c --- FSX representation container
2289177Speter *
3289177Speter * ====================================================================
4289177Speter *    Licensed to the Apache Software Foundation (ASF) under one
5289177Speter *    or more contributor license agreements.  See the NOTICE file
6289177Speter *    distributed with this work for additional information
7289177Speter *    regarding copyright ownership.  The ASF licenses this file
8289177Speter *    to you under the Apache License, Version 2.0 (the
9289177Speter *    "License"); you may not use this file except in compliance
10289177Speter *    with the License.  You may obtain a copy of the License at
11289177Speter *
12289177Speter *      http://www.apache.org/licenses/LICENSE-2.0
13289177Speter *
14289177Speter *    Unless required by applicable law or agreed to in writing,
15289177Speter *    software distributed under the License is distributed on an
16289177Speter *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17289177Speter *    KIND, either express or implied.  See the License for the
18289177Speter *    specific language governing permissions and limitations
19289177Speter *    under the License.
20289177Speter * ====================================================================
21289177Speter */
22289177Speter
23289177Speter#include "reps.h"
24289177Speter
25289177Speter#include "svn_sorts.h"
26289177Speter#include "private/svn_string_private.h"
27289177Speter#include "private/svn_packed_data.h"
28289177Speter#include "private/svn_temp_serializer.h"
29289177Speter
30289177Speter#include "svn_private_config.h"
31289177Speter
32289177Speter#include "cached_data.h"
33289177Speter
34289177Speter/* Length of the text chunks we hash and match.  The algorithm will find
35289177Speter * most matches with a length of 2 * MATCH_BLOCKSIZE and only specific
36289177Speter * ones that are shorter than MATCH_BLOCKSIZE.
37289177Speter *
38289177Speter * This should be a power of two and must be a multiple of 8.
39289177Speter * Good choices are 32, 64 and 128.
40289177Speter */
41289177Speter#define MATCH_BLOCKSIZE 64
42289177Speter
43289177Speter/* Limit the total text body within a container to 16MB.  Larger values
44289177Speter * of up to 2GB are possible but become increasingly impractical as the
45289177Speter * container has to be loaded in its entirety before any of it can be read.
46289177Speter */
47289177Speter#define MAX_TEXT_BODY 0x1000000
48289177Speter
49289177Speter/* Limit the size of the instructions stream.  This should not exceed the
50289177Speter * text body size limit. */
51289177Speter#define MAX_INSTRUCTIONS (MAX_TEXT_BODY / 8)
52289177Speter
53289177Speter/* value of unused hash buckets */
54289177Speter#define NO_OFFSET ((apr_uint32_t)(-1))
55289177Speter
56289177Speter/* Byte strings are described by a series of copy instructions that each
57289177Speter * do one of the following
58289177Speter *
59289177Speter * - copy a given number of bytes from the text corpus starting at a
60289177Speter *   given offset
61289177Speter * - reference other instruction and specify how many of instructions of
62289177Speter *   that sequence shall be executed (i.e. a sub-sequence)
63289177Speter * - copy a number of bytes from the base representation buffer starting
64289177Speter *   at a given offset
65289177Speter */
66289177Speter
67289177Speter/* The contents of a fulltext / representation is defined by its first
68289177Speter * instruction and the number of instructions to execute.
69289177Speter */
70289177Spetertypedef struct rep_t
71289177Speter{
72289177Speter  apr_uint32_t first_instruction;
73289177Speter  apr_uint32_t instruction_count;
74289177Speter} rep_t;
75289177Speter
76289177Speter/* A single instruction.  The instruction type is being encoded in OFFSET.
77289177Speter */
78289177Spetertypedef struct instruction_t
79289177Speter{
80289177Speter  /* Instruction type and offset.
81289177Speter   * - offset < 0
82289177Speter   *   reference to instruction sub-sequence starting with
83289177Speter   *   container->instructions[-offset].
84289177Speter   * - 0 <= offset < container->base_text_len
85289177Speter   *   reference to the base text corpus;
86289177Speter   *   start copy at offset
87289177Speter   * - offset >= container->base_text_len
88289177Speter   *   reference to the text corpus;
89289177Speter   *   start copy at offset-container->base_text_len
90289177Speter   */
91289177Speter  apr_int32_t offset;
92289177Speter
93289177Speter  /* Number of bytes to copy / instructions to execute
94289177Speter   */
95289177Speter  apr_uint32_t count;
96289177Speter} instruction_t;
97289177Speter
98289177Speter/* Describe a base fulltext.
99289177Speter */
100289177Spetertypedef struct base_t
101289177Speter{
102289177Speter  /* Revision */
103289177Speter  svn_revnum_t revision;
104289177Speter
105289177Speter  /* Item within that revision */
106289177Speter  apr_uint64_t item_index;
107289177Speter
108289177Speter  /* Priority with which to use this base over others */
109289177Speter  int priority;
110289177Speter
111289177Speter  /* Index into builder->representations that identifies the copy
112289177Speter   * instructions for this base. */
113289177Speter  apr_uint32_t rep;
114289177Speter} base_t;
115289177Speter
116289177Speter/* Yet another hash data structure.  This one tries to be more cache
117289177Speter * friendly by putting the first byte of each hashed sequence in a
118289177Speter * common array.  This array will often fit into L1 or L2 at least and
119289177Speter * give a 99% accurate test for a match without giving false negatives.
120289177Speter */
121289177Spetertypedef struct hash_t
122289177Speter{
123289177Speter  /* for used entries i, prefixes[i] == text[offsets[i]]; 0 otherwise.
124289177Speter   * This allows for a quick check without resolving the double
125289177Speter   * indirection. */
126289177Speter  char *prefixes;
127289177Speter
128289177Speter  /* for used entries i, offsets[i] is start offset in the text corpus;
129289177Speter   * NO_OFFSET otherwise.
130289177Speter   */
131289177Speter  apr_uint32_t *offsets;
132289177Speter
133289177Speter  /* to be used later for optimizations. */
134289177Speter  apr_uint32_t *last_matches;
135289177Speter
136289177Speter  /* number of buckets in this hash, i.e. elements in each array above.
137289177Speter   * Must be 1 << (8 * sizeof(hash_key_t) - shift) */
138289177Speter  apr_size_t size;
139289177Speter
140289177Speter  /* number of buckets actually in use. Must be <= size. */
141289177Speter  apr_size_t used;
142289177Speter
143289177Speter  /* number of bits to shift right to map a hash_key_t to a bucket index */
144289177Speter  apr_size_t shift;
145289177Speter
146289177Speter  /* pool to use when growing the hash */
147289177Speter  apr_pool_t *pool;
148289177Speter} hash_t;
149289177Speter
150289177Speter/* Hash key type. 32 bits for pseudo-Adler32 hash sums.
151289177Speter */
152289177Spetertypedef apr_uint32_t hash_key_t;
153289177Speter
154289177Speter/* Constructor data structure.
155289177Speter */
156289177Speterstruct svn_fs_x__reps_builder_t
157289177Speter{
158289177Speter  /* file system to read base representations from */
159289177Speter  svn_fs_t *fs;
160289177Speter
161289177Speter  /* text corpus */
162289177Speter  svn_stringbuf_t *text;
163289177Speter
164289177Speter  /* text block hash */
165289177Speter  hash_t hash;
166289177Speter
167289177Speter  /* array of base_t objects describing all bases defined so far */
168289177Speter  apr_array_header_t *bases;
169289177Speter
170289177Speter  /* array of rep_t objects describing all fulltexts (including bases)
171289177Speter   * added so far */
172289177Speter  apr_array_header_t *reps;
173289177Speter
174289177Speter  /* array of instruction_t objects describing all instructions */
175289177Speter  apr_array_header_t *instructions;
176289177Speter
177289177Speter  /* number of bytes in the text corpus that belongs to bases */
178289177Speter  apr_size_t base_text_len;
179289177Speter};
180289177Speter
181289177Speter/* R/o container.
182289177Speter */
183289177Speterstruct svn_fs_x__reps_t
184289177Speter{
185289177Speter  /* text corpus */
186289177Speter  const char *text;
187289177Speter
188289177Speter  /* length of the text corpus in bytes */
189289177Speter  apr_size_t text_len;
190289177Speter
191289177Speter  /* bases used */
192289177Speter  const base_t *bases;
193289177Speter
194289177Speter  /* number of bases used */
195289177Speter  apr_size_t base_count;
196289177Speter
197289177Speter  /* fulltext i can be reconstructed by executing instructions
198289177Speter   * first_instructions[i] .. first_instructions[i+1]-1
199289177Speter   * (this array has one extra element at the end)
200289177Speter   */
201289177Speter  const apr_uint32_t *first_instructions;
202289177Speter
203289177Speter  /* number of fulltexts (no bases) */
204289177Speter  apr_size_t rep_count;
205289177Speter
206289177Speter  /* instructions */
207289177Speter  const instruction_t *instructions;
208289177Speter
209289177Speter  /* total number of instructions */
210289177Speter  apr_size_t instruction_count;
211289177Speter
212289177Speter  /* offsets > 0 but smaller that this are considered base references */
213289177Speter  apr_size_t base_text_len;
214289177Speter};
215289177Speter
216289177Speter/* describe a section in the extractor's result string that is not filled
217289177Speter * yet (but already exists).
218289177Speter */
219289177Spetertypedef struct missing_t
220289177Speter{
221289177Speter  /* start offset within the result string */
222289177Speter  apr_uint32_t start;
223289177Speter
224289177Speter  /* number of bytes to write */
225289177Speter  apr_uint32_t count;
226289177Speter
227289177Speter  /* index into extractor->bases selecting the base representation to
228289177Speter   * copy from */
229289177Speter  apr_uint32_t base;
230289177Speter
231289177Speter  /* copy source offset within that base representation */
232289177Speter  apr_uint32_t offset;
233289177Speter} missing_t;
234289177Speter
235289177Speter/* Fulltext extractor data structure.
236289177Speter */
237289177Speterstruct svn_fs_x__rep_extractor_t
238289177Speter{
239289177Speter  /* filesystem to read the bases from */
240289177Speter  svn_fs_t *fs;
241289177Speter
242289177Speter  /* fulltext being constructed */
243289177Speter  svn_stringbuf_t *result;
244289177Speter
245289177Speter  /* bases (base_t) yet to process (not used ATM) */
246289177Speter  apr_array_header_t *bases;
247289177Speter
248289177Speter  /* missing sections (missing_t) in result->data that need to be filled,
249289177Speter   * yet */
250289177Speter  apr_array_header_t *missing;
251289177Speter
252289177Speter  /* pool to use for allocating the above arrays */
253289177Speter  apr_pool_t *pool;
254289177Speter};
255289177Speter
256289177Speter/* Given the ADLER32 checksum for a certain range of MATCH_BLOCKSIZE
257289177Speter * bytes, return the checksum for the range excluding the first byte
258289177Speter * C_OUT and appending C_IN.
259289177Speter */
260289177Speterstatic hash_key_t
261289177Speterhash_key_replace(hash_key_t adler32, const char c_out, const char c_in)
262289177Speter{
263289177Speter  adler32 -= (MATCH_BLOCKSIZE * 0x10000u * ((unsigned char) c_out));
264289177Speter
265289177Speter  adler32 -= (unsigned char)c_out;
266289177Speter  adler32 += (unsigned char)c_in;
267289177Speter
268289177Speter  return adler32 + adler32 * 0x10000;
269289177Speter}
270289177Speter
271289177Speter/* Calculate an pseudo-adler32 checksum for MATCH_BLOCKSIZE bytes starting
272289177Speter   at DATA.  Return the checksum value.  */
273289177Speterstatic hash_key_t
274289177Speterhash_key(const char *data)
275289177Speter{
276289177Speter  const unsigned char *input = (const unsigned char *)data;
277289177Speter  const unsigned char *last = input + MATCH_BLOCKSIZE;
278289177Speter
279289177Speter  hash_key_t s1 = 0;
280289177Speter  hash_key_t s2 = 0;
281289177Speter
282289177Speter  for (; input < last; input += 8)
283289177Speter    {
284289177Speter      s1 += input[0]; s2 += s1;
285289177Speter      s1 += input[1]; s2 += s1;
286289177Speter      s1 += input[2]; s2 += s1;
287289177Speter      s1 += input[3]; s2 += s1;
288289177Speter      s1 += input[4]; s2 += s1;
289289177Speter      s1 += input[5]; s2 += s1;
290289177Speter      s1 += input[6]; s2 += s1;
291289177Speter      s1 += input[7]; s2 += s1;
292289177Speter    }
293289177Speter
294289177Speter  return s2 * 0x10000 + s1;
295289177Speter}
296289177Speter
297289177Speter/* Map the ADLER32 key to a bucket index in HASH and return that index.
298289177Speter */
299289177Speterstatic apr_size_t
300289177Speterhash_to_index(hash_t *hash, hash_key_t adler32)
301289177Speter{
302289177Speter  return (adler32 * 0xd1f3da69) >> hash->shift;
303289177Speter}
304289177Speter
305289177Speter/* Allocate and initialized SIZE buckets in RESULT_POOL.
306289177Speter * Assign them to HASH.
307289177Speter */
308289177Speterstatic void
309289177Speterallocate_hash_members(hash_t *hash,
310289177Speter                      apr_size_t size,
311289177Speter                      apr_pool_t *result_pool)
312289177Speter{
313289177Speter  apr_size_t i;
314289177Speter
315289177Speter  hash->pool = result_pool;
316289177Speter  hash->size = size;
317289177Speter
318289177Speter  hash->prefixes = apr_pcalloc(result_pool, size);
319289177Speter  hash->last_matches = apr_pcalloc(result_pool,
320289177Speter                                   sizeof(*hash->last_matches) * size);
321289177Speter  hash->offsets = apr_palloc(result_pool, sizeof(*hash->offsets) * size);
322289177Speter
323289177Speter  for (i = 0; i < size; ++i)
324289177Speter    hash->offsets[i] = NO_OFFSET;
325289177Speter}
326289177Speter
327289177Speter/* Initialize the HASH data structure with 2**TWOPOWER buckets allocated
328289177Speter * in RESULT_POOL.
329289177Speter */
330289177Speterstatic void
331289177Speterinit_hash(hash_t *hash,
332289177Speter          apr_size_t twoPower,
333289177Speter          apr_pool_t *result_pool)
334289177Speter{
335289177Speter  hash->used = 0;
336289177Speter  hash->shift = sizeof(hash_key_t) * 8 - twoPower;
337289177Speter
338289177Speter  allocate_hash_members(hash, 1 << twoPower, result_pool);
339289177Speter}
340289177Speter
341289177Speter/* Make HASH have at least MIN_SIZE buckets but at least double the number
342289177Speter * of buckets in HASH by rehashing it based TEXT.
343289177Speter */
344289177Speterstatic void
345289177Spetergrow_hash(hash_t *hash,
346289177Speter          svn_stringbuf_t *text,
347289177Speter          apr_size_t min_size)
348289177Speter{
349289177Speter  hash_t copy;
350289177Speter  apr_size_t i;
351289177Speter
352289177Speter  /* determine the new hash size */
353289177Speter  apr_size_t new_size = hash->size * 2;
354289177Speter  apr_size_t new_shift = hash->shift - 1;
355289177Speter  while (new_size < min_size)
356289177Speter    {
357289177Speter      new_size *= 2;
358289177Speter      --new_shift;
359289177Speter    }
360289177Speter
361289177Speter  /* allocate new hash */
362289177Speter  allocate_hash_members(&copy, new_size, hash->pool);
363289177Speter  copy.used = 0;
364289177Speter  copy.shift = new_shift;
365289177Speter
366289177Speter  /* copy / translate data */
367289177Speter  for (i = 0; i < hash->size; ++i)
368289177Speter    {
369289177Speter      apr_uint32_t offset = hash->offsets[i];
370289177Speter      if (offset != NO_OFFSET)
371289177Speter        {
372289177Speter          hash_key_t key = hash_key(text->data + offset);
373289177Speter          size_t idx = hash_to_index(&copy, key);
374289177Speter
375289177Speter          if (copy.offsets[idx] == NO_OFFSET)
376289177Speter            copy.used++;
377289177Speter
378289177Speter          copy.prefixes[idx] = hash->prefixes[i];
379289177Speter          copy.offsets[idx] = offset;
380289177Speter          copy.last_matches[idx] = hash->last_matches[i];
381289177Speter        }
382289177Speter    }
383289177Speter
384289177Speter  *hash = copy;
385289177Speter}
386289177Speter
387289177Spetersvn_fs_x__reps_builder_t *
388289177Spetersvn_fs_x__reps_builder_create(svn_fs_t *fs,
389289177Speter                              apr_pool_t *result_pool)
390289177Speter{
391289177Speter  svn_fs_x__reps_builder_t *result = apr_pcalloc(result_pool,
392289177Speter                                                 sizeof(*result));
393289177Speter
394289177Speter  result->fs = fs;
395289177Speter  result->text = svn_stringbuf_create_empty(result_pool);
396289177Speter  init_hash(&result->hash, 4, result_pool);
397289177Speter
398289177Speter  result->bases = apr_array_make(result_pool, 0, sizeof(base_t));
399289177Speter  result->reps = apr_array_make(result_pool, 0, sizeof(rep_t));
400289177Speter  result->instructions = apr_array_make(result_pool, 0,
401289177Speter                                        sizeof(instruction_t));
402289177Speter
403289177Speter  return result;
404289177Speter}
405289177Speter
406289177Spetersvn_error_t *
407289177Spetersvn_fs_x__reps_add_base(svn_fs_x__reps_builder_t *builder,
408289177Speter                        svn_fs_x__representation_t *rep,
409289177Speter                        int priority,
410289177Speter                        apr_pool_t *scratch_pool)
411289177Speter{
412289177Speter  base_t base;
413289177Speter  apr_size_t text_start_offset = builder->text->len;
414289177Speter
415289177Speter  svn_stream_t *stream;
416289177Speter  svn_string_t *contents;
417289177Speter  apr_size_t idx;
418289177Speter  SVN_ERR(svn_fs_x__get_contents(&stream, builder->fs, rep, FALSE,
419289177Speter                                 scratch_pool));
420362181Sdim  SVN_ERR(svn_string_from_stream2(&contents, stream, SVN__STREAM_CHUNK_SIZE,
421362181Sdim                                  scratch_pool));
422289177Speter  SVN_ERR(svn_fs_x__reps_add(&idx, builder, contents));
423289177Speter
424289177Speter  base.revision = svn_fs_x__get_revnum(rep->id.change_set);
425289177Speter  base.item_index = rep->id.number;
426289177Speter  base.priority = priority;
427289177Speter  base.rep = (apr_uint32_t)idx;
428289177Speter
429289177Speter  APR_ARRAY_PUSH(builder->bases, base_t) = base;
430289177Speter  builder->base_text_len += builder->text->len - text_start_offset;
431289177Speter
432289177Speter  return SVN_NO_ERROR;
433289177Speter}
434289177Speter
435289177Speter/* Add LEN bytes from DATA to BUILDER's text corpus. Also, add a copy
436289177Speter * operation for that text fragment.
437289177Speter */
438289177Speterstatic void
439289177Speteradd_new_text(svn_fs_x__reps_builder_t *builder,
440289177Speter             const char *data,
441289177Speter             apr_size_t len)
442289177Speter{
443289177Speter  instruction_t instruction;
444289177Speter  apr_size_t offset;
445289177Speter  apr_size_t buckets_required;
446289177Speter
447289177Speter  if (len == 0)
448289177Speter    return;
449289177Speter
450289177Speter  /* new instruction */
451289177Speter  instruction.offset = (apr_int32_t)builder->text->len;
452289177Speter  instruction.count = (apr_uint32_t)len;
453289177Speter  APR_ARRAY_PUSH(builder->instructions, instruction_t) = instruction;
454289177Speter
455289177Speter  /* add to text corpus */
456289177Speter  svn_stringbuf_appendbytes(builder->text, data, len);
457289177Speter
458289177Speter  /* expand the hash upfront to minimize the chances of collisions */
459289177Speter  buckets_required = builder->hash.used + len / MATCH_BLOCKSIZE;
460289177Speter  if (buckets_required * 3 >= builder->hash.size * 2)
461289177Speter    grow_hash(&builder->hash, builder->text, 2 * buckets_required);
462289177Speter
463289177Speter  /* add hash entries for the new sequence */
464289177Speter  for (offset = instruction.offset;
465289177Speter       offset + MATCH_BLOCKSIZE <= builder->text->len;
466289177Speter       offset += MATCH_BLOCKSIZE)
467289177Speter    {
468289177Speter      hash_key_t key = hash_key(builder->text->data + offset);
469289177Speter      size_t idx = hash_to_index(&builder->hash, key);
470289177Speter
471289177Speter      /* Don't replace hash entries that stem from the current text.
472289177Speter       * This makes early matches more likely. */
473289177Speter      if (builder->hash.offsets[idx] == NO_OFFSET)
474289177Speter        ++builder->hash.used;
475289177Speter      else if (builder->hash.offsets[idx] >= instruction.offset)
476289177Speter        continue;
477289177Speter
478289177Speter      builder->hash.offsets[idx] = (apr_uint32_t)offset;
479289177Speter      builder->hash.prefixes[idx] = builder->text->data[offset];
480289177Speter    }
481289177Speter}
482289177Speter
483289177Spetersvn_error_t *
484289177Spetersvn_fs_x__reps_add(apr_size_t *rep_idx,
485289177Speter                   svn_fs_x__reps_builder_t *builder,
486289177Speter                   const svn_string_t *contents)
487289177Speter{
488289177Speter  rep_t rep;
489289177Speter  const char *current = contents->data;
490289177Speter  const char *processed = current;
491289177Speter  const char *end = current + contents->len;
492289177Speter  const char *last_to_test = end - MATCH_BLOCKSIZE - 1;
493289177Speter
494289177Speter  if (builder->text->len + contents->len > MAX_TEXT_BODY)
495289177Speter    return svn_error_create(SVN_ERR_FS_CONTAINER_SIZE, NULL,
496289177Speter                      _("Text body exceeds star delta container capacity"));
497289177Speter
498289177Speter  if (  builder->instructions->nelts + 2 * contents->len / MATCH_BLOCKSIZE
499289177Speter      > MAX_INSTRUCTIONS)
500289177Speter    return svn_error_create(SVN_ERR_FS_CONTAINER_SIZE, NULL,
501289177Speter              _("Instruction count exceeds star delta container capacity"));
502289177Speter
503289177Speter  rep.first_instruction = (apr_uint32_t)builder->instructions->nelts;
504289177Speter  while (current < last_to_test)
505289177Speter    {
506289177Speter      hash_key_t key = hash_key(current);
507289177Speter      size_t offset;
508289177Speter      size_t idx;
509289177Speter
510289177Speter      /* search for the next matching sequence */
511289177Speter
512289177Speter      for (; current < last_to_test; ++current)
513289177Speter        {
514289177Speter          idx = hash_to_index(&builder->hash, key);
515289177Speter          if (builder->hash.prefixes[idx] == current[0])
516289177Speter            {
517289177Speter              offset = builder->hash.offsets[idx];
518289177Speter              if (   (offset != NO_OFFSET)
519289177Speter                  && (memcmp(&builder->text->data[offset], current,
520289177Speter                             MATCH_BLOCKSIZE) == 0))
521289177Speter                break;
522289177Speter            }
523289177Speter          key = hash_key_replace(key, current[0], current[MATCH_BLOCKSIZE]);
524289177Speter        }
525289177Speter
526289177Speter      /* found it? */
527289177Speter
528289177Speter      if (current < last_to_test)
529289177Speter        {
530289177Speter          instruction_t instruction;
531289177Speter
532289177Speter          /* extend the match */
533289177Speter
534289177Speter          size_t prefix_match
535289177Speter            = svn_cstring__reverse_match_length(current,
536289177Speter                                                builder->text->data + offset,
537289177Speter                                                MIN(offset, current - processed));
538289177Speter          size_t postfix_match
539289177Speter            = svn_cstring__match_length(current + MATCH_BLOCKSIZE,
540289177Speter                           builder->text->data + offset + MATCH_BLOCKSIZE,
541289177Speter                           MIN(builder->text->len - offset - MATCH_BLOCKSIZE,
542289177Speter                               end - current - MATCH_BLOCKSIZE));
543289177Speter
544289177Speter          /* non-matched section */
545289177Speter
546289177Speter          size_t new_copy = (current - processed) - prefix_match;
547289177Speter          if (new_copy)
548289177Speter            add_new_text(builder, processed, new_copy);
549289177Speter
550289177Speter          /* add instruction for matching section */
551289177Speter
552289177Speter          instruction.offset = (apr_int32_t)(offset - prefix_match);
553289177Speter          instruction.count = (apr_uint32_t)(prefix_match + postfix_match +
554289177Speter                                             MATCH_BLOCKSIZE);
555289177Speter          APR_ARRAY_PUSH(builder->instructions, instruction_t) = instruction;
556289177Speter
557289177Speter          processed = current + MATCH_BLOCKSIZE + postfix_match;
558289177Speter          current = processed;
559289177Speter        }
560289177Speter    }
561289177Speter
562289177Speter  add_new_text(builder, processed, end - processed);
563289177Speter  rep.instruction_count = (apr_uint32_t)builder->instructions->nelts
564289177Speter                        - rep.first_instruction;
565289177Speter  APR_ARRAY_PUSH(builder->reps, rep_t) = rep;
566289177Speter
567289177Speter  *rep_idx = (apr_size_t)(builder->reps->nelts - 1);
568289177Speter  return SVN_NO_ERROR;
569289177Speter}
570289177Speter
571289177Speterapr_size_t
572289177Spetersvn_fs_x__reps_estimate_size(const svn_fs_x__reps_builder_t *builder)
573289177Speter{
574289177Speter  /* approx: size of the text exclusive to us @ 50% compression rate
575289177Speter   *       + 2 bytes per instruction
576289177Speter   *       + 2 bytes per representation
577289177Speter   *       + 8 bytes per base representation
578289177Speter   *       + 1:8 inefficiency in using the base representations
579289177Speter   *       + 100 bytes static overhead
580289177Speter   */
581289177Speter  return (builder->text->len - builder->base_text_len) / 2
582289177Speter       + builder->instructions->nelts * 2
583289177Speter       + builder->reps->nelts * 2
584289177Speter       + builder->bases->nelts * 8
585289177Speter       + builder->base_text_len / 8
586289177Speter       + 100;
587289177Speter}
588289177Speter
589289177Speter/* Execute COUNT instructions starting at INSTRUCTION_IDX in CONTAINER
590289177Speter * and fill the parts of EXTRACTOR->RESULT that we can from this container.
591289177Speter * Record the remainder in EXTRACTOR->MISSING.
592289177Speter *
593289177Speter * This function will recurse for instructions that reference other
594289177Speter * instruction sequences. COUNT refers to the top-level instructions only.
595289177Speter */
596289177Speterstatic void
597289177Speterget_text(svn_fs_x__rep_extractor_t *extractor,
598289177Speter         const svn_fs_x__reps_t *container,
599289177Speter         apr_size_t instruction_idx,
600289177Speter         apr_size_t count)
601289177Speter{
602289177Speter  const instruction_t *instruction;
603289177Speter  const char *offset_0 = container->text - container->base_text_len;
604289177Speter
605289177Speter  for (instruction = container->instructions + instruction_idx;
606289177Speter       instruction < container->instructions + instruction_idx + count;
607289177Speter       instruction++)
608289177Speter    if (instruction->offset < 0)
609289177Speter      {
610289177Speter        /* instruction sub-sequence */
611289177Speter        get_text(extractor, container, -instruction->offset,
612289177Speter                 instruction->count);
613289177Speter      }
614289177Speter    else if (instruction->offset >= container->base_text_len)
615289177Speter      {
616289177Speter        /* direct copy instruction */
617289177Speter        svn_stringbuf_appendbytes(extractor->result,
618289177Speter                                  offset_0 + instruction->offset,
619289177Speter                                  instruction->count);
620289177Speter      }
621289177Speter    else
622289177Speter      {
623289177Speter        /* a section that we need to fill from some external base rep. */
624289177Speter        missing_t missing;
625289177Speter        missing.base = 0;
626289177Speter        missing.start = (apr_uint32_t)extractor->result->len;
627289177Speter        missing.count = instruction->count;
628289177Speter        missing.offset = instruction->offset;
629289177Speter        svn_stringbuf_appendfill(extractor->result, 0, instruction->count);
630289177Speter
631289177Speter        if (extractor->missing == NULL)
632289177Speter          extractor->missing = apr_array_make(extractor->pool, 1,
633289177Speter                                              sizeof(missing));
634289177Speter
635289177Speter        APR_ARRAY_PUSH(extractor->missing, missing_t) = missing;
636289177Speter      }
637289177Speter}
638289177Speter
639289177Spetersvn_error_t *
640289177Spetersvn_fs_x__reps_get(svn_fs_x__rep_extractor_t **extractor,
641289177Speter                   svn_fs_t *fs,
642289177Speter                   const svn_fs_x__reps_t *container,
643289177Speter                   apr_size_t idx,
644362181Sdim                   apr_pool_t *result_pool)
645289177Speter{
646289177Speter  apr_uint32_t first = container->first_instructions[idx];
647289177Speter  apr_uint32_t last = container->first_instructions[idx + 1];
648289177Speter
649289177Speter  /* create the extractor object */
650362181Sdim  svn_fs_x__rep_extractor_t *result = apr_pcalloc(result_pool,
651362181Sdim                                                  sizeof(*result));
652289177Speter  result->fs = fs;
653362181Sdim  result->result = svn_stringbuf_create_empty(result_pool);
654362181Sdim  result->pool = result_pool;
655289177Speter
656289177Speter  /* fill all the bits of the result that we can, i.e. all but bits coming
657289177Speter   * from base representations */
658289177Speter  get_text(result, container, first, last - first);
659289177Speter  *extractor = result;
660289177Speter  return SVN_NO_ERROR;
661289177Speter}
662289177Speter
663289177Spetersvn_error_t *
664289177Spetersvn_fs_x__extractor_drive(svn_stringbuf_t **contents,
665289177Speter                          svn_fs_x__rep_extractor_t *extractor,
666289177Speter                          apr_size_t start_offset,
667289177Speter                          apr_size_t size,
668289177Speter                          apr_pool_t *result_pool,
669289177Speter                          apr_pool_t *scratch_pool)
670289177Speter{
671289177Speter  /* we don't support base reps right now */
672289177Speter  SVN_ERR_ASSERT(extractor->missing == NULL);
673289177Speter
674289177Speter  if (size == 0)
675289177Speter    {
676289177Speter      *contents = svn_stringbuf_dup(extractor->result, result_pool);
677289177Speter    }
678289177Speter  else
679289177Speter    {
680289177Speter      /* clip the selected range */
681289177Speter      if (start_offset > extractor->result->len)
682289177Speter        start_offset = extractor->result->len;
683289177Speter
684289177Speter      if (size > extractor->result->len - start_offset)
685289177Speter        size = extractor->result->len - start_offset;
686289177Speter
687289177Speter      *contents = svn_stringbuf_ncreate(extractor->result->data + start_offset,
688289177Speter                                        size, result_pool);
689289177Speter    }
690289177Speter
691289177Speter  return SVN_NO_ERROR;
692289177Speter}
693289177Speter
694289177Spetersvn_error_t *
695289177Spetersvn_fs_x__write_reps_container(svn_stream_t *stream,
696289177Speter                               const svn_fs_x__reps_builder_t *builder,
697289177Speter                               apr_pool_t *scratch_pool)
698289177Speter{
699289177Speter  int i;
700289177Speter  svn_packed__data_root_t *root = svn_packed__data_create_root(scratch_pool);
701289177Speter
702289177Speter  /* one top-level stream for each array */
703289177Speter  svn_packed__int_stream_t *bases_stream
704289177Speter    = svn_packed__create_int_stream(root, FALSE, FALSE);
705289177Speter  svn_packed__int_stream_t *reps_stream
706289177Speter    = svn_packed__create_int_stream(root, TRUE, FALSE);
707289177Speter  svn_packed__int_stream_t *instructions_stream
708289177Speter    = svn_packed__create_int_stream(root, FALSE, FALSE);
709289177Speter
710289177Speter  /* for misc stuff */
711289177Speter  svn_packed__int_stream_t *misc_stream
712289177Speter    = svn_packed__create_int_stream(root, FALSE, FALSE);
713289177Speter
714289177Speter  /* TEXT will be just a single string */
715289177Speter  svn_packed__byte_stream_t *text_stream
716289177Speter    = svn_packed__create_bytes_stream(root);
717289177Speter
718289177Speter  /* structure the struct streams such we can extract much of the redundancy
719289177Speter   */
720289177Speter  svn_packed__create_int_substream(bases_stream, TRUE, TRUE);
721289177Speter  svn_packed__create_int_substream(bases_stream, TRUE, FALSE);
722289177Speter  svn_packed__create_int_substream(bases_stream, TRUE, FALSE);
723289177Speter  svn_packed__create_int_substream(bases_stream, TRUE, FALSE);
724289177Speter
725289177Speter  svn_packed__create_int_substream(instructions_stream, TRUE, TRUE);
726289177Speter  svn_packed__create_int_substream(instructions_stream, FALSE, FALSE);
727289177Speter
728289177Speter  /* text */
729289177Speter  svn_packed__add_bytes(text_stream, builder->text->data, builder->text->len);
730289177Speter
731289177Speter  /* serialize bases */
732289177Speter  for (i = 0; i < builder->bases->nelts; ++i)
733289177Speter    {
734289177Speter      const base_t *base = &APR_ARRAY_IDX(builder->bases, i, base_t);
735289177Speter      svn_packed__add_int(bases_stream, base->revision);
736289177Speter      svn_packed__add_uint(bases_stream, base->item_index);
737289177Speter      svn_packed__add_uint(bases_stream, base->priority);
738289177Speter      svn_packed__add_uint(bases_stream, base->rep);
739289177Speter    }
740289177Speter
741289177Speter  /* serialize reps */
742289177Speter  for (i = 0; i < builder->reps->nelts; ++i)
743289177Speter    {
744289177Speter      const rep_t *rep = &APR_ARRAY_IDX(builder->reps, i, rep_t);
745289177Speter      svn_packed__add_uint(reps_stream, rep->first_instruction);
746289177Speter    }
747289177Speter
748289177Speter  svn_packed__add_uint(reps_stream, builder->instructions->nelts);
749289177Speter
750289177Speter  /* serialize instructions */
751289177Speter  for (i = 0; i < builder->instructions->nelts; ++i)
752289177Speter    {
753289177Speter      const instruction_t *instruction
754289177Speter        = &APR_ARRAY_IDX(builder->instructions, i, instruction_t);
755289177Speter      svn_packed__add_int(instructions_stream, instruction->offset);
756289177Speter      svn_packed__add_uint(instructions_stream, instruction->count);
757289177Speter    }
758289177Speter
759289177Speter  /* other elements */
760289177Speter  svn_packed__add_uint(misc_stream, 0);
761289177Speter
762289177Speter  /* write to stream */
763289177Speter  SVN_ERR(svn_packed__data_write(stream, root, scratch_pool));
764289177Speter
765289177Speter  return SVN_NO_ERROR;
766289177Speter}
767289177Speter
768289177Spetersvn_error_t *
769289177Spetersvn_fs_x__read_reps_container(svn_fs_x__reps_t **container,
770289177Speter                              svn_stream_t *stream,
771289177Speter                              apr_pool_t *result_pool,
772289177Speter                              apr_pool_t *scratch_pool)
773289177Speter{
774289177Speter  apr_size_t i;
775289177Speter
776289177Speter  base_t *bases;
777289177Speter  apr_uint32_t *first_instructions;
778289177Speter  instruction_t *instructions;
779289177Speter
780289177Speter  svn_fs_x__reps_t *reps = apr_pcalloc(result_pool, sizeof(*reps));
781289177Speter
782289177Speter  svn_packed__data_root_t *root;
783289177Speter  svn_packed__int_stream_t *bases_stream;
784289177Speter  svn_packed__int_stream_t *reps_stream;
785289177Speter  svn_packed__int_stream_t *instructions_stream;
786289177Speter  svn_packed__int_stream_t *misc_stream;
787289177Speter  svn_packed__byte_stream_t *text_stream;
788289177Speter
789289177Speter  /* read from disk */
790289177Speter  SVN_ERR(svn_packed__data_read(&root, stream, result_pool, scratch_pool));
791289177Speter
792289177Speter  bases_stream = svn_packed__first_int_stream(root);
793289177Speter  reps_stream = svn_packed__next_int_stream(bases_stream);
794289177Speter  instructions_stream = svn_packed__next_int_stream(reps_stream);
795289177Speter  misc_stream = svn_packed__next_int_stream(instructions_stream);
796289177Speter  text_stream = svn_packed__first_byte_stream(root);
797289177Speter
798289177Speter  /* text */
799289177Speter  reps->text = svn_packed__get_bytes(text_stream, &reps->text_len);
800289177Speter  reps->text = apr_pmemdup(result_pool, reps->text, reps->text_len);
801289177Speter
802289177Speter  /* de-serialize  bases */
803289177Speter  reps->base_count
804289177Speter    = svn_packed__int_count(svn_packed__first_int_substream(bases_stream));
805289177Speter  bases = apr_palloc(result_pool, reps->base_count * sizeof(*bases));
806289177Speter  reps->bases = bases;
807289177Speter
808289177Speter  for (i = 0; i < reps->base_count; ++i)
809289177Speter    {
810289177Speter      base_t *base = bases + i;
811289177Speter      base->revision = (svn_revnum_t)svn_packed__get_int(bases_stream);
812289177Speter      base->item_index = svn_packed__get_uint(bases_stream);
813289177Speter      base->priority = (int)svn_packed__get_uint(bases_stream);
814289177Speter      base->rep = (apr_uint32_t)svn_packed__get_uint(bases_stream);
815289177Speter    }
816289177Speter
817289177Speter  /* de-serialize instructions */
818289177Speter  reps->instruction_count
819289177Speter    = svn_packed__int_count
820289177Speter         (svn_packed__first_int_substream(instructions_stream));
821289177Speter  instructions
822289177Speter    = apr_palloc(result_pool,
823289177Speter                 reps->instruction_count * sizeof(*instructions));
824289177Speter  reps->instructions = instructions;
825289177Speter
826289177Speter  for (i = 0; i < reps->instruction_count; ++i)
827289177Speter    {
828289177Speter      instruction_t *instruction = instructions + i;
829289177Speter      instruction->offset
830289177Speter        = (apr_int32_t)svn_packed__get_int(instructions_stream);
831289177Speter      instruction->count
832289177Speter        = (apr_uint32_t)svn_packed__get_uint(instructions_stream);
833289177Speter    }
834289177Speter
835289177Speter  /* de-serialize reps */
836289177Speter  reps->rep_count = svn_packed__int_count(reps_stream);
837289177Speter  first_instructions
838289177Speter    = apr_palloc(result_pool,
839289177Speter                 (reps->rep_count + 1) * sizeof(*first_instructions));
840289177Speter  reps->first_instructions = first_instructions;
841289177Speter
842289177Speter  for (i = 0; i < reps->rep_count; ++i)
843289177Speter    first_instructions[i]
844289177Speter      = (apr_uint32_t)svn_packed__get_uint(reps_stream);
845289177Speter  first_instructions[reps->rep_count] = (apr_uint32_t)reps->instruction_count;
846289177Speter
847289177Speter  /* other elements */
848289177Speter  reps->base_text_len = (apr_size_t)svn_packed__get_uint(misc_stream);
849289177Speter
850289177Speter  /* return result */
851289177Speter  *container = reps;
852289177Speter
853289177Speter  return SVN_NO_ERROR;
854289177Speter}
855289177Speter
856289177Spetersvn_error_t *
857289177Spetersvn_fs_x__serialize_reps_container(void **data,
858289177Speter                                   apr_size_t *data_len,
859289177Speter                                   void *in,
860289177Speter                                   apr_pool_t *pool)
861289177Speter{
862289177Speter  svn_fs_x__reps_t *reps = in;
863289177Speter  svn_stringbuf_t *serialized;
864289177Speter
865289177Speter  /* make a guesstimate on the size of the serialized data.  Erring on the
866289177Speter   * low side will cause the serializer to re-alloc its buffer. */
867289177Speter  apr_size_t size
868289177Speter    = reps->text_len
869289177Speter    + reps->base_count * sizeof(*reps->bases)
870289177Speter    + reps->rep_count * sizeof(*reps->first_instructions)
871289177Speter    + reps->instruction_count * sizeof(*reps->instructions)
872289177Speter    + 100;
873289177Speter
874289177Speter  /* serialize array header and all its elements */
875289177Speter  svn_temp_serializer__context_t *context
876289177Speter    = svn_temp_serializer__init(reps, sizeof(*reps), size, pool);
877289177Speter
878289177Speter  /* serialize sub-structures */
879289177Speter  svn_temp_serializer__add_leaf(context, (const void **)&reps->text,
880289177Speter                                reps->text_len);
881289177Speter  svn_temp_serializer__add_leaf(context, (const void **)&reps->bases,
882289177Speter                                reps->base_count * sizeof(*reps->bases));
883289177Speter  svn_temp_serializer__add_leaf(context,
884289177Speter                                (const void **)&reps->first_instructions,
885289177Speter                                reps->rep_count *
886289177Speter                                    sizeof(*reps->first_instructions));
887289177Speter  svn_temp_serializer__add_leaf(context, (const void **)&reps->instructions,
888289177Speter                                reps->instruction_count *
889289177Speter                                    sizeof(*reps->instructions));
890289177Speter
891289177Speter  /* return the serialized result */
892289177Speter  serialized = svn_temp_serializer__get(context);
893289177Speter
894289177Speter  *data = serialized->data;
895289177Speter  *data_len = serialized->len;
896289177Speter
897289177Speter  return SVN_NO_ERROR;
898289177Speter}
899289177Speter
900289177Spetersvn_error_t *
901289177Spetersvn_fs_x__deserialize_reps_container(void **out,
902289177Speter                                     void *data,
903289177Speter                                     apr_size_t data_len,
904362181Sdim                                     apr_pool_t *result_pool)
905289177Speter{
906289177Speter  svn_fs_x__reps_t *reps = (svn_fs_x__reps_t *)data;
907289177Speter
908289177Speter  /* de-serialize sub-structures */
909289177Speter  svn_temp_deserializer__resolve(reps, (void **)&reps->text);
910289177Speter  svn_temp_deserializer__resolve(reps, (void **)&reps->bases);
911289177Speter  svn_temp_deserializer__resolve(reps, (void **)&reps->first_instructions);
912289177Speter  svn_temp_deserializer__resolve(reps, (void **)&reps->instructions);
913289177Speter
914289177Speter  /* done */
915289177Speter  *out = reps;
916289177Speter
917289177Speter  return SVN_NO_ERROR;
918289177Speter}
919289177Speter
920289177Spetersvn_error_t *
921289177Spetersvn_fs_x__reps_get_func(void **out,
922289177Speter                        const void *data,
923289177Speter                        apr_size_t data_len,
924289177Speter                        void *baton,
925289177Speter                        apr_pool_t *pool)
926289177Speter{
927289177Speter  svn_fs_x__reps_baton_t *reps_baton = baton;
928289177Speter
929289177Speter  /* get a usable reps structure  */
930289177Speter  const svn_fs_x__reps_t *cached = data;
931289177Speter  svn_fs_x__reps_t *reps = apr_pmemdup(pool, cached, sizeof(*reps));
932289177Speter
933289177Speter  reps->text
934289177Speter    = svn_temp_deserializer__ptr(cached, (const void **)&cached->text);
935289177Speter  reps->bases
936289177Speter    = svn_temp_deserializer__ptr(cached, (const void **)&cached->bases);
937289177Speter  reps->first_instructions
938289177Speter    = svn_temp_deserializer__ptr(cached,
939289177Speter                                 (const void **)&cached->first_instructions);
940289177Speter  reps->instructions
941289177Speter    = svn_temp_deserializer__ptr(cached,
942289177Speter                                 (const void **)&cached->instructions);
943289177Speter
944289177Speter  /* return an extractor for the selected item */
945289177Speter  SVN_ERR(svn_fs_x__reps_get((svn_fs_x__rep_extractor_t **)out,
946289177Speter                             reps_baton->fs, reps, reps_baton->idx, pool));
947289177Speter
948289177Speter  return SVN_NO_ERROR;
949289177Speter}
950