1289177Speter/* prefix_string.c --- implement strings based on a prefix tree
2289177Speter *
3289177Speter * ====================================================================
4289177Speter *    Licensed to the Apache Software Foundation (ASF) under one
5289177Speter *    or more contributor license agreements.  See the NOTICE file
6289177Speter *    distributed with this work for additional information
7289177Speter *    regarding copyright ownership.  The ASF licenses this file
8289177Speter *    to you under the Apache License, Version 2.0 (the
9289177Speter *    "License"); you may not use this file except in compliance
10289177Speter *    with the License.  You may obtain a copy of the License at
11289177Speter *
12289177Speter *      http://www.apache.org/licenses/LICENSE-2.0
13289177Speter *
14289177Speter *    Unless required by applicable law or agreed to in writing,
15289177Speter *    software distributed under the License is distributed on an
16289177Speter *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17289177Speter *    KIND, either express or implied.  See the License for the
18289177Speter *    specific language governing permissions and limitations
19289177Speter *    under the License.
20289177Speter * ====================================================================
21289177Speter */
22289177Speter
23289177Speter#include <assert.h>
24289177Speter#include "private/svn_string_private.h"
25289177Speter
26289177Speter/* A node in the tree represents a common prefix.  The root node is the
27289177Speter * empty prefix.  Nodes may have up to 256 sub-nodes, each starting with
28289177Speter * a different character (possibly '\0').
29289177Speter *
30289177Speter * The nodes in the tree store only up to 8 chars of the respective common
31289177Speter * prefix, i.e. longer common prefixes must be drawn out over multiple
32289177Speter * hierarchy levels.  This is a space <-> efficiency trade-off.
33289177Speter *
34289177Speter * Strings are the leaf nodes in the tree and use a specialized, smaller
35289177Speter * data structure.  They may add 0 to 7 extra chars to the prefix.  Both
36289177Speter * data types can be discerned by the last char in the data buffer.  This
37289177Speter * must be 0 for strings (leaves) and non-0 otherwise.  Please note that
38289177Speter * ordinary nodes have a length information so that no terminating 0 is
39289177Speter * required for them.
40289177Speter */
41289177Speter
42289177Speter/* forward declaration */
43289177Spetertypedef struct node_t node_t;
44289177Speter
45289177Speter/* String type and tree leaf.
46289177Speter */
47289177Speterstruct svn_prefix_string__t
48289177Speter{
49289177Speter  /* mandatory prefix */
50289177Speter  node_t *prefix;
51289177Speter
52362181Sdim  /* 0 ..7 chars to add the prefix.
53362181Sdim   *
54362181Sdim   * NUL-terminated, if this is indeed a tree leaf.  We use the same struct
55362181Sdim   * within node_t for inner tree nodes, too.  There, DATA[7] is not NUL,
56362181Sdim   * meaning DATA may or may not be NUL terminated.  The actual length is
57362181Sdim   * provided by the node_t.length field (minus parent node length). */
58289177Speter  char data[8];
59289177Speter};
60289177Speter
61289177Speter/* A node inside the tree, i.e. not a string and not a leaf (unless this is
62289177Speter * the root node).
63289177Speter *
64289177Speter * Note: keep the ordering to minimize size / alignment overhead on 64 bit
65289177Speter * machines.
66289177Speter */
67289177Speterstruct node_t
68289177Speter{
69289177Speter  /* pointer to the parent prefix plus the 1 .. 8 extra chars.
70289177Speter   * Only the root will provide 0 extra chars. */
71289177Speter  svn_prefix_string__t key;
72289177Speter
73289177Speter  /* Length of the prefix from the root down to and including this one.
74289177Speter   * 0 for the root node.  Only then will key.prefix be NULL. */
75289177Speter  apr_uint32_t length;
76289177Speter
77289177Speter  /* Number of entries used in SUB_NODES. */
78289177Speter  apr_uint32_t sub_node_count;
79289177Speter
80289177Speter  /* The sub-nodes, ordered by first char.  node_t and svn_prefix_string__t
81289177Speter   * may be mixed here.  May be NULL.
82289177Speter   * The number of allocated entries is always a power-of-two and only
83289177Speter   * given implicitly by SUB_NODE_COUNT. */
84289177Speter  struct node_t **sub_nodes;
85289177Speter};
86289177Speter
87289177Speter/* The actual tree structure. */
88289177Speterstruct svn_prefix_tree__t
89289177Speter{
90289177Speter  /* the common tree root (represents the empty prefix). */
91289177Speter  node_t *root;
92289177Speter
93289177Speter  /* all sub-nodes & strings will be allocated from this pool */
94289177Speter  apr_pool_t *pool;
95289177Speter};
96289177Speter
97289177Speter/* Return TRUE, iff NODE is a leaf node.
98289177Speter */
99289177Speterstatic svn_boolean_t
100289177Speteris_leaf(node_t *node)
101289177Speter{
102362181Sdim  /* If this NOT a leaf node and this node has ...
103362181Sdim   *    ... 8 chars, data[7] will not be NUL because we only support
104362181Sdim   *        NUL-*terminated* strings.
105362181Sdim   *    ... less than 8 chars, this will be set to 0xff
106362181Sdim   *        (any other non-NUL would do as well but this is not valid UTF8
107362181Sdim   *         making it easy to recognize during debugging etc.) */
108289177Speter  return node->key.data[7] == 0;
109289177Speter}
110289177Speter
111289177Speter/* Ensure that the sub-nodes array of NODE within TREE has at least one
112289177Speter * unused entry.  Re-allocate as necessary.
113289177Speter */
114289177Speterstatic void
115289177Speterauto_realloc_sub_nodes(svn_prefix_tree__t *tree,
116289177Speter                       node_t *node)
117289177Speter{
118289177Speter  if (node->sub_node_count & (node->sub_node_count - 1))
119289177Speter    return;
120289177Speter
121289177Speter  if (node->sub_node_count == 0)
122289177Speter    {
123289177Speter      node->sub_nodes = apr_pcalloc(tree->pool, sizeof(*node->sub_nodes));
124289177Speter    }
125289177Speter  else
126289177Speter    {
127289177Speter      node_t **sub_nodes
128289177Speter        = apr_pcalloc(tree->pool,
129289177Speter                      2 * node->sub_node_count * sizeof(*sub_nodes));
130289177Speter      memcpy(sub_nodes, node->sub_nodes,
131289177Speter             node->sub_node_count * sizeof(*sub_nodes));
132289177Speter      node->sub_nodes = sub_nodes;
133289177Speter    }
134289177Speter}
135289177Speter
136289177Speter/* Given the COUNT pointers in the SUB_NODES array, return the location at
137289177Speter * which KEY is either located or would be inserted.
138289177Speter */
139289177Speterstatic int
140289177Spetersearch_lower_bound(node_t **sub_nodes,
141289177Speter                   unsigned char key,
142289177Speter                   int count)
143289177Speter{
144289177Speter  int lower = 0;
145289177Speter  int upper = count - 1;
146289177Speter
147289177Speter  /* Binary search for the lowest position at which to insert KEY. */
148289177Speter  while (lower <= upper)
149289177Speter    {
150289177Speter      int current = lower + (upper - lower) / 2;
151289177Speter
152289177Speter      if ((unsigned char)sub_nodes[current]->key.data[0] < key)
153289177Speter        lower = current + 1;
154289177Speter      else
155289177Speter        upper = current - 1;
156289177Speter    }
157289177Speter
158289177Speter  return lower;
159289177Speter}
160289177Speter
161289177Spetersvn_prefix_tree__t *
162289177Spetersvn_prefix_tree__create(apr_pool_t *pool)
163289177Speter{
164289177Speter  svn_prefix_tree__t *tree = apr_pcalloc(pool, sizeof(*tree));
165289177Speter  tree->pool = pool;
166289177Speter
167289177Speter  tree->root = apr_pcalloc(pool, sizeof(*tree->root));
168362181Sdim  tree->root->key.data[7] = '\xff'; /* This is not a leaf. See is_leaf(). */
169289177Speter
170289177Speter  return tree;
171289177Speter}
172289177Speter
173289177Spetersvn_prefix_string__t *
174289177Spetersvn_prefix_string__create(svn_prefix_tree__t *tree,
175289177Speter                          const char *s)
176289177Speter{
177289177Speter  svn_prefix_string__t *new_string;
178289177Speter  apr_size_t len = strlen(s);
179289177Speter  node_t *node = tree->root;
180289177Speter  node_t *new_node;
181289177Speter  int idx;
182289177Speter
183289177Speter  /* walk the existing tree until we either find S or the node at which S
184289177Speter   * has to be inserted */
185289177Speter  while (TRUE)
186289177Speter    {
187289177Speter      node_t *sub_node;
188289177Speter      int match = 1;
189289177Speter
190289177Speter      /* index of the matching sub-node */
191289177Speter      idx = node->sub_node_count
192289177Speter          ? search_lower_bound(node->sub_nodes,
193289177Speter                               (unsigned char)s[node->length],
194289177Speter                               node->sub_node_count)
195289177Speter          : 0;
196289177Speter
197289177Speter      /* any (partially) matching sub-nodes? */
198289177Speter      if (idx == (int)node->sub_node_count
199289177Speter          || node->sub_nodes[idx]->key.data[0] != s[node->length])
200289177Speter        break;
201289177Speter
202362181Sdim      /* Yes, it matches - at least the first character does. */
203289177Speter      sub_node = node->sub_nodes[idx];
204289177Speter
205289177Speter      /* fully matching sub-node? */
206289177Speter      if (is_leaf(sub_node))
207289177Speter        {
208289177Speter          if (strcmp(sub_node->key.data, s + node->length) == 0)
209289177Speter            return &sub_node->key;
210289177Speter        }
211289177Speter      else
212289177Speter        {
213362181Sdim          /* The string formed by the path from the root down to
214362181Sdim           * SUB_NODE differs from S.
215362181Sdim           *
216362181Sdim           * Is it a prefix?  In that case, the chars added by SUB_NODE
217362181Sdim           * must fully match the respective chars in S. */
218289177Speter          apr_size_t sub_node_len = sub_node->length - node->length;
219289177Speter          if (strncmp(sub_node->key.data, s + node->length,
220289177Speter                      sub_node_len) == 0)
221289177Speter            {
222289177Speter              node = sub_node;
223289177Speter              continue;
224289177Speter            }
225289177Speter        }
226289177Speter
227362181Sdim      /* partial match -> split
228362181Sdim       *
229362181Sdim       * At this point, S may either be a prefix to the string represented
230362181Sdim       * by SUB_NODE, or they may diverge at some offset with
231362181Sdim       * SUB_NODE->KEY.DATA .
232362181Sdim       *
233362181Sdim       * MATCH starts with 1 here b/c we already know that at least one
234362181Sdim       * char matches.  Also, the loop will terminate because the strings
235362181Sdim       * differ before SUB_NODE->KEY.DATA - either at the NUL terminator
236362181Sdim       * of S or some char before that.
237362181Sdim       */
238289177Speter      while (sub_node->key.data[match] == s[node->length + match])
239289177Speter        ++match;
240289177Speter
241289177Speter      new_node = apr_pcalloc(tree->pool, sizeof(*new_node));
242289177Speter      new_node->key = sub_node->key;
243289177Speter      new_node->length = node->length + match;
244362181Sdim      new_node->key.data[7] = '\xff';  /* This is not a leaf. See is_leaf(). */
245289177Speter      new_node->sub_node_count = 1;
246289177Speter      new_node->sub_nodes = apr_palloc(tree->pool, sizeof(node_t *));
247289177Speter      new_node->sub_nodes[0] = sub_node;
248289177Speter
249289177Speter      memmove(sub_node->key.data, sub_node->key.data + match, 8 - match);
250289177Speter
251289177Speter      /* replace old sub-node with new one and continue lookup */
252289177Speter      sub_node->key.prefix = new_node;
253289177Speter      node->sub_nodes[idx] = new_node;
254289177Speter      node = new_node;
255289177Speter    }
256289177Speter
257289177Speter  /* add sub-node(s) and final string */
258362181Sdim  while (len - node->length > 7)
259289177Speter    {
260289177Speter      new_node = apr_pcalloc(tree->pool, sizeof(*new_node));
261289177Speter      new_node->key.prefix = node;
262289177Speter      new_node->length = node->length + 8;
263289177Speter      memcpy(new_node->key.data, s + node->length, 8);
264289177Speter
265289177Speter      auto_realloc_sub_nodes(tree, node);
266289177Speter      memmove(node->sub_nodes + idx + 1, node->sub_nodes + idx,
267289177Speter              (node->sub_node_count - idx) * sizeof(node_t *));
268289177Speter
269289177Speter      /* replace old sub-node with new one and continue lookup */
270289177Speter      node->sub_nodes[idx] = new_node;
271289177Speter      node->sub_node_count++;
272289177Speter      node = new_node;
273289177Speter      idx = 0;
274289177Speter    }
275289177Speter
276289177Speter  new_string = apr_pcalloc(tree->pool, sizeof(*new_string));
277289177Speter  new_string->prefix = node;
278289177Speter  memcpy(new_string->data, s + node->length, len - node->length);
279289177Speter
280289177Speter  auto_realloc_sub_nodes(tree, node);
281289177Speter  memmove(node->sub_nodes + idx + 1, node->sub_nodes + idx,
282289177Speter          (node->sub_node_count - idx) * sizeof(node_t *));
283289177Speter
284289177Speter  node->sub_nodes[idx] = (node_t *)new_string;
285289177Speter  node->sub_node_count++;
286289177Speter  return new_string;
287289177Speter}
288289177Speter
289289177Spetersvn_string_t *
290289177Spetersvn_prefix_string__expand(const svn_prefix_string__t *s,
291289177Speter                          apr_pool_t *pool)
292289177Speter{
293289177Speter  apr_size_t s_len = strlen(s->data);
294289177Speter  apr_size_t len = s->prefix->length + s_len;
295289177Speter  char *buffer = apr_palloc(pool, len + 1);
296289177Speter
297289177Speter  svn_string_t *result = apr_pcalloc(pool, sizeof(*result));
298289177Speter  result->data = buffer;
299289177Speter  result->len = len;
300289177Speter  buffer[len] = '\0';
301289177Speter
302289177Speter  while (s->prefix)
303289177Speter    {
304289177Speter      memcpy(buffer + s->prefix->length, s->data, len - s->prefix->length);
305289177Speter      len = s->prefix->length;
306289177Speter      s = &s->prefix->key;
307289177Speter    }
308289177Speter
309289177Speter  return result;
310289177Speter}
311289177Speter
312289177Speterint
313289177Spetersvn_prefix_string__compare(const svn_prefix_string__t *lhs,
314289177Speter                           const svn_prefix_string__t *rhs)
315289177Speter{
316289177Speter  const node_t *lhs_parent = lhs->prefix;
317289177Speter  const node_t *rhs_parent = rhs->prefix;
318289177Speter
319289177Speter  if (lhs == rhs)
320289177Speter    return 0;
321289177Speter
322289177Speter  /* find the common root */
323289177Speter  while (lhs_parent != rhs_parent)
324289177Speter    {
325289177Speter      if (lhs_parent->length <= rhs_parent->length)
326289177Speter        {
327289177Speter          rhs = &rhs_parent->key;
328289177Speter          rhs_parent = rhs_parent->key.prefix;
329289177Speter        }
330289177Speter      else if (rhs_parent->length <= lhs_parent->length)
331289177Speter        {
332289177Speter          lhs = &lhs_parent->key;
333289177Speter          lhs_parent = lhs_parent->key.prefix;
334289177Speter        }
335289177Speter
336289177Speter      /* same tree? */
337289177Speter      assert(lhs_parent && rhs_parent);
338289177Speter    }
339289177Speter
340289177Speter  /* at the common root, strings will differ in the first follow-up char */
341289177Speter  return (int)(unsigned char)lhs->data[0] - (int)(unsigned char)rhs->data[0];
342289177Speter}
343