1/* prefix_string.c --- implement strings based on a prefix tree
2 *
3 * ====================================================================
4 *    Licensed to the Apache Software Foundation (ASF) under one
5 *    or more contributor license agreements.  See the NOTICE file
6 *    distributed with this work for additional information
7 *    regarding copyright ownership.  The ASF licenses this file
8 *    to you under the Apache License, Version 2.0 (the
9 *    "License"); you may not use this file except in compliance
10 *    with the License.  You may obtain a copy of the License at
11 *
12 *      http://www.apache.org/licenses/LICENSE-2.0
13 *
14 *    Unless required by applicable law or agreed to in writing,
15 *    software distributed under the License is distributed on an
16 *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17 *    KIND, either express or implied.  See the License for the
18 *    specific language governing permissions and limitations
19 *    under the License.
20 * ====================================================================
21 */
22
23#include <assert.h>
24#include "private/svn_string_private.h"
25
26/* A node in the tree represents a common prefix.  The root node is the
27 * empty prefix.  Nodes may have up to 256 sub-nodes, each starting with
28 * a different character (possibly '\0').
29 *
30 * The nodes in the tree store only up to 8 chars of the respective common
31 * prefix, i.e. longer common prefixes must be drawn out over multiple
32 * hierarchy levels.  This is a space <-> efficiency trade-off.
33 *
34 * Strings are the leaf nodes in the tree and use a specialized, smaller
35 * data structure.  They may add 0 to 7 extra chars to the prefix.  Both
36 * data types can be discerned by the last char in the data buffer.  This
37 * must be 0 for strings (leaves) and non-0 otherwise.  Please note that
38 * ordinary nodes have a length information so that no terminating 0 is
39 * required for them.
40 */
41
42/* forward declaration */
43typedef struct node_t node_t;
44
45/* String type and tree leaf.
46 */
47struct svn_prefix_string__t
48{
49  /* mandatory prefix */
50  node_t *prefix;
51
52  /* 0 ..7 chars to add the prefix.
53   *
54   * NUL-terminated, if this is indeed a tree leaf.  We use the same struct
55   * within node_t for inner tree nodes, too.  There, DATA[7] is not NUL,
56   * meaning DATA may or may not be NUL terminated.  The actual length is
57   * provided by the node_t.length field (minus parent node length). */
58  char data[8];
59};
60
61/* A node inside the tree, i.e. not a string and not a leaf (unless this is
62 * the root node).
63 *
64 * Note: keep the ordering to minimize size / alignment overhead on 64 bit
65 * machines.
66 */
67struct node_t
68{
69  /* pointer to the parent prefix plus the 1 .. 8 extra chars.
70   * Only the root will provide 0 extra chars. */
71  svn_prefix_string__t key;
72
73  /* Length of the prefix from the root down to and including this one.
74   * 0 for the root node.  Only then will key.prefix be NULL. */
75  apr_uint32_t length;
76
77  /* Number of entries used in SUB_NODES. */
78  apr_uint32_t sub_node_count;
79
80  /* The sub-nodes, ordered by first char.  node_t and svn_prefix_string__t
81   * may be mixed here.  May be NULL.
82   * The number of allocated entries is always a power-of-two and only
83   * given implicitly by SUB_NODE_COUNT. */
84  struct node_t **sub_nodes;
85};
86
87/* The actual tree structure. */
88struct svn_prefix_tree__t
89{
90  /* the common tree root (represents the empty prefix). */
91  node_t *root;
92
93  /* all sub-nodes & strings will be allocated from this pool */
94  apr_pool_t *pool;
95};
96
97/* Return TRUE, iff NODE is a leaf node.
98 */
99static svn_boolean_t
100is_leaf(node_t *node)
101{
102  /* If this NOT a leaf node and this node has ...
103   *    ... 8 chars, data[7] will not be NUL because we only support
104   *        NUL-*terminated* strings.
105   *    ... less than 8 chars, this will be set to 0xff
106   *        (any other non-NUL would do as well but this is not valid UTF8
107   *         making it easy to recognize during debugging etc.) */
108  return node->key.data[7] == 0;
109}
110
111/* Ensure that the sub-nodes array of NODE within TREE has at least one
112 * unused entry.  Re-allocate as necessary.
113 */
114static void
115auto_realloc_sub_nodes(svn_prefix_tree__t *tree,
116                       node_t *node)
117{
118  if (node->sub_node_count & (node->sub_node_count - 1))
119    return;
120
121  if (node->sub_node_count == 0)
122    {
123      node->sub_nodes = apr_pcalloc(tree->pool, sizeof(*node->sub_nodes));
124    }
125  else
126    {
127      node_t **sub_nodes
128        = apr_pcalloc(tree->pool,
129                      2 * node->sub_node_count * sizeof(*sub_nodes));
130      memcpy(sub_nodes, node->sub_nodes,
131             node->sub_node_count * sizeof(*sub_nodes));
132      node->sub_nodes = sub_nodes;
133    }
134}
135
136/* Given the COUNT pointers in the SUB_NODES array, return the location at
137 * which KEY is either located or would be inserted.
138 */
139static int
140search_lower_bound(node_t **sub_nodes,
141                   unsigned char key,
142                   int count)
143{
144  int lower = 0;
145  int upper = count - 1;
146
147  /* Binary search for the lowest position at which to insert KEY. */
148  while (lower <= upper)
149    {
150      int current = lower + (upper - lower) / 2;
151
152      if ((unsigned char)sub_nodes[current]->key.data[0] < key)
153        lower = current + 1;
154      else
155        upper = current - 1;
156    }
157
158  return lower;
159}
160
161svn_prefix_tree__t *
162svn_prefix_tree__create(apr_pool_t *pool)
163{
164  svn_prefix_tree__t *tree = apr_pcalloc(pool, sizeof(*tree));
165  tree->pool = pool;
166
167  tree->root = apr_pcalloc(pool, sizeof(*tree->root));
168  tree->root->key.data[7] = '\xff'; /* This is not a leaf. See is_leaf(). */
169
170  return tree;
171}
172
173svn_prefix_string__t *
174svn_prefix_string__create(svn_prefix_tree__t *tree,
175                          const char *s)
176{
177  svn_prefix_string__t *new_string;
178  apr_size_t len = strlen(s);
179  node_t *node = tree->root;
180  node_t *new_node;
181  int idx;
182
183  /* walk the existing tree until we either find S or the node at which S
184   * has to be inserted */
185  while (TRUE)
186    {
187      node_t *sub_node;
188      int match = 1;
189
190      /* index of the matching sub-node */
191      idx = node->sub_node_count
192          ? search_lower_bound(node->sub_nodes,
193                               (unsigned char)s[node->length],
194                               node->sub_node_count)
195          : 0;
196
197      /* any (partially) matching sub-nodes? */
198      if (idx == (int)node->sub_node_count
199          || node->sub_nodes[idx]->key.data[0] != s[node->length])
200        break;
201
202      /* Yes, it matches - at least the first character does. */
203      sub_node = node->sub_nodes[idx];
204
205      /* fully matching sub-node? */
206      if (is_leaf(sub_node))
207        {
208          if (strcmp(sub_node->key.data, s + node->length) == 0)
209            return &sub_node->key;
210        }
211      else
212        {
213          /* The string formed by the path from the root down to
214           * SUB_NODE differs from S.
215           *
216           * Is it a prefix?  In that case, the chars added by SUB_NODE
217           * must fully match the respective chars in S. */
218          apr_size_t sub_node_len = sub_node->length - node->length;
219          if (strncmp(sub_node->key.data, s + node->length,
220                      sub_node_len) == 0)
221            {
222              node = sub_node;
223              continue;
224            }
225        }
226
227      /* partial match -> split
228       *
229       * At this point, S may either be a prefix to the string represented
230       * by SUB_NODE, or they may diverge at some offset with
231       * SUB_NODE->KEY.DATA .
232       *
233       * MATCH starts with 1 here b/c we already know that at least one
234       * char matches.  Also, the loop will terminate because the strings
235       * differ before SUB_NODE->KEY.DATA - either at the NUL terminator
236       * of S or some char before that.
237       */
238      while (sub_node->key.data[match] == s[node->length + match])
239        ++match;
240
241      new_node = apr_pcalloc(tree->pool, sizeof(*new_node));
242      new_node->key = sub_node->key;
243      new_node->length = node->length + match;
244      new_node->key.data[7] = '\xff';  /* This is not a leaf. See is_leaf(). */
245      new_node->sub_node_count = 1;
246      new_node->sub_nodes = apr_palloc(tree->pool, sizeof(node_t *));
247      new_node->sub_nodes[0] = sub_node;
248
249      memmove(sub_node->key.data, sub_node->key.data + match, 8 - match);
250
251      /* replace old sub-node with new one and continue lookup */
252      sub_node->key.prefix = new_node;
253      node->sub_nodes[idx] = new_node;
254      node = new_node;
255    }
256
257  /* add sub-node(s) and final string */
258  while (len - node->length > 7)
259    {
260      new_node = apr_pcalloc(tree->pool, sizeof(*new_node));
261      new_node->key.prefix = node;
262      new_node->length = node->length + 8;
263      memcpy(new_node->key.data, s + node->length, 8);
264
265      auto_realloc_sub_nodes(tree, node);
266      memmove(node->sub_nodes + idx + 1, node->sub_nodes + idx,
267              (node->sub_node_count - idx) * sizeof(node_t *));
268
269      /* replace old sub-node with new one and continue lookup */
270      node->sub_nodes[idx] = new_node;
271      node->sub_node_count++;
272      node = new_node;
273      idx = 0;
274    }
275
276  new_string = apr_pcalloc(tree->pool, sizeof(*new_string));
277  new_string->prefix = node;
278  memcpy(new_string->data, s + node->length, len - node->length);
279
280  auto_realloc_sub_nodes(tree, node);
281  memmove(node->sub_nodes + idx + 1, node->sub_nodes + idx,
282          (node->sub_node_count - idx) * sizeof(node_t *));
283
284  node->sub_nodes[idx] = (node_t *)new_string;
285  node->sub_node_count++;
286  return new_string;
287}
288
289svn_string_t *
290svn_prefix_string__expand(const svn_prefix_string__t *s,
291                          apr_pool_t *pool)
292{
293  apr_size_t s_len = strlen(s->data);
294  apr_size_t len = s->prefix->length + s_len;
295  char *buffer = apr_palloc(pool, len + 1);
296
297  svn_string_t *result = apr_pcalloc(pool, sizeof(*result));
298  result->data = buffer;
299  result->len = len;
300  buffer[len] = '\0';
301
302  while (s->prefix)
303    {
304      memcpy(buffer + s->prefix->length, s->data, len - s->prefix->length);
305      len = s->prefix->length;
306      s = &s->prefix->key;
307    }
308
309  return result;
310}
311
312int
313svn_prefix_string__compare(const svn_prefix_string__t *lhs,
314                           const svn_prefix_string__t *rhs)
315{
316  const node_t *lhs_parent = lhs->prefix;
317  const node_t *rhs_parent = rhs->prefix;
318
319  if (lhs == rhs)
320    return 0;
321
322  /* find the common root */
323  while (lhs_parent != rhs_parent)
324    {
325      if (lhs_parent->length <= rhs_parent->length)
326        {
327          rhs = &rhs_parent->key;
328          rhs_parent = rhs_parent->key.prefix;
329        }
330      else if (rhs_parent->length <= lhs_parent->length)
331        {
332          lhs = &lhs_parent->key;
333          lhs_parent = lhs_parent->key.prefix;
334        }
335
336      /* same tree? */
337      assert(lhs_parent && rhs_parent);
338    }
339
340  /* at the common root, strings will differ in the first follow-up char */
341  return (int)(unsigned char)lhs->data[0] - (int)(unsigned char)rhs->data[0];
342}
343