prefix_string.c revision 299742
1/* prefix_string.c --- implement strings based on a prefix tree
2 *
3 * ====================================================================
4 *    Licensed to the Apache Software Foundation (ASF) under one
5 *    or more contributor license agreements.  See the NOTICE file
6 *    distributed with this work for additional information
7 *    regarding copyright ownership.  The ASF licenses this file
8 *    to you under the Apache License, Version 2.0 (the
9 *    "License"); you may not use this file except in compliance
10 *    with the License.  You may obtain a copy of the License at
11 *
12 *      http://www.apache.org/licenses/LICENSE-2.0
13 *
14 *    Unless required by applicable law or agreed to in writing,
15 *    software distributed under the License is distributed on an
16 *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17 *    KIND, either express or implied.  See the License for the
18 *    specific language governing permissions and limitations
19 *    under the License.
20 * ====================================================================
21 */
22
23#include <assert.h>
24#include "private/svn_string_private.h"
25
26/* A node in the tree represents a common prefix.  The root node is the
27 * empty prefix.  Nodes may have up to 256 sub-nodes, each starting with
28 * a different character (possibly '\0').
29 *
30 * The nodes in the tree store only up to 8 chars of the respective common
31 * prefix, i.e. longer common prefixes must be drawn out over multiple
32 * hierarchy levels.  This is a space <-> efficiency trade-off.
33 *
34 * Strings are the leaf nodes in the tree and use a specialized, smaller
35 * data structure.  They may add 0 to 7 extra chars to the prefix.  Both
36 * data types can be discerned by the last char in the data buffer.  This
37 * must be 0 for strings (leaves) and non-0 otherwise.  Please note that
38 * ordinary nodes have a length information so that no terminating 0 is
39 * required for them.
40 */
41
42/* forward declaration */
43typedef struct node_t node_t;
44
45/* String type and tree leaf.
46 */
47struct svn_prefix_string__t
48{
49  /* mandatory prefix */
50  node_t *prefix;
51
52  /* 0 ..7 chars to add the the prefix. NUL-terminated. */
53  char data[8];
54};
55
56/* A node inside the tree, i.e. not a string and not a leaf (unless this is
57 * the root node).
58 *
59 * Note: keep the ordering to minimize size / alignment overhead on 64 bit
60 * machines.
61 */
62struct node_t
63{
64  /* pointer to the parent prefix plus the 1 .. 8 extra chars.
65   * Only the root will provide 0 extra chars. */
66  svn_prefix_string__t key;
67
68  /* Length of the prefix from the root down to and including this one.
69   * 0 for the root node.  Only then will key.prefix be NULL. */
70  apr_uint32_t length;
71
72  /* Number of entries used in SUB_NODES. */
73  apr_uint32_t sub_node_count;
74
75  /* The sub-nodes, ordered by first char.  node_t and svn_prefix_string__t
76   * may be mixed here.  May be NULL.
77   * The number of allocated entries is always a power-of-two and only
78   * given implicitly by SUB_NODE_COUNT. */
79  struct node_t **sub_nodes;
80};
81
82/* The actual tree structure. */
83struct svn_prefix_tree__t
84{
85  /* the common tree root (represents the empty prefix). */
86  node_t *root;
87
88  /* all sub-nodes & strings will be allocated from this pool */
89  apr_pool_t *pool;
90};
91
92/* Return TRUE, iff NODE is a leaf node.
93 */
94static svn_boolean_t
95is_leaf(node_t *node)
96{
97  return node->key.data[7] == 0;
98}
99
100/* Ensure that the sub-nodes array of NODE within TREE has at least one
101 * unused entry.  Re-allocate as necessary.
102 */
103static void
104auto_realloc_sub_nodes(svn_prefix_tree__t *tree,
105                       node_t *node)
106{
107  if (node->sub_node_count & (node->sub_node_count - 1))
108    return;
109
110  if (node->sub_node_count == 0)
111    {
112      node->sub_nodes = apr_pcalloc(tree->pool, sizeof(*node->sub_nodes));
113    }
114  else
115    {
116      node_t **sub_nodes
117        = apr_pcalloc(tree->pool,
118                      2 * node->sub_node_count * sizeof(*sub_nodes));
119      memcpy(sub_nodes, node->sub_nodes,
120             node->sub_node_count * sizeof(*sub_nodes));
121      node->sub_nodes = sub_nodes;
122    }
123}
124
125/* Given the COUNT pointers in the SUB_NODES array, return the location at
126 * which KEY is either located or would be inserted.
127 */
128static int
129search_lower_bound(node_t **sub_nodes,
130                   unsigned char key,
131                   int count)
132{
133  int lower = 0;
134  int upper = count - 1;
135
136  /* Binary search for the lowest position at which to insert KEY. */
137  while (lower <= upper)
138    {
139      int current = lower + (upper - lower) / 2;
140
141      if ((unsigned char)sub_nodes[current]->key.data[0] < key)
142        lower = current + 1;
143      else
144        upper = current - 1;
145    }
146
147  return lower;
148}
149
150svn_prefix_tree__t *
151svn_prefix_tree__create(apr_pool_t *pool)
152{
153  svn_prefix_tree__t *tree = apr_pcalloc(pool, sizeof(*tree));
154  tree->pool = pool;
155
156  tree->root = apr_pcalloc(pool, sizeof(*tree->root));
157  tree->root->key.data[7] = '\xff';
158
159  return tree;
160}
161
162svn_prefix_string__t *
163svn_prefix_string__create(svn_prefix_tree__t *tree,
164                          const char *s)
165{
166  svn_prefix_string__t *new_string;
167  apr_size_t len = strlen(s);
168  node_t *node = tree->root;
169  node_t *new_node;
170  int idx;
171
172  /* walk the existing tree until we either find S or the node at which S
173   * has to be inserted */
174  while (TRUE)
175    {
176      node_t *sub_node;
177      int match = 1;
178
179      /* index of the matching sub-node */
180      idx = node->sub_node_count
181          ? search_lower_bound(node->sub_nodes,
182                               (unsigned char)s[node->length],
183                               node->sub_node_count)
184          : 0;
185
186      /* any (partially) matching sub-nodes? */
187      if (idx == (int)node->sub_node_count
188          || node->sub_nodes[idx]->key.data[0] != s[node->length])
189        break;
190
191      sub_node = node->sub_nodes[idx];
192
193      /* fully matching sub-node? */
194      if (is_leaf(sub_node))
195        {
196          if (strcmp(sub_node->key.data, s + node->length) == 0)
197            return &sub_node->key;
198        }
199      else
200        {
201          apr_size_t sub_node_len = sub_node->length - node->length;
202          if (strncmp(sub_node->key.data, s + node->length,
203                      sub_node_len) == 0)
204            {
205              node = sub_node;
206              continue;
207            }
208        }
209
210      /* partial match -> split */
211      while (sub_node->key.data[match] == s[node->length + match])
212        ++match;
213
214      new_node = apr_pcalloc(tree->pool, sizeof(*new_node));
215      new_node->key = sub_node->key;
216      new_node->length = node->length + match;
217      new_node->key.data[7] = '\xff';
218      new_node->sub_node_count = 1;
219      new_node->sub_nodes = apr_palloc(tree->pool, sizeof(node_t *));
220      new_node->sub_nodes[0] = sub_node;
221
222      memmove(sub_node->key.data, sub_node->key.data + match, 8 - match);
223
224      /* replace old sub-node with new one and continue lookup */
225      sub_node->key.prefix = new_node;
226      node->sub_nodes[idx] = new_node;
227      node = new_node;
228    }
229
230  /* add sub-node(s) and final string */
231  while (node->length + 7 < len)
232    {
233      new_node = apr_pcalloc(tree->pool, sizeof(*new_node));
234      new_node->key.prefix = node;
235      new_node->length = node->length + 8;
236      memcpy(new_node->key.data, s + node->length, 8);
237
238      auto_realloc_sub_nodes(tree, node);
239      memmove(node->sub_nodes + idx + 1, node->sub_nodes + idx,
240              (node->sub_node_count - idx) * sizeof(node_t *));
241
242      /* replace old sub-node with new one and continue lookup */
243      node->sub_nodes[idx] = new_node;
244      node->sub_node_count++;
245      node = new_node;
246      idx = 0;
247    }
248
249  new_string = apr_pcalloc(tree->pool, sizeof(*new_string));
250  new_string->prefix = node;
251  memcpy(new_string->data, s + node->length, len - node->length);
252
253  auto_realloc_sub_nodes(tree, node);
254  memmove(node->sub_nodes + idx + 1, node->sub_nodes + idx,
255          (node->sub_node_count - idx) * sizeof(node_t *));
256
257  node->sub_nodes[idx] = (node_t *)new_string;
258  node->sub_node_count++;
259  return new_string;
260}
261
262svn_string_t *
263svn_prefix_string__expand(const svn_prefix_string__t *s,
264                          apr_pool_t *pool)
265{
266  apr_size_t s_len = strlen(s->data);
267  apr_size_t len = s->prefix->length + s_len;
268  char *buffer = apr_palloc(pool, len + 1);
269
270  svn_string_t *result = apr_pcalloc(pool, sizeof(*result));
271  result->data = buffer;
272  result->len = len;
273  buffer[len] = '\0';
274
275  while (s->prefix)
276    {
277      memcpy(buffer + s->prefix->length, s->data, len - s->prefix->length);
278      len = s->prefix->length;
279      s = &s->prefix->key;
280    }
281
282  return result;
283}
284
285int
286svn_prefix_string__compare(const svn_prefix_string__t *lhs,
287                           const svn_prefix_string__t *rhs)
288{
289  const node_t *lhs_parent = lhs->prefix;
290  const node_t *rhs_parent = rhs->prefix;
291
292  if (lhs == rhs)
293    return 0;
294
295  /* find the common root */
296  while (lhs_parent != rhs_parent)
297    {
298      if (lhs_parent->length <= rhs_parent->length)
299        {
300          rhs = &rhs_parent->key;
301          rhs_parent = rhs_parent->key.prefix;
302        }
303      else if (rhs_parent->length <= lhs_parent->length)
304        {
305          lhs = &lhs_parent->key;
306          lhs_parent = lhs_parent->key.prefix;
307        }
308
309      /* same tree? */
310      assert(lhs_parent && rhs_parent);
311    }
312
313  /* at the common root, strings will differ in the first follow-up char */
314  return (int)(unsigned char)lhs->data[0] - (int)(unsigned char)rhs->data[0];
315}
316