1/* prefix_string.c --- implement strings based on a prefix tree 2 * 3 * ==================================================================== 4 * Licensed to the Apache Software Foundation (ASF) under one 5 * or more contributor license agreements. See the NOTICE file 6 * distributed with this work for additional information 7 * regarding copyright ownership. The ASF licenses this file 8 * to you under the Apache License, Version 2.0 (the 9 * "License"); you may not use this file except in compliance 10 * with the License. You may obtain a copy of the License at 11 * 12 * http://www.apache.org/licenses/LICENSE-2.0 13 * 14 * Unless required by applicable law or agreed to in writing, 15 * software distributed under the License is distributed on an 16 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 17 * KIND, either express or implied. See the License for the 18 * specific language governing permissions and limitations 19 * under the License. 20 * ==================================================================== 21 */ 22 23#include <assert.h> 24#include "private/svn_string_private.h" 25 26/* A node in the tree represents a common prefix. The root node is the 27 * empty prefix. Nodes may have up to 256 sub-nodes, each starting with 28 * a different character (possibly '\0'). 29 * 30 * The nodes in the tree store only up to 8 chars of the respective common 31 * prefix, i.e. longer common prefixes must be drawn out over multiple 32 * hierarchy levels. This is a space <-> efficiency trade-off. 33 * 34 * Strings are the leaf nodes in the tree and use a specialized, smaller 35 * data structure. They may add 0 to 7 extra chars to the prefix. Both 36 * data types can be discerned by the last char in the data buffer. This 37 * must be 0 for strings (leaves) and non-0 otherwise. Please note that 38 * ordinary nodes have a length information so that no terminating 0 is 39 * required for them. 40 */ 41 42/* forward declaration */ 43typedef struct node_t node_t; 44 45/* String type and tree leaf. 46 */ 47struct svn_prefix_string__t 48{ 49 /* mandatory prefix */ 50 node_t *prefix; 51 52 /* 0 ..7 chars to add the prefix. 53 * 54 * NUL-terminated, if this is indeed a tree leaf. We use the same struct 55 * within node_t for inner tree nodes, too. There, DATA[7] is not NUL, 56 * meaning DATA may or may not be NUL terminated. The actual length is 57 * provided by the node_t.length field (minus parent node length). */ 58 char data[8]; 59}; 60 61/* A node inside the tree, i.e. not a string and not a leaf (unless this is 62 * the root node). 63 * 64 * Note: keep the ordering to minimize size / alignment overhead on 64 bit 65 * machines. 66 */ 67struct node_t 68{ 69 /* pointer to the parent prefix plus the 1 .. 8 extra chars. 70 * Only the root will provide 0 extra chars. */ 71 svn_prefix_string__t key; 72 73 /* Length of the prefix from the root down to and including this one. 74 * 0 for the root node. Only then will key.prefix be NULL. */ 75 apr_uint32_t length; 76 77 /* Number of entries used in SUB_NODES. */ 78 apr_uint32_t sub_node_count; 79 80 /* The sub-nodes, ordered by first char. node_t and svn_prefix_string__t 81 * may be mixed here. May be NULL. 82 * The number of allocated entries is always a power-of-two and only 83 * given implicitly by SUB_NODE_COUNT. */ 84 struct node_t **sub_nodes; 85}; 86 87/* The actual tree structure. */ 88struct svn_prefix_tree__t 89{ 90 /* the common tree root (represents the empty prefix). */ 91 node_t *root; 92 93 /* all sub-nodes & strings will be allocated from this pool */ 94 apr_pool_t *pool; 95}; 96 97/* Return TRUE, iff NODE is a leaf node. 98 */ 99static svn_boolean_t 100is_leaf(node_t *node) 101{ 102 /* If this NOT a leaf node and this node has ... 103 * ... 8 chars, data[7] will not be NUL because we only support 104 * NUL-*terminated* strings. 105 * ... less than 8 chars, this will be set to 0xff 106 * (any other non-NUL would do as well but this is not valid UTF8 107 * making it easy to recognize during debugging etc.) */ 108 return node->key.data[7] == 0; 109} 110 111/* Ensure that the sub-nodes array of NODE within TREE has at least one 112 * unused entry. Re-allocate as necessary. 113 */ 114static void 115auto_realloc_sub_nodes(svn_prefix_tree__t *tree, 116 node_t *node) 117{ 118 if (node->sub_node_count & (node->sub_node_count - 1)) 119 return; 120 121 if (node->sub_node_count == 0) 122 { 123 node->sub_nodes = apr_pcalloc(tree->pool, sizeof(*node->sub_nodes)); 124 } 125 else 126 { 127 node_t **sub_nodes 128 = apr_pcalloc(tree->pool, 129 2 * node->sub_node_count * sizeof(*sub_nodes)); 130 memcpy(sub_nodes, node->sub_nodes, 131 node->sub_node_count * sizeof(*sub_nodes)); 132 node->sub_nodes = sub_nodes; 133 } 134} 135 136/* Given the COUNT pointers in the SUB_NODES array, return the location at 137 * which KEY is either located or would be inserted. 138 */ 139static int 140search_lower_bound(node_t **sub_nodes, 141 unsigned char key, 142 int count) 143{ 144 int lower = 0; 145 int upper = count - 1; 146 147 /* Binary search for the lowest position at which to insert KEY. */ 148 while (lower <= upper) 149 { 150 int current = lower + (upper - lower) / 2; 151 152 if ((unsigned char)sub_nodes[current]->key.data[0] < key) 153 lower = current + 1; 154 else 155 upper = current - 1; 156 } 157 158 return lower; 159} 160 161svn_prefix_tree__t * 162svn_prefix_tree__create(apr_pool_t *pool) 163{ 164 svn_prefix_tree__t *tree = apr_pcalloc(pool, sizeof(*tree)); 165 tree->pool = pool; 166 167 tree->root = apr_pcalloc(pool, sizeof(*tree->root)); 168 tree->root->key.data[7] = '\xff'; /* This is not a leaf. See is_leaf(). */ 169 170 return tree; 171} 172 173svn_prefix_string__t * 174svn_prefix_string__create(svn_prefix_tree__t *tree, 175 const char *s) 176{ 177 svn_prefix_string__t *new_string; 178 apr_size_t len = strlen(s); 179 node_t *node = tree->root; 180 node_t *new_node; 181 int idx; 182 183 /* walk the existing tree until we either find S or the node at which S 184 * has to be inserted */ 185 while (TRUE) 186 { 187 node_t *sub_node; 188 int match = 1; 189 190 /* index of the matching sub-node */ 191 idx = node->sub_node_count 192 ? search_lower_bound(node->sub_nodes, 193 (unsigned char)s[node->length], 194 node->sub_node_count) 195 : 0; 196 197 /* any (partially) matching sub-nodes? */ 198 if (idx == (int)node->sub_node_count 199 || node->sub_nodes[idx]->key.data[0] != s[node->length]) 200 break; 201 202 /* Yes, it matches - at least the first character does. */ 203 sub_node = node->sub_nodes[idx]; 204 205 /* fully matching sub-node? */ 206 if (is_leaf(sub_node)) 207 { 208 if (strcmp(sub_node->key.data, s + node->length) == 0) 209 return &sub_node->key; 210 } 211 else 212 { 213 /* The string formed by the path from the root down to 214 * SUB_NODE differs from S. 215 * 216 * Is it a prefix? In that case, the chars added by SUB_NODE 217 * must fully match the respective chars in S. */ 218 apr_size_t sub_node_len = sub_node->length - node->length; 219 if (strncmp(sub_node->key.data, s + node->length, 220 sub_node_len) == 0) 221 { 222 node = sub_node; 223 continue; 224 } 225 } 226 227 /* partial match -> split 228 * 229 * At this point, S may either be a prefix to the string represented 230 * by SUB_NODE, or they may diverge at some offset with 231 * SUB_NODE->KEY.DATA . 232 * 233 * MATCH starts with 1 here b/c we already know that at least one 234 * char matches. Also, the loop will terminate because the strings 235 * differ before SUB_NODE->KEY.DATA - either at the NUL terminator 236 * of S or some char before that. 237 */ 238 while (sub_node->key.data[match] == s[node->length + match]) 239 ++match; 240 241 new_node = apr_pcalloc(tree->pool, sizeof(*new_node)); 242 new_node->key = sub_node->key; 243 new_node->length = node->length + match; 244 new_node->key.data[7] = '\xff'; /* This is not a leaf. See is_leaf(). */ 245 new_node->sub_node_count = 1; 246 new_node->sub_nodes = apr_palloc(tree->pool, sizeof(node_t *)); 247 new_node->sub_nodes[0] = sub_node; 248 249 memmove(sub_node->key.data, sub_node->key.data + match, 8 - match); 250 251 /* replace old sub-node with new one and continue lookup */ 252 sub_node->key.prefix = new_node; 253 node->sub_nodes[idx] = new_node; 254 node = new_node; 255 } 256 257 /* add sub-node(s) and final string */ 258 while (len - node->length > 7) 259 { 260 new_node = apr_pcalloc(tree->pool, sizeof(*new_node)); 261 new_node->key.prefix = node; 262 new_node->length = node->length + 8; 263 memcpy(new_node->key.data, s + node->length, 8); 264 265 auto_realloc_sub_nodes(tree, node); 266 memmove(node->sub_nodes + idx + 1, node->sub_nodes + idx, 267 (node->sub_node_count - idx) * sizeof(node_t *)); 268 269 /* replace old sub-node with new one and continue lookup */ 270 node->sub_nodes[idx] = new_node; 271 node->sub_node_count++; 272 node = new_node; 273 idx = 0; 274 } 275 276 new_string = apr_pcalloc(tree->pool, sizeof(*new_string)); 277 new_string->prefix = node; 278 memcpy(new_string->data, s + node->length, len - node->length); 279 280 auto_realloc_sub_nodes(tree, node); 281 memmove(node->sub_nodes + idx + 1, node->sub_nodes + idx, 282 (node->sub_node_count - idx) * sizeof(node_t *)); 283 284 node->sub_nodes[idx] = (node_t *)new_string; 285 node->sub_node_count++; 286 return new_string; 287} 288 289svn_string_t * 290svn_prefix_string__expand(const svn_prefix_string__t *s, 291 apr_pool_t *pool) 292{ 293 apr_size_t s_len = strlen(s->data); 294 apr_size_t len = s->prefix->length + s_len; 295 char *buffer = apr_palloc(pool, len + 1); 296 297 svn_string_t *result = apr_pcalloc(pool, sizeof(*result)); 298 result->data = buffer; 299 result->len = len; 300 buffer[len] = '\0'; 301 302 while (s->prefix) 303 { 304 memcpy(buffer + s->prefix->length, s->data, len - s->prefix->length); 305 len = s->prefix->length; 306 s = &s->prefix->key; 307 } 308 309 return result; 310} 311 312int 313svn_prefix_string__compare(const svn_prefix_string__t *lhs, 314 const svn_prefix_string__t *rhs) 315{ 316 const node_t *lhs_parent = lhs->prefix; 317 const node_t *rhs_parent = rhs->prefix; 318 319 if (lhs == rhs) 320 return 0; 321 322 /* find the common root */ 323 while (lhs_parent != rhs_parent) 324 { 325 if (lhs_parent->length <= rhs_parent->length) 326 { 327 rhs = &rhs_parent->key; 328 rhs_parent = rhs_parent->key.prefix; 329 } 330 else if (rhs_parent->length <= lhs_parent->length) 331 { 332 lhs = &lhs_parent->key; 333 lhs_parent = lhs_parent->key.prefix; 334 } 335 336 /* same tree? */ 337 assert(lhs_parent && rhs_parent); 338 } 339 340 /* at the common root, strings will differ in the first follow-up char */ 341 return (int)(unsigned char)lhs->data[0] - (int)(unsigned char)rhs->data[0]; 342} 343