1289177Speter/* prefix_string.c --- implement strings based on a prefix tree 2289177Speter * 3289177Speter * ==================================================================== 4289177Speter * Licensed to the Apache Software Foundation (ASF) under one 5289177Speter * or more contributor license agreements. See the NOTICE file 6289177Speter * distributed with this work for additional information 7289177Speter * regarding copyright ownership. The ASF licenses this file 8289177Speter * to you under the Apache License, Version 2.0 (the 9289177Speter * "License"); you may not use this file except in compliance 10289177Speter * with the License. You may obtain a copy of the License at 11289177Speter * 12289177Speter * http://www.apache.org/licenses/LICENSE-2.0 13289177Speter * 14289177Speter * Unless required by applicable law or agreed to in writing, 15289177Speter * software distributed under the License is distributed on an 16289177Speter * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 17289177Speter * KIND, either express or implied. See the License for the 18289177Speter * specific language governing permissions and limitations 19289177Speter * under the License. 20289177Speter * ==================================================================== 21289177Speter */ 22289177Speter 23289177Speter#include <assert.h> 24289177Speter#include "private/svn_string_private.h" 25289177Speter 26289177Speter/* A node in the tree represents a common prefix. The root node is the 27289177Speter * empty prefix. Nodes may have up to 256 sub-nodes, each starting with 28289177Speter * a different character (possibly '\0'). 29289177Speter * 30289177Speter * The nodes in the tree store only up to 8 chars of the respective common 31289177Speter * prefix, i.e. longer common prefixes must be drawn out over multiple 32289177Speter * hierarchy levels. This is a space <-> efficiency trade-off. 33289177Speter * 34289177Speter * Strings are the leaf nodes in the tree and use a specialized, smaller 35289177Speter * data structure. They may add 0 to 7 extra chars to the prefix. Both 36289177Speter * data types can be discerned by the last char in the data buffer. This 37289177Speter * must be 0 for strings (leaves) and non-0 otherwise. Please note that 38289177Speter * ordinary nodes have a length information so that no terminating 0 is 39289177Speter * required for them. 40289177Speter */ 41289177Speter 42289177Speter/* forward declaration */ 43289177Spetertypedef struct node_t node_t; 44289177Speter 45289177Speter/* String type and tree leaf. 46289177Speter */ 47289177Speterstruct svn_prefix_string__t 48289177Speter{ 49289177Speter /* mandatory prefix */ 50289177Speter node_t *prefix; 51289177Speter 52362181Sdim /* 0 ..7 chars to add the prefix. 53362181Sdim * 54362181Sdim * NUL-terminated, if this is indeed a tree leaf. We use the same struct 55362181Sdim * within node_t for inner tree nodes, too. There, DATA[7] is not NUL, 56362181Sdim * meaning DATA may or may not be NUL terminated. The actual length is 57362181Sdim * provided by the node_t.length field (minus parent node length). */ 58289177Speter char data[8]; 59289177Speter}; 60289177Speter 61289177Speter/* A node inside the tree, i.e. not a string and not a leaf (unless this is 62289177Speter * the root node). 63289177Speter * 64289177Speter * Note: keep the ordering to minimize size / alignment overhead on 64 bit 65289177Speter * machines. 66289177Speter */ 67289177Speterstruct node_t 68289177Speter{ 69289177Speter /* pointer to the parent prefix plus the 1 .. 8 extra chars. 70289177Speter * Only the root will provide 0 extra chars. */ 71289177Speter svn_prefix_string__t key; 72289177Speter 73289177Speter /* Length of the prefix from the root down to and including this one. 74289177Speter * 0 for the root node. Only then will key.prefix be NULL. */ 75289177Speter apr_uint32_t length; 76289177Speter 77289177Speter /* Number of entries used in SUB_NODES. */ 78289177Speter apr_uint32_t sub_node_count; 79289177Speter 80289177Speter /* The sub-nodes, ordered by first char. node_t and svn_prefix_string__t 81289177Speter * may be mixed here. May be NULL. 82289177Speter * The number of allocated entries is always a power-of-two and only 83289177Speter * given implicitly by SUB_NODE_COUNT. */ 84289177Speter struct node_t **sub_nodes; 85289177Speter}; 86289177Speter 87289177Speter/* The actual tree structure. */ 88289177Speterstruct svn_prefix_tree__t 89289177Speter{ 90289177Speter /* the common tree root (represents the empty prefix). */ 91289177Speter node_t *root; 92289177Speter 93289177Speter /* all sub-nodes & strings will be allocated from this pool */ 94289177Speter apr_pool_t *pool; 95289177Speter}; 96289177Speter 97289177Speter/* Return TRUE, iff NODE is a leaf node. 98289177Speter */ 99289177Speterstatic svn_boolean_t 100289177Speteris_leaf(node_t *node) 101289177Speter{ 102362181Sdim /* If this NOT a leaf node and this node has ... 103362181Sdim * ... 8 chars, data[7] will not be NUL because we only support 104362181Sdim * NUL-*terminated* strings. 105362181Sdim * ... less than 8 chars, this will be set to 0xff 106362181Sdim * (any other non-NUL would do as well but this is not valid UTF8 107362181Sdim * making it easy to recognize during debugging etc.) */ 108289177Speter return node->key.data[7] == 0; 109289177Speter} 110289177Speter 111289177Speter/* Ensure that the sub-nodes array of NODE within TREE has at least one 112289177Speter * unused entry. Re-allocate as necessary. 113289177Speter */ 114289177Speterstatic void 115289177Speterauto_realloc_sub_nodes(svn_prefix_tree__t *tree, 116289177Speter node_t *node) 117289177Speter{ 118289177Speter if (node->sub_node_count & (node->sub_node_count - 1)) 119289177Speter return; 120289177Speter 121289177Speter if (node->sub_node_count == 0) 122289177Speter { 123289177Speter node->sub_nodes = apr_pcalloc(tree->pool, sizeof(*node->sub_nodes)); 124289177Speter } 125289177Speter else 126289177Speter { 127289177Speter node_t **sub_nodes 128289177Speter = apr_pcalloc(tree->pool, 129289177Speter 2 * node->sub_node_count * sizeof(*sub_nodes)); 130289177Speter memcpy(sub_nodes, node->sub_nodes, 131289177Speter node->sub_node_count * sizeof(*sub_nodes)); 132289177Speter node->sub_nodes = sub_nodes; 133289177Speter } 134289177Speter} 135289177Speter 136289177Speter/* Given the COUNT pointers in the SUB_NODES array, return the location at 137289177Speter * which KEY is either located or would be inserted. 138289177Speter */ 139289177Speterstatic int 140289177Spetersearch_lower_bound(node_t **sub_nodes, 141289177Speter unsigned char key, 142289177Speter int count) 143289177Speter{ 144289177Speter int lower = 0; 145289177Speter int upper = count - 1; 146289177Speter 147289177Speter /* Binary search for the lowest position at which to insert KEY. */ 148289177Speter while (lower <= upper) 149289177Speter { 150289177Speter int current = lower + (upper - lower) / 2; 151289177Speter 152289177Speter if ((unsigned char)sub_nodes[current]->key.data[0] < key) 153289177Speter lower = current + 1; 154289177Speter else 155289177Speter upper = current - 1; 156289177Speter } 157289177Speter 158289177Speter return lower; 159289177Speter} 160289177Speter 161289177Spetersvn_prefix_tree__t * 162289177Spetersvn_prefix_tree__create(apr_pool_t *pool) 163289177Speter{ 164289177Speter svn_prefix_tree__t *tree = apr_pcalloc(pool, sizeof(*tree)); 165289177Speter tree->pool = pool; 166289177Speter 167289177Speter tree->root = apr_pcalloc(pool, sizeof(*tree->root)); 168362181Sdim tree->root->key.data[7] = '\xff'; /* This is not a leaf. See is_leaf(). */ 169289177Speter 170289177Speter return tree; 171289177Speter} 172289177Speter 173289177Spetersvn_prefix_string__t * 174289177Spetersvn_prefix_string__create(svn_prefix_tree__t *tree, 175289177Speter const char *s) 176289177Speter{ 177289177Speter svn_prefix_string__t *new_string; 178289177Speter apr_size_t len = strlen(s); 179289177Speter node_t *node = tree->root; 180289177Speter node_t *new_node; 181289177Speter int idx; 182289177Speter 183289177Speter /* walk the existing tree until we either find S or the node at which S 184289177Speter * has to be inserted */ 185289177Speter while (TRUE) 186289177Speter { 187289177Speter node_t *sub_node; 188289177Speter int match = 1; 189289177Speter 190289177Speter /* index of the matching sub-node */ 191289177Speter idx = node->sub_node_count 192289177Speter ? search_lower_bound(node->sub_nodes, 193289177Speter (unsigned char)s[node->length], 194289177Speter node->sub_node_count) 195289177Speter : 0; 196289177Speter 197289177Speter /* any (partially) matching sub-nodes? */ 198289177Speter if (idx == (int)node->sub_node_count 199289177Speter || node->sub_nodes[idx]->key.data[0] != s[node->length]) 200289177Speter break; 201289177Speter 202362181Sdim /* Yes, it matches - at least the first character does. */ 203289177Speter sub_node = node->sub_nodes[idx]; 204289177Speter 205289177Speter /* fully matching sub-node? */ 206289177Speter if (is_leaf(sub_node)) 207289177Speter { 208289177Speter if (strcmp(sub_node->key.data, s + node->length) == 0) 209289177Speter return &sub_node->key; 210289177Speter } 211289177Speter else 212289177Speter { 213362181Sdim /* The string formed by the path from the root down to 214362181Sdim * SUB_NODE differs from S. 215362181Sdim * 216362181Sdim * Is it a prefix? In that case, the chars added by SUB_NODE 217362181Sdim * must fully match the respective chars in S. */ 218289177Speter apr_size_t sub_node_len = sub_node->length - node->length; 219289177Speter if (strncmp(sub_node->key.data, s + node->length, 220289177Speter sub_node_len) == 0) 221289177Speter { 222289177Speter node = sub_node; 223289177Speter continue; 224289177Speter } 225289177Speter } 226289177Speter 227362181Sdim /* partial match -> split 228362181Sdim * 229362181Sdim * At this point, S may either be a prefix to the string represented 230362181Sdim * by SUB_NODE, or they may diverge at some offset with 231362181Sdim * SUB_NODE->KEY.DATA . 232362181Sdim * 233362181Sdim * MATCH starts with 1 here b/c we already know that at least one 234362181Sdim * char matches. Also, the loop will terminate because the strings 235362181Sdim * differ before SUB_NODE->KEY.DATA - either at the NUL terminator 236362181Sdim * of S or some char before that. 237362181Sdim */ 238289177Speter while (sub_node->key.data[match] == s[node->length + match]) 239289177Speter ++match; 240289177Speter 241289177Speter new_node = apr_pcalloc(tree->pool, sizeof(*new_node)); 242289177Speter new_node->key = sub_node->key; 243289177Speter new_node->length = node->length + match; 244362181Sdim new_node->key.data[7] = '\xff'; /* This is not a leaf. See is_leaf(). */ 245289177Speter new_node->sub_node_count = 1; 246289177Speter new_node->sub_nodes = apr_palloc(tree->pool, sizeof(node_t *)); 247289177Speter new_node->sub_nodes[0] = sub_node; 248289177Speter 249289177Speter memmove(sub_node->key.data, sub_node->key.data + match, 8 - match); 250289177Speter 251289177Speter /* replace old sub-node with new one and continue lookup */ 252289177Speter sub_node->key.prefix = new_node; 253289177Speter node->sub_nodes[idx] = new_node; 254289177Speter node = new_node; 255289177Speter } 256289177Speter 257289177Speter /* add sub-node(s) and final string */ 258362181Sdim while (len - node->length > 7) 259289177Speter { 260289177Speter new_node = apr_pcalloc(tree->pool, sizeof(*new_node)); 261289177Speter new_node->key.prefix = node; 262289177Speter new_node->length = node->length + 8; 263289177Speter memcpy(new_node->key.data, s + node->length, 8); 264289177Speter 265289177Speter auto_realloc_sub_nodes(tree, node); 266289177Speter memmove(node->sub_nodes + idx + 1, node->sub_nodes + idx, 267289177Speter (node->sub_node_count - idx) * sizeof(node_t *)); 268289177Speter 269289177Speter /* replace old sub-node with new one and continue lookup */ 270289177Speter node->sub_nodes[idx] = new_node; 271289177Speter node->sub_node_count++; 272289177Speter node = new_node; 273289177Speter idx = 0; 274289177Speter } 275289177Speter 276289177Speter new_string = apr_pcalloc(tree->pool, sizeof(*new_string)); 277289177Speter new_string->prefix = node; 278289177Speter memcpy(new_string->data, s + node->length, len - node->length); 279289177Speter 280289177Speter auto_realloc_sub_nodes(tree, node); 281289177Speter memmove(node->sub_nodes + idx + 1, node->sub_nodes + idx, 282289177Speter (node->sub_node_count - idx) * sizeof(node_t *)); 283289177Speter 284289177Speter node->sub_nodes[idx] = (node_t *)new_string; 285289177Speter node->sub_node_count++; 286289177Speter return new_string; 287289177Speter} 288289177Speter 289289177Spetersvn_string_t * 290289177Spetersvn_prefix_string__expand(const svn_prefix_string__t *s, 291289177Speter apr_pool_t *pool) 292289177Speter{ 293289177Speter apr_size_t s_len = strlen(s->data); 294289177Speter apr_size_t len = s->prefix->length + s_len; 295289177Speter char *buffer = apr_palloc(pool, len + 1); 296289177Speter 297289177Speter svn_string_t *result = apr_pcalloc(pool, sizeof(*result)); 298289177Speter result->data = buffer; 299289177Speter result->len = len; 300289177Speter buffer[len] = '\0'; 301289177Speter 302289177Speter while (s->prefix) 303289177Speter { 304289177Speter memcpy(buffer + s->prefix->length, s->data, len - s->prefix->length); 305289177Speter len = s->prefix->length; 306289177Speter s = &s->prefix->key; 307289177Speter } 308289177Speter 309289177Speter return result; 310289177Speter} 311289177Speter 312289177Speterint 313289177Spetersvn_prefix_string__compare(const svn_prefix_string__t *lhs, 314289177Speter const svn_prefix_string__t *rhs) 315289177Speter{ 316289177Speter const node_t *lhs_parent = lhs->prefix; 317289177Speter const node_t *rhs_parent = rhs->prefix; 318289177Speter 319289177Speter if (lhs == rhs) 320289177Speter return 0; 321289177Speter 322289177Speter /* find the common root */ 323289177Speter while (lhs_parent != rhs_parent) 324289177Speter { 325289177Speter if (lhs_parent->length <= rhs_parent->length) 326289177Speter { 327289177Speter rhs = &rhs_parent->key; 328289177Speter rhs_parent = rhs_parent->key.prefix; 329289177Speter } 330289177Speter else if (rhs_parent->length <= lhs_parent->length) 331289177Speter { 332289177Speter lhs = &lhs_parent->key; 333289177Speter lhs_parent = lhs_parent->key.prefix; 334289177Speter } 335289177Speter 336289177Speter /* same tree? */ 337289177Speter assert(lhs_parent && rhs_parent); 338289177Speter } 339289177Speter 340289177Speter /* at the common root, strings will differ in the first follow-up char */ 341289177Speter return (int)(unsigned char)lhs->data[0] - (int)(unsigned char)rhs->data[0]; 342289177Speter} 343