1/*
2 * similarity.c: Utility functions for finding similar strings in lists
3 *
4 * ====================================================================
5 *    Licensed to the Apache Software Foundation (ASF) under one
6 *    or more contributor license agreements.  See the NOTICE file
7 *    distributed with this work for additional information
8 *    regarding copyright ownership.  The ASF licenses this file
9 *    to you under the Apache License, Version 2.0 (the
10 *    "License"); you may not use this file except in compliance
11 *    with the License.  You may obtain a copy of the License at
12 *
13 *      http://www.apache.org/licenses/LICENSE-2.0
14 *
15 *    Unless required by applicable law or agreed to in writing,
16 *    software distributed under the License is distributed on an
17 *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18 *    KIND, either express or implied.  See the License for the
19 *    specific language governing permissions and limitations
20 *    under the License.
21 * ====================================================================
22 */
23
24/* ==================================================================== */
25
26
27
28/*** Includes. ***/
29
30#include <stdlib.h>
31
32#include "svn_string.h"
33#include "cl.h"
34
35#include "private/svn_string_private.h"
36
37#include "svn_private_config.h"
38
39
40/* Context for token similarity checking */
41struct svn_cl__simcheck_context_t
42{
43  svn_string_t key;             /* The token we're comparing with */
44  svn_membuf_t buffer;          /* Buffer for similarity testing */
45};
46
47
48/* Similarity test between two property names */
49static APR_INLINE apr_size_t
50simcheck_key_diff(const svn_string_t *key, const svn_string_t *ctx,
51                 svn_membuf_t *buffer, apr_size_t *diff)
52{
53  apr_size_t lcs;
54  const apr_size_t score = svn_string__similarity(key, ctx, buffer, &lcs);
55  if (key->len > ctx->len)
56    *diff = key->len - lcs;
57  else
58    *diff = ctx->len - lcs;
59  return score;
60}
61
62
63/* Key comparator for qsort for svn_cl__simcheck_t */
64static int
65simcheck_compare(const void *pkeya, const void *pkeyb)
66{
67  svn_cl__simcheck_t *const keya = *(svn_cl__simcheck_t *const *)pkeya;
68  svn_cl__simcheck_t *const keyb = *(svn_cl__simcheck_t *const *)pkeyb;
69  svn_cl__simcheck_context_t *const context = keya->context;
70
71  if (keya->score == -1)
72    keya->score = simcheck_key_diff(&keya->token, &context->key,
73                                    &context->buffer, &keya->diff);
74  if (keyb->score == -1)
75    keyb->score = simcheck_key_diff(&keyb->token, &context->key,
76                                    &context->buffer, &keyb->diff);
77
78  return (keya->score < keyb->score ? 1
79          : (keya->score > keyb->score ? -1
80             : (keya->diff > keyb->diff ? 1
81                : (keya->diff < keyb->diff ? -1 : 0))));
82}
83
84apr_size_t
85svn_cl__similarity_check(const char *key,
86                         svn_cl__simcheck_t **tokens,
87                         apr_size_t token_count,
88                         apr_pool_t *scratch_pool)
89{
90  apr_size_t result;
91  apr_size_t i;
92
93  svn_cl__simcheck_context_t context;
94  context.key.data = key;
95  context.key.len = strlen(key);
96  svn_membuf__create(&context.buffer, 0, scratch_pool);
97
98  /* Populate the score, diff and context members. */
99  for (i = 0; i < token_count; ++i)
100    {
101      svn_cl__simcheck_t *const token = tokens[i];
102      token->score = -1;
103      token->diff = 0;
104      token->context = &context;
105    }
106
107  /* Sort the tokens by similarity. */
108  qsort(tokens, token_count, sizeof(*tokens), simcheck_compare);
109
110  /* Remove references to the context, since it points to the stack,
111     and calculate the number of results that are at least two-thirds
112     similar to the key. */
113  for (i = 0, result = 1; i < token_count; ++i)
114    {
115      svn_cl__simcheck_t *const token = tokens[i];
116      token->context = NULL;
117      /* If you update this factor, consider updating
118       * ../libsvn_subr/cmdline.c:most_similar(). */
119      if (token->score >= (2 * SVN_STRING__SIM_RANGE_MAX + 1) / 3)
120        ++result;
121    }
122
123  if (0 == tokens[0]->diff)
124    return 0;                   /* We found an exact match. */
125  return result;
126}
127