1289177Speter/*
2289177Speter * similarity.c: Utility functions for finding similar strings in lists
3289177Speter *
4289177Speter * ====================================================================
5289177Speter *    Licensed to the Apache Software Foundation (ASF) under one
6289177Speter *    or more contributor license agreements.  See the NOTICE file
7289177Speter *    distributed with this work for additional information
8289177Speter *    regarding copyright ownership.  The ASF licenses this file
9289177Speter *    to you under the Apache License, Version 2.0 (the
10289177Speter *    "License"); you may not use this file except in compliance
11289177Speter *    with the License.  You may obtain a copy of the License at
12289177Speter *
13289177Speter *      http://www.apache.org/licenses/LICENSE-2.0
14289177Speter *
15289177Speter *    Unless required by applicable law or agreed to in writing,
16289177Speter *    software distributed under the License is distributed on an
17289177Speter *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18289177Speter *    KIND, either express or implied.  See the License for the
19289177Speter *    specific language governing permissions and limitations
20289177Speter *    under the License.
21289177Speter * ====================================================================
22289177Speter */
23289177Speter
24289177Speter/* ==================================================================== */
25289177Speter
26289177Speter
27289177Speter
28289177Speter/*** Includes. ***/
29289177Speter
30289177Speter#include <stdlib.h>
31289177Speter
32289177Speter#include "svn_string.h"
33289177Speter#include "cl.h"
34289177Speter
35289177Speter#include "private/svn_string_private.h"
36289177Speter
37289177Speter#include "svn_private_config.h"
38289177Speter
39289177Speter
40289177Speter/* Context for token similarity checking */
41289177Speterstruct svn_cl__simcheck_context_t
42289177Speter{
43289177Speter  svn_string_t key;             /* The token we're comparing with */
44289177Speter  svn_membuf_t buffer;          /* Buffer for similarity testing */
45289177Speter};
46289177Speter
47289177Speter
48289177Speter/* Similarity test between two property names */
49289177Speterstatic APR_INLINE apr_size_t
50289177Spetersimcheck_key_diff(const svn_string_t *key, const svn_string_t *ctx,
51289177Speter                 svn_membuf_t *buffer, apr_size_t *diff)
52289177Speter{
53289177Speter  apr_size_t lcs;
54289177Speter  const apr_size_t score = svn_string__similarity(key, ctx, buffer, &lcs);
55289177Speter  if (key->len > ctx->len)
56289177Speter    *diff = key->len - lcs;
57289177Speter  else
58289177Speter    *diff = ctx->len - lcs;
59289177Speter  return score;
60289177Speter}
61289177Speter
62289177Speter
63289177Speter/* Key comparator for qsort for svn_cl__simcheck_t */
64289177Speterstatic int
65289177Spetersimcheck_compare(const void *pkeya, const void *pkeyb)
66289177Speter{
67289177Speter  svn_cl__simcheck_t *const keya = *(svn_cl__simcheck_t *const *)pkeya;
68289177Speter  svn_cl__simcheck_t *const keyb = *(svn_cl__simcheck_t *const *)pkeyb;
69289177Speter  svn_cl__simcheck_context_t *const context = keya->context;
70289177Speter
71289177Speter  if (keya->score == -1)
72289177Speter    keya->score = simcheck_key_diff(&keya->token, &context->key,
73289177Speter                                    &context->buffer, &keya->diff);
74289177Speter  if (keyb->score == -1)
75289177Speter    keyb->score = simcheck_key_diff(&keyb->token, &context->key,
76289177Speter                                    &context->buffer, &keyb->diff);
77289177Speter
78289177Speter  return (keya->score < keyb->score ? 1
79289177Speter          : (keya->score > keyb->score ? -1
80289177Speter             : (keya->diff > keyb->diff ? 1
81289177Speter                : (keya->diff < keyb->diff ? -1 : 0))));
82289177Speter}
83289177Speter
84289177Speterapr_size_t
85289177Spetersvn_cl__similarity_check(const char *key,
86289177Speter                         svn_cl__simcheck_t **tokens,
87289177Speter                         apr_size_t token_count,
88289177Speter                         apr_pool_t *scratch_pool)
89289177Speter{
90289177Speter  apr_size_t result;
91289177Speter  apr_size_t i;
92289177Speter
93289177Speter  svn_cl__simcheck_context_t context;
94289177Speter  context.key.data = key;
95289177Speter  context.key.len = strlen(key);
96289177Speter  svn_membuf__create(&context.buffer, 0, scratch_pool);
97289177Speter
98289177Speter  /* Populate the score, diff and context members. */
99289177Speter  for (i = 0; i < token_count; ++i)
100289177Speter    {
101289177Speter      svn_cl__simcheck_t *const token = tokens[i];
102289177Speter      token->score = -1;
103289177Speter      token->diff = 0;
104289177Speter      token->context = &context;
105289177Speter    }
106289177Speter
107289177Speter  /* Sort the tokens by similarity. */
108289177Speter  qsort(tokens, token_count, sizeof(*tokens), simcheck_compare);
109289177Speter
110289177Speter  /* Remove references to the context, since it points to the stack,
111289177Speter     and calculate the number of results that are at least two-thirds
112289177Speter     similar to the key. */
113289177Speter  for (i = 0, result = 1; i < token_count; ++i)
114289177Speter    {
115289177Speter      svn_cl__simcheck_t *const token = tokens[i];
116289177Speter      token->context = NULL;
117289177Speter      /* If you update this factor, consider updating
118289177Speter       * ../libsvn_subr/cmdline.c:most_similar(). */
119289177Speter      if (token->score >= (2 * SVN_STRING__SIM_RANGE_MAX + 1) / 3)
120289177Speter        ++result;
121289177Speter    }
122289177Speter
123289177Speter  if (0 == tokens[0]->diff)
124289177Speter    return 0;                   /* We found an exact match. */
125289177Speter  return result;
126289177Speter}
127