1251881Speter/*
2251881Speter * lcs.c :  routines for creating an lcs
3251881Speter *
4251881Speter * ====================================================================
5251881Speter *    Licensed to the Apache Software Foundation (ASF) under one
6251881Speter *    or more contributor license agreements.  See the NOTICE file
7251881Speter *    distributed with this work for additional information
8251881Speter *    regarding copyright ownership.  The ASF licenses this file
9251881Speter *    to you under the Apache License, Version 2.0 (the
10251881Speter *    "License"); you may not use this file except in compliance
11251881Speter *    with the License.  You may obtain a copy of the License at
12251881Speter *
13251881Speter *      http://www.apache.org/licenses/LICENSE-2.0
14251881Speter *
15251881Speter *    Unless required by applicable law or agreed to in writing,
16251881Speter *    software distributed under the License is distributed on an
17251881Speter *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18251881Speter *    KIND, either express or implied.  See the License for the
19251881Speter *    specific language governing permissions and limitations
20251881Speter *    under the License.
21251881Speter * ====================================================================
22251881Speter */
23251881Speter
24251881Speter
25251881Speter#include <apr.h>
26251881Speter#include <apr_pools.h>
27251881Speter#include <apr_general.h>
28251881Speter
29251881Speter#include "diff.h"
30251881Speter
31251881Speter
32251881Speter/*
33251881Speter * Calculate the Longest Common Subsequence (LCS) between two datasources.
34251881Speter * This function is what makes the diff code tick.
35251881Speter *
36251881Speter * The LCS algorithm implemented here is based on the approach described
37251881Speter * by Sun Wu, Udi Manber and Gene Meyers in "An O(NP) Sequence Comparison
38251881Speter * Algorithm", but has been modified for better performance.
39251881Speter *
40251881Speter * Let M and N be the lengths (number of tokens) of the two sources
41251881Speter * ('files'). The goal is to reach the end of both sources (files) with the
42251881Speter * minimum number of insertions + deletions. Since there is a known length
43251881Speter * difference N-M between the files, that is equivalent to just the minimum
44251881Speter * number of deletions, or equivalently the minimum number of insertions.
45251881Speter * For symmetry, we use the lesser number - deletions if M<N, insertions if
46251881Speter * M>N.
47251881Speter *
48251881Speter * Let 'k' be the difference in remaining length between the files, i.e.
49251881Speter * if we're at the beginning of both files, k=N-M, whereas k=0 for the
50251881Speter * 'end state', at the end of both files. An insertion will increase k by
51251881Speter * one, while a deletion decreases k by one. If k<0, then insertions are
52251881Speter * 'free' - we need those to reach the end state k=0 anyway - but deletions
53251881Speter * are costly: Adding a deletion means that we will have to add an additional
54251881Speter * insertion later to reach the end state, so it doesn't matter if we count
55251881Speter * deletions or insertions. Similarly, deletions are free for k>0.
56251881Speter *
57251881Speter * Let a 'state' be a given position in each file {pos1, pos2}. An array
58251881Speter * 'fp' keeps track of the best possible state (largest values of
59251881Speter * {pos1, pos2}) that can be achieved for a given cost 'p' (# moves away
60251881Speter * from k=0), as well as a linked list of what matches were used to reach
61251881Speter * that state. For each new value of p, we find for each value of k the
62251881Speter * best achievable state for that k - either by doing a costly operation
63251881Speter * (deletion if k<0) from a state achieved at a lower p, or doing a free
64251881Speter * operation (insertion if k<0) from a state achieved at the same p -
65251881Speter * and in both cases advancing past any matching regions found. This is
66251881Speter * handled by running loops over k in order of descending absolute value.
67251881Speter *
68251881Speter * A recent improvement of the algorithm is to ignore tokens that are unique
69251881Speter * to one file or the other, as those are known from the start to be
70251881Speter * impossible to match.
71251881Speter */
72251881Speter
73251881Spetertypedef struct svn_diff__snake_t svn_diff__snake_t;
74251881Speter
75251881Speterstruct svn_diff__snake_t
76251881Speter{
77251881Speter    apr_off_t             y;
78251881Speter    svn_diff__lcs_t      *lcs;
79251881Speter    svn_diff__position_t *position[2];
80251881Speter};
81251881Speter
82251881Speterstatic APR_INLINE void
83251881Spetersvn_diff__snake(svn_diff__snake_t *fp_k,
84251881Speter                svn_diff__token_index_t *token_counts[2],
85251881Speter                svn_diff__lcs_t **freelist,
86251881Speter                apr_pool_t *pool)
87251881Speter{
88251881Speter  svn_diff__position_t *start_position[2];
89251881Speter  svn_diff__position_t *position[2];
90251881Speter  svn_diff__lcs_t *lcs;
91251881Speter  svn_diff__lcs_t *previous_lcs;
92251881Speter
93251881Speter  /* The previous entry at fp[k] is going to be replaced.  See if we
94251881Speter   * can mark that lcs node for reuse, because the sequence up to this
95251881Speter   * point was a dead end.
96251881Speter   */
97251881Speter  lcs = fp_k[0].lcs;
98251881Speter  while (lcs)
99251881Speter    {
100251881Speter      lcs->refcount--;
101251881Speter      if (lcs->refcount)
102251881Speter        break;
103251881Speter
104251881Speter      previous_lcs = lcs->next;
105251881Speter      lcs->next = *freelist;
106251881Speter      *freelist = lcs;
107251881Speter      lcs = previous_lcs;
108251881Speter    }
109251881Speter
110251881Speter  if (fp_k[-1].y >= fp_k[1].y)
111251881Speter    {
112251881Speter      start_position[0] = fp_k[-1].position[0];
113251881Speter      start_position[1] = fp_k[-1].position[1]->next;
114251881Speter
115251881Speter      previous_lcs = fp_k[-1].lcs;
116251881Speter    }
117251881Speter  else
118251881Speter    {
119251881Speter      start_position[0] = fp_k[1].position[0]->next;
120251881Speter      start_position[1] = fp_k[1].position[1];
121251881Speter
122251881Speter      previous_lcs = fp_k[1].lcs;
123251881Speter    }
124251881Speter
125251881Speter
126251881Speter  if (previous_lcs)
127251881Speter    {
128251881Speter      previous_lcs->refcount++;
129251881Speter    }
130251881Speter
131251881Speter  /* ### Optimization, skip all positions that don't have matchpoints
132251881Speter   * ### anyway. Beware of the sentinel, don't skip it!
133251881Speter   */
134251881Speter
135251881Speter  position[0] = start_position[0];
136251881Speter  position[1] = start_position[1];
137251881Speter
138251881Speter  while (1)
139251881Speter    {
140251881Speter      while (position[0]->token_index == position[1]->token_index)
141251881Speter        {
142251881Speter          position[0] = position[0]->next;
143251881Speter          position[1] = position[1]->next;
144251881Speter        }
145251881Speter
146251881Speter      if (position[1] != start_position[1])
147251881Speter        {
148251881Speter          lcs = *freelist;
149251881Speter          if (lcs)
150251881Speter            {
151251881Speter              *freelist = lcs->next;
152251881Speter            }
153251881Speter          else
154251881Speter            {
155251881Speter              lcs = apr_palloc(pool, sizeof(*lcs));
156251881Speter            }
157251881Speter
158251881Speter          lcs->position[0] = start_position[0];
159251881Speter          lcs->position[1] = start_position[1];
160251881Speter          lcs->length = position[1]->offset - start_position[1]->offset;
161251881Speter          lcs->next = previous_lcs;
162251881Speter          lcs->refcount = 1;
163251881Speter          previous_lcs = lcs;
164251881Speter          start_position[0] = position[0];
165251881Speter          start_position[1] = position[1];
166251881Speter        }
167251881Speter
168251881Speter      /* Skip any and all tokens that only occur in one of the files */
169251881Speter      if (position[0]->token_index >= 0
170251881Speter          && token_counts[1][position[0]->token_index] == 0)
171251881Speter        start_position[0] = position[0] = position[0]->next;
172251881Speter      else if (position[1]->token_index >= 0
173251881Speter               && token_counts[0][position[1]->token_index] == 0)
174251881Speter        start_position[1] = position[1] = position[1]->next;
175251881Speter      else
176251881Speter        break;
177251881Speter    }
178251881Speter
179251881Speter  fp_k[0].lcs = previous_lcs;
180251881Speter  fp_k[0].position[0] = position[0];
181251881Speter  fp_k[0].position[1] = position[1];
182251881Speter
183251881Speter  fp_k[0].y = position[1]->offset;
184251881Speter}
185251881Speter
186251881Speter
187251881Speterstatic svn_diff__lcs_t *
188251881Spetersvn_diff__lcs_reverse(svn_diff__lcs_t *lcs)
189251881Speter{
190251881Speter  svn_diff__lcs_t *next;
191251881Speter  svn_diff__lcs_t *prev;
192251881Speter
193251881Speter  next = NULL;
194251881Speter  while (lcs != NULL)
195251881Speter    {
196251881Speter      prev = lcs->next;
197251881Speter      lcs->next = next;
198251881Speter      next = lcs;
199251881Speter      lcs = prev;
200251881Speter    }
201251881Speter
202251881Speter  return next;
203251881Speter}
204251881Speter
205251881Speter
206251881Speter/* Prepends a new lcs chunk for the amount of LINES at the given positions
207251881Speter * POS0_OFFSET and POS1_OFFSET to the given LCS chain, and returns it.
208251881Speter * This function assumes LINES > 0. */
209251881Speterstatic svn_diff__lcs_t *
210251881Speterprepend_lcs(svn_diff__lcs_t *lcs, apr_off_t lines,
211251881Speter            apr_off_t pos0_offset, apr_off_t pos1_offset,
212251881Speter            apr_pool_t *pool)
213251881Speter{
214251881Speter  svn_diff__lcs_t *new_lcs;
215251881Speter
216251881Speter  SVN_ERR_ASSERT_NO_RETURN(lines > 0);
217251881Speter
218251881Speter  new_lcs = apr_palloc(pool, sizeof(*new_lcs));
219251881Speter  new_lcs->position[0] = apr_pcalloc(pool, sizeof(*new_lcs->position[0]));
220251881Speter  new_lcs->position[0]->offset = pos0_offset;
221251881Speter  new_lcs->position[1] = apr_pcalloc(pool, sizeof(*new_lcs->position[1]));
222251881Speter  new_lcs->position[1]->offset = pos1_offset;
223251881Speter  new_lcs->length = lines;
224251881Speter  new_lcs->refcount = 1;
225251881Speter  new_lcs->next = lcs;
226251881Speter
227251881Speter  return new_lcs;
228251881Speter}
229251881Speter
230251881Speter
231251881Spetersvn_diff__lcs_t *
232251881Spetersvn_diff__lcs(svn_diff__position_t *position_list1, /* pointer to tail (ring) */
233251881Speter              svn_diff__position_t *position_list2, /* pointer to tail (ring) */
234251881Speter              svn_diff__token_index_t *token_counts_list1, /* array of counts */
235251881Speter              svn_diff__token_index_t *token_counts_list2, /* array of counts */
236251881Speter              svn_diff__token_index_t num_tokens,
237251881Speter              apr_off_t prefix_lines,
238251881Speter              apr_off_t suffix_lines,
239251881Speter              apr_pool_t *pool)
240251881Speter{
241251881Speter  apr_off_t length[2];
242251881Speter  svn_diff__token_index_t *token_counts[2];
243251881Speter  svn_diff__token_index_t unique_count[2];
244251881Speter  svn_diff__token_index_t token_index;
245251881Speter  svn_diff__snake_t *fp;
246251881Speter  apr_off_t d;
247251881Speter  apr_off_t k;
248251881Speter  apr_off_t p = 0;
249251881Speter  svn_diff__lcs_t *lcs, *lcs_freelist = NULL;
250251881Speter
251251881Speter  svn_diff__position_t sentinel_position[2];
252251881Speter
253251881Speter  /* Since EOF is always a sync point we tack on an EOF link
254251881Speter   * with sentinel positions
255251881Speter   */
256251881Speter  lcs = apr_palloc(pool, sizeof(*lcs));
257251881Speter  lcs->position[0] = apr_pcalloc(pool, sizeof(*lcs->position[0]));
258251881Speter  lcs->position[0]->offset = position_list1
259251881Speter                             ? position_list1->offset + suffix_lines + 1
260251881Speter                             : prefix_lines + suffix_lines + 1;
261251881Speter  lcs->position[1] = apr_pcalloc(pool, sizeof(*lcs->position[1]));
262251881Speter  lcs->position[1]->offset = position_list2
263251881Speter                             ? position_list2->offset + suffix_lines + 1
264251881Speter                             : prefix_lines + suffix_lines + 1;
265251881Speter  lcs->length = 0;
266251881Speter  lcs->refcount = 1;
267251881Speter  lcs->next = NULL;
268251881Speter
269251881Speter  if (position_list1 == NULL || position_list2 == NULL)
270251881Speter    {
271251881Speter      if (suffix_lines)
272251881Speter        lcs = prepend_lcs(lcs, suffix_lines,
273251881Speter                          lcs->position[0]->offset - suffix_lines,
274251881Speter                          lcs->position[1]->offset - suffix_lines,
275251881Speter                          pool);
276251881Speter      if (prefix_lines)
277251881Speter        lcs = prepend_lcs(lcs, prefix_lines, 1, 1, pool);
278251881Speter
279251881Speter      return lcs;
280251881Speter    }
281251881Speter
282251881Speter  unique_count[1] = unique_count[0] = 0;
283251881Speter  for (token_index = 0; token_index < num_tokens; token_index++)
284251881Speter    {
285251881Speter      if (token_counts_list1[token_index] == 0)
286251881Speter        unique_count[1] += token_counts_list2[token_index];
287251881Speter      if (token_counts_list2[token_index] == 0)
288251881Speter        unique_count[0] += token_counts_list1[token_index];
289251881Speter    }
290251881Speter
291251881Speter  /* Calculate lengths M and N of the sequences to be compared. Do not
292251881Speter   * count tokens unique to one file, as those are ignored in __snake.
293251881Speter   */
294251881Speter  length[0] = position_list1->offset - position_list1->next->offset + 1
295251881Speter              - unique_count[0];
296251881Speter  length[1] = position_list2->offset - position_list2->next->offset + 1
297251881Speter              - unique_count[1];
298251881Speter
299251881Speter  /* strikerXXX: here we allocate the furthest point array, which is
300251881Speter   * strikerXXX: sized M + N + 3 (!)
301251881Speter   */
302251881Speter  fp = apr_pcalloc(pool,
303251881Speter                   sizeof(*fp) * (apr_size_t)(length[0] + length[1] + 3));
304251881Speter
305251881Speter  /* The origo of fp corresponds to the end state, where we are
306251881Speter   * at the end of both files. The valid states thus span from
307251881Speter   * -N (at end of first file and at the beginning of the second
308251881Speter   * file) to +M (the opposite :). Finally, svn_diff__snake needs
309251881Speter   * 1 extra slot on each side to work.
310251881Speter   */
311251881Speter  fp += length[1] + 1;
312251881Speter
313251881Speter  sentinel_position[0].next = position_list1->next;
314251881Speter  position_list1->next = &sentinel_position[0];
315251881Speter  sentinel_position[0].offset = position_list1->offset + 1;
316251881Speter  token_counts[0] = token_counts_list1;
317251881Speter
318251881Speter  sentinel_position[1].next = position_list2->next;
319251881Speter  position_list2->next = &sentinel_position[1];
320251881Speter  sentinel_position[1].offset = position_list2->offset + 1;
321251881Speter  token_counts[1] = token_counts_list2;
322251881Speter
323251881Speter  /* Negative indices will not be used elsewhere
324251881Speter   */
325251881Speter  sentinel_position[0].token_index = -1;
326251881Speter  sentinel_position[1].token_index = -2;
327251881Speter
328251881Speter  /* position d = M - N corresponds to the initial state, where
329251881Speter   * we are at the beginning of both files.
330251881Speter   */
331251881Speter  d = length[0] - length[1];
332251881Speter
333251881Speter  /* k = d - 1 will be the first to be used to get previous
334251881Speter   * position information from, make sure it holds sane
335251881Speter   * data
336251881Speter   */
337251881Speter  fp[d - 1].position[0] = sentinel_position[0].next;
338251881Speter  fp[d - 1].position[1] = &sentinel_position[1];
339251881Speter
340251881Speter  p = 0;
341251881Speter  do
342251881Speter    {
343251881Speter      /* For k < 0, insertions are free */
344251881Speter      for (k = (d < 0 ? d : 0) - p; k < 0; k++)
345251881Speter        {
346251881Speter          svn_diff__snake(fp + k, token_counts, &lcs_freelist, pool);
347251881Speter        }
348251881Speter	  /* for k > 0, deletions are free */
349251881Speter      for (k = (d > 0 ? d : 0) + p; k >= 0; k--)
350251881Speter        {
351251881Speter          svn_diff__snake(fp + k, token_counts, &lcs_freelist, pool);
352251881Speter        }
353251881Speter
354251881Speter      p++;
355251881Speter    }
356251881Speter  while (fp[0].position[1] != &sentinel_position[1]);
357251881Speter
358251881Speter  if (suffix_lines)
359251881Speter    lcs->next = prepend_lcs(fp[0].lcs, suffix_lines,
360251881Speter                            lcs->position[0]->offset - suffix_lines,
361251881Speter                            lcs->position[1]->offset - suffix_lines,
362251881Speter                            pool);
363251881Speter  else
364251881Speter    lcs->next = fp[0].lcs;
365251881Speter
366251881Speter  lcs = svn_diff__lcs_reverse(lcs);
367251881Speter
368251881Speter  position_list1->next = sentinel_position[0].next;
369251881Speter  position_list2->next = sentinel_position[1].next;
370251881Speter
371251881Speter  if (prefix_lines)
372251881Speter    return prepend_lcs(lcs, prefix_lines, 1, 1, pool);
373251881Speter  else
374251881Speter    return lcs;
375251881Speter}
376