1251881Speter/* 2251881Speter * lcs.c : routines for creating an lcs 3251881Speter * 4251881Speter * ==================================================================== 5251881Speter * Licensed to the Apache Software Foundation (ASF) under one 6251881Speter * or more contributor license agreements. See the NOTICE file 7251881Speter * distributed with this work for additional information 8251881Speter * regarding copyright ownership. The ASF licenses this file 9251881Speter * to you under the Apache License, Version 2.0 (the 10251881Speter * "License"); you may not use this file except in compliance 11251881Speter * with the License. You may obtain a copy of the License at 12251881Speter * 13251881Speter * http://www.apache.org/licenses/LICENSE-2.0 14251881Speter * 15251881Speter * Unless required by applicable law or agreed to in writing, 16251881Speter * software distributed under the License is distributed on an 17251881Speter * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 18251881Speter * KIND, either express or implied. See the License for the 19251881Speter * specific language governing permissions and limitations 20251881Speter * under the License. 21251881Speter * ==================================================================== 22251881Speter */ 23251881Speter 24251881Speter 25251881Speter#include <apr.h> 26251881Speter#include <apr_pools.h> 27251881Speter#include <apr_general.h> 28251881Speter 29251881Speter#include "diff.h" 30251881Speter 31251881Speter 32251881Speter/* 33251881Speter * Calculate the Longest Common Subsequence (LCS) between two datasources. 34251881Speter * This function is what makes the diff code tick. 35251881Speter * 36251881Speter * The LCS algorithm implemented here is based on the approach described 37251881Speter * by Sun Wu, Udi Manber and Gene Meyers in "An O(NP) Sequence Comparison 38251881Speter * Algorithm", but has been modified for better performance. 39251881Speter * 40251881Speter * Let M and N be the lengths (number of tokens) of the two sources 41251881Speter * ('files'). The goal is to reach the end of both sources (files) with the 42251881Speter * minimum number of insertions + deletions. Since there is a known length 43251881Speter * difference N-M between the files, that is equivalent to just the minimum 44251881Speter * number of deletions, or equivalently the minimum number of insertions. 45251881Speter * For symmetry, we use the lesser number - deletions if M<N, insertions if 46251881Speter * M>N. 47251881Speter * 48251881Speter * Let 'k' be the difference in remaining length between the files, i.e. 49251881Speter * if we're at the beginning of both files, k=N-M, whereas k=0 for the 50251881Speter * 'end state', at the end of both files. An insertion will increase k by 51251881Speter * one, while a deletion decreases k by one. If k<0, then insertions are 52251881Speter * 'free' - we need those to reach the end state k=0 anyway - but deletions 53251881Speter * are costly: Adding a deletion means that we will have to add an additional 54251881Speter * insertion later to reach the end state, so it doesn't matter if we count 55251881Speter * deletions or insertions. Similarly, deletions are free for k>0. 56251881Speter * 57251881Speter * Let a 'state' be a given position in each file {pos1, pos2}. An array 58251881Speter * 'fp' keeps track of the best possible state (largest values of 59251881Speter * {pos1, pos2}) that can be achieved for a given cost 'p' (# moves away 60251881Speter * from k=0), as well as a linked list of what matches were used to reach 61251881Speter * that state. For each new value of p, we find for each value of k the 62251881Speter * best achievable state for that k - either by doing a costly operation 63251881Speter * (deletion if k<0) from a state achieved at a lower p, or doing a free 64251881Speter * operation (insertion if k<0) from a state achieved at the same p - 65251881Speter * and in both cases advancing past any matching regions found. This is 66251881Speter * handled by running loops over k in order of descending absolute value. 67251881Speter * 68251881Speter * A recent improvement of the algorithm is to ignore tokens that are unique 69251881Speter * to one file or the other, as those are known from the start to be 70251881Speter * impossible to match. 71251881Speter */ 72251881Speter 73251881Spetertypedef struct svn_diff__snake_t svn_diff__snake_t; 74251881Speter 75251881Speterstruct svn_diff__snake_t 76251881Speter{ 77251881Speter apr_off_t y; 78251881Speter svn_diff__lcs_t *lcs; 79251881Speter svn_diff__position_t *position[2]; 80251881Speter}; 81251881Speter 82251881Speterstatic APR_INLINE void 83251881Spetersvn_diff__snake(svn_diff__snake_t *fp_k, 84251881Speter svn_diff__token_index_t *token_counts[2], 85251881Speter svn_diff__lcs_t **freelist, 86251881Speter apr_pool_t *pool) 87251881Speter{ 88251881Speter svn_diff__position_t *start_position[2]; 89251881Speter svn_diff__position_t *position[2]; 90251881Speter svn_diff__lcs_t *lcs; 91251881Speter svn_diff__lcs_t *previous_lcs; 92251881Speter 93251881Speter /* The previous entry at fp[k] is going to be replaced. See if we 94251881Speter * can mark that lcs node for reuse, because the sequence up to this 95251881Speter * point was a dead end. 96251881Speter */ 97251881Speter lcs = fp_k[0].lcs; 98251881Speter while (lcs) 99251881Speter { 100251881Speter lcs->refcount--; 101251881Speter if (lcs->refcount) 102251881Speter break; 103251881Speter 104251881Speter previous_lcs = lcs->next; 105251881Speter lcs->next = *freelist; 106251881Speter *freelist = lcs; 107251881Speter lcs = previous_lcs; 108251881Speter } 109251881Speter 110251881Speter if (fp_k[-1].y >= fp_k[1].y) 111251881Speter { 112251881Speter start_position[0] = fp_k[-1].position[0]; 113251881Speter start_position[1] = fp_k[-1].position[1]->next; 114251881Speter 115251881Speter previous_lcs = fp_k[-1].lcs; 116251881Speter } 117251881Speter else 118251881Speter { 119251881Speter start_position[0] = fp_k[1].position[0]->next; 120251881Speter start_position[1] = fp_k[1].position[1]; 121251881Speter 122251881Speter previous_lcs = fp_k[1].lcs; 123251881Speter } 124251881Speter 125251881Speter 126251881Speter if (previous_lcs) 127251881Speter { 128251881Speter previous_lcs->refcount++; 129251881Speter } 130251881Speter 131251881Speter /* ### Optimization, skip all positions that don't have matchpoints 132251881Speter * ### anyway. Beware of the sentinel, don't skip it! 133251881Speter */ 134251881Speter 135251881Speter position[0] = start_position[0]; 136251881Speter position[1] = start_position[1]; 137251881Speter 138251881Speter while (1) 139251881Speter { 140251881Speter while (position[0]->token_index == position[1]->token_index) 141251881Speter { 142251881Speter position[0] = position[0]->next; 143251881Speter position[1] = position[1]->next; 144251881Speter } 145251881Speter 146251881Speter if (position[1] != start_position[1]) 147251881Speter { 148251881Speter lcs = *freelist; 149251881Speter if (lcs) 150251881Speter { 151251881Speter *freelist = lcs->next; 152251881Speter } 153251881Speter else 154251881Speter { 155251881Speter lcs = apr_palloc(pool, sizeof(*lcs)); 156251881Speter } 157251881Speter 158251881Speter lcs->position[0] = start_position[0]; 159251881Speter lcs->position[1] = start_position[1]; 160251881Speter lcs->length = position[1]->offset - start_position[1]->offset; 161251881Speter lcs->next = previous_lcs; 162251881Speter lcs->refcount = 1; 163251881Speter previous_lcs = lcs; 164251881Speter start_position[0] = position[0]; 165251881Speter start_position[1] = position[1]; 166251881Speter } 167251881Speter 168251881Speter /* Skip any and all tokens that only occur in one of the files */ 169251881Speter if (position[0]->token_index >= 0 170251881Speter && token_counts[1][position[0]->token_index] == 0) 171251881Speter start_position[0] = position[0] = position[0]->next; 172251881Speter else if (position[1]->token_index >= 0 173251881Speter && token_counts[0][position[1]->token_index] == 0) 174251881Speter start_position[1] = position[1] = position[1]->next; 175251881Speter else 176251881Speter break; 177251881Speter } 178251881Speter 179251881Speter fp_k[0].lcs = previous_lcs; 180251881Speter fp_k[0].position[0] = position[0]; 181251881Speter fp_k[0].position[1] = position[1]; 182251881Speter 183251881Speter fp_k[0].y = position[1]->offset; 184251881Speter} 185251881Speter 186251881Speter 187251881Speterstatic svn_diff__lcs_t * 188251881Spetersvn_diff__lcs_reverse(svn_diff__lcs_t *lcs) 189251881Speter{ 190251881Speter svn_diff__lcs_t *next; 191251881Speter svn_diff__lcs_t *prev; 192251881Speter 193251881Speter next = NULL; 194251881Speter while (lcs != NULL) 195251881Speter { 196251881Speter prev = lcs->next; 197251881Speter lcs->next = next; 198251881Speter next = lcs; 199251881Speter lcs = prev; 200251881Speter } 201251881Speter 202251881Speter return next; 203251881Speter} 204251881Speter 205251881Speter 206251881Speter/* Prepends a new lcs chunk for the amount of LINES at the given positions 207251881Speter * POS0_OFFSET and POS1_OFFSET to the given LCS chain, and returns it. 208251881Speter * This function assumes LINES > 0. */ 209251881Speterstatic svn_diff__lcs_t * 210251881Speterprepend_lcs(svn_diff__lcs_t *lcs, apr_off_t lines, 211251881Speter apr_off_t pos0_offset, apr_off_t pos1_offset, 212251881Speter apr_pool_t *pool) 213251881Speter{ 214251881Speter svn_diff__lcs_t *new_lcs; 215251881Speter 216251881Speter SVN_ERR_ASSERT_NO_RETURN(lines > 0); 217251881Speter 218251881Speter new_lcs = apr_palloc(pool, sizeof(*new_lcs)); 219251881Speter new_lcs->position[0] = apr_pcalloc(pool, sizeof(*new_lcs->position[0])); 220251881Speter new_lcs->position[0]->offset = pos0_offset; 221251881Speter new_lcs->position[1] = apr_pcalloc(pool, sizeof(*new_lcs->position[1])); 222251881Speter new_lcs->position[1]->offset = pos1_offset; 223251881Speter new_lcs->length = lines; 224251881Speter new_lcs->refcount = 1; 225251881Speter new_lcs->next = lcs; 226251881Speter 227251881Speter return new_lcs; 228251881Speter} 229251881Speter 230251881Speter 231251881Spetersvn_diff__lcs_t * 232251881Spetersvn_diff__lcs(svn_diff__position_t *position_list1, /* pointer to tail (ring) */ 233251881Speter svn_diff__position_t *position_list2, /* pointer to tail (ring) */ 234251881Speter svn_diff__token_index_t *token_counts_list1, /* array of counts */ 235251881Speter svn_diff__token_index_t *token_counts_list2, /* array of counts */ 236251881Speter svn_diff__token_index_t num_tokens, 237251881Speter apr_off_t prefix_lines, 238251881Speter apr_off_t suffix_lines, 239251881Speter apr_pool_t *pool) 240251881Speter{ 241251881Speter apr_off_t length[2]; 242251881Speter svn_diff__token_index_t *token_counts[2]; 243251881Speter svn_diff__token_index_t unique_count[2]; 244251881Speter svn_diff__token_index_t token_index; 245251881Speter svn_diff__snake_t *fp; 246251881Speter apr_off_t d; 247251881Speter apr_off_t k; 248251881Speter apr_off_t p = 0; 249251881Speter svn_diff__lcs_t *lcs, *lcs_freelist = NULL; 250251881Speter 251251881Speter svn_diff__position_t sentinel_position[2]; 252251881Speter 253251881Speter /* Since EOF is always a sync point we tack on an EOF link 254251881Speter * with sentinel positions 255251881Speter */ 256251881Speter lcs = apr_palloc(pool, sizeof(*lcs)); 257251881Speter lcs->position[0] = apr_pcalloc(pool, sizeof(*lcs->position[0])); 258251881Speter lcs->position[0]->offset = position_list1 259251881Speter ? position_list1->offset + suffix_lines + 1 260251881Speter : prefix_lines + suffix_lines + 1; 261251881Speter lcs->position[1] = apr_pcalloc(pool, sizeof(*lcs->position[1])); 262251881Speter lcs->position[1]->offset = position_list2 263251881Speter ? position_list2->offset + suffix_lines + 1 264251881Speter : prefix_lines + suffix_lines + 1; 265251881Speter lcs->length = 0; 266251881Speter lcs->refcount = 1; 267251881Speter lcs->next = NULL; 268251881Speter 269251881Speter if (position_list1 == NULL || position_list2 == NULL) 270251881Speter { 271251881Speter if (suffix_lines) 272251881Speter lcs = prepend_lcs(lcs, suffix_lines, 273251881Speter lcs->position[0]->offset - suffix_lines, 274251881Speter lcs->position[1]->offset - suffix_lines, 275251881Speter pool); 276251881Speter if (prefix_lines) 277251881Speter lcs = prepend_lcs(lcs, prefix_lines, 1, 1, pool); 278251881Speter 279251881Speter return lcs; 280251881Speter } 281251881Speter 282251881Speter unique_count[1] = unique_count[0] = 0; 283251881Speter for (token_index = 0; token_index < num_tokens; token_index++) 284251881Speter { 285251881Speter if (token_counts_list1[token_index] == 0) 286251881Speter unique_count[1] += token_counts_list2[token_index]; 287251881Speter if (token_counts_list2[token_index] == 0) 288251881Speter unique_count[0] += token_counts_list1[token_index]; 289251881Speter } 290251881Speter 291251881Speter /* Calculate lengths M and N of the sequences to be compared. Do not 292251881Speter * count tokens unique to one file, as those are ignored in __snake. 293251881Speter */ 294251881Speter length[0] = position_list1->offset - position_list1->next->offset + 1 295251881Speter - unique_count[0]; 296251881Speter length[1] = position_list2->offset - position_list2->next->offset + 1 297251881Speter - unique_count[1]; 298251881Speter 299251881Speter /* strikerXXX: here we allocate the furthest point array, which is 300251881Speter * strikerXXX: sized M + N + 3 (!) 301251881Speter */ 302251881Speter fp = apr_pcalloc(pool, 303251881Speter sizeof(*fp) * (apr_size_t)(length[0] + length[1] + 3)); 304251881Speter 305251881Speter /* The origo of fp corresponds to the end state, where we are 306251881Speter * at the end of both files. The valid states thus span from 307251881Speter * -N (at end of first file and at the beginning of the second 308251881Speter * file) to +M (the opposite :). Finally, svn_diff__snake needs 309251881Speter * 1 extra slot on each side to work. 310251881Speter */ 311251881Speter fp += length[1] + 1; 312251881Speter 313251881Speter sentinel_position[0].next = position_list1->next; 314251881Speter position_list1->next = &sentinel_position[0]; 315251881Speter sentinel_position[0].offset = position_list1->offset + 1; 316251881Speter token_counts[0] = token_counts_list1; 317251881Speter 318251881Speter sentinel_position[1].next = position_list2->next; 319251881Speter position_list2->next = &sentinel_position[1]; 320251881Speter sentinel_position[1].offset = position_list2->offset + 1; 321251881Speter token_counts[1] = token_counts_list2; 322251881Speter 323251881Speter /* Negative indices will not be used elsewhere 324251881Speter */ 325251881Speter sentinel_position[0].token_index = -1; 326251881Speter sentinel_position[1].token_index = -2; 327251881Speter 328251881Speter /* position d = M - N corresponds to the initial state, where 329251881Speter * we are at the beginning of both files. 330251881Speter */ 331251881Speter d = length[0] - length[1]; 332251881Speter 333251881Speter /* k = d - 1 will be the first to be used to get previous 334251881Speter * position information from, make sure it holds sane 335251881Speter * data 336251881Speter */ 337251881Speter fp[d - 1].position[0] = sentinel_position[0].next; 338251881Speter fp[d - 1].position[1] = &sentinel_position[1]; 339251881Speter 340251881Speter p = 0; 341251881Speter do 342251881Speter { 343251881Speter /* For k < 0, insertions are free */ 344251881Speter for (k = (d < 0 ? d : 0) - p; k < 0; k++) 345251881Speter { 346251881Speter svn_diff__snake(fp + k, token_counts, &lcs_freelist, pool); 347251881Speter } 348251881Speter /* for k > 0, deletions are free */ 349251881Speter for (k = (d > 0 ? d : 0) + p; k >= 0; k--) 350251881Speter { 351251881Speter svn_diff__snake(fp + k, token_counts, &lcs_freelist, pool); 352251881Speter } 353251881Speter 354251881Speter p++; 355251881Speter } 356251881Speter while (fp[0].position[1] != &sentinel_position[1]); 357251881Speter 358251881Speter if (suffix_lines) 359251881Speter lcs->next = prepend_lcs(fp[0].lcs, suffix_lines, 360251881Speter lcs->position[0]->offset - suffix_lines, 361251881Speter lcs->position[1]->offset - suffix_lines, 362251881Speter pool); 363251881Speter else 364251881Speter lcs->next = fp[0].lcs; 365251881Speter 366251881Speter lcs = svn_diff__lcs_reverse(lcs); 367251881Speter 368251881Speter position_list1->next = sentinel_position[0].next; 369251881Speter position_list2->next = sentinel_position[1].next; 370251881Speter 371251881Speter if (prefix_lines) 372251881Speter return prepend_lcs(lcs, prefix_lines, 1, 1, pool); 373251881Speter else 374251881Speter return lcs; 375251881Speter} 376