1234285Sdim//===-- llvm/ADT/edit_distance.h - Array edit distance function --- C++ -*-===// 2234285Sdim// 3234285Sdim// The LLVM Compiler Infrastructure 4234285Sdim// 5234285Sdim// This file is distributed under the University of Illinois Open Source 6234285Sdim// License. See LICENSE.TXT for details. 7234285Sdim// 8234285Sdim//===----------------------------------------------------------------------===// 9234285Sdim// 10234285Sdim// This file defines a Levenshtein distance function that works for any two 11234285Sdim// sequences, with each element of each sequence being analogous to a character 12234285Sdim// in a string. 13234285Sdim// 14234285Sdim//===----------------------------------------------------------------------===// 15234285Sdim 16234285Sdim#ifndef LLVM_ADT_EDIT_DISTANCE_H 17234285Sdim#define LLVM_ADT_EDIT_DISTANCE_H 18234285Sdim 19234285Sdim#include "llvm/ADT/ArrayRef.h" 20234285Sdim#include "llvm/ADT/OwningPtr.h" 21234285Sdim#include <algorithm> 22234285Sdim 23234285Sdimnamespace llvm { 24234285Sdim 25234285Sdim/// \brief Determine the edit distance between two sequences. 26234285Sdim/// 27234285Sdim/// \param FromArray the first sequence to compare. 28234285Sdim/// 29234285Sdim/// \param ToArray the second sequence to compare. 30234285Sdim/// 31234285Sdim/// \param AllowReplacements whether to allow element replacements (change one 32234285Sdim/// element into another) as a single operation, rather than as two operations 33234285Sdim/// (an insertion and a removal). 34234285Sdim/// 35234285Sdim/// \param MaxEditDistance If non-zero, the maximum edit distance that this 36234285Sdim/// routine is allowed to compute. If the edit distance will exceed that 37234285Sdim/// maximum, returns \c MaxEditDistance+1. 38234285Sdim/// 39234285Sdim/// \returns the minimum number of element insertions, removals, or (if 40234285Sdim/// \p AllowReplacements is \c true) replacements needed to transform one of 41234285Sdim/// the given sequences into the other. If zero, the sequences are identical. 42234285Sdimtemplate<typename T> 43234285Sdimunsigned ComputeEditDistance(ArrayRef<T> FromArray, ArrayRef<T> ToArray, 44234285Sdim bool AllowReplacements = true, 45234285Sdim unsigned MaxEditDistance = 0) { 46234285Sdim // The algorithm implemented below is the "classic" 47234285Sdim // dynamic-programming algorithm for computing the Levenshtein 48234285Sdim // distance, which is described here: 49234285Sdim // 50234285Sdim // http://en.wikipedia.org/wiki/Levenshtein_distance 51234285Sdim // 52234285Sdim // Although the algorithm is typically described using an m x n 53234285Sdim // array, only two rows are used at a time, so this implemenation 54234285Sdim // just keeps two separate vectors for those two rows. 55234285Sdim typename ArrayRef<T>::size_type m = FromArray.size(); 56234285Sdim typename ArrayRef<T>::size_type n = ToArray.size(); 57234285Sdim 58234285Sdim const unsigned SmallBufferSize = 64; 59234285Sdim unsigned SmallBuffer[SmallBufferSize]; 60234285Sdim llvm::OwningArrayPtr<unsigned> Allocated; 61234285Sdim unsigned *Previous = SmallBuffer; 62234285Sdim if (2*(n + 1) > SmallBufferSize) { 63234285Sdim Previous = new unsigned [2*(n+1)]; 64234285Sdim Allocated.reset(Previous); 65234285Sdim } 66234285Sdim unsigned *Current = Previous + (n + 1); 67234285Sdim 68234285Sdim for (unsigned i = 0; i <= n; ++i) 69234285Sdim Previous[i] = i; 70234285Sdim 71234285Sdim for (typename ArrayRef<T>::size_type y = 1; y <= m; ++y) { 72234285Sdim Current[0] = y; 73234285Sdim unsigned BestThisRow = Current[0]; 74234285Sdim 75234285Sdim for (typename ArrayRef<T>::size_type x = 1; x <= n; ++x) { 76234285Sdim if (AllowReplacements) { 77234285Sdim Current[x] = std::min( 78234285Sdim Previous[x-1] + (FromArray[y-1] == ToArray[x-1] ? 0u : 1u), 79234285Sdim std::min(Current[x-1], Previous[x])+1); 80234285Sdim } 81234285Sdim else { 82234285Sdim if (FromArray[y-1] == ToArray[x-1]) Current[x] = Previous[x-1]; 83234285Sdim else Current[x] = std::min(Current[x-1], Previous[x]) + 1; 84234285Sdim } 85234285Sdim BestThisRow = std::min(BestThisRow, Current[x]); 86234285Sdim } 87234285Sdim 88234285Sdim if (MaxEditDistance && BestThisRow > MaxEditDistance) 89234285Sdim return MaxEditDistance + 1; 90234285Sdim 91234285Sdim unsigned *tmp = Current; 92234285Sdim Current = Previous; 93234285Sdim Previous = tmp; 94234285Sdim } 95234285Sdim 96234285Sdim unsigned Result = Previous[n]; 97234285Sdim return Result; 98234285Sdim} 99234285Sdim 100234285Sdim} // End llvm namespace 101234285Sdim 102234285Sdim#endif 103