1234285Sdim//===-- llvm/ADT/edit_distance.h - Array edit distance function --- C++ -*-===// 2234285Sdim// 3234285Sdim// The LLVM Compiler Infrastructure 4234285Sdim// 5234285Sdim// This file is distributed under the University of Illinois Open Source 6234285Sdim// License. See LICENSE.TXT for details. 7234285Sdim// 8234285Sdim//===----------------------------------------------------------------------===// 9234285Sdim// 10234285Sdim// This file defines a Levenshtein distance function that works for any two 11234285Sdim// sequences, with each element of each sequence being analogous to a character 12234285Sdim// in a string. 13234285Sdim// 14234285Sdim//===----------------------------------------------------------------------===// 15234285Sdim 16234285Sdim#ifndef LLVM_ADT_EDIT_DISTANCE_H 17234285Sdim#define LLVM_ADT_EDIT_DISTANCE_H 18234285Sdim 19234285Sdim#include "llvm/ADT/ArrayRef.h" 20234285Sdim#include <algorithm> 21276479Sdim#include <memory> 22234285Sdim 23234285Sdimnamespace llvm { 24234285Sdim 25234285Sdim/// \brief Determine the edit distance between two sequences. 26234285Sdim/// 27234285Sdim/// \param FromArray the first sequence to compare. 28234285Sdim/// 29234285Sdim/// \param ToArray the second sequence to compare. 30234285Sdim/// 31234285Sdim/// \param AllowReplacements whether to allow element replacements (change one 32234285Sdim/// element into another) as a single operation, rather than as two operations 33234285Sdim/// (an insertion and a removal). 34234285Sdim/// 35234285Sdim/// \param MaxEditDistance If non-zero, the maximum edit distance that this 36234285Sdim/// routine is allowed to compute. If the edit distance will exceed that 37234285Sdim/// maximum, returns \c MaxEditDistance+1. 38234285Sdim/// 39234285Sdim/// \returns the minimum number of element insertions, removals, or (if 40234285Sdim/// \p AllowReplacements is \c true) replacements needed to transform one of 41234285Sdim/// the given sequences into the other. If zero, the sequences are identical. 42234285Sdimtemplate<typename T> 43234285Sdimunsigned ComputeEditDistance(ArrayRef<T> FromArray, ArrayRef<T> ToArray, 44234285Sdim bool AllowReplacements = true, 45234285Sdim unsigned MaxEditDistance = 0) { 46234285Sdim // The algorithm implemented below is the "classic" 47234285Sdim // dynamic-programming algorithm for computing the Levenshtein 48234285Sdim // distance, which is described here: 49234285Sdim // 50234285Sdim // http://en.wikipedia.org/wiki/Levenshtein_distance 51234285Sdim // 52234285Sdim // Although the algorithm is typically described using an m x n 53288943Sdim // array, only one row plus one element are used at a time, so this 54288943Sdim // implementation just keeps one vector for the row. To update one entry, 55288943Sdim // only the entries to the left, top, and top-left are needed. The left 56288943Sdim // entry is in Row[x-1], the top entry is what's in Row[x] from the last 57288943Sdim // iteration, and the top-left entry is stored in Previous. 58234285Sdim typename ArrayRef<T>::size_type m = FromArray.size(); 59234285Sdim typename ArrayRef<T>::size_type n = ToArray.size(); 60234285Sdim 61234285Sdim const unsigned SmallBufferSize = 64; 62234285Sdim unsigned SmallBuffer[SmallBufferSize]; 63276479Sdim std::unique_ptr<unsigned[]> Allocated; 64288943Sdim unsigned *Row = SmallBuffer; 65288943Sdim if (n + 1 > SmallBufferSize) { 66288943Sdim Row = new unsigned[n + 1]; 67288943Sdim Allocated.reset(Row); 68234285Sdim } 69234285Sdim 70288943Sdim for (unsigned i = 1; i <= n; ++i) 71288943Sdim Row[i] = i; 72234285Sdim 73234285Sdim for (typename ArrayRef<T>::size_type y = 1; y <= m; ++y) { 74288943Sdim Row[0] = y; 75288943Sdim unsigned BestThisRow = Row[0]; 76234285Sdim 77288943Sdim unsigned Previous = y - 1; 78234285Sdim for (typename ArrayRef<T>::size_type x = 1; x <= n; ++x) { 79288943Sdim int OldRow = Row[x]; 80234285Sdim if (AllowReplacements) { 81288943Sdim Row[x] = std::min( 82288943Sdim Previous + (FromArray[y-1] == ToArray[x-1] ? 0u : 1u), 83288943Sdim std::min(Row[x-1], Row[x])+1); 84234285Sdim } 85234285Sdim else { 86288943Sdim if (FromArray[y-1] == ToArray[x-1]) Row[x] = Previous; 87288943Sdim else Row[x] = std::min(Row[x-1], Row[x]) + 1; 88234285Sdim } 89288943Sdim Previous = OldRow; 90288943Sdim BestThisRow = std::min(BestThisRow, Row[x]); 91234285Sdim } 92234285Sdim 93234285Sdim if (MaxEditDistance && BestThisRow > MaxEditDistance) 94234285Sdim return MaxEditDistance + 1; 95234285Sdim } 96234285Sdim 97288943Sdim unsigned Result = Row[n]; 98234285Sdim return Result; 99234285Sdim} 100234285Sdim 101234285Sdim} // End llvm namespace 102234285Sdim 103234285Sdim#endif 104