1//===- llvm/Support/UnicodeNameToCodepoint.cpp - Unicode character properties
2//-*- C++ -*-===//
3//
4// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5// See https://llvm.org/LICENSE.txt for license information.
6// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7//
8//===----------------------------------------------------------------------===//
9//
10// This file implements functions to map the name or alias of a unicode
11// character to its codepoint.
12//
13//===----------------------------------------------------------------------===//
14
15#include "llvm/ADT/STLExtras.h"
16#include "llvm/ADT/StringExtras.h"
17#include "llvm/ADT/StringRef.h"
18#include "llvm/Support/Unicode.h"
19
20namespace llvm {
21namespace sys {
22namespace unicode {
23
24extern const char *UnicodeNameToCodepointDict;
25extern const uint8_t *UnicodeNameToCodepointIndex;
26extern const std::size_t UnicodeNameToCodepointIndexSize;
27extern const std::size_t UnicodeNameToCodepointLargestNameSize;
28
29using BufferType = SmallString<64>;
30
31struct Node {
32  bool IsRoot = false;
33  char32_t Value = 0xFFFFFFFF;
34  uint32_t ChildrenOffset = 0;
35  bool HasSibling = false;
36  uint32_t Size = 0;
37  StringRef Name;
38  const Node *Parent = nullptr;
39
40  constexpr bool isValid() const {
41    return !Name.empty() || Value == 0xFFFFFFFF;
42  }
43  constexpr bool hasChildren() const { return ChildrenOffset != 0 || IsRoot; }
44
45  std::string fullName() const {
46    std::string S;
47    // Reserve enough space for most unicode code points.
48    // The chosen value represent the 99th percentile of name size as of
49    // Unicode 15.0.
50    S.reserve(46);
51    const Node *N = this;
52    while (N) {
53      std::reverse_copy(N->Name.begin(), N->Name.end(), std::back_inserter(S));
54      N = N->Parent;
55    }
56    std::reverse(S.begin(), S.end());
57    return S;
58  }
59};
60
61static Node createRoot() {
62  Node N;
63  N.IsRoot = true;
64  N.ChildrenOffset = 1;
65  N.Size = 1;
66  return N;
67}
68
69static Node readNode(uint32_t Offset, const Node *Parent = nullptr) {
70  if (Offset == 0)
71    return createRoot();
72
73  uint32_t Origin = Offset;
74  Node N;
75  N.Parent = Parent;
76  uint8_t NameInfo = UnicodeNameToCodepointIndex[Offset++];
77  if (Offset + 6 >= UnicodeNameToCodepointIndexSize)
78    return N;
79
80  bool LongName = NameInfo & 0x40;
81  bool HasValue = NameInfo & 0x80;
82  std::size_t Size = NameInfo & ~0xC0;
83  if (LongName) {
84    uint32_t NameOffset = (UnicodeNameToCodepointIndex[Offset++] << 8);
85    NameOffset |= UnicodeNameToCodepointIndex[Offset++];
86    N.Name = StringRef(UnicodeNameToCodepointDict + NameOffset, Size);
87  } else {
88    N.Name = StringRef(UnicodeNameToCodepointDict + Size, 1);
89  }
90  if (HasValue) {
91    uint8_t H = UnicodeNameToCodepointIndex[Offset++];
92    uint8_t M = UnicodeNameToCodepointIndex[Offset++];
93    uint8_t L = UnicodeNameToCodepointIndex[Offset++];
94    N.Value = ((H << 16) | (M << 8) | L) >> 3;
95
96    bool HasChildren = L & 0x02;
97    N.HasSibling = L & 0x01;
98
99    if (HasChildren) {
100      N.ChildrenOffset = UnicodeNameToCodepointIndex[Offset++] << 16;
101      N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++] << 8;
102      N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++];
103    }
104  } else {
105    uint8_t H = UnicodeNameToCodepointIndex[Offset++];
106    N.HasSibling = H & 0x80;
107    bool HasChildren = H & 0x40;
108    H &= uint8_t(~0xC0);
109    if (HasChildren) {
110      N.ChildrenOffset = (H << 16);
111      N.ChildrenOffset |=
112          (uint32_t(UnicodeNameToCodepointIndex[Offset++]) << 8);
113      N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++];
114    }
115  }
116  N.Size = Offset - Origin;
117  return N;
118}
119
120static bool startsWith(StringRef Name, StringRef Needle, bool Strict,
121                       std::size_t &Consummed, char &PreviousCharInName,
122                       char &PreviousCharInNeedle, bool IsPrefix = false) {
123
124  Consummed = 0;
125  if (Strict) {
126    if (!Name.startswith(Needle))
127      return false;
128    Consummed = Needle.size();
129    return true;
130  }
131  if (Needle.empty())
132    return true;
133
134  auto NamePos = Name.begin();
135  auto NeedlePos = Needle.begin();
136
137  char PreviousCharInNameOrigin = PreviousCharInName;
138  char PreviousCharInNeedleOrigin = PreviousCharInNeedle;
139
140  auto IgnoreSpaces = [](auto It, auto End, char &PreviousChar,
141                         bool IgnoreEnd = false) {
142    while (It != End) {
143      const auto Next = std::next(It);
144      // Ignore spaces, underscore, medial hyphens
145      // https://unicode.org/reports/tr44/#UAX44-LM2.
146      bool Ignore =
147          *It == ' ' || *It == '_' ||
148          (*It == '-' && isAlnum(PreviousChar) &&
149           ((Next != End && isAlnum(*Next)) || (Next == End && IgnoreEnd)));
150      PreviousChar = *It;
151      if (!Ignore)
152        break;
153      ++It;
154    }
155    return It;
156  };
157
158  while (true) {
159    NamePos = IgnoreSpaces(NamePos, Name.end(), PreviousCharInName);
160    NeedlePos =
161        IgnoreSpaces(NeedlePos, Needle.end(), PreviousCharInNeedle, IsPrefix);
162    if (NeedlePos == Needle.end())
163      break;
164    if (NamePos == Name.end())
165      break;
166    if (toUpper(*NeedlePos) != toUpper(*NamePos))
167      break;
168    NeedlePos++;
169    NamePos++;
170  }
171  Consummed = std::distance(Name.begin(), NamePos);
172  if (NeedlePos != Needle.end()) {
173    PreviousCharInName = PreviousCharInNameOrigin;
174    PreviousCharInNeedle = PreviousCharInNeedleOrigin;
175  }
176  return NeedlePos == Needle.end();
177}
178
179static std::tuple<Node, bool, uint32_t>
180compareNode(uint32_t Offset, StringRef Name, bool Strict,
181            char PreviousCharInName, char PreviousCharInNeedle,
182            BufferType &Buffer, const Node *Parent = nullptr) {
183  Node N = readNode(Offset, Parent);
184  std::size_t Consummed = 0;
185  bool DoesStartWith =
186      N.IsRoot || startsWith(Name, N.Name, Strict, Consummed,
187                             PreviousCharInName, PreviousCharInNeedle);
188  if (!DoesStartWith)
189    return std::make_tuple(N, false, 0);
190
191  if (Name.size() - Consummed == 0 && N.Value != 0xFFFFFFFF)
192    return std::make_tuple(N, true, N.Value);
193
194  if (N.hasChildren()) {
195    uint32_t ChildOffset = N.ChildrenOffset;
196    for (;;) {
197      Node C;
198      bool Matches;
199      uint32_t Value;
200      std::tie(C, Matches, Value) =
201          compareNode(ChildOffset, Name.substr(Consummed), Strict,
202                      PreviousCharInName, PreviousCharInNeedle, Buffer, &N);
203      if (Matches) {
204        std::reverse_copy(C.Name.begin(), C.Name.end(),
205                          std::back_inserter(Buffer));
206        return std::make_tuple(N, true, Value);
207      }
208      ChildOffset += C.Size;
209      if (!C.HasSibling)
210        break;
211    }
212  }
213  return std::make_tuple(N, false, 0);
214}
215
216static std::tuple<Node, bool, uint32_t>
217compareNode(uint32_t Offset, StringRef Name, bool Strict, BufferType &Buffer) {
218  return compareNode(Offset, Name, Strict, 0, 0, Buffer);
219}
220
221// clang-format off
222constexpr const char *const HangulSyllables[][3] = {
223    { "G",  "A",   ""   },
224    { "GG", "AE",  "G"  },
225    { "N",  "YA",  "GG" },
226    { "D",  "YAE", "GS" },
227    { "DD", "EO",  "N", },
228    { "R",  "E",   "NJ" },
229    { "M",  "YEO", "NH" },
230    { "B",  "YE",  "D"  },
231    { "BB", "O",   "L"  },
232    { "S",  "WA",  "LG" },
233    { "SS", "WAE", "LM" },
234    { "",   "OE",  "LB" },
235    { "J",  "YO",  "LS" },
236    { "JJ", "U",   "LT" },
237    { "C",  "WEO", "LP" },
238    { "K",  "WE",  "LH" },
239    { "T",  "WI",  "M"  },
240    { "P",  "YU",  "B"  },
241    { "H",  "EU",  "BS" },
242    { 0,    "YI",  "S"  },
243    { 0,    "I",   "SS" },
244    { 0,    0,     "NG" },
245    { 0,    0,     "J"  },
246    { 0,    0,     "C"  },
247    { 0,    0,     "K"  },
248    { 0,    0,     "T"  },
249    { 0,    0,     "P"  },
250    { 0,    0,     "H"  }
251    };
252// clang-format on
253
254// Unicode 15.0
255// 3.12 Conjoining Jamo Behavior Common constants
256constexpr const char32_t SBase = 0xAC00;
257constexpr const uint32_t LCount = 19;
258constexpr const uint32_t VCount = 21;
259constexpr const uint32_t TCount = 28;
260
261static std::size_t findSyllable(StringRef Name, bool Strict,
262                                char &PreviousInName, int &Pos, int Column) {
263  assert(Column == 0 || Column == 1 || Column == 2);
264  static std::size_t CountPerColumn[] = {LCount, VCount, TCount};
265  char NeedleStart = 0;
266  int Len = -1;
267  int Prev = PreviousInName;
268  for (std::size_t I = 0; I < CountPerColumn[Column]; I++) {
269    StringRef Syllable(HangulSyllables[I][Column]);
270    if (int(Syllable.size()) <= Len)
271      continue;
272    std::size_t Consummed = 0;
273    char PreviousInNameCopy = PreviousInName;
274    bool DoesStartWith = startsWith(Name, Syllable, Strict, Consummed,
275                                    PreviousInNameCopy, NeedleStart);
276    if (!DoesStartWith)
277      continue;
278    Len = Consummed;
279    Pos = I;
280    Prev = PreviousInNameCopy;
281  }
282  if (Len == -1)
283    return 0;
284  PreviousInName = Prev;
285  return size_t(Len);
286}
287
288static std::optional<char32_t>
289nameToHangulCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
290  Buffer.clear();
291  // Hangul Syllable Decomposition
292  std::size_t Consummed = 0;
293  char NameStart = 0, NeedleStart = 0;
294  bool DoesStartWith = startsWith(Name, "HANGUL SYLLABLE ", Strict, Consummed,
295                                  NameStart, NeedleStart);
296  if (!DoesStartWith)
297    return std::nullopt;
298  Name = Name.substr(Consummed);
299  int L = -1, V = -1, T = -1;
300  Name = Name.substr(findSyllable(Name, Strict, NameStart, L, 0));
301  Name = Name.substr(findSyllable(Name, Strict, NameStart, V, 1));
302  Name = Name.substr(findSyllable(Name, Strict, NameStart, T, 2));
303  if (L != -1 && V != -1 && T != -1 && Name.empty()) {
304    if (!Strict) {
305      Buffer.append("HANGUL SYLLABLE ");
306      if (L != -1)
307        Buffer.append(HangulSyllables[L][0]);
308      if (V != -1)
309        Buffer.append(HangulSyllables[V][1]);
310      if (T != -1)
311        Buffer.append(HangulSyllables[T][2]);
312    }
313    return SBase + (std::uint32_t(L) * VCount + std::uint32_t(V)) * TCount +
314           std::uint32_t(T);
315  }
316  // Otherwise, it's an illegal syllable name.
317  return std::nullopt;
318}
319
320struct GeneratedNamesData {
321  StringRef Prefix;
322  uint32_t Start;
323  uint32_t End;
324};
325
326// Unicode 15.0 Table 4-8. Name Derivation Rule Prefix Strings
327static const GeneratedNamesData GeneratedNamesDataTable[] = {
328    {"CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4DBF},
329    {"CJK UNIFIED IDEOGRAPH-", 0x4E00, 0x9FFF},
330    {"CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2A6DF},
331    {"CJK UNIFIED IDEOGRAPH-", 0x2A700, 0x2B739},
332    {"CJK UNIFIED IDEOGRAPH-", 0x2B740, 0x2B81D},
333    {"CJK UNIFIED IDEOGRAPH-", 0x2B820, 0x2CEA1},
334    {"CJK UNIFIED IDEOGRAPH-", 0x2CEB0, 0x2EBE0},
335    {"CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134A},
336    {"CJK UNIFIED IDEOGRAPH-", 0x31350, 0x323AF},
337    {"TANGUT IDEOGRAPH-", 0x17000, 0x187F7},
338    {"TANGUT IDEOGRAPH-", 0x18D00, 0x18D08},
339    {"KHITAN SMALL SCRIPT CHARACTER-", 0x18B00, 0x18CD5},
340    {"NUSHU CHARACTER-", 0x1B170, 0x1B2FB},
341    {"CJK COMPATIBILITY IDEOGRAPH-", 0xF900, 0xFA6D},
342    {"CJK COMPATIBILITY IDEOGRAPH-", 0xFA70, 0xFAD9},
343    {"CJK COMPATIBILITY IDEOGRAPH-", 0x2F800, 0x2FA1D},
344};
345
346static std::optional<char32_t>
347nameToGeneratedCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
348  for (auto &&Item : GeneratedNamesDataTable) {
349    Buffer.clear();
350    std::size_t Consummed = 0;
351    char NameStart = 0, NeedleStart = 0;
352    bool DoesStartWith = startsWith(Name, Item.Prefix, Strict, Consummed,
353                                    NameStart, NeedleStart, /*isPrefix*/ true);
354    if (!DoesStartWith)
355      continue;
356    auto Number = Name.substr(Consummed);
357    unsigned long long V = 0;
358    // Be consistent about mandating upper casing.
359    if (Strict &&
360        llvm::any_of(Number, [](char C) { return C >= 'a' && C <= 'f'; }))
361      return {};
362    if (getAsUnsignedInteger(Number, 16, V) || V < Item.Start || V > Item.End)
363      continue;
364    if (!Strict) {
365      Buffer.append(Item.Prefix);
366      Buffer.append(utohexstr(V, true));
367    }
368    return V;
369  }
370  return std::nullopt;
371}
372
373static std::optional<char32_t> nameToCodepoint(StringRef Name, bool Strict,
374                                               BufferType &Buffer) {
375  if (Name.empty())
376    return std::nullopt;
377
378  std::optional<char32_t> Res = nameToHangulCodePoint(Name, Strict, Buffer);
379  if (!Res)
380    Res = nameToGeneratedCodePoint(Name, Strict, Buffer);
381  if (Res)
382    return *Res;
383
384  Buffer.clear();
385  Node Node;
386  bool Matches;
387  uint32_t Value;
388  std::tie(Node, Matches, Value) = compareNode(0, Name, Strict, Buffer);
389  if (Matches) {
390    std::reverse(Buffer.begin(), Buffer.end());
391    // UAX44-LM2. Ignore case, whitespace, underscore ('_'), and all medial
392    // hyphens except the hyphen in U+1180 HANGUL JUNGSEONG O-E.
393    if (!Strict && Value == 0x116c &&
394        Name.find_insensitive("O-E") != StringRef::npos) {
395      Buffer = "HANGUL JUNGSEONG O-E";
396      Value = 0x1180;
397    }
398    return Value;
399  }
400  return std::nullopt;
401}
402
403std::optional<char32_t> nameToCodepointStrict(StringRef Name) {
404
405  BufferType Buffer;
406  auto Opt = nameToCodepoint(Name, true, Buffer);
407  return Opt;
408}
409
410std::optional<LooseMatchingResult>
411nameToCodepointLooseMatching(StringRef Name) {
412  BufferType Buffer;
413  auto Opt = nameToCodepoint(Name, false, Buffer);
414  if (!Opt)
415    return std::nullopt;
416  return LooseMatchingResult{*Opt, Buffer};
417}
418
419// Find the unicode character whose editing distance to Pattern
420// is shortest, using the Wagner���Fischer algorithm.
421llvm::SmallVector<MatchForCodepointName>
422nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount) {
423  // We maintain a fixed size vector of matches,
424  // sorted by distance
425  // The worst match (with the biggest distance) are discarded when new elements
426  // are added.
427  std::size_t LargestEditDistance = 0;
428  llvm::SmallVector<MatchForCodepointName> Matches;
429  Matches.reserve(MaxMatchesCount + 1);
430
431  auto Insert = [&](const Node &Node, uint32_t Distance,
432                    char32_t Value) -> bool {
433    if (Distance > LargestEditDistance) {
434      if (Matches.size() == MaxMatchesCount)
435        return false;
436      LargestEditDistance = Distance;
437    }
438    // To avoid allocations, the creation of the name is delayed
439    // as much as possible.
440    std::string Name;
441    auto GetName = [&] {
442      if (Name.empty())
443        Name = Node.fullName();
444      return Name;
445    };
446
447    auto It = llvm::lower_bound(
448        Matches, Distance,
449        [&](const MatchForCodepointName &a, std::size_t Distance) {
450          if (Distance == a.Distance)
451            return a.Name < GetName();
452          return a.Distance < Distance;
453        });
454    if (It == Matches.end() && Matches.size() == MaxMatchesCount)
455      return false;
456
457    MatchForCodepointName M{GetName(), Distance, Value};
458    Matches.insert(It, std::move(M));
459    if (Matches.size() > MaxMatchesCount)
460      Matches.pop_back();
461    return true;
462  };
463
464  // We ignore case, space, hyphens, etc,
465  // in both the search pattern and the prospective names.
466  auto Normalize = [](StringRef Name) {
467    std::string Out;
468    Out.reserve(Name.size());
469    for (char C : Name) {
470      if (isAlnum(C))
471        Out.push_back(toUpper(C));
472    }
473    return Out;
474  };
475  std::string NormalizedName = Normalize(Pattern);
476
477  // Allocate a matrix big enough for longest names.
478  const std::size_t Columns =
479      std::min(NormalizedName.size(), UnicodeNameToCodepointLargestNameSize) +
480      1;
481
482  LLVM_ATTRIBUTE_UNUSED static std::size_t Rows =
483      UnicodeNameToCodepointLargestNameSize + 1;
484
485  std::vector<char> Distances(
486      Columns * (UnicodeNameToCodepointLargestNameSize + 1), 0);
487
488  auto Get = [&Distances, Columns](size_t Column, std::size_t Row) -> char & {
489    assert(Column < Columns);
490    assert(Row < Rows);
491    return Distances[Row * Columns + Column];
492  };
493
494  for (std::size_t I = 0; I < Columns; I++)
495    Get(I, 0) = I;
496
497  // Visit the childrens,
498  // Filling (and overriding) the matrix for the name fragment of each node
499  // iteratively. CompleteName is used to collect the actual name of potential
500  // match, respecting case and spacing.
501  auto VisitNode = [&](const Node &N, std::size_t Row,
502                       auto &VisitNode) -> void {
503    std::size_t J = 0;
504    for (; J < N.Name.size(); J++) {
505      if (!isAlnum(N.Name[J]))
506        continue;
507
508      Get(0, Row) = Row;
509
510      for (std::size_t I = 1; I < Columns; I++) {
511        const int Delete = Get(I - 1, Row) + 1;
512        const int Insert = Get(I, Row - 1) + 1;
513
514        const int Replace =
515            Get(I - 1, Row - 1) + (NormalizedName[I - 1] != N.Name[J] ? 1 : 0);
516
517        Get(I, Row) = std::min(Insert, std::min(Delete, Replace));
518      }
519
520      Row++;
521    }
522
523    unsigned Cost = Get(Columns - 1, Row - 1);
524    if (N.Value != 0xFFFFFFFF) {
525      Insert(N, Cost, N.Value);
526    }
527
528    if (N.hasChildren()) {
529      auto ChildOffset = N.ChildrenOffset;
530      for (;;) {
531        Node C = readNode(ChildOffset, &N);
532        ChildOffset += C.Size;
533        if (!C.isValid())
534          break;
535        VisitNode(C, Row, VisitNode);
536        if (!C.HasSibling)
537          break;
538      }
539    }
540  };
541
542  Node Root = createRoot();
543  VisitNode(Root, 1, VisitNode);
544  return Matches;
545}
546
547} // namespace unicode
548
549} // namespace sys
550} // namespace llvm
551