1239310Sdim//===- StringMatcher.cpp - Generate a matcher for input strings -----------===//
2239310Sdim//
3239310Sdim//                     The LLVM Compiler Infrastructure
4239310Sdim//
5239310Sdim// This file is distributed under the University of Illinois Open Source
6239310Sdim// License. See LICENSE.TXT for details.
7239310Sdim//
8239310Sdim//===----------------------------------------------------------------------===//
9239310Sdim//
10239310Sdim// This file implements the StringMatcher class.
11239310Sdim//
12239310Sdim//===----------------------------------------------------------------------===//
13239310Sdim
14239310Sdim#include "llvm/TableGen/StringMatcher.h"
15239310Sdim#include "llvm/Support/raw_ostream.h"
16239310Sdim#include <map>
17239310Sdimusing namespace llvm;
18239310Sdim
19239310Sdim/// FindFirstNonCommonLetter - Find the first character in the keys of the
20239310Sdim/// string pairs that is not shared across the whole set of strings.  All
21239310Sdim/// strings are assumed to have the same length.
22239310Sdimstatic unsigned
23239310SdimFindFirstNonCommonLetter(const std::vector<const
24239310Sdim                              StringMatcher::StringPair*> &Matches) {
25239310Sdim  assert(!Matches.empty());
26239310Sdim  for (unsigned i = 0, e = Matches[0]->first.size(); i != e; ++i) {
27239310Sdim    // Check to see if letter i is the same across the set.
28239310Sdim    char Letter = Matches[0]->first[i];
29239310Sdim
30239310Sdim    for (unsigned str = 0, e = Matches.size(); str != e; ++str)
31239310Sdim      if (Matches[str]->first[i] != Letter)
32239310Sdim        return i;
33239310Sdim  }
34239310Sdim
35239310Sdim  return Matches[0]->first.size();
36239310Sdim}
37239310Sdim
38239310Sdim/// EmitStringMatcherForChar - Given a set of strings that are known to be the
39239310Sdim/// same length and whose characters leading up to CharNo are the same, emit
40239310Sdim/// code to verify that CharNo and later are the same.
41239310Sdim///
42239310Sdim/// \return - True if control can leave the emitted code fragment.
43239310Sdimbool StringMatcher::
44239310SdimEmitStringMatcherForChar(const std::vector<const StringPair*> &Matches,
45239310Sdim                         unsigned CharNo, unsigned IndentCount) const {
46239310Sdim  assert(!Matches.empty() && "Must have at least one string to match!");
47239310Sdim  std::string Indent(IndentCount*2+4, ' ');
48239310Sdim
49239310Sdim  // If we have verified that the entire string matches, we're done: output the
50239310Sdim  // matching code.
51239310Sdim  if (CharNo == Matches[0]->first.size()) {
52239310Sdim    assert(Matches.size() == 1 && "Had duplicate keys to match on");
53239310Sdim
54239310Sdim    // If the to-execute code has \n's in it, indent each subsequent line.
55239310Sdim    StringRef Code = Matches[0]->second;
56239310Sdim
57239310Sdim    std::pair<StringRef, StringRef> Split = Code.split('\n');
58239310Sdim    OS << Indent << Split.first << "\t // \"" << Matches[0]->first << "\"\n";
59239310Sdim
60239310Sdim    Code = Split.second;
61239310Sdim    while (!Code.empty()) {
62239310Sdim      Split = Code.split('\n');
63239310Sdim      OS << Indent << Split.first << "\n";
64239310Sdim      Code = Split.second;
65239310Sdim    }
66239310Sdim    return false;
67239310Sdim  }
68239310Sdim
69239310Sdim  // Bucket the matches by the character we are comparing.
70239310Sdim  std::map<char, std::vector<const StringPair*> > MatchesByLetter;
71239310Sdim
72239310Sdim  for (unsigned i = 0, e = Matches.size(); i != e; ++i)
73239310Sdim    MatchesByLetter[Matches[i]->first[CharNo]].push_back(Matches[i]);
74239310Sdim
75239310Sdim
76239310Sdim  // If we have exactly one bucket to match, see how many characters are common
77239310Sdim  // across the whole set and match all of them at once.
78239310Sdim  if (MatchesByLetter.size() == 1) {
79239310Sdim    unsigned FirstNonCommonLetter = FindFirstNonCommonLetter(Matches);
80239310Sdim    unsigned NumChars = FirstNonCommonLetter-CharNo;
81239310Sdim
82239310Sdim    // Emit code to break out if the prefix doesn't match.
83239310Sdim    if (NumChars == 1) {
84239310Sdim      // Do the comparison with if (Str[1] != 'f')
85239310Sdim      // FIXME: Need to escape general characters.
86239310Sdim      OS << Indent << "if (" << StrVariableName << "[" << CharNo << "] != '"
87239310Sdim      << Matches[0]->first[CharNo] << "')\n";
88239310Sdim      OS << Indent << "  break;\n";
89239310Sdim    } else {
90239310Sdim      // Do the comparison with if memcmp(Str.data()+1, "foo", 3).
91239310Sdim      // FIXME: Need to escape general strings.
92239310Sdim      OS << Indent << "if (memcmp(" << StrVariableName << ".data()+" << CharNo
93239310Sdim         << ", \"" << Matches[0]->first.substr(CharNo, NumChars) << "\", "
94239310Sdim         << NumChars << "))\n";
95239310Sdim      OS << Indent << "  break;\n";
96239310Sdim    }
97239310Sdim
98239310Sdim    return EmitStringMatcherForChar(Matches, FirstNonCommonLetter, IndentCount);
99239310Sdim  }
100239310Sdim
101239310Sdim  // Otherwise, we have multiple possible things, emit a switch on the
102239310Sdim  // character.
103239310Sdim  OS << Indent << "switch (" << StrVariableName << "[" << CharNo << "]) {\n";
104239310Sdim  OS << Indent << "default: break;\n";
105239310Sdim
106239310Sdim  for (std::map<char, std::vector<const StringPair*> >::iterator LI =
107239310Sdim       MatchesByLetter.begin(), E = MatchesByLetter.end(); LI != E; ++LI) {
108239310Sdim    // TODO: escape hard stuff (like \n) if we ever care about it.
109239310Sdim    OS << Indent << "case '" << LI->first << "':\t // "
110239310Sdim       << LI->second.size() << " string";
111239310Sdim    if (LI->second.size() != 1) OS << 's';
112239310Sdim    OS << " to match.\n";
113239310Sdim    if (EmitStringMatcherForChar(LI->second, CharNo+1, IndentCount+1))
114239310Sdim      OS << Indent << "  break;\n";
115239310Sdim  }
116239310Sdim
117239310Sdim  OS << Indent << "}\n";
118239310Sdim  return true;
119239310Sdim}
120239310Sdim
121239310Sdim
122239310Sdim/// Emit - Top level entry point.
123239310Sdim///
124239310Sdimvoid StringMatcher::Emit(unsigned Indent) const {
125239310Sdim  // If nothing to match, just fall through.
126239310Sdim  if (Matches.empty()) return;
127239310Sdim
128239310Sdim  // First level categorization: group strings by length.
129239310Sdim  std::map<unsigned, std::vector<const StringPair*> > MatchesByLength;
130239310Sdim
131239310Sdim  for (unsigned i = 0, e = Matches.size(); i != e; ++i)
132239310Sdim    MatchesByLength[Matches[i].first.size()].push_back(&Matches[i]);
133239310Sdim
134239310Sdim  // Output a switch statement on length and categorize the elements within each
135239310Sdim  // bin.
136239310Sdim  OS.indent(Indent*2+2) << "switch (" << StrVariableName << ".size()) {\n";
137239310Sdim  OS.indent(Indent*2+2) << "default: break;\n";
138239310Sdim
139239310Sdim  for (std::map<unsigned, std::vector<const StringPair*> >::iterator LI =
140239310Sdim       MatchesByLength.begin(), E = MatchesByLength.end(); LI != E; ++LI) {
141239310Sdim    OS.indent(Indent*2+2) << "case " << LI->first << ":\t // "
142239310Sdim       << LI->second.size()
143239310Sdim       << " string" << (LI->second.size() == 1 ? "" : "s") << " to match.\n";
144239310Sdim    if (EmitStringMatcherForChar(LI->second, 0, Indent))
145239310Sdim      OS.indent(Indent*2+4) << "break;\n";
146239310Sdim  }
147239310Sdim
148239310Sdim  OS.indent(Indent*2+2) << "}\n";
149239310Sdim}
150