/*
 * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package jdk.nashorn.internal.runtime.linker;

/**
 * <p>
 * Implements the name mangling and demangling as specified by John Rose's
 * <a href="https://blogs.oracle.com/jrose/entry/symbolic_freedom_in_the_vm"
 * target="_blank">"Symbolic Freedom in the VM"</a> article. Normally, you would
 * mangle the names in the call sites as you're generating bytecode, and then
 * demangle them when you receive them in bootstrap methods.
 * </p>
 * <p>
 * This code is derived from sun.invoke.util.BytecodeName. We keep our own copy because
 * we only need a subset of that class, and because we do not want the nashorn module
 * to depend on a non-exported package of java.base.
 * </p>
 *
 * <h3>Comment from BytecodeName class reproduced here:</h3>
 *
 * Includes universal mangling rules for the JVM.
 *
 * <h3>Avoiding Dangerous Characters </h3>
 *
 * <p>
 * The JVM defines a very small set of characters which are illegal
 * in name spellings.  We will slightly extend and regularize this set
 * into a group of <cite>dangerous characters</cite>.
 * These characters will then be replaced, in mangled names, by escape sequences.
 * In addition, accidental escape sequences must be further escaped.
 * Finally, a special prefix will be applied if and only if
 * the mangling would otherwise fail to begin with the escape character.
 * This happens to cover the corner case of the null string,
 * and also clearly marks symbols which need demangling.
 * </p>
 * <p>
 * Dangerous characters are the union of all characters forbidden
 * or otherwise restricted by the JVM specification,
 * plus their mates, if they are brackets
 * (<code><big><b>[</b></big></code> and <code><big><b>]</b></big></code>,
 * <code><big><b>&lt;</b></big></code> and <code><big><b>&gt;</b></big></code>),
 * plus, arbitrarily, the colon character <code><big><b>:</b></big></code>.
 * There is no distinction between type, method, and field names.
 * This makes it easier to convert between mangled names of different
 * types, since they do not need to be decoded (demangled).
 * </p>
 * <p>
 * The escape character is backslash <code><big><b>\</b></big></code>
 * (also known as reverse solidus).
 * This character is, until now, unheard of in bytecode names,
 * but traditional in the proposed role.
 *
 * </p>
 * <h3> Replacement Characters </h3>
 *
 *
 * <p>
 * Every escape sequence is two characters
 * (in fact, two UTF8 bytes) beginning with
 * the escape character and followed by a
 * <cite>replacement character</cite>.
 * (Since the replacement character is never a backslash,
 * iterated manglings do not double in size.)
 * </p>
 * <p>
 * Each dangerous character has some rough visual similarity
 * to its corresponding replacement character.
 * This makes mangled symbols easier to recognize by sight.
 * </p>
 * <p>
 * The dangerous characters are
 * <code><big><b>/</b></big></code> (forward slash, used to delimit package components),
 * <code><big><b>.</b></big></code> (dot, also a package delimiter),
 * <code><big><b>;</b></big></code> (semicolon, used in signatures),
 * <code><big><b>$</b></big></code> (dollar, used in inner classes and synthetic members),
 * <code><big><b>&lt;</b></big></code> (left angle),
 * <code><big><b>&gt;</b></big></code> (right angle),
 * <code><big><b>[</b></big></code> (left square bracket, used in array types),
 * <code><big><b>]</b></big></code> (right square bracket, reserved in this scheme for language use),
 * and <code><big><b>:</b></big></code> (colon, reserved in this scheme for language use).
 * Their replacements are, respectively,
 * <code><big><b>|</b></big></code> (vertical bar),
 * <code><big><b>,</b></big></code> (comma),
 * <code><big><b>?</b></big></code> (question mark),
 * <code><big><b>%</b></big></code> (percent),
 * <code><big><b>^</b></big></code> (caret),
 * <code><big><b>_</b></big></code> (underscore),
 * <code><big><b>{</b></big></code> (left curly bracket),
 * <code><big><b>}</b></big></code> (right curly bracket), and
 * <code><big><b>!</b></big></code> (exclamation mark).
 * In addition, the replacement character for the escape character itself is
 * <code><big><b>-</b></big></code> (hyphen),
 * and the replacement character for the null prefix is
 * <code><big><b>=</b></big></code> (equal sign).
 * </p>
 * <p>
 * An escape character <code><big><b>\</b></big></code>
 * followed by any of these replacement characters
 * is an escape sequence, and there are no other escape sequences.
 * An equal sign is only part of an escape sequence
 * if it is the second character in the whole string, following a backslash.
 * Two consecutive backslashes do <em>not</em> form an escape sequence.
 * </p>
 * <p>
 * Each escape sequence replaces a so-called <cite>original character</cite>
 * which is either one of the dangerous characters or the escape character.
 * A null prefix replaces an initial null string, not a character.
 * </p>
 * <p>
 * All this implies that escape sequences cannot overlap and may be
 * determined all at once for a whole string.  Note that a spelling
 * string can contain <cite>accidental escapes</cite>, apparent escape
 * sequences which must not be interpreted as manglings.
 * These are disabled by replacing their leading backslash with an
 * escape sequence (<code><big><b>\-</b></big></code>).  To mangle a string, three logical steps
 * are required, though they may be carried out in one pass:
 * </p>
 * <ol>
 *   <li>In each accidental escape, replace the backslash with an escape sequence
 * (<code><big><b>\-</b></big></code>).</li>
 *   <li>Replace each dangerous character with an escape sequence
 * (<code><big><b>\|</b></big></code> for <code><big><b>/</b></big></code>, etc.).</li>
 *   <li>If the first two steps introduced any change, <em>and</em>
 * if the string does not already begin with a backslash, prepend a null prefix (<code><big><b>\=</b></big></code>).</li>
 * </ol>
 *
 * <p>
 * To demangle a mangled string that begins with an escape,
 * remove any null prefix, and then replace (in parallel)
 * each escape sequence by its original character.
 * </p>
 * <p>Spelling strings which contain accidental
 * escapes <em>must</em> have them replaced, even if those
 * strings do not contain dangerous characters.
 * This restriction means that mangling a string always
 * requires a scan of the string for escapes.
 * But then, a scan would be required anyway,
 * to check for dangerous characters.
 *
 * </p>
 * <h3> Nice Properties </h3>
 *
 * <p>
 * If a bytecode name does not contain any escape sequence,
 * demangling is a no-op:  The string demangles to itself.
 * Such a string is called <cite>self-mangling</cite>.
 * Almost all strings are self-mangling.
 * In practice, to demangle almost any name &ldquo;found in nature&rdquo;,
 * simply verify that it does not begin with a backslash.
 * </p>
 * <p>
 * Mangling is a one-to-one function, while demangling
 * is a many-to-one function.
 * A mangled string is defined as <cite>validly mangled</cite> if
 * it is in fact the unique mangling of its spelling string.
 * Three examples of invalidly mangled strings are <code><big><b>\=foo</b></big></code>,
 * <code><big><b>\-bar</b></big></code>, and <code><big><b>baz\!</b></big></code>, which demangle to <code><big><b>foo</b></big></code>, <code><big><b>\bar</b></big></code>, and
 * <code><big><b>baz\!</b></big></code>, but then remangle to <code><big><b>foo</b></big></code>, <code><big><b>\bar</b></big></code>, and <code><big><b>\=baz\-!</b></big></code>.
 * If a language back-end or runtime is using mangled names,
 * it should never present an invalidly mangled bytecode
 * name to the JVM.  If the runtime encounters one,
 * it should also report an error, since such an occurrence
 * probably indicates a bug in name encoding which
 * will lead to errors in linkage.
 * However, this note does not propose that the JVM verifier
 * detect invalidly mangled names.
 * </p>
 * <p>
 * As a result of these rules, it is a simple matter to
 * compute validly mangled substrings and concatenations
 * of validly mangled strings, and (with a little care)
 * these correspond to corresponding operations on their
 * spelling strings.
 * </p>
 * <ul>
 *   <li>Any prefix of a validly mangled string is also validly mangled,
 * although a null prefix may need to be removed.</li>
 *   <li>Any suffix of a validly mangled string is also validly mangled,
 * although a null prefix may need to be added.</li>
 *   <li>Two validly mangled strings, when concatenated,
 * are also validly mangled, although any null prefix
 * must be removed from the second string,
 * and a trailing backslash on the first string may need escaping,
 * if it would participate in an accidental escape when followed
 * by the first character of the second string.</li>
 * </ul>
 * <p>If languages that include non-Java symbol spellings use this
 * mangling convention, they will enjoy the following advantages:
 * </p>
 * <ul>
 *   <li>They can interoperate via symbols they share in common.</li>
 *   <li>Low-level tools, such as backtrace printers, will have readable displays.</li>
 *   <li>Future JVM and language extensions can safely use the dangerous characters
 * for structuring symbols, but will never interfere with valid spellings.</li>
 *   <li>Runtimes and compilers can use standard libraries for mangling and demangling.</li>
 *   <li>Occasional transliterations and name composition will be simple and regular,
 * for classes, methods, and fields.</li>
 *   <li>Bytecode names will continue to be compact.
 * When mangled, spellings will at most double in length, either in
 * UTF8 or UTF16 format, and most will not change at all.</li>
 * </ul>
 *
 *
 * <h3> Suggestions for Human Readable Presentations </h3>
 *
 *
 * <p>
 * For human readable displays of symbols,
 * it will be better to present a string-like quoted
 * representation of the spelling, because JVM users
 * are generally familiar with such tokens.
 * We suggest using single or double quotes before and after
 * mangled symbols which are not valid Java identifiers,
 * with quotes, backslashes, and non-printing characters
 * escaped as if for literals in the Java language.
 * </p>
 * <p>
 * For example, an HTML-like spelling
 * <code><big><b>&lt;pre&gt;</b></big></code> mangles to
 * <code><big><b>\^pre\_</b></big></code> and could
 * display more cleanly as
 * <code><big><b>'&lt;pre&gt;'</b></big></code>,
 * with the quotes included.
 * Such string-like conventions are <em>not</em> suitable
 * for mangled bytecode names, in part because
 * dangerous characters must be eliminated, rather
 * than just quoted.  Otherwise internally structured
 * strings like package prefixes and method signatures
 * could not be reliably parsed.
 * </p>
 * <p>
 * In such human-readable displays, invalidly mangled
 * names should <em>not</em> be demangled and quoted,
 * for this would be misleading.  Likewise, JVM symbols
 * which contain dangerous characters (like dots in field
 * names or brackets in method names) should not be
 * simply quoted.  The bytecode names
 * <code><big><b>\=phase\,1</b></big></code> and
 * <code><big><b>phase.1</b></big></code> are distinct,
 * and in demangled displays they should be presented as
 * <code><big><b>'phase.1'</b></big></code> and something like
 * <code><big><b>'phase'.1</b></big></code>, respectively.
 * </p>
 */
public final class NameCodec {
    /** The escape character that introduces every escape sequence. */
    private static final char ESCAPE_C = '\\';
    /** Second character of the null prefix; used to avoid a null name or an illegal prefix. */
    private static final char NULL_ESCAPE_C = '=';
    /** The null prefix: the escape character followed by the equal sign. */
    private static final String NULL_ESCAPE = ESCAPE_C + "" + NULL_ESCAPE_C;

    /**
     * Canonical encoding for the empty name.
     */
    public static final String EMPTY_NAME = new String(new char[] { ESCAPE_C, NULL_ESCAPE_C });

    /** Characters that are rewritten when mangling; the backslash must stay at index 0. */
    private static final String DANGEROUS_CHARS   = "\\/.;:$[]<>";
    /** Replacement for the character at the same index of {@code DANGEROUS_CHARS}. */
    private static final String REPLACEMENT_CHARS =  "-|,?!%{}^_";
    /** Index of the first character after the backslash in {@code DANGEROUS_CHARS}. */
    private static final int DANGEROUS_CHAR_FIRST_INDEX = 1;

    /**
     * 128-bit membership set (two longs) holding every character that occurs in
     * {@code DANGEROUS_CHARS} or {@code REPLACEMENT_CHARS}; consulted by
     * {@link #isSpecial(char)} before any string search is done.
     */
    private static final long[] SPECIAL_BITMAP = new long[2];
    static {
        final String special = DANGEROUS_CHARS + REPLACEMENT_CHARS;
        for (final char c : special.toCharArray()) {
            // c >>> 6 selects the word; a long shift only uses the low six bits of its count
            SPECIAL_BITMAP[c >>> 6] |= 1L << c;
        }
    }

    private NameCodec() {
    }

    /**
     * Encodes ("mangles") an unencoded symbolic name.
     * When assertions are enabled, the result is also checked to either be the
     * very same string or start with the escape character, and to decode back
     * to {@code name}.
     *
     * @param name the symbolic name to mangle
     * @return the mangled form of the symbolic name.
     * @throws NullPointerException if {@code name} is {@code null}
     */
    public static String encode(final String name) {
        final String bn = mangle(name);
        // identity comparison is intended: mangle returns its argument when nothing changed
        assert ((Object) bn == name || looksMangled(bn)) : bn;
        assert name.equals(decode(bn)) : name;
        return bn;
    }

    /**
     * Decodes ("demangles") an encoded symbolic name.
     * A name that does not start with the escape character (this includes the
     * empty string) is returned as is. When assertions are enabled, a decoded
     * name is also checked to mangle back to exactly {@code name}.
     *
     * @param name the symbolic name to demangle
     * @return the demangled form of the symbolic name.
     * @throws NullPointerException if {@code name} is {@code null}
     */
    public static String decode(final String name) {
        if (!looksMangled(name)) {
            return name;
        }
        final String sn = demangle(name);
        assert name.equals(mangle(sn)) : name + " => " + sn + " => " + mangle(sn);
        return sn;
    }

    /**
     * Tells whether {@code s} starts with the escape character.
     * The empty string never does, so it is safe to pass here.
     */
    private static boolean looksMangled(final String s) {
        return !s.isEmpty() && s.charAt(0) == ESCAPE_C;
    }

    /**
     * Produces the mangled form of {@code s}: the null prefix for the empty
     * string, {@code s} itself when nothing needs escaping, and otherwise a new
     * string in which accidental escapes and dangerous characters are rewritten.
     */
    private static String mangle(final String s) {
        if (s.isEmpty()) {
            return NULL_ESCAPE;
        }

        // build this lazily, when we first need an escape:
        StringBuilder sb = null;

        for (int i = 0, slen = s.length(); i < slen; i++) {
            final char c = s.charAt(i);

            final boolean needEscape;
            if (c == ESCAPE_C) {
                // a backslash is only rewritten when it would read as an escape sequence
                needEscape = i + 1 < slen && isAccidentalEscape(i, s.charAt(i + 1));
            } else {
                needEscape = isDangerous(c);
            }

            if (!needEscape) {
                if (sb != null) {
                    sb.append(c);
                }
                continue;
            }

            // build sb if this is the first escape
            if (sb == null) {
                sb = new StringBuilder(s.length() + 10);
                // mangled names must begin with a backslash; when i == 0 the
                // escape sequence appended below already provides it
                if (s.charAt(0) != ESCAPE_C && i > 0) {
                    sb.append(NULL_ESCAPE);
                }
                // append the string so far, which is unremarkable:
                sb.append(s, 0, i);
            }

            // rewrite \ to \-, / to \|, etc.
            sb.append(ESCAPE_C);
            sb.append(replacementOf(c));
        }

        return sb != null ? sb.toString() : s;
    }

    /**
     * Tells whether a backslash at {@code index} followed by {@code next} would
     * be read as an escape sequence: either the pair is the null prefix at the
     * very start of the string, or {@code next} is a replacement character.
     */
    private static boolean isAccidentalEscape(final int index, final char next) {
        return (index == 0 && next == NULL_ESCAPE_C)
            || next != originalOfReplacement(next);
    }

    /**
     * Produces the demangled form of {@code s}: a leading null prefix is
     * dropped and every escape sequence is replaced by its original character.
     */
    private static String demangle(final String s) {
        // build this lazily, when we first meet an escape:
        StringBuilder sb = null;

        final int stringStart = s.startsWith(NULL_ESCAPE) ? 2 : 0;

        for (int i = stringStart, slen = s.length(); i < slen; i++) {
            char c = s.charAt(i);

            if (c == ESCAPE_C && i + 1 < slen) {
                // might be an escape sequence
                final char rc = s.charAt(i + 1);
                final char oc = originalOfReplacement(rc);
                if (oc != rc) {
                    // build sb if this is the first escape
                    if (sb == null) {
                        sb = new StringBuilder(s.length());
                        // append the string so far, which is unremarkable:
                        sb.append(s, stringStart, i);
                    }
                    ++i;  // skip both characters
                    c = oc;
                }
            }

            if (sb != null) {
                sb.append(c);
            }
        }

        return sb != null ? sb.toString() : s.substring(stringStart);
    }

    /**
     * Tells whether {@code c} occurs in {@code DANGEROUS_CHARS} or
     * {@code REPLACEMENT_CHARS}; characters of 128 and above never do.
     */
    private static boolean isSpecial(final char c) {
        final int word = c >>> 6;
        if (word >= SPECIAL_BITMAP.length) {
            return false;
        }
        return ((SPECIAL_BITMAP[word] >> c) & 1) != 0;
    }

    /**
     * Returns the replacement character for {@code c}, or {@code c} itself when
     * it is not listed in {@code DANGEROUS_CHARS}.
     */
    private static char replacementOf(final char c) {
        if (!isSpecial(c)) {
            return c;
        }
        final int i = DANGEROUS_CHARS.indexOf(c);
        return i < 0 ? c : REPLACEMENT_CHARS.charAt(i);
    }

    /**
     * Returns the original character that {@code c} replaces, or {@code c}
     * itself when it is not listed in {@code REPLACEMENT_CHARS}.
     */
    private static char originalOfReplacement(final char c) {
        if (!isSpecial(c)) {
            return c;
        }
        final int i = REPLACEMENT_CHARS.indexOf(c);
        return i < 0 ? c : DANGEROUS_CHARS.charAt(i);
    }

    /**
     * Tells whether {@code c} is one of the dangerous characters; the backslash
     * at index 0 of {@code DANGEROUS_CHARS} is deliberately not counted.
     */
    private static boolean isDangerous(final char c) {
        if (!isSpecial(c)) {
            return false;
        }
        return DANGEROUS_CHARS.indexOf(c) >= DANGEROUS_CHAR_FIRST_INDEX;
    }
}
433