1/*
2 * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25#include "precompiled.hpp"
26#include "utilities/utf8.hpp"
27
28// Assume the utf8 string is in legal form and has been
29// checked in the class file parser/format checker.
30template<typename T> char* UTF8::next(const char* str, T* value) {
31  unsigned const char *ptr = (const unsigned char *)str;
32  unsigned char ch, ch2, ch3;
33  int length = -1;              /* bad length */
34  jchar result;
35  switch ((ch = ptr[0]) >> 4) {
36    default:
37    result = ch;
38    length = 1;
39    break;
40
41  case 0x8: case 0x9: case 0xA: case 0xB: case 0xF:
42    /* Shouldn't happen. */
43    break;
44
45  case 0xC: case 0xD:
46    /* 110xxxxx  10xxxxxx */
47    if (((ch2 = ptr[1]) & 0xC0) == 0x80) {
48      unsigned char high_five = ch & 0x1F;
49      unsigned char low_six = ch2 & 0x3F;
50      result = (high_five << 6) + low_six;
51      length = 2;
52      break;
53    }
54    break;
55
56  case 0xE:
57    /* 1110xxxx 10xxxxxx 10xxxxxx */
58    if (((ch2 = ptr[1]) & 0xC0) == 0x80) {
59      if (((ch3 = ptr[2]) & 0xC0) == 0x80) {
60        unsigned char high_four = ch & 0x0f;
61        unsigned char mid_six = ch2 & 0x3f;
62        unsigned char low_six = ch3 & 0x3f;
63        result = (((high_four << 6) + mid_six) << 6) + low_six;
64        length = 3;
65      }
66    }
67    break;
68  } /* end of switch */
69
70  if (length <= 0) {
71    *value = (T)ptr[0];    /* default bad result; */
72    return (char*)(ptr + 1); // make progress somehow
73  }
74
75  *value = (T)result;
76
77  // The assert is correct but the .class file is wrong
78  // assert(UNICODE::utf8_size(result) == length, "checking reverse computation");
79  return (char *)(ptr + length);
80}
81
82char* UTF8::next_character(const char* str, jint* value) {
83  unsigned const char *ptr = (const unsigned char *)str;
84  /* See if it's legal supplementary character:
85     11101101 1010xxxx 10xxxxxx 11101101 1011xxxx 10xxxxxx */
86  if (is_supplementary_character(ptr)) {
87    *value = get_supplementary_character(ptr);
88    return (char *)(ptr + 6);
89  }
90  jchar result;
91  char* next_ch = next(str, &result);
92  *value = result;
93  return next_ch;
94}
95
96// Count bytes of the form 10xxxxxx and deduct this count
97// from the total byte count.  The utf8 string must be in
98// legal form which has been verified in the format checker.
99int UTF8::unicode_length(const char* str, int len, bool& is_latin1, bool& has_multibyte) {
100  int num_chars = len;
101  has_multibyte = false;
102  is_latin1 = true;
103  unsigned char prev = 0;
104  for (int i = 0; i < len; i++) {
105    unsigned char c = str[i];
106    if ((c & 0xC0) == 0x80) {
107      // Multibyte, check if valid latin1 character.
108      has_multibyte = true;
109      if (prev > 0xC3) {
110        is_latin1 = false;
111      }
112      --num_chars;
113    }
114    prev = c;
115  }
116  return num_chars;
117}
118
119// Count bytes of the utf8 string except those in form
120// 10xxxxxx which only appear in multibyte characters.
121// The utf8 string must be in legal form and has been
122// verified in the format checker.
123int UTF8::unicode_length(const char* str, bool& is_latin1, bool& has_multibyte) {
124  int num_chars = 0;
125  has_multibyte = false;
126  is_latin1 = true;
127  unsigned char prev = 0;
128  for (const char* p = str; *p; p++) {
129    unsigned char c = (*p);
130    if ((c & 0xC0) == 0x80) {
131      // Multibyte, check if valid latin1 character.
132      has_multibyte = true;
133      if (prev > 0xC3) {
134        is_latin1 = false;
135      }
136    } else {
137      num_chars++;
138    }
139    prev = c;
140  }
141  return num_chars;
142}
143
144// Writes a jchar as utf8 and returns the end
145static u_char* utf8_write(u_char* base, jchar ch) {
146  if ((ch != 0) && (ch <=0x7f)) {
147    base[0] = (u_char) ch;
148    return base + 1;
149  }
150
151  if (ch <= 0x7FF) {
152    /* 11 bits or less. */
153    unsigned char high_five = ch >> 6;
154    unsigned char low_six = ch & 0x3F;
155    base[0] = high_five | 0xC0; /* 110xxxxx */
156    base[1] = low_six | 0x80;   /* 10xxxxxx */
157    return base + 2;
158  }
159  /* possibly full 16 bits. */
160  char high_four = ch >> 12;
161  char mid_six = (ch >> 6) & 0x3F;
162  char low_six = ch & 0x3f;
163  base[0] = high_four | 0xE0; /* 1110xxxx */
164  base[1] = mid_six | 0x80;   /* 10xxxxxx */
165  base[2] = low_six | 0x80;   /* 10xxxxxx */
166  return base + 3;
167}
168
169template<typename T> void UTF8::convert_to_unicode(const char* utf8_str, T* unicode_str, int unicode_length) {
170  unsigned char ch;
171  const char *ptr = utf8_str;
172  int index = 0;
173
174  /* ASCII case loop optimization */
175  for (; index < unicode_length; index++) {
176    if((ch = ptr[0]) > 0x7F) { break; }
177    unicode_str[index] = (T)ch;
178    ptr = (const char *)(ptr + 1);
179  }
180
181  for (; index < unicode_length; index++) {
182    ptr = UTF8::next(ptr, &unicode_str[index]);
183  }
184}
185
186// Explicit instantiation for all supported string types.
187template char* UTF8::next<jchar>(const char* str, jchar* value);
188template char* UTF8::next<jbyte>(const char* str, jbyte* value);
189template void UTF8::convert_to_unicode<jchar>(const char* utf8_str, jchar* unicode_str, int unicode_length);
190template void UTF8::convert_to_unicode<jbyte>(const char* utf8_str, jbyte* unicode_str, int unicode_length);
191
192// returns the quoted ascii length of a 0-terminated utf8 string
193int UTF8::quoted_ascii_length(const char* utf8_str, int utf8_length) {
194  const char *ptr = utf8_str;
195  const char* end = ptr + utf8_length;
196  int result = 0;
197  while (ptr < end) {
198    jchar c;
199    ptr = UTF8::next(ptr, &c);
200    if (c >= 32 && c < 127) {
201      result++;
202    } else {
203      result += 6;
204    }
205  }
206  return result;
207}
208
209// converts a utf8 string to quoted ascii
210void UTF8::as_quoted_ascii(const char* utf8_str, int utf8_length, char* buf, int buflen) {
211  const char *ptr = utf8_str;
212  const char *utf8_end = ptr + utf8_length;
213  char* p = buf;
214  char* end = buf + buflen;
215  while (ptr < utf8_end) {
216    jchar c;
217    ptr = UTF8::next(ptr, &c);
218    if (c >= 32 && c < 127) {
219      if (p + 1 >= end) break;      // string is truncated
220      *p++ = (char)c;
221    } else {
222      if (p + 6 >= end) break;      // string is truncated
223      sprintf(p, "\\u%04x", c);
224      p += 6;
225    }
226  }
227  assert(p < end, "sanity");
228  *p = '\0';
229}
230
231
232const char* UTF8::from_quoted_ascii(const char* quoted_ascii_str) {
233  const char *ptr = quoted_ascii_str;
234  char* result = NULL;
235  while (*ptr != '\0') {
236    char c = *ptr;
237    if (c < 32 || c >= 127) break;
238  }
239  if (*ptr == '\0') {
240    // nothing to do so return original string
241    return quoted_ascii_str;
242  }
243  // everything up to this point was ok.
244  int length = ptr - quoted_ascii_str;
245  char* buffer = NULL;
246  for (int round = 0; round < 2; round++) {
247    while (*ptr != '\0') {
248      if (*ptr != '\\') {
249        if (buffer != NULL) {
250          buffer[length] = *ptr;
251        }
252        length++;
253      } else {
254        switch (ptr[1]) {
255          case 'u': {
256            ptr += 2;
257            jchar value=0;
258            for (int i=0; i<4; i++) {
259              char c = *ptr++;
260              switch (c) {
261                case '0': case '1': case '2': case '3': case '4':
262                case '5': case '6': case '7': case '8': case '9':
263                  value = (value << 4) + c - '0';
264                  break;
265                case 'a': case 'b': case 'c':
266                case 'd': case 'e': case 'f':
267                  value = (value << 4) + 10 + c - 'a';
268                  break;
269                case 'A': case 'B': case 'C':
270                case 'D': case 'E': case 'F':
271                  value = (value << 4) + 10 + c - 'A';
272                  break;
273                default:
274                  ShouldNotReachHere();
275              }
276            }
277            if (buffer == NULL) {
278              char utf8_buffer[4];
279              char* next = (char*)utf8_write((u_char*)utf8_buffer, value);
280              length += next - utf8_buffer;
281            } else {
282              char* next = (char*)utf8_write((u_char*)&buffer[length], value);
283              length += next - &buffer[length];
284            }
285            break;
286          }
287          case 't': if (buffer != NULL) buffer[length] = '\t'; ptr += 2; length++; break;
288          case 'n': if (buffer != NULL) buffer[length] = '\n'; ptr += 2; length++; break;
289          case 'r': if (buffer != NULL) buffer[length] = '\r'; ptr += 2; length++; break;
290          case 'f': if (buffer != NULL) buffer[length] = '\f'; ptr += 2; length++; break;
291          default:
292            ShouldNotReachHere();
293        }
294      }
295    }
296    if (round == 0) {
297      buffer = NEW_RESOURCE_ARRAY(char, length + 1);
298      ptr = quoted_ascii_str;
299    } else {
300      buffer[length] = '\0';
301    }
302  }
303  return buffer;
304}
305
306
307// Returns NULL if 'c' it not found. This only works as long
308// as 'c' is an ASCII character
309const jbyte* UTF8::strrchr(const jbyte* base, int length, jbyte c) {
310  assert(length >= 0, "sanity check");
311  assert(c >= 0, "does not work for non-ASCII characters");
312  // Skip backwards in string until 'c' is found or end is reached
313  while(--length >= 0 && base[length] != c);
314  return (length < 0) ? NULL : &base[length];
315}
316
317bool UTF8::equal(const jbyte* base1, int length1, const jbyte* base2, int length2) {
318  // Length must be the same
319  if (length1 != length2) return false;
320  for (int i = 0; i < length1; i++) {
321    if (base1[i] != base2[i]) return false;
322  }
323  return true;
324}
325
326bool UTF8::is_supplementary_character(const unsigned char* str) {
327  return ((str[0] & 0xFF) == 0xED) && ((str[1] & 0xF0) == 0xA0) && ((str[2] & 0xC0) == 0x80)
328      && ((str[3] & 0xFF) == 0xED) && ((str[4] & 0xF0) == 0xB0) && ((str[5] & 0xC0) == 0x80);
329}
330
331jint UTF8::get_supplementary_character(const unsigned char* str) {
332  return 0x10000 + ((str[1] & 0x0f) << 16) + ((str[2] & 0x3f) << 10)
333                 + ((str[4] & 0x0f) << 6)  + (str[5] & 0x3f);
334}
335
336bool UTF8::is_legal_utf8(const unsigned char* buffer, int length,
337                         bool version_leq_47) {
338  int i = 0;
339  int count = length >> 2;
340  for (int k=0; k<count; k++) {
341    unsigned char b0 = buffer[i];
342    unsigned char b1 = buffer[i+1];
343    unsigned char b2 = buffer[i+2];
344    unsigned char b3 = buffer[i+3];
345    // For an unsigned char v,
346    // (v | v - 1) is < 128 (highest bit 0) for 0 < v < 128;
347    // (v | v - 1) is >= 128 (highest bit 1) for v == 0 or v >= 128.
348    unsigned char res = b0 | b0 - 1 |
349                        b1 | b1 - 1 |
350                        b2 | b2 - 1 |
351                        b3 | b3 - 1;
352    if (res >= 128) break;
353    i += 4;
354  }
355  for(; i < length; i++) {
356    unsigned short c;
357    // no embedded zeros
358    if (buffer[i] == 0) return false;
359    if(buffer[i] < 128) {
360      continue;
361    }
362    if ((i + 5) < length) { // see if it's legal supplementary character
363      if (UTF8::is_supplementary_character(&buffer[i])) {
364        c = UTF8::get_supplementary_character(&buffer[i]);
365        i += 5;
366        continue;
367      }
368    }
369    switch (buffer[i] >> 4) {
370      default: break;
371      case 0x8: case 0x9: case 0xA: case 0xB: case 0xF:
372        return false;
373      case 0xC: case 0xD:  // 110xxxxx  10xxxxxx
374        c = (buffer[i] & 0x1F) << 6;
375        i++;
376        if ((i < length) && ((buffer[i] & 0xC0) == 0x80)) {
377          c += buffer[i] & 0x3F;
378          if (version_leq_47 || c == 0 || c >= 0x80) {
379            break;
380          }
381        }
382        return false;
383      case 0xE:  // 1110xxxx 10xxxxxx 10xxxxxx
384        c = (buffer[i] & 0xF) << 12;
385        i += 2;
386        if ((i < length) && ((buffer[i-1] & 0xC0) == 0x80) && ((buffer[i] & 0xC0) == 0x80)) {
387          c += ((buffer[i-1] & 0x3F) << 6) + (buffer[i] & 0x3F);
388          if (version_leq_47 || c >= 0x800) {
389            break;
390          }
391        }
392        return false;
393    }  // end of switch
394  } // end of for
395  return true;
396}
397
398//-------------------------------------------------------------------------------------
399
400bool UNICODE::is_latin1(jchar c) {
401  return (c <= 0x00FF);
402}
403
404bool UNICODE::is_latin1(jchar* base, int length) {
405  for (int index = 0; index < length; index++) {
406    if (base[index] > 0x00FF) {
407      return false;
408    }
409  }
410  return true;
411}
412
413int UNICODE::utf8_size(jchar c) {
414  if ((0x0001 <= c) && (c <= 0x007F)) {
415    // ASCII character
416    return 1;
417  } else  if (c <= 0x07FF) {
418    return 2;
419  } else {
420    return 3;
421  }
422}
423
424int UNICODE::utf8_size(jbyte c) {
425  if (c >= 0x01) {
426    // ASCII character. Check is equivalent to
427    // (0x01 <= c) && (c <= 0x7F) because c is signed.
428    return 1;
429  } else {
430    // Non-ASCII character or 0x00 which needs to be
431    // two-byte encoded as 0xC080 in modified UTF-8.
432    return 2;
433  }
434}
435
436template<typename T>
437int UNICODE::utf8_length(T* base, int length) {
438  int result = 0;
439  for (int index = 0; index < length; index++) {
440    T c = base[index];
441    result += utf8_size(c);
442  }
443  return result;
444}
445
446template<typename T>
447char* UNICODE::as_utf8(T* base, int& length) {
448  int utf8_len = utf8_length(base, length);
449  u_char* buf = NEW_RESOURCE_ARRAY(u_char, utf8_len + 1);
450  char* result = as_utf8(base, length, (char*) buf, utf8_len + 1);
451  assert((int) strlen(result) == utf8_len, "length prediction must be correct");
452  // Set string length to uft8 length
453  length = utf8_len;
454  return (char*) result;
455}
456
457char* UNICODE::as_utf8(jchar* base, int length, char* buf, int buflen) {
458  u_char* p = (u_char*)buf;
459  for (int index = 0; index < length; index++) {
460    jchar c = base[index];
461    buflen -= utf8_size(c);
462    if (buflen <= 0) break; // string is truncated
463    p = utf8_write(p, c);
464  }
465  *p = '\0';
466  return buf;
467}
468
469char* UNICODE::as_utf8(jbyte* base, int length, char* buf, int buflen) {
470  u_char* p = (u_char*)buf;
471  u_char* end = (u_char*)buf + buflen;
472  for (int index = 0; index < length; index++) {
473    jbyte c = base[index];
474    int sz = utf8_size(c);
475    buflen -= sz;
476    if (buflen <= 0) break; // string is truncated
477    if (sz == 1) {
478      // Copy ASCII characters (UTF-8 is ASCII compatible)
479      *p++ = c;
480    } else {
481      // Non-ASCII character or 0x00 which should
482      // be encoded as 0xC080 in "modified" UTF8.
483      p = utf8_write(p, ((jchar) c) & 0xff);
484    }
485  }
486  *p = '\0';
487  return buf;
488}
489
490void UNICODE::convert_to_utf8(const jchar* base, int length, char* utf8_buffer) {
491  for(int index = 0; index < length; index++) {
492    utf8_buffer = (char*)utf8_write((u_char*)utf8_buffer, base[index]);
493  }
494  *utf8_buffer = '\0';
495}
496
497// returns the quoted ascii length of a unicode string
498template<typename T>
499int UNICODE::quoted_ascii_length(T* base, int length) {
500  int result = 0;
501  for (int i = 0; i < length; i++) {
502    T c = base[i];
503    if (c >= 32 && c < 127) {
504      result++;
505    } else {
506      result += 6;
507    }
508  }
509  return result;
510}
511
512// converts a unicode string to quoted ascii
513template<typename T>
514void UNICODE::as_quoted_ascii(const T* base, int length, char* buf, int buflen) {
515  char* p = buf;
516  char* end = buf + buflen;
517  for (int index = 0; index < length; index++) {
518    T c = base[index];
519    if (c >= 32 && c < 127) {
520      if (p + 1 >= end) break;      // string is truncated
521      *p++ = (char)c;
522    } else {
523      if (p + 6 >= end) break;      // string is truncated
524      sprintf(p, "\\u%04x", c);
525      p += 6;
526    }
527  }
528  *p = '\0';
529}
530
531// Explicit instantiation for all supported types.
532template int UNICODE::utf8_length(jbyte* base, int length);
533template int UNICODE::utf8_length(jchar* base, int length);
534template char* UNICODE::as_utf8(jbyte* base, int& length);
535template char* UNICODE::as_utf8(jchar* base, int& length);
536template int UNICODE::quoted_ascii_length<jbyte>(jbyte* base, int length);
537template int UNICODE::quoted_ascii_length<jchar>(jchar* base, int length);
538template void UNICODE::as_quoted_ascii<jbyte>(const jbyte* base, int length, char* buf, int buflen);
539template void UNICODE::as_quoted_ascii<jchar>(const jchar* base, int length, char* buf, int buflen);
540