utf8.cpp revision 0:a61af66fc99e
1/*
2 * Copyright 1997-2004 Sun Microsystems, Inc.  All Rights Reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
20 * CA 95054 USA or visit www.sun.com if you need additional information or
21 * have any questions.
22 *
23 */
24
25# include "incls/_precompiled.incl"
26# include "incls/_utf8.cpp.incl"
27
28// Assume the utf8 string is in legal form and has been
29// checked in the class file parser/format checker.
30char* UTF8::next(const char* str, jchar* value) {
31  unsigned const char *ptr = (const unsigned char *)str;
32  unsigned char ch, ch2, ch3;
33  int length = -1;              /* bad length */
34  jchar result;
35  switch ((ch = ptr[0]) >> 4) {
36    default:
37    result = ch;
38    length = 1;
39    break;
40
41  case 0x8: case 0x9: case 0xA: case 0xB: case 0xF:
42    /* Shouldn't happen. */
43    break;
44
45  case 0xC: case 0xD:
46    /* 110xxxxx  10xxxxxx */
47    if (((ch2 = ptr[1]) & 0xC0) == 0x80) {
48      unsigned char high_five = ch & 0x1F;
49      unsigned char low_six = ch2 & 0x3F;
50      result = (high_five << 6) + low_six;
51      length = 2;
52      break;
53    }
54    break;
55
56  case 0xE:
57    /* 1110xxxx 10xxxxxx 10xxxxxx */
58    if (((ch2 = ptr[1]) & 0xC0) == 0x80) {
59      if (((ch3 = ptr[2]) & 0xC0) == 0x80) {
60        unsigned char high_four = ch & 0x0f;
61        unsigned char mid_six = ch2 & 0x3f;
62        unsigned char low_six = ch3 & 0x3f;
63        result = (((high_four << 6) + mid_six) << 6) + low_six;
64        length = 3;
65      }
66    }
67    break;
68  } /* end of switch */
69
70  if (length <= 0) {
71    *value = ptr[0];    /* default bad result; */
72    return (char*)(ptr + 1); // make progress somehow
73  }
74
75  *value = result;
76
77  // The assert is correct but the .class file is wrong
78  // assert(UNICODE::utf8_size(result) == length, "checking reverse computation");
79  return (char *)(ptr + length);
80}
81
82char* UTF8::next_character(const char* str, jint* value) {
83  unsigned const char *ptr = (const unsigned char *)str;
84  /* See if it's legal supplementary character:
85     11101101 1010xxxx 10xxxxxx 11101101 1011xxxx 10xxxxxx */
86  if (is_supplementary_character(ptr)) {
87    *value = get_supplementary_character(ptr);
88    return (char *)(ptr + 6);
89  }
90  jchar result;
91  char* next_ch = next(str, &result);
92  *value = result;
93  return next_ch;
94}
95
96// Count bytes of the form 10xxxxxx and deduct this count
97// from the total byte count.  The utf8 string must be in
98// legal form which has been verified in the format checker.
99int UTF8::unicode_length(const char* str, int len) {
100  int num_chars = len;
101  for (int i = 0; i < len; i++) {
102    if ((str[i] & 0xC0) == 0x80) {
103      --num_chars;
104    }
105  }
106  return num_chars;
107}
108
109// Count bytes of the utf8 string except those in form
110// 10xxxxxx which only appear in multibyte characters.
111// The utf8 string must be in legal form and has been
112// verified in the format checker.
113int UTF8::unicode_length(const char* str) {
114  int num_chars = 0;
115  for (const char* p = str; *p; p++) {
116    if (((*p) & 0xC0) != 0x80) {
117      num_chars++;
118    }
119  }
120  return num_chars;
121}
122
123// Writes a jchar a utf8 and returns the end
124static u_char* utf8_write(u_char* base, jchar ch) {
125  if ((ch != 0) && (ch <=0x7f)) {
126    base[0] = (u_char) ch;
127    return base + 1;
128  }
129
130  if (ch <= 0x7FF) {
131    /* 11 bits or less. */
132    unsigned char high_five = ch >> 6;
133    unsigned char low_six = ch & 0x3F;
134    base[0] = high_five | 0xC0; /* 110xxxxx */
135    base[1] = low_six | 0x80;   /* 10xxxxxx */
136    return base + 2;
137  }
138  /* possibly full 16 bits. */
139  char high_four = ch >> 12;
140  char mid_six = (ch >> 6) & 0x3F;
141  char low_six = ch & 0x3f;
142  base[0] = high_four | 0xE0; /* 1110xxxx */
143  base[1] = mid_six | 0x80;   /* 10xxxxxx */
144  base[2] = low_six | 0x80;   /* 10xxxxxx */
145  return base + 3;
146}
147
148void UTF8::convert_to_unicode(const char* utf8_str, jchar* unicode_str, int unicode_length) {
149  unsigned char ch;
150  const char *ptr = (const char *)utf8_str;
151  int index = 0;
152
153  /* ASCII case loop optimization */
154  for (; index < unicode_length; index++) {
155    if((ch = ptr[0]) > 0x7F) { break; }
156    unicode_str[index] = ch;
157    ptr = (const char *)(ptr + 1);
158  }
159
160  for (; index < unicode_length; index++) {
161    ptr = UTF8::next(ptr, &unicode_str[index]);
162  }
163}
164
165// Returns NULL if 'c' it not found. This only works as long
166// as 'c' is an ASCII character
167jbyte* UTF8::strrchr(jbyte* base, int length, jbyte c) {
168  assert(length >= 0, "sanity check");
169  assert(c >= 0, "does not work for non-ASCII characters");
170  // Skip backwards in string until 'c' is found or end is reached
171  while(--length >= 0 && base[length] != c);
172  return (length < 0) ? NULL : &base[length];
173}
174
175bool UTF8::equal(jbyte* base1, int length1, jbyte* base2, int length2) {
176  // Length must be the same
177  if (length1 != length2) return false;
178  for (int i = 0; i < length1; i++) {
179    if (base1[i] != base2[i]) return false;
180  }
181  return true;
182}
183
184bool UTF8::is_supplementary_character(const unsigned char* str) {
185  return ((str[0] & 0xFF) == 0xED) && ((str[1] & 0xF0) == 0xA0) && ((str[2] & 0xC0) == 0x80)
186      && ((str[3] & 0xFF) == 0xED) && ((str[4] & 0xF0) == 0xB0) && ((str[5] & 0xC0) == 0x80);
187}
188
189jint UTF8::get_supplementary_character(const unsigned char* str) {
190  return 0x10000 + ((str[1] & 0x0f) << 16) + ((str[2] & 0x3f) << 10)
191                 + ((str[4] & 0x0f) << 6)  + (str[5] & 0x3f);
192}
193
194
195//-------------------------------------------------------------------------------------
196
197
198int UNICODE::utf8_size(jchar c) {
199  if ((0x0001 <= c) && (c <= 0x007F)) return 1;
200  if (c <= 0x07FF) return 2;
201  return 3;
202}
203
204int UNICODE::utf8_length(jchar* base, int length) {
205  int result = 0;
206  for (int index = 0; index < length; index++) {
207    jchar c = base[index];
208    if ((0x0001 <= c) && (c <= 0x007F)) result += 1;
209    else if (c <= 0x07FF) result += 2;
210    else result += 3;
211  }
212  return result;
213}
214
215char* UNICODE::as_utf8(jchar* base, int length) {
216  int utf8_len = utf8_length(base, length);
217  u_char* result = NEW_RESOURCE_ARRAY(u_char, utf8_len + 1);
218  u_char* p = result;
219  for (int index = 0; index < length; index++) {
220    p = utf8_write(p, base[index]);
221  }
222  *p = '\0';
223  assert(p == &result[utf8_len], "length prediction must be correct");
224  return (char*) result;
225}
226
227char* UNICODE::as_utf8(jchar* base, int length, char* buf, int buflen) {
228  u_char* p = (u_char*)buf;
229  u_char* end = (u_char*)buf + buflen;
230  for (int index = 0; index < length; index++) {
231    jchar c = base[index];
232    if (p + utf8_size(c) >= end) break;      // string is truncated
233    p = utf8_write(p, base[index]);
234  }
235  *p = '\0';
236  return buf;
237}
238
239void UNICODE::convert_to_utf8(const jchar* base, int length, char* utf8_buffer) {
240  for(int index = 0; index < length; index++) {
241    utf8_buffer = (char*)utf8_write((u_char*)utf8_buffer, base[index]);
242  }
243  *utf8_buffer = '\0';
244}
245