1/* 2 * unicode.c 3 * 4 * PURPOSE 5 * Routines for converting between UTF-8 and OSTA Compressed Unicode. 6 * Also handles filename mangling 7 * 8 * DESCRIPTION 9 * OSTA Compressed Unicode is explained in the OSTA UDF specification. 10 * http://www.osta.org/ 11 * UTF-8 is explained in the IETF RFC XXXX. 12 * ftp://ftp.internic.net/rfc/rfcxxxx.txt 13 * 14 * CONTACTS 15 * E-mail regarding any portion of the Linux UDF file system should be 16 * directed to the development team's mailing list (run by majordomo): 17 * linux_udf@hpesjro.fc.hp.com 18 * 19 * COPYRIGHT 20 * This file is distributed under the terms of the GNU General Public 21 * License (GPL). Copies of the GPL can be obtained from: 22 * ftp://prep.ai.mit.edu/pub/gnu/GPL 23 * Each contributing author retains all rights to their own work. 24 */ 25 26#include "udfdecl.h" 27 28#include <linux/kernel.h> 29#include <linux/string.h> /* for memset */ 30#include <linux/nls.h> 31#include <linux/udf_fs.h> 32 33#include "udf_sb.h" 34 35int udf_ustr_to_dchars(uint8_t *dest, const struct ustr *src, int strlen) 36{ 37 if ( (!dest) || (!src) || (!strlen) || (src->u_len > strlen) ) 38 return 0; 39 memcpy(dest+1, src->u_name, src->u_len); 40 dest[0] = src->u_cmpID; 41 return src->u_len + 1; 42} 43 44int udf_ustr_to_char(uint8_t *dest, const struct ustr *src, int strlen) 45{ 46 if ( (!dest) || (!src) || (!strlen) || (src->u_len >= strlen) ) 47 return 0; 48 memcpy(dest, src->u_name, src->u_len); 49 return src->u_len; 50} 51 52int udf_ustr_to_dstring(dstring *dest, const struct ustr *src, int dlength) 53{ 54 if ( udf_ustr_to_dchars(dest, src, dlength-1) ) 55 { 56 dest[dlength-1] = src->u_len + 1; 57 return dlength; 58 } 59 else 60 return 0; 61} 62 63int udf_dchars_to_ustr(struct ustr *dest, const uint8_t *src, int strlen) 64{ 65 if ( (!dest) || (!src) || (!strlen) || (strlen > UDF_NAME_LEN) ) 66 return 0; 67 memset(dest, 0, sizeof(struct ustr)); 68 memcpy(dest->u_name, src+1, strlen-1); 69 dest->u_cmpID = src[0]; 70 dest->u_len = strlen-1; 71 return strlen-1; 72} 73 74int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen) 75{ 76 if ( (!dest) || (!src) || (!strlen) || (strlen >= UDF_NAME_LEN) ) 77 return 0; 78 memset(dest, 0, sizeof(struct ustr)); 79 memcpy(dest->u_name, src, strlen); 80 dest->u_cmpID = 0x08; 81 dest->u_len = strlen; 82 return strlen; 83} 84 85 86int udf_dstring_to_ustr(struct ustr *dest, const dstring *src, int dlength) 87{ 88 if ( dlength && udf_dchars_to_ustr(dest, src, src[dlength-1]) ) 89 return dlength; 90 else 91 return 0; 92} 93 94/* 95 * udf_build_ustr 96 */ 97int udf_build_ustr(struct ustr *dest, dstring *ptr, int size) 98{ 99 int usesize; 100 101 if ( (!dest) || (!ptr) || (!size) ) 102 return -1; 103 104 memset(dest, 0, sizeof(struct ustr)); 105 usesize= (size > UDF_NAME_LEN) ? UDF_NAME_LEN : size; 106 dest->u_cmpID=ptr[0]; 107 dest->u_len=ptr[size-1]; 108 memcpy(dest->u_name, ptr+1, usesize-1); 109 return 0; 110} 111 112/* 113 * udf_build_ustr_exact 114 */ 115int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize) 116{ 117 if ( (!dest) || (!ptr) || (!exactsize) ) 118 return -1; 119 120 memset(dest, 0, sizeof(struct ustr)); 121 dest->u_cmpID=ptr[0]; 122 dest->u_len=exactsize-1; 123 memcpy(dest->u_name, ptr+1, exactsize-1); 124 return 0; 125} 126 127/* 128 * udf_ocu_to_utf8 129 * 130 * PURPOSE 131 * Convert OSTA Compressed Unicode to the UTF-8 equivalent. 132 * 133 * DESCRIPTION 134 * This routine is only called by udf_filldir(). 135 * 136 * PRE-CONDITIONS 137 * utf Pointer to UTF-8 output buffer. 138 * ocu Pointer to OSTA Compressed Unicode input buffer 139 * of size UDF_NAME_LEN bytes. 140 * both of type "struct ustr *" 141 * 142 * POST-CONDITIONS 143 * <return> Zero on success. 144 * 145 * HISTORY 146 * November 12, 1997 - Andrew E. Mileski 147 * Written, tested, and released. 148 */ 149int udf_CS0toUTF8(struct ustr *utf_o, struct ustr *ocu_i) 150{ 151 uint8_t *ocu; 152 uint32_t c; 153 uint8_t cmp_id, ocu_len; 154 int i; 155 156 ocu = ocu_i->u_name; 157 158 ocu_len = ocu_i->u_len; 159 cmp_id = ocu_i->u_cmpID; 160 utf_o->u_len = 0; 161 162 if (ocu_len == 0) 163 { 164 memset(utf_o, 0, sizeof(struct ustr)); 165 utf_o->u_cmpID = 0; 166 utf_o->u_len = 0; 167 return 0; 168 } 169 170 if ((cmp_id != 8) && (cmp_id != 16)) 171 { 172 printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name); 173 return 0; 174 } 175 176 for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN-3)) ;) 177 { 178 179 /* Expand OSTA compressed Unicode to Unicode */ 180 c = ocu[i++]; 181 if (cmp_id == 16) 182 c = (c << 8) | ocu[i++]; 183 184 /* Compress Unicode to UTF-8 */ 185 if (c < 0x80U) 186 utf_o->u_name[utf_o->u_len++] = (uint8_t)c; 187 else if (c < 0x800U) 188 { 189 utf_o->u_name[utf_o->u_len++] = (uint8_t)(0xc0 | (c >> 6)); 190 utf_o->u_name[utf_o->u_len++] = (uint8_t)(0x80 | (c & 0x3f)); 191 } 192 else 193 { 194 utf_o->u_name[utf_o->u_len++] = (uint8_t)(0xe0 | (c >> 12)); 195 utf_o->u_name[utf_o->u_len++] = (uint8_t)(0x80 | ((c >> 6) & 0x3f)); 196 utf_o->u_name[utf_o->u_len++] = (uint8_t)(0x80 | (c & 0x3f)); 197 } 198 } 199 utf_o->u_cmpID=8; 200 201 return utf_o->u_len; 202} 203 204/* 205 * 206 * udf_utf8_to_ocu 207 * 208 * PURPOSE 209 * Convert UTF-8 to the OSTA Compressed Unicode equivalent. 210 * 211 * DESCRIPTION 212 * This routine is only called by udf_lookup(). 213 * 214 * PRE-CONDITIONS 215 * ocu Pointer to OSTA Compressed Unicode output 216 * buffer of size UDF_NAME_LEN bytes. 217 * utf Pointer to UTF-8 input buffer. 218 * utf_len Length of UTF-8 input buffer in bytes. 219 * 220 * POST-CONDITIONS 221 * <return> Zero on success. 222 * 223 * HISTORY 224 * November 12, 1997 - Andrew E. Mileski 225 * Written, tested, and released. 226 */ 227int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length) 228{ 229 unsigned c, i, max_val, utf_char; 230 int utf_cnt; 231 int u_len = 0; 232 233 memset(ocu, 0, sizeof(dstring) * length); 234 ocu[0] = 8; 235 max_val = 0xffU; 236 237try_again: 238 utf_char = 0U; 239 utf_cnt = 0U; 240 for (i = 0U; i < utf->u_len; i++) 241 { 242 c = (uint8_t)utf->u_name[i]; 243 244 /* Complete a multi-byte UTF-8 character */ 245 if (utf_cnt) 246 { 247 utf_char = (utf_char << 6) | (c & 0x3fU); 248 if (--utf_cnt) 249 continue; 250 } 251 else 252 { 253 /* Check for a multi-byte UTF-8 character */ 254 if (c & 0x80U) 255 { 256 /* Start a multi-byte UTF-8 character */ 257 if ((c & 0xe0U) == 0xc0U) 258 { 259 utf_char = c & 0x1fU; 260 utf_cnt = 1; 261 } 262 else if ((c & 0xf0U) == 0xe0U) 263 { 264 utf_char = c & 0x0fU; 265 utf_cnt = 2; 266 } 267 else if ((c & 0xf8U) == 0xf0U) 268 { 269 utf_char = c & 0x07U; 270 utf_cnt = 3; 271 } 272 else if ((c & 0xfcU) == 0xf8U) 273 { 274 utf_char = c & 0x03U; 275 utf_cnt = 4; 276 } 277 else if ((c & 0xfeU) == 0xfcU) 278 { 279 utf_char = c & 0x01U; 280 utf_cnt = 5; 281 } 282 else 283 goto error_out; 284 continue; 285 } else 286 /* Single byte UTF-8 character (most common) */ 287 utf_char = c; 288 } 289 290 /* Choose no compression if necessary */ 291 if (utf_char > max_val) 292 { 293 if ( 0xffU == max_val ) 294 { 295 max_val = 0xffffU; 296 ocu[0] = (uint8_t)0x10U; 297 goto try_again; 298 } 299 goto error_out; 300 } 301 302 if (max_val == 0xffffU) 303 { 304 ocu[++u_len] = (uint8_t)(utf_char >> 8); 305 } 306 ocu[++u_len] = (uint8_t)(utf_char & 0xffU); 307 } 308 309 310 if (utf_cnt) 311 { 312error_out: 313 printk(KERN_ERR "udf: bad UTF-8 character\n"); 314 return 0; 315 } 316 317 ocu[length - 1] = (uint8_t)u_len + 1; 318 return u_len + 1; 319} 320 321int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, struct ustr *ocu_i) 322{ 323 uint8_t *ocu; 324 uint32_t c; 325 uint8_t cmp_id, ocu_len; 326 int i; 327 328 ocu = ocu_i->u_name; 329 330 ocu_len = ocu_i->u_len; 331 cmp_id = ocu_i->u_cmpID; 332 utf_o->u_len = 0; 333 334 if (ocu_len == 0) 335 { 336 memset(utf_o, 0, sizeof(struct ustr)); 337 utf_o->u_cmpID = 0; 338 utf_o->u_len = 0; 339 return 0; 340 } 341 342 if ((cmp_id != 8) && (cmp_id != 16)) 343 { 344 printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name); 345 return 0; 346 } 347 348 for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN-3)) ;) 349 { 350 /* Expand OSTA compressed Unicode to Unicode */ 351 c = ocu[i++]; 352 if (cmp_id == 16) 353 c = (c << 8) | ocu[i++]; 354 355 utf_o->u_len += nls->uni2char(c, &utf_o->u_name[utf_o->u_len], 356 UDF_NAME_LEN - utf_o->u_len); 357 } 358 utf_o->u_cmpID=8; 359 360 return utf_o->u_len; 361} 362 363int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni, int length) 364{ 365 unsigned len, i, max_val; 366 uint16_t uni_char; 367 int uni_cnt; 368 int u_len = 0; 369 370 memset(ocu, 0, sizeof(dstring) * length); 371 ocu[0] = 8; 372 max_val = 0xffU; 373 374try_again: 375 uni_char = 0U; 376 uni_cnt = 0U; 377 for (i = 0U; i < uni->u_len; i++) 378 { 379 len = nls->char2uni(&uni->u_name[i], uni->u_len-i, &uni_char); 380 381 if (len == 2 && max_val == 0xff) 382 { 383 max_val = 0xffffU; 384 ocu[0] = (uint8_t)0x10U; 385 goto try_again; 386 } 387 388 if (max_val == 0xffffU) 389 { 390 ocu[++u_len] = (uint8_t)(uni_char >> 8); 391 i++; 392 } 393 ocu[++u_len] = (uint8_t)(uni_char & 0xffU); 394 } 395 396 ocu[length - 1] = (uint8_t)u_len + 1; 397 return u_len + 1; 398} 399 400int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname, int flen) 401{ 402 struct ustr filename, unifilename; 403 int len; 404 405 if (udf_build_ustr_exact(&unifilename, sname, flen)) 406 { 407 return 0; 408 } 409 410 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) 411 { 412 if (!udf_CS0toUTF8(&filename, &unifilename) ) 413 { 414 udf_debug("Failed in udf_get_filename: sname = %s\n", sname); 415 return 0; 416 } 417 } 418 else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) 419 { 420 if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, &filename, &unifilename) ) 421 { 422 udf_debug("Failed in udf_get_filename: sname = %s\n", sname); 423 return 0; 424 } 425 } 426 else 427 return 0; 428 429 if ((len = udf_translate_to_linux(dname, filename.u_name, filename.u_len, 430 unifilename.u_name, unifilename.u_len))) 431 { 432 return len; 433 } 434 return 0; 435} 436 437#define ILLEGAL_CHAR_MARK '_' 438#define EXT_MARK '.' 439#define CRC_MARK '#' 440#define EXT_SIZE 5 441 442int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName, int udfLen, uint8_t *fidName, int fidNameLen) 443{ 444 int index, newIndex = 0, needsCRC = 0; 445 int extIndex = 0, newExtIndex = 0, hasExt = 0; 446 unsigned short valueCRC; 447 uint8_t curr; 448 const uint8_t hexChar[] = "0123456789ABCDEF"; 449 450 if (udfName[0] == '.' && (udfLen == 1 || 451 (udfLen == 2 && udfName[1] == '.'))) 452 { 453 needsCRC = 1; 454 newIndex = udfLen; 455 memcpy(newName, udfName, udfLen); 456 } 457 else 458 { 459 for (index = 0; index < udfLen; index++) 460 { 461 curr = udfName[index]; 462 if (curr == '/' || curr == 0) 463 { 464 needsCRC = 1; 465 curr = ILLEGAL_CHAR_MARK; 466 while (index+1 < udfLen && (udfName[index+1] == '/' || 467 udfName[index+1] == 0)) 468 index++; 469 } 470 if (curr == EXT_MARK && (udfLen - index - 1) <= EXT_SIZE) 471 { 472 if (udfLen == index + 1) 473 hasExt = 0; 474 else 475 { 476 hasExt = 1; 477 extIndex = index; 478 newExtIndex = newIndex; 479 } 480 } 481 if (newIndex < 256) 482 newName[newIndex++] = curr; 483 else 484 needsCRC = 1; 485 } 486 } 487 if (needsCRC) 488 { 489 uint8_t ext[EXT_SIZE]; 490 int localExtIndex = 0; 491 492 if (hasExt) 493 { 494 int maxFilenameLen; 495 for(index = 0; index<EXT_SIZE && extIndex + index +1 < udfLen; 496 index++ ) 497 { 498 curr = udfName[extIndex + index + 1]; 499 500 if (curr == '/' || curr == 0) 501 { 502 needsCRC = 1; 503 curr = ILLEGAL_CHAR_MARK; 504 while(extIndex + index + 2 < udfLen && (index + 1 < EXT_SIZE 505 && (udfName[extIndex + index + 2] == '/' || 506 udfName[extIndex + index + 2] == 0))) 507 index++; 508 } 509 ext[localExtIndex++] = curr; 510 } 511 maxFilenameLen = 250 - localExtIndex; 512 if (newIndex > maxFilenameLen) 513 newIndex = maxFilenameLen; 514 else 515 newIndex = newExtIndex; 516 } 517 else if (newIndex > 250) 518 newIndex = 250; 519 newName[newIndex++] = CRC_MARK; 520 valueCRC = udf_crc(fidName, fidNameLen, 0); 521 newName[newIndex++] = hexChar[(valueCRC & 0xf000) >> 12]; 522 newName[newIndex++] = hexChar[(valueCRC & 0x0f00) >> 8]; 523 newName[newIndex++] = hexChar[(valueCRC & 0x00f0) >> 4]; 524 newName[newIndex++] = hexChar[(valueCRC & 0x000f)]; 525 526 if (hasExt) 527 { 528 newName[newIndex++] = EXT_MARK; 529 for (index = 0;index < localExtIndex ;index++ ) 530 newName[newIndex++] = ext[index]; 531 } 532 } 533 return newIndex; 534} 535