1/** 2 ******************************************************************************* 3 * Copyright (C) 2007,2012 International Business Machines Corporation, Apple Inc.,* 4 * and others. All Rights Reserved. * 5 ******************************************************************************* 6 */ 7 8#define __STDC_LIMIT_MACROS 1 9#include "unicode/utypes.h" 10 11#if !UCONFIG_NO_BREAK_ITERATION && U_PLATFORM_IS_DARWIN_BASED 12 13#include "brkeng.h" 14#include "dictbe.h" 15#include "aaplbfct.h" 16#include "unicode/uscript.h" 17#include "unicode/uniset.h" 18#include "unicode/ucnv.h" 19#include "unicode/uchar.h" 20#include <limits.h> 21#include <unistd.h> 22#include <glob.h> 23#include <strings.h> 24#include <NSSystemDirectories.h> 25#include <sys/types.h> 26#include <sys/stat.h> 27#include <sys/mman.h> 28#include <fcntl.h> 29#include <time.h> 30#include <stdio.h> 31#include <stdint.h> 32// The following is now already included by platform.h (included indirectly by 33// utypes.h) if U_PLATFORM_IS_DARWIN_BASED but it doesn't hurt to re-include here 34#include <TargetConditionals.h> 35 36U_NAMESPACE_BEGIN 37 38/* 39 ****************************************************************** 40 */ 41 42AppleLanguageBreakFactory::AppleLanguageBreakFactory(UErrorCode &status) 43: ICULanguageBreakFactory(status) 44{ 45} 46 47AppleLanguageBreakFactory::~AppleLanguageBreakFactory() { 48} 49 50#if !TARGET_OS_EMBEDDED 51#if 0 52// need to update loadDictionaryMatcherFor implementation below 53 54// Helper function that makes a length-delimited buffer look NUL-terminated 55static __attribute__((always_inline)) inline UChar nextUChar(const UChar *&p, ptrdiff_t &l) { 56 if (l > 0) { 57 l -= 1; 58 return *p++; 59 } 60 else { 61 return 0; 62 } 63} 64 65// Add a file's worth of words to the supplied mutable dictionary 66static void addDictFile(MutableTrieDictionary *to, const char *path) { 67 UErrorCode status = U_ZERO_ERROR; 68 off_t fileLength; 69 const char *dictRawData = (const char *) -1; 70 const UChar *dictData = NULL; 71 ptrdiff_t dictDataLength = 0; 72 UChar *dictBuffer = NULL; 73 const char *encoding = NULL; 74 int32_t signatureLength = 0; 75 76 // Open the dictionary file 77 int dictFile = open(path, O_RDONLY, 0); 78 if (dictFile == -1) { 79 status = U_FILE_ACCESS_ERROR; 80 } 81 82 // Determine its length 83 if (U_SUCCESS(status)) { 84 fileLength = lseek(dictFile, 0, SEEK_END); 85 (void) lseek(dictFile, 0, SEEK_SET); 86 if (fileLength < 0 || fileLength > PTRDIFF_MAX) { 87 status = U_FILE_ACCESS_ERROR; 88 } 89 } 90 91 // Map it 92 if (U_SUCCESS(status)) { 93 dictRawData = (const char *) mmap(0, (size_t) fileLength, PROT_READ, MAP_SHARED, dictFile, 0); 94 if ((intptr_t)dictRawData == -1) { 95 status = U_FILE_ACCESS_ERROR; 96 } 97 } 98 99 // No longer need the file descriptor open 100 if (dictFile != -1) { 101 (void) close(dictFile); 102 } 103 104 // Look for a Unicode signature 105 if (U_SUCCESS(status)) { 106 encoding = ucnv_detectUnicodeSignature(dictRawData, fileLength, &signatureLength, &status); 107 } 108 109 // If necessary, convert the data to UChars 110 if (U_SUCCESS(status) && encoding != NULL) { 111 UConverter *conv = ucnv_open(encoding, &status); 112 // Preflight to get buffer size 113 uint32_t destCap = ucnv_toUChars(conv, NULL, 0, dictRawData, fileLength, &status); 114 if (status == U_BUFFER_OVERFLOW_ERROR) { 115 status = U_ZERO_ERROR; 116 } 117 if (U_SUCCESS(status)) { 118 dictBuffer = new UChar[destCap+1]; 119 } 120 (void) ucnv_toUChars(conv, dictBuffer, destCap+1, dictRawData, fileLength, &status); 121 dictData = dictBuffer; 122 dictDataLength = destCap; 123 if (U_SUCCESS(status) && dictData[0] == 0xFEFF) { // BOM? Skip it 124 dictData += 1; 125 dictDataLength -= 1; 126 } 127 128 ucnv_close(conv); 129 } 130 131 // If it didn't need converting, just assume it's native-endian UTF-16, no BOM 132 if (U_SUCCESS(status) && dictData == NULL) { 133 dictData = (const UChar *) dictRawData; 134 dictDataLength = fileLength/sizeof(UChar); 135 } 136 137 // OK, we now have a pointer to native-endian UTF-16. Process it as one word per line, 138 // stopping at the first space. 139 if (U_SUCCESS(status)) { 140 UnicodeSet breaks(UNICODE_STRING_SIMPLE("[[:lb=BK:][:lb=CR:][:lb=LF:][:lb=NL:]]"), status); 141 const UChar *candidate = dictData; 142 int32_t length = 0; 143 UChar uc = nextUChar(dictData, dictDataLength); 144 while (U_SUCCESS(status) && uc) { 145 while (uc && !u_isspace(uc)) { 146 length += 1; 147 uc = nextUChar(dictData, dictDataLength); 148 } 149 150 if (length > 0) { 151 to->addWord(candidate, length, status); 152 } 153 154 // Find beginning of next line 155 // 1. Skip non-line-break characters 156 while (uc && !breaks.contains(uc)) { 157 uc = nextUChar(dictData, dictDataLength); 158 } 159 // 2. Skip line break characters 160 while (uc && breaks.contains(uc)) { 161 uc = nextUChar(dictData, dictDataLength); 162 } 163 164 // Prepare for next line 165 candidate = dictData-1; 166 length = 0; 167 } 168 } 169 170 // Unmap the file if we mapped it 171 if ((intptr_t) dictRawData != -1) { 172 (void) munmap((void *)dictRawData, (size_t) fileLength); 173 } 174 175 // Delete any temporary buffer 176 delete [] dictBuffer; 177} 178 179#if U_IS_BIG_ENDIAN 180 static const char sArchType[] = ""; 181#else 182 static const char sArchType[] = ".le"; // little endian 183#endif 184 185#endif 186#endif 187 188/* 189In ICU50, 190ICULanguageBreakFactory changes from 191 virtual const CompactTrieDictionary *loadDictionaryFor(UScriptCode script, int32_t breakType); 192to 193 virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script, int32_t breakType); 194and CompactTrieDictionary no longer exists. Need to work out new implementation below. 195*/ 196 197DictionaryMatcher * 198AppleLanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t breakType) { 199 DictionaryMatcher *icuDictMatcher = ICULanguageBreakFactory::loadDictionaryMatcherFor(script, breakType); 200#if !TARGET_OS_EMBEDDED 201#if 0 202// need to update loadDictionaryMatcherFor implementation below 203 // We only look for a user dictionary if there is actually an ICU dictionary 204 if (icuDictMatcher != NULL) { 205 UErrorCode status = U_ZERO_ERROR; 206 const char *scriptName = uscript_getName(script); 207 char path[256]; // PATH_MAX is overkill in this case 208 char cachePath[128]; 209 char cacheTargetPath[256]; 210 glob_t dirGlob; 211 glob_t fileGlob; 212 struct stat cacheStat; 213 struct stat dictStat; 214 bool cacheGood = true; 215 int globFlags = (GLOB_NOESCAPE|GLOB_NOSORT|GLOB_TILDE); 216 const CompactTrieDictionary *cacheDict = NULL; 217 218 // Iterate the dictionary directories and accumulate in dirGlob 219 NSSearchPathEnumerationState state = NSStartSearchPathEnumeration(NSLibraryDirectory, (NSSearchPathDomainMask) (NSUserDomainMask|NSLocalDomainMask|NSNetworkDomainMask)); 220 while ((state = NSGetNextSearchPathEnumeration(state, path)) != 0) { 221 // First get the directory itself. We should never overflow, but use strlcat anyway 222 // to avoid a crash if we do. 223 strlcat(path, "/Dictionaries", sizeof(path)); 224 if (!glob(path, globFlags, NULL, &dirGlob)) { 225 globFlags |= GLOB_APPEND; 226 } 227 } 228 229 // If there are no Dictionaries directories, ignore any cache file and return the ICU 230 // standard dictionary 231 // TODO: Delete the cache? 232 if (dirGlob.gl_pathc == 0) { 233 globfree(&dirGlob); 234 return icuDictMatcher; 235 } 236 237 // See if there is a cache file already; get its mod time 238 // TODO: should we be using geteuid() here instead of getuid()? 239 state = NSStartSearchPathEnumeration(NSCachesDirectory, NSLocalDomainMask); 240 state = NSGetNextSearchPathEnumeration(state, cachePath); // Just use first one 241 // Create the cache file name. We should never overflow, but use snprintf to avoid a crash 242 // if we do. 243 snprintf(cacheTargetPath, sizeof(cacheTargetPath), "%s/com.apple.ICUUserDictionaryCache%s.%s.%d", cachePath, sArchType, scriptName, getuid()); 244 if (stat(cacheTargetPath, &cacheStat) || cacheStat.st_mode != (S_IFREG|S_IRUSR|S_IWUSR)) { 245 cacheGood = false; // No file or bad permissions or type 246 } 247 248 // Stat the dictionary folders, and glob the dictionary files 249 globFlags &= ~GLOB_APPEND; 250 char **pathsp = dirGlob.gl_pathv; 251 const char *dictpath; 252 while ((dictpath = *pathsp++) != NULL) { 253 // Stat the directory -- ignore if stat failure 254 if (!stat(dictpath, &dictStat)) { 255 // Glob the dictionaries in the directory 256 snprintf(path, sizeof(path), "%s/*-%s.txt", dictpath, scriptName); 257 if (!glob(path, globFlags, NULL, &fileGlob)) { 258 globFlags |= GLOB_APPEND; 259 } 260 // If the directory has been modified after the cache file, we need to rebuild; 261 // a dictionary might have been deleted. 262 if (cacheGood && (dictStat.st_mtimespec.tv_sec > cacheStat.st_mtimespec.tv_sec || (dictStat.st_mtimespec.tv_sec == cacheStat.st_mtimespec.tv_sec && dictStat.st_mtimespec.tv_nsec > cacheStat.st_mtimespec.tv_nsec))) { 263 cacheGood = false; 264 } 265 } 266 } 267 268 // No longer need the directory glob 269 globfree(&dirGlob); 270 271 // If there are no dictionaries, ignore the cache file and return the ICU dictionary 272 // TODO: Delete the cache? 273 if (fileGlob.gl_pathc == 0) { 274 globfree(&fileGlob); 275 return icuDictMatcher; 276 } 277 278 // Now compare the last modified stamp for the cache against all the dictionaries 279 pathsp = fileGlob.gl_pathv; 280 while (cacheGood && (dictpath = *pathsp++)) { 281 // Stat the dictionary -- ignore if stat failure 282 if (!stat(dictpath, &dictStat) && (dictStat.st_mtimespec.tv_sec > cacheStat.st_mtimespec.tv_sec || (dictStat.st_mtimespec.tv_sec == cacheStat.st_mtimespec.tv_sec && dictStat.st_mtimespec.tv_nsec > cacheStat.st_mtimespec.tv_nsec))) { 283 cacheGood = false; 284 } 285 } 286 287 // Do we need to build the dictionary cache? 288 if (!cacheGood) { 289 // Create a mutable dictionary from the ICU dictionary 290 MutableTrieDictionary *sum = icuDictMatcher->cloneMutable(status); 291 pathsp = fileGlob.gl_pathv; 292 while (U_SUCCESS(status) && (dictpath = *pathsp++)) { 293 // Add the contents of a file to the sum 294 addDictFile(sum, dictpath); 295 } 296 297 // Create a compact (read-only) dictionary 298 CompactTrieDictionary compact(*sum, status); 299 delete sum; 300 301 if (U_SUCCESS(status)) { 302 // Open a temp file to write out the cache 303 strlcat(cachePath, "/temp.XXXXXXXXXX", sizeof(cachePath)); 304 int temp = mkstemp(cachePath); 305 if (temp == -1) { 306 status = U_FILE_ACCESS_ERROR; 307 } 308 size_t dictSize = compact.dataSize(); 309 if (U_SUCCESS(status) && write(temp, compact.data(), dictSize) != dictSize) { 310 status = U_FILE_ACCESS_ERROR; 311 } 312 // Rename the temp file to the cache. Note that race conditions here are 313 // fine, as the file system operations are atomic. If an outdated version wins 314 // over a newer version, it will get rebuilt at the next app launch due to the 315 // modification time checks above. We don't care that any given app launch gets 316 // the most up-to-date cache (impossible since we can't lock all the Dictionaries 317 // directories), only that the cache (eventually) reflects the current state of 318 // any user dictionaries. That will happen on the next app launch after changes 319 // to the user dictionaries quiesce. 320 if (U_SUCCESS(status)) { 321 if (rename(cachePath, cacheTargetPath)) { 322 status = U_FILE_ACCESS_ERROR; 323 (void) unlink(cachePath); // Clean up the temp file 324 } 325 } 326 if (temp != -1) { 327 close(temp); 328 } 329 } 330 } 331 332 // Done with dictionary paths; release memory allocated by glob() 333 globfree(&fileGlob); 334 335 // Map the cache and build the dictionary 336 if (U_SUCCESS(status)) { 337 int cache = open(cacheTargetPath, O_RDONLY, 0); 338 off_t length; 339 const void *cacheData = (const void *) -1; 340 if (cache == -1) { 341 status = U_FILE_ACCESS_ERROR; 342 } 343 if (U_SUCCESS(status)) { 344 length = lseek(cache, 0, SEEK_END); 345 (void) lseek(cache, 0, SEEK_SET); 346 if (length < 0 || length > PTRDIFF_MAX) { 347 status = U_FILE_ACCESS_ERROR; 348 } 349 } 350 351 // Map the cache. Note: it is left mapped until process exit. This is the normal 352 // behavior anyway, so it shouldn't be an issue. 353 if (U_SUCCESS(status)) { 354 cacheData = mmap(0, (size_t) length, PROT_READ, MAP_SHARED, cache, 0); 355 if ((intptr_t)cacheData == -1) { 356 status = U_FILE_ACCESS_ERROR; 357 } 358 } 359 // We can close the cache file now that it's mapped (or not) 360 if (cache != -1) { 361 (void) close(cache); 362 } 363 // If all was successful, try to create the dictionary. The constructor will 364 // check the magic number for us. 365 if (U_SUCCESS(status)) { 366 cacheDict = new CompactTrieDictionary(cacheData, status); 367 } 368 if (U_FAILURE(status) && (intptr_t)cacheData != -1) { 369 // Clean up the mmap 370 (void) munmap((void *)cacheData, (size_t) length); 371 } 372 } 373 374 // If we were successful, free the ICU dictionary and return ours 375 if (U_SUCCESS(status)) { 376 delete icuDictMatcher; 377 return cacheDict; 378 } 379 else { 380 delete cacheDict; 381 } 382 } 383#endif 384#endif 385 return icuDictMatcher; 386} 387 388U_NAMESPACE_END 389 390#endif /* #if !UCONFIG_NO_BREAK_ITERATION && U_PLATFORM_IS_DARWIN_BASED */ 391