1/**
2 *******************************************************************************
3 * Copyright (C) 2007,2012 International Business Machines Corporation, Apple Inc.,*
4 * and others.  All Rights Reserved.                                           *
5 *******************************************************************************
6 */
7
8#define __STDC_LIMIT_MACROS 1
9#include "unicode/utypes.h"
10
11#if !UCONFIG_NO_BREAK_ITERATION && U_PLATFORM_IS_DARWIN_BASED
12
13#include "brkeng.h"
14#include "dictbe.h"
15#include "aaplbfct.h"
16#include "unicode/uscript.h"
17#include "unicode/uniset.h"
18#include "unicode/ucnv.h"
19#include "unicode/uchar.h"
20#include <limits.h>
21#include <unistd.h>
22#include <glob.h>
23#include <strings.h>
24#include <NSSystemDirectories.h>
25#include <sys/types.h>
26#include <sys/stat.h>
27#include <sys/mman.h>
28#include <fcntl.h>
29#include <time.h>
30#include <stdio.h>
31#include <stdint.h>
32// The following is now already included by platform.h (included indirectly by
33// utypes.h) if U_PLATFORM_IS_DARWIN_BASED but it doesn't hurt to re-include here
34#include <TargetConditionals.h>
35
36U_NAMESPACE_BEGIN
37
38/*
39 ******************************************************************
40 */
41
42AppleLanguageBreakFactory::AppleLanguageBreakFactory(UErrorCode &status)
43: ICULanguageBreakFactory(status)
44{
45}
46
47AppleLanguageBreakFactory::~AppleLanguageBreakFactory() {
48}
49
50#if !TARGET_OS_EMBEDDED
51#if 0
52// need to update loadDictionaryMatcherFor implementation below
53
54// Helper function that makes a length-delimited buffer look NUL-terminated
55static __attribute__((always_inline)) inline UChar nextUChar(const UChar *&p, ptrdiff_t &l) {
56	if (l > 0) {
57		l -= 1;
58		return *p++;
59	}
60	else {
61		return 0;
62	}
63}
64
65// Add a file's worth of words to the supplied mutable dictionary
66static void addDictFile(MutableTrieDictionary *to, const char *path) {
67	UErrorCode status = U_ZERO_ERROR;
68	off_t fileLength;
69	const char *dictRawData = (const char *) -1;
70	const UChar *dictData = NULL;
71	ptrdiff_t dictDataLength = 0;
72	UChar *dictBuffer = NULL;
73	const char *encoding = NULL;
74	int32_t		signatureLength = 0;
75
76	// Open the dictionary file
77	int dictFile = open(path, O_RDONLY, 0);
78	if (dictFile == -1) {
79		status = U_FILE_ACCESS_ERROR;
80	}
81
82	// Determine its length
83	if (U_SUCCESS(status)) {
84		fileLength = lseek(dictFile, 0, SEEK_END);
85		(void) lseek(dictFile, 0, SEEK_SET);
86		if (fileLength < 0 || fileLength > PTRDIFF_MAX) {
87			status = U_FILE_ACCESS_ERROR;
88		}
89	}
90
91	// Map it
92	if (U_SUCCESS(status)) {
93		dictRawData = (const char *) mmap(0, (size_t) fileLength, PROT_READ, MAP_SHARED, dictFile, 0);
94		if ((intptr_t)dictRawData == -1) {
95			status = U_FILE_ACCESS_ERROR;
96		}
97	}
98
99	// No longer need the file descriptor open
100	if (dictFile != -1) {
101		(void) close(dictFile);
102	}
103
104	// Look for a Unicode signature
105	if (U_SUCCESS(status)) {
106		encoding = ucnv_detectUnicodeSignature(dictRawData, fileLength, &signatureLength, &status);
107	}
108
109	// If necessary, convert the data to UChars
110	if (U_SUCCESS(status) && encoding != NULL) {
111		UConverter *conv = ucnv_open(encoding, &status);
112		// Preflight to get buffer size
113		uint32_t destCap = ucnv_toUChars(conv, NULL, 0, dictRawData, fileLength, &status);
114		if (status == U_BUFFER_OVERFLOW_ERROR) {
115			status = U_ZERO_ERROR;
116		}
117		if (U_SUCCESS(status)) {
118			dictBuffer = new UChar[destCap+1];
119		}
120		(void) ucnv_toUChars(conv, dictBuffer, destCap+1, dictRawData, fileLength, &status);
121		dictData = dictBuffer;
122		dictDataLength = destCap;
123		if (U_SUCCESS(status) && dictData[0] == 0xFEFF) {	// BOM? Skip it
124			dictData += 1;
125			dictDataLength -= 1;
126		}
127
128		ucnv_close(conv);
129	}
130
131	// If it didn't need converting, just assume it's native-endian UTF-16, no BOM
132	if (U_SUCCESS(status) && dictData == NULL) {
133		dictData = (const UChar *) dictRawData;
134		dictDataLength = fileLength/sizeof(UChar);
135	}
136
137	// OK, we now have a pointer to native-endian UTF-16. Process it as one word per line,
138	// stopping at the first space.
139	if (U_SUCCESS(status)) {
140		UnicodeSet breaks(UNICODE_STRING_SIMPLE("[[:lb=BK:][:lb=CR:][:lb=LF:][:lb=NL:]]"), status);
141		const UChar *candidate = dictData;
142		int32_t length = 0;
143		UChar uc = nextUChar(dictData, dictDataLength);
144		while (U_SUCCESS(status) && uc) {
145			while (uc && !u_isspace(uc)) {
146				length += 1;
147				uc = nextUChar(dictData, dictDataLength);
148			}
149
150			if (length > 0) {
151				to->addWord(candidate, length, status);
152			}
153
154			// Find beginning of next line
155			// 1. Skip non-line-break characters
156			while (uc && !breaks.contains(uc)) {
157				uc = nextUChar(dictData, dictDataLength);
158			}
159			// 2. Skip line break characters
160			while (uc && breaks.contains(uc)) {
161				uc = nextUChar(dictData, dictDataLength);
162			}
163
164			// Prepare for next line
165			candidate = dictData-1;
166			length = 0;
167		}
168	}
169
170	// Unmap the file if we mapped it
171	if ((intptr_t) dictRawData != -1) {
172		(void) munmap((void *)dictRawData, (size_t) fileLength);
173	}
174
175	// Delete any temporary buffer
176	delete [] dictBuffer;
177}
178
// Byte-order tag: empty when U_IS_BIG_ENDIAN is set, ".le" otherwise.
static const char	sArchType[] =
#if U_IS_BIG_ENDIAN
	"";
#else
	".le";	// little endian
#endif
184
185#endif
186#endif
187
/*
In ICU 50, ICULanguageBreakFactory changed from
  virtual const CompactTrieDictionary *loadDictionaryFor(UScriptCode script, int32_t breakType);
to
  virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script, int32_t breakType);
and CompactTrieDictionary no longer exists. A new implementation of the function
below still needs to be worked out.
*/
196
197DictionaryMatcher *
198AppleLanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t breakType) {
199	DictionaryMatcher *icuDictMatcher = ICULanguageBreakFactory::loadDictionaryMatcherFor(script, breakType);
200#if !TARGET_OS_EMBEDDED
201#if 0
202// need to update loadDictionaryMatcherFor implementation below
203	// We only look for a user dictionary if there is actually an ICU dictionary
204	if (icuDictMatcher != NULL) {
205		UErrorCode status = U_ZERO_ERROR;
206		const char *scriptName = uscript_getName(script);
207		char path[256];			// PATH_MAX is overkill in this case
208		char cachePath[128];
209		char cacheTargetPath[256];
210		glob_t dirGlob;
211		glob_t fileGlob;
212		struct stat cacheStat;
213		struct stat dictStat;
214		bool cacheGood = true;
215		int globFlags = (GLOB_NOESCAPE|GLOB_NOSORT|GLOB_TILDE);
216		const CompactTrieDictionary *cacheDict = NULL;
217
218		// Iterate the dictionary directories and accumulate in dirGlob
219		NSSearchPathEnumerationState state = NSStartSearchPathEnumeration(NSLibraryDirectory, (NSSearchPathDomainMask) (NSUserDomainMask|NSLocalDomainMask|NSNetworkDomainMask));
220		while ((state = NSGetNextSearchPathEnumeration(state, path)) != 0) {
221			// First get the directory itself. We should never overflow, but use strlcat anyway
222			// to avoid a crash if we do.
223			strlcat(path, "/Dictionaries", sizeof(path));
224			if (!glob(path, globFlags, NULL, &dirGlob)) {
225				globFlags |= GLOB_APPEND;
226			}
227		}
228
229		// If there are no Dictionaries directories, ignore any cache file and return the ICU
230		// standard dictionary
231		// TODO: Delete the cache?
232		if (dirGlob.gl_pathc == 0) {
233			globfree(&dirGlob);
234			return icuDictMatcher;
235		}
236
237		// See if there is a cache file already; get its mod time
238		// TODO: should we be using geteuid() here instead of getuid()?
239		state = NSStartSearchPathEnumeration(NSCachesDirectory, NSLocalDomainMask);
240		state = NSGetNextSearchPathEnumeration(state, cachePath);	// Just use first one
241		// Create the cache file name. We should never overflow, but use snprintf to avoid a crash
242		// if we do.
243		snprintf(cacheTargetPath, sizeof(cacheTargetPath), "%s/com.apple.ICUUserDictionaryCache%s.%s.%d", cachePath, sArchType, scriptName, getuid());
244		if (stat(cacheTargetPath, &cacheStat) || cacheStat.st_mode != (S_IFREG|S_IRUSR|S_IWUSR)) {
245			cacheGood = false;		// No file or bad permissions or type
246		}
247
248		// Stat the dictionary folders, and glob the dictionary files
249		globFlags &= ~GLOB_APPEND;
250		char **pathsp = dirGlob.gl_pathv;
251		const char *dictpath;
252		while ((dictpath = *pathsp++) != NULL) {
253			// Stat the directory -- ignore if stat failure
254			if (!stat(dictpath, &dictStat)) {
255				// Glob the dictionaries in the directory
256				snprintf(path, sizeof(path), "%s/*-%s.txt", dictpath, scriptName);
257				if (!glob(path, globFlags, NULL, &fileGlob)) {
258					globFlags |= GLOB_APPEND;
259				}
260				// If the directory has been modified after the cache file, we need to rebuild;
261				// a dictionary might have been deleted.
262				if (cacheGood && (dictStat.st_mtimespec.tv_sec > cacheStat.st_mtimespec.tv_sec || (dictStat.st_mtimespec.tv_sec == cacheStat.st_mtimespec.tv_sec && dictStat.st_mtimespec.tv_nsec > cacheStat.st_mtimespec.tv_nsec))) {
263					cacheGood = false;
264				}
265			}
266		}
267
268		// No longer need the directory glob
269		globfree(&dirGlob);
270
271		// If there are no dictionaries, ignore the cache file and return the ICU dictionary
272		// TODO: Delete the cache?
273		if (fileGlob.gl_pathc == 0) {
274			globfree(&fileGlob);
275			return icuDictMatcher;
276		}
277
278		// Now compare the last modified stamp for the cache against all the dictionaries
279		pathsp = fileGlob.gl_pathv;
280		while (cacheGood && (dictpath = *pathsp++)) {
281			// Stat the dictionary -- ignore if stat failure
282			if (!stat(dictpath, &dictStat) && (dictStat.st_mtimespec.tv_sec > cacheStat.st_mtimespec.tv_sec || (dictStat.st_mtimespec.tv_sec == cacheStat.st_mtimespec.tv_sec && dictStat.st_mtimespec.tv_nsec > cacheStat.st_mtimespec.tv_nsec))) {
283				cacheGood = false;
284			}
285		}
286
287		// Do we need to build the dictionary cache?
288		if (!cacheGood) {
289			// Create a mutable dictionary from the ICU dictionary
290			MutableTrieDictionary *sum = icuDictMatcher->cloneMutable(status);
291			pathsp = fileGlob.gl_pathv;
292			while (U_SUCCESS(status) && (dictpath = *pathsp++)) {
293				// Add the contents of a file to the sum
294				addDictFile(sum, dictpath);
295			}
296
297			// Create a compact (read-only) dictionary
298			CompactTrieDictionary compact(*sum, status);
299			delete sum;
300
301			if (U_SUCCESS(status)) {
302				// Open a temp file to write out the cache
303				strlcat(cachePath, "/temp.XXXXXXXXXX", sizeof(cachePath));
304				int temp = mkstemp(cachePath);
305				if (temp == -1) {
306					status = U_FILE_ACCESS_ERROR;
307				}
308				size_t dictSize = compact.dataSize();
309				if (U_SUCCESS(status) && write(temp, compact.data(), dictSize) != dictSize) {
310					status = U_FILE_ACCESS_ERROR;
311				}
312				// Rename the temp file to the cache. Note that race conditions here are
313				// fine, as the file system operations are atomic. If an outdated version wins
314				// over a newer version, it will get rebuilt at the next app launch due to the
315				// modification time checks above. We don't care that any given app launch gets
316				// the most up-to-date cache (impossible since we can't lock all the Dictionaries
317				// directories), only that the cache (eventually) reflects the current state of
318				// any user dictionaries. That will happen on the next app launch after changes
319				// to the user dictionaries quiesce.
320				if (U_SUCCESS(status)) {
321					if (rename(cachePath, cacheTargetPath)) {
322						status = U_FILE_ACCESS_ERROR;
323						(void) unlink(cachePath);	// Clean up the temp file
324					}
325				}
326				if (temp != -1) {
327					close(temp);
328				}
329			}
330		}
331
332		// Done with dictionary paths; release memory allocated by glob()
333		globfree(&fileGlob);
334
335		// Map the cache and build the dictionary
336		if (U_SUCCESS(status)) {
337			int cache = open(cacheTargetPath, O_RDONLY, 0);
338			off_t length;
339			const void *cacheData = (const void *) -1;
340			if (cache == -1) {
341				status = U_FILE_ACCESS_ERROR;
342			}
343			if (U_SUCCESS(status)) {
344				length = lseek(cache, 0, SEEK_END);
345				(void) lseek(cache, 0, SEEK_SET);
346				if (length < 0 || length > PTRDIFF_MAX) {
347					status = U_FILE_ACCESS_ERROR;
348				}
349			}
350
351			// Map the cache. Note: it is left mapped until process exit. This is the normal
352			// behavior anyway, so it shouldn't be an issue.
353			if (U_SUCCESS(status)) {
354				cacheData = mmap(0, (size_t) length, PROT_READ, MAP_SHARED, cache, 0);
355				if ((intptr_t)cacheData == -1) {
356					status = U_FILE_ACCESS_ERROR;
357				}
358			}
359			// We can close the cache file now that it's mapped (or not)
360			if (cache != -1) {
361				(void) close(cache);
362			}
363			// If all was successful, try to create the dictionary. The constructor will
364			// check the magic number for us.
365			if (U_SUCCESS(status)) {
366				cacheDict = new CompactTrieDictionary(cacheData, status);
367			}
368			if (U_FAILURE(status) && (intptr_t)cacheData != -1) {
369				// Clean up the mmap
370				(void) munmap((void *)cacheData, (size_t) length);
371			}
372		}
373
374		// If we were successful, free the ICU dictionary and return ours
375		if (U_SUCCESS(status)) {
376			delete icuDictMatcher;
377			return cacheDict;
378		}
379		else {
380			delete cacheDict;
381		}
382	}
383#endif
384#endif
385	return icuDictMatcher;
386}
387
388U_NAMESPACE_END
389
390#endif /* #if !UCONFIG_NO_BREAK_ITERATION && U_PLATFORM_IS_DARWIN_BASED */
391