1/* 2****************************************************************************** 3* Copyright (C) 2006-2008 Apple Inc. All Rights Reserved. 4****************************************************************************** 5*/ 6 7#ifndef URBTOK_H 8#define URBTOK_H 9 10#include "unicode/utypes.h" 11 12#if !UCONFIG_NO_BREAK_ITERATION 13 14#include "unicode/utext.h" 15#include "unicode/ubrk.h" 16#include "unicode/parseerr.h" 17 18 19typedef struct RuleBasedTokenRange { 20 signed long location; 21 signed long length; 22} RuleBasedTokenRange; 23 24/** 25 * Open a new UBreakIterator for tokenizing text using specified breaking rules. 26 * The rule syntax is ... (TBD) 27 * @param rules A set of rules specifying the text breaking conventions. 28 * @param rulesLength The number of characters in rules, or -1 if null-terminated. 29 * @param parseErr Receives position and context information for any syntax errors 30 * detected while parsing the rules. 31 * @param status A UErrorCode to receive any errors. 32 * @return A UBreakIterator for the specified rules. 33 * @see ubrk_open 34 * @internal 35 */ 36U_INTERNAL UBreakIterator* U_EXPORT2 37urbtok_openRules(const UChar *rules, 38 int32_t rulesLength, 39 UParseError *parseErr, 40 UErrorCode *status); 41 42/** 43 * Open a new UBreakIterator for tokenizing text using specified breaking rules. 44 * @param rules A set of rules specifying the text breaking conventions. The binary rules 45 * must be at least 32-bit aligned. Note: This version makes a copy of the 46 * rules, so after calling this function the caller can close or release 47 * the rules that were passed to this function. The copy created by this 48 * call will be freed when ubrk_close() is called on the UBreakIterator*. 49 * @param status A UErrorCode to receive any errors. 50 * @return A UBreakIterator for the specified rules. 51 * @see ubrk_open 52 * @internal 53 */ 54U_INTERNAL UBreakIterator* U_EXPORT2 55urbtok_openBinaryRules(const uint8_t *rules, 56 UErrorCode *status); 57 58/** 59 * Open a new UBreakIterator for tokenizing text using specified breaking rules. 60 * @param rules A set of rules specifying the text breaking conventions. The binary rules 61 * must be at least 32-bit aligned. Note: This version does NOT make a copy 62 * of the rules, so after calling this function the caller must not close or 63 * release the rules passed to this function until after they are finished 64 * with this UBreakIterator* (and any others created using the same rules) 65 * and have called ubrk_close() to close the UBreakIterator* (and any others 66 * using the same rules). 67 * @param status A UErrorCode to receive any errors. 68 * @return A UBreakIterator for the specified rules. 69 * @see ubrk_open 70 * @internal 71 */ 72U_INTERNAL UBreakIterator* U_EXPORT2 73urbtok_openBinaryRulesNoCopy(const uint8_t *rules, 74 UErrorCode *status); 75 76/** 77 * Get the (native-endian) binary break rules for this tokenizer. 78 * @param bi The tokenizer to use. 79 * @param buffer The output buffer for the rules. You can pass 0 to get the required size. 80 * @param buffSize The size of the output buffer. 81 * @param status A UErrorCode to receive any errors. 82 * @return The actual size of the binary rules, whether they fit the buffer or not. 83 * @internal 84 */ 85U_INTERNAL uint32_t U_EXPORT2 86urbtok_getBinaryRules(UBreakIterator *bi, 87 uint8_t *buffer, 88 uint32_t buffSize, 89 UErrorCode *status); 90 91/** 92 * Tokenize text using a rule-based tokenizer. 93 * @param bi The tokenizer to use. 94 * @param maxTokens The maximum number of tokens to return. 95 * @param outTokens An array of RuleBasedTokenRange to fill in with the tokens. 96 * @param outTokenFlags An (optional) array of uint32_t to fill in with token flags. 97 * @return The number of tokens returned, 0 if done. 98 * @internal 99 */ 100U_INTERNAL int32_t U_EXPORT2 101urbtok_tokenize(UBreakIterator *bi, 102 int32_t maxTokens, 103 RuleBasedTokenRange *outTokens, 104 unsigned long *outTokenFlags); 105 106/** 107 * Swap the endianness of a set of binary break rules. 108 * @param rules A set of rules which need swapping. 109 * @param buffer The output buffer for the swapped rules, which must be the same 110 * size as the input rules buffer. 111 * @param inIsBigEndian UBool indicating whether the input is big-endian 112 * @param outIsBigEndian UBool indicating whether the output should be big-endian 113 * @param status A UErrorCode to receive any errors. 114 * @internal 115 */ 116U_INTERNAL void U_EXPORT2 117urbtok_swapBinaryRules(const uint8_t *rules, 118 uint8_t *buffer, 119 UBool inIsBigEndian, 120 UBool outIsBigEndian, 121 UErrorCode *status); 122 123 124#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 125 126#endif 127