Lines Matching defs:tokenizer

8516 **   Tokenize text using the tokenizer belonging to the FTS5 table.
8691 ** Applications may also register custom tokenizer types. A tokenizer
8698 ** This function is used to allocate and initialize a tokenizer instance.
8699 ** A tokenizer instance is required to actually tokenize text.
8705 ** containing the tokenizer arguments, if any, specified following the
8706 ** tokenizer name as part of the CREATE VIRTUAL TABLE statement used
8710 ** should be set to point to the new tokenizer handle and SQLITE_OK
8716 ** This function is invoked to delete a tokenizer handle previously
8731 ** or removed from the FTS table. The tokenizer is being invoked to
8736 ** against the FTS index. The tokenizer is being called to tokenize
8742 ** returned by the tokenizer will be treated as a token prefix.
8744 ** <li> <b>FTS5_TOKENIZE_AUX</b> - The tokenizer is being invoked to
8759 ** normally be set to 0. The exception is if the tokenizer supports
8786 ** In the above example, this means that the tokenizer returns the
8791 ** the tokenizer substitutes "first" for "1st" and the query works
8795 ** In this case, when tokenizing query text, the tokenizer may
8803 ** the tokenizer offers both "1st" and "first" as synonyms for the
8815 ** Using this method, when tokenizing document text, the tokenizer
8821 ** This way, even if the tokenizer does not provide synonyms
8831 ** when parsing the document "I won first place", a tokenizer that supports
8852 ** token "first" is substituted for "1st" by the tokenizer, then the query:
8857 ** will not match documents that contain the token "1st" (as the tokenizer
8868 ** token "1st", but not "first" (assuming the tokenizer is not able to
8875 ** When using methods (2) or (3), it is important that the tokenizer only
8906 /* Flags that may be passed by the tokenizer implementation back to FTS5
8921 /* Create a new tokenizer */
8930 /* Find an existing tokenizer */
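The fragments above (source lines 8691-8930) describe the FTS5 custom tokenizer interface: xCreate() allocates a tokenizer instance from the arguments given after the tokenizer name in the CREATE VIRTUAL TABLE statement, xDelete() frees it, and xTokenize() reports each token through a callback. A minimal sketch of such a tokenizer and its registration follows; the whitespace-splitting logic and every "ws"-prefixed name are illustrative assumptions, not part of the SQLite source.

#include <ctype.h>
#include "sqlite3.h"
#include "fts5.h"                 /* fts5_api, fts5_tokenizer, Fts5Tokenizer */

static int wsCreate(void *pCtx, const char **azArg, int nArg,
                    Fts5Tokenizer **ppOut){
  /* No per-instance state is needed for this sketch; allocate one byte so
  ** that wsDelete() has something to free. */
  *ppOut = (Fts5Tokenizer*)sqlite3_malloc(1);
  return *ppOut ? SQLITE_OK : SQLITE_NOMEM;
}

static void wsDelete(Fts5Tokenizer *pTok){
  sqlite3_free(pTok);
}

static int wsTokenize(
  Fts5Tokenizer *pTok,
  void *pCtx,
  int flags,                      /* FTS5_TOKENIZE_DOCUMENT, _QUERY, ... */
  const char *pText, int nText,
  int (*xToken)(void*, int, const char*, int, int, int)
){
  int i = 0;
  while( i<nText ){
    int iStart;
    while( i<nText && isspace((unsigned char)pText[i]) ) i++;
    iStart = i;
    while( i<nText && !isspace((unsigned char)pText[i]) ) i++;
    if( i>iStart ){
      int rc = xToken(pCtx, 0, &pText[iStart], i-iStart, iStart, i);
      if( rc!=SQLITE_OK ) return rc;
    }
  }
  return SQLITE_OK;
}

/* Register the tokenizer under the name "ws".  pApi is assumed to have been
** obtained from the connection via the documented "SELECT fts5(?1)"
** pointer-passing idiom. */
static int wsRegister(fts5_api *pApi){
  static fts5_tokenizer tok = { wsCreate, wsDelete, wsTokenize };
  return pApi->xCreateTokenizer(pApi, "ws", 0, &tok, 0);
}

Once registered, the tokenizer could be selected with tokenize = 'ws' in a CREATE VIRTUAL TABLE ... USING fts5(...) statement.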
131733 assert( TOKEN.z[0] ); /* The tokenizer always gives us a token */
131962 ** A tokenizer for SQL
132856 ** A tokenizer for SQL
132859 ** This code used to be part of the tokenizer.c source file. But by
137621 ** generated by the tokenizer. Note that POS_END and POS_COLUMN occur
137880 ** sqlite3_tokenizer_module is a singleton defining the tokenizer
137884 ** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
137887 ** sqlite3_tokenizer_cursor is generated by a tokenizer to generate
137900 ** Structures used by the tokenizer interface. When a new tokenizer
137906 ** the tokenizer clause of the CREATE VIRTUAL TABLE statement to the
137907 ** sqlite3_tokenizer_module.xCreate() function of the requested tokenizer
137909 ** sqlite3_tokenizer structure representing the specific tokenizer to
137910 ** be used for the fts3 table (customized by the tokenizer clause arguments).
137930 ** Create a new tokenizer. The values in the argv[] array are the
137931 ** arguments passed to the "tokenizer" clause of the CREATE VIRTUAL
137935 ** CREATE .. USING fts3( ... , tokenizer <tokenizer-name> arg1 arg2)
137942 ** to point at the newly created tokenizer structure. The generic
137949 sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */
137953 ** Destroy an existing tokenizer. The fts3 module calls this method
137959 ** Create a tokenizer cursor to tokenize an input buffer. The caller
137966 sqlite3_tokenizer_cursor **ppCursor /* OUT: Created tokenizer cursor */
137970 ** Destroy an existing tokenizer cursor. The fts3 module calls this
137976 ** Retrieve the next token from the tokenizer cursor pCursor. This
137991 ** The buffer *ppToken is set to point at is managed by the tokenizer
138012 ** Configure the language id of a tokenizer cursor.
138018 const sqlite3_tokenizer_module *pModule; /* The module for this tokenizer */
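The fts3 lines above (137880-138018) refer to the legacy tokenizer interface declared in fts3_tokenizer.h. Paraphrased, the three structures named there fit together roughly as follows (comments abridged; the header itself is authoritative):

typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
typedef struct sqlite3_tokenizer sqlite3_tokenizer;
typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;

struct sqlite3_tokenizer_module {
  int iVersion;                         /* 0 or 1; 1 adds xLanguageid() */
  int (*xCreate)(int argc, const char *const*argv,
                 sqlite3_tokenizer **ppTokenizer);
  int (*xDestroy)(sqlite3_tokenizer *pTokenizer);
  int (*xOpen)(sqlite3_tokenizer *pTokenizer, const char *pInput, int nBytes,
               sqlite3_tokenizer_cursor **ppCursor);
  int (*xClose)(sqlite3_tokenizer_cursor *pCursor);
  int (*xNext)(sqlite3_tokenizer_cursor *pCursor, const char **ppToken,
               int *pnBytes, int *piStartOffset, int *piEndOffset,
               int *piPosition);
  int (*xLanguageid)(sqlite3_tokenizer_cursor *pCsr, int iLangid);
};

struct sqlite3_tokenizer {
  const sqlite3_tokenizer_module *pModule;  /* The module for this tokenizer */
  /* Tokenizer implementations append their own fields after this one */
};

struct sqlite3_tokenizer_cursor {
  sqlite3_tokenizer *pTokenizer;            /* Tokenizer for this cursor */
  /* Tokenizer implementations append their own fields after this one */
};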
138328 sqlite3_tokenizer *pTokenizer; /* tokenizer for inserts and queries */
138938 /* Invoke the tokenizer destructor to free the tokenizer. */
139611 ** + If there is a tokenizer specification included in the arguments,
139612 ** initializes the tokenizer pTokenizer.
139619 /* Check if this is a tokenizer specification */
139922 void *pAux, /* Pointer to tokenizer hash table */
139932 void *pAux, /* Pointer to tokenizer hash table */
142273 ** allocated for the tokenizer hash table.
142288 ** to by the argument to point to the "simple" tokenizer implementation.
144929 ** syntax is relatively simple, the whole tokenizer/parser system is
145009 int iLangid; /* Language id used with tokenizer */
145079 ** Extract the next token from buffer z (length n) using the tokenizer
145210 ** The first pass, in the block below, uses a tokenizer cursor to iterate
145876 int iLangid, /* Language id for tokenizer */
145925 ** The first parameter, pTokenizer, is passed the fts3 tokenizer module to
145939 int iLangid, /* Language id for tokenizer */
146110 ** fts3_exprtest(<tokenizer>, <expr>, <column 1>, ...);
146112 ** The first argument, <tokenizer>, is the name of the fts3 tokenizer used
146139 "Usage: fts3_exprtest(tokenizer, expr, col1, ...", -1
146150 sqlite3_result_error(context, "No such tokenizer module", -1);
146624 ** Implementation of the full-text-search tokenizer that implements
146669 ** Create a new tokenizer instance.
146688 ** Destroy a tokenizer
146702 sqlite3_tokenizer *pTokenizer, /* The tokenizer */
147252 ** The set of routines that implement the porter-stemmer tokenizer
147265 ** Allocate a new porter tokenizer. Return a pointer to the new
147266 ** tokenizer in *ppModule
147291 ** This particular file implements the generic tokenizer interface.
147380 char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
147467 sqlite3Fts3ErrMsg(pzErr, "unknown tokenizer: %s", z);
147490 sqlite3Fts3ErrMsg(pzErr, "unknown tokenizer");
147523 ** using the built-in "simple" tokenizer:
147574 char *zErr2 = sqlite3_mprintf("unknown tokenizer: %s", zName);
147682 ** in the README.tokenizer file as an example, so it is important to
147713 assert( 0==strcmp(sqlite3_errmsg(db), "unknown tokenizer: nosuchtokenizer") );
147805 ** Implementation of the "simple" full-text-search tokenizer.
147851 ** Create a new tokenizer instance.
147892 ** Destroy a tokenizer
147906 sqlite3_tokenizer *pTokenizer, /* The tokenizer */
148004 ** The set of routines that implement the simple tokenizer
148017 ** Allocate a new simple tokenizer. Return a pointer to the new
148018 ** tokenizer in *ppModule
148046 ** <tokenizer-name>, <arg-1>, ...
148058 ** tokenizer specified by the arguments to the CREATE VIRTUAL TABLE
148104 ** Query FTS for the tokenizer implementation named zName.
148117 sqlite3Fts3ErrMsg(pzErr, "unknown tokenizer: %s", zName);
148173 ** Schema of the tokenizer table.
148185 ** argv[3]: first argument (tokenizer name)
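Lines 147523 and 148046-148185 concern how a tokenizer name (and any arguments) is supplied to FTS3, including the fts3tokenize virtual table that exposes a tokenizer's output directly. A small usage sketch, assuming an open connection db and FTS3/4 compiled in; the table name tok1 and the sample text are made up:

#include <stdio.h>
#include "sqlite3.h"

/* Print the tokens that the "porter" tokenizer produces for a sample string,
** by querying an fts3tokenize virtual table. */
static int dumpPorterTokens(sqlite3 *db){
  sqlite3_stmt *pStmt = 0;
  int rc = sqlite3_exec(db,
      "CREATE VIRTUAL TABLE tok1 USING fts3tokenize('porter');", 0, 0, 0);
  if( rc==SQLITE_OK ){
    rc = sqlite3_prepare_v2(db,
        "SELECT token, start, \"end\", position "
        "FROM tok1 WHERE input = 'This is a test sentence.';",
        -1, &pStmt, 0);
  }
  while( rc==SQLITE_OK && sqlite3_step(pStmt)==SQLITE_ROW ){
    printf("token=%s start=%d end=%d pos=%d\n",
        (const char*)sqlite3_column_text(pStmt, 0),
        sqlite3_column_int(pStmt, 1),
        sqlite3_column_int(pStmt, 2),
        sqlite3_column_int(pStmt, 3));
  }
  sqlite3_finalize(pStmt);
  return rc;
}

Naming a tokenizer that was never registered fails at CREATE VIRTUAL TABLE time with an error in the style of the messages listed above ("unknown tokenizer: ...").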
148306 ** Reset the tokenizer cursor passed as the only argument. As if it had
154917 const char *ZDUMMY; /* Dummy argument used with tokenizer */
154918 int DUMMY1 = -1; /* Dummy argument used with tokenizer */
154927 ** that needs to know whether or not the tokenizer is being used for
154931 ** initialization. It is not a documented part of the tokenizer interface.
154932 ** If a tokenizer is used directly by any code outside of FTS, this
155781 /* Initialize a tokenizer iterator to iterate through column iCol. */
155889 ** Implementation of the "unicode" full-text-search tokenizer.
155978 ** Destroy a tokenizer allocated by unicodeCreate().
155991 ** statement has specified that the tokenizer for this table shall consider
156004 ** It is not possible to change the behavior of the tokenizer with respect
156092 ** Create a new tokenizer instance.
156097 sqlite3_tokenizer **pp /* OUT: New tokenizer handle */
156099 unicode_tokenizer *pNew; /* New tokenizer object */
156145 sqlite3_tokenizer *p, /* The tokenizer */
156253 ** structure for the unicode tokenizer.
160712 ** This file implements a tokenizer for fts3 based on the ICU library.
160750 ** Create a new tokenizer instance.
160755 sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */
160780 ** Destroy a tokenizer
160795 sqlite3_tokenizer *pTokenizer, /* The tokenizer */
160940 ** The set of routines that implement the simple tokenizer
160953 ** Set *ppModule to point at the implementation of the ICU tokenizer.
168805 ** Tokenize text using the tokenizer belonging to the FTS5 table.
168980 ** Applications may also register custom tokenizer types. A tokenizer
168987 ** This function is used to allocate and initialize a tokenizer instance.
168988 ** A tokenizer instance is required to actually tokenize text.
168994 ** containing the tokenizer arguments, if any, specified following the
168995 ** tokenizer name as part of the CREATE VIRTUAL TABLE statement used
168999 ** should be set to point to the new tokenizer handle and SQLITE_OK
169005 ** This function is invoked to delete a tokenizer handle previously
169020 ** or removed from the FTS table. The tokenizer is being invoked to
169025 ** against the FTS index. The tokenizer is being called to tokenize
169031 ** returned by the tokenizer will be treated as a token prefix.
169033 ** <li> <b>FTS5_TOKENIZE_AUX</b> - The tokenizer is being invoked to
169048 ** normally be set to 0. The exception is if the tokenizer supports
169075 ** In the above example, this means that the tokenizer returns the
169080 ** the tokenizer substitutes "first" for "1st" and the query works
169084 ** In this case, when tokenizing query text, the tokenizer may
169092 ** the tokenizer offers both "1st" and "first" as synonyms for the
169104 ** Using this method, when tokenizing document text, the tokenizer
169110 ** This way, even if the tokenizer does not provide synonyms
169120 ** when parsing the document "I won first place", a tokenizer that supports
169141 ** token "first" is substituted for "1st" by the tokenizer, then the query:
169146 ** will not match documents that contain the token "1st" (as the tokenizer
169157 ** token "1st", but not "first" (assuming the tokenizer is not able to
169164 ** When using methods (2) or (3), it is important that the tokenizer only
169195 /* Flags that may be passed by the tokenizer implementation back to FTS5
169210 /* Create a new tokenizer */
169219 /* Find an existing tokenizer */
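The synonym discussion repeated at 169033-169164 (and at 8742-8875 earlier) describes three ways a tokenizer can map between forms such as "1st" and "first"; in method (3) the tokenizer emits both forms at the same position while tokenizing document text, using the FTS5_TOKEN_COLOCATED flag. Inside an xTokenize() implementation that recognises such a pair, the callback sequence could look like the following sketch (pCtx, xToken, iStart and iEnd are assumed to be the values available at that point in the tokenizer):

/* Deliver "first" as the ordinary token, then "1st" as a synonym occupying
** the same position.  The first call of the pair must not carry the
** FTS5_TOKEN_COLOCATED flag. */
int rc = xToken(pCtx, 0, "first", 5, iStart, iEnd);
if( rc==SQLITE_OK ){
  rc = xToken(pCtx, FTS5_TOKEN_COLOCATED, "1st", 3, iStart, iEnd);
}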
172627 ** Allocate an instance of the default tokenizer ("simple") at
172828 /* If a tokenizer= option was successfully parsed, the tokenizer has
172830 ** tokenizer (unicode61) now. */
182924 Fts5TokenizerModule *pTok; /* First in list of all tokenizer modules */
182925 Fts5TokenizerModule *pDfltTok; /* Default tokenizer module */
182944 ** Each tokenizer module registered with the FTS5 module is represented
182949 char *zName; /* Name of tokenizer */
182953 Fts5TokenizerModule *pNext; /* Next registered tokenizer module */
183267 void *pAux, /* Pointer to tokenizer hash table */
183277 void *pAux, /* Pointer to tokenizer hash table */
185305 ** Register a new tokenizer. This is the implementation of the
185361 ** Find a tokenizer. This is the implementation of the
185400 *pzErr = sqlite3_mprintf("no such tokenizer: %s", azArg[0]);
185405 *pzErr = sqlite3_mprintf("error in tokenizer constructor");
186725 ** Start of ascii tokenizer implementation.
186762 ** Delete an "ascii" tokenizer.
186769 ** Create an "ascii" tokenizer.
186822 ** Tokenize some text using the ascii tokenizer.
186882 ** Start of unicode61 tokenizer implementation.
187020 ** Delete a "unicode61" tokenizer.
187033 ** Create a "unicode61" tokenizer.
187041 Unicode61Tokenizer *p = 0; /* New tokenizer object */
187089 ** Return true if, for the purposes of tokenizing with the tokenizer
187212 fts5_tokenizer tokenizer; /* Parent tokenizer module */
187213 Fts5Tokenizer *pTokenizer; /* Parent tokenizer instance */
187218 ** Delete a "porter" tokenizer.
187224 p->tokenizer.xDelete(p->pTokenizer);
187231 ** Create a "porter" tokenizer.
187251 rc = pApi->xFindTokenizer(pApi, zBase, &pUserdata, &pRet->tokenizer);
187258 rc = pRet->tokenizer.xCreate(pUserdata, azArg2, nArg2, &pRet->pTokenizer);
187902 ** Tokenize using the porter tokenizer.
187916 return p->tokenizer.xTokenize(
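Lines 187212-187916 show how the FTS5 porter tokenizer wraps another tokenizer: xFindTokenizer() locates the parent module, the parent's xCreate() instantiates it, and xDelete()/xTokenize() delegate to it. A stripped-down sketch of the same wrapping pattern, with struct and function names invented for illustration (a real wrapper, like porter, would interpose its own callback to post-process each token before passing it on):

#include <string.h>
#include "sqlite3.h"
#include "fts5.h"

typedef struct WrapTokenizer WrapTokenizer;
struct WrapTokenizer {
  fts5_tokenizer parent;               /* Parent tokenizer module */
  Fts5Tokenizer *pParent;              /* Parent tokenizer instance */
};

static int wrapCreate(void *pCtx, const char **azArg, int nArg,
                      Fts5Tokenizer **ppOut){
  fts5_api *pApi = (fts5_api*)pCtx;    /* Assumes the fts5_api pointer was
                                       ** passed as the pContext argument of
                                       ** xCreateTokenizer() */
  WrapTokenizer *p = sqlite3_malloc(sizeof(*p));
  void *pUserdata = 0;
  int rc;
  if( p==0 ) return SQLITE_NOMEM;
  memset(p, 0, sizeof(*p));
  /* The first argument names the parent tokenizer; default to unicode61 */
  rc = pApi->xFindTokenizer(pApi, nArg ? azArg[0] : "unicode61",
                            &pUserdata, &p->parent);
  if( rc==SQLITE_OK ){
    rc = p->parent.xCreate(pUserdata,
                           nArg ? &azArg[1] : 0, nArg ? nArg-1 : 0,
                           &p->pParent);
  }
  if( rc!=SQLITE_OK ){
    sqlite3_free(p);
    p = 0;
  }
  *ppOut = (Fts5Tokenizer*)p;
  return rc;
}

static void wrapDelete(Fts5Tokenizer *pTok){
  WrapTokenizer *p = (WrapTokenizer*)pTok;
  if( p->pParent ) p->parent.xDelete(p->pParent);
  sqlite3_free(p);
}

static int wrapTokenize(Fts5Tokenizer *pTok, void *pCtx, int flags,
                        const char *pText, int nText,
                        int (*xToken)(void*, int, const char*, int, int, int)){
  WrapTokenizer *p = (WrapTokenizer*)pTok;
  return p->parent.xTokenize(p->pParent, pCtx, flags, pText, nText, xToken);
}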
188874 void *pAux, /* Pointer to tokenizer hash table */
188884 void *pAux, /* Pointer to tokenizer hash table */