db-4.8.30/db_sql/tokenize.c

/*
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996-2009 Oracle.  All rights reserved.
 *
 */

/*
 * Most of this lexical analyzer code is taken directly from sqlite source.
 */
#include <ctype.h>
#include <stdlib.h>
#include "db_sql.h"

/*
** The charMap() macro maps alphabetic characters into their
** lower-case ASCII equivalent.  On ASCII machines, this is just
** an upper-to-lower case map.  On EBCDIC machines we also need
** to adjust the encoding.  Only alphabetic characters and underscores
** need to be translated.
*/
#ifdef SQLITE_ASCII
/* ASCII build: charMap(X) indexes the sqlite3UpperToLower table, which is
** not defined in this part of the file.  X is placed after the cast
** without parentheses, so pass a simple expression. */
# define charMap(X) sqlite3UpperToLower[(unsigned char)X]
#endif
#ifdef SQLITE_EBCDIC
/* EBCDIC build: charMap(X) indexes the ebcdicToAscii table below.  X is
** placed after the cast without parentheses, so pass a simple expression. */
# define charMap(X) ebcdicToAscii[(unsigned char)X]
/* 256-entry table indexed by byte value (16 rows of 16).
** Entries 0x81-0x89, 0x91-0x99 and 0xA2-0xA9 hold 97..122 (ASCII 'a'..'z');
** entries 0xC1-0xC9, 0xD1-0xD9 and 0xE2-0xE9 hold the same values, so both
** letter ranges map to lower-case ASCII.  Entry 0x6D holds 95 (ASCII '_').
** Every other entry is 0. */
const unsigned char ebcdicToAscii[] = {
/* 0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F */
   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  /* 0x */
   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  /* 1x */
   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  /* 2x */
   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  /* 3x */
   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  /* 4x */
   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  /* 5x */
   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 95,  0,  0,  /* 6x */
   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  /* 7x */
   0, 97, 98, 99,100,101,102,103,104,105,  0,  0,  0,  0,  0,  0,  /* 8x */
   0,106,107,108,109,110,111,112,113,114,  0,  0,  0,  0,  0,  0,  /* 9x */
   0,  0,115,116,117,118,119,120,121,122,  0,  0,  0,  0,  0,  0,  /* Ax */
   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  /* Bx */
   0, 97, 98, 99,100,101,102,103,104,105,  0,  0,  0,  0,  0,  0,  /* Cx */
   0,106,107,108,109,110,111,112,113,114,  0,  0,  0,  0,  0,  0,  /* Dx */
   0,  0,115,116,117,118,119,120,121,122,  0,  0,  0,  0,  0,  0,  /* Ex */
   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  /* Fx */
};
#endif

/*
** The sqlite3KeywordCode function looks up an identifier to determine if
** it is a keyword.  If it is a keyword, the token code of that keyword is
** returned.  If the input is not a keyword, TK_ID is returned.
**
** The implementation of this routine was generated by a program,
** mkkeywordhash.c, located in the tool subdirectory of the distribution.
** The output of the mkkeywordhash.c program is written into a file
** named keywordhash.h and then included into this source file by
** the #include below.
*/
#include "sqlite/keywordhash.h"


/*
** If X is a character that can be used in an identifier then
** IdChar(X) will be true.  Otherwise it is false.
**
** For ASCII, any character with the high-order bit set is
** allowed in an identifier.  For 7-bit characters above 0x1f,
** sqlite3IsAsciiIdChar[X-0x20] must be 1.
**
** For EBCDIC, the rules are more complex but have the same
** end result.
**
** Ticket #1066.  The SQL standard does not allow '$' in the
** middle of identifiers.  But many SQL implementations do.
** SQLite will allow '$' in identifiers for compatibility.
** But the feature is undocumented.
*/
#ifdef SQLITE_ASCII
/* Lookup table for byte values 0x20..0x7f (6 rows of 16), indexed by
** (value - 0x20).  An entry is 1 for '$' (0x24), the digits 0x30-0x39,
** the letters 0x41-0x5a and 0x61-0x7a, and '_' (0x5f); otherwise 0. */
const char sqlite3IsAsciiIdChar[] = {
/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
    0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 2x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 3x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 4x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,  /* 5x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 6x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,  /* 7x */
};
/* IdChar(C) stores C into a variable named `c`, which must already be
** declared in the scope where the macro is expanded (getToken declares
** `int c`).  The result is true when bit 0x80 of c is set, or when c is
** greater than 0x1f and the table entry at c-0x20 is non-zero.  The
** c>0x1f test is what keeps the table index from going negative. */
#define IdChar(C)  (((c=C)&0x80)!=0 || (c>0x1f && sqlite3IsAsciiIdChar[c-0x20]))
#endif
#ifdef SQLITE_EBCDIC
/* Lookup table for byte values 0x40..0xff (12 rows of 16), indexed by
** (value - 0x40).  A non-zero entry marks a byte that may appear in an
** identifier. */
const char sqlite3IsEbcdicIdChar[] = {
/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
    0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 4x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0,  /* 5x */
    0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0,  /* 6x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,  /* 7x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,  /* 8x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,  /* 9x */
    1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,  /* Ax */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* Bx */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,  /* Cx */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,  /* Dx */
    0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,  /* Ex */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,  /* Fx */
};
/* IdChar(C) stores C into a variable named `c`, which must already be
** declared in the scope where the macro is expanded (getToken declares
** `int c`).  The result is true when c is at least 0x42 and the table
** entry at c-0x40 is non-zero. */
#define IdChar(C)  (((c=C)>=0x42 && sqlite3IsEbcdicIdChar[c-0x40]))
#endif


/*
** Return the length, in bytes, of the token that begins at z[0] and
** store the token code (one of the TK_ constants) in *tokenType.
**
** z must point into a NUL-terminated buffer.  The scanner only looks
** ahead while the bytes it has already examined are non-zero, so the
** returned length never extends past the terminating NUL.  If z[0] is
** itself the terminator the result is TK_ILLEGAL with length 1, so
** callers should stop before reaching it.
**
** Anything that cannot be classified is reported as TK_ILLEGAL together
** with the length of the offending text.
**
** The local variable `c` is also written by the IdChar() macro, which
** assigns its argument to `c` as a side effect.
*/
static int getToken(const unsigned char *z, int *tokenType){
	int i, c;
	switch( *z ){
	case ' ': case '\t': case '\n': case '\f': case '\r': {
		/* A run of white space is a single token. */
		for(i=1; isspace(z[i]); i++){}
		*tokenType = TK_SPACE;
		return i;
	}
	case '-': {
		if( z[1]=='-' ){
			/* "--" comment: runs up to, but not including, the newline. */
			for(i=2; (c=z[i])!=0 && c!='\n'; i++){}
			*tokenType = TK_COMMENT;
			return i;
		}
		*tokenType = TK_MINUS;
		return 1;
	}
	case '(': {
		*tokenType = TK_LP;
		return 1;
	}
	case ')': {
		*tokenType = TK_RP;
		return 1;
	}
	case ';': {
		*tokenType = TK_SEMI;
		return 1;
	}
	case '+': {
		*tokenType = TK_PLUS;
		return 1;
	}
	case '*': {
		*tokenType = TK_STAR;
		return 1;
	}
	case '/': {
		/* A '/' followed by '*' and at least one more byte opens a block
		** comment; anything else is the division operator. */
		if( z[1]!='*' || z[2]==0 ){
			*tokenType = TK_SLASH;
			return 1;
		}
		/* Scan for the closing star-slash pair.  c trails one byte behind
		** z[i].  An unterminated comment extends to the end of the input. */
		for(i=3, c=z[2]; (c!='*' || z[i]!='/') && (c=z[i])!=0; i++){}
		if( c ) i++;
		*tokenType = TK_COMMENT;
		return i;
	}
	case '%': {
		*tokenType = TK_REM;
		return 1;
	}
	case '=': {
		/* Both "=" and "==" are the equality operator. */
		*tokenType = TK_EQ;
		return 1 + (z[1]=='=');
	}
	case '<': {
		if( (c=z[1])=='=' ){
			*tokenType = TK_LE;
			return 2;
		}else if( c=='>' ){
			*tokenType = TK_NE;
			return 2;
		}else if( c=='<' ){
			*tokenType = TK_LSHIFT;
			return 2;
		}else{
			*tokenType = TK_LT;
			return 1;
		}
	}
	case '>': {
		if( (c=z[1])=='=' ){
			*tokenType = TK_GE;
			return 2;
		}else if( c=='>' ){
			*tokenType = TK_RSHIFT;
			return 2;
		}else{
			*tokenType = TK_GT;
			return 1;
		}
	}
	case '!': {
		if( z[1]!='=' ){
			/* A lone '!' is not an operator.  Consume only the '!' itself:
			** z[1] may be the terminating NUL, and reporting a length of 2
			** would make the caller step past the end of the input. */
			*tokenType = TK_ILLEGAL;
			return 1;
		}else{
			*tokenType = TK_NE;
			return 2;
		}
	}
	case '|': {
		if( z[1]!='|' ){
			*tokenType = TK_BITOR;
			return 1;
		}else{
			*tokenType = TK_CONCAT;
			return 2;
		}
	}
	case ',': {
		*tokenType = TK_COMMA;
		return 1;
	}
	case '&': {
		*tokenType = TK_BITAND;
		return 1;
	}
	case '~': {
		*tokenType = TK_BITNOT;
		return 1;
	}
	case '`':
	case '\'':
	case '"': {
		/* Quoted text.  A doubled delimiter inside the quotes stands for
		** one literal delimiter and does not end the token. */
		int delim = z[0];
		for(i=1; (c=z[i])!=0; i++){
			if( c==delim ){
				if( z[i+1]==delim ){
					i++;
				}else{
					break;
				}
			}
		}
		if( c ){
			/* Include the closing delimiter in the token. */
			*tokenType = TK_STRING;
			return i+1;
		}else{
			/* Ran off the end of the input without a closing delimiter. */
			*tokenType = TK_ILLEGAL;
			return i;
		}
	}
	case '.': {
#ifndef SQLITE_OMIT_FLOATING_POINT
		if( !isdigit(z[1]) )
#endif
		{
			*tokenType = TK_DOT;
			return 1;
		}
		/* If the next character is a digit, this is a floating point
		** number that begins with ".".  Fall thru into the next case */
	}
	case '0': case '1': case '2': case '3': case '4':
	case '5': case '6': case '7': case '8': case '9': {
		*tokenType = TK_INTEGER;
		for(i=0; isdigit(z[i]); i++){}
#ifndef SQLITE_OMIT_FLOATING_POINT
		if( z[i]=='.' ){
			i++;
			while( isdigit(z[i]) ){ i++; }
			*tokenType = TK_FLOAT;
		}
		/* Exponent: 'e' or 'E', an optional sign, then at least one digit. */
		if( (z[i]=='e' || z[i]=='E') &&
		    ( isdigit(z[i+1])
		      || ((z[i+1]=='+' || z[i+1]=='-') && isdigit(z[i+2]))
			    )
			){
			/* Skip the 'e' plus either the sign or the first digit. */
			i += 2;
			while( isdigit(z[i]) ){ i++; }
			*tokenType = TK_FLOAT;
		}
#endif
		/* Identifier characters glued to the number make the whole run
		** illegal. */
		while( IdChar(z[i]) ){
			*tokenType = TK_ILLEGAL;
			i++;
		}
		return i;
	}
	case '[': {
		/* Bracketed identifier: everything through the closing ']'.  If
		** the input ends first the token is illegal. */
		for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
		*tokenType = c==']' ? TK_ID : TK_ILLEGAL;
		return i;
	}
	case '?': {
		/* '?' optionally followed by digits. */
		*tokenType = TK_VARIABLE;
		for(i=1; isdigit(z[i]); i++){}
		return i;
	}
	case '#': {
		for(i=1; isdigit(z[i]); i++){}
		if( i>1 ){
			/* Parameters of the form #NNN (where NNN is a number) are used
			** internally by sqlite3NestedParse.  */
			*tokenType = TK_REGISTER;
			return i;
		}
		/* Fall through into the next case if the '#' is not followed by
		** a digit. Try to match #AAAA where AAAA is a parameter name. */
	}
#ifndef SQLITE_OMIT_TCL_VARIABLE
	case '$':
#endif
	case '@':  /* For compatibility with MS SQL Server */
	case ':': {
		/* Named parameter.  n counts the identifier characters seen; a
		** prefix character with no name after it is illegal. */
		int n = 0;
		*tokenType = TK_VARIABLE;
		for(i=1; (c=z[i])!=0; i++){
			if( IdChar(c) ){
				n++;
#ifndef SQLITE_OMIT_TCL_VARIABLE
			}else if( c=='(' && n>0 ){
				/* Parenthesized suffix: consume up to ')'.  White space or
				** the end of the input before ')' makes the token illegal. */
				do{
					i++;
				}while( (c=z[i])!=0 && !isspace(c) && c!=')' );
				if( c==')' ){
					i++;
				}else{
					*tokenType = TK_ILLEGAL;
				}
				break;
			}else if( c==':' && z[i+1]==':' ){
				/* "::" is allowed inside the name; skip the first ':' here
				** and let the loop increment skip the second. */
				i++;
#endif
			}else{
				break;
			}
		}
		if( n==0 ) *tokenType = TK_ILLEGAL;
		return i;
	}
#ifndef SQLITE_OMIT_BLOB_LITERAL
	case 'x': case 'X': {
		if( z[1]=='\'' ){
			/* Blob literal x'...': the body must consist of hex digits. */
			*tokenType = TK_BLOB;
			for(i=2; (c=z[i])!=0 && c!='\''; i++){
				if( !isxdigit(c) ){
					*tokenType = TK_ILLEGAL;
				}
			}
			/* i counts the two-byte "x'" prefix plus the digits, so an odd
			** i means an odd number of hex digits.  A missing closing quote
			** is also illegal. */
			if( i%2 || !c ) *tokenType = TK_ILLEGAL;
			if( c ) i++;
			return i;
		}
		/* Otherwise fall through to the next case */
	}
#endif
	default: {
		if( !IdChar(*z) ){
			break;
		}
		/* Identifier or keyword: keywordCode() decides which. */
		for(i=1; IdChar(z[i]); i++){}
		*tokenType = keywordCode((char*)z, i);
		return i;
	}
	}
	/* Any other single byte is not a valid token start. */
	*tokenType = TK_ILLEGAL;
	return 1;
}

/*
** Tokenize the NUL-terminated SQL text zSql with getToken() and feed the
** tokens to the parser engine, recording state in *pParse.
**
** Token handling:
**   - TK_SPACE is skipped.
**   - TK_COMMENT is handed to parse_hint_comment().
**   - TK_ILLEGAL stops parsing.  If pzErrMsg is not NULL, the previous
**     *pzErrMsg is freed and replaced by an "unrecognized token" message.
**   - TK_SEMI records the position just after the ';' in pParse->zTail
**     and is then passed to preparser() like any other token.
**
** If the whole input was consumed without error, a final TK_SEMI and the
** end-of-input token (0) are sent to sqlite3Parser().
**
** Any message left in pParse->zErrMsg is moved to *pzErrMsg when pzErrMsg
** is not NULL and *pzErrMsg is still NULL; otherwise it is freed.
**
** Returns the number of errors seen.  The one exception is a failed
** parser allocation, which returns SQLITE_NOMEM directly.  On return
** pParse->rc holds the result code; it is forced to SQLITE_ERROR when an
** error was counted but rc still said SQLITE_OK or SQLITE_DONE.
*/
static int
bdb_run_parser(Parse *pParse, const char *zSql, char **pzErrMsg){
	int nErr = 0;
	int i;
	void *pEngine;
	int tokenType;
	pParse->rc = SQLITE_OK;
	pParse->zTail = pParse->zSql = zSql;
	i = 0;
	pEngine = sqlite3ParserAlloc((void*(*)(size_t))malloc);
	if( pEngine==0 ){
		return SQLITE_NOMEM;
	}

	while(zSql[i]!=0 ){
		assert( i>=0 );
		pParse->sLastToken.z = (u8*)&zSql[i];
		assert( pParse->sLastToken.dyn==0 );
		pParse->sLastToken.n = getToken((unsigned char*)&zSql[i],&tokenType);
		i += pParse->sLastToken.n;
		if( i>SQLITE_MAX_SQL_LENGTH ){
			pParse->rc = SQLITE_TOOBIG;
			break;
		}
		switch( tokenType ) {
		case TK_SPACE: {
			break;
		}
		case TK_COMMENT: {
			parse_hint_comment(&pParse->sLastToken);
			break;
		}
		case TK_ILLEGAL: {
			if( pzErrMsg ){
				free(*pzErrMsg);
				*pzErrMsg = sqlite3MPrintf(0, "unrecognized token: \"%T\"",
							   &pParse->sLastToken);
			}
			nErr++;
			goto abort_parse;
		}
		case TK_SEMI: {
			pParse->zTail = &zSql[i];
			/* Fall thru into the default case */
		}
		default: {
			preparser(pEngine, tokenType, pParse->sLastToken, pParse);
			if( pParse->rc!=SQLITE_OK ){
				goto abort_parse;
			}
			break;
		}
		}
	}
abort_parse:
	/* Test the error state first.  After a failure i is not guaranteed to
	** index a byte inside zSql, so zSql[i] must only be read when no error
	** has been recorded. */
	if( nErr==0 && pParse->rc==SQLITE_OK && zSql[i]==0 ){
		sqlite3Parser(pEngine, TK_SEMI, pParse->sLastToken, pParse);
		pParse->zTail = &zSql[i];
		sqlite3Parser(pEngine, 0, pParse->sLastToken, pParse);
	}
	sqlite3ParserFree(pEngine,free);
	/* Supply a generic message when the result code reports a failure but
	** no specific message has been set. */
	if( pParse->rc!=SQLITE_OK && pParse->rc!=SQLITE_DONE && pParse->zErrMsg==0 ){
		setString(&pParse->zErrMsg, sqlite3ErrStr(pParse->rc), (char*)0);
	}
	if( pParse->zErrMsg ){
		/* Hand the message to the caller unless one was already stored
		** (for example by the TK_ILLEGAL case above). */
		if( pzErrMsg && *pzErrMsg==0 ){
			*pzErrMsg = pParse->zErrMsg;
		}else{
			free(pParse->zErrMsg);
		}
		pParse->zErrMsg = 0;
		nErr++;
	}
	if( nErr>0 && (pParse->rc==SQLITE_OK || pParse->rc==SQLITE_DONE) ){
		pParse->rc = SQLITE_ERROR;
	}
	return nErr;
}

/*
** Parse the SQL text zSql using a freshly zeroed Parse context.
** pzErrMsg is passed through to bdb_run_parser() unchanged, and the
** value bdb_run_parser() returns is returned to the caller.
*/
int do_parse(const char *zSql, char **pzErrMsg) {
	Parse ctx;
	int result;

	memset(&ctx, 0, sizeof ctx);
	result = bdb_run_parser(&ctx, zSql, pzErrMsg);
	return result;
}