1/* 2 * Copyright 2005 Maksim Orlovich <maksim@kde.org> 3 * Copyright (C) 2006 Apple Computer, Inc. 4 * Copyright (C) 2007 Alexey Proskuryakov <ap@webkit.org> 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28#include "config.h" 29#include "XPathParser.h" 30 31#include "ExceptionCode.h" 32#include "XPathEvaluator.h" 33#include "XPathException.h" 34#include "XPathNSResolver.h" 35#include "XPathPath.h" 36#include "XPathStep.h" 37#include <wtf/StdLibExtras.h> 38#include <wtf/text/StringHash.h> 39 40using namespace WebCore; 41using namespace WTF; 42using namespace Unicode; 43using namespace XPath; 44 45extern int xpathyyparse(WebCore::XPath::Parser*); 46#include "XPathGrammar.h" 47 48Parser* Parser::currentParser = 0; 49 50enum XMLCat { NameStart, NameCont, NotPartOfName }; 51 52typedef HashMap<String, Step::Axis> AxisNamesMap; 53 54static XMLCat charCat(UChar aChar) 55{ 56 //### might need to add some special cases from the XML spec. 57 58 if (aChar == '_') 59 return NameStart; 60 61 if (aChar == '.' || aChar == '-') 62 return NameCont; 63 CharCategory category = Unicode::category(aChar); 64 if (category & (Letter_Uppercase | Letter_Lowercase | Letter_Other | Letter_Titlecase | Number_Letter)) 65 return NameStart; 66 if (category & (Mark_NonSpacing | Mark_SpacingCombining | Mark_Enclosing | Letter_Modifier | Number_DecimalDigit)) 67 return NameCont; 68 return NotPartOfName; 69} 70 71static void setUpAxisNamesMap(AxisNamesMap& axisNames) 72{ 73 struct AxisName { 74 const char* name; 75 Step::Axis axis; 76 }; 77 const AxisName axisNameList[] = { 78 { "ancestor", Step::AncestorAxis }, 79 { "ancestor-or-self", Step::AncestorOrSelfAxis }, 80 { "attribute", Step::AttributeAxis }, 81 { "child", Step::ChildAxis }, 82 { "descendant", Step::DescendantAxis }, 83 { "descendant-or-self", Step::DescendantOrSelfAxis }, 84 { "following", Step::FollowingAxis }, 85 { "following-sibling", Step::FollowingSiblingAxis }, 86 { "namespace", Step::NamespaceAxis }, 87 { "parent", Step::ParentAxis }, 88 { "preceding", Step::PrecedingAxis }, 89 { "preceding-sibling", Step::PrecedingSiblingAxis }, 90 { "self", Step::SelfAxis } 91 }; 92 for (unsigned i = 0; i < sizeof(axisNameList) / sizeof(axisNameList[0]); ++i) 93 axisNames.set(axisNameList[i].name, axisNameList[i].axis); 94} 95 96static bool isAxisName(const String& name, Step::Axis& type) 97{ 98 DEFINE_STATIC_LOCAL(AxisNamesMap, axisNames, ()); 99 100 if (axisNames.isEmpty()) 101 setUpAxisNamesMap(axisNames); 102 103 AxisNamesMap::iterator it = axisNames.find(name); 104 if (it == axisNames.end()) 105 return false; 106 type = it->value; 107 return true; 108} 109 110static bool isNodeTypeName(const String& name) 111{ 112 DEFINE_STATIC_LOCAL(HashSet<String>, nodeTypeNames, ()); 113 if (nodeTypeNames.isEmpty()) { 114 nodeTypeNames.add("comment"); 115 nodeTypeNames.add("text"); 116 nodeTypeNames.add("processing-instruction"); 117 nodeTypeNames.add("node"); 118 } 119 return nodeTypeNames.contains(name); 120} 121 122// Returns whether the current token can possibly be a binary operator, given 123// the previous token. Necessary to disambiguate some of the operators 124// (* (multiply), div, and, or, mod) in the [32] Operator rule 125// (check http://www.w3.org/TR/xpath#exprlex). 126bool Parser::isBinaryOperatorContext() const 127{ 128 switch (m_lastTokenType) { 129 case 0: 130 case '@': case AXISNAME: case '(': case '[': case ',': 131 case AND: case OR: case MULOP: 132 case '/': case SLASHSLASH: case '|': case PLUS: case MINUS: 133 case EQOP: case RELOP: 134 return false; 135 default: 136 return true; 137 } 138} 139 140void Parser::skipWS() 141{ 142 while (m_nextPos < m_data.length() && isSpaceOrNewline(m_data[m_nextPos])) 143 ++m_nextPos; 144} 145 146Token Parser::makeTokenAndAdvance(int code, int advance) 147{ 148 m_nextPos += advance; 149 return Token(code); 150} 151 152Token Parser::makeTokenAndAdvance(int code, NumericOp::Opcode val, int advance) 153{ 154 m_nextPos += advance; 155 return Token(code, val); 156} 157 158Token Parser::makeTokenAndAdvance(int code, EqTestOp::Opcode val, int advance) 159{ 160 m_nextPos += advance; 161 return Token(code, val); 162} 163 164// Returns next char if it's there and interesting, 0 otherwise 165char Parser::peekAheadHelper() 166{ 167 if (m_nextPos + 1 >= m_data.length()) 168 return 0; 169 UChar next = m_data[m_nextPos + 1]; 170 if (next >= 0xff) 171 return 0; 172 return next; 173} 174 175char Parser::peekCurHelper() 176{ 177 if (m_nextPos >= m_data.length()) 178 return 0; 179 UChar next = m_data[m_nextPos]; 180 if (next >= 0xff) 181 return 0; 182 return next; 183} 184 185Token Parser::lexString() 186{ 187 UChar delimiter = m_data[m_nextPos]; 188 int startPos = m_nextPos + 1; 189 190 for (m_nextPos = startPos; m_nextPos < m_data.length(); ++m_nextPos) { 191 if (m_data[m_nextPos] == delimiter) { 192 String value = m_data.substring(startPos, m_nextPos - startPos); 193 if (value.isNull()) 194 value = ""; 195 ++m_nextPos; // Consume the char. 196 return Token(LITERAL, value); 197 } 198 } 199 200 // Ouch, went off the end -- report error. 201 return Token(XPATH_ERROR); 202} 203 204Token Parser::lexNumber() 205{ 206 int startPos = m_nextPos; 207 bool seenDot = false; 208 209 // Go until end or a non-digits character. 210 for (; m_nextPos < m_data.length(); ++m_nextPos) { 211 UChar aChar = m_data[m_nextPos]; 212 if (aChar >= 0xff) break; 213 214 if (aChar < '0' || aChar > '9') { 215 if (aChar == '.' && !seenDot) 216 seenDot = true; 217 else 218 break; 219 } 220 } 221 222 return Token(NUMBER, m_data.substring(startPos, m_nextPos - startPos)); 223} 224 225bool Parser::lexNCName(String& name) 226{ 227 int startPos = m_nextPos; 228 if (m_nextPos >= m_data.length()) 229 return false; 230 231 if (charCat(m_data[m_nextPos]) != NameStart) 232 return false; 233 234 // Keep going until we get a character that's not good for names. 235 for (; m_nextPos < m_data.length(); ++m_nextPos) 236 if (charCat(m_data[m_nextPos]) == NotPartOfName) 237 break; 238 239 name = m_data.substring(startPos, m_nextPos - startPos); 240 return true; 241} 242 243bool Parser::lexQName(String& name) 244{ 245 String n1; 246 if (!lexNCName(n1)) 247 return false; 248 249 skipWS(); 250 251 // If the next character is :, what we just got it the prefix, if not, 252 // it's the whole thing. 253 if (peekAheadHelper() != ':') { 254 name = n1; 255 return true; 256 } 257 258 String n2; 259 if (!lexNCName(n2)) 260 return false; 261 262 name = n1 + ":" + n2; 263 return true; 264} 265 266Token Parser::nextTokenInternal() 267{ 268 skipWS(); 269 270 if (m_nextPos >= m_data.length()) 271 return Token(0); 272 273 char code = peekCurHelper(); 274 switch (code) { 275 case '(': case ')': case '[': case ']': 276 case '@': case ',': case '|': 277 return makeTokenAndAdvance(code); 278 case '\'': 279 case '\"': 280 return lexString(); 281 case '0': case '1': case '2': case '3': case '4': 282 case '5': case '6': case '7': case '8': case '9': 283 return lexNumber(); 284 case '.': { 285 char next = peekAheadHelper(); 286 if (next == '.') 287 return makeTokenAndAdvance(DOTDOT, 2); 288 if (next >= '0' && next <= '9') 289 return lexNumber(); 290 return makeTokenAndAdvance('.'); 291 } 292 case '/': 293 if (peekAheadHelper() == '/') 294 return makeTokenAndAdvance(SLASHSLASH, 2); 295 return makeTokenAndAdvance('/'); 296 case '+': 297 return makeTokenAndAdvance(PLUS); 298 case '-': 299 return makeTokenAndAdvance(MINUS); 300 case '=': 301 return makeTokenAndAdvance(EQOP, EqTestOp::OP_EQ); 302 case '!': 303 if (peekAheadHelper() == '=') 304 return makeTokenAndAdvance(EQOP, EqTestOp::OP_NE, 2); 305 return Token(XPATH_ERROR); 306 case '<': 307 if (peekAheadHelper() == '=') 308 return makeTokenAndAdvance(RELOP, EqTestOp::OP_LE, 2); 309 return makeTokenAndAdvance(RELOP, EqTestOp::OP_LT); 310 case '>': 311 if (peekAheadHelper() == '=') 312 return makeTokenAndAdvance(RELOP, EqTestOp::OP_GE, 2); 313 return makeTokenAndAdvance(RELOP, EqTestOp::OP_GT); 314 case '*': 315 if (isBinaryOperatorContext()) 316 return makeTokenAndAdvance(MULOP, NumericOp::OP_Mul); 317 ++m_nextPos; 318 return Token(NAMETEST, "*"); 319 case '$': { // $ QName 320 m_nextPos++; 321 String name; 322 if (!lexQName(name)) 323 return Token(XPATH_ERROR); 324 return Token(VARIABLEREFERENCE, name); 325 } 326 } 327 328 String name; 329 if (!lexNCName(name)) 330 return Token(XPATH_ERROR); 331 332 skipWS(); 333 // If we're in an operator context, check for any operator names 334 if (isBinaryOperatorContext()) { 335 if (name == "and") //### hash? 336 return Token(AND); 337 if (name == "or") 338 return Token(OR); 339 if (name == "mod") 340 return Token(MULOP, NumericOp::OP_Mod); 341 if (name == "div") 342 return Token(MULOP, NumericOp::OP_Div); 343 } 344 345 // See whether we are at a : 346 if (peekCurHelper() == ':') { 347 m_nextPos++; 348 // Any chance it's an axis name? 349 if (peekCurHelper() == ':') { 350 m_nextPos++; 351 352 //It might be an axis name. 353 Step::Axis axis; 354 if (isAxisName(name, axis)) 355 return Token(AXISNAME, axis); 356 // Ugh, :: is only valid in axis names -> error 357 return Token(XPATH_ERROR); 358 } 359 360 // Seems like this is a fully qualified qname, or perhaps the * modified one from NameTest 361 skipWS(); 362 if (peekCurHelper() == '*') { 363 m_nextPos++; 364 return Token(NAMETEST, name + ":*"); 365 } 366 367 // Make a full qname. 368 String n2; 369 if (!lexNCName(n2)) 370 return Token(XPATH_ERROR); 371 372 name = name + ":" + n2; 373 } 374 375 skipWS(); 376 if (peekCurHelper() == '(') { 377 //note: we don't swallow the (here! 378 379 //either node type of function name 380 if (isNodeTypeName(name)) { 381 if (name == "processing-instruction") 382 return Token(PI, name); 383 384 return Token(NODETYPE, name); 385 } 386 //must be a function name. 387 return Token(FUNCTIONNAME, name); 388 } 389 390 // At this point, it must be NAMETEST. 391 return Token(NAMETEST, name); 392} 393 394Token Parser::nextToken() 395{ 396 Token toRet = nextTokenInternal(); 397 m_lastTokenType = toRet.type; 398 return toRet; 399} 400 401Parser::Parser() 402{ 403 reset(String()); 404} 405 406Parser::~Parser() 407{ 408} 409 410void Parser::reset(const String& data) 411{ 412 m_nextPos = 0; 413 m_data = data; 414 m_lastTokenType = 0; 415 416 m_topExpr = 0; 417 m_gotNamespaceError = false; 418} 419 420int Parser::lex(void* data) 421{ 422 YYSTYPE* yylval = static_cast<YYSTYPE*>(data); 423 Token tok = nextToken(); 424 425 switch (tok.type) { 426 case AXISNAME: 427 yylval->axis = tok.axis; 428 break; 429 case MULOP: 430 yylval->numop = tok.numop; 431 break; 432 case RELOP: 433 case EQOP: 434 yylval->eqop = tok.eqop; 435 break; 436 case NODETYPE: 437 case PI: 438 case FUNCTIONNAME: 439 case LITERAL: 440 case VARIABLEREFERENCE: 441 case NUMBER: 442 case NAMETEST: 443 yylval->str = new String(tok.str); 444 registerString(yylval->str); 445 break; 446 } 447 448 return tok.type; 449} 450 451bool Parser::expandQName(const String& qName, String& localName, String& namespaceURI) 452{ 453 size_t colon = qName.find(':'); 454 if (colon != notFound) { 455 if (!m_resolver) 456 return false; 457 namespaceURI = m_resolver->lookupNamespaceURI(qName.left(colon)); 458 if (namespaceURI.isNull()) 459 return false; 460 localName = qName.substring(colon + 1); 461 } else 462 localName = qName; 463 464 return true; 465} 466 467Expression* Parser::parseStatement(const String& statement, PassRefPtr<XPathNSResolver> resolver, ExceptionCode& ec) 468{ 469 reset(statement); 470 471 m_resolver = resolver; 472 473 Parser* oldParser = currentParser; 474 currentParser = this; 475 int parseError = xpathyyparse(this); 476 currentParser = oldParser; 477 478 if (parseError) { 479 deleteAllValues(m_parseNodes); 480 m_parseNodes.clear(); 481 482 HashSet<Vector<Predicate*>*>::iterator pend = m_predicateVectors.end(); 483 for (HashSet<Vector<Predicate*>*>::iterator it = m_predicateVectors.begin(); it != pend; ++it) { 484 deleteAllValues(**it); 485 delete *it; 486 } 487 m_predicateVectors.clear(); 488 489 HashSet<Vector<Expression*>*>::iterator eend = m_expressionVectors.end(); 490 for (HashSet<Vector<Expression*>*>::iterator it = m_expressionVectors.begin(); it != eend; ++it) { 491 deleteAllValues(**it); 492 delete *it; 493 } 494 m_expressionVectors.clear(); 495 496 deleteAllValues(m_strings); 497 m_strings.clear(); 498 499 deleteAllValues(m_nodeTests); 500 m_nodeTests.clear(); 501 502 m_topExpr = 0; 503 504 if (m_gotNamespaceError) 505 ec = NAMESPACE_ERR; 506 else 507 ec = XPathException::INVALID_EXPRESSION_ERR; 508 return 0; 509 } 510 511 ASSERT(m_parseNodes.size() == 1); 512 ASSERT(*m_parseNodes.begin() == m_topExpr); 513 ASSERT(m_expressionVectors.size() == 0); 514 ASSERT(m_predicateVectors.size() == 0); 515 ASSERT(m_strings.size() == 0); 516 ASSERT(m_nodeTests.size() == 0); 517 518 m_parseNodes.clear(); 519 Expression* result = m_topExpr; 520 m_topExpr = 0; 521 522 return result; 523} 524 525void Parser::registerParseNode(ParseNode* node) 526{ 527 if (node == 0) 528 return; 529 530 ASSERT(!m_parseNodes.contains(node)); 531 532 m_parseNodes.add(node); 533} 534 535void Parser::unregisterParseNode(ParseNode* node) 536{ 537 if (node == 0) 538 return; 539 540 ASSERT(m_parseNodes.contains(node)); 541 542 m_parseNodes.remove(node); 543} 544 545void Parser::registerPredicateVector(Vector<Predicate*>* vector) 546{ 547 if (vector == 0) 548 return; 549 550 ASSERT(!m_predicateVectors.contains(vector)); 551 552 m_predicateVectors.add(vector); 553} 554 555void Parser::deletePredicateVector(Vector<Predicate*>* vector) 556{ 557 if (vector == 0) 558 return; 559 560 ASSERT(m_predicateVectors.contains(vector)); 561 562 m_predicateVectors.remove(vector); 563 delete vector; 564} 565 566 567void Parser::registerExpressionVector(Vector<Expression*>* vector) 568{ 569 if (vector == 0) 570 return; 571 572 ASSERT(!m_expressionVectors.contains(vector)); 573 574 m_expressionVectors.add(vector); 575} 576 577void Parser::deleteExpressionVector(Vector<Expression*>* vector) 578{ 579 if (vector == 0) 580 return; 581 582 ASSERT(m_expressionVectors.contains(vector)); 583 584 m_expressionVectors.remove(vector); 585 delete vector; 586} 587 588void Parser::registerString(String* s) 589{ 590 if (s == 0) 591 return; 592 593 ASSERT(!m_strings.contains(s)); 594 595 m_strings.add(s); 596} 597 598void Parser::deleteString(String* s) 599{ 600 if (s == 0) 601 return; 602 603 ASSERT(m_strings.contains(s)); 604 605 m_strings.remove(s); 606 delete s; 607} 608 609void Parser::registerNodeTest(Step::NodeTest* t) 610{ 611 if (t == 0) 612 return; 613 614 ASSERT(!m_nodeTests.contains(t)); 615 616 m_nodeTests.add(t); 617} 618 619void Parser::deleteNodeTest(Step::NodeTest* t) 620{ 621 if (t == 0) 622 return; 623 624 ASSERT(m_nodeTests.contains(t)); 625 626 m_nodeTests.remove(t); 627 delete t; 628} 629 630