1/*
2 * Copyright 2005 Maksim Orlovich <maksim@kde.org>
3 * Copyright (C) 2006 Apple Computer, Inc.
4 * Copyright (C) 2007 Alexey Proskuryakov <ap@webkit.org>
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include "config.h"
29#include "XPathParser.h"
30
31#include "ExceptionCode.h"
32#include "XPathEvaluator.h"
33#include "XPathException.h"
34#include "XPathNSResolver.h"
35#include "XPathPath.h"
36#include "XPathStep.h"
37#include <wtf/StdLibExtras.h>
38#include <wtf/text/StringHash.h>
39
40using namespace WebCore;
41using namespace WTF;
42using namespace Unicode;
43using namespace XPath;
44
45extern int xpathyyparse(WebCore::XPath::Parser*);
46#include "XPathGrammar.h"
47
48Parser* Parser::currentParser = 0;
49
50enum XMLCat { NameStart, NameCont, NotPartOfName };
51
52typedef HashMap<String, Step::Axis> AxisNamesMap;
53
54static XMLCat charCat(UChar aChar)
55{
56    //### might need to add some special cases from the XML spec.
57
58    if (aChar == '_')
59        return NameStart;
60
61    if (aChar == '.' || aChar == '-')
62        return NameCont;
63    CharCategory category = Unicode::category(aChar);
64    if (category & (Letter_Uppercase | Letter_Lowercase | Letter_Other | Letter_Titlecase | Number_Letter))
65        return NameStart;
66    if (category & (Mark_NonSpacing | Mark_SpacingCombining | Mark_Enclosing | Letter_Modifier | Number_DecimalDigit))
67        return NameCont;
68    return NotPartOfName;
69}
70
71static void setUpAxisNamesMap(AxisNamesMap& axisNames)
72{
73    struct AxisName {
74        const char* name;
75        Step::Axis axis;
76    };
77    const AxisName axisNameList[] = {
78        { "ancestor", Step::AncestorAxis },
79        { "ancestor-or-self", Step::AncestorOrSelfAxis },
80        { "attribute", Step::AttributeAxis },
81        { "child", Step::ChildAxis },
82        { "descendant", Step::DescendantAxis },
83        { "descendant-or-self", Step::DescendantOrSelfAxis },
84        { "following", Step::FollowingAxis },
85        { "following-sibling", Step::FollowingSiblingAxis },
86        { "namespace", Step::NamespaceAxis },
87        { "parent", Step::ParentAxis },
88        { "preceding", Step::PrecedingAxis },
89        { "preceding-sibling", Step::PrecedingSiblingAxis },
90        { "self", Step::SelfAxis }
91    };
92    for (unsigned i = 0; i < sizeof(axisNameList) / sizeof(axisNameList[0]); ++i)
93        axisNames.set(axisNameList[i].name, axisNameList[i].axis);
94}
95
96static bool isAxisName(const String& name, Step::Axis& type)
97{
98    DEFINE_STATIC_LOCAL(AxisNamesMap, axisNames, ());
99
100    if (axisNames.isEmpty())
101        setUpAxisNamesMap(axisNames);
102
103    AxisNamesMap::iterator it = axisNames.find(name);
104    if (it == axisNames.end())
105        return false;
106    type = it->value;
107    return true;
108}
109
110static bool isNodeTypeName(const String& name)
111{
112    DEFINE_STATIC_LOCAL(HashSet<String>, nodeTypeNames, ());
113    if (nodeTypeNames.isEmpty()) {
114        nodeTypeNames.add("comment");
115        nodeTypeNames.add("text");
116        nodeTypeNames.add("processing-instruction");
117        nodeTypeNames.add("node");
118    }
119    return nodeTypeNames.contains(name);
120}
121
122// Returns whether the current token can possibly be a binary operator, given
123// the previous token. Necessary to disambiguate some of the operators
124// (* (multiply), div, and, or, mod) in the [32] Operator rule
125// (check http://www.w3.org/TR/xpath#exprlex).
126bool Parser::isBinaryOperatorContext() const
127{
128    switch (m_lastTokenType) {
129    case 0:
130    case '@': case AXISNAME: case '(': case '[': case ',':
131    case AND: case OR: case MULOP:
132    case '/': case SLASHSLASH: case '|': case PLUS: case MINUS:
133    case EQOP: case RELOP:
134        return false;
135    default:
136        return true;
137    }
138}
139
140void Parser::skipWS()
141{
142    while (m_nextPos < m_data.length() && isSpaceOrNewline(m_data[m_nextPos]))
143        ++m_nextPos;
144}
145
146Token Parser::makeTokenAndAdvance(int code, int advance)
147{
148    m_nextPos += advance;
149    return Token(code);
150}
151
152Token Parser::makeTokenAndAdvance(int code, NumericOp::Opcode val, int advance)
153{
154    m_nextPos += advance;
155    return Token(code, val);
156}
157
158Token Parser::makeTokenAndAdvance(int code, EqTestOp::Opcode val, int advance)
159{
160    m_nextPos += advance;
161    return Token(code, val);
162}
163
164// Returns next char if it's there and interesting, 0 otherwise
165char Parser::peekAheadHelper()
166{
167    if (m_nextPos + 1 >= m_data.length())
168        return 0;
169    UChar next = m_data[m_nextPos + 1];
170    if (next >= 0xff)
171        return 0;
172    return next;
173}
174
175char Parser::peekCurHelper()
176{
177    if (m_nextPos >= m_data.length())
178        return 0;
179    UChar next = m_data[m_nextPos];
180    if (next >= 0xff)
181        return 0;
182    return next;
183}
184
185Token Parser::lexString()
186{
187    UChar delimiter = m_data[m_nextPos];
188    int startPos = m_nextPos + 1;
189
190    for (m_nextPos = startPos; m_nextPos < m_data.length(); ++m_nextPos) {
191        if (m_data[m_nextPos] == delimiter) {
192            String value = m_data.substring(startPos, m_nextPos - startPos);
193            if (value.isNull())
194                value = "";
195            ++m_nextPos; // Consume the char.
196            return Token(LITERAL, value);
197        }
198    }
199
200    // Ouch, went off the end -- report error.
201    return Token(XPATH_ERROR);
202}
203
204Token Parser::lexNumber()
205{
206    int startPos = m_nextPos;
207    bool seenDot = false;
208
209    // Go until end or a non-digits character.
210    for (; m_nextPos < m_data.length(); ++m_nextPos) {
211        UChar aChar = m_data[m_nextPos];
212        if (aChar >= 0xff) break;
213
214        if (aChar < '0' || aChar > '9') {
215            if (aChar == '.' && !seenDot)
216                seenDot = true;
217            else
218                break;
219        }
220    }
221
222    return Token(NUMBER, m_data.substring(startPos, m_nextPos - startPos));
223}
224
225bool Parser::lexNCName(String& name)
226{
227    int startPos = m_nextPos;
228    if (m_nextPos >= m_data.length())
229        return false;
230
231    if (charCat(m_data[m_nextPos]) != NameStart)
232        return false;
233
234    // Keep going until we get a character that's not good for names.
235    for (; m_nextPos < m_data.length(); ++m_nextPos)
236        if (charCat(m_data[m_nextPos]) == NotPartOfName)
237            break;
238
239    name = m_data.substring(startPos, m_nextPos - startPos);
240    return true;
241}
242
243bool Parser::lexQName(String& name)
244{
245    String n1;
246    if (!lexNCName(n1))
247        return false;
248
249    skipWS();
250
251    // If the next character is :, what we just got it the prefix, if not,
252    // it's the whole thing.
253    if (peekAheadHelper() != ':') {
254        name = n1;
255        return true;
256    }
257
258    String n2;
259    if (!lexNCName(n2))
260        return false;
261
262    name = n1 + ":" + n2;
263    return true;
264}
265
266Token Parser::nextTokenInternal()
267{
268    skipWS();
269
270    if (m_nextPos >= m_data.length())
271        return Token(0);
272
273    char code = peekCurHelper();
274    switch (code) {
275    case '(': case ')': case '[': case ']':
276    case '@': case ',': case '|':
277        return makeTokenAndAdvance(code);
278    case '\'':
279    case '\"':
280        return lexString();
281    case '0': case '1': case '2': case '3': case '4':
282    case '5': case '6': case '7': case '8': case '9':
283        return lexNumber();
284    case '.': {
285        char next = peekAheadHelper();
286        if (next == '.')
287            return makeTokenAndAdvance(DOTDOT, 2);
288        if (next >= '0' && next <= '9')
289            return lexNumber();
290        return makeTokenAndAdvance('.');
291    }
292    case '/':
293        if (peekAheadHelper() == '/')
294            return makeTokenAndAdvance(SLASHSLASH, 2);
295        return makeTokenAndAdvance('/');
296    case '+':
297        return makeTokenAndAdvance(PLUS);
298    case '-':
299        return makeTokenAndAdvance(MINUS);
300    case '=':
301        return makeTokenAndAdvance(EQOP, EqTestOp::OP_EQ);
302    case '!':
303        if (peekAheadHelper() == '=')
304            return makeTokenAndAdvance(EQOP, EqTestOp::OP_NE, 2);
305        return Token(XPATH_ERROR);
306    case '<':
307        if (peekAheadHelper() == '=')
308            return makeTokenAndAdvance(RELOP, EqTestOp::OP_LE, 2);
309        return makeTokenAndAdvance(RELOP, EqTestOp::OP_LT);
310    case '>':
311        if (peekAheadHelper() == '=')
312            return makeTokenAndAdvance(RELOP, EqTestOp::OP_GE, 2);
313        return makeTokenAndAdvance(RELOP, EqTestOp::OP_GT);
314    case '*':
315        if (isBinaryOperatorContext())
316            return makeTokenAndAdvance(MULOP, NumericOp::OP_Mul);
317        ++m_nextPos;
318        return Token(NAMETEST, "*");
319    case '$': { // $ QName
320        m_nextPos++;
321        String name;
322        if (!lexQName(name))
323            return Token(XPATH_ERROR);
324        return Token(VARIABLEREFERENCE, name);
325    }
326    }
327
328    String name;
329    if (!lexNCName(name))
330        return Token(XPATH_ERROR);
331
332    skipWS();
333    // If we're in an operator context, check for any operator names
334    if (isBinaryOperatorContext()) {
335        if (name == "and") //### hash?
336            return Token(AND);
337        if (name == "or")
338            return Token(OR);
339        if (name == "mod")
340            return Token(MULOP, NumericOp::OP_Mod);
341        if (name == "div")
342            return Token(MULOP, NumericOp::OP_Div);
343    }
344
345    // See whether we are at a :
346    if (peekCurHelper() == ':') {
347        m_nextPos++;
348        // Any chance it's an axis name?
349        if (peekCurHelper() == ':') {
350            m_nextPos++;
351
352            //It might be an axis name.
353            Step::Axis axis;
354            if (isAxisName(name, axis))
355                return Token(AXISNAME, axis);
356            // Ugh, :: is only valid in axis names -> error
357            return Token(XPATH_ERROR);
358        }
359
360        // Seems like this is a fully qualified qname, or perhaps the * modified one from NameTest
361        skipWS();
362        if (peekCurHelper() == '*') {
363            m_nextPos++;
364            return Token(NAMETEST, name + ":*");
365        }
366
367        // Make a full qname.
368        String n2;
369        if (!lexNCName(n2))
370            return Token(XPATH_ERROR);
371
372        name = name + ":" + n2;
373    }
374
375    skipWS();
376    if (peekCurHelper() == '(') {
377        //note: we don't swallow the (here!
378
379        //either node type of function name
380        if (isNodeTypeName(name)) {
381            if (name == "processing-instruction")
382                return Token(PI, name);
383
384            return Token(NODETYPE, name);
385        }
386        //must be a function name.
387        return Token(FUNCTIONNAME, name);
388    }
389
390    // At this point, it must be NAMETEST.
391    return Token(NAMETEST, name);
392}
393
394Token Parser::nextToken()
395{
396    Token toRet = nextTokenInternal();
397    m_lastTokenType = toRet.type;
398    return toRet;
399}
400
401Parser::Parser()
402{
403    reset(String());
404}
405
406Parser::~Parser()
407{
408}
409
410void Parser::reset(const String& data)
411{
412    m_nextPos = 0;
413    m_data = data;
414    m_lastTokenType = 0;
415
416    m_topExpr = 0;
417    m_gotNamespaceError = false;
418}
419
420int Parser::lex(void* data)
421{
422    YYSTYPE* yylval = static_cast<YYSTYPE*>(data);
423    Token tok = nextToken();
424
425    switch (tok.type) {
426    case AXISNAME:
427        yylval->axis = tok.axis;
428        break;
429    case MULOP:
430        yylval->numop = tok.numop;
431        break;
432    case RELOP:
433    case EQOP:
434        yylval->eqop = tok.eqop;
435        break;
436    case NODETYPE:
437    case PI:
438    case FUNCTIONNAME:
439    case LITERAL:
440    case VARIABLEREFERENCE:
441    case NUMBER:
442    case NAMETEST:
443        yylval->str = new String(tok.str);
444        registerString(yylval->str);
445        break;
446    }
447
448    return tok.type;
449}
450
451bool Parser::expandQName(const String& qName, String& localName, String& namespaceURI)
452{
453    size_t colon = qName.find(':');
454    if (colon != notFound) {
455        if (!m_resolver)
456            return false;
457        namespaceURI = m_resolver->lookupNamespaceURI(qName.left(colon));
458        if (namespaceURI.isNull())
459            return false;
460        localName = qName.substring(colon + 1);
461    } else
462        localName = qName;
463
464    return true;
465}
466
467Expression* Parser::parseStatement(const String& statement, PassRefPtr<XPathNSResolver> resolver, ExceptionCode& ec)
468{
469    reset(statement);
470
471    m_resolver = resolver;
472
473    Parser* oldParser = currentParser;
474    currentParser = this;
475    int parseError = xpathyyparse(this);
476    currentParser = oldParser;
477
478    if (parseError) {
479        deleteAllValues(m_parseNodes);
480        m_parseNodes.clear();
481
482        HashSet<Vector<Predicate*>*>::iterator pend = m_predicateVectors.end();
483        for (HashSet<Vector<Predicate*>*>::iterator it = m_predicateVectors.begin(); it != pend; ++it) {
484            deleteAllValues(**it);
485            delete *it;
486        }
487        m_predicateVectors.clear();
488
489        HashSet<Vector<Expression*>*>::iterator eend = m_expressionVectors.end();
490        for (HashSet<Vector<Expression*>*>::iterator it = m_expressionVectors.begin(); it != eend; ++it) {
491            deleteAllValues(**it);
492            delete *it;
493        }
494        m_expressionVectors.clear();
495
496        deleteAllValues(m_strings);
497        m_strings.clear();
498
499        deleteAllValues(m_nodeTests);
500        m_nodeTests.clear();
501
502        m_topExpr = 0;
503
504        if (m_gotNamespaceError)
505            ec = NAMESPACE_ERR;
506        else
507            ec = XPathException::INVALID_EXPRESSION_ERR;
508        return 0;
509    }
510
511    ASSERT(m_parseNodes.size() == 1);
512    ASSERT(*m_parseNodes.begin() == m_topExpr);
513    ASSERT(m_expressionVectors.size() == 0);
514    ASSERT(m_predicateVectors.size() == 0);
515    ASSERT(m_strings.size() == 0);
516    ASSERT(m_nodeTests.size() == 0);
517
518    m_parseNodes.clear();
519    Expression* result = m_topExpr;
520    m_topExpr = 0;
521
522    return result;
523}
524
525void Parser::registerParseNode(ParseNode* node)
526{
527    if (node == 0)
528        return;
529
530    ASSERT(!m_parseNodes.contains(node));
531
532    m_parseNodes.add(node);
533}
534
535void Parser::unregisterParseNode(ParseNode* node)
536{
537    if (node == 0)
538        return;
539
540    ASSERT(m_parseNodes.contains(node));
541
542    m_parseNodes.remove(node);
543}
544
545void Parser::registerPredicateVector(Vector<Predicate*>* vector)
546{
547    if (vector == 0)
548        return;
549
550    ASSERT(!m_predicateVectors.contains(vector));
551
552    m_predicateVectors.add(vector);
553}
554
555void Parser::deletePredicateVector(Vector<Predicate*>* vector)
556{
557    if (vector == 0)
558        return;
559
560    ASSERT(m_predicateVectors.contains(vector));
561
562    m_predicateVectors.remove(vector);
563    delete vector;
564}
565
566
567void Parser::registerExpressionVector(Vector<Expression*>* vector)
568{
569    if (vector == 0)
570        return;
571
572    ASSERT(!m_expressionVectors.contains(vector));
573
574    m_expressionVectors.add(vector);
575}
576
577void Parser::deleteExpressionVector(Vector<Expression*>* vector)
578{
579    if (vector == 0)
580        return;
581
582    ASSERT(m_expressionVectors.contains(vector));
583
584    m_expressionVectors.remove(vector);
585    delete vector;
586}
587
588void Parser::registerString(String* s)
589{
590    if (s == 0)
591        return;
592
593    ASSERT(!m_strings.contains(s));
594
595    m_strings.add(s);
596}
597
598void Parser::deleteString(String* s)
599{
600    if (s == 0)
601        return;
602
603    ASSERT(m_strings.contains(s));
604
605    m_strings.remove(s);
606    delete s;
607}
608
609void Parser::registerNodeTest(Step::NodeTest* t)
610{
611    if (t == 0)
612        return;
613
614    ASSERT(!m_nodeTests.contains(t));
615
616    m_nodeTests.add(t);
617}
618
619void Parser::deleteNodeTest(Step::NodeTest* t)
620{
621    if (t == 0)
622        return;
623
624    ASSERT(m_nodeTests.contains(t));
625
626    m_nodeTests.remove(t);
627    delete t;
628}
629
630