oga/lib/oga/xpath/parser.rll

%header
{
##
# AST parser for XPath expressions. The AST is built using `AST::Node`
# instances.
#
# Unlike {Oga::XML::Parser} this parser only takes String instances as input.
#
}

%name Oga::XPath::Parser;

%terminals T_AXIS T_COLON T_COMMA T_FLOAT T_INT T_IDENT T_TYPE_TEST;
%terminals T_LBRACK T_RBRACK T_LPAREN T_RPAREN T_SLASH T_STRING;
%terminals T_PIPE T_AND T_OR T_ADD T_DIV T_MOD T_EQ T_NEQ T_LT T_GT T_LTE T_GTE;
%terminals T_SUB T_MUL T_VAR;

# Top-level rule (the first rule of the grammar): a query consists of a single
# optional expression, so the grammar also accepts input without any tokens.
xpath
  = expression?
  ;

# Expressions And Operators
#
# Operators are handled by using a mixture of iteration (in the form of the *
# operator), recursion and priorities. Priorities are handled by recursing into
# certain rules before processing others.
#
# These rules are largely based on the following resources:
#
# * http://www.w3.org/TR/xquery-xpath-parsing/#XPath-EBNF
# * http://blog.jwbroek.com/2010/07/antlr-grammar-for-parsing-xpath-10.html
#

# Outermost operator level: one `and_expr` followed by any number of T_OR
# continuations. `combine_operators` folds the operands into nested nodes.
expression
  = and_expr expression_follow* { combine_operators(val) }
  ;

# A single T_OR continuation, returned as an [operator, operand] pair.
expression_follow
  = T_OR and_expr { [:or, val[1]] }
  ;

# T_AND level: one `equality_expr` followed by any number of T_AND
# continuations, folded into nested nodes by `combine_operators`.
and_expr
  = equality_expr and_expr_follow* { combine_operators(val) }
  ;

# A single T_AND continuation, returned as an [operator, operand] pair.
and_expr_follow
  = T_AND equality_expr { [:and, val[1]] }
  ;

# Equality level (T_EQ, T_NEQ): one `relational_expr` followed by any number
# of equality continuations, folded into nested nodes by `combine_operators`.
equality_expr
  = relational_expr equality_expr_follow* { combine_operators(val) }
  ;

# A single equality continuation, returned as an [operator, operand] pair
# where the operator is :eq or :neq.
equality_expr_follow
  = T_EQ  relational_expr { [:eq, val[1]] }
  | T_NEQ relational_expr { [:neq, val[1]] }
  ;

# Relational level (T_LT, T_GT, T_LTE, T_GTE): one `additive_expr` followed by
# any number of relational continuations, folded by `combine_operators`.
relational_expr
  = additive_expr relational_expr_follow* { combine_operators(val) }
  ;

# A single relational continuation, returned as an [operator, operand] pair
# where the operator is one of :lt, :gt, :lte or :gte.
relational_expr_follow
  = T_LT  additive_expr { [:lt, val[1]] }
  | T_GT  additive_expr { [:gt, val[1]] }
  | T_LTE additive_expr { [:lte, val[1]] }
  | T_GTE additive_expr { [:gte, val[1]] }
  ;

# Additive level (T_ADD, T_SUB): one `mult_expr` followed by any number of
# additive continuations, folded into nested nodes by `combine_operators`.
additive_expr
  = mult_expr additive_expr_follow* { combine_operators(val) }
  ;

# A single additive continuation, returned as an [operator, operand] pair
# where the operator is :add or :sub.
additive_expr_follow
  = T_ADD mult_expr { [:add, val[1]] }
  | T_SUB mult_expr { [:sub, val[1]] }
  ;

mult_expr
  = union_expr mult_expr_follow? { combine_optional_operator(val) }
  ;

mult_expr_follow
  = T_DIV mult_expr { [:div, val[1]] }
  | T_MOD mult_expr { [:mod, val[1]] }
  | T_MUL mult_expr { [:mul, val[1]] }
  ;

# Union level (T_PIPE): one `expression_member` followed by any number of
# T_PIPE continuations, folded into nested nodes by `combine_operators`.
union_expr
  = expression_member union_expr_follow* { combine_operators(val) }
  ;

# A single T_PIPE continuation, returned as an [operator, operand] pair.
union_expr_follow
  = T_PIPE expression_member { [:pipe, val[1]] }
  ;

# The operands that operators work on: paths, literals, variables, or a full
# expression wrapped in parentheses. For the parenthesised form only the inner
# expression is returned; no node is created for the parentheses themselves.
expression_member
  = relative_path
  | absolute_path
  | string
  | number
  | variable
  | T_LPAREN expression T_RPAREN { val[1] }
  ;

# A, A/B, etc
#
# A path made of a single step is returned as that step itself; two or more
# steps are wrapped in a (path) node.
relative_path
  = path_steps { val[0].length > 1 ? s(:path, *val[0]) : val[0][0] }
  ;

# One step followed by any number of T_SLASH separated steps, returned as a
# flat Array of step nodes.
path_steps
  = path_step_or_axis path_steps_follow* { [val[0], *val[1]] }
  ;

# A single "T_SLASH step" continuation; only the step is returned.
path_steps_follow
  = T_SLASH path_step_or_axis { val[1] }
  ;

# /A, /A/B, etc
#
# The steps are optional, so a lone T_SLASH is accepted as well. In that case
# nothing is splatted and the (absolute_path) node has no children.
absolute_path
  = T_SLASH path_steps? { s(:absolute_path, *val[1]) }
  ;

# A single path segment: either a step without an explicit axis or a step
# with one (see `axis`).
path_step_or_axis
  = path_step
  | axis
  ;

# A, A(), A(X), etc
#
# A step that starts with an identifier is either a call or a node test,
# depending on what `path_step_follow` matched. A bare type test is wrapped in
# a "child" axis node too.
path_step
  = T_IDENT path_step_follow
    {
      # path_step_follow returns [:call, args] for calls and
      # [:test, name or nil, predicate or nil] for tests.
      type = val[1][0]
      args = val[1][1]
      pred = val[1][2]

      if type == :test
        # Whenever a bare test is used (e.g. just "A") this actually means
        # "child::A". Handling this on parser level is the easiest.
        if args
          # "A:B" form: val[0] is the identifier before the colon, args the
          # identifier after it.
          node = s(:axis, 'child', s(:test, val[0], args))
        else
          # Plain "A": the first child of the test node is nil.
          node = s(:axis, 'child', s(:test, nil, val[0]))
        end
      else
        # Call: val[0] is the identifier before the parentheses. A nil args
        # value splats to nothing.
        node = s(type, val[0], *args)
      end

      if pred
        node = s(:predicate, node, pred)
      end

      node
    }
  | type_test { s(:axis, 'child', val[0]) }
  ;

# Whatever follows the leading identifier of a `path_step`. The first element
# of the returned Array tells `path_step` what it is dealing with:
#
# * [:call, args] for "(...)"
# * [:test, name, predicate] for ":name" with an optional predicate
# * [:test, nil, predicate] for just an optional predicate (possibly nothing)
path_step_follow
  = T_LPAREN call_args T_RPAREN { [:call, val[1]] }
  | T_COLON T_IDENT predicate?  { [:test, val[1], val[2]] }
  | predicate?                  { [:test, nil, val[0]] }
  ;

# An expression between T_LBRACK and T_RBRACK; only the expression itself is
# returned, the rules using `predicate` build the (predicate) node.
predicate
  = T_LBRACK expression T_RBRACK { val[1] }
  ;

# Wraps the value of a T_TYPE_TEST token in a (type_test) node.
type_test
  = T_TYPE_TEST { s(:type_test, val[0]) }
  ;

# Regular test (e.g. tests used as axis values)
#
# "A:B" produces (test A B), a plain "A" produces (test nil A).
test
  = T_IDENT test_follow?
    {
      val[1] ? s(:test, val[0], val[1]) : s(:test, nil, val[0])
    }
  ;

# The ":B" part of an "A:B" test; only the identifier after the colon is
# returned.
test_follow
  = T_COLON T_IDENT { val[1] }
  ;

# Arguments of a call: one or more T_COMMA separated expressions, returned as
# a flat Array. The `_` alternative matches nothing, which allows the argument
# list to be empty.
call_args
  = expression call_args_follow* { [val[0], *val[1]] }
  | _
  ;

# A single "T_COMMA expression" continuation; only the expression is returned.
call_args_follow
  = T_COMMA expression { val[1] }
  ;

# child::foo, descendant-or-self::foo, etc
#
# Builds an (axis) node from the T_AXIS value and the test that follows it.
# When a predicate is present the axis node is wrapped in a (predicate) node.
axis
  = T_AXIS axis_value predicate?
    {
      ret = s(:axis, val[0], val[1])

      if val[2]
        ret = s(:predicate, ret, val[2])
      end

      ret
    }
  ;

# What may follow an explicit axis: a regular test or a type test.
axis_value
  = test
  | type_test
  ;

# Wraps the value of a T_STRING token in a (string) node.
string
  = T_STRING { s(:string, val[0]) };

# Numeric literals: T_INT becomes an (int) node, T_FLOAT a (float) node.
number
  = T_INT   { s(:int, val[0]) }
  | T_FLOAT { s(:float, val[0]) }
  ;

# Wraps the value of a T_VAR token in a (var) node.
variable
  = T_VAR { s(:var, val[0]) }
  ;

%inner
{
  ##
  # Cache used by {parse_with_cache}. Entries are looked up using the input
  # String passed to that method.
  #
  # @return [Oga::LRU]
  #
  CACHE = LRU.new

  ##
  # Looks up the AST for the given input in {CACHE}. The block handed to the
  # cache, which builds a new parser and parses the input, supplies the value
  # for `get_or_set` to store.
  #
  # @param [String] data
  # @return [AST::Node]
  #
  def self.parse_with_cache(data)
    CACHE.get_or_set(data) do
      new(data).parse
    end
  end

  ##
  # Creates a `Lexer` for the input and stores it in `@lexer`; {each_token}
  # reads its tokens from there.
  #
  # @param [String] data The input to parse.
  #
  def initialize(data)
    @lexer = Lexer.new(data)
  end

  ##
  # Shorthand for building an XPath AST node. All arguments after the type are
  # collected into the Array of children handed to `AST::Node`.
  #
  # @param [Symbol] type
  # @param [Array] children
  # @return [AST::Node]
  #
  def s(type, *children)
    AST::Node.new(type, children)
  end

  ##
  # Feeds tokens to the parser. Every token the lexer's `advance` method
  # passes to its block is yielded as a `[type, value]` pair. After `advance`
  # returns a final `[-1, -1]` pair is yielded. The most recent non-nil line
  # number reported by the lexer is kept in `@line`.
  #
  # @yieldparam [Array] token A `[type, value]` pair.
  #
  def each_token
    @lexer.advance do |kind, text, lineno|
      if lineno
        @line = lineno
      end

      yield([kind, text])
    end

    yield([-1, -1])
  end

  ##
  # Folds an operand and a list of `[operator, operand]` pairs into nested
  # operator nodes, grouping from the left. For example,
  # `[a, [[:add, b], [:sub, c]]]` becomes `(sub (add a b) c)`.
  #
  # @param [Array] val The first operand followed by the Array of pairs.
  # @return [AST::Node] The first operand as-is when there are no pairs.
  #
  def combine_operators(val)
    first_operand = val[0]
    pairs         = val[1]

    pairs.reduce(first_operand) do |left, pair|
      s(pair[0], left, pair[1])
    end
  end

  ##
  # Combines an operand with an optional `[operator, operand]` pair. Without
  # a pair the first operand is returned unchanged, otherwise a node for the
  # operator is built with the first operand on the left-hand side.
  #
  # @param [Array] val The first operand followed by the pair or nil.
  # @return [AST::Node]
  #
  def combine_optional_operator(val)
    left   = val[0]
    follow = val[1]

    return left unless follow

    s(follow[0], left, follow[1])
  end
}