From 70f3b7fa922e4498838934f30f4a279e3e06f463 Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Mon, 9 Jun 2014 23:35:54 +0200 Subject: [PATCH] Lex XPath operators using individual tokens. Instead of lexing every operator as T_OP they now use individual tokens such as T_EQ and T_LT. --- lib/oga/xpath/lexer.rl | 53 +++--- spec/oga/xpath/lexer/axes_spec.rb | 19 ++ spec/oga/xpath/lexer/calls_spec.rb | 33 ++++ spec/oga/xpath/lexer/general_spec.rb | 71 ++++++++ spec/oga/xpath/lexer/operators_spec.rb | 61 +++++++ spec/oga/xpath/lexer/predicates_spec.rb | 116 +++++++++++++ spec/oga/xpath/lexer_spec.rb | 222 ------------------------ 7 files changed, 331 insertions(+), 244 deletions(-) create mode 100644 spec/oga/xpath/lexer/axes_spec.rb create mode 100644 spec/oga/xpath/lexer/calls_spec.rb create mode 100644 spec/oga/xpath/lexer/general_spec.rb create mode 100644 spec/oga/xpath/lexer/operators_spec.rb create mode 100644 spec/oga/xpath/lexer/predicates_spec.rb delete mode 100644 spec/oga/xpath/lexer_spec.rb diff --git a/lib/oga/xpath/lexer.rl b/lib/oga/xpath/lexer.rl index b8e7f56..f3df977 100644 --- a/lib/oga/xpath/lexer.rl +++ b/lib/oga/xpath/lexer.rl @@ -239,18 +239,14 @@ module Oga # conflicting with the patterns used for matching identifiers (= # element names and the likes). - operator = '|' - | 'and' - | 'or' - | '+' - | 'div' - | 'mod' - | '=' - | '!=' - | '<' - | '>' - | '<=' - | '>='; + op_pipe = '|' %{ add_token(:T_PIPE) }; + op_plus = '+' %{ add_token(:T_ADD) }; + op_eq = '=' %{ add_token(:T_EQ) }; + op_neq = '!=' %{ add_token(:T_NEQ) }; + op_lt = '<' %{ add_token(:T_LT) }; + op_gt = '>' %{ add_token(:T_GT) }; + op_lte = '<=' %{ add_token(:T_LTE) }; + op_gte = '>=' %{ add_token(:T_GTE) }; # These operators require whitespace around them in order to be lexed # as operators. This is due to "-" being allowed in node names and "*" @@ -259,24 +255,36 @@ module Oga # THINK: relying on whitespace is a rather fragile solution, even # though the W3 actually recommends this for the "-" operator. Perhaps # there's a better way of doing this. - space_operator = space ('*' | '-') space; - action emit_operator { - emit(:T_OP, ts, te) - } + op_and = ' and ' %{ add_token(:T_AND) }; + op_or = ' or ' %{ add_token(:T_OR) }; + op_div = ' div ' %{ add_token(:T_DIV) }; + op_mod = ' mod ' %{ add_token(:T_MOD) }; + op_mul = ' * ' %{ add_token(:T_MUL) }; + op_sub = ' - ' %{ add_token(:T_SUB) }; - action emit_space_operator { - emit(:T_OP, ts + 1, te - 1) - } + operator = op_pipe + | op_and + | op_or + | op_plus + | op_div + | op_mod + | op_eq + | op_neq + | op_lt + | op_gt + | op_lte + | op_gte + | op_mul + | op_sub + ; # Machine that handles the lexing of data inside an XPath predicate. # When bumping into a "]" the lexer jumps back to the `main` machine. predicate := |* + operator; whitespace | slash | lparen | rparen | comma | colon | star; - operator => emit_operator; - space_operator => emit_space_operator; - string => emit_string; integer => emit_integer; float => emit_float; @@ -291,6 +299,7 @@ module Oga *|; main := |* + operator; whitespace | slash | lparen | rparen | comma | colon | star; '[' => { diff --git a/spec/oga/xpath/lexer/axes_spec.rb b/spec/oga/xpath/lexer/axes_spec.rb new file mode 100644 index 0000000..daf3ed5 --- /dev/null +++ b/spec/oga/xpath/lexer/axes_spec.rb @@ -0,0 +1,19 @@ +require 'spec_helper' + +describe Oga::XPath::Lexer do + context 'axes' do + example 'lex an axis using the full syntax form' do + lex_xpath('/parent::node()').should == [ + [:T_SLASH, nil], + [:T_AXIS, 'parent'], + [:T_IDENT, 'node'], + [:T_LPAREN, nil], + [:T_RPAREN, nil] + ] + end + + example 'lex an axis using the short syntax form' do + lex_xpath('/..').should == [[:T_SLASH, nil], [:T_AXIS, 'parent']] + end + end +end diff --git a/spec/oga/xpath/lexer/calls_spec.rb b/spec/oga/xpath/lexer/calls_spec.rb new file mode 100644 index 0000000..de9c780 --- /dev/null +++ b/spec/oga/xpath/lexer/calls_spec.rb @@ -0,0 +1,33 @@ +require 'spec_helper' + +describe Oga::XPath::Lexer do + context 'function calls' do + example 'lex a function call without arguments' do + lex_xpath('count()').should == [ + [:T_IDENT, 'count'], + [:T_LPAREN, nil], + [:T_RPAREN, nil] + ] + end + + example 'lex a function call with a single argument' do + lex_xpath('count(foo)').should == [ + [:T_IDENT, 'count'], + [:T_LPAREN, nil], + [:T_IDENT, 'foo'], + [:T_RPAREN, nil] + ] + end + + example 'lex a function call with two arguments' do + lex_xpath('count(foo, bar)').should == [ + [:T_IDENT, 'count'], + [:T_LPAREN, nil], + [:T_IDENT, 'foo'], + [:T_COMMA, nil], + [:T_IDENT, 'bar'], + [:T_RPAREN, nil] + ] + end + end +end diff --git a/spec/oga/xpath/lexer/general_spec.rb b/spec/oga/xpath/lexer/general_spec.rb new file mode 100644 index 0000000..d3e00af --- /dev/null +++ b/spec/oga/xpath/lexer/general_spec.rb @@ -0,0 +1,71 @@ +require 'spec_helper' + +describe Oga::XPath::Lexer do + context 'general' do + example 'lex a simple expression' do + lex_xpath('/foo').should == [[:T_SLASH, nil], [:T_IDENT, 'foo']] + end + + example 'lex a node test using a namespace' do + lex_xpath('/foo:bar').should == [ + [:T_SLASH, nil], + [:T_IDENT, 'foo'], + [:T_COLON, nil], + [:T_IDENT, 'bar'] + ] + end + + example 'lex a whildcard node test' do + lex_xpath('/*').should == [[:T_SLASH, nil], [:T_STAR, nil]] + end + + example 'lex a wildcard node test for a namespace' do + lex_xpath('/*:foo').should == [ + [:T_SLASH, nil], + [:T_STAR, nil], + [:T_COLON, nil], + [:T_IDENT, 'foo'] + ] + end + + # The following are a bunch of examples taken from Wikipedia and the W3 + # spec to see how the lexer handles them. + + example 'lex an descendant-or-self expression' do + lex_xpath('/wikimedia//editions').should == [ + [:T_SLASH, nil], + [:T_IDENT, 'wikimedia'], + [:T_SLASH, nil], + [:T_AXIS, 'descendant-or-self'], + [:T_IDENT, 'editions'] + ] + end + + example 'lex a complex expression using predicates and function calls' do + path = '/wikimedia/projects/project[@name="Wikipedia"]/editions/edition/text()' + + lex_xpath(path).should == [ + [:T_SLASH, nil], + [:T_IDENT, 'wikimedia'], + [:T_SLASH, nil], + [:T_IDENT, 'projects'], + [:T_SLASH, nil], + [:T_IDENT, 'project'], + [:T_LBRACK, nil], + [:T_AXIS, 'attribute'], + [:T_IDENT, 'name'], + [:T_EQ, nil], + [:T_STRING, 'Wikipedia'], + [:T_RBRACK, nil], + [:T_SLASH, nil], + [:T_IDENT, 'editions'], + [:T_SLASH, nil], + [:T_IDENT, 'edition'], + [:T_SLASH, nil], + [:T_IDENT, 'text'], + [:T_LPAREN, nil], + [:T_RPAREN, nil] + ] + end + end +end diff --git a/spec/oga/xpath/lexer/operators_spec.rb b/spec/oga/xpath/lexer/operators_spec.rb new file mode 100644 index 0000000..3d58d18 --- /dev/null +++ b/spec/oga/xpath/lexer/operators_spec.rb @@ -0,0 +1,61 @@ +require 'spec_helper' + +describe Oga::XPath::Lexer do + context 'operators' do + example 'lex the pipe operator' do + lex_xpath('|').should == [[:T_PIPE, nil]] + end + + example 'lex the and operator' do + lex_xpath(' and ').should == [[:T_AND, nil]] + end + + example 'lex the or operator' do + lex_xpath(' or ').should == [[:T_OR, nil]] + end + + example 'lex the plus operator' do + lex_xpath('+').should == [[:T_ADD, nil]] + end + + example 'lex the div operator' do + lex_xpath(' div ').should == [[:T_DIV, nil]] + end + + example 'lex the mod operator' do + lex_xpath(' mod ').should == [[:T_MOD, nil]] + end + + example 'lex the equals operator' do + lex_xpath('=').should == [[:T_EQ, nil]] + end + + example 'lex the not-equals operator' do + lex_xpath('!=').should == [[:T_NEQ, nil]] + end + + example 'lex the lower-than operator' do + lex_xpath('<').should == [[:T_LT, nil]] + end + + example 'lex the greater-than operator' do + lex_xpath('>').should == [[:T_GT, nil]] + end + + example 'lex the lower-or-equal operator' do + lex_xpath('<=').should == [[:T_LTE, nil]] + end + + example 'lex the greater-or-equal operator' do + lex_xpath('>=').should == [[:T_GTE, nil]] + end + + example 'lex the mul operator' do + lex_xpath(' * ').should == [[:T_MUL, nil]] + end + + example 'lex the subtraction operator' do + lex_xpath(' - ').should == [[:T_SUB, nil]] + end + end +end diff --git a/spec/oga/xpath/lexer/predicates_spec.rb b/spec/oga/xpath/lexer/predicates_spec.rb new file mode 100644 index 0000000..916b12c --- /dev/null +++ b/spec/oga/xpath/lexer/predicates_spec.rb @@ -0,0 +1,116 @@ +require 'spec_helper' + +describe Oga::XPath::Lexer do + context 'predicates' do + example 'lex a simple predicate expression' do + lex_xpath('/foo[bar]').should == [ + [:T_SLASH, nil], + [:T_IDENT, 'foo'], + [:T_LBRACK, nil], + [:T_IDENT, 'bar'], + [:T_RBRACK, nil] + ] + end + + example 'lex a predicate that checks for equality' do + lex_xpath('/foo[@bar="baz"]').should == [ + [:T_SLASH, nil], + [:T_IDENT, 'foo'], + [:T_LBRACK, nil], + [:T_AXIS, 'attribute'], + [:T_IDENT, 'bar'], + [:T_EQ, nil], + [:T_STRING, 'baz'], + [:T_RBRACK, nil] + ] + end + + example 'lex a predicate that user an integer' do + lex_xpath('/foo[1]').should == [ + [:T_SLASH, nil], + [:T_IDENT, 'foo'], + [:T_LBRACK, nil], + [:T_INT, 1], + [:T_RBRACK, nil] + ] + end + + example 'lex a predicate that uses a float' do + lex_xpath('/foo[1.5]').should == [ + [:T_SLASH, nil], + [:T_IDENT, 'foo'], + [:T_LBRACK, nil], + [:T_FLOAT, 1.5], + [:T_RBRACK, nil] + ] + end + + example 'lex a predicate using a function' do + lex_xpath('/foo[bar()]').should == [ + [:T_SLASH, nil], + [:T_IDENT, 'foo'], + [:T_LBRACK, nil], + [:T_IDENT, 'bar'], + [:T_LPAREN, nil], + [:T_RPAREN, nil], + [:T_RBRACK, nil] + ] + end + + example 'lex a predicate expression using the div operator' do + lex_xpath('/div[@number=4 div 2]').should == [ + [:T_SLASH, nil], + [:T_IDENT, 'div'], + [:T_LBRACK, nil], + [:T_AXIS, 'attribute'], + [:T_IDENT, 'number'], + [:T_EQ, nil], + [:T_INT, 4], + [:T_DIV, nil], + [:T_INT, 2], + [:T_RBRACK, nil] + ] + end + + example 'lex a predicate expression using the * operator' do + lex_xpath('/div[@number=4 * 2]').should == [ + [:T_SLASH, nil], + [:T_IDENT, 'div'], + [:T_LBRACK, nil], + [:T_AXIS, 'attribute'], + [:T_IDENT, 'number'], + [:T_EQ, nil], + [:T_INT, 4], + [:T_MUL, nil], + [:T_INT, 2], + [:T_RBRACK, nil] + ] + end + + example 'lex a predicate expression using axes' do + lex_xpath('/div[/foo/bar]').should == [ + [:T_SLASH, nil], + [:T_IDENT, 'div'], + [:T_LBRACK, nil], + [:T_SLASH, nil], + [:T_IDENT, 'foo'], + [:T_SLASH, nil], + [:T_IDENT, 'bar'], + [:T_RBRACK, nil] + ] + end + + example 'lex a predicate expression using a wildcard' do + lex_xpath('/div[/foo/*]').should == [ + [:T_SLASH, nil], + [:T_IDENT, 'div'], + [:T_LBRACK, nil], + [:T_SLASH, nil], + [:T_IDENT, 'foo'], + [:T_SLASH, nil], + [:T_STAR, nil], + [:T_RBRACK, nil] + ] + end + end +end diff --git a/spec/oga/xpath/lexer_spec.rb b/spec/oga/xpath/lexer_spec.rb deleted file mode 100644 index 8c7861a..0000000 --- a/spec/oga/xpath/lexer_spec.rb +++ /dev/null @@ -1,222 +0,0 @@ -require 'spec_helper' - -describe Oga::XPath::Lexer do - example 'lex a simple expression' do - lex_xpath('/foo').should == [[:T_SLASH, nil], [:T_IDENT, 'foo']] - end - - example 'lex a function call without arguments' do - lex_xpath('count()').should == [ - [:T_IDENT, 'count'], - [:T_LPAREN, nil], - [:T_RPAREN, nil] - ] - end - - example 'lex a function call with a single argument' do - lex_xpath('count(foo)').should == [ - [:T_IDENT, 'count'], - [:T_LPAREN, nil], - [:T_IDENT, 'foo'], - [:T_RPAREN, nil] - ] - end - - example 'lex a function call with two arguments' do - lex_xpath('count(foo, bar)').should == [ - [:T_IDENT, 'count'], - [:T_LPAREN, nil], - [:T_IDENT, 'foo'], - [:T_COMMA, nil], - [:T_IDENT, 'bar'], - [:T_RPAREN, nil] - ] - end - - example 'lex a simple predicate expression' do - lex_xpath('/foo[bar]').should == [ - [:T_SLASH, nil], - [:T_IDENT, 'foo'], - [:T_LBRACK, nil], - [:T_IDENT, 'bar'], - [:T_RBRACK, nil] - ] - end - - example 'lex a predicate that checks for equality' do - lex_xpath('/foo[@bar="baz"]').should == [ - [:T_SLASH, nil], - [:T_IDENT, 'foo'], - [:T_LBRACK, nil], - [:T_AXIS, 'attribute'], - [:T_IDENT, 'bar'], - [:T_OP, '='], - [:T_STRING, 'baz'], - [:T_RBRACK, nil] - ] - end - - example 'lex a predicate that user an integer' do - lex_xpath('/foo[1]').should == [ - [:T_SLASH, nil], - [:T_IDENT, 'foo'], - [:T_LBRACK, nil], - [:T_INT, 1], - [:T_RBRACK, nil] - ] - end - - example 'lex a predicate that uses a float' do - lex_xpath('/foo[1.5]').should == [ - [:T_SLASH, nil], - [:T_IDENT, 'foo'], - [:T_LBRACK, nil], - [:T_FLOAT, 1.5], - [:T_RBRACK, nil] - ] - end - - example 'lex a predicate using a function' do - lex_xpath('/foo[bar()]').should == [ - [:T_SLASH, nil], - [:T_IDENT, 'foo'], - [:T_LBRACK, nil], - [:T_IDENT, 'bar'], - [:T_LPAREN, nil], - [:T_RPAREN, nil], - [:T_RBRACK, nil] - ] - end - - example 'lex an axis using the full syntax form' do - lex_xpath('/parent::node()').should == [ - [:T_SLASH, nil], - [:T_AXIS, 'parent'], - [:T_IDENT, 'node'], - [:T_LPAREN, nil], - [:T_RPAREN, nil] - ] - end - - example 'lex an axis using the short syntax form' do - lex_xpath('/..').should == [[:T_SLASH, nil], [:T_AXIS, 'parent']] - end - - example 'lex a node test using a namespace' do - lex_xpath('/foo:bar').should == [ - [:T_SLASH, nil], - [:T_IDENT, 'foo'], - [:T_COLON, nil], - [:T_IDENT, 'bar'] - ] - end - - example 'lex a whildcard node test' do - lex_xpath('/*').should == [[:T_SLASH, nil], [:T_STAR, nil]] - end - - example 'lex a wildcard node test for a namespace' do - lex_xpath('/*:foo').should == [ - [:T_SLASH, nil], - [:T_STAR, nil], - [:T_COLON, nil], - [:T_IDENT, 'foo'] - ] - end - - example 'lex a predicate expression using the div operator' do - lex_xpath('/div[@number=4 div 2]').should == [ - [:T_SLASH, nil], - [:T_IDENT, 'div'], - [:T_LBRACK, nil], - [:T_AXIS, 'attribute'], - [:T_IDENT, 'number'], - [:T_OP, '='], - [:T_INT, 4], - [:T_OP, 'div'], - [:T_INT, 2], - [:T_RBRACK, nil] - ] - end - - example 'lex a predicate expression using the * operator' do - lex_xpath('/div[@number=4 * 2]').should == [ - [:T_SLASH, nil], - [:T_IDENT, 'div'], - [:T_LBRACK, nil], - [:T_AXIS, 'attribute'], - [:T_IDENT, 'number'], - [:T_OP, '='], - [:T_INT, 4], - [:T_OP, '*'], - [:T_INT, 2], - [:T_RBRACK, nil] - ] - end - - example 'lex a predicate expression using axes' do - lex_xpath('/div[/foo/bar]').should == [ - [:T_SLASH, nil], - [:T_IDENT, 'div'], - [:T_LBRACK, nil], - [:T_SLASH, nil], - [:T_IDENT, 'foo'], - [:T_SLASH, nil], - [:T_IDENT, 'bar'], - [:T_RBRACK, nil] - ] - end - - example 'lex a predicate expression using a wildcard' do - lex_xpath('/div[/foo/*]').should == [ - [:T_SLASH, nil], - [:T_IDENT, 'div'], - [:T_LBRACK, nil], - [:T_SLASH, nil], - [:T_IDENT, 'foo'], - [:T_SLASH, nil], - [:T_STAR, nil], - [:T_RBRACK, nil] - ] - end - - # The following are a bunch of examples taken from Wikipedia and the W3 spec - # to see how the lexer handles them. - - example 'lex an descendant-or-self expression' do - lex_xpath('/wikimedia//editions').should == [ - [:T_SLASH, nil], - [:T_IDENT, 'wikimedia'], - [:T_SLASH, nil], - [:T_AXIS, 'descendant-or-self'], - [:T_IDENT, 'editions'] - ] - end - - example 'lex a complex expression using predicates and function calls' do - path = '/wikimedia/projects/project[@name="Wikipedia"]/editions/edition/text()' - - lex_xpath(path).should == [ - [:T_SLASH, nil], - [:T_IDENT, 'wikimedia'], - [:T_SLASH, nil], - [:T_IDENT, 'projects'], - [:T_SLASH, nil], - [:T_IDENT, 'project'], - [:T_LBRACK, nil], - [:T_AXIS, 'attribute'], - [:T_IDENT, 'name'], - [:T_OP, '='], - [:T_STRING, 'Wikipedia'], - [:T_RBRACK, nil], - [:T_SLASH, nil], - [:T_IDENT, 'editions'], - [:T_SLASH, nil], - [:T_IDENT, 'edition'], - [:T_SLASH, nil], - [:T_IDENT, 'text'], - [:T_LPAREN, nil], - [:T_RPAREN, nil] - ] - end -end