diff options
| author | Jari Vetoniemi <jari.vetoniemi@indooratlas.com> | 2020-03-16 18:49:26 +0900 | 
|---|---|---|
| committer | Jari Vetoniemi <jari.vetoniemi@indooratlas.com> | 2020-03-30 00:39:06 +0900 | 
| commit | fcbf63e62c627deae76c1b8cb8c0876c536ed811 (patch) | |
| tree | 64cb17de3f41a2b6fef2368028fbd00349946994 /jni/ruby/lib/rexml/parsers | |
Fresh start
Diffstat (limited to 'jni/ruby/lib/rexml/parsers')
| -rw-r--r-- | jni/ruby/lib/rexml/parsers/baseparser.rb | 532 | ||||
| -rw-r--r-- | jni/ruby/lib/rexml/parsers/lightparser.rb | 58 | ||||
| -rw-r--r-- | jni/ruby/lib/rexml/parsers/pullparser.rb | 196 | ||||
| -rw-r--r-- | jni/ruby/lib/rexml/parsers/sax2parser.rb | 272 | ||||
| -rw-r--r-- | jni/ruby/lib/rexml/parsers/streamparser.rb | 52 | ||||
| -rw-r--r-- | jni/ruby/lib/rexml/parsers/treeparser.rb | 100 | ||||
| -rw-r--r-- | jni/ruby/lib/rexml/parsers/ultralightparser.rb | 56 | ||||
| -rw-r--r-- | jni/ruby/lib/rexml/parsers/xpathparser.rb | 656 | 
8 files changed, 1922 insertions, 0 deletions
| diff --git a/jni/ruby/lib/rexml/parsers/baseparser.rb b/jni/ruby/lib/rexml/parsers/baseparser.rb new file mode 100644 index 0000000..6a08b86 --- /dev/null +++ b/jni/ruby/lib/rexml/parsers/baseparser.rb @@ -0,0 +1,532 @@ +require 'rexml/parseexception' +require 'rexml/undefinednamespaceexception' +require 'rexml/source' +require 'set' + +module REXML +  module Parsers +    # = Using the Pull Parser +    # <em>This API is experimental, and subject to change.</em> +    #  parser = PullParser.new( "<a>text<b att='val'/>txet</a>" ) +    #  while parser.has_next? +    #    res = parser.next +    #    puts res[1]['att'] if res.start_tag? and res[0] == 'b' +    #  end +    # See the PullEvent class for information on the content of the results. +    # The data is identical to the arguments passed for the various events to +    # the StreamListener API. +    # +    # Notice that: +    #  parser = PullParser.new( "<a>BAD DOCUMENT" ) +    #  while parser.has_next? +    #    res = parser.next +    #    raise res[1] if res.error? +    #  end +    # +    # Nat Price gave me some good ideas for the API. 
+    class BaseParser +      LETTER = '[:alpha:]' +      DIGIT = '[:digit:]' + +      COMBININGCHAR = '' # TODO +      EXTENDER = ''      # TODO + +      NCNAME_STR= "[#{LETTER}_:][-[:alnum:]._:#{COMBININGCHAR}#{EXTENDER}]*" +      NAME_STR= "(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})" +      UNAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}" + +      NAMECHAR = '[\-\w\.:]' +      NAME = "([\\w:]#{NAMECHAR}*)" +      NMTOKEN = "(?:#{NAMECHAR})+" +      NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*" +      REFERENCE = "&(?:#{NAME};|#\\d+;|#x[0-9a-fA-F]+;)" +      REFERENCE_RE = /#{REFERENCE}/ + +      DOCTYPE_START = /\A\s*<!DOCTYPE\s/um +      DOCTYPE_END = /\A\s*\]\s*>/um +      DOCTYPE_PATTERN = /\s*<!DOCTYPE\s+(.*?)(\[|>)/um +      ATTRIBUTE_PATTERN = /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\4/um +      COMMENT_START = /\A<!--/u +      COMMENT_PATTERN = /<!--(.*?)-->/um +      CDATA_START = /\A<!\[CDATA\[/u +      CDATA_END = /\A\s*\]\s*>/um +      CDATA_PATTERN = /<!\[CDATA\[(.*?)\]\]>/um +      XMLDECL_START = /\A<\?xml\s/u; +      XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um +      INSTRUCTION_START = /\A<\?/u +      INSTRUCTION_PATTERN = /<\?(.*?)(\s+.*?)?\?>/um +      TAG_MATCH = /^<((?>#{NAME_STR}))\s*((?>\s+#{UNAME_STR}\s*=\s*(["']).*?\5)*)\s*(\/)?>/um +      CLOSE_MATCH = /^\s*<\/(#{NAME_STR})\s*>/um + +      VERSION = /\bversion\s*=\s*["'](.*?)['"]/um +      ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um +      STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um + +      ENTITY_START = /\A\s*<!ENTITY/ +      IDENTITY = /^([!\*\w\-]+)(\s+#{NCNAME_STR})?(\s+["'](.*?)['"])?(\s+['"](.*?)["'])?/u +      ELEMENTDECL_START = /\A\s*<!ELEMENT/um +      ELEMENTDECL_PATTERN = /\A\s*(<!ELEMENT.*?)>/um +      SYSTEMENTITY = /\A\s*(%.*?;)\s*$/um +      ENUMERATION = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)" +      NOTATIONTYPE = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)" +      ENUMERATEDTYPE = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))" +      ATTTYPE = 
"(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})" +      ATTVALUE = "(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')" +      DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))" +      ATTDEF = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}" +      ATTDEF_RE = /#{ATTDEF}/ +      ATTLISTDECL_START = /\A\s*<!ATTLIST/um +      ATTLISTDECL_PATTERN = /\A\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um +      NOTATIONDECL_START = /\A\s*<!NOTATION/um +      PUBLIC = /\A\s*<!NOTATION\s+(\w[\-\w]*)\s+(PUBLIC)\s+(["'])(.*?)\3(?:\s+(["'])(.*?)\5)?\s*>/um +      SYSTEM = /\A\s*<!NOTATION\s+(\w[\-\w]*)\s+(SYSTEM)\s+(["'])(.*?)\3\s*>/um + +      TEXT_PATTERN = /\A([^<]*)/um + +      # Entity constants +      PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#" +      SYSTEMLITERAL = %Q{((?:"[^"]*")|(?:'[^']*'))} +      PUBIDLITERAL = %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')} +      EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))" +      NDATADECL = "\\s+NDATA\\s+#{NAME}" +      PEREFERENCE = "%#{NAME};" +      ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))} +      PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})" +      ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))" +      PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>" +      GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>" +      ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um + +      EREFERENCE = /&(?!#{NAME};)/ + +      DEFAULT_ENTITIES = { +        'gt' => [/&gt;/, '&gt;', '>', />/], +        'lt' => [/&lt;/, '&lt;', '<', /</], +        'quot' => [/&quot;/, '&quot;', '"', /"/], +        "apos" => [/&apos;/, "&apos;", "'", /'/] +      } + + +      ###################################################################### +      # These are patterns to identify common markup errors, to make the +      # error messages more informative. 
+      ###################################################################### +      MISSING_ATTRIBUTE_QUOTES = /^<#{NAME_STR}\s+#{NAME_STR}\s*=\s*[^"']/um + +      def initialize( source ) +        self.stream = source +        @listeners = [] +      end + +      def add_listener( listener ) +        @listeners << listener +      end + +      attr_reader :source + +      def stream=( source ) +        @source = SourceFactory.create_from( source ) +        @closed = nil +        @document_status = nil +        @tags = [] +        @stack = [] +        @entities = [] +        @nsstack = [] +      end + +      def position +        if @source.respond_to? :position +          @source.position +        else +          # FIXME +          0 +        end +      end + +      # Returns true if there are no more events +      def empty? +        return (@source.empty? and @stack.empty?) +      end + +      # Returns true if there are more events.  Synonymous with !empty? +      def has_next? +        return !(@source.empty? and @stack.empty?) +      end + +      # Push an event back on the head of the stream.  This method +      # has (theoretically) infinite depth. +      def unshift token +        @stack.unshift(token) +      end + +      # Peek at the +depth+ event in the stack.  The first element on the stack +      # is at depth 0.  If +depth+ is -1, will parse to the end of the input +      # stream and return the last event, which is always :end_document. +      # Be aware that this causes the stream to be parsed up to the +depth+ +      # event, so you can effectively pre-parse the entire document (pull the +      # entire thing into memory) using this method. +      def peek depth=0 +        raise %Q[Illegal argument "#{depth}"] if depth < -1 +        temp = [] +        if depth == -1 +          temp.push(pull()) until empty? 
+        else +          while @stack.size+temp.size < depth+1 +            temp.push(pull()) +          end +        end +        @stack += temp if temp.size > 0 +        @stack[depth] +      end + +      # Returns the next event.  This is a +PullEvent+ object. +      def pull +        pull_event.tap do |event| +          @listeners.each do |listener| +            listener.receive event +          end +        end +      end + +      def pull_event +        if @closed +          x, @closed = @closed, nil +          return [ :end_element, x ] +        end +        return [ :end_document ] if empty? +        return @stack.shift if @stack.size > 0 +        #STDERR.puts @source.encoding +        @source.read if @source.buffer.size<2 +        #STDERR.puts "BUFFER = #{@source.buffer.inspect}" +        if @document_status == nil +          #@source.consume( /^\s*/um ) +          word = @source.match( /^((?:\s+)|(?:<[^>]*>))/um ) +          word = word[1] unless word.nil? +          #STDERR.puts "WORD = #{word.inspect}" +          case word +          when COMMENT_START +            return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ] +          when XMLDECL_START +            #STDERR.puts "XMLDECL" +            results = @source.match( XMLDECL_PATTERN, true )[1] +            version = VERSION.match( results ) +            version = version[1] unless version.nil? +            encoding = ENCODING.match(results) +            encoding = encoding[1] unless encoding.nil? +            if need_source_encoding_update?(encoding) +              @source.encoding = encoding +            end +            if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding +              encoding = "UTF-16" +            end +            standalone = STANDALONE.match(results) +            standalone = standalone[1] unless standalone.nil? 
+            return [ :xmldecl, version, encoding, standalone ] +          when INSTRUCTION_START +            return [ :processing_instruction, *@source.match(INSTRUCTION_PATTERN, true)[1,2] ] +          when DOCTYPE_START +            md = @source.match( DOCTYPE_PATTERN, true ) +            @nsstack.unshift(curr_ns=Set.new) +            identity = md[1] +            close = md[2] +            identity =~ IDENTITY +            name = $1 +            raise REXML::ParseException.new("DOCTYPE is missing a name") if name.nil? +            pub_sys = $2.nil? ? nil : $2.strip +            long_name = $4.nil? ? nil : $4.strip +            uri = $6.nil? ? nil : $6.strip +            args = [ :start_doctype, name, pub_sys, long_name, uri ] +            if close == ">" +              @document_status = :after_doctype +              @source.read if @source.buffer.size<2 +              md = @source.match(/^\s*/um, true) +              @stack << [ :end_doctype ] +            else +              @document_status = :in_doctype +            end +            return args +          when /^\s+/ +          else +            @document_status = :after_doctype +            @source.read if @source.buffer.size<2 +            md = @source.match(/\s*/um, true) +            if @source.encoding == "UTF-8" +              @source.buffer.force_encoding(::Encoding::UTF_8) +            end +          end +        end +        if @document_status == :in_doctype +          md = @source.match(/\s*(.*?>)/um) +          case md[1] +          when SYSTEMENTITY +            match = @source.match( SYSTEMENTITY, true )[1] +            return [ :externalentity, match ] + +          when ELEMENTDECL_START +            return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ] + +          when ENTITY_START +            match = @source.match( ENTITYDECL, true ).to_a.compact +            match[0] = :entitydecl +            ref = false +            if match[1] == '%' +              ref = true +       
       match.delete_at 1 +            end +            # Now we have to sort out what kind of entity reference this is +            if match[2] == 'SYSTEM' +              # External reference +              match[3] = match[3][1..-2] # PUBID +              match.delete_at(4) if match.size > 4 # Chop out NDATA decl +              # match is [ :entity, name, SYSTEM, pubid(, ndata)? ] +            elsif match[2] == 'PUBLIC' +              # External reference +              match[3] = match[3][1..-2] # PUBID +              match[4] = match[4][1..-2] # HREF +              match.delete_at(5) if match.size > 5 # Chop out NDATA decl +              # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ] +            else +              match[2] = match[2][1..-2] +              match.pop if match.size == 4 +              # match is [ :entity, name, value ] +            end +            match << '%' if ref +            return match +          when ATTLISTDECL_START +            md = @source.match( ATTLISTDECL_PATTERN, true ) +            raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil? +            element = md[1] +            contents = md[0] + +            pairs = {} +            values = md[0].scan( ATTDEF_RE ) +            values.each do |attdef| +              unless attdef[3] == "#IMPLIED" +                attdef.compact! 
+                val = attdef[3] +                val = attdef[4] if val == "#FIXED " +                pairs[attdef[0]] = val +                if attdef[0] =~ /^xmlns:(.*)/ +                  @nsstack[0] << $1 +                end +              end +            end +            return [ :attlistdecl, element, pairs, contents ] +          when NOTATIONDECL_START +            md = nil +            if @source.match( PUBLIC ) +              md = @source.match( PUBLIC, true ) +              vals = [md[1],md[2],md[4],md[6]] +            elsif @source.match( SYSTEM ) +              md = @source.match( SYSTEM, true ) +              vals = [md[1],md[2],nil,md[4]] +            else +              raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source ) +            end +            return [ :notationdecl, *vals ] +          when DOCTYPE_END +            @document_status = :after_doctype +            @source.match( DOCTYPE_END, true ) +            return [ :end_doctype ] +          end +        end +        begin +          if @source.buffer[0] == ?< +            if @source.buffer[1] == ?/ +              @nsstack.shift +              last_tag = @tags.pop +              #md = @source.match_to_consume( '>', CLOSE_MATCH) +              md = @source.match( CLOSE_MATCH, true ) +              raise REXML::ParseException.new( "Missing end tag for "+ +                "'#{last_tag}' (got \"#{md[1]}\")", +                @source) unless last_tag == md[1] +              return [ :end_element, last_tag ] +            elsif @source.buffer[1] == ?! 
+              md = @source.match(/\A(\s*[^>]*>)/um) +              #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}" +              raise REXML::ParseException.new("Malformed node", @source) unless md +              if md[0][2] == ?- +                md = @source.match( COMMENT_PATTERN, true ) + +                case md[1] +                when /--/, /-\z/ +                  raise REXML::ParseException.new("Malformed comment", @source) +                end + +                return [ :comment, md[1] ] if md +              else +                md = @source.match( CDATA_PATTERN, true ) +                return [ :cdata, md[1] ] if md +              end +              raise REXML::ParseException.new( "Declarations can only occur "+ +                "in the doctype declaration.", @source) +            elsif @source.buffer[1] == ?? +              md = @source.match( INSTRUCTION_PATTERN, true ) +              return [ :processing_instruction, md[1], md[2] ] if md +              raise REXML::ParseException.new( "Bad instruction declaration", +                @source) +            else +              # Get the next tag +              md = @source.match(TAG_MATCH, true) +              unless md +                # Check for missing attribute quotes +                raise REXML::ParseException.new("missing attribute quote", @source) if @source.match(MISSING_ATTRIBUTE_QUOTES ) +                raise REXML::ParseException.new("malformed XML: missing tag start", @source) +              end +              attributes = {} +              prefixes = Set.new +              prefixes << md[2] if md[2] +              @nsstack.unshift(curr_ns=Set.new) +              if md[4].size > 0 +                attrs = md[4].scan( ATTRIBUTE_PATTERN ) +                raise REXML::ParseException.new( "error parsing attributes: [#{attrs.join ', '}], excess = \"#$'\"", @source) if $' and $'.strip.size > 0 +                attrs.each do |attr_name, prefix, local_part, quote, 
value| +                  if prefix == "xmlns" +                    if local_part == "xml" +                      if value != "http://www.w3.org/XML/1998/namespace" +                        msg = "The 'xml' prefix must not be bound to any other namespace "+ +                        "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" +                        raise REXML::ParseException.new( msg, @source, self ) +                      end +                    elsif local_part == "xmlns" +                      msg = "The 'xmlns' prefix must not be declared "+ +                      "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" +                      raise REXML::ParseException.new( msg, @source, self) +                    end +                    curr_ns << local_part +                  elsif prefix +                    prefixes << prefix unless prefix == "xml" +                  end + +                  if attributes.has_key?(attr_name) +                    msg = "Duplicate attribute #{attr_name.inspect}" +                    raise REXML::ParseException.new(msg, @source, self) +                  end + +                  attributes[attr_name] = value +                end +              end + +              # Verify that all of the prefixes have been defined +              for prefix in prefixes +                unless @nsstack.find{|k| k.member?(prefix)} +                  raise UndefinedNamespaceException.new(prefix,@source,self) +                end +              end + +              if md[6] +                @closed = md[1] +                @nsstack.shift +              else +                @tags.push( md[1] ) +              end +              return [ :start_element, md[1], attributes ] +            end +          else +            md = @source.match( TEXT_PATTERN, true ) +            if md[0].length == 0 +              @source.match( /(\s+)/, true ) +            end +            #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0 +            #return [ 
:text, "" ] if md[0].length == 0 +            # unnormalized = Text::unnormalize( md[1], self ) +            # return PullEvent.new( :text, md[1], unnormalized ) +            return [ :text, md[1] ] +          end +        rescue REXML::UndefinedNamespaceException +          raise +        rescue REXML::ParseException +          raise +        rescue Exception, NameError => error +          raise REXML::ParseException.new( "Exception parsing", +            @source, self, (error ? error : $!) ) +        end +        return [ :dummy ] +      end +      private :pull_event + +      def entity( reference, entities ) +        value = nil +        value = entities[ reference ] if entities +        if not value +          value = DEFAULT_ENTITIES[ reference ] +          value = value[2] if value +        end +        unnormalize( value, entities ) if value +      end + +      # Escapes all possible entities +      def normalize( input, entities=nil, entity_filter=nil ) +        copy = input.clone +        # Doing it like this rather than in a loop improves the speed +        copy.gsub!( EREFERENCE, '&amp;' ) +        entities.each do |key, value| +          copy.gsub!( value, "&#{key};" ) unless entity_filter and +                                      entity_filter.include?(entity) +        end if entities +        copy.gsub!( EREFERENCE, '&amp;' ) +        DEFAULT_ENTITIES.each do |key, value| +          copy.gsub!( value[3], value[1] ) +        end +        copy +      end + +      # Unescapes all possible entities +      def unnormalize( string, entities=nil, filter=nil ) +        rv = string.clone +        rv.gsub!( /\r\n?/, "\n" ) +        matches = rv.scan( REFERENCE_RE ) +        return rv if matches.size == 0 +        rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) { +          m=$1 +          m = "0#{m}" if m[0] == ?x +          [Integer(m)].pack('U*') +        } +        matches.collect!{|x|x[0]}.compact! 
+        if matches.size > 0 +          matches.each do |entity_reference| +            unless filter and filter.include?(entity_reference) +              entity_value = entity( entity_reference, entities ) +              if entity_value +                re = /&#{entity_reference};/ +                rv.gsub!( re, entity_value ) +              else +                er = DEFAULT_ENTITIES[entity_reference] +                rv.gsub!( er[0], er[2] ) if er +              end +            end +          end +          rv.gsub!( /&amp;/, '&' ) +        end +        rv +      end + +      private +      def need_source_encoding_update?(xml_declaration_encoding) +        return false if xml_declaration_encoding.nil? +        return false if /\AUTF-16\z/i =~ xml_declaration_encoding +        true +      end +    end +  end +end + +=begin +  case event[0] +  when :start_element +  when :text +  when :end_element +  when :processing_instruction +  when :cdata +  when :comment +  when :xmldecl +  when :start_doctype +  when :end_doctype +  when :externalentity +  when :elementdecl +  when :entity +  when :attlistdecl +  when :notationdecl +  when :end_doctype +  end +=end diff --git a/jni/ruby/lib/rexml/parsers/lightparser.rb b/jni/ruby/lib/rexml/parsers/lightparser.rb new file mode 100644 index 0000000..8104168 --- /dev/null +++ b/jni/ruby/lib/rexml/parsers/lightparser.rb @@ -0,0 +1,58 @@ +require 'rexml/parsers/streamparser' +require 'rexml/parsers/baseparser' +require 'rexml/light/node' + +module REXML +  module Parsers +    class LightParser +      def initialize stream +        @stream = stream +        @parser = REXML::Parsers::BaseParser.new( stream ) +      end + +      def add_listener( listener ) +        @parser.add_listener( listener ) +      end + +      def rewind +        @stream.rewind +        @parser.stream = @stream +      end + +      def parse +        root = context = [ :document ] +        while true +          event = @parser.pull +          case event[0] +   
       when :end_document +            break +          when :start_element, :start_doctype +            new_node = event +            context << new_node +            new_node[1,0] = [context] +            context = new_node +          when :end_element, :end_doctype +            context = context[1] +          else +            new_node = event +            context << new_node +            new_node[1,0] = [context] +          end +        end +        root +      end +    end + +    # An element is an array.  The array contains: +    #  0                        The parent element +    #  1                        The tag name +    #  2                        A hash of attributes +    #  3..-1    The child elements +    # An element is an array of size > 3 +    # Text is a String +    # PIs are [ :processing_instruction, target, data ] +    # Comments are [ :comment, data ] +    # DocTypes are DocType structs +    # The root is an array with XMLDecls, Text, DocType, Array, Text +  end +end diff --git a/jni/ruby/lib/rexml/parsers/pullparser.rb b/jni/ruby/lib/rexml/parsers/pullparser.rb new file mode 100644 index 0000000..68a4ff7 --- /dev/null +++ b/jni/ruby/lib/rexml/parsers/pullparser.rb @@ -0,0 +1,196 @@ +require 'forwardable' + +require 'rexml/parseexception' +require 'rexml/parsers/baseparser' +require 'rexml/xmltokens' + +module REXML +  module Parsers +    # = Using the Pull Parser +    # <em>This API is experimental, and subject to change.</em> +    #  parser = PullParser.new( "<a>text<b att='val'/>txet</a>" ) +    #  while parser.has_next? +    #    res = parser.next +    #    puts res[1]['att'] if res.start_tag? and res[0] == 'b' +    #  end +    # See the PullEvent class for information on the content of the results. +    # The data is identical to the arguments passed for the various events to +    # the StreamListener API. +    # +    # Notice that: +    #  parser = PullParser.new( "<a>BAD DOCUMENT" ) +    #  while parser.has_next? 
+    #    res = parser.next +    #    raise res[1] if res.error? +    #  end +    # +    # Nat Price gave me some good ideas for the API. +    class PullParser +      include XMLTokens +      extend Forwardable + +      def_delegators( :@parser, :has_next? ) +      def_delegators( :@parser, :entity ) +      def_delegators( :@parser, :empty? ) +      def_delegators( :@parser, :source ) + +      def initialize stream +        @entities = {} +        @listeners = nil +        @parser = BaseParser.new( stream ) +        @my_stack = [] +      end + +      def add_listener( listener ) +        @listeners = [] unless @listeners +        @listeners << listener +      end + +      def each +        while has_next? +          yield self.pull +        end +      end + +      def peek depth=0 +        if @my_stack.length <= depth +          (depth - @my_stack.length + 1).times { +            e = PullEvent.new(@parser.pull) +            @my_stack.push(e) +          } +        end +        @my_stack[depth] +      end + +      def pull +        return @my_stack.shift if @my_stack.length > 0 + +        event = @parser.pull +        case event[0] +        when :entitydecl +          @entities[ event[1] ] = +            event[2] unless event[2] =~ /PUBLIC|SYSTEM/ +        when :text +          unnormalized = @parser.unnormalize( event[1], @entities ) +          event << unnormalized +        end +        PullEvent.new( event ) +      end + +      def unshift token +        @my_stack.unshift token +      end +    end + +    # A parsing event.  The contents of the event are accessed as an +Array?, +    # and the type is given either by the ...? methods, or by accessing the +    # +type+ accessor.  The contents of this object vary from event to event, +    # but are identical to the arguments passed to +StreamListener+s for each +    # event. +    class PullEvent +      # The type of this event.  
Will be one of :tag_start, :tag_end, :text, +      # :processing_instruction, :comment, :doctype, :attlistdecl, :entitydecl, +      # :notationdecl, :entity, :cdata, :xmldecl, or :error. +      def initialize(arg) +        @contents = arg +      end + +      def []( start, endd=nil) +        if start.kind_of? Range +          @contents.slice( start.begin+1 .. start.end ) +        elsif start.kind_of? Numeric +          if endd.nil? +            @contents.slice( start+1 ) +          else +            @contents.slice( start+1, endd ) +          end +        else +          raise "Illegal argument #{start.inspect} (#{start.class})" +        end +      end + +      def event_type +        @contents[0] +      end + +      # Content: [ String tag_name, Hash attributes ] +      def start_element? +        @contents[0] == :start_element +      end + +      # Content: [ String tag_name ] +      def end_element? +        @contents[0] == :end_element +      end + +      # Content: [ String raw_text, String unnormalized_text ] +      def text? +        @contents[0] == :text +      end + +      # Content: [ String text ] +      def instruction? +        @contents[0] == :processing_instruction +      end + +      # Content: [ String text ] +      def comment? +        @contents[0] == :comment +      end + +      # Content: [ String name, String pub_sys, String long_name, String uri ] +      def doctype? +        @contents[0] == :start_doctype +      end + +      # Content: [ String text ] +      def attlistdecl? +        @contents[0] == :attlistdecl +      end + +      # Content: [ String text ] +      def elementdecl? +        @contents[0] == :elementdecl +      end + +      # Due to the wonders of DTDs, an entity declaration can be just about +      # anything.  There's no way to normalize it; you'll have to interpret the +      # content yourself.  
However, the following is true: +      # +      # * If the entity declaration is an internal entity: +      #   [ String name, String value ] +      # Content: [ String text ] +      def entitydecl? +        @contents[0] == :entitydecl +      end + +      # Content: [ String text ] +      def notationdecl? +        @contents[0] == :notationdecl +      end + +      # Content: [ String text ] +      def entity? +        @contents[0] == :entity +      end + +      # Content: [ String text ] +      def cdata? +        @contents[0] == :cdata +      end + +      # Content: [ String version, String encoding, String standalone ] +      def xmldecl? +        @contents[0] == :xmldecl +      end + +      def error? +        @contents[0] == :error +      end + +      def inspect +        @contents[0].to_s + ": " + @contents[1..-1].inspect +      end +    end +  end +end diff --git a/jni/ruby/lib/rexml/parsers/sax2parser.rb b/jni/ruby/lib/rexml/parsers/sax2parser.rb new file mode 100644 index 0000000..a72c0a7 --- /dev/null +++ b/jni/ruby/lib/rexml/parsers/sax2parser.rb @@ -0,0 +1,272 @@ +require 'rexml/parsers/baseparser' +require 'rexml/parseexception' +require 'rexml/namespace' +require 'rexml/text' + +module REXML +  module Parsers +    # SAX2Parser +    class SAX2Parser +      def initialize source +        @parser = BaseParser.new(source) +        @listeners = [] +        @procs = [] +        @namespace_stack = [] +        @has_listeners = false +        @tag_stack = [] +        @entities = {} +      end + +      def source +        @parser.source +      end + +      def add_listener( listener ) +        @parser.add_listener( listener ) +      end + +      # Listen arguments: +      # +      # Symbol, Array, Block +      #         Listen to Symbol events on Array elements +      # Symbol, Block +      #   Listen to Symbol events +      # Array, Listener +      #         Listen to all events on Array elements +      # Array, Block +      #         Listen to :start_element 
events on Array elements +      # Listener +      #         Listen to All events +      # +      # Symbol can be one of: :start_element, :end_element, +      # :start_prefix_mapping, :end_prefix_mapping, :characters, +      # :processing_instruction, :doctype, :attlistdecl, :elementdecl, +      # :entitydecl, :notationdecl, :cdata, :xmldecl, :comment +      # +      # There is an additional symbol that can be listened for: :progress. +      # This will be called for every event generated, passing in the current +      # stream position. +      # +      # Array contains regular expressions or strings which will be matched +      # against fully qualified element names. +      # +      # Listener must implement the methods in SAX2Listener +      # +      # Block will be passed the same arguments as a SAX2Listener method would +      # be, where the method name is the same as the matched Symbol. +      # See the SAX2Listener for more information. +      def listen( *args, &blok ) +        if args[0].kind_of? Symbol +          if args.size == 2 +            args[1].each { |match| @procs << [args[0], match, blok] } +          else +            add( [args[0], nil, blok] ) +          end +        elsif args[0].kind_of? 
Array +          if args.size == 2 +            args[0].each { |match| add( [nil, match, args[1]] ) } +          else +            args[0].each { |match| add( [ :start_element, match, blok ] ) } +          end +        else +          add([nil, nil, args[0]]) +        end +      end + +      def deafen( listener=nil, &blok ) +        if listener +          @listeners.delete_if {|item| item[-1] == listener } +          @has_listeners = false if @listeners.size == 0 +        else +          @procs.delete_if {|item| item[-1] == blok } +        end +      end + +      def parse +        @procs.each { |sym,match,block| block.call if sym == :start_document } +        @listeners.each { |sym,match,block| +          block.start_document if sym == :start_document or sym.nil? +        } +        context = [] +        while true +          event = @parser.pull +          case event[0] +          when :end_document +            handle( :end_document ) +            break +          when :start_doctype +            handle( :doctype, *event[1..-1]) +          when :end_doctype +            context = context[1] +          when :start_element +            @tag_stack.push(event[1]) +            # find the observers for namespaces +            procs = get_procs( :start_prefix_mapping, event[1] ) +            listeners = get_listeners( :start_prefix_mapping, event[1] ) +            if procs or listeners +              # break out the namespace declarations +              # The attributes live in event[2] +              event[2].each {|n, v| event[2][n] = @parser.normalize(v)} +              nsdecl = event[2].find_all { |n, value| n =~ /^xmlns(:|$)/ } +              nsdecl.collect! 
{ |n, value| [ n[6..-1], value ] } +              @namespace_stack.push({}) +              nsdecl.each do |n,v| +                @namespace_stack[-1][n] = v +                # notify observers of namespaces +                procs.each { |ob| ob.call( n, v ) } if procs +                listeners.each { |ob| ob.start_prefix_mapping(n, v) } if listeners +              end +            end +            event[1] =~ Namespace::NAMESPLIT +            prefix = $1 +            local = $2 +            uri = get_namespace(prefix) +            # find the observers for start_element +            procs = get_procs( :start_element, event[1] ) +            listeners = get_listeners( :start_element, event[1] ) +            # notify observers +            procs.each { |ob| ob.call( uri, local, event[1], event[2] ) } if procs +            listeners.each { |ob| +              ob.start_element( uri, local, event[1], event[2] ) +            } if listeners +          when :end_element +            @tag_stack.pop +            event[1] =~ Namespace::NAMESPLIT +            prefix = $1 +            local = $2 +            uri = get_namespace(prefix) +            # find the observers for start_element +            procs = get_procs( :end_element, event[1] ) +            listeners = get_listeners( :end_element, event[1] ) +            # notify observers +            procs.each { |ob| ob.call( uri, local, event[1] ) } if procs +            listeners.each { |ob| +              ob.end_element( uri, local, event[1] ) +            } if listeners + +            namespace_mapping = @namespace_stack.pop +            # find the observers for namespaces +            procs = get_procs( :end_prefix_mapping, event[1] ) +            listeners = get_listeners( :end_prefix_mapping, event[1] ) +            if procs or listeners +              namespace_mapping.each do |ns_prefix, ns_uri| +                # notify observers of namespaces +                procs.each { |ob| ob.call( ns_prefix ) } if procs +       
         listeners.each { |ob| ob.end_prefix_mapping(ns_prefix) } if listeners +              end +            end +          when :text +            #normalized = @parser.normalize( event[1] ) +            #handle( :characters, normalized ) +            copy = event[1].clone + +            esub = proc { |match| +              if @entities.has_key?($1) +                @entities[$1].gsub(Text::REFERENCE, &esub) +              else +                match +              end +            } + +            copy.gsub!( Text::REFERENCE, &esub ) +            copy.gsub!( Text::NUMERICENTITY ) {|m| +              m=$1 +              m = "0#{m}" if m[0] == ?x +              [Integer(m)].pack('U*') +            } +            handle( :characters, copy ) +          when :entitydecl +            handle_entitydecl( event ) +          when :processing_instruction, :comment, :attlistdecl, +            :elementdecl, :cdata, :notationdecl, :xmldecl +            handle( *event ) +          end +          handle( :progress, @parser.position ) +        end +      end + +      private +      def handle( symbol, *arguments ) +        tag = @tag_stack[-1] +        procs = get_procs( symbol, tag ) +        listeners = get_listeners( symbol, tag ) +        # notify observers +        procs.each { |ob| ob.call( *arguments ) } if procs +        listeners.each { |l| +          l.send( symbol.to_s, *arguments ) +        } if listeners +      end + +      def handle_entitydecl( event ) +        @entities[ event[1] ] = event[2] if event.size == 3 +        parameter_reference_p = false +        case event[2] +        when "SYSTEM" +          if event.size == 5 +            if event.last == "%" +              parameter_reference_p = true +            else +              event[4, 0] = "NDATA" +            end +          end +        when "PUBLIC" +          if event.size == 6 +            if event.last == "%" +              parameter_reference_p = true +            else +              event[5, 0] = 
"NDATA" +            end +          end +        else +          parameter_reference_p = (event.size == 4) +        end +        event[1, 0] = event.pop if parameter_reference_p +        handle( event[0], event[1..-1] ) +      end + +      # The following methods are duplicates, but it is faster than using +      # a helper +      def get_procs( symbol, name ) +        return nil if @procs.size == 0 +        @procs.find_all do |sym, match, block| +          ( +            (sym.nil? or symbol == sym) and +            ((name.nil? and match.nil?) or match.nil? or ( +              (name == match) or +              (match.kind_of? Regexp and name =~ match) +              ) +            ) +          ) +        end.collect{|x| x[-1]} +      end +      def get_listeners( symbol, name ) +        return nil if @listeners.size == 0 +        @listeners.find_all do |sym, match, block| +          ( +            (sym.nil? or symbol == sym) and +            ((name.nil? and match.nil?) or match.nil? or ( +              (name == match) or +              (match.kind_of? Regexp and name =~ match) +              ) +            ) +          ) +        end.collect{|x| x[-1]} +      end + +      def add( pair ) +        if pair[-1].respond_to? :call +          @procs << pair unless @procs.include? pair +        else +          @listeners << pair unless @listeners.include? pair +          @has_listeners = true +        end +      end + +      def get_namespace( prefix ) +        uris = (@namespace_stack.find_all { |ns| not ns[prefix].nil? }) || +          (@namespace_stack.find { |ns| not ns[nil].nil? }) +        uris[-1][prefix] unless uris.nil? 
or 0 == uris.size +      end +    end +  end +end diff --git a/jni/ruby/lib/rexml/parsers/streamparser.rb b/jni/ruby/lib/rexml/parsers/streamparser.rb new file mode 100644 index 0000000..9ea65ed --- /dev/null +++ b/jni/ruby/lib/rexml/parsers/streamparser.rb @@ -0,0 +1,52 @@ +require "rexml/parsers/baseparser" + +module REXML +  module Parsers +    class StreamParser +      def initialize source, listener +        @listener = listener +        @parser = BaseParser.new( source ) +      end + +      def add_listener( listener ) +        @parser.add_listener( listener ) +      end + +      def parse +        # entity string +        while true +          event = @parser.pull +          case event[0] +          when :end_document +            return +          when :start_element +            attrs = event[2].each do |n, v| +              event[2][n] = @parser.unnormalize( v ) +            end +            @listener.tag_start( event[1], attrs ) +          when :end_element +            @listener.tag_end( event[1] ) +          when :text +            normalized = @parser.unnormalize( event[1] ) +            @listener.text( normalized ) +          when :processing_instruction +            @listener.instruction( *event[1,2] ) +          when :start_doctype +            @listener.doctype( *event[1..-1] ) +          when :end_doctype +            # FIXME: remove this condition for milestone:3.2 +            @listener.doctype_end if @listener.respond_to? 
:doctype_end +          when :comment, :attlistdecl, :cdata, :xmldecl, :elementdecl +            @listener.send( event[0].to_s, *event[1..-1] ) +          when :entitydecl, :notationdecl +            @listener.send( event[0].to_s, event[1..-1] ) +          when :externalentity +            entity_reference = event[1] +            content = entity_reference.gsub(/\A%|;\z/, "") +            @listener.entity(content) +          end +        end +      end +    end +  end +end diff --git a/jni/ruby/lib/rexml/parsers/treeparser.rb b/jni/ruby/lib/rexml/parsers/treeparser.rb new file mode 100644 index 0000000..68edb77 --- /dev/null +++ b/jni/ruby/lib/rexml/parsers/treeparser.rb @@ -0,0 +1,100 @@ +require 'rexml/validation/validationexception' +require 'rexml/undefinednamespaceexception' + +module REXML +  module Parsers +    class TreeParser +      def initialize( source, build_context = Document.new ) +        @build_context = build_context +        @parser = Parsers::BaseParser.new( source ) +      end + +      def add_listener( listener ) +        @parser.add_listener( listener ) +      end + +      def parse +        tag_stack = [] +        in_doctype = false +        entities = nil +        begin +          while true +            event = @parser.pull +            #STDERR.puts "TREEPARSER GOT #{event.inspect}" +            case event[0] +            when :end_document +              unless tag_stack.empty? 
+                raise ParseException.new("No close tag for #{@build_context.xpath}", +                                         @parser.source, @parser) +              end +              return +            when :start_element +              tag_stack.push(event[1]) +              el = @build_context = @build_context.add_element( event[1] ) +              event[2].each do |key, value| +                el.attributes[key]=Attribute.new(key,value,self) +              end +            when :end_element +              tag_stack.pop +              @build_context = @build_context.parent +            when :text +              if not in_doctype +                if @build_context[-1].instance_of? Text +                  @build_context[-1] << event[1] +                else +                  @build_context.add( +                    Text.new(event[1], @build_context.whitespace, nil, true) +                  ) unless ( +                    @build_context.ignore_whitespace_nodes and +                    event[1].strip.size==0 +                  ) +                end +              end +            when :comment +              c = Comment.new( event[1] ) +              @build_context.add( c ) +            when :cdata +              c = CData.new( event[1] ) +              @build_context.add( c ) +            when :processing_instruction +              @build_context.add( Instruction.new( event[1], event[2] ) ) +            when :end_doctype +              in_doctype = false +              entities.each { |k,v| entities[k] = @build_context.entities[k].value } +              @build_context = @build_context.parent +            when :start_doctype +              doctype = DocType.new( event[1..-1], @build_context ) +              @build_context = doctype +              entities = {} +              in_doctype = true +            when :attlistdecl +              n = AttlistDecl.new( event[1..-1] ) +              @build_context.add( n ) +            when :externalentity +              
n = ExternalEntity.new( event[1] ) +              @build_context.add( n ) +            when :elementdecl +              n = ElementDecl.new( event[1] ) +              @build_context.add(n) +            when :entitydecl +              entities[ event[1] ] = event[2] unless event[2] =~ /PUBLIC|SYSTEM/ +              @build_context.add(Entity.new(event)) +            when :notationdecl +              n = NotationDecl.new( *event[1..-1] ) +              @build_context.add( n ) +            when :xmldecl +              x = XMLDecl.new( event[1], event[2], event[3] ) +              @build_context.add( x ) +            end +          end +        rescue REXML::Validation::ValidationException +          raise +        rescue REXML::ParseException +          raise +        rescue +          raise ParseException.new( $!.message, @parser.source, @parser, $! ) +        end +      end +    end +  end +end diff --git a/jni/ruby/lib/rexml/parsers/ultralightparser.rb b/jni/ruby/lib/rexml/parsers/ultralightparser.rb new file mode 100644 index 0000000..4e2d7a8 --- /dev/null +++ b/jni/ruby/lib/rexml/parsers/ultralightparser.rb @@ -0,0 +1,56 @@ +require 'rexml/parsers/streamparser' +require 'rexml/parsers/baseparser' + +module REXML +  module Parsers +    class UltraLightParser +      def initialize stream +        @stream = stream +        @parser = REXML::Parsers::BaseParser.new( stream ) +      end + +      def add_listener( listener ) +        @parser.add_listener( listener ) +      end + +      def rewind +        @stream.rewind +        @parser.stream = @stream +      end + +      def parse +        root = context = [] +        while true +          event = @parser.pull +          case event[0] +          when :end_document +            break +          when :end_doctype +            context = context[1] +          when :start_element, :start_doctype +            context << event +            event[1,0] = [context] +            context = event +          when :end_element +      
      context = context[1] +          else +            context << event +          end +        end +        root +      end +    end + +    # An element is an array.  The array contains: +    #  0                        The parent element +    #  1                        The tag name +    #  2                        A hash of attributes +    #  3..-1    The child elements +    # An element is an array of size > 3 +    # Text is a String +    # PIs are [ :processing_instruction, target, data ] +    # Comments are [ :comment, data ] +    # DocTypes are DocType structs +    # The root is an array with XMLDecls, Text, DocType, Array, Text +  end +end diff --git a/jni/ruby/lib/rexml/parsers/xpathparser.rb b/jni/ruby/lib/rexml/parsers/xpathparser.rb new file mode 100644 index 0000000..57767fb --- /dev/null +++ b/jni/ruby/lib/rexml/parsers/xpathparser.rb @@ -0,0 +1,656 @@ +require 'rexml/namespace' +require 'rexml/xmltokens' + +module REXML +  module Parsers +    # You don't want to use this class.  Really.  Use XPath, which is a wrapper +    # for this class.  Believe me.  You don't want to poke around in here. +    # There is strange, dark magic at work in this code.  Beware.  Go back!  Go +    # back while you still can! +    class XPathParser +      include XMLTokens +      LITERAL    = /^'([^']*)'|^"([^"]*)"/u + +      def namespaces=( namespaces ) +        Functions::namespace_context = namespaces +        @namespaces = namespaces +      end + +      def parse path +        path = path.dup +        path.gsub!(/([\(\[])\s+/, '\1') # Strip ignorable spaces +        path.gsub!( /\s+([\]\)])/, '\1') +        parsed = [] +        OrExpr(path, parsed) +        parsed +      end + +      def predicate path +        parsed = [] +        Predicate( "[#{path}]", parsed ) +        parsed +      end + +      def abbreviate( path ) +        path = path.kind_of?(String) ? 
parse( path ) : path +        string = "" +        document = false +        while path.size > 0 +          op = path.shift +          case op +          when :node +          when :attribute +            string << "/" if string.size > 0 +            string << "@" +          when :child +            string << "/" if string.size > 0 +          when :descendant_or_self +            string << "/" +          when :self +            string << "." +          when :parent +            string << ".." +          when :any +            string << "*" +          when :text +            string << "text()" +          when :following, :following_sibling, +                :ancestor, :ancestor_or_self, :descendant, +                :namespace, :preceding, :preceding_sibling +            string << "/" unless string.size == 0 +            string << op.to_s.tr("_", "-") +            string << "::" +          when :qname +            prefix = path.shift +            name = path.shift +            string << prefix+":" if prefix.size > 0 +            string << name +          when :predicate +            string << '[' +            string << predicate_to_string( path.shift ) {|x| abbreviate( x ) } +            string << ']' +          when :document +            document = true +          when :function +            string << path.shift +            string << "( " +            string << predicate_to_string( path.shift[0] ) {|x| abbreviate( x )} +            string << " )" +          when :literal +            string << %Q{ "#{path.shift}" } +          else +            string << "/" unless string.size == 0 +            string << "UNKNOWN(" +            string << op.inspect +            string << ")" +          end +        end +        string = "/"+string if document +        return string +      end + +      def expand( path ) +        path = path.kind_of?(String) ? 
parse( path ) : path +        string = "" +        document = false +        while path.size > 0 +          op = path.shift +          case op +          when :node +            string << "node()" +          when :attribute, :child, :following, :following_sibling, +                :ancestor, :ancestor_or_self, :descendant, :descendant_or_self, +                :namespace, :preceding, :preceding_sibling, :self, :parent +            string << "/" unless string.size == 0 +            string << op.to_s.tr("_", "-") +            string << "::" +          when :any +            string << "*" +          when :qname +            prefix = path.shift +            name = path.shift +            string << prefix+":" if prefix.size > 0 +            string << name +          when :predicate +            string << '[' +            string << predicate_to_string( path.shift ) { |x| expand(x) } +            string << ']' +          when :document +            document = true +          else +            string << "/" unless string.size == 0 +            string << "UNKNOWN(" +            string << op.inspect +            string << ")" +          end +        end +        string = "/"+string if document +        return string +      end + +      def predicate_to_string( path, &block ) +        string = "" +        case path[0] +        when :and, :or, :mult, :plus, :minus, :neq, :eq, :lt, :gt, :lteq, :gteq, :div, :mod, :union +          op = path.shift +          case op +          when :eq +            op = "=" +          when :lt +            op = "<" +          when :gt +            op = ">" +          when :lteq +            op = "<=" +          when :gteq +            op = ">=" +          when :neq +            op = "!=" +          when :union +            op = "|" +          end +          left = predicate_to_string( path.shift, &block ) +          right = predicate_to_string( path.shift, &block ) +          string << " " +          string << left +          string << " " +      
    string << op.to_s +          string << " " +          string << right +          string << " " +        when :function +          path.shift +          name = path.shift +          string << name +          string << "( " +          string << predicate_to_string( path.shift, &block ) +          string << " )" +        when :literal +          path.shift +          string << " " +          string << path.shift.inspect +          string << " " +        else +          string << " " +          string << yield( path ) +          string << " " +        end +        return string.squeeze(" ") +      end + +      private +      #LocationPath +      #  | RelativeLocationPath +      #  | '/' RelativeLocationPath? +      #  | '//' RelativeLocationPath +      def LocationPath path, parsed +        path = path.strip +        if path[0] == ?/ +          parsed << :document +          if path[1] == ?/ +            parsed << :descendant_or_self +            parsed << :node +            path = path[2..-1] +          else +            path = path[1..-1] +          end +        end +        return RelativeLocationPath( path, parsed ) if path.size > 0 +      end + +      #RelativeLocationPath +      #  |                                                    Step +      #    | (AXIS_NAME '::' | '@' | '')                     AxisSpecifier +      #      NodeTest +      #        Predicate +      #    | '.' | '..'                                      AbbreviatedStep +      #  |  RelativeLocationPath '/' Step +      #  | RelativeLocationPath '//' Step +      AXIS = /^(ancestor|ancestor-or-self|attribute|child|descendant|descendant-or-self|following|following-sibling|namespace|parent|preceding|preceding-sibling|self)::/ +      def RelativeLocationPath path, parsed +        while path.size > 0 +          # (axis or @ or <child::>) nodetest predicate  > +          # OR                                          >  / Step +          # (. or ..)                                    
> +          if path[0] == ?. +            if path[1] == ?. +              parsed << :parent +              parsed << :node +              path = path[2..-1] +            else +              parsed << :self +              parsed << :node +              path = path[1..-1] +            end +          else +            if path[0] == ?@ +              parsed << :attribute +              path = path[1..-1] +              # Goto Nodetest +            elsif path =~ AXIS +              parsed << $1.tr('-','_').intern +              path = $' +              # Goto Nodetest +            else +              parsed << :child +            end + +            n = [] +            path = NodeTest( path, n) + +            if path[0] == ?[ +              path = Predicate( path, n ) +            end + +            parsed.concat(n) +          end + +          if path.size > 0 +            if path[0] == ?/ +              if path[1] == ?/ +                parsed << :descendant_or_self +                parsed << :node +                path = path[2..-1] +              else +                path = path[1..-1] +              end +            else +              return path +            end +          end +        end +        return path +      end + +      # Returns a 1-1 map of the nodeset +      # The contents of the resulting array are either: +      #   true/false, if a positive match +      #   String, if a name match +      #NodeTest +      #  | ('*' | NCNAME ':' '*' | QNAME)                NameTest +      #  | NODE_TYPE '(' ')'                              NodeType +      #  | PI '(' LITERAL ')'                            PI +      #    | '[' expr ']'                                Predicate +      NCNAMETEST= /^(#{NCNAME_STR}):\*/u +      QNAME     = Namespace::NAMESPLIT +      NODE_TYPE  = /^(comment|text|node)\(\s*\)/m +      PI        = /^processing-instruction\(/ +      def NodeTest path, parsed +        case path +        when /^\*/ +          path = $' +          parsed << 
:any +        when NODE_TYPE +          type = $1 +          path = $' +          parsed << type.tr('-', '_').intern +        when PI +          path = $' +          literal = nil +          if path !~ /^\s*\)/ +            path =~ LITERAL +            literal = $1 +            path = $' +            raise ParseException.new("Missing ')' after processing instruction") if path[0] != ?) +            path = path[1..-1] +          end +          parsed << :processing_instruction +          parsed << (literal || '') +        when NCNAMETEST +          prefix = $1 +          path = $' +          parsed << :namespace +          parsed << prefix +        when QNAME +          prefix = $1 +          name = $2 +          path = $' +          prefix = "" unless prefix +          parsed << :qname +          parsed << prefix +          parsed << name +        end +        return path +      end + +      # Filters the supplied nodeset on the predicate(s) +      def Predicate path, parsed +        return nil unless path[0] == ?[ +        predicates = [] +        while path[0] == ?[ +          path, expr = get_group(path) +          predicates << expr[1..-2] if expr +        end +        predicates.each{ |pred| +          preds = [] +          parsed << :predicate +          parsed << preds +          OrExpr(pred, preds) +        } +        path +      end + +      # The following return arrays of true/false, a 1-1 mapping of the +      # supplied nodeset, except for axe(), which returns a filtered +      # nodeset + +      #| OrExpr S 'or' S AndExpr +      #| AndExpr +      def OrExpr path, parsed +        n = [] +        rest = AndExpr( path, n ) +        if rest != path +          while rest =~ /^\s*( or )/ +            n = [ :or, n, [] ] +            rest = AndExpr( $', n[-1] ) +          end +        end +        if parsed.size == 0 and n.size != 0 +          parsed.replace(n) +        elsif n.size > 0 +          parsed << n +        end +        rest +      end + +      #| 
AndExpr S 'and' S EqualityExpr +      #| EqualityExpr +      def AndExpr path, parsed +        n = [] +        rest = EqualityExpr( path, n ) +        if rest != path +          while rest =~ /^\s*( and )/ +            n = [ :and, n, [] ] +            rest = EqualityExpr( $', n[-1] ) +          end +        end +        if parsed.size == 0 and n.size != 0 +          parsed.replace(n) +        elsif n.size > 0 +          parsed << n +        end +        rest +      end + +      #| EqualityExpr ('=' | '!=')  RelationalExpr +      #| RelationalExpr +      def EqualityExpr path, parsed +        n = [] +        rest = RelationalExpr( path, n ) +        if rest != path +          while rest =~ /^\s*(!?=)\s*/ +            if $1[0] == ?! +              n = [ :neq, n, [] ] +            else +              n = [ :eq, n, [] ] +            end +            rest = RelationalExpr( $', n[-1] ) +          end +        end +        if parsed.size == 0 and n.size != 0 +          parsed.replace(n) +        elsif n.size > 0 +          parsed << n +        end +        rest +      end + +      #| RelationalExpr ('<' | '>' | '<=' | '>=') AdditiveExpr +      #| AdditiveExpr +      def RelationalExpr path, parsed +        n = [] +        rest = AdditiveExpr( path, n ) +        if rest != path +          while rest =~ /^\s*([<>]=?)\s*/ +            if $1[0] == ?< +              sym = "lt" +            else +              sym = "gt" +            end +            sym << "eq" if $1[-1] == ?= +            n = [ sym.intern, n, [] ] +            rest = AdditiveExpr( $', n[-1] ) +          end +        end +        if parsed.size == 0 and n.size != 0 +          parsed.replace(n) +        elsif n.size > 0 +          parsed << n +        end +        rest +      end + +      #| AdditiveExpr ('+' | S '-') MultiplicativeExpr +      #| MultiplicativeExpr +      def AdditiveExpr path, parsed +        n = [] +        rest = MultiplicativeExpr( path, n ) +        if rest != path +          while rest =~ 
/^\s*(\+| -)\s*/ +            if $1[0] == ?+ +              n = [ :plus, n, [] ] +            else +              n = [ :minus, n, [] ] +            end +            rest = MultiplicativeExpr( $', n[-1] ) +          end +        end +        if parsed.size == 0 and n.size != 0 +          parsed.replace(n) +        elsif n.size > 0 +          parsed << n +        end +        rest +      end + +      #| MultiplicativeExpr ('*' | S ('div' | 'mod') S) UnaryExpr +      #| UnaryExpr +      def MultiplicativeExpr path, parsed +        n = [] +        rest = UnaryExpr( path, n ) +        if rest != path +          while rest =~ /^\s*(\*| div | mod )\s*/ +            if $1[0] == ?* +              n = [ :mult, n, [] ] +            elsif $1.include?( "div" ) +              n = [ :div, n, [] ] +            else +              n = [ :mod, n, [] ] +            end +            rest = UnaryExpr( $', n[-1] ) +          end +        end +        if parsed.size == 0 and n.size != 0 +          parsed.replace(n) +        elsif n.size > 0 +          parsed << n +        end +        rest +      end + +      #| '-' UnaryExpr +      #| UnionExpr +      def UnaryExpr path, parsed +        path =~ /^(\-*)/ +        path = $' +        if $1 and (($1.size % 2) != 0) +          mult = -1 +        else +          mult = 1 +        end +        parsed << :neg if mult < 0 + +        n = [] +        path = UnionExpr( path, n ) +        parsed.concat( n ) +        path +      end + +      #| UnionExpr '|' PathExpr +      #| PathExpr +      def UnionExpr path, parsed +        n = [] +        rest = PathExpr( path, n ) +        if rest != path +          while rest =~ /^\s*(\|)\s*/ +            n = [ :union, n, [] ] +            rest = PathExpr( $', n[-1] ) +          end +        end +        if parsed.size == 0 and n.size != 0 +          parsed.replace( n ) +        elsif n.size > 0 +          parsed << n +        end +        rest +      end + +      #| LocationPath +      #| FilterExpr ('/' | 
'//') RelativeLocationPath +      def PathExpr path, parsed +        path =~ /^\s*/ +        path = $' +        n = [] +        rest = FilterExpr( path, n ) +        if rest != path +          if rest and rest[0] == ?/ +            return RelativeLocationPath(rest, n) +          end +        end +        rest = LocationPath(rest, n) if rest =~ /\A[\/\.\@\[\w*]/ +        parsed.concat(n) +        return rest +      end + +      #| FilterExpr Predicate +      #| PrimaryExpr +      def FilterExpr path, parsed +        n = [] +        path = PrimaryExpr( path, n ) +        path = Predicate(path, n) if path and path[0] == ?[ +        parsed.concat(n) +        path +      end + +      #| VARIABLE_REFERENCE +      #| '(' expr ')' +      #| LITERAL +      #| NUMBER +      #| FunctionCall +      VARIABLE_REFERENCE  = /^\$(#{NAME_STR})/u +      NUMBER              = /^(\d*\.?\d+)/ +      NT        = /^comment|text|processing-instruction|node$/ +      def PrimaryExpr path, parsed +        case path +        when VARIABLE_REFERENCE +          varname = $1 +          path = $' +          parsed << :variable +          parsed << varname +          #arry << @variables[ varname ] +        when /^(\w[-\w]*)(?:\()/ +          fname = $1 +          tmp = $' +          return path if fname =~ NT +          path = tmp +          parsed << :function +          parsed << fname +          path = FunctionCall(path, parsed) +        when NUMBER +          varname = $1.nil? ? $2 : $1 +          path = $' +          parsed << :literal +          parsed << (varname.include?('.') ? varname.to_f : varname.to_i) +        when LITERAL +          varname = $1.nil? ? 
$2 : $1 +          path = $' +          parsed << :literal +          parsed << varname +        when /^\(/                                               #/ +          path, contents = get_group(path) +          contents = contents[1..-2] +          n = [] +          OrExpr( contents, n ) +          parsed.concat(n) +        end +        path +      end + +      #| FUNCTION_NAME '(' ( expr ( ',' expr )* )? ')' +      def FunctionCall rest, parsed +        path, arguments = parse_args(rest) +        argset = [] +        for argument in arguments +          args = [] +          OrExpr( argument, args ) +          argset << args +        end +        parsed << argset +        path +      end + +      # get_group( '[foo]bar' ) -> ['bar', '[foo]'] +      def get_group string +        ind = 0 +        depth = 0 +        st = string[0,1] +        en = (st == "(" ? ")" : "]") +        begin +          case string[ind,1] +          when st +            depth += 1 +          when en +            depth -= 1 +          end +          ind += 1 +        end while depth > 0 and ind < string.length +        return nil unless depth==0 +        [string[ind..-1], string[0..ind-1]] +      end + +      def parse_args( string ) +        arguments = [] +        ind = 0 +        inquot = false +        inapos = false +        depth = 1 +        begin +          case string[ind] +          when ?" +            inquot = !inquot unless inapos +          when ?' +            inapos = !inapos unless inquot +          else +            unless inquot or inapos +              case string[ind] +              when ?( +                depth += 1 +                if depth == 1 +                  string = string[1..-1] +                  ind -= 1 +                end +              when ?) 
+                depth -= 1 +                if depth == 0 +                  s = string[0,ind].strip +                  arguments << s unless s == "" +                  string = string[ind+1..-1] +                end +              when ?, +                if depth == 1 +                  s = string[0,ind].strip +                  arguments << s unless s == "" +                  string = string[ind+1..-1] +                  ind = -1 +                end +              end +            end +          end +          ind += 1 +        end while depth > 0 and ind < string.length +        return nil unless depth==0 +        [string,arguments] +      end +    end +  end +end | 
