From fcbf63e62c627deae76c1b8cb8c0876c536ed811 Mon Sep 17 00:00:00 2001
From: Jari Vetoniemi <jari.vetoniemi@indooratlas.com>
Date: Mon, 16 Mar 2020 18:49:26 +0900
Subject: Fresh start

---
 jni/ruby/lib/rexml/parsers/baseparser.rb       | 532 ++++++++++++++++++++
 jni/ruby/lib/rexml/parsers/lightparser.rb      |  58 +++
 jni/ruby/lib/rexml/parsers/pullparser.rb       | 196 ++++++++
 jni/ruby/lib/rexml/parsers/sax2parser.rb       | 272 ++++++++++
 jni/ruby/lib/rexml/parsers/streamparser.rb     |  52 ++
 jni/ruby/lib/rexml/parsers/treeparser.rb       | 100 ++++
 jni/ruby/lib/rexml/parsers/ultralightparser.rb |  56 +++
 jni/ruby/lib/rexml/parsers/xpathparser.rb      | 656 +++++++++++++++++++++++++
 8 files changed, 1922 insertions(+)
 create mode 100644 jni/ruby/lib/rexml/parsers/baseparser.rb
 create mode 100644 jni/ruby/lib/rexml/parsers/lightparser.rb
 create mode 100644 jni/ruby/lib/rexml/parsers/pullparser.rb
 create mode 100644 jni/ruby/lib/rexml/parsers/sax2parser.rb
 create mode 100644 jni/ruby/lib/rexml/parsers/streamparser.rb
 create mode 100644 jni/ruby/lib/rexml/parsers/treeparser.rb
 create mode 100644 jni/ruby/lib/rexml/parsers/ultralightparser.rb
 create mode 100644 jni/ruby/lib/rexml/parsers/xpathparser.rb

(limited to 'jni/ruby/lib/rexml/parsers')
diff --git a/jni/ruby/lib/rexml/parsers/baseparser.rb b/jni/ruby/lib/rexml/parsers/baseparser.rb
new file mode 100644
index 0000000..6a08b86
--- /dev/null
+++ b/jni/ruby/lib/rexml/parsers/baseparser.rb
@@ -0,0 +1,532 @@
+require 'rexml/parseexception'
+require 'rexml/undefinednamespaceexception'
+require 'rexml/source'
+require 'set'
+
+module REXML
+  module Parsers
+    # = Using the Pull Parser
+    # <em>This API is experimental, and subject to change.</em>
+    #  parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
+    #  while parser.has_next?
+    #    res = parser.next
+    #    puts res[1]['att'] if res.start_tag? and res[0] == 'b'
+    #  end
+    # See the PullEvent class for information on the content of the results.
+    # The data is identical to the arguments passed for the various events to
+    # the StreamListener API.
+    #
+    # Notice that:
+    #  parser = PullParser.new( "<a>BAD DOCUMENT" )
+    #  while parser.has_next?
+    #    res = parser.next
+    #    raise res[1] if res.error?
+    #  end
+    #
+    # Nat Price gave me some good ideas for the API.
+    class BaseParser
+      LETTER = '[:alpha:]'
+      DIGIT = '[:digit:]'
+
+      COMBININGCHAR = '' # TODO
+      EXTENDER = ''      # TODO
+
+      NCNAME_STR= "[#{LETTER}_:][-[:alnum:]._:#{COMBININGCHAR}#{EXTENDER}]*"
+      NAME_STR= "(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})"
+      UNAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}"
+
+      NAMECHAR = '[\-\w\.:]'
+      NAME = "([\\w:]#{NAMECHAR}*)"
+      NMTOKEN = "(?:#{NAMECHAR})+"
+      NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*"
+      REFERENCE = "&(?:#{NAME};|#\\d+;|#x[0-9a-fA-F]+;)"
+      REFERENCE_RE = /#{REFERENCE}/
+
+      DOCTYPE_START = /\A\s*<!DOCTYPE\s/um
+      DOCTYPE_END = /\A\s*\]\s*>/um
+      DOCTYPE_PATTERN = /\s*<!DOCTYPE\s+(.*?)(\[|>)/um
+      ATTRIBUTE_PATTERN = /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\4/um
+      COMMENT_START = /\A<!--/u
+      COMMENT_PATTERN = /<!--(.*?)-->/um
+      CDATA_START = /\A<!\[CDATA\[/u
+      CDATA_END = /\A\s*\]\s*>/um
+      CDATA_PATTERN = /<!\[CDATA\[(.*?)\]\]>/um
+      XMLDECL_START = /\A<\?xml\s/u;
+      XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um
+      INSTRUCTION_START = /\A<\?/u
+      INSTRUCTION_PATTERN = /<\?(.*?)(\s+.*?)?\?>/um
+      TAG_MATCH = /^<((?>#{NAME_STR}))\s*((?>\s+#{UNAME_STR}\s*=\s*(["']).*?\5)*)\s*(\/)?>/um
+      CLOSE_MATCH = /^\s*<\/(#{NAME_STR})\s*>/um
+
+      VERSION = /\bversion\s*=\s*["'](.*?)['"]/um
+      ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um
+      STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um
+
+      ENTITY_START = /\A\s*<!ENTITY/
+      IDENTITY = /^([!\*\w\-]+)(\s+#{NCNAME_STR})?(\s+["'](.*?)['"])?(\s+['"](.*?)["'])?/u
+      ELEMENTDECL_START = /\A\s*<!ELEMENT/um
+      ELEMENTDECL_PATTERN = /\A\s*(<!ELEMENT.*?)>/um
+      SYSTEMENTITY = /\A\s*(%.*?;)\s*$/um
+      ENUMERATION = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)"
+      NOTATIONTYPE = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)"
+      ENUMERATEDTYPE = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))"
+      ATTTYPE = "(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})"
+      ATTVALUE = "(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')"
+      DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))"
+      ATTDEF = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}"
+      ATTDEF_RE = /#{ATTDEF}/
+      ATTLISTDECL_START = /\A\s*<!ATTLIST/um
+      ATTLISTDECL_PATTERN = /\A\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um
+      NOTATIONDECL_START = /\A\s*<!NOTATION/um
+      PUBLIC = /\A\s*<!NOTATION\s+(\w[\-\w]*)\s+(PUBLIC)\s+(["'])(.*?)\3(?:\s+(["'])(.*?)\5)?\s*>/um
+      SYSTEM = /\A\s*<!NOTATION\s+(\w[\-\w]*)\s+(SYSTEM)\s+(["'])(.*?)\3\s*>/um
+
+      TEXT_PATTERN = /\A([^<]*)/um
+
+      # Entity constants
+      PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#"
+      SYSTEMLITERAL = %Q{((?:"[^"]*")|(?:'[^']*'))}
+      PUBIDLITERAL = %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')}
+      EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))"
+      NDATADECL = "\\s+NDATA\\s+#{NAME}"
+      PEREFERENCE = "%#{NAME};"
+      ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))}
+      PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})"
+      ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
+      PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
+      GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
+      ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
+
+      EREFERENCE = /&(?!#{NAME};)/
+
+      DEFAULT_ENTITIES = {
+        'gt' => [/&gt;/, '&gt;', '>', />/],
+        'lt' => [/&lt;/, '&lt;', '<', /</],
+        'quot' => [/&quot;/, '&quot;', '"', /"/],
+        "apos" => [/&apos;/, "&apos;", "'", /'/]
+      }
+
+
+      ######################################################################
+      # These are patterns to identify common markup errors, to make the
+      # error messages more informative.
+      ######################################################################
+      MISSING_ATTRIBUTE_QUOTES = /^<#{NAME_STR}\s+#{NAME_STR}\s*=\s*[^"']/um
+
+      def initialize( source )
+        @listeners = Array.new
+        self.stream = source
+      end
+
+      def add_listener( listener )
+        @listeners.push( listener )
+      end
+
+      attr_reader :source
+
+      def stream=( source )
+        # Wrap the new input, then reset all per-document parse state.
+        @source = SourceFactory.create_from( source )
+        @closed = @document_status = nil
+        # Names of the open elements, and the queue of pushed-back events.
+        @tags, @stack = [], []
+        # One set of declared namespace prefixes is kept per open scope.
+        @entities, @nsstack = [], []
+      end
+
+      def position
+        # Current offset as reported by the source.
+        # FIXME: a source that does not respond to +position+
+        # is reported as 0.
+        return 0 unless @source.respond_to? :position
+
+        @source.position
+      end
+
+      # Returns true if there are no more events
+      def empty?
+        @source.empty? && @stack.empty?
+      end
+
+      # Returns true if there are more events.  Synonymous with !empty?
+      def has_next?
+        !@source.empty? || !@stack.empty?
+      end
+
+      # Push an event back on the head of the stream.  This method
+      # has (theoretically) infinite depth.
+      def unshift token
+        @stack.insert( 0, token )
+      end
+
+      # Peek at the +depth+ event in the stack.  The first element on the stack
+      # is at depth 0.  If +depth+ is -1, will parse to the end of the input
+      # stream and return the last event, which is always :end_document.
+      # Be aware that this causes the stream to be parsed up to the +depth+
+      # event, so you can effectively pre-parse the entire document (pull the
+      # entire thing into memory) using this method.
+      def peek depth=0
+        raise %Q[Illegal argument "#{depth}"] if depth < -1
+        pulled = []
+        if depth == -1
+          # Drain everything, so the last slot is the final event.
+          pulled << pull until empty?
+        else
+          wanted = depth + 1
+          pulled << pull while @stack.size + pulled.size < wanted
+        end
+        @stack += pulled unless pulled.empty?
+        @stack[depth]
+      end
+
+      # Returns the next event.  This is a +PullEvent+ object.
+      def pull
+        event = pull_event
+        # Every registered listener sees the event
+        # before it is handed back to the caller.
+        @listeners.each { |listener| listener.receive event }
+        event
+      end
+
+      def pull_event
+        # Produces the next raw event array: a queued :end_element for a self-closed
+        # tag first, then :end_document at EOF, then pushed-back events, then new input.
+        if @closed
+          x, @closed = @closed, nil
+          return [ :end_element, x ]
+        end
+        return [ :end_document ] if empty?
+        return @stack.shift if @stack.size > 0
+        # Top up the buffer when fewer than two characters remain in it.
+        @source.read if @source.buffer.size<2
+        # A nil status means the prolog: no doctype and no root content seen yet.
+        if @document_status == nil
+          # Look at leading whitespace or the next <...> construct to classify it.
+          word = @source.match( /^((?:\s+)|(?:<[^>]*>))/um )
+          word = word[1] unless word.nil?
+          case word
+          when COMMENT_START
+            return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
+          when XMLDECL_START
+            # A declared encoding may be applied to the source before continuing.
+            results = @source.match( XMLDECL_PATTERN, true )[1]
+            version = VERSION.match( results )
+            version = version[1] unless version.nil?
+            encoding = ENCODING.match(results)
+            encoding = encoding[1] unless encoding.nil?
+            if need_source_encoding_update?(encoding)
+              @source.encoding = encoding
+            end
+            if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
+              encoding = "UTF-16"
+            end
+            standalone = STANDALONE.match(results)
+            standalone = standalone[1] unless standalone.nil?
+            return [ :xmldecl, version, encoding, standalone ]
+          when INSTRUCTION_START
+            return [ :processing_instruction, *@source.match(INSTRUCTION_PATTERN, true)[1,2] ]
+          when DOCTYPE_START
+            md = @source.match( DOCTYPE_PATTERN, true )
+            @nsstack.unshift(curr_ns=Set.new)
+            identity = md[1]
+            close = md[2]
+            identity =~ IDENTITY
+            name = $1
+            raise REXML::ParseException.new("DOCTYPE is missing a name") if name.nil?
+            pub_sys = $2.nil? ? nil : $2.strip
+            long_name = $4.nil? ? nil : $4.strip
+            uri = $6.nil? ? nil : $6.strip
+            args = [ :start_doctype, name, pub_sys, long_name, uri ]
+            if close == ">"
+              @document_status = :after_doctype
+              @source.read if @source.buffer.size<2
+              md = @source.match(/^\s*/um, true)
+              @stack << [ :end_doctype ]
+            else
+              @document_status = :in_doctype
+            end
+            return args
+          when /^\s+/
+          else
+            @document_status = :after_doctype
+            @source.read if @source.buffer.size<2
+            md = @source.match(/\s*/um, true)
+            if @source.encoding == "UTF-8"
+              @source.buffer.force_encoding(::Encoding::UTF_8)
+            end
+          end
+        end
+        if @document_status == :in_doctype
+          md = @source.match(/\s*(.*?>)/um)
+          # No '>' left in the input: report a parse error, not a NoMethodError on nil.
+          raise REXML::ParseException.new( "Malformed DOCTYPE: missing closing '>'", @source ) if md.nil?
+          case md[1]
+          when SYSTEMENTITY
+            match = @source.match( SYSTEMENTITY, true )[1]
+            return [ :externalentity, match ]
+
+          when ELEMENTDECL_START
+            return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
+
+          when ENTITY_START
+            match = @source.match( ENTITYDECL, true ).to_a.compact
+            match[0] = :entitydecl
+            ref = false
+            if match[1] == '%'
+              ref = true
+              match.delete_at 1
+            end
+            # Now we have to sort out what kind of entity reference this is
+            if match[2] == 'SYSTEM'
+              # External reference
+              match[3] = match[3][1..-2] # PUBID
+              match.delete_at(4) if match.size > 4 # Chop out NDATA decl
+              # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
+            elsif match[2] == 'PUBLIC'
+              # External reference
+              match[3] = match[3][1..-2] # PUBID
+              match[4] = match[4][1..-2] # HREF
+              match.delete_at(5) if match.size > 5 # Chop out NDATA decl
+              # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
+            else
+              match[2] = match[2][1..-2]
+              match.pop if match.size == 4
+              # match is [ :entity, name, value ]
+            end
+            match << '%' if ref
+            return match
+          when ATTLISTDECL_START
+            md = @source.match( ATTLISTDECL_PATTERN, true )
+            raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
+            element = md[1]
+            contents = md[0]
+
+            pairs = {}
+            values = md[0].scan( ATTDEF_RE )
+            values.each do |attdef|
+              unless attdef[3] == "#IMPLIED"
+                attdef.compact!
+                val = attdef[3]
+                val = attdef[4] if val == "#FIXED "
+                pairs[attdef[0]] = val
+                if attdef[0] =~ /^xmlns:(.*)/
+                  @nsstack[0] << $1
+                end
+              end
+            end
+            return [ :attlistdecl, element, pairs, contents ]
+          when NOTATIONDECL_START
+            md = nil
+            if @source.match( PUBLIC )
+              md = @source.match( PUBLIC, true )
+              vals = [md[1],md[2],md[4],md[6]]
+            elsif @source.match( SYSTEM )
+              md = @source.match( SYSTEM, true )
+              vals = [md[1],md[2],nil,md[4]]
+            else
+              raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source )
+            end
+            return [ :notationdecl, *vals ]
+          when DOCTYPE_END
+            @document_status = :after_doctype
+            @source.match( DOCTYPE_END, true )
+            return [ :end_doctype ]
+          end
+        end
+        begin
+          if @source.buffer[0] == ?<
+            if @source.buffer[1] == ?/
+              @nsstack.shift
+              last_tag = @tags.pop
+              # The closing tag must name the element opened most recently.
+              md = @source.match( CLOSE_MATCH, true )
+              raise REXML::ParseException.new( "Missing end tag for "+
+                "'#{last_tag}' (got \"#{md[1]}\")",
+                @source) unless last_tag == md[1]
+              return [ :end_element, last_tag ]
+            elsif @source.buffer[1] == ?!
+              md = @source.match(/\A(\s*[^>]*>)/um)
+              # A '<!' construct without any closing '>' is rejected outright.
+              raise REXML::ParseException.new("Malformed node", @source) unless md
+              if md[0][2] == ?-
+                md = @source.match( COMMENT_PATTERN, true )
+
+                case md[1]
+                when /--/, /-\z/
+                  raise REXML::ParseException.new("Malformed comment", @source)
+                end
+
+                return [ :comment, md[1] ] if md
+              else
+                md = @source.match( CDATA_PATTERN, true )
+                return [ :cdata, md[1] ] if md
+              end
+              raise REXML::ParseException.new( "Declarations can only occur "+
+                "in the doctype declaration.", @source)
+            elsif @source.buffer[1] == ??
+              md = @source.match( INSTRUCTION_PATTERN, true )
+              return [ :processing_instruction, md[1], md[2] ] if md
+              raise REXML::ParseException.new( "Bad instruction declaration",
+                @source)
+            else
+              # Get the next tag
+              md = @source.match(TAG_MATCH, true)
+              unless md
+                # Check for missing attribute quotes
+                raise REXML::ParseException.new("missing attribute quote", @source) if @source.match(MISSING_ATTRIBUTE_QUOTES )
+                raise REXML::ParseException.new("malformed XML: missing tag start", @source)
+              end
+              attributes = {}
+              prefixes = Set.new
+              prefixes << md[2] if md[2]
+              @nsstack.unshift(curr_ns=Set.new)
+              if md[4].size > 0
+                attrs = md[4].scan( ATTRIBUTE_PATTERN )
+                raise REXML::ParseException.new( "error parsing attributes: [#{attrs.join ', '}], excess = \"#$'\"", @source) if $' and $'.strip.size > 0
+                attrs.each do |attr_name, prefix, local_part, quote, value|
+                  if prefix == "xmlns"
+                    if local_part == "xml"
+                      if value != "http://www.w3.org/XML/1998/namespace"
+                        msg = "The 'xml' prefix must not be bound to any other namespace "+
+                        "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
+                        raise REXML::ParseException.new( msg, @source, self )
+                      end
+                    elsif local_part == "xmlns"
+                      msg = "The 'xmlns' prefix must not be declared "+
+                      "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
+                      raise REXML::ParseException.new( msg, @source, self)
+                    end
+                    curr_ns << local_part
+                  elsif prefix
+                    prefixes << prefix unless prefix == "xml"
+                  end
+
+                  if attributes.has_key?(attr_name)
+                    msg = "Duplicate attribute #{attr_name.inspect}"
+                    raise REXML::ParseException.new(msg, @source, self)
+                  end
+
+                  attributes[attr_name] = value
+                end
+              end
+
+              # Verify that all of the prefixes have been defined
+              for prefix in prefixes
+                unless @nsstack.find{|k| k.member?(prefix)}
+                  raise UndefinedNamespaceException.new(prefix,@source,self)
+                end
+              end
+
+              if md[6]
+                @closed = md[1]
+                @nsstack.shift
+              else
+                @tags.push( md[1] )
+              end
+              return [ :start_element, md[1], attributes ]
+            end
+          else
+            md = @source.match( TEXT_PATTERN, true )
+            if md[0].length == 0
+              @source.match( /(\s+)/, true )
+            end
+            # Text is returned raw; expanding references is left to the caller.
+            return [ :text, md[1] ]
+          end
+        rescue REXML::UndefinedNamespaceException
+          raise
+        rescue REXML::ParseException
+          raise
+        rescue Exception, NameError => error
+          raise REXML::ParseException.new( "Exception parsing",
+            @source, self, (error ? error : $!) )
+        end
+        return [ :dummy ]
+      end
+      private :pull_event
+
+      def entity( reference, entities )
+        # Caller-supplied entities take precedence over the predefined ones.
+        replacement = entities ? entities[ reference ] : nil
+        unless replacement
+          builtin = DEFAULT_ENTITIES[ reference ]
+          replacement = builtin[2] if builtin
+        end
+        replacement ? unnormalize( replacement, entities ) : nil
+      end
+
+      # Escapes all possible entities
+      def normalize( input, entities=nil, entity_filter=nil )
+        copy = input.clone
+        # Doing it like this rather than in a loop improves the speed
+        copy.gsub!( EREFERENCE, '&amp;' )
+        entities.each do |key, value|
+          # Skip names listed in entity_filter; the bare +entity+ used here before raised ArgumentError.
+          copy.gsub!( value, "&#{key};" ) unless entity_filter and entity_filter.include?(key)
+        end if entities
+        copy.gsub!( EREFERENCE, '&amp;' )
+        DEFAULT_ENTITIES.each do |key, value|
+          copy.gsub!( value[3], value[1] )
+        end
+        copy
+      end
+
+      # Unescapes all possible entities
+      def unnormalize( string, entities=nil, filter=nil )
+        rv = string.clone
+        rv.gsub!( /\r\n?/, "\n" )
+        matches = rv.scan( REFERENCE_RE )
+        return rv if matches.size == 0
+        rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
+          m=$1
+          m = "0#{m}" if m[0] == ?x
+          [Integer(m)].pack('U*')
+        }
+        matches.collect!{|x|x[0]}.compact!
+        if matches.size > 0
+          matches.each do |entity_reference|
+            unless filter and filter.include?(entity_reference)
+              entity_value = entity( entity_reference, entities )
+              if entity_value
+                # Literal pattern + block: name metacharacters and backslashes in the value stay literal.
+                rv.gsub!( "&#{entity_reference};" ) { entity_value }
+              else
+                er = DEFAULT_ENTITIES[entity_reference]
+                rv.gsub!( er[0], er[2] ) if er
+              end
+            end
+          end
+          rv.gsub!( /&amp;/, '&' )
+        end
+        rv
+      end
+
+      private
+      def need_source_encoding_update?(xml_declaration_encoding)
+        # True unless the declared encoding is absent or plain "UTF-16".
+        declared = xml_declaration_encoding
+        !declared.nil? && !(/\AUTF-16\z/i =~ declared)
+      end
+    end
+  end
+end
+
+=begin
+  case event[0]
+  when :start_element
+  when :text
+  when :end_element
+  when :processing_instruction
+  when :cdata
+  when :comment
+  when :xmldecl
+  when :start_doctype
+  when :end_doctype
+  when :externalentity
+  when :elementdecl
+  when :entity
+  when :attlistdecl
+  when :notationdecl
+  when :end_doctype
+  end
+=end
diff --git a/jni/ruby/lib/rexml/parsers/lightparser.rb b/jni/ruby/lib/rexml/parsers/lightparser.rb
new file mode 100644
index 0000000..8104168
--- /dev/null
+++ b/jni/ruby/lib/rexml/parsers/lightparser.rb
@@ -0,0 +1,58 @@
+require 'rexml/parsers/streamparser'
+require 'rexml/parsers/baseparser'
+require 'rexml/light/node'
+
+module REXML
+  module Parsers
+    class LightParser
+      def initialize stream
+        @parser = REXML::Parsers::BaseParser.new( stream )
+        @stream = stream
+      end
+
+      def add_listener( listener )
+        @parser.add_listener listener
+      end
+
+      def rewind
+        @stream.rewind
+        @parser.stream = @stream
+      end
+
+      # Pulls all events and nests them as arrays; returns the :document root.
+      def parse
+        root = [ :document ]
+        context = root
+        while (node = @parser.pull)[0] != :end_document
+          kind = node[0]
+          if kind == :end_element or kind == :end_doctype
+            # Closing events are not stored; climb back to the parent,
+            # which every stored node keeps at index 1.
+            context = context[1]
+          else
+            # Append to the open container, then splice that container
+            # in as the node's parent reference.
+            context << node
+            node[1,0] = [context]
+            # Only opening events become the new open container.
+            opening = (kind == :start_element or kind == :start_doctype)
+            context = node if opening
+          end
+        end
+        root
+      end
+    end
+
+    # An element is an array.  The array contains:
+    #  0                        The parent element
+    #  1                        The tag name
+    #  2                        A hash of attributes
+    #  3..-1    The child elements
+    # An element is an array of size > 3
+    # Text is a String
+    # PIs are [ :processing_instruction, target, data ]
+    # Comments are [ :comment, data ]
+    # DocTypes are DocType structs
+    # The root is an array with XMLDecls, Text, DocType, Array, Text
+  end
+end
diff --git a/jni/ruby/lib/rexml/parsers/pullparser.rb b/jni/ruby/lib/rexml/parsers/pullparser.rb
new file mode 100644
index 0000000..68a4ff7
--- /dev/null
+++ b/jni/ruby/lib/rexml/parsers/pullparser.rb
@@ -0,0 +1,196 @@
+require 'forwardable'
+
+require 'rexml/parseexception'
+require 'rexml/parsers/baseparser'
+require 'rexml/xmltokens'
+
+module REXML
+  module Parsers
+    # = Using the Pull Parser
+    # <em>This API is experimental, and subject to change.</em>
+    #  parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
+    #  while parser.has_next?
+    #    res = parser.next
+    #    puts res[1]['att'] if res.start_tag? and res[0] == 'b'
+    #  end
+    # See the PullEvent class for information on the content of the results.
+    # The data is identical to the arguments passed for the various events to
+    # the StreamListener API.
+    #
+    # Notice that:
+    #  parser = PullParser.new( "<a>BAD DOCUMENT" )
+    #  while parser.has_next?
+    #    res = parser.next
+    #    raise res[1] if res.error?
+    #  end
+    #
+    # Nat Price gave me some good ideas for the API.
+    class PullParser
+      include XMLTokens
+      extend Forwardable
+
+      def_delegators( :@parser, :has_next? )
+      def_delegators( :@parser, :entity )
+      def_delegators( :@parser, :empty? )
+      def_delegators( :@parser, :source )
+
+      def initialize stream
+        @entities = {}
+        @listeners = nil
+        @parser = BaseParser.new( stream )
+        @my_stack = []
+      end
+
+      def add_listener( listener )
+        @listeners = [] unless @listeners
+        @listeners << listener
+      end
+
+      def each
+        while has_next?
+          yield self.pull
+        end
+      end
+
+      def peek depth=0
+        # Fill the look-ahead queue through the same path as #pull, so peeked
+        # text events carry their unnormalized form and entities get recorded.
+        @my_stack.push( next_event ) while @my_stack.length <= depth
+        @my_stack[depth]
+      end
+
+      def pull
+        return @my_stack.shift if @my_stack.length > 0
+        next_event
+      end
+
+      # Reads one parser event, records internal entities, expands text, wraps it.
+      def next_event
+        event = @parser.pull
+        case event[0]
+        when :entitydecl
+          @entities[ event[1] ] = event[2] unless event[2] =~ /PUBLIC|SYSTEM/
+        when :text
+          event << @parser.unnormalize( event[1], @entities )
+        end
+        PullEvent.new( event )
+      end
+      private :next_event
+
+      def unshift token
+        @my_stack.unshift token
+      end
+    end
+
+    # A parsing event.  The contents of the event are accessed as an +Array+,
+    # and the type is given either by the ...? methods, or by accessing the
+    # +type+ accessor.  The contents of this object vary from event to event,
+    # but are identical to the arguments passed to +StreamListener+s for each
+    # event.
+    class PullEvent
+      # Wraps one parser event array.  Its first element is the event type (for
+      # example :start_element, :end_element, :text, :processing_instruction,
+      # :comment, :start_doctype, :cdata or :xmldecl); the rest is the content.
+      def initialize(arg)
+        @contents = arg
+      end
+
+      def []( start, endd=nil)
+        unless start.kind_of? Range or start.kind_of? Numeric
+          raise "Illegal argument #{start.inspect} (#{start.class})"
+        end
+        # Index into the content only (type slot dropped) so that ranges and
+        # negative indices behave exactly as they would on a plain Array;
+        # the old start+1 arithmetic shifted only one end of a Range and
+        # let negative indices reach the type slot.
+        content = @contents[1..-1]
+        return content[start] if start.kind_of? Range or endd.nil?
+
+        content[start, endd]
+      end
+
+      def event_type
+        @contents[0]
+      end
+
+      # Content: [ String tag_name, Hash attributes ]
+      def start_element?
+        @contents[0] == :start_element
+      end
+
+      # Content: [ String tag_name ]
+      def end_element?
+        @contents[0] == :end_element
+      end
+
+      # Content: [ String raw_text, String unnormalized_text ]
+      def text?
+        @contents[0] == :text
+      end
+
+      # Content: [ String target, String content ]
+      def instruction?
+        @contents[0] == :processing_instruction
+      end
+
+      # Content: [ String text ]
+      def comment?
+        @contents[0] == :comment
+      end
+
+      # Content: [ String name, String pub_sys, String long_name, String uri ]
+      def doctype?
+        @contents[0] == :start_doctype
+      end
+
+      # Content: [ String element, Hash default_values, String declaration ]
+      def attlistdecl?
+        @contents[0] == :attlistdecl
+      end
+
+      # Content: [ String text ]
+      def elementdecl?
+        @contents[0] == :elementdecl
+      end
+
+      # Due to the wonders of DTDs, an entity declaration can be just about
+      # anything.  There's no way to normalize it; you'll have to interpret the
+      # content yourself.  However, the following is true:
+      #
+      # * If the entity declaration is an internal entity:
+      #   [ String name, String value ]
+      # * External entities also carry 'SYSTEM' or 'PUBLIC' and their ids.
+      def entitydecl?
+        @contents[0] == :entitydecl
+      end
+
+      # Content: [ String name, 'PUBLIC' or 'SYSTEM', public id or nil, system id or nil ]
+      def notationdecl?
+        @contents[0] == :notationdecl
+      end
+
+      # Content: [ String text ]
+      def entity?
+        @contents[0] == :entity
+      end
+
+      # Content: [ String text ]
+      def cdata?
+        @contents[0] == :cdata
+      end
+
+      # Content: [ String version, String encoding, String standalone ]
+      def xmldecl?
+        @contents[0] == :xmldecl
+      end
+
+      def error?
+        @contents[0] == :error
+      end
+
+      def inspect
+        @contents[0].to_s + ": " + @contents[1..-1].inspect
+      end
+    end
+  end
+end
diff --git a/jni/ruby/lib/rexml/parsers/sax2parser.rb b/jni/ruby/lib/rexml/parsers/sax2parser.rb
new file mode 100644
index 0000000..a72c0a7
--- /dev/null
+++ b/jni/ruby/lib/rexml/parsers/sax2parser.rb
@@ -0,0 +1,272 @@
+require 'rexml/parsers/baseparser'
+require 'rexml/parseexception'
+require 'rexml/namespace'
+require 'rexml/text'
+
+module REXML
+  module Parsers
+    # SAX2Parser
+    class SAX2Parser
+      def initialize source
+        @parser = BaseParser.new(source) # pull parser whose events #parse consumes
+        @listeners = []        # [symbol, match, listener] entries registered through #add
+        @procs = []            # [symbol, match, callable] entries registered through #listen / #add
+        @namespace_stack = []  # { prefix => uri } hashes pushed by #parse on :start_element, innermost last
+        @has_listeners = false # written by #add and #deafen; never read inside this class
+        @tag_stack = []        # names of the elements currently open, innermost last
+        @entities = {}         # name => value, filled by #handle_entitydecl for three-element events
+      end
+
+      def source # returns @parser.source, i.e. delegates to the underlying BaseParser
+        @parser.source
+      end
+
+      def add_listener( listener ) # forwarded to the BaseParser; the listener is NOT added to this parser's @listeners (use #listen for that)
+        @parser.add_listener( listener )
+      end
+
+      # Listen arguments:
+      #
+      # Symbol, Array, Block
+      #         Listen to Symbol events on Array elements
+      # Symbol, Block
+      #   Listen to Symbol events
+      # Array, Listener
+      #         Listen to all events on Array elements
+      # Array, Block
+      #         Listen to :start_element events on Array elements
+      # Listener
+      #         Listen to All events
+      #
+      # Symbol can be one of: :start_element, :end_element,
+      # :start_prefix_mapping, :end_prefix_mapping, :characters,
+      # :processing_instruction, :doctype, :attlistdecl, :elementdecl,
+      # :entitydecl, :notationdecl, :cdata, :xmldecl, :comment
+      #
+      # There is an additional symbol that can be listened for: :progress.
+      # This will be called for every event generated, passing in the current
+      # stream position.
+      #
+      # Array contains regular expressions or strings which will be matched
+      # against fully qualified element names.
+      #
+      # Listener must implement the methods in SAX2Listener
+      #
+      # Block will be passed the same arguments as a SAX2Listener method would
+      # be, where the method name is the same as the matched Symbol.
+      # See the SAX2Listener for more information.
+      def listen( *args, &blok )
+        # Dispatch on the type of the first argument; see the argument table above.
+        first, second = args[0], args[1]
+        case first
+        when Symbol # (Symbol, Array, Block) or (Symbol, Block)
+          if args.size == 2
+            second.each { |pattern| @procs << [first, pattern, blok] }
+          else
+            add( [first, nil, blok] )
+          end
+        when Array # (Array, Listener) or (Array, Block); the block form means :start_element
+          event, observer = (args.size == 2) ? [nil, second] : [:start_element, blok]
+          first.each { |pattern| add( [event, pattern, observer] ) }
+        else # a lone Listener: every event, every element
+          add( [nil, nil, first] )
+        end
+      end
+
+      def deafen( listener=nil, &blok ) # drops a listener object, or the given block when no listener is passed
+        if listener
+          @listeners.delete_if { |entry| entry.last == listener }
+          @has_listeners = false if @listeners.empty?
+        else
+          @procs.delete_if { |entry| entry.last == blok }
+        end
+      end
+
+      def parse
+        @procs.each { |sym,match,block| block.call if sym == :start_document } # blocks for :start_document are called without arguments
+        @listeners.each { |sym,match,block|
+          block.start_document if sym == :start_document or sym.nil? # a nil symbol marks a listener registered for every event
+        }
+        context = []
+        while true
+          event = @parser.pull
+          case event[0]
+          when :end_document
+            handle( :end_document )
+            break
+          when :start_doctype
+            handle( :doctype, *event[1..-1]) # observers see this event under the name :doctype
+          when :end_doctype
+            context = context[1] # context is never read afterwards; no observer is notified here
+          when :start_element
+            @tag_stack.push(event[1])
+            # find the observers for namespaces
+            procs = get_procs( :start_prefix_mapping, event[1] )
+            listeners = get_listeners( :start_prefix_mapping, event[1] )
+            if procs or listeners
+              # break out the namespace declarations
+              # The attributes live in event[2]
+              event[2].each {|n, v| event[2][n] = @parser.normalize(v)}
+              nsdecl = event[2].find_all { |n, value| n =~ /^xmlns(:|$)/ }
+              nsdecl.collect! { |n, value| [ n[6..-1], value ] } # "xmlns:p" -> "p"; a bare "xmlns" -> nil
+              @namespace_stack.push({})
+              nsdecl.each do |n,v|
+                @namespace_stack[-1][n] = v
+                # notify observers of namespaces
+                procs.each { |ob| ob.call( n, v ) } if procs
+                listeners.each { |ob| ob.start_prefix_mapping(n, v) } if listeners
+              end
+            end
+            event[1] =~ Namespace::NAMESPLIT
+            prefix = $1
+            local = $2
+            uri = get_namespace(prefix)
+            # find the observers for start_element
+            procs = get_procs( :start_element, event[1] )
+            listeners = get_listeners( :start_element, event[1] )
+            # notify observers
+            procs.each { |ob| ob.call( uri, local, event[1], event[2] ) } if procs
+            listeners.each { |ob|
+              ob.start_element( uri, local, event[1], event[2] )
+            } if listeners
+          when :end_element
+            @tag_stack.pop
+            event[1] =~ Namespace::NAMESPLIT
+            prefix = $1
+            local = $2
+            uri = get_namespace(prefix) # resolved before this element's own scope is popped below
+            # find the observers for end_element
+            procs = get_procs( :end_element, event[1] )
+            listeners = get_listeners( :end_element, event[1] )
+            # notify observers
+            procs.each { |ob| ob.call( uri, local, event[1] ) } if procs
+            listeners.each { |ob|
+              ob.end_element( uri, local, event[1] )
+            } if listeners
+
+            namespace_mapping = @namespace_stack.pop # nil when nothing was pushed for this element
+            # find the observers for namespaces
+            procs = get_procs( :end_prefix_mapping, event[1] )
+            listeners = get_listeners( :end_prefix_mapping, event[1] )
+            if procs or listeners
+              namespace_mapping.each do |ns_prefix, ns_uri|
+                # notify observers of namespaces
+                procs.each { |ob| ob.call( ns_prefix ) } if procs
+                listeners.each { |ob| ob.end_prefix_mapping(ns_prefix) } if listeners
+              end
+            end
+          when :text
+            #normalized = @parser.normalize( event[1] )
+            #handle( :characters, normalized )
+            copy = event[1].clone
+
+            esub = proc { |match|
+              if @entities.has_key?($1)
+                @entities[$1].gsub(Text::REFERENCE, &esub) # expand references nested in the replacement text too
+              else
+                match # unknown references are left as written
+              end
+            }
+
+            copy.gsub!( Text::REFERENCE, &esub )
+            copy.gsub!( Text::NUMERICENTITY ) {|m|
+              m=$1
+              m = "0#{m}" if m[0] == ?x # turn "x41" into "0x41" so Integer() reads it as hexadecimal
+              [Integer(m)].pack('U*')
+            }
+            handle( :characters, copy ) # text is reported to observers as :characters
+          when :entitydecl
+            handle_entitydecl( event )
+          when :processing_instruction, :comment, :attlistdecl,
+            :elementdecl, :cdata, :notationdecl, :xmldecl
+            handle( *event ) # event[0] is the callback name, the rest are its arguments
+          end
+          handle( :progress, @parser.position ) # fired after every event except :end_document (which breaks first)
+        end
+      end
+
+      private
+      def handle( symbol, *arguments )
+        current = @tag_stack.last
+        blocks = get_procs( symbol, current )
+        observers = get_listeners( symbol, current )
+        # Either lookup yields nil when nothing at all is registered on that side.
+        blocks.each { |blk| blk.call( *arguments ) } if blocks
+        return unless observers
+        callback = symbol.to_s
+        observers.each { |observer| observer.send( callback, *arguments ) }
+      end
+
+      def handle_entitydecl( event )
+        @entities[ event[1] ] = event[2] if event.size == 3 # remembered so #parse can expand the reference inside :text events
+        parameter_reference_p = false
+        case event[2]
+        when "SYSTEM"
+          if event.size == 5
+            if event.last == "%"
+              parameter_reference_p = true
+            else
+              event[4, 0] = "NDATA" # inserts the keyword at index 4, in front of the last element
+            end
+          end
+        when "PUBLIC"
+          if event.size == 6
+            if event.last == "%"
+              parameter_reference_p = true
+            else
+              event[5, 0] = "NDATA" # same insertion, one slot further right
+            end
+          end
+        else
+          parameter_reference_p = (event.size == 4)
+        end
+        event[1, 0] = event.pop if parameter_reference_p # moves the last element to index 1
+        handle( event[0], event[1..-1] ) # observers receive everything after the tag as ONE array argument
+      end
+
+      # The following methods are duplicates, but it is faster than using
+      # a helper
+      def get_procs( symbol, name )
+        # nil (rather than an empty array) tells callers that no block is registered at all.
+        return nil if @procs.empty?
+        chosen = @procs.select do |sym, match, _block|
+          next false unless sym.nil? || symbol == sym
+          # A nil matcher accepts every element; otherwise the name has to be
+          # equal to the matcher or, for a Regexp matcher, has to match it.
+          next true if match.nil? || name == match
+          match.kind_of?( Regexp ) && name =~ match
+        end
+        # Entries are [symbol, matcher, callable]; only the callable is returned.
+        chosen.collect { |entry| entry[-1] }
+      end
+      def get_listeners( symbol, name )
+        # nil (rather than an empty array) tells callers that no listener is registered at all.
+        return nil if @listeners.empty?
+        chosen = @listeners.select do |sym, match, _listener|
+          event_ok = sym.nil? || symbol == sym
+          # A nil matcher accepts every element; otherwise the name has to be
+          # equal to the matcher or, for a Regexp matcher, has to match it.
+          name_ok = match.nil? || name == match || (match.kind_of?( Regexp ) && name =~ match)
+          event_ok && name_ok
+        end
+        # Entries are [symbol, matcher, listener]; only the listener is returned.
+        chosen.collect { |entry| entry[-1] }
+      end
+
+      def add( pair )
+        # An entry whose last element responds to :call is a block; anything else is a listener object.
+        unless pair.last.respond_to?( :call )
+          @listeners << pair unless @listeners.include?( pair )
+          return @has_listeners = true
+        end
+        @procs.include?( pair ) ? nil : @procs << pair
+      end
+
+      def get_namespace( prefix )
+        # The innermost (last pushed) scope that binds the prefix wins; nil when no scope binds it.
+        scope = @namespace_stack.reverse.find { |ns| not ns[prefix].nil? }
+        scope[prefix] unless scope.nil?
+      end
+    end
+  end
+end
diff --git a/jni/ruby/lib/rexml/parsers/streamparser.rb b/jni/ruby/lib/rexml/parsers/streamparser.rb
new file mode 100644
index 0000000..9ea65ed
--- /dev/null
+++ b/jni/ruby/lib/rexml/parsers/streamparser.rb
@@ -0,0 +1,52 @@
+require "rexml/parsers/baseparser"
+
+module REXML
+  module Parsers
+    class StreamParser
+      def initialize( source, listener ) # listener receives one callback per event pulled from source
+        @listener = listener
+        @parser = BaseParser.new( source )
+      end
+
+      def add_listener( listener ) # forwarded to the BaseParser; does not replace @listener
+        @parser.add_listener( listener )
+      end
+
+      def parse
+        # Pull events until :end_document, translating each one into the
+        # matching callback on @listener.
+        while true
+          event = @parser.pull
+          kind = event[0]
+          case kind
+          when :end_document
+            return
+          when :start_element
+            # Attribute values are unnormalized in place before being handed over.
+            attributes = event[2]
+            attributes.each { |n, v| attributes[n] = @parser.unnormalize( v ) }
+            @listener.tag_start( event[1], attributes )
+          when :end_element
+            @listener.tag_end( event[1] )
+          when :text
+            @listener.text( @parser.unnormalize( event[1] ) )
+          when :processing_instruction
+            @listener.instruction( *event[1,2] )
+          when :start_doctype
+            @listener.doctype( *event[1..-1] )
+          when :end_doctype
+            # FIXME: remove this condition for milestone:3.2
+            @listener.doctype_end if @listener.respond_to? :doctype_end
+          when :comment, :attlistdecl, :cdata, :xmldecl, :elementdecl
+            @listener.send( kind.to_s, *event[1..-1] ) # payload splatted into separate arguments
+          when :entitydecl, :notationdecl
+            @listener.send( kind.to_s, event[1..-1] ) # payload passed as a single array
+          when :externalentity
+            # Strip the leading "%" and the trailing ";" from the reference.
+            @listener.entity( event[1].gsub(/\A%|;\z/, "") )
+          end
+        end
+      end
+    end
+  end
+end
diff --git a/jni/ruby/lib/rexml/parsers/treeparser.rb b/jni/ruby/lib/rexml/parsers/treeparser.rb
new file mode 100644
index 0000000..68edb77
--- /dev/null
+++ b/jni/ruby/lib/rexml/parsers/treeparser.rb
@@ -0,0 +1,100 @@
+require 'rexml/validation/validationexception'
+require 'rexml/undefinednamespaceexception'
+
+module REXML
+  module Parsers
+    class TreeParser
+      def initialize( source, build_context = Document.new ) # build_context: node that receives the parsed content
+        @build_context = build_context
+        @parser = Parsers::BaseParser.new( source )
+      end
+
+      def add_listener( listener ) # forwarded to the underlying BaseParser
+        @parser.add_listener( listener )
+      end
+
+      def parse
+        tag_stack = [] # names of elements opened but not yet closed
+        in_doctype = false # :text events are ignored while this is true
+        entities = nil # replaced by a hash at :start_doctype
+        begin
+          while true
+            event = @parser.pull
+            #STDERR.puts "TREEPARSER GOT #{event.inspect}"
+            case event[0]
+            when :end_document
+              unless tag_stack.empty? # input ended while an element was still open
+                raise ParseException.new("No close tag for #{@build_context.xpath}",
+                                         @parser.source, @parser)
+              end
+              return
+            when :start_element
+              tag_stack.push(event[1])
+              el = @build_context = @build_context.add_element( event[1] ) # descend: the new element becomes the build context
+              event[2].each do |key, value|
+                el.attributes[key]=Attribute.new(key,value,self)
+              end
+            when :end_element
+              tag_stack.pop
+              @build_context = @build_context.parent # climb back to the enclosing node
+            when :text
+              if not in_doctype
+                if @build_context[-1].instance_of? Text
+                  @build_context[-1] << event[1] # extend the preceding Text node instead of adding a sibling
+                else
+                  @build_context.add(
+                    Text.new(event[1], @build_context.whitespace, nil, true)
+                  ) unless (
+                    @build_context.ignore_whitespace_nodes and
+                    event[1].strip.size==0
+                  )
+                end
+              end
+            when :comment
+              c = Comment.new( event[1] )
+              @build_context.add( c )
+            when :cdata
+              c = CData.new( event[1] )
+              @build_context.add( c )
+            when :processing_instruction
+              @build_context.add( Instruction.new( event[1], event[2] ) )
+            when :end_doctype
+              in_doctype = false
+              entities.each { |k,v| entities[k] = @build_context.entities[k].value } # the local hash is not read again in this method
+              @build_context = @build_context.parent
+            when :start_doctype
+              doctype = DocType.new( event[1..-1], @build_context )
+              @build_context = doctype # declarations that follow are added to the doctype
+              entities = {}
+              in_doctype = true
+            when :attlistdecl
+              n = AttlistDecl.new( event[1..-1] )
+              @build_context.add( n )
+            when :externalentity
+              n = ExternalEntity.new( event[1] )
+              @build_context.add( n )
+            when :elementdecl
+              n = ElementDecl.new( event[1] )
+              @build_context.add(n)
+            when :entitydecl
+              entities[ event[1] ] = event[2] unless event[2] =~ /PUBLIC|SYSTEM/
+              @build_context.add(Entity.new(event))
+            when :notationdecl
+              n = NotationDecl.new( *event[1..-1] )
+              @build_context.add( n )
+            when :xmldecl
+              x = XMLDecl.new( event[1], event[2], event[3] )
+              @build_context.add( x )
+            end
+          end
+        rescue REXML::Validation::ValidationException
+          raise # validation and parse errors pass through unchanged
+        rescue REXML::ParseException
+          raise
+        rescue
+          raise ParseException.new( $!.message, @parser.source, @parser, $! ) # any other StandardError is wrapped, keeping the original as the last argument
+        end
+      end
+    end
+  end
+end
diff --git a/jni/ruby/lib/rexml/parsers/ultralightparser.rb b/jni/ruby/lib/rexml/parsers/ultralightparser.rb
new file mode 100644
index 0000000..4e2d7a8
--- /dev/null
+++ b/jni/ruby/lib/rexml/parsers/ultralightparser.rb
@@ -0,0 +1,56 @@
+require 'rexml/parsers/streamparser'
+require 'rexml/parsers/baseparser'
+
+module REXML
+  module Parsers
+    class UltraLightParser
+      def initialize( stream ) # the stream is kept so that #rewind can re-attach it
+        @stream = stream
+        @parser = REXML::Parsers::BaseParser.new( stream )
+      end
+
+      def add_listener( listener ) # forwarded to the underlying BaseParser
+        @parser.add_listener( listener )
+      end
+
+      def rewind # rewinds the stream, then hands it to the parser again
+        @stream.rewind
+        @parser.stream = @stream
+      end
+
+      def parse
+        # Events nest as plain arrays: an opening event becomes the container for
+        # everything up to its closing event, and its slot 1 links back to the parent.
+        context = root = []
+        while true
+          event = @parser.pull
+          case event[0]
+          when :end_document
+            break
+          when :start_element, :start_doctype
+            context << event
+            event.insert( 1, context )
+            context = event
+          when :end_element, :end_doctype
+            context = context[1] # step back up to the parent container
+          else
+            context << event
+          end
+        end
+        root
+      end
+    end
+
+    # An element is its :start_element event array with the parent spliced in:
+    #  0                        :start_element
+    #  1                        The parent array (the root array at top level)
+    #  2                        The tag name
+    #  3                        A hash of attributes
+    #  4..-1    The child events, in document order
+    # A doctype is built the same way from its :start_doctype event
+    # Every other event, text included, is appended unchanged, e.g.
+    # PIs are [ :processing_instruction, target, data ]
+    # Comments are [ :comment, data ]
+    # The root is a plain array holding the top-level events
+  end
+end
diff --git a/jni/ruby/lib/rexml/parsers/xpathparser.rb b/jni/ruby/lib/rexml/parsers/xpathparser.rb
new file mode 100644
index 0000000..57767fb
--- /dev/null
+++ b/jni/ruby/lib/rexml/parsers/xpathparser.rb
@@ -0,0 +1,656 @@
+require 'rexml/namespace'
+require 'rexml/xmltokens'
+
+module REXML
+  module Parsers
+    # You don't want to use this class.  Really.  Use XPath, which is a wrapper
+    # for this class.  Believe me.  You don't want to poke around in here.
+    # There is strange, dark magic at work in this code.  Beware.  Go back!  Go
+    # back while you still can!
+    class XPathParser
+      include XMLTokens
+      LITERAL    = /^'([^']*)'|^"([^"]*)"/u
+
+      def namespaces=( namespaces )
+        Functions::namespace_context = namespaces # also assigned on the Functions module, not only kept on this parser
+        @namespaces = namespaces
+      end
+
+      def parse path
+        # Work on a copy and remove blanks directly inside brackets and
+        # parentheses before handing the text to the expression grammar.
+        stripped = path.dup
+        stripped.gsub!(/([\(\[])\s+/, '\1')
+        stripped.gsub!( /\s+([\]\)])/, '\1')
+        [].tap { |tokens| OrExpr(stripped, tokens) }
+      end
+
+      def predicate path
+        # Wrap the bare expression in brackets so the Predicate production accepts it.
+        bracketed = "[#{path}]"
+        [].tap { |tokens| Predicate( bracketed, tokens ) }
+      end
+
+      def abbreviate( path )
+        path = path.kind_of?(String) ? parse( path ) : path # accepts XPath text or an already parsed token array
+        string = ""
+        document = false
+        while path.size > 0
+          op = path.shift # consumes the array, including one handed in by the caller
+          case op
+          when :node # emits nothing
+          when :attribute
+            string << "/" if string.size > 0
+            string << "@"
+          when :child
+            string << "/" if string.size > 0
+          when :descendant_or_self
+            string << "/"
+          when :self
+            string << "."
+          when :parent
+            string << ".."
+          when :any
+            string << "*"
+          when :text
+            string << "text()"
+          when :following, :following_sibling,
+                :ancestor, :ancestor_or_self, :descendant,
+                :namespace, :preceding, :preceding_sibling
+            string << "/" unless string.size == 0
+            string << op.to_s.tr("_", "-") # these axes are spelled out, e.g. following-sibling::
+            string << "::"
+          when :qname # followed by two tokens: prefix and local name
+            prefix = path.shift
+            name = path.shift
+            string << prefix+":" if prefix.size > 0
+            string << name
+          when :predicate # followed by one token: the parsed predicate
+            string << '['
+            string << predicate_to_string( path.shift ) {|x| abbreviate( x ) }
+            string << ']'
+          when :document
+            document = true # remembered; the leading "/" is added after the loop
+          when :function # followed by the function name and its argument list
+            string << path.shift
+            string << "( "
+            string << predicate_to_string( path.shift[0] ) {|x| abbreviate( x )} # only the first argument is rendered
+            string << " )"
+          when :literal
+            string << %Q{ "#{path.shift}" }
+          else
+            string << "/" unless string.size == 0
+            string << "UNKNOWN("
+            string << op.inspect
+            string << ")"
+          end
+        end
+        string = "/"+string if document
+        return string
+      end
+
+      def expand( path )
+        path = path.kind_of?(String) ? parse( path ) : path # accepts XPath text or an already parsed token array
+        string = ""
+        document = false
+        while path.size > 0
+          op = path.shift # consumes the array, including one handed in by the caller
+          case op
+          when :node
+            string << "node()"
+          when :attribute, :child, :following, :following_sibling,
+                :ancestor, :ancestor_or_self, :descendant, :descendant_or_self,
+                :namespace, :preceding, :preceding_sibling, :self, :parent
+            string << "/" unless string.size == 0
+            string << op.to_s.tr("_", "-") # every axis is written in full, e.g. descendant-or-self::
+            string << "::"
+          when :any
+            string << "*"
+          when :qname # followed by two tokens: prefix and local name
+            prefix = path.shift
+            name = path.shift
+            string << prefix+":" if prefix.size > 0
+            string << name
+          when :predicate # followed by one token: the parsed predicate
+            string << '['
+            string << predicate_to_string( path.shift ) { |x| expand(x) }
+            string << ']'
+          when :document
+            document = true # remembered; the leading "/" is added after the loop
+          else # this includes :text, :function and :literal, which have no branch here
+            string << "/" unless string.size == 0
+            string << "UNKNOWN("
+            string << op.inspect
+            string << ")"
+          end
+        end
+        string = "/"+string if document
+        return string
+      end
+
+      def predicate_to_string( path, &block )
+        # Renders one node of a parsed predicate as text.
+        #
+        # +path+ is consumed (shifted) while it is rendered.  A node that is
+        # neither an operator, a function call nor a literal is handed to the
+        # block, and whatever the block returns is inserted as is.
+        # Runs of blanks in the result are squeezed to a single blank.
+        symbols = {
+          :eq => "=",
+          :lt => "<",
+          :gt => ">",
+          :lteq => "<=",
+          :gteq => ">=",
+          :neq => "!=",
+          :union => "|"
+        }
+        string = ""
+        case path[0]
+        when :and, :or, :mult, :plus, :minus, :neq, :eq, :lt, :gt, :lteq, :gteq, :div, :mod, :union
+          # Binary node: [ operator, left operand, right operand ].
+          # Operators without an entry above are printed by their symbol name.
+          op = path.shift
+          op = symbols.fetch( op, op )
+          left = predicate_to_string( path.shift, &block )
+          right = predicate_to_string( path.shift, &block )
+          string << " " << left
+          string << " " << op.to_s
+          string << " " << right
+          string << " "
+        when :function
+          # Function node: [ :function, name, argument node ].
+          path.shift
+          name = path.shift
+          arguments = predicate_to_string( path.shift, &block )
+          string << name << "( "
+          string << arguments << " )"
+        when :literal
+          # Literal node: [ :literal, value ]; the value is shown via #inspect.
+          path.shift
+          value = path.shift
+          string << " " << value.inspect << " "
+        else
+          # Anything else is left to the caller's block.
+          rendered = yield( path )
+          string << " " << rendered << " "
+        end
+        # Collapse the padding added around each piece.
+        return string.squeeze(" ")
+      end
+
+      private
+      #LocationPath
+      #  | RelativeLocationPath
+      #  | '/' RelativeLocationPath?
+      #  | '//' RelativeLocationPath
+      def LocationPath path, parsed
+        path = path.strip
+        # A leading "/" anchors the path at the document; a leading "//"
+        # additionally adds a descendant-or-self node() step.
+        if path.start_with?( "//" )
+          parsed << :document << :descendant_or_self << :node
+          path = path[2..-1]
+        elsif path.start_with?( "/" )
+          parsed << :document
+          path = path[1..-1]
+        end
+        # Falls through to an implicit nil when nothing follows the prefix.
+        return RelativeLocationPath( path, parsed ) if path.size > 0
+      end
+
+      #RelativeLocationPath
+      #  |                                                    Step
+      #    | (AXIS_NAME '::' | '@' | '')                     AxisSpecifier
+      #      NodeTest
+      #        Predicate
+      #    | '.' | '..'                                      AbbreviatedStep
+      #  |  RelativeLocationPath '/' Step
+      #  | RelativeLocationPath '//' Step
+      AXIS = /^(ancestor|ancestor-or-self|attribute|child|descendant|descendant-or-self|following|following-sibling|namespace|parent|preceding|preceding-sibling|self)::/ # leading axis name plus "::"; the name is captured in $1
+      def RelativeLocationPath path, parsed
+        while path.size > 0 # one iteration per step
+          # (axis or @ or <child::>) nodetest predicate  >
+          # OR                                          >  / Step
+          # (. or ..)                                    >
+          if path[0] == ?.
+            if path[1] == ?.
+              parsed << :parent # ".." becomes parent::node()
+              parsed << :node
+              path = path[2..-1]
+            else
+              parsed << :self # "." becomes self::node()
+              parsed << :node
+              path = path[1..-1]
+            end
+          else
+            if path[0] == ?@
+              parsed << :attribute
+              path = path[1..-1]
+              # Goto Nodetest
+            elsif path =~ AXIS
+              parsed << $1.tr('-','_').intern # e.g. "following-sibling" -> :following_sibling
+              path = $'
+              # Goto Nodetest
+            else
+              parsed << :child # no axis written: child is assumed
+            end
+
+            n = []
+            path = NodeTest( path, n)
+
+            if path[0] == ?[
+              path = Predicate( path, n )
+            end
+
+            parsed.concat(n)
+          end
+
+          if path.size > 0
+            if path[0] == ?/
+              if path[1] == ?/
+                parsed << :descendant_or_self # "//" between steps adds descendant-or-self::node()
+                parsed << :node
+                path = path[2..-1]
+              else
+                path = path[1..-1]
+              end
+            else
+              return path # anything that is not a step separator is handed back unparsed
+            end
+          end
+        end
+        return path
+      end
+
+      # Returns a 1-1 map of the nodeset
+      # The contents of the resulting array are either:
+      #   true/false, if a positive match
+      #   String, if a name match
+      #NodeTest
+      #  | ('*' | NCNAME ':' '*' | QNAME)                NameTest
+      #  | NODE_TYPE '(' ')'                              NodeType
+      #  | PI '(' LITERAL ')'                            PI
+      #    | '[' expr ']'                                Predicate
+      NCNAMETEST= /^(#{NCNAME_STR}):\*/u # "prefix:*"; the prefix is captured in $1
+      QNAME     = Namespace::NAMESPLIT # NodeTest reads the prefix from $1 and the local name from $2
+      NODE_TYPE  = /^(comment|text|node)\(\s*\)/m # comment(), text() or node(); blanks are allowed inside the parentheses
+      PI        = /^processing-instruction\(/ # matches up to and including the opening parenthesis only
+      def NodeTest path, parsed # appends the tokens of one node test to +parsed+; returns the unparsed rest of +path+
+        case path
+        when /^\*/
+          path = $'
+          parsed << :any
+        when NODE_TYPE
+          parsed << $1.tr('-', '_').intern
+          path = $'
+        when PI
+          path = $'
+          if path =~ /^\s*\)/
+            literal = '' # no argument: match any target, but still consume the closing ')'
+            path = $'
+          else
+            raise ParseException.new("Missing literal in processing instruction") unless path =~ LITERAL
+            literal = $1.nil? ? $2 : $1 # LITERAL captures '...' in $1 and "..." in $2
+            path = $'
+            raise ParseException.new("Missing ')' after processing instruction") if path[0] != ?)
+            path = path[1..-1]
+          end
+          parsed << :processing_instruction << literal
+        when NCNAMETEST
+          prefix = $1
+          path = $'
+          parsed << :namespace
+          parsed << prefix
+        when QNAME
+          prefix = $1
+          name = $2
+          path = $'
+          prefix = "" unless prefix # an unprefixed name is recorded with an empty prefix
+          parsed << :qname
+          parsed << prefix
+          parsed << name
+        end
+        return path
+      end
+
+      # Filters the supplied nodeset on the predicate(s)
+      def Predicate path, parsed
+        return nil unless path[0] == ?[
+        # Gather the text of every consecutive [...] group, brackets removed.
+        bodies = []
+        while path[0] == ?[
+          path, group = get_group(path)
+          bodies << group[1..-2] if group
+        end
+        bodies.each do |body|
+          tree = []
+          parsed << :predicate << tree
+          OrExpr(body, tree)
+        end
+        path
+      end
+
+      # The following return arrays of true/false, a 1-1 mapping of the
+      # supplied nodeset, except for axe(), which returns a filtered
+      # nodeset
+
+      #| OrExpr S 'or' S AndExpr
+      #| AndExpr
+      def OrExpr path, parsed
+        tree = []
+        rest = AndExpr( path, tree )
+        # Look for an operator only if the left operand consumed some input.
+        if rest != path
+          while rest =~ /^\s*( or )/
+            tree = [ :or, tree, [] ] # left-associative: the tree so far becomes the left operand
+            rest = AndExpr( $', tree[-1] )
+          end
+        end
+        # An empty result list is overwritten in place; otherwise the tree is nested in it.
+        unless tree.empty?
+          parsed.empty? ? parsed.replace( tree ) : parsed.push( tree )
+        end
+        rest
+      end
+
+      #| AndExpr S 'and' S EqualityExpr
+      #| EqualityExpr
+      def AndExpr path, parsed
+        tree = []
+        rest = EqualityExpr( path, tree )
+        # Look for an operator only if the left operand consumed some input.
+        if rest != path
+          while rest =~ /^\s*( and )/
+            tree = [ :and, tree, [] ] # left-associative: the tree so far becomes the left operand
+            rest = EqualityExpr( $', tree[-1] )
+          end
+        end
+        # An empty result list is overwritten in place; otherwise the tree is nested in it.
+        unless tree.empty?
+          parsed.empty? ? parsed.replace( tree ) : parsed.push( tree )
+        end
+        rest
+      end
+
+      #| EqualityExpr ('=' | '!=')  RelationalExpr
+      #| RelationalExpr
+      def EqualityExpr path, parsed
+        tree = []
+        rest = RelationalExpr( path, tree )
+        # Look for an operator only if the left operand consumed some input.
+        if rest != path
+          while rest =~ /^\s*(!?=)\s*/
+            # $1 is either "=" or "!=".
+            operator = ($1[0] == "!") ? :neq : :eq
+            tree = [ operator, tree, [] ]
+            rest = RelationalExpr( $', tree[-1] )
+          end
+        end
+        # An empty result list is overwritten in place; otherwise the tree is
+        # nested in it.
+        unless tree.empty?
+          parsed.empty? ? parsed.replace( tree ) : parsed.push( tree )
+        end
+        rest
+      end
+
+      #| RelationalExpr ('<' | '>' | '<=' | '>=') AdditiveExpr
+      #| AdditiveExpr
+      def RelationalExpr path, parsed
+        tree = []
+        rest = AdditiveExpr( path, tree )
+        if rest != path
+          while rest =~ /^\s*([<>]=?)\s*/
+            # $1 is one of <, >, <=, >= and maps to :lt, :gt, :lteq, :gteq.
+            name = ($1[0] == "<") ? "lt" : "gt"
+            name += "eq" if $1[-1] == "="
+            tree = [ name.intern, tree, [] ]
+            rest = AdditiveExpr( $', tree[-1] )
+          end
+        end
+        # An empty result list is overwritten in place; otherwise the tree is nested in it.
+        unless tree.empty?
+          if parsed.empty?
+            parsed.replace( tree )
+          else
+            parsed.push( tree )
+          end
+        end
+        rest
+      end
+
+      #| AdditiveExpr ('+' | S '-') MultiplicativeExpr
+      #| MultiplicativeExpr
+      def AdditiveExpr path, parsed
+        tree = []
+        rest = MultiplicativeExpr( path, tree )
+        # Look for an operator only if the left operand consumed some input.
+        if rest != path
+          # The pattern accepts "-" only directly after a blank; "+" needs none.
+          while rest =~ /^\s*(\+| -)\s*/
+            operator = ($1 == "+") ? :plus : :minus
+            tree = [ operator, tree, [] ]
+            rest = MultiplicativeExpr( $', tree[-1] )
+          end
+        end
+        # An empty result list is overwritten in place; otherwise the tree is
+        # nested in it.
+        unless tree.empty?
+          parsed.empty? ? parsed.replace( tree ) : parsed.push( tree )
+        end
+        rest
+      end
+
+      #| MultiplicativeExpr ('*' | S ('div' | 'mod') S) UnaryExpr
+      #| UnaryExpr
+      def MultiplicativeExpr path, parsed
+        tree = []
+        rest = UnaryExpr( path, tree )
+        # Look for an operator only if the left operand consumed some input.
+        if rest != path
+          while rest =~ /^\s*(\*| div | mod )\s*/
+            # $1 is exactly "*", " div " or " mod ".
+            operator = { "*" => :mult, " div " => :div }.fetch( $1, :mod )
+            tree = [ operator, tree, [] ]
+            rest = UnaryExpr( $', tree[-1] )
+          end
+        end
+        # An empty result list is overwritten in place; otherwise the tree is nested in it.
+        unless tree.empty?
+          if parsed.empty?
+            parsed.replace( tree )
+          else
+            parsed.push( tree )
+          end
+        end
+        rest
+      end
+
+      #| '-' UnaryExpr
+      #| UnionExpr
+      def UnaryExpr path, parsed
+        # Consume the run of leading minus signs (possibly empty).
+        path =~ /^(\-*)/
+        minus_signs = $1
+        path = $'
+        # An odd number of signs negates the operand; an even number cancels
+        # out and leaves no trace in the parse.
+        parsed << :neg if minus_signs and minus_signs.size.odd?
+
+        # The operand's tokens are appended flat, right after any :neg marker.
+        operand = []
+        path = UnionExpr( path, operand )
+        parsed.concat( operand )
+        path
+      end
+
+      #| UnionExpr '|' PathExpr
+      #| PathExpr
+      def UnionExpr path, parsed
+        tree = []
+        rest = PathExpr( path, tree )
+        # Look for an operator only if the left operand consumed some input.
+        if rest != path
+          while rest =~ /^\s*(\|)\s*/
+            tree = [ :union, tree, [] ] # left-associative: the tree so far becomes the left operand
+            rest = PathExpr( $', tree[-1] )
+          end
+        end
+        # An empty result list is overwritten in place; otherwise the tree is nested in it.
+        unless tree.empty?
+          parsed.empty? ? parsed.replace( tree ) : parsed.push( tree )
+        end
+        rest
+      end
+
+      #| LocationPath
+      #| FilterExpr ('/' | '//') RelativeLocationPath
+      # Parses a path expression: either a plain LocationPath, or a FilterExpr
+      # optionally followed by '/' (or '//') and a RelativeLocationPath.
+      #
+      # path::   the text to parse; leading whitespace is skipped
+      # parsed:: array to which every node produced here is appended
+      # Returns the value handed back by the last sub-parser that ran.
+      def PathExpr path, parsed
+        path =~ /^\s*/
+        path = $'
+        n = []
+        rest = FilterExpr( path, n )
+        if rest != path and rest and rest[0] == ?/
+          # A filter expression followed by a path. The nodes gathered in +n+
+          # must still be handed to the caller below; returning straight from
+          # here used to discard them.
+          rest = RelativeLocationPath( rest, n )
+        elsif rest =~ /\A[\/\.\@\[\w*]/
+          rest = LocationPath( rest, n )
+        end
+        parsed.concat( n )
+        rest
+      end
+
+      #| FilterExpr Predicate
+      #| PrimaryExpr
+      # Parses a PrimaryExpr and, when the text that follows starts with '[',
+      # a Predicate attached to it.
+      #
+      # path::   the text to parse
+      # parsed:: array to which the nodes produced here are appended
+      # Returns the value handed back by the last sub-parser that ran.
+      def FilterExpr path, parsed
+        collected = []
+        remainder = PrimaryExpr( path, collected )
+        if remainder and remainder[0] == ?[
+          remainder = Predicate( remainder, collected )
+        end
+        parsed.concat( collected )
+        remainder
+      end
+
+      #| VARIABLE_REFERENCE
+      #| '(' expr ')'
+      #| LITERAL
+      #| NUMBER
+      #| FunctionCall
+      # Matches '$' followed by NAME_STR at the start of the input; capture 1
+      # is the part matched by NAME_STR.
+      VARIABLE_REFERENCE  = /^\$(#{NAME_STR})/u
+      # Matches an unsigned number with an optional integer part ("12", "1.5", ".5").
+      NUMBER              = /^(\d*\.?\d+)/
+      # Matches a name that is exactly one of the node-type keywords. The
+      # alternation is grouped so both anchors apply to every alternative;
+      # ungrouped, '^' bound only to "comment" and '$' only to "node", so any
+      # name merely containing "text" or "processing-instruction" matched too.
+      NT        = /^(?:comment|text|processing-instruction|node)$/
+      # Parses a primary expression at the start of +path+. The alternatives
+      # are tried in this order; the first that matches wins:
+      #   variable reference -> appends :variable, name
+      #   name followed by (  -> appends :function, name, then FunctionCall's
+      #                          output; a name matching NT is left untouched
+      #   number              -> appends :literal, Float or Integer value
+      #   quoted literal      -> appends :literal, the string
+      #   parenthesised group -> appends whatever OrExpr yields for its inside
+      #
+      # path::   the text to parse
+      # parsed:: array to which the nodes are appended
+      # Returns the text after the consumed expression, or +path+ itself when
+      # nothing matched.
+      def PrimaryExpr path, parsed
+        if (var = VARIABLE_REFERENCE.match( path ))
+          parsed << :variable
+          parsed << var[1]
+          #arry << @variables[ varname ]
+          var.post_match
+        elsif (call = /^(\w[-\w]*)(?:\()/.match( path ))
+          fname = call[1]
+          if fname =~ NT
+            # Node-type tests look like calls but are not handled here.
+            path
+          else
+            parsed << :function
+            parsed << fname
+            FunctionCall( call.post_match, parsed )
+          end
+        elsif (num = NUMBER.match( path ))
+          digits = num[1].nil? ? num[2] : num[1]
+          parsed << :literal
+          parsed << (digits.include?('.') ? digits.to_f : digits.to_i)
+          num.post_match
+        elsif (lit = LITERAL.match( path ))
+          parsed << :literal
+          parsed << (lit[1].nil? ? lit[2] : lit[1])
+          lit.post_match
+        elsif /^\(/.match( path )
+          remainder, group = get_group( path )
+          # Drop the surrounding parentheses before parsing the contents.
+          inner = []
+          OrExpr( group[1..-2], inner )
+          parsed.concat( inner )
+          remainder
+        else
+          path
+        end
+      end
+
+      #| FUNCTION_NAME '(' ( expr ( ',' expr )* )? ')'
+      # Parses the argument list of a function call. +rest+ is the text right
+      # after the opening parenthesis. Each argument string returned by
+      # parse_args is run through OrExpr into its own array, and the array of
+      # those arrays is appended to +parsed+ as a single element.
+      #
+      # Returns the text that parse_args reports as following the call.
+      def FunctionCall rest, parsed
+        remainder, raw_arguments = parse_args( rest )
+        argument_trees = raw_arguments.map do |text|
+          tree = []
+          OrExpr( text, tree )
+          tree
+        end
+        parsed << argument_trees
+        remainder
+      end
+
+      # get_group( '[foo]bar' ) -> ['bar', '[foo]']
+      # Splits off the balanced group that starts at the first character of
+      # +string+. The first character is the opener; the closer is ")" when
+      # the opener is "(", and "]" otherwise.
+      #
+      # Returns [ text_after_group, group_including_delimiters ], or nil when
+      # the group is never closed.
+      def get_group string
+        opener = string[0,1]
+        closer = opener == "(" ? ")" : "]"
+        nesting = 0
+        pos = 0
+        loop do
+          char = string[pos,1]
+          if char == opener
+            nesting += 1
+          elsif char == closer
+            nesting -= 1
+          end
+          pos += 1
+          break unless nesting > 0 and pos < string.length
+        end
+        return nil unless nesting.zero?
+        [string[pos..-1], string[0...pos]]
+      end
+
+      # Splits a function-call argument list into its top-level arguments.
+      # +string+ is the text right after the opening parenthesis, so the scan
+      # starts at nesting depth 1. Commas split arguments only at depth 1 and
+      # outside quotes; parentheses inside '...' or "..." are ignored.
+      #
+      # Returns [ text_after_closing_paren, arguments ], where each argument is
+      # stripped of surrounding whitespace and blank arguments are dropped, or
+      # nil when the closing parenthesis is never found.
+      def parse_args( string )
+        arguments = []
+        ind = 0
+        inquot = false
+        inapos = false
+        depth = 1
+        while depth > 0 and ind < string.length
+          char = string[ind]
+          if char == ?"
+            inquot = !inquot unless inapos
+          elsif char == ?'
+            inapos = !inapos unless inquot
+          elsif !(inquot or inapos)
+            case char
+            when ?(
+              # depth is at least 1 here, so this can never bring it back to 1.
+              depth += 1
+            when ?)
+              depth -= 1
+              if depth == 0
+                # End of the argument list: flush the last argument and keep
+                # only the text after the closing parenthesis.
+                s = string[0,ind].strip
+                arguments << s unless s == ""
+                string = string[ind+1..-1]
+              end
+            when ?,
+              if depth == 1
+                # Top-level separator: flush the argument and restart the scan
+                # at the beginning of the remaining text.
+                s = string[0,ind].strip
+                arguments << s unless s == ""
+                string = string[ind+1..-1]
+                ind = -1
+              end
+            end
+          end
+          ind += 1
+        end
+        return nil unless depth == 0
+        [string, arguments]
+      end
+    end
+  end
+end
-- 
cgit v1.2.3-70-g09d2