diff options
Diffstat (limited to 'jni/ruby/lib/rexml')
50 files changed, 8887 insertions, 0 deletions
diff --git a/jni/ruby/lib/rexml/attlistdecl.rb b/jni/ruby/lib/rexml/attlistdecl.rb new file mode 100644 index 0000000..ec4e6c3 --- /dev/null +++ b/jni/ruby/lib/rexml/attlistdecl.rb @@ -0,0 +1,62 @@ +#vim:ts=2 sw=2 noexpandtab: +require 'rexml/child' +require 'rexml/source' + +module REXML + # This class needs: + # * Documentation + # * Work! Not all types of attlists are intelligently parsed, so we just + # spew back out what we get in. This works, but it would be better if + # we formatted the output ourselves. + # + # AttlistDecls provide *just* enough support to allow namespace + # declarations. If you need some sort of generalized support, or have an + # interesting idea about how to map the hideous, terrible design of DTD + # AttlistDecls onto an intuitive Ruby interface, let me know. I'm desperate + # for anything to make DTDs more palateable. + class AttlistDecl < Child + include Enumerable + + # What is this? Got me. + attr_reader :element_name + + # Create an AttlistDecl, pulling the information from a Source. Notice + # that this isn't very convenient; to create an AttlistDecl, you basically + # have to format it yourself, and then have the initializer parse it. + # Sorry, but for the forseeable future, DTD support in REXML is pretty + # weak on convenience. Have I mentioned how much I hate DTDs? + def initialize(source) + super() + if (source.kind_of? Array) + @element_name, @pairs, @contents = *source + end + end + + # Access the attlist attribute/value pairs. + # value = attlist_decl[ attribute_name ] + def [](key) + @pairs[key] + end + + # Whether an attlist declaration includes the given attribute definition + # if attlist_decl.include? "xmlns:foobar" + def include?(key) + @pairs.keys.include? key + end + + # Iterate over the key/value pairs: + # attlist_decl.each { |attribute_name, attribute_value| ... } + def each(&block) + @pairs.each(&block) + end + + # Write out exactly what we got in. 
+ def write out, indent=-1 + out << @contents + end + + def node_type + :attlistdecl + end + end +end diff --git a/jni/ruby/lib/rexml/attribute.rb b/jni/ruby/lib/rexml/attribute.rb new file mode 100644 index 0000000..ef9e544 --- /dev/null +++ b/jni/ruby/lib/rexml/attribute.rb @@ -0,0 +1,191 @@ +require "rexml/namespace" +require 'rexml/text' + +module REXML + # Defines an Element Attribute; IE, a attribute=value pair, as in: + # <element attribute="value"/>. Attributes can be in their own + # namespaces. General users of REXML will not interact with the + # Attribute class much. + class Attribute + include Node + include Namespace + + # The element to which this attribute belongs + attr_reader :element + # The normalized value of this attribute. That is, the attribute with + # entities intact. + attr_writer :normalized + PATTERN = /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\2/um + + NEEDS_A_SECOND_CHECK = /(<|&((#{Entity::NAME});|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));)?)/um + + # Constructor. + # FIXME: The parser doesn't catch illegal characters in attributes + # + # first:: + # Either: an Attribute, which this new attribute will become a + # clone of; or a String, which is the name of this attribute + # second:: + # If +first+ is an Attribute, then this may be an Element, or nil. + # If nil, then the Element parent of this attribute is the parent + # of the +first+ Attribute. If the first argument is a String, + # then this must also be a String, and is the content of the attribute. + # If this is the content, it must be fully normalized (contain no + # illegal characters). + # parent:: + # Ignored unless +first+ is a String; otherwise, may be the Element + # parent of this attribute, or nil. 
#
#
#  Attribute.new( attribute_to_clone )
#  Attribute.new( attribute_to_clone, parent_element )
#  Attribute.new( "attr", "attr_value" )
#  Attribute.new( "attr", "attr_value", parent_element )
#
# Raises a RuntimeError when +first+ is neither an Attribute nor a String.
def initialize( first, second=nil, parent=nil )
  @normalized = @unnormalized = @element = nil
  case first
  when Attribute
    # Clone: copy the name and the unnormalized value; the owning element
    # is +second+ when it is an Element, otherwise the source's element.
    self.name = first.expanded_name
    @unnormalized = first.value
    @element = second.kind_of?(Element) ? second : first.element
  when String
    # name/value form: +second+ is stored as the already-normalized text.
    @element = parent
    self.name = first
    @normalized = second.to_s
  else
    raise "illegal argument #{first.class.name} to Attribute constructor"
  end
end

# Returns the namespace prefix of the attribute.  When the attribute has
# no prefix of its own, the prefix of the owning element (if any) is used.
#
#  e = Element.new( "elns:myelement" )
#  e.add_attribute( "nsa:a", "aval" )
#  e.add_attribute( "b", "bval" )
#  e.attributes.get_attribute( "a" ).prefix   # -> "nsa"
#  e.attributes.get_attribute( "b" ).prefix   # -> "elns"
#  a = Attribute.new( "x", "y" )
#  a.prefix                                   # -> ""
def prefix
  pf = super
  pf = @element.prefix if pf == "" && @element
  pf
end

# Returns the namespace URL, if defined, or nil otherwise.  The lookup is
# delegated to the owning element; an attribute that is not attached to
# any element has no namespace scope, so nil is returned for it.
#
# arg::
#   The prefix to resolve; defaults to this attribute's own prefix.
#
#  e = Element.new("el")
#  e.add_namespace("ns", "http://url")
#  e.add_attribute("ns:a", "b")
#  e.add_attribute("nsx:a", "c")
#  e.attribute("ns:a").namespace # => "http://url"
#  e.attribute("nsx:a").namespace # => nil
def namespace arg=nil
  arg = prefix if arg.nil?
  # Detached attribute: nothing to resolve against (previously this raised
  # NoMethodError on nil).
  return nil unless @element
  @element.namespace arg
end

# Returns true if other is an Attribute and has the same name and value,
# false otherwise.
+ def ==( other ) + other.kind_of?(Attribute) and other.name==name and other.value==value + end + + # Creates (and returns) a hash from both the name and value + def hash + name.hash + value.hash + end + + # Returns this attribute out as XML source, expanding the name + # + # a = Attribute.new( "x", "y" ) + # a.to_string # -> "x='y'" + # b = Attribute.new( "ns:x", "y" ) + # b.to_string # -> "ns:x='y'" + def to_string + if @element and @element.context and @element.context[:attribute_quote] == :quote + %Q^#@expanded_name="#{to_s().gsub(/"/, '"e;')}"^ + else + "#@expanded_name='#{to_s().gsub(/'/, ''')}'" + end + end + + def doctype + if @element + doc = @element.document + doc.doctype if doc + end + end + + # Returns the attribute value, with entities replaced + def to_s + return @normalized if @normalized + + @normalized = Text::normalize( @unnormalized, doctype ) + @unnormalized = nil + @normalized + end + + # Returns the UNNORMALIZED value of this attribute. That is, entities + # have been expanded to their values + def value + return @unnormalized if @unnormalized + @unnormalized = Text::unnormalize( @normalized, doctype ) + @normalized = nil + @unnormalized + end + + # Returns a copy of this attribute + def clone + Attribute.new self + end + + # Sets the element of which this object is an attribute. Normally, this + # is not directly called. + # + # Returns this attribute + def element=( element ) + @element = element + + if @normalized + Text.check( @normalized, NEEDS_A_SECOND_CHECK, doctype ) + end + + self + end + + # Removes this Attribute from the tree, and returns true if successful + # + # This method is usually not called directly. + def remove + @element.attributes.delete self.name unless @element.nil? 
+ end + + # Writes this attribute (EG, puts 'key="value"' to the output) + def write( output, indent=-1 ) + output << to_string + end + + def node_type + :attribute + end + + def inspect + rv = "" + write( rv ) + rv + end + + def xpath + path = @element.xpath + path += "/@#{self.expanded_name}" + return path + end + end +end +#vim:ts=2 sw=2 noexpandtab: diff --git a/jni/ruby/lib/rexml/cdata.rb b/jni/ruby/lib/rexml/cdata.rb new file mode 100644 index 0000000..73358ed --- /dev/null +++ b/jni/ruby/lib/rexml/cdata.rb @@ -0,0 +1,67 @@ +require "rexml/text" + +module REXML + class CData < Text + START = '<![CDATA[' + STOP = ']]>' + ILLEGAL = /(\]\]>)/ + + # Constructor. CData is data between <![CDATA[ ... ]]> + # + # _Examples_ + # CData.new( source ) + # CData.new( "Here is some CDATA" ) + # CData.new( "Some unprocessed data", respect_whitespace_TF, parent_element ) + def initialize( first, whitespace=true, parent=nil ) + super( first, whitespace, parent, false, true, ILLEGAL ) + end + + # Make a copy of this object + # + # _Examples_ + # c = CData.new( "Some text" ) + # d = c.clone + # d.to_s # -> "Some text" + def clone + CData.new self + end + + # Returns the content of this CData object + # + # _Examples_ + # c = CData.new( "Some text" ) + # c.to_s # -> "Some text" + def to_s + @string + end + + def value + @string + end + + # == DEPRECATED + # See the rexml/formatters package + # + # Generates XML output of this object + # + # output:: + # Where to write the string. 
Defaults to $stdout + # indent:: + # The amount to indent this node by + # transitive:: + # Ignored + # ie_hack:: + # Ignored + # + # _Examples_ + # c = CData.new( " Some text " ) + # c.write( $stdout ) #-> <![CDATA[ Some text ]]> + def write( output=$stdout, indent=-1, transitive=false, ie_hack=false ) + Kernel.warn( "#{self.class.name}.write is deprecated" ) + indent( output, indent ) + output << START + output << @string + output << STOP + end + end +end diff --git a/jni/ruby/lib/rexml/child.rb b/jni/ruby/lib/rexml/child.rb new file mode 100644 index 0000000..bf97d5f --- /dev/null +++ b/jni/ruby/lib/rexml/child.rb @@ -0,0 +1,96 @@ +require "rexml/node" + +module REXML + ## + # A Child object is something contained by a parent, and this class + # contains methods to support that. Most user code will not use this + # class directly. + class Child + include Node + attr_reader :parent # The Parent of this object + + # Constructor. Any inheritors of this class should call super to make + # sure this method is called. + # parent:: + # if supplied, the parent of this child will be set to the + # supplied value, and self will be added to the parent + def initialize( parent = nil ) + @parent = nil + # Declare @parent, but don't define it. The next line sets the + # parent. + parent.add( self ) if parent + end + + # Replaces this object with another object. Basically, calls + # Parent.replace_child + # + # Returns:: self + def replace_with( child ) + @parent.replace_child( self, child ) + self + end + + # Removes this child from the parent. + # + # Returns:: self + def remove + unless @parent.nil? + @parent.delete self + end + self + end + + # Sets the parent of this child to the supplied argument. + # + # other:: + # Must be a Parent object. If this object is the same object as the + # existing parent of this child, no action is taken. Otherwise, this + # child is removed from the current parent (if one exists), and is added + # to the new parent. 
+ # Returns:: The parent added + def parent=( other ) + return @parent if @parent == other + @parent.delete self if defined? @parent and @parent + @parent = other + end + + alias :next_sibling :next_sibling_node + alias :previous_sibling :previous_sibling_node + + # Sets the next sibling of this child. This can be used to insert a child + # after some other child. + # a = Element.new("a") + # b = a.add_element("b") + # c = Element.new("c") + # b.next_sibling = c + # # => <a><b/><c/></a> + def next_sibling=( other ) + parent.insert_after self, other + end + + # Sets the previous sibling of this child. This can be used to insert a + # child before some other child. + # a = Element.new("a") + # b = a.add_element("b") + # c = Element.new("c") + # b.previous_sibling = c + # # => <a><b/><c/></a> + def previous_sibling=(other) + parent.insert_before self, other + end + + # Returns:: the document this child belongs to, or nil if this child + # belongs to no document + def document + return parent.document unless parent.nil? + nil + end + + # This doesn't yet handle encodings + def bytes + document.encoding + + to_s + end + end +end diff --git a/jni/ruby/lib/rexml/comment.rb b/jni/ruby/lib/rexml/comment.rb new file mode 100644 index 0000000..000b03d --- /dev/null +++ b/jni/ruby/lib/rexml/comment.rb @@ -0,0 +1,79 @@ +require "rexml/child" + +module REXML + ## + # Represents an XML comment; that is, text between \<!-- ... --> + class Comment < Child + include Comparable + START = "<!--" + STOP = "-->" + + # The content text + + attr_accessor :string + + ## + # Constructor. The first argument can be one of three types: + # @param first If String, the contents of this comment are set to the + # argument. If Comment, the argument is duplicated. If + # Source, the argument is scanned for a comment. 
+ # @param second If the first argument is a Source, this argument + # should be nil, not supplied, or a Parent to be set as the parent + # of this object + def initialize( first, second = nil ) + super(second) + if first.kind_of? String + @string = first + elsif first.kind_of? Comment + @string = first.string + end + end + + def clone + Comment.new self + end + + # == DEPRECATED + # See REXML::Formatters + # + # output:: + # Where to write the string + # indent:: + # An integer. If -1, no indenting will be used; otherwise, the + # indentation will be this number of spaces, and children will be + # indented an additional amount. + # transitive:: + # Ignored by this class. The contents of comments are never modified. + # ie_hack:: + # Needed for conformity to the child API, but not used by this class. + def write( output, indent=-1, transitive=false, ie_hack=false ) + Kernel.warn("Comment.write is deprecated. See REXML::Formatters") + indent( output, indent ) + output << START + output << @string + output << STOP + end + + alias :to_s :string + + ## + # Compares this Comment to another; the contents of the comment are used + # in the comparison. + def <=>(other) + other.to_s <=> @string + end + + ## + # Compares this Comment to another; the contents of the comment are used + # in the comparison. + def ==( other ) + other.kind_of? Comment and + (other <=> self) == 0 + end + + def node_type + :comment + end + end +end +#vim:ts=2 sw=2 noexpandtab: diff --git a/jni/ruby/lib/rexml/doctype.rb b/jni/ruby/lib/rexml/doctype.rb new file mode 100644 index 0000000..0b3c533 --- /dev/null +++ b/jni/ruby/lib/rexml/doctype.rb @@ -0,0 +1,269 @@ +require "rexml/parent" +require "rexml/parseexception" +require "rexml/namespace" +require 'rexml/entity' +require 'rexml/attlistdecl' +require 'rexml/xmltokens' + +module REXML + # Represents an XML DOCTYPE declaration; that is, the contents of <!DOCTYPE + # ... >. 
DOCTYPES can be used to declare the DTD of a document, as well as + # being used to declare entities used in the document. + class DocType < Parent + include XMLTokens + START = "<!DOCTYPE" + STOP = ">" + SYSTEM = "SYSTEM" + PUBLIC = "PUBLIC" + DEFAULT_ENTITIES = { + 'gt'=>EntityConst::GT, + 'lt'=>EntityConst::LT, + 'quot'=>EntityConst::QUOT, + "apos"=>EntityConst::APOS + } + + # name is the name of the doctype + # external_id is the referenced DTD, if given + attr_reader :name, :external_id, :entities, :namespaces + + # Constructor + # + # dt = DocType.new( 'foo', '-//I/Hate/External/IDs' ) + # # <!DOCTYPE foo '-//I/Hate/External/IDs'> + # dt = DocType.new( doctype_to_clone ) + # # Incomplete. Shallow clone of doctype + # + # +Note+ that the constructor: + # + # Doctype.new( Source.new( "<!DOCTYPE foo 'bar'>" ) ) + # + # is _deprecated_. Do not use it. It will probably disappear. + def initialize( first, parent=nil ) + @entities = DEFAULT_ENTITIES + @long_name = @uri = nil + if first.kind_of? String + super() + @name = first + @external_id = parent + elsif first.kind_of? DocType + super( parent ) + @name = first.name + @external_id = first.external_id + elsif first.kind_of? Array + super( parent ) + @name = first[0] + @external_id = first[1] + @long_name = first[2] + @uri = first[3] + elsif first.kind_of? Source + super( parent ) + parser = Parsers::BaseParser.new( first ) + event = parser.pull + if event[0] == :start_doctype + @name, @external_id, @long_name, @uri, = event[1..-1] + end + else + super() + end + end + + def node_type + :doctype + end + + def attributes_of element + rv = [] + each do |child| + child.each do |key,val| + rv << Attribute.new(key,val) + end if child.kind_of? AttlistDecl and child.element_name == element + end + rv + end + + def attribute_of element, attribute + att_decl = find do |child| + child.kind_of? AttlistDecl and + child.element_name == element and + child.include? 
attribute + end + return nil unless att_decl + att_decl[attribute] + end + + def clone + DocType.new self + end + + # output:: + # Where to write the string + # indent:: + # An integer. If -1, no indentation will be used; otherwise, the + # indentation will be this number of spaces, and children will be + # indented an additional amount. + # transitive:: + # Ignored + # ie_hack:: + # Ignored + def write( output, indent=0, transitive=false, ie_hack=false ) + f = REXML::Formatters::Default.new + indent( output, indent ) + output << START + output << ' ' + output << @name + output << " #@external_id" if @external_id + output << " #{@long_name.inspect}" if @long_name + output << " #{@uri.inspect}" if @uri + unless @children.empty? + output << ' [' + @children.each { |child| + output << "\n" + f.write( child, output ) + } + output << "\n]" + end + output << STOP + end + + def context + @parent.context + end + + def entity( name ) + @entities[name].unnormalized if @entities[name] + end + + def add child + super(child) + @entities = DEFAULT_ENTITIES.clone if @entities == DEFAULT_ENTITIES + @entities[ child.name ] = child if child.kind_of? Entity + end + + # This method retrieves the public identifier identifying the document's + # DTD. + # + # Method contributed by Henrik Martensson + def public + case @external_id + when "SYSTEM" + nil + when "PUBLIC" + strip_quotes(@long_name) + end + end + + # This method retrieves the system identifier identifying the document's DTD + # + # Method contributed by Henrik Martensson + def system + case @external_id + when "SYSTEM" + strip_quotes(@long_name) + when "PUBLIC" + @uri.kind_of?(String) ? strip_quotes(@uri) : nil + end + end + + # This method returns a list of notations that have been declared in the + # _internal_ DTD subset. Notations in the external DTD subset are not + # listed. 
+ # + # Method contributed by Henrik Martensson + def notations + children().select {|node| node.kind_of?(REXML::NotationDecl)} + end + + # Retrieves a named notation. Only notations declared in the internal + # DTD subset can be retrieved. + # + # Method contributed by Henrik Martensson + def notation(name) + notations.find { |notation_decl| + notation_decl.name == name + } + end + + private + + # Method contributed by Henrik Martensson + def strip_quotes(quoted_string) + quoted_string =~ /^[\'\"].*[\'\"]$/ ? + quoted_string[1, quoted_string.length-2] : + quoted_string + end + end + + # We don't really handle any of these since we're not a validating + # parser, so we can be pretty dumb about them. All we need to be able + # to do is spew them back out on a write() + + # This is an abstract class. You never use this directly; it serves as a + # parent class for the specific declarations. + class Declaration < Child + def initialize src + super() + @string = src + end + + def to_s + @string+'>' + end + + # == DEPRECATED + # See REXML::Formatters + # + def write( output, indent ) + output << to_s + end + end + + public + class ElementDecl < Declaration + def initialize( src ) + super + end + end + + class ExternalEntity < Child + def initialize( src ) + super() + @entity = src + end + def to_s + @entity + end + def write( output, indent ) + output << @entity + end + end + + class NotationDecl < Child + attr_accessor :public, :system + def initialize name, middle, pub, sys + super(nil) + @name = name + @middle = middle + @public = pub + @system = sys + end + + def to_s + notation = "<!NOTATION #{@name} #{@middle}" + notation << " #{@public.inspect}" if @public + notation << " #{@system.inspect}" if @system + notation << ">" + notation + end + + def write( output, indent=-1 ) + output << to_s + end + + # This method retrieves the name of the notation. 
+ # + # Method contributed by Henrik Martensson + def name + @name + end + end +end diff --git a/jni/ruby/lib/rexml/document.rb b/jni/ruby/lib/rexml/document.rb new file mode 100644 index 0000000..d7d24f4 --- /dev/null +++ b/jni/ruby/lib/rexml/document.rb @@ -0,0 +1,290 @@ +require "rexml/security" +require "rexml/element" +require "rexml/xmldecl" +require "rexml/source" +require "rexml/comment" +require "rexml/doctype" +require "rexml/instruction" +require "rexml/rexml" +require "rexml/parseexception" +require "rexml/output" +require "rexml/parsers/baseparser" +require "rexml/parsers/streamparser" +require "rexml/parsers/treeparser" + +module REXML + # Represents a full XML document, including PIs, a doctype, etc. A + # Document has a single child that can be accessed by root(). + # Note that if you want to have an XML declaration written for a document + # you create, you must add one; REXML documents do not write a default + # declaration for you. See |DECLARATION| and |write|. + class Document < Element + # A convenient default XML declaration. If you want an XML declaration, + # the easiest way to add one is mydoc << Document::DECLARATION + # +DEPRECATED+ + # Use: mydoc << XMLDecl.default + DECLARATION = XMLDecl.default + + # Constructor + # @param source if supplied, must be a Document, String, or IO. + # Documents have their context and Element attributes cloned. + # Strings are expected to be valid XML documents. IOs are expected + # to be sources of valid XML documents. + # @param context if supplied, contains the context of the document; + # this should be a Hash. + def initialize( source = nil, context = {} ) + @entity_expansion_count = 0 + super() + @context = context + return if source.nil? + if source.kind_of? 
Document + @context = source.context + super source + else + build( source ) + end + end + + def node_type + :document + end + + # Should be obvious + def clone + Document.new self + end + + # According to the XML spec, a root node has no expanded name + def expanded_name + '' + #d = doc_type + #d ? d.name : "UNDEFINED" + end + + alias :name :expanded_name + + # We override this, because XMLDecls and DocTypes must go at the start + # of the document + def add( child ) + if child.kind_of? XMLDecl + if @children[0].kind_of? XMLDecl + @children[0] = child + else + @children.unshift child + end + child.parent = self + elsif child.kind_of? DocType + # Find first Element or DocType node and insert the decl right + # before it. If there is no such node, just insert the child at the + # end. If there is a child and it is an DocType, then replace it. + insert_before_index = @children.find_index { |x| + x.kind_of?(Element) || x.kind_of?(DocType) + } + if insert_before_index # Not null = not end of list + if @children[ insert_before_index ].kind_of? DocType + @children[ insert_before_index ] = child + else + @children[ insert_before_index-1, 0 ] = child + end + else # Insert at end of list + @children << child + end + child.parent = self + else + rv = super + raise "attempted adding second root element to document" if @elements.size > 1 + rv + end + end + alias :<< :add + + def add_element(arg=nil, arg2=nil) + rv = super + raise "attempted adding second root element to document" if @elements.size > 1 + rv + end + + # @return the root Element of the document, or nil if this document + # has no children. + def root + elements[1] + #self + #@children.find { |item| item.kind_of? Element } + end + + # @return the DocType child of the document, if one exists, + # and nil otherwise. + def doctype + @children.find { |item| item.kind_of? DocType } + end + + # @return the XMLDecl of this document; if no XMLDecl has been + # set, the default declaration is returned. 
def xml_decl
  first_child = @children[0]
  if first_child.kind_of?(XMLDecl)
    first_child
  else
    # No declaration yet: the default one is stored as the first child
    # and that stored object is what gets returned.
    @children.unshift(XMLDecl.default)
    @children[0]
  end
end

# @return the version reported by this document's XMLDecl (see xml_decl
# for what happens when none has been set).
def version
  declaration = xml_decl
  declaration.version
end

# @return the encoding reported by this document's XMLDecl (see xml_decl
# for what happens when none has been set).
def encoding
  declaration = xml_decl
  declaration.encoding
end

# @return the standalone value reported by this document's XMLDecl (see
# xml_decl for what happens when none has been set).
def stand_alone?
  declaration = xml_decl
  declaration.stand_alone?
end

# :call-seq:
#    doc.write(output=$stdout, indent=-1, transitive=false, ie_hack=false, encoding=nil)
#    doc.write(options={:output => $stdout, :indent => -1, :transitive => false, :ie_hack => false, :encoding => nil})
#
# Write the XML tree out, optionally with indent.  This writes out the
# entire XML document, including XML declarations, doctype declarations,
# and processing instructions (if any are given).
#
# A controversial point is whether Document should always write the XML
# declaration (<?xml version='1.0'?>) whether or not one is given by the
# user (or source document).  REXML does not write one if one was not
# specified, because it adds unnecessary bandwidth to applications such
# as XML-RPC.
#
# Accept Nth argument style and options Hash style as argument.
# The recommended style is options Hash style for one or more
# arguments case.
#
# _Examples_
#   Document.new("<a><b/></a>").write
#
#   output = ""
#   Document.new("<a><b/></a>").write(output)
#
#   output = ""
#   Document.new("<a><b/></a>").write(:output => output, :indent => 2)
#
# See also the classes in the rexml/formatters package for the proper way
# to change the default formatting of XML output.
+ # + # _Examples_ + # + # output = "" + # tr = Transitive.new + # tr.write(Document.new("<a><b/></a>"), output) + # + # output:: + # output an object which supports '<< string'; this is where the + # document will be written. + # indent:: + # An integer. If -1, no indenting will be used; otherwise, the + # indentation will be twice this number of spaces, and children will be + # indented an additional amount. For a value of 3, every item will be + # indented 3 more levels, or 6 more spaces (2 * 3). Defaults to -1 + # transitive:: + # If transitive is true and indent is >= 0, then the output will be + # pretty-printed in such a way that the added whitespace does not affect + # the absolute *value* of the document -- that is, it leaves the value + # and number of Text nodes in the document unchanged. + # ie_hack:: + # This hack inserts a space before the /> on empty tags to address + # a limitation of Internet Explorer. Defaults to false + # encoding:: + # Encoding name as String. Change output encoding to specified encoding + # instead of encoding in XML declaration. + # Defaults to nil. It means encoding in XML declaration is used. + def write(*arguments) + if arguments.size == 1 and arguments[0].class == Hash + options = arguments[0] + + output = options[:output] + indent = options[:indent] + transitive = options[:transitive] + ie_hack = options[:ie_hack] + encoding = options[:encoding] + else + output, indent, transitive, ie_hack, encoding, = *arguments + end + + output ||= $stdout + indent ||= -1 + transitive = false if transitive.nil? + ie_hack = false if ie_hack.nil? 
+ encoding ||= xml_decl.encoding + + if encoding != 'UTF-8' && !output.kind_of?(Output) + output = Output.new( output, encoding ) + end + formatter = if indent > -1 + if transitive + require "rexml/formatters/transitive" + REXML::Formatters::Transitive.new( indent, ie_hack ) + else + REXML::Formatters::Pretty.new( indent, ie_hack ) + end + else + REXML::Formatters::Default.new( ie_hack ) + end + formatter.write( self, output ) + end + + + def Document::parse_stream( source, listener ) + Parsers::StreamParser.new( source, listener ).parse + end + + # Set the entity expansion limit. By default the limit is set to 10000. + # + # Deprecated. Use REXML::Security.entity_expansion_limit= instead. + def Document::entity_expansion_limit=( val ) + Security.entity_expansion_limit = val + end + + # Get the entity expansion limit. By default the limit is set to 10000. + # + # Deprecated. Use REXML::Security.entity_expansion_limit= instead. + def Document::entity_expansion_limit + return Security.entity_expansion_limit + end + + # Set the entity expansion limit. By default the limit is set to 10240. + # + # Deprecated. Use REXML::Security.entity_expansion_text_limit= instead. + def Document::entity_expansion_text_limit=( val ) + Security.entity_expansion_text_limit = val + end + + # Get the entity expansion limit. By default the limit is set to 10240. + # + # Deprecated. Use REXML::Security.entity_expansion_text_limit instead. + def Document::entity_expansion_text_limit + return Security.entity_expansion_text_limit + end + + attr_reader :entity_expansion_count + + def record_entity_expansion + @entity_expansion_count += 1 + if @entity_expansion_count > Security.entity_expansion_limit + raise "number of entity expansions exceeded, processing aborted." 
+ end + end + + def document + self + end + + private + def build( source ) + Parsers::TreeParser.new( source, self ).parse + end + end +end diff --git a/jni/ruby/lib/rexml/dtd/attlistdecl.rb b/jni/ruby/lib/rexml/dtd/attlistdecl.rb new file mode 100644 index 0000000..25955ee --- /dev/null +++ b/jni/ruby/lib/rexml/dtd/attlistdecl.rb @@ -0,0 +1,10 @@ +require "rexml/child" +module REXML + module DTD + class AttlistDecl < Child + START = "<!ATTLIST" + START_RE = /^\s*#{START}/um + PATTERN_RE = /\s*(#{START}.*?>)/um + end + end +end diff --git a/jni/ruby/lib/rexml/dtd/dtd.rb b/jni/ruby/lib/rexml/dtd/dtd.rb new file mode 100644 index 0000000..62317ba --- /dev/null +++ b/jni/ruby/lib/rexml/dtd/dtd.rb @@ -0,0 +1,46 @@ +require "rexml/dtd/elementdecl" +require "rexml/dtd/entitydecl" +require "rexml/comment" +require "rexml/dtd/notationdecl" +require "rexml/dtd/attlistdecl" +require "rexml/parent" + +module REXML + module DTD + class Parser + def Parser.parse( input ) + case input + when String + parse_helper input + when File + parse_helper input.read + end + end + + # Takes a String and parses it out + def Parser.parse_helper( input ) + contents = Parent.new + while input.size > 0 + case input + when ElementDecl.PATTERN_RE + match = $& + contents << ElementDecl.new( match ) + when AttlistDecl.PATTERN_RE + matchdata = $~ + contents << AttlistDecl.new( matchdata ) + when EntityDecl.PATTERN_RE + matchdata = $~ + contents << EntityDecl.new( matchdata ) + when Comment.PATTERN_RE + matchdata = $~ + contents << Comment.new( matchdata ) + when NotationDecl.PATTERN_RE + matchdata = $~ + contents << NotationDecl.new( matchdata ) + end + end + contents + end + end + end +end diff --git a/jni/ruby/lib/rexml/dtd/elementdecl.rb b/jni/ruby/lib/rexml/dtd/elementdecl.rb new file mode 100644 index 0000000..f90b27d --- /dev/null +++ b/jni/ruby/lib/rexml/dtd/elementdecl.rb @@ -0,0 +1,17 @@ +require "rexml/child" +module REXML + module DTD + class ElementDecl < Child + START = "<!ELEMENT" + 
START_RE = /^\s*#{START}/um + # PATTERN_RE = /^\s*(#{START}.*?)>/um + PATTERN_RE = /^\s*#{START}\s+((?:[:\w][-\.\w]*:)?[-!\*\.\w]*)(.*?)>/ + #\s*((((["']).*?\5)|[^\/'">]*)*?)(\/)?>/um, true) + + def initialize match + @name = match[1] + @rest = match[2] + end + end + end +end diff --git a/jni/ruby/lib/rexml/dtd/entitydecl.rb b/jni/ruby/lib/rexml/dtd/entitydecl.rb new file mode 100644 index 0000000..a9286b2 --- /dev/null +++ b/jni/ruby/lib/rexml/dtd/entitydecl.rb @@ -0,0 +1,56 @@ +require "rexml/child" +module REXML + module DTD + class EntityDecl < Child + START = "<!ENTITY" + START_RE = /^\s*#{START}/um + PUBLIC = /^\s*#{START}\s+(?:%\s+)?(\w+)\s+PUBLIC\s+((["']).*?\3)\s+((["']).*?\5)\s*>/um + SYSTEM = /^\s*#{START}\s+(?:%\s+)?(\w+)\s+SYSTEM\s+((["']).*?\3)(?:\s+NDATA\s+\w+)?\s*>/um + PLAIN = /^\s*#{START}\s+(\w+)\s+((["']).*?\3)\s*>/um + PERCENT = /^\s*#{START}\s+%\s+(\w+)\s+((["']).*?\3)\s*>/um + # <!ENTITY name SYSTEM "..."> + # <!ENTITY name "..."> + def initialize src + super() + md = nil + if src.match( PUBLIC ) + md = src.match( PUBLIC, true ) + @middle = "PUBLIC" + @content = "#{md[2]} #{md[4]}" + elsif src.match( SYSTEM ) + md = src.match( SYSTEM, true ) + @middle = "SYSTEM" + @content = md[2] + elsif src.match( PLAIN ) + md = src.match( PLAIN, true ) + @middle = "" + @content = md[2] + elsif src.match( PERCENT ) + md = src.match( PERCENT, true ) + @middle = "" + @content = md[2] + end + raise ParseException.new("failed Entity match", src) if md.nil? 
+ @name = md[1] + end + + def to_s + rv = "<!ENTITY #@name " + rv << "#@middle " if @middle.size > 0 + rv << @content + rv + end + + def write( output, indent ) + indent( output, indent ) + output << to_s + end + + def EntityDecl.parse_source source, listener + md = source.match( PATTERN_RE, true ) + thing = md[0].squeeze(" \t\n\r") + listener.send inspect.downcase, thing + end + end + end +end diff --git a/jni/ruby/lib/rexml/dtd/notationdecl.rb b/jni/ruby/lib/rexml/dtd/notationdecl.rb new file mode 100644 index 0000000..17d1b9e --- /dev/null +++ b/jni/ruby/lib/rexml/dtd/notationdecl.rb @@ -0,0 +1,39 @@ +require "rexml/child" +module REXML + module DTD + class NotationDecl < Child + START = "<!NOTATION" + START_RE = /^\s*#{START}/um + PUBLIC = /^\s*#{START}\s+(\w[\w-]*)\s+(PUBLIC)\s+((["']).*?\4)\s*>/um + SYSTEM = /^\s*#{START}\s+(\w[\w-]*)\s+(SYSTEM)\s+((["']).*?\4)\s*>/um + def initialize src + super() + if src.match( PUBLIC ) + md = src.match( PUBLIC, true ) + elsif src.match( SYSTEM ) + md = src.match( SYSTEM, true ) + else + raise ParseException.new( "error parsing notation: no matching pattern", src ) + end + @name = md[1] + @middle = md[2] + @rest = md[3] + end + + def to_s + "<!NOTATION #@name #@middle #@rest>" + end + + def write( output, indent ) + indent( output, indent ) + output << to_s + end + + def NotationDecl.parse_source source, listener + md = source.match( PATTERN_RE, true ) + thing = md[0].squeeze(" \t\n\r") + listener.send inspect.downcase, thing + end + end + end +end diff --git a/jni/ruby/lib/rexml/element.rb b/jni/ruby/lib/rexml/element.rb new file mode 100644 index 0000000..e459704 --- /dev/null +++ b/jni/ruby/lib/rexml/element.rb @@ -0,0 +1,1240 @@ +require "rexml/parent" +require "rexml/namespace" +require "rexml/attribute" +require "rexml/cdata" +require "rexml/xpath" +require "rexml/parseexception" + +module REXML + # An implementation note about namespaces: + # As we parse, when we find namespaces we put them in a hash and assign + # 
them a unique ID. We then convert the namespace prefix for the node + # to the unique ID. This makes namespace lookup much faster for the + # cost of extra memory use. We save the namespace prefix for the + # context node and convert it back when we write it. + @@namespaces = {} + + # Represents a tagged XML element. Elements are characterized by + # having children, attributes, and names, and can themselves be + # children. + class Element < Parent + include Namespace + + UNDEFINED = "UNDEFINED"; # The default name + + # Mechanisms for accessing attributes and child elements of this + # element. + attr_reader :attributes, :elements + # The context holds information about the processing environment, such as + # whitespace handling. + attr_accessor :context + + # Constructor + # arg:: + # if not supplied, will be set to the default value. + # If a String, the name of this object will be set to the argument. + # If an Element, the object will be shallowly cloned; name, + # attributes, and namespaces will be copied. Children will +not+ be + # copied. + # parent:: + # if supplied, must be a Parent, and will be used as + # the parent of this object. + # context:: + # If supplied, must be a hash containing context items. Context items + # include: + # * <tt>:respect_whitespace</tt> the value of this is :+all+ or an array of + # strings being the names of the elements to respect + # whitespace for. Defaults to :+all+. + # * <tt>:compress_whitespace</tt> the value can be :+all+ or an array of + # strings being the names of the elements to ignore whitespace on. + # Overrides :+respect_whitespace+. + # * <tt>:ignore_whitespace_nodes</tt> the value can be :+all+ or an array + # of strings being the names of the elements in which to ignore + # whitespace-only nodes. If this is set, Text nodes which contain only + # whitespace will not be added to the document tree. 
+ # * <tt>:raw</tt> can be :+all+, or an array of strings being the names of + # the elements to process in raw mode. In raw mode, special + # characters in text is not converted to or from entities. + def initialize( arg = UNDEFINED, parent=nil, context=nil ) + super(parent) + + @elements = Elements.new(self) + @attributes = Attributes.new(self) + @context = context + + if arg.kind_of? String + self.name = arg + elsif arg.kind_of? Element + self.name = arg.expanded_name + arg.attributes.each_attribute{ |attribute| + @attributes << Attribute.new( attribute ) + } + @context = arg.context + end + end + + def inspect + rv = "<#@expanded_name" + + @attributes.each_attribute do |attr| + rv << " " + attr.write( rv, 0 ) + end + + if children.size > 0 + rv << "> ... </>" + else + rv << "/>" + end + end + + + # Creates a shallow copy of self. + # d = Document.new "<a><b/><b/><c><d/></c></a>" + # new_a = d.root.clone + # puts new_a # => "<a/>" + def clone + self.class.new self + end + + # Evaluates to the root node of the document that this element + # belongs to. If this element doesn't belong to a document, but does + # belong to another Element, the parent's root will be returned, until the + # earliest ancestor is found. + # + # Note that this is not the same as the document element. + # In the following example, <a> is the document element, and the root + # node is the parent node of the document element. You may ask yourself + # why the root node is useful: consider the doctype and XML declaration, + # and any processing instructions before the document element... they + # are children of the root node, or siblings of the document element. + # The only time this isn't true is when an Element is created that is + # not part of any Document. In this case, the ancestor that has no + # parent acts as the root node. 
+ # d = Document.new '<a><b><c/></b></a>' + # a = d[1] ; c = a[1][1] + # d.root_node == d # TRUE + # a.root_node # namely, d + # c.root_node # again, d + def root_node + parent.nil? ? self : parent.root_node + end + + def root + return elements[1] if self.kind_of? Document + return self if parent.kind_of? Document or parent.nil? + return parent.root + end + + # Evaluates to the document to which this element belongs, or nil if this + # element doesn't belong to a document. + def document + rt = root + rt.parent if rt + end + + # Evaluates to +true+ if whitespace is respected for this element. This + # is the case if: + # 1. Neither :+respect_whitespace+ nor :+compress_whitespace+ has any value + # 2. The context has :+respect_whitespace+ set to :+all+ or + # an array containing the name of this element, and + # :+compress_whitespace+ isn't set to :+all+ or an array containing the + # name of this element. + # The evaluation is tested against +expanded_name+, and so is namespace + # sensitive. + def whitespace + @whitespace = nil + if @context + if @context[:respect_whitespace] + @whitespace = (@context[:respect_whitespace] == :all or + @context[:respect_whitespace].include? expanded_name) + end + @whitespace = false if (@context[:compress_whitespace] and + (@context[:compress_whitespace] == :all or + @context[:compress_whitespace].include? expanded_name) + ) + end + @whitespace = true unless @whitespace == false + @whitespace + end + + def ignore_whitespace_nodes + @ignore_whitespace_nodes = false + if @context + if @context[:ignore_whitespace_nodes] + @ignore_whitespace_nodes = + (@context[:ignore_whitespace_nodes] == :all or + @context[:ignore_whitespace_nodes].include? expanded_name) + end + end + end + + # Evaluates to +true+ if raw mode is set for this element. This + # is the case if the context has :+raw+ set to :+all+ or + # an array containing the name of this element. 
+ # + # The evaluation is tested against +expanded_name+, and so is namespace + # sensitive. + def raw + @raw = (@context and @context[:raw] and + (@context[:raw] == :all or + @context[:raw].include? expanded_name)) + @raw + end + + #once :whitespace, :raw, :ignore_whitespace_nodes + + ################################################# + # Namespaces # + ################################################# + + # Evaluates to an +Array+ containing the prefixes (names) of all defined + # namespaces at this context node. + # doc = Document.new("<a xmlns:x='1' xmlns:y='2'><b/><c xmlns:z='3'/></a>") + # doc.elements['//b'].prefixes # -> ['x', 'y'] + def prefixes + prefixes = [] + prefixes = parent.prefixes if parent + prefixes |= attributes.prefixes + return prefixes + end + + def namespaces + namespaces = {} + namespaces = parent.namespaces if parent + namespaces = namespaces.merge( attributes.namespaces ) + return namespaces + end + + # Evaluates to the URI for a prefix, or the empty string if no such + # namespace is declared for this element. Evaluates recursively for + # ancestors. Returns the default namespace, if there is one. + # prefix:: + # the prefix to search for. If not supplied, returns the default + # namespace if one exists + # Returns:: + # the namespace URI as a String, or nil if no such namespace + # exists. If the namespace is undefined, returns an empty string + # doc = Document.new("<a xmlns='1' xmlns:y='2'><b/><c xmlns:z='3'/></a>") + # b = doc.elements['//b'] + # b.namespace # -> '1' + # b.namespace("y") # -> '2' + def namespace(prefix=nil) + if prefix.nil? + prefix = prefix() + end + if prefix == '' + prefix = "xmlns" + else + prefix = "xmlns:#{prefix}" unless prefix[0,5] == 'xmlns' + end + ns = attributes[ prefix ] + ns = parent.namespace(prefix) if ns.nil? and parent + ns = '' if ns.nil? and prefix == 'xmlns' + return ns + end + + # Adds a namespace to this element. 
+ # prefix:: + # the prefix string, or the namespace URI if +uri+ is not + # supplied + # uri:: + # the namespace URI. May be nil, in which +prefix+ is used as + # the URI + # Evaluates to: this Element + # a = Element.new("a") + # a.add_namespace("xmlns:foo", "bar" ) + # a.add_namespace("foo", "bar") # shorthand for previous line + # a.add_namespace("twiddle") + # puts a #-> <a xmlns:foo='bar' xmlns='twiddle'/> + def add_namespace( prefix, uri=nil ) + unless uri + @attributes["xmlns"] = prefix + else + prefix = "xmlns:#{prefix}" unless prefix =~ /^xmlns:/ + @attributes[ prefix ] = uri + end + self + end + + # Removes a namespace from this node. This only works if the namespace is + # actually declared in this node. If no argument is passed, deletes the + # default namespace. + # + # Evaluates to: this element + # doc = Document.new "<a xmlns:foo='bar' xmlns='twiddle'/>" + # doc.root.delete_namespace + # puts doc # -> <a xmlns:foo='bar'/> + # doc.root.delete_namespace 'foo' + # puts doc # -> <a/> + def delete_namespace namespace="xmlns" + namespace = "xmlns:#{namespace}" unless namespace == 'xmlns' + attribute = attributes.get_attribute(namespace) + attribute.remove unless attribute.nil? + self + end + + ################################################# + # Elements # + ################################################# + + # Adds a child to this element, optionally setting attributes in + # the element. + # element:: + # optional. If Element, the element is added. + # Otherwise, a new Element is constructed with the argument (see + # Element.initialize). + # attrs:: + # If supplied, must be a Hash containing String name,value + # pairs, which will be used to set the attributes of the new Element. 
+ # Returns:: the Element that was added + # el = doc.add_element 'my-tag' + # el = doc.add_element 'my-tag', {'attr1'=>'val1', 'attr2'=>'val2'} + # el = Element.new 'my-tag' + # doc.add_element el + def add_element element, attrs=nil + raise "First argument must be either an element name, or an Element object" if element.nil? + el = @elements.add(element) + attrs.each do |key, value| + el.attributes[key]=value + end if attrs.kind_of? Hash + el + end + + # Deletes a child element. + # element:: + # Must be an +Element+, +String+, or +Integer+. If Element, + # the element is removed. If String, the element is found (via XPath) + # and removed. <em>This means that any parent can remove any + # descendant.<em> If Integer, the Element indexed by that number will be + # removed. + # Returns:: the element that was removed. + # doc.delete_element "/a/b/c[@id='4']" + # doc.delete_element doc.elements["//k"] + # doc.delete_element 1 + def delete_element element + @elements.delete element + end + + # Evaluates to +true+ if this element has at least one child Element + # doc = Document.new "<a><b/><c>Text</c></a>" + # doc.root.has_elements # -> true + # doc.elements["/a/b"].has_elements # -> false + # doc.elements["/a/c"].has_elements # -> false + def has_elements? + !@elements.empty? + end + + # Iterates through the child elements, yielding for each Element that + # has a particular attribute set. + # key:: + # the name of the attribute to search for + # value:: + # the value of the attribute + # max:: + # (optional) causes this method to return after yielding + # for this number of matching children + # name:: + # (optional) if supplied, this is an XPath that filters + # the children to check. 
+ # + # doc = Document.new "<a><b @id='1'/><c @id='2'/><d @id='1'/><e/></a>" + # # Yields b, c, d + # doc.root.each_element_with_attribute( 'id' ) {|e| p e} + # # Yields b, d + # doc.root.each_element_with_attribute( 'id', '1' ) {|e| p e} + # # Yields b + # doc.root.each_element_with_attribute( 'id', '1', 1 ) {|e| p e} + # # Yields d + # doc.root.each_element_with_attribute( 'id', '1', 0, 'd' ) {|e| p e} + def each_element_with_attribute( key, value=nil, max=0, name=nil, &block ) # :yields: Element + each_with_something( proc {|child| + if value.nil? + child.attributes[key] != nil + else + child.attributes[key]==value + end + }, max, name, &block ) + end + + # Iterates through the children, yielding for each Element that + # has a particular text set. + # text:: + # the text to search for. If nil, or not supplied, will iterate + # over all +Element+ children that contain at least one +Text+ node. + # max:: + # (optional) causes this method to return after yielding + # for this number of matching children + # name:: + # (optional) if supplied, this is an XPath that filters + # the children to check. + # + # doc = Document.new '<a><b>b</b><c>b</c><d>d</d><e/></a>' + # # Yields b, c, d + # doc.each_element_with_text {|e|p e} + # # Yields b, c + # doc.each_element_with_text('b'){|e|p e} + # # Yields b + # doc.each_element_with_text('b', 1){|e|p e} + # # Yields d + # doc.each_element_with_text(nil, 0, 'd'){|e|p e} + def each_element_with_text( text=nil, max=0, name=nil, &block ) # :yields: Element + each_with_something( proc {|child| + if text.nil? + child.has_text? + else + child.text == text + end + }, max, name, &block ) + end + + # Synonym for Element.elements.each + def each_element( xpath=nil, &block ) # :yields: Element + @elements.each( xpath, &block ) + end + + # Synonym for Element.to_a + # This is a little slower than calling elements.each directly. 
+ # xpath:: any XPath by which to search for elements in the tree + # Returns:: an array of Elements that match the supplied path + def get_elements( xpath ) + @elements.to_a( xpath ) + end + + # Returns the next sibling that is an element, or nil if there is + # no Element sibling after this one + # doc = Document.new '<a><b/>text<c/></a>' + # doc.root.elements['b'].next_element #-> <c/> + # doc.root.elements['c'].next_element #-> nil + def next_element + element = next_sibling + element = element.next_sibling until element.nil? or element.kind_of? Element + return element + end + + # Returns the previous sibling that is an element, or nil if there is + # no Element sibling prior to this one + # doc = Document.new '<a><b/>text<c/></a>' + # doc.root.elements['c'].previous_element #-> <b/> + # doc.root.elements['b'].previous_element #-> nil + def previous_element + element = previous_sibling + element = element.previous_sibling until element.nil? or element.kind_of? Element + return element + end + + + ################################################# + # Text # + ################################################# + + # Evaluates to +true+ if this element has at least one Text child + def has_text? + not text().nil? + end + + # A convenience method which returns the String value of the _first_ + # child text element, if one exists, and +nil+ otherwise. + # + # <em>Note that an element may have multiple Text elements, perhaps + # separated by other children</em>. Be aware that this method only returns + # the first Text node. + # + # This method returns the +value+ of the first text child node, which + # ignores the +raw+ setting, so always returns normalized text. See + # the Text::value documentation. + # + # doc = Document.new "<p>some text <b>this is bold!</b> more text</p>" + # # The element 'p' has two text elements, "some text " and " more text". + # doc.root.text #-> "some text " + def text( path = nil ) + rv = get_text(path) + return rv.value unless rv.nil? 
+ nil + end + + # Returns the first child Text node, if any, or +nil+ otherwise. + # This method returns the actual +Text+ node, rather than the String content. + # doc = Document.new "<p>some text <b>this is bold!</b> more text</p>" + # # The element 'p' has two text elements, "some text " and " more text". + # doc.root.get_text.value #-> "some text " + def get_text path = nil + rv = nil + if path + element = @elements[ path ] + rv = element.get_text unless element.nil? + else + rv = @children.find { |node| node.kind_of? Text } + end + return rv + end + + # Sets the first Text child of this object. See text() for a + # discussion about Text children. + # + # If a Text child already exists, the child is replaced by this + # content. This means that Text content can be deleted by calling + # this method with a nil argument. In this case, the next Text + # child becomes the first Text child. In no case is the order of + # any siblings disturbed. + # text:: + # If a String, a new Text child is created and added to + # this Element as the first Text child. If Text, the text is set + # as the first Child element. If nil, then any existing first Text + # child is removed. + # Returns:: this Element. + # doc = Document.new '<a><b/></a>' + # doc.root.text = 'Sean' #-> '<a><b/>Sean</a>' + # doc.root.text = 'Elliott' #-> '<a><b/>Elliott</a>' + # doc.root.add_element 'c' #-> '<a><b/>Elliott<c/></a>' + # doc.root.text = 'Russell' #-> '<a><b/>Russell<c/></a>' + # doc.root.text = nil #-> '<a><b/><c/></a>' + def text=( text ) + if text.kind_of? String + text = Text.new( text, whitespace(), nil, raw() ) + elsif !text.nil? and !text.kind_of? Text + text = Text.new( text.to_s, whitespace(), nil, raw() ) + end + old_text = get_text + if text.nil? + old_text.remove unless old_text.nil? + else + if old_text.nil? + self << text + else + old_text.replace_with( text ) + end + end + return self + end + + # A helper method to add a Text child. 
Actual Text instances can + # be added with regular Parent methods, such as add() and <<() + # text:: + # if a String, a new Text instance is created and added + # to the parent. If Text, the object is added directly. + # Returns:: this Element + # e = Element.new('a') #-> <e/> + # e.add_text 'foo' #-> <e>foo</e> + # e.add_text Text.new(' bar') #-> <e>foo bar</e> + # Note that at the end of this example, the branch has <b>3</b> nodes; the 'e' + # element and <b>2</b> Text node children. + def add_text( text ) + if text.kind_of? String + if @children[-1].kind_of? Text + @children[-1] << text + return + end + text = Text.new( text, whitespace(), nil, raw() ) + end + self << text unless text.nil? + return self + end + + def node_type + :element + end + + def xpath + path_elements = [] + cur = self + path_elements << __to_xpath_helper( self ) + while cur.parent + cur = cur.parent + path_elements << __to_xpath_helper( cur ) + end + return path_elements.reverse.join( "/" ) + end + + ################################################# + # Attributes # + ################################################# + + def attribute( name, namespace=nil ) + prefix = nil + if namespaces.respond_to? :key + prefix = namespaces.key(namespace) if namespace + else + prefix = namespaces.index(namespace) if namespace + end + prefix = nil if prefix == 'xmlns' + + ret_val = + attributes.get_attribute( "#{prefix ? prefix + ':' : ''}#{name}" ) + + return ret_val unless ret_val.nil? + return nil if prefix.nil? + + # now check that prefix'es namespace is not the same as the + # default namespace + return nil unless ( namespaces[ prefix ] == namespaces[ 'xmlns' ] ) + + attributes.get_attribute( name ) + + end + + # Evaluates to +true+ if this element has any attributes set, false + # otherwise. + def has_attributes? + return !@attributes.empty? + end + + # Adds an attribute to this element, overwriting any existing attribute + # by the same name. + # key:: + # can be either an Attribute or a String. 
    #   If an Attribute,
    #   the attribute is added to the list of Element attributes.  If String,
    #   the argument is used as the name of the new attribute, and the value
    #   parameter must be supplied.
    # value::
    #   Required if +key+ is a String, and ignored if the first argument is
    #   an Attribute.  This is a String, and is used as the value
    #   of the new Attribute.  This should be the unnormalized value of the
    #   attribute (without entities).
    # Returns:: the Attribute added
    #  e = Element.new 'e'
    #  e.add_attribute( 'a', 'b' )               #-> <e a='b'/>
    #  e.add_attribute( 'x:a', 'c' )             #-> <e a='b' x:a='c'/>
    #  e.add_attribute Attribute.new('b', 'd')   #-> <e a='b' x:a='c' b='d'/>
    def add_attribute( key, value=nil )
      if key.kind_of? Attribute
        @attributes << key
      else
        @attributes[key] = value
      end
    end

    # Add multiple attributes to this element.
    # hash:: is either a hash, or array of arrays (each inner array holding
    #        the name at index 0 and the value at index 1).  Any other
    #        argument is ignored.
    #  el.add_attributes( {"name1"=>"value1", "name2"=>"value2"} )
    #  el.add_attributes( [ ["name1","value1"], ["name2","value2"] ] )
    def add_attributes hash
      if hash.kind_of? Hash
        hash.each_pair {|key, value| @attributes[key] = value }
      elsif hash.kind_of? Array
        hash.each { |value| @attributes[ value[0] ] = value[1] }
      end
    end

    # Removes an attribute
    # key::
    #   either an Attribute or a String.  In either case, the
    #   attribute is found by matching the attribute name to the argument,
    #   and then removed.  If no attribute is found, no action is taken.
    # Returns::
    #   the attribute removed, or nil if this Element did not contain
    #   a matching attribute
    #  e = Element.new('E')
    #  e.add_attribute( 'name', 'Sean' )             #-> <E name='Sean'/>
    #  r = e.add_attribute( 'sur:name', 'Russell' )  #-> <E name='Sean' sur:name='Russell'/>
    #  e.delete_attribute( 'name' )                  #-> <E sur:name='Russell'/>
    #  e.delete_attribute( r )                       #-> <E/>
    def delete_attribute(key)
      attr = @attributes.get_attribute(key)
      # NOTE(review): the result is whatever Attribute#remove returns --
      # confirm that this is the removed attribute, as documented above.
      attr.remove unless attr.nil?
    end

    #################################################
    # Other Utilities                               #
    #################################################

    # Get an array of all CData children.
    # IMMUTABLE (the returned array is frozen)
    def cdatas
      find_all { |child| child.kind_of? CData }.freeze
    end

    # Get an array of all Comment children.
    # IMMUTABLE (the returned array is frozen)
    def comments
      find_all { |child| child.kind_of? Comment }.freeze
    end

    # Get an array of all Instruction children.
    # IMMUTABLE (the returned array is frozen)
    def instructions
      find_all { |child| child.kind_of? Instruction }.freeze
    end

    # Get an array of all Text children.
    # IMMUTABLE (the returned array is frozen)
    def texts
      find_all { |child| child.kind_of? Text }.freeze
    end

    # == DEPRECATED
    # See REXML::Formatters
    #
    # Writes out this element, and recursively, all children.
    # output::
    #   output an object which supports '<< string'; this is where the
    #   document will be written.
    # indent::
    #   An integer.  If -1, no indenting will be used; otherwise, the
    #   indentation will be this number of spaces, and children will be
    #   indented an additional amount.  Defaults to -1
    # transitive::
    #   If transitive is true and indent is >= 0, then the output will be
    #   pretty-printed in such a way that the added whitespace does not affect
    #   the parse tree of the document
    # ie_hack::
    #   This hack inserts a space before the /> on empty tags to address
    #   a limitation of Internet Explorer.  Defaults to false
    #
    #  out = ''
    #  doc.write( out )     #-> doc is written to the string 'out'
    #  doc.write( $stdout ) #-> doc written to the console
    def write(output=$stdout, indent=-1, transitive=false, ie_hack=false)
      # Every call emits a deprecation warning before delegating.
      Kernel.warn("#{self.class.name}.write is deprecated. See REXML::Formatters")
      # indent > -1 selects a pretty-printing formatter (Transitive when
      # requested, otherwise Pretty); a negative indent selects Default.
      formatter = if indent > -1
          if transitive
            # Loaded lazily, only when the transitive formatter is asked for.
            require "rexml/formatters/transitive"
            REXML::Formatters::Transitive.new( indent, ie_hack )
          else
            REXML::Formatters::Pretty.new( indent, ie_hack )
          end
        else
          REXML::Formatters::Default.new( ie_hack )
        end
      formatter.write( self, output )
    end


    private
    # Builds one step of the path returned by #xpath: the node's expanded
    # name, followed by a 1-based "[n]" position when the node's parent has
    # more than one Element child with the same expanded name.
    def __to_xpath_helper node
      rv = node.expanded_name.clone
      if node.parent
        results = node.parent.find_all {|n|
          n.kind_of?(REXML::Element) and n.expanded_name == node.expanded_name
        }
        if results.length > 1
          idx = results.index( node )
          rv << "[#{idx+1}]"
        end
      end
      rv
    end

    # A private helper method
    # test:: a callable; children for which it returns a true value are yielded
    # max::  when greater than 0, iteration stops once this many children
    #        have matched
    # name:: passed to @elements.each to select the children to test
    def each_with_something( test, max=0, name=nil )
      num = 0
      @elements.each( name ){ |child|
        # The counter is bumped as part of the condition, i.e. before the
        # matching child is yielded.
        yield child if test.call(child) and num += 1
        return if max>0 and num == max
      }
    end
  end

  ########################################################################
  # ELEMENTS                                                             #
  ########################################################################

  # A class which provides filtering of children for Elements, and
  # XPath search support.  You are expected to only encounter this class as
  # the <tt>element.elements</tt> object.  Therefore, you are
  # _not_ expected to instantiate this yourself.
  class Elements
    include Enumerable
    # Constructor
    # parent:: the parent Element
    def initialize parent
      @element = parent
    end

    # Fetches a child element.  Filters only Element children, regardless of
    # the XPath match.
    # index::
    #   the search parameter.  This is either an Integer, which
    #   will be used to find the index'th child Element, or an XPath,
    #   which will be used to search for the Element.
<em>Because + # of the nature of XPath searches, any element in the connected XML + # document can be fetched through any other element.</em> <b>The + # Integer index is 1-based, not 0-based.</b> This means that the first + # child element is at index 1, not 0, and the +n+th element is at index + # +n+, not <tt>n-1</tt>. This is because XPath indexes element children + # starting from 1, not 0, and the indexes should be the same. + # name:: + # optional, and only used in the first argument is an + # Integer. In that case, the index'th child Element that has the + # supplied name will be returned. Note again that the indexes start at 1. + # Returns:: the first matching Element, or nil if no child matched + # doc = Document.new '<a><b/><c id="1"/><c id="2"/><d/></a>' + # doc.root.elements[1] #-> <b/> + # doc.root.elements['c'] #-> <c id="1"/> + # doc.root.elements[2,'c'] #-> <c id="2"/> + def []( index, name=nil) + if index.kind_of? Integer + raise "index (#{index}) must be >= 1" if index < 1 + name = literalize(name) if name + num = 0 + @element.find { |child| + child.kind_of? Element and + (name.nil? ? true : child.has_name?( name )) and + (num += 1) == index + } + else + return XPath::first( @element, index ) + #{ |element| + # return element if element.kind_of? Element + #} + #return nil + end + end + + # Sets an element, replacing any previous matching element. If no + # existing element is found ,the element is added. + # index:: Used to find a matching element to replace. See [](). + # element:: + # The element to replace the existing element with + # the previous element + # Returns:: nil if no previous element was found. + # + # doc = Document.new '<a/>' + # doc.root.elements[10] = Element.new('b') #-> <a><b/></a> + # doc.root.elements[1] #-> <b/> + # doc.root.elements[1] = Element.new('c') #-> <a><c/></a> + # doc.root.elements['c'] = Element.new('d') #-> <a><d/></a> + def []=( index, element ) + previous = self[index] + if previous.nil? 
+ @element.add element + else + previous.replace_with element + end + return previous + end + + # Returns +true+ if there are no +Element+ children, +false+ otherwise + def empty? + @element.find{ |child| child.kind_of? Element}.nil? + end + + # Returns the index of the supplied child (starting at 1), or -1 if + # the element is not a child + # element:: an +Element+ child + def index element + rv = 0 + found = @element.find do |child| + child.kind_of? Element and + (rv += 1) and + child == element + end + return rv if found == element + return -1 + end + + # Deletes a child Element + # element:: + # Either an Element, which is removed directly; an + # xpath, where the first matching child is removed; or an Integer, + # where the n'th Element is removed. + # Returns:: the removed child + # doc = Document.new '<a><b/><c/><c id="1"/></a>' + # b = doc.root.elements[1] + # doc.root.elements.delete b #-> <a><c/><c id="1"/></a> + # doc.elements.delete("a/c[@id='1']") #-> <a><c/></a> + # doc.root.elements.delete 1 #-> <a/> + def delete element + if element.kind_of? Element + @element.delete element + else + el = self[element] + el.remove if el + end + end + + # Removes multiple elements. Filters for Element children, regardless of + # XPath matching. + # xpath:: all elements matching this String path are removed. + # Returns:: an Array of Elements that have been removed + # doc = Document.new '<a><c/><c/><c/><c/></a>' + # deleted = doc.elements.delete_all 'a/c' #-> [<c/>, <c/>, <c/>, <c/>] + def delete_all( xpath ) + rv = [] + XPath::each( @element, xpath) {|element| + rv << element if element.kind_of? Element + } + rv.each do |element| + @element.delete element + element.remove + end + return rv + end + + # Adds an element + # element:: + # if supplied, is either an Element, String, or + # Source (see Element.initialize). 
If not supplied or nil, a + # new, default Element will be constructed + # Returns:: the added Element + # a = Element.new('a') + # a.elements.add(Element.new('b')) #-> <a><b/></a> + # a.elements.add('c') #-> <a><b/><c/></a> + def add element=nil + if element.nil? + Element.new("", self, @element.context) + elsif not element.kind_of?(Element) + Element.new(element, self, @element.context) + else + @element << element + element.context = @element.context + element + end + end + + alias :<< :add + + # Iterates through all of the child Elements, optionally filtering + # them by a given XPath + # xpath:: + # optional. If supplied, this is a String XPath, and is used to + # filter the children, so that only matching children are yielded. Note + # that XPaths are automatically filtered for Elements, so that + # non-Element children will not be yielded + # doc = Document.new '<a><b/><c/><d/>sean<b/><c/><d/></a>' + # doc.root.elements.each {|e|p e} #-> Yields b, c, d, b, c, d elements + # doc.root.elements.each('b') {|e|p e} #-> Yields b, b elements + # doc.root.elements.each('child::node()') {|e|p e} + # #-> Yields <b/>, <c/>, <d/>, <b/>, <c/>, <d/> + # XPath.each(doc.root, 'child::node()', &block) + # #-> Yields <b/>, <c/>, <d/>, sean, <b/>, <c/>, <d/> + def each( xpath=nil ) + XPath::each( @element, xpath ) {|e| yield e if e.kind_of? Element } + end + + def collect( xpath=nil ) + collection = [] + XPath::each( @element, xpath ) {|e| + collection << yield(e) if e.kind_of?(Element) + } + collection + end + + def inject( xpath=nil, initial=nil ) + first = true + XPath::each( @element, xpath ) {|e| + if (e.kind_of? Element) + if (first and initial == nil) + initial = e + first = false + else + initial = yield( initial, e ) if e.kind_of? Element + end + end + } + initial + end + + # Returns the number of +Element+ children of the parent object. 
+ # doc = Document.new '<a>sean<b/>elliott<b/>russell<b/></a>' + # doc.root.size #-> 6, 3 element and 3 text nodes + # doc.root.elements.size #-> 3 + def size + count = 0 + @element.each {|child| count+=1 if child.kind_of? Element } + count + end + + # Returns an Array of Element children. An XPath may be supplied to + # filter the children. Only Element children are returned, even if the + # supplied XPath matches non-Element children. + # doc = Document.new '<a>sean<b/>elliott<c/></a>' + # doc.root.elements.to_a #-> [ <b/>, <c/> ] + # doc.root.elements.to_a("child::node()") #-> [ <b/>, <c/> ] + # XPath.match(doc.root, "child::node()") #-> [ sean, <b/>, elliott, <c/> ] + def to_a( xpath=nil ) + rv = XPath.match( @element, xpath ) + return rv.find_all{|e| e.kind_of? Element} if xpath + rv + end + + private + # Private helper class. Removes quotes from quoted strings + def literalize name + name = name[1..-2] if name[0] == ?' or name[0] == ?" #' + name + end + end + + ######################################################################## + # ATTRIBUTES # + ######################################################################## + + # A class that defines the set of Attributes of an Element and provides + # operations for accessing elements in that set. + class Attributes < Hash + # Constructor + # element:: the Element of which this is an Attribute + def initialize element + @element = element + end + + # Fetches an attribute value. If you want to get the Attribute itself, + # use get_attribute() + # name:: an XPath attribute name. Namespaces are relevant here. + # Returns:: + # the String value of the matching attribute, or +nil+ if no + # matching attribute was found. This is the unnormalized value + # (with entities expanded). + # + # doc = Document.new "<a foo:att='1' bar:att='2' att='<'/>" + # doc.root.attributes['att'] #-> '<' + # doc.root.attributes['bar:att'] #-> '2' + def [](name) + attr = get_attribute(name) + return attr.value unless attr.nil? 
# Walks every stored value and yields Attribute nodes (never String values).
# A stored value that is itself an Attribute is yielded directly; any other
# stored value is treated as a nested table and each of its values is
# yielded in turn.
#
#  doc = Document.new '<a x="1" y="2"/>'
#  doc.root.attributes.each_attribute {|attr|
#    p attr.expanded_name+" => "+attr.value
#  }
def each_attribute # :yields: attribute
  each_value do |entry|
    case entry
    when Attribute
      yield entry
    else
      entry.each_value { |nested| yield nested }
    end
  end
end
Attribute + return attr if prefix == attr.prefix + else + attr = attr[ prefix ] + return attr + end + end + element_document = @element.document + if element_document and element_document.doctype + expn = @element.expanded_name + expn = element_document.doctype.name if expn.size == 0 + attr_val = element_document.doctype.attribute_of(expn, name) + return Attribute.new( name, attr_val ) if attr_val + end + return nil + end + if attr.kind_of? Hash + attr = attr[ @element.prefix ] + end + return attr + end + + # Sets an attribute, overwriting any existing attribute value by the + # same name. Namespace is significant. + # name:: the name of the attribute + # value:: + # (optional) If supplied, the value of the attribute. If + # nil, any existing matching attribute is deleted. + # Returns:: + # Owning element + # doc = Document.new "<a x:foo='1' foo='3'/>" + # doc.root.attributes['y:foo'] = '2' + # doc.root.attributes['foo'] = '4' + # doc.root.attributes['x:foo'] = nil + def []=( name, value ) + if value.nil? # Delete the named attribute + attr = get_attribute(name) + delete attr + return + end + + unless value.kind_of? Attribute + if @element.document and @element.document.doctype + value = Text::normalize( value, @element.document.doctype ) + else + value = Text::normalize( value, nil ) + end + value = Attribute.new(name, value) + end + value.element = @element + old_attr = fetch(value.name, nil) + if old_attr.nil? + store(value.name, value) + elsif old_attr.kind_of? 
# Returns an Array of Strings naming every attribute whose prefix is
# 'xmlns'. Attributes yielded by each_attribute come first; when the owning
# element has a document with a doctype, matching attributes reported by
# doctype.attributes_of are appended. A default namespace declaration (an
# attribute merely *named* 'xmlns') is not included.
#  doc = Document.new("<a xmlns='foo' xmlns:x='bar' xmlns:y='twee' "+
#        "z='glorp' p:k='gru'/>")
#  prefixes = doc.root.attributes.prefixes    #-> ['x', 'y']
def prefixes
  declared = []
  keep_if_xmlns = proc do |attribute|
    declared << attribute.name if attribute.prefix == 'xmlns'
  end
  each_attribute(&keep_if_xmlns)
  doctype = @element.document && @element.document.doctype
  if doctype
    # An element with an empty expanded name is looked up under the
    # doctype's own name instead.
    lookup_name = @element.expanded_name
    lookup_name = doctype.name if lookup_name.size == 0
    doctype.attributes_of(lookup_name).each(&keep_if_xmlns)
  end
  declared
end
# Deletes all attributes matching a name.  Namespaces are significant.
# name::
#   A String compared against each attribute's expanded_name
# Returns:: an Array of the Attributes that were removed
#
# Matching attributes are gathered first and only then asked to remove
# themselves, so nothing is removed while each_attribute is still iterating.
def delete_all( name )
  doomed = []
  each_attribute do |candidate|
    next unless candidate.expanded_name == name
    doomed << candidate
  end
  doomed.each { |candidate| candidate.remove }
  doomed
end
module REXML
  # Mixin giving an object an +encoding+ attribute (an upper-cased encoding
  # name) plus helpers that transcode strings between that encoding and
  # UTF-8.
  module Encoding
    # ID ---> Encoding name
    attr_reader :encoding

    # Sets the encoding name.
    # encoding::
    #   a String name, a core ::Encoding object, or +nil+ (stored as 'UTF-8')
    # Returns::
    #   +false+ when the resolved name equals the current one, +true+
    #   otherwise (only observable via +send+, since assignment syntax
    #   evaluates to its right-hand side)
    # Raises:: ArgumentError when a String name cannot be resolved
    def encoding=(encoding)
      # Inside this module a bare +Encoding+ resolves lexically to
      # REXML::Encoding itself, so the core class has to be spelled
      # ::Encoding for Encoding objects to be recognised here.
      encoding = encoding.name if encoding.is_a?(::Encoding)
      if encoding.is_a?(String)
        original_encoding = encoding
        encoding = find_encoding(encoding)
        unless encoding
          raise ArgumentError, "Bad encoding name #{original_encoding}"
        end
      end
      return false if defined?(@encoding) and encoding == @encoding
      if encoding
        @encoding = encoding.upcase
      else
        @encoding = 'UTF-8'
      end
      true
    end

    # Transcodes +string+ into the current encoding.
    def encode(string)
      string.encode(@encoding)
    end

    # Transcodes +string+ from the current encoding into UTF-8.
    def decode(string)
      string.encode(::Encoding::UTF_8, @encoding)
    end

    private
    # Maps +name+ to a usable encoding name, or +nil+ when no converter from
    # it to UTF-8 can be found. "shift-jis" and "UTF-8" (any case) are
    # accepted without probing; "CP-nnn" is rewritten to "CPnnn" first.
    def find_encoding(name)
      case name
      when /\Ashift-jis\z/i
        return "SHIFT_JIS"
      when /\ACP-(\d+)\z/
        name = "CP#{$1}"
      when /\AUTF-8\z/i
        return name
      end
      begin
        ::Encoding::Converter.search_convpath(name, 'UTF-8')
      rescue ::Encoding::ConverterNotFoundError
        return nil
      end
      name
    end
  end
end
Why this idiot standard still + # plagues us is beyond me. + class Entity < Child + include XMLTokens + PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#" + SYSTEMLITERAL = %Q{((?:"[^"]*")|(?:'[^']*'))} + PUBIDLITERAL = %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')} + EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))" + NDATADECL = "\\s+NDATA\\s+#{NAME}" + PEREFERENCE = "%#{NAME};" + ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))} + PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})" + ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))" + PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>" + GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>" + ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um + + attr_reader :name, :external, :ref, :ndata, :pubid + + # Create a new entity. Simple entities can be constructed by passing a + # name, value to the constructor; this creates a generic, plain entity + # reference. For anything more complicated, you have to pass a Source to + # the constructor with the entity definition, or use the accessor methods. + # +WARNING+: There is no validation of entity state except when the entity + # is read from a stream. If you start poking around with the accessors, + # you can easily create a non-conformant Entity. The best thing to do is + # dump the stupid DTDs and use XMLSchema instead. + # + # e = Entity.new( 'amp', '&' ) + def initialize stream, value=nil, parent=nil, reference=false + super(parent) + @ndata = @pubid = @value = @external = nil + if stream.kind_of? 
# Appends this entity to +out+ as an <!ENTITY ...> declaration built from
# the instance state (reference flag, name, then either the external
# id / public id / reference / NDATA parts or the internal value).
#
# out::
#   An object implementing <TT><<</TT> to which the entity will be
#   output
# indent::
#   *DEPRECATED* and ignored
#
# Each quoted part is wrapped in double quotes unless it contains a double
# quote, in which case single quotes are used.
def write out, indent=-1
  delimiter_for = lambda { |text| text.include?('"') ? "'" : '"' }
  out << '<!ENTITY '
  out << '% ' if @reference
  out << @name
  out << ' '
  if @external
    out << @external << ' '
    if @pubid
      mark = delimiter_for.call(@pubid)
      out << mark << @pubid << mark << ' '
    end
    mark = delimiter_for.call(@ref)
    out << mark << @ref << mark
    out << ' NDATA ' << @ndata if @ndata
  else
    mark = delimiter_for.call(@value)
    out << mark << @value << mark
  end
  out << '>'
end
# Returns a copy of this entity's value with parameter-entity references
# (%name;) replaced by whatever the parent's +entity+ lookup returns for
# each name, or +nil+ when the entity has no value. Without a parent the
# copy is returned with its references intact.
#
#   <!ENTITY % foo "bar">
#   <!ENTITY yada "nanoo %foo; nanoo">
# then:
#   doctype.entity('yada').value   #-> "nanoo bar nanoo"
#
# Raises a RuntimeError once the running total of replacement bytes exceeds
# Security.entity_expansion_text_limit.
def value
  return nil unless @value
  references = @value.scan(PEREFERENCE_RE)
  expanded = @value.clone
  return expanded unless @parent
  expanded_bytes = 0
  references.each do |reference|
    replacement = @parent.entity( reference[0] )
    expanded_bytes += replacement.bytesize
    if expanded_bytes > Security.entity_expansion_text_limit
      raise "entity expansion has grown too large"
    end
    expanded.gsub!( /%#{reference.join};/um, replacement )
  end
  expanded
end
+ # + # node:: + # The node to write + # output:: + # A class implementing <TT><<</TT>. Pass in an Output object to + # change the output encoding. + def write( node, output ) + case node + + when Document + if node.xml_decl.encoding != 'UTF-8' && !output.kind_of?(Output) + output = Output.new( output, node.xml_decl.encoding ) + end + write_document( node, output ) + + when Element + write_element( node, output ) + + when Declaration, ElementDecl, NotationDecl, ExternalEntity, Entity, + Attribute, AttlistDecl + node.write( output,-1 ) + + when Instruction + write_instruction( node, output ) + + when DocType, XMLDecl + node.write( output ) + + when Comment + write_comment( node, output ) + + when CData + write_cdata( node, output ) + + when Text + write_text( node, output ) + + else + raise Exception.new("XML FORMATTING ERROR") + + end + end + + protected + def write_document( node, output ) + node.children.each { |child| write( child, output ) } + end + + def write_element( node, output ) + output << "<#{node.expanded_name}" + + node.attributes.to_a.map { |a| + Hash === a ? a.values : a + }.flatten.sort_by {|attr| attr.name}.each do |attr| + output << " " + attr.write( output ) + end unless node.attributes.empty? + + if node.children.empty? 
# Writes a processing instruction to +output+ as five separate appends:
# the start token, the target, a single space, the content and the stop
# token. Instruction::START / Instruction::STOP have their first backslash
# removed before being written.
def write_instruction( node, output )
  unescape = lambda { |token| token.sub(/\\/u, '') }
  output << unescape.call(Instruction::START)
  output << node.target
  output << ' '
  output << node.content
  output << unescape.call(Instruction::STOP)
end
# Writes a text node: every whitespace character becomes a space, runs of
# spaces collapse to one, the result is wrapped to the remaining page width
# (@width - @level), continuation lines are indented, and the whole thing is
# appended to +output+ behind @level spaces of indentation.
def write_text( node, output )
  # Use the non-destructive gsub/squeeze: node.to_s may hand back a string
  # the node still holds on to (or a frozen one), so it must not be edited
  # in place.
  s = node.to_s.gsub(/\s/, ' ').squeeze(' ')
  s = wrap(s, @width - @level)
  s = indent_text(s, @level, " ", true)
  output << (' '*@level + s)
end
# Breaks +string+ into lines by repeatedly cutting at the last space found
# at or before index +width+, for as long as the remaining text is longer
# than +width+ and such a space exists. The space at each cut is dropped and
# the pieces are joined with newlines. Text with no usable space is left
# over-long rather than split mid-word.
def wrap(string, width)
  lines = []
  remainder = string
  loop do
    break unless remainder.length > width
    cut = remainder.rindex(' ', width)
    break unless cut
    lines << remainder[0...cut]
    remainder = remainder[cut+1..-1]
  end
  lines << remainder
  lines.join("\n")
end
+ output << " " if @ie_hack + output << "/" + else + output << ">" + # If compact and all children are text, and if the formatted output + # is less than the specified width, then try to print everything on + # one line + @level += @indentation + node.children.each { |child| + write( child, output ) + } + @level -= @indentation + output << "</#{node.expanded_name}" + output << "\n" + output << ' '*@level + end + output << ">" + end + + def write_text( node, output ) + output << node.to_s() + end + end + end +end diff --git a/jni/ruby/lib/rexml/functions.rb b/jni/ruby/lib/rexml/functions.rb new file mode 100644 index 0000000..2010be1 --- /dev/null +++ b/jni/ruby/lib/rexml/functions.rb @@ -0,0 +1,394 @@ +module REXML + # If you add a method, keep in mind two things: + # (1) the first argument will always be a list of nodes from which to + # filter. In the case of context methods (such as position), the function + # should return an array with a value for each child in the array. + # (2) all method calls from XML will have "-" replaced with "_". + # Therefore, in XML, "local-name()" is identical (and actually becomes) + # "local_name()" + module Functions + @@context = nil + @@namespace_context = {} + @@variables = {} + + def Functions::namespace_context=(x) ; @@namespace_context=x ; end + def Functions::variables=(x) ; @@variables=x ; end + def Functions::namespace_context ; @@namespace_context ; end + def Functions::variables ; @@variables ; end + + def Functions::context=(value); @@context = value; end + + def Functions::text( ) + if @@context[:node].node_type == :element + return @@context[:node].find_all{|n| n.node_type == :text}.collect{|n| n.value} + elsif @@context[:node].node_type == :text + return @@context[:node].value + else + return false + end + end + + # Returns the last node of the given list of nodes. 
# Helper method. Yields the node(s) the namespace-related functions should
# operate on:
# * no argument (nil): the current context node, @@context[:node]
# * an argument responding to +each+: every member, in iteration order
# * any other argument responding to +namespace+: the argument itself
# In the first two cases a node is yielded only when the +defined?+ test on
# its +namespace+ call succeeds; anything else is silently skipped.
def Functions::get_namespace( node_set = nil )
  if node_set == nil
    # The defined? guard covers the whole expression, including the
    # @@context[:node] lookup itself.
    yield @@context[:node] if defined? @@context[:node].namespace
  else
    if node_set.respond_to? :each
      node_set.each { |node| yield node if defined? node.namespace }
    elsif node_set.respond_to? :namespace
      yield node_set
    end
  end
end
# Converts +object+ to its string form:
# * an Array (exactly Array, not a subclass) is converted through its
#   first member
# * an object exposing +node_type+ gives its +value+ when it is an
#   attribute, its string_value when it is an element or document, and
#   +to_s+ for every other node type
# * +nil+ gives the empty string
# * anything else gives +to_s+
def Functions::string( object=nil )
  return string( object[0] ) if object.instance_of? Array
  unless defined? object.node_type
    return object.nil? ? "" : object.to_s
  end
  case object.node_type
  when :attribute
    object.value
  when :element, :document
    string_value(object)
  else
    object.to_s
  end
end
# Returns the part of +string+ that follows the first occurrence of +test+,
# or the empty string when +test+ does not occur. Both arguments are passed
# through Functions::string first, mirroring substring_before.
#
# The match is a plain substring search: characters that are special in a
# Regexp are taken literally and newlines in the remainder are kept.
def Functions::substring_after( string, test )
  ruby_string = string(string)
  ruby_test = string(test)
  ruby_index = ruby_string.index(ruby_test)
  return "" if ruby_index.nil?
  ruby_string[ (ruby_index + ruby_test.length)..-1 ]
end
# Strips leading and trailing whitespace and collapses every internal run of
# whitespace to a single space. With no argument the string form of the
# context node is used. An Array argument is normalised member by member
# (nil/false members map to nil) and an Array is returned.
def Functions::normalize_space( string=nil )
  string = string(@@context[:node]) if string.nil?
  if string.kind_of? Array
    # Normalise each member itself, not the string form of the whole array.
    string.collect{|x| x.to_s.strip.gsub(/\s+/um, ' ') if x}
  else
    string.to_s.strip.gsub(/\s+/um, ' ')
  end
end
# Tests whether the language tag +lang2+ (an xml:lang value) is covered by
# the requested language +lang1+, ignoring case: the two are equal, or
# +lang2+ is +lang1+ followed by a "-"-introduced suffix. So "en" covers
# "en" and "en-US" but not "english".
# Returns:: true or false
def Functions::compare_language lang1, lang2
  wanted = lang1.downcase
  actual = lang2.downcase
  actual == wanted || actual.start_with?("#{wanted}-")
end
# Constructs a new Instruction
# target::
#   If a String, it becomes the target of this instruction and +content+
#   becomes its content. If an Instruction, its target and content are
#   copied and +content+ is handed to the superclass constructor instead.
# content::
#   see +target+
#
# The stored content has surrounding whitespace removed. The string that
# was passed in is never modified, so frozen strings and strings shared
# with another Instruction are safe to pass.
#
# NOTE(review): a +target+ that is neither a String nor an Instruction
# falls through both branches, leaving the object without a target and
# without the superclass constructor having run -- confirm whether that is
# intended.
def initialize(target, content=nil)
  if target.kind_of? String
    super()
    @target = target
    @content = content
  elsif target.kind_of? Instruction
    super(content)
    @target = target.target
    @content = target.content
  end
  @content = @content.strip if @content
end
@node[0] + @node[-1].size + else + 0 + end + end + + def each + size.times { |x| yield( at(x+4) ) } + end + + def name + at(2) + end + + def name=( name_str, ns=nil ) + pfx = '' + pfx = "#{prefix(ns)}:" if ns + _old_put(2, "#{pfx}#{name_str}") + end + + def parent=( node ) + _old_put(1,node) + end + + def local_name + namesplit + @name + end + + def local_name=( name_str ) + _old_put( 1, "#@prefix:#{name_str}" ) + end + + def prefix( namespace=nil ) + prefix_of( self, namespace ) + end + + def namespace( prefix=prefix() ) + namespace_of( self, prefix ) + end + + def namespace=( namespace ) + @prefix = prefix( namespace ) + pfx = '' + pfx = "#@prefix:" if @prefix.size > 0 + _old_put(1, "#{pfx}#@name") + end + + def []( reference, ns=nil ) + if reference.kind_of? String + pfx = '' + pfx = "#{prefix(ns)}:" if ns + at(3)["#{pfx}#{reference}"] + elsif reference.kind_of? Range + _old_get( Range.new(4+reference.begin, reference.end, reference.exclude_end?) ) + else + _old_get( 4+reference ) + end + end + + def =~( path ) + XPath.match( self, path ) + end + + # Doesn't handle namespaces yet + def []=( reference, ns, value=nil ) + if reference.kind_of? String + value = ns unless value + at( 3 )[reference] = value + elsif reference.kind_of? Range + _old_put( Range.new(3+reference.begin, reference.end, reference.exclude_end?), ns ) + else + if value + _old_put( 4+reference, ns, value ) + else + _old_put( 4+reference, ns ) + end + end + end + + # Append a child to this element, optionally under a provided namespace. + # The namespace argument is ignored if the element argument is an Element + # object. Otherwise, the element argument is a string, the namespace (if + # provided) is the namespace the element is created in. 
+ def << element + if node_type() == :text + at(-1) << element + else + newnode = Node.new( element ) + newnode.parent = self + self.push( newnode ) + end + at(-1) + end + + def node_type + _old_get(0) + end + + def text=( foo ) + replace = at(4).kind_of?(String)? 1 : 0 + self._old_put(4,replace, normalizefoo) + end + + def root + context = self + context = context.at(1) while context.at(1) + end + + def has_name?( name, namespace = '' ) + at(3) == name and namespace() == namespace + end + + def children + self + end + + def parent + at(1) + end + + def to_s + + end + + private + + def namesplit + return if @name.defined? + at(2) =~ NAMESPLIT + @prefix = '' || $1 + @name = $2 + end + + def namespace_of( node, prefix=nil ) + if not prefix + name = at(2) + name =~ NAMESPLIT + prefix = $1 + end + to_find = 'xmlns' + to_find = "xmlns:#{prefix}" if not prefix.nil? + ns = at(3)[ to_find ] + ns ? ns : namespace_of( @node[0], prefix ) + end + + def prefix_of( node, namespace=nil ) + if not namespace + name = node.name + name =~ NAMESPLIT + $1 + else + ns = at(3).find { |k,v| v == namespace } + ns ? ns : prefix_of( node.parent, namespace ) + end + end + end + end +end diff --git a/jni/ruby/lib/rexml/namespace.rb b/jni/ruby/lib/rexml/namespace.rb new file mode 100644 index 0000000..aeb339e --- /dev/null +++ b/jni/ruby/lib/rexml/namespace.rb @@ -0,0 +1,47 @@ +require 'rexml/xmltokens' + +module REXML + # Adds named attributes to an object. 
+ module Namespace + # The name of the object, valid if set + attr_reader :name, :expanded_name + # The expanded name of the object, valid if name is set + attr_accessor :prefix + include XMLTokens + NAMESPLIT = /^(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})/u + + # Sets the name and the expanded name + def name=( name ) + @expanded_name = name + name =~ NAMESPLIT + if $1 + @prefix = $1 + else + @prefix = "" + @namespace = "" + end + @name = $2 + end + + # Compares names optionally WITH namespaces + def has_name?( other, ns=nil ) + if ns + return (namespace() == ns and name() == other) + elsif other.include? ":" + return fully_expanded_name == other + else + return name == other + end + end + + alias :local_name :name + + # Fully expand the name, even if the prefix wasn't specified in the + # source file. + def fully_expanded_name + ns = prefix + return "#{ns}:#@name" if ns.size > 0 + return @name + end + end +end diff --git a/jni/ruby/lib/rexml/node.rb b/jni/ruby/lib/rexml/node.rb new file mode 100644 index 0000000..cab6e9f --- /dev/null +++ b/jni/ruby/lib/rexml/node.rb @@ -0,0 +1,75 @@ +require "rexml/parseexception" +require "rexml/formatters/pretty" +require "rexml/formatters/default" + +module REXML + # Represents a node in the tree. Nodes are never encountered except as + # superclasses of other objects. Nodes have siblings. + module Node + # @return the next sibling (nil if unset) + def next_sibling_node + return nil if @parent.nil? + @parent[ @parent.index(self) + 1 ] + end + + # @return the previous sibling (nil if unset) + def previous_sibling_node + return nil if @parent.nil? + ind = @parent.index(self) + return nil if ind == 0 + @parent[ ind - 1 ] + end + + # indent:: + # *DEPRECATED* This parameter is now ignored. See the formatters in the + # REXML::Formatters package for changing the output style. + def to_s indent=nil + unless indent.nil? 
+ Kernel.warn( "#{self.class.name}.to_s(indent) parameter is deprecated" ) + f = REXML::Formatters::Pretty.new( indent ) + f.write( self, rv = "" ) + else + f = REXML::Formatters::Default.new + f.write( self, rv = "" ) + end + return rv + end + + def indent to, ind + if @parent and @parent.context and not @parent.context[:indentstyle].nil? then + indentstyle = @parent.context[:indentstyle] + else + indentstyle = ' ' + end + to << indentstyle*ind unless ind<1 + end + + def parent? + false; + end + + + # Visit all subnodes of +self+ recursively + def each_recursive(&block) # :yields: node + self.elements.each {|node| + block.call(node) + node.each_recursive(&block) + } + end + + # Find (and return) first subnode (recursively) for which the block + # evaluates to true. Returns +nil+ if none was found. + def find_first_recursive(&block) # :yields: node + each_recursive {|node| + return node if block.call(node) + } + return nil + end + + # Returns the position that +self+ holds in its parent's array, indexed + # from 1. + def index_in_parent + parent.index(self)+1 + end + end +end diff --git a/jni/ruby/lib/rexml/output.rb b/jni/ruby/lib/rexml/output.rb new file mode 100644 index 0000000..0c6cc7a --- /dev/null +++ b/jni/ruby/lib/rexml/output.rb @@ -0,0 +1,29 @@ +require 'rexml/encoding' + +module REXML + class Output + include Encoding + + attr_reader :encoding + + def initialize real_IO, encd="iso-8859-1" + @output = real_IO + self.encoding = encd + + @to_utf = encoding != 'UTF-8' + + if encoding == "UTF-16" + @output << "\ufeff".encode("UTF-16BE") + self.encoding = "UTF-16BE" + end + end + + def <<( content ) + @output << (@to_utf ? 
self.encode(content) : content) + end + + def to_s + "Output[#{encoding}]" + end + end +end diff --git a/jni/ruby/lib/rexml/parent.rb b/jni/ruby/lib/rexml/parent.rb new file mode 100644 index 0000000..2a07fca --- /dev/null +++ b/jni/ruby/lib/rexml/parent.rb @@ -0,0 +1,165 @@ +require "rexml/child" + +module REXML + # A parent has children, and has methods for accessing them. The Parent + # class is never encountered except as the superclass for some other + # object. + class Parent < Child + include Enumerable + + # Constructor + # @param parent if supplied, will be set as the parent of this object + def initialize parent=nil + super(parent) + @children = [] + end + + def add( object ) + object.parent = self + @children << object + object + end + + alias :push :add + alias :<< :push + + def unshift( object ) + object.parent = self + @children.unshift object + end + + def delete( object ) + found = false + @children.delete_if {|c| c.equal?(object) and found = true } + object.parent = nil if found + found ? object : nil + end + + def each(&block) + @children.each(&block) + end + + def delete_if( &block ) + @children.delete_if(&block) + end + + def delete_at( index ) + @children.delete_at index + end + + def each_index( &block ) + @children.each_index(&block) + end + + # Fetches a child at a given index + # @param index the Integer index of the child to fetch + def []( index ) + @children[index] + end + + alias :each_child :each + + + + # Set an index entry. See Array.[]= + # @param index the index of the element to set + # @param opt either the object to set, or an Integer length + # @param child if opt is an Integer, this is the child to set + # @return the parent (self) + def []=( *args ) + args[-1].parent = self + @children[*args[0..-2]] = args[-1] + end + + # Inserts an child before another child + # @param child1 this is either an xpath or an Element. If an Element, + # child2 will be inserted before child1 in the child list of the parent. 
+ # If an xpath, child2 will be inserted before the first child to match + # the xpath. + # @param child2 the child to insert + # @return the parent (self) + def insert_before( child1, child2 ) + if child1.kind_of? String + child1 = XPath.first( self, child1 ) + child1.parent.insert_before child1, child2 + else + ind = index(child1) + child2.parent.delete(child2) if child2.parent + @children[ind,0] = child2 + child2.parent = self + end + self + end + + # Inserts an child after another child + # @param child1 this is either an xpath or an Element. If an Element, + # child2 will be inserted after child1 in the child list of the parent. + # If an xpath, child2 will be inserted after the first child to match + # the xpath. + # @param child2 the child to insert + # @return the parent (self) + def insert_after( child1, child2 ) + if child1.kind_of? String + child1 = XPath.first( self, child1 ) + child1.parent.insert_after child1, child2 + else + ind = index(child1)+1 + child2.parent.delete(child2) if child2.parent + @children[ind,0] = child2 + child2.parent = self + end + self + end + + def to_a + @children.dup + end + + # Fetches the index of a given child + # @param child the child to get the index of + # @return the index of the child, or nil if the object is not a child + # of this parent. + def index( child ) + count = -1 + @children.find { |i| count += 1 ; i.hash == child.hash } + count + end + + # @return the number of children of this parent + def size + @children.size + end + + alias :length :size + + # Replaces one child with another, making sure the nodelist is correct + # @param to_replace the child to replace (must be a Child) + # @param replacement the child to insert into the nodelist (must be a + # Child) + def replace_child( to_replace, replacement ) + @children.map! {|c| c.equal?( to_replace ) ? replacement : c } + to_replace.parent = nil + replacement.parent = self + end + + # Deeply clones this object. 
This creates a complete duplicate of this + # Parent, including all descendants. + def deep_clone + cl = clone() + each do |child| + if child.kind_of? Parent + cl << child.deep_clone + else + cl << child.clone + end + end + cl + end + + alias :children :to_a + + def parent? + true + end + end +end diff --git a/jni/ruby/lib/rexml/parseexception.rb b/jni/ruby/lib/rexml/parseexception.rb new file mode 100644 index 0000000..0c4d55a --- /dev/null +++ b/jni/ruby/lib/rexml/parseexception.rb @@ -0,0 +1,51 @@ +module REXML + class ParseException < RuntimeError + attr_accessor :source, :parser, :continued_exception + + def initialize( message, source=nil, parser=nil, exception=nil ) + super(message) + @source = source + @parser = parser + @continued_exception = exception + end + + def to_s + # Quote the original exception, if there was one + if @continued_exception + err = @continued_exception.inspect + err << "\n" + err << @continued_exception.backtrace.join("\n") + err << "\n...\n" + else + err = "" + end + + # Get the stack trace and error message + err << super + + # Add contextual information + if @source + err << "\nLine: #{line}\n" + err << "Position: #{position}\n" + err << "Last 80 unconsumed characters:\n" + err << @source.buffer[0..80].force_encoding("ASCII-8BIT").gsub(/\n/, ' ') + end + + err + end + + def position + @source.current_line[0] if @source and defined? @source.current_line and + @source.current_line + end + + def line + @source.current_line[2] if @source and defined? 
@source.current_line and + @source.current_line + end + + def context + @source.current_line + end + end +end diff --git a/jni/ruby/lib/rexml/parsers/baseparser.rb b/jni/ruby/lib/rexml/parsers/baseparser.rb new file mode 100644 index 0000000..6a08b86 --- /dev/null +++ b/jni/ruby/lib/rexml/parsers/baseparser.rb @@ -0,0 +1,532 @@ +require 'rexml/parseexception' +require 'rexml/undefinednamespaceexception' +require 'rexml/source' +require 'set' + +module REXML + module Parsers + # = Using the Pull Parser + # <em>This API is experimental, and subject to change.</em> + # parser = PullParser.new( "<a>text<b att='val'/>txet</a>" ) + # while parser.has_next? + # res = parser.next + # puts res[1]['att'] if res.start_tag? and res[0] == 'b' + # end + # See the PullEvent class for information on the content of the results. + # The data is identical to the arguments passed for the various events to + # the StreamListener API. + # + # Notice that: + # parser = PullParser.new( "<a>BAD DOCUMENT" ) + # while parser.has_next? + # res = parser.next + # raise res[1] if res.error? + # end + # + # Nat Price gave me some good ideas for the API. 
+ class BaseParser + LETTER = '[:alpha:]' + DIGIT = '[:digit:]' + + COMBININGCHAR = '' # TODO + EXTENDER = '' # TODO + + NCNAME_STR= "[#{LETTER}_:][-[:alnum:]._:#{COMBININGCHAR}#{EXTENDER}]*" + NAME_STR= "(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})" + UNAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}" + + NAMECHAR = '[\-\w\.:]' + NAME = "([\\w:]#{NAMECHAR}*)" + NMTOKEN = "(?:#{NAMECHAR})+" + NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*" + REFERENCE = "&(?:#{NAME};|#\\d+;|#x[0-9a-fA-F]+;)" + REFERENCE_RE = /#{REFERENCE}/ + + DOCTYPE_START = /\A\s*<!DOCTYPE\s/um + DOCTYPE_END = /\A\s*\]\s*>/um + DOCTYPE_PATTERN = /\s*<!DOCTYPE\s+(.*?)(\[|>)/um + ATTRIBUTE_PATTERN = /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\4/um + COMMENT_START = /\A<!--/u + COMMENT_PATTERN = /<!--(.*?)-->/um + CDATA_START = /\A<!\[CDATA\[/u + CDATA_END = /\A\s*\]\s*>/um + CDATA_PATTERN = /<!\[CDATA\[(.*?)\]\]>/um + XMLDECL_START = /\A<\?xml\s/u; + XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um + INSTRUCTION_START = /\A<\?/u + INSTRUCTION_PATTERN = /<\?(.*?)(\s+.*?)?\?>/um + TAG_MATCH = /^<((?>#{NAME_STR}))\s*((?>\s+#{UNAME_STR}\s*=\s*(["']).*?\5)*)\s*(\/)?>/um + CLOSE_MATCH = /^\s*<\/(#{NAME_STR})\s*>/um + + VERSION = /\bversion\s*=\s*["'](.*?)['"]/um + ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um + STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um + + ENTITY_START = /\A\s*<!ENTITY/ + IDENTITY = /^([!\*\w\-]+)(\s+#{NCNAME_STR})?(\s+["'](.*?)['"])?(\s+['"](.*?)["'])?/u + ELEMENTDECL_START = /\A\s*<!ELEMENT/um + ELEMENTDECL_PATTERN = /\A\s*(<!ELEMENT.*?)>/um + SYSTEMENTITY = /\A\s*(%.*?;)\s*$/um + ENUMERATION = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)" + NOTATIONTYPE = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)" + ENUMERATEDTYPE = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))" + ATTTYPE = "(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})" + ATTVALUE = "(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')" + DEFAULTDECL = 
"(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))" + ATTDEF = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}" + ATTDEF_RE = /#{ATTDEF}/ + ATTLISTDECL_START = /\A\s*<!ATTLIST/um + ATTLISTDECL_PATTERN = /\A\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um + NOTATIONDECL_START = /\A\s*<!NOTATION/um + PUBLIC = /\A\s*<!NOTATION\s+(\w[\-\w]*)\s+(PUBLIC)\s+(["'])(.*?)\3(?:\s+(["'])(.*?)\5)?\s*>/um + SYSTEM = /\A\s*<!NOTATION\s+(\w[\-\w]*)\s+(SYSTEM)\s+(["'])(.*?)\3\s*>/um + + TEXT_PATTERN = /\A([^<]*)/um + + # Entity constants + PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#" + SYSTEMLITERAL = %Q{((?:"[^"]*")|(?:'[^']*'))} + PUBIDLITERAL = %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')} + EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))" + NDATADECL = "\\s+NDATA\\s+#{NAME}" + PEREFERENCE = "%#{NAME};" + ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))} + PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})" + ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))" + PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>" + GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>" + ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um + + EREFERENCE = /&(?!#{NAME};)/ + + DEFAULT_ENTITIES = { + 'gt' => [/>/, '>', '>', />/], + 'lt' => [/</, '<', '<', /</], + 'quot' => [/"/, '"', '"', /"/], + "apos" => [/'/, "'", "'", /'/] + } + + + ###################################################################### + # These are patterns to identify common markup errors, to make the + # error messages more informative. 
+ ###################################################################### + MISSING_ATTRIBUTE_QUOTES = /^<#{NAME_STR}\s+#{NAME_STR}\s*=\s*[^"']/um + + def initialize( source ) + self.stream = source + @listeners = [] + end + + def add_listener( listener ) + @listeners << listener + end + + attr_reader :source + + def stream=( source ) + @source = SourceFactory.create_from( source ) + @closed = nil + @document_status = nil + @tags = [] + @stack = [] + @entities = [] + @nsstack = [] + end + + def position + if @source.respond_to? :position + @source.position + else + # FIXME + 0 + end + end + + # Returns true if there are no more events + def empty? + return (@source.empty? and @stack.empty?) + end + + # Returns true if there are more events. Synonymous with !empty? + def has_next? + return !(@source.empty? and @stack.empty?) + end + + # Push an event back on the head of the stream. This method + # has (theoretically) infinite depth. + def unshift token + @stack.unshift(token) + end + + # Peek at the +depth+ event in the stack. The first element on the stack + # is at depth 0. If +depth+ is -1, will parse to the end of the input + # stream and return the last event, which is always :end_document. + # Be aware that this causes the stream to be parsed up to the +depth+ + # event, so you can effectively pre-parse the entire document (pull the + # entire thing into memory) using this method. + def peek depth=0 + raise %Q[Illegal argument "#{depth}"] if depth < -1 + temp = [] + if depth == -1 + temp.push(pull()) until empty? + else + while @stack.size+temp.size < depth+1 + temp.push(pull()) + end + end + @stack += temp if temp.size > 0 + @stack[depth] + end + + # Returns the next event. This is a +PullEvent+ object. + def pull + pull_event.tap do |event| + @listeners.each do |listener| + listener.receive event + end + end + end + + def pull_event + if @closed + x, @closed = @closed, nil + return [ :end_element, x ] + end + return [ :end_document ] if empty? 
+ return @stack.shift if @stack.size > 0 + #STDERR.puts @source.encoding + @source.read if @source.buffer.size<2 + #STDERR.puts "BUFFER = #{@source.buffer.inspect}" + if @document_status == nil + #@source.consume( /^\s*/um ) + word = @source.match( /^((?:\s+)|(?:<[^>]*>))/um ) + word = word[1] unless word.nil? + #STDERR.puts "WORD = #{word.inspect}" + case word + when COMMENT_START + return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ] + when XMLDECL_START + #STDERR.puts "XMLDECL" + results = @source.match( XMLDECL_PATTERN, true )[1] + version = VERSION.match( results ) + version = version[1] unless version.nil? + encoding = ENCODING.match(results) + encoding = encoding[1] unless encoding.nil? + if need_source_encoding_update?(encoding) + @source.encoding = encoding + end + if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding + encoding = "UTF-16" + end + standalone = STANDALONE.match(results) + standalone = standalone[1] unless standalone.nil? + return [ :xmldecl, version, encoding, standalone ] + when INSTRUCTION_START + return [ :processing_instruction, *@source.match(INSTRUCTION_PATTERN, true)[1,2] ] + when DOCTYPE_START + md = @source.match( DOCTYPE_PATTERN, true ) + @nsstack.unshift(curr_ns=Set.new) + identity = md[1] + close = md[2] + identity =~ IDENTITY + name = $1 + raise REXML::ParseException.new("DOCTYPE is missing a name") if name.nil? + pub_sys = $2.nil? ? nil : $2.strip + long_name = $4.nil? ? nil : $4.strip + uri = $6.nil? ? 
nil : $6.strip + args = [ :start_doctype, name, pub_sys, long_name, uri ] + if close == ">" + @document_status = :after_doctype + @source.read if @source.buffer.size<2 + md = @source.match(/^\s*/um, true) + @stack << [ :end_doctype ] + else + @document_status = :in_doctype + end + return args + when /^\s+/ + else + @document_status = :after_doctype + @source.read if @source.buffer.size<2 + md = @source.match(/\s*/um, true) + if @source.encoding == "UTF-8" + @source.buffer.force_encoding(::Encoding::UTF_8) + end + end + end + if @document_status == :in_doctype + md = @source.match(/\s*(.*?>)/um) + case md[1] + when SYSTEMENTITY + match = @source.match( SYSTEMENTITY, true )[1] + return [ :externalentity, match ] + + when ELEMENTDECL_START + return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ] + + when ENTITY_START + match = @source.match( ENTITYDECL, true ).to_a.compact + match[0] = :entitydecl + ref = false + if match[1] == '%' + ref = true + match.delete_at 1 + end + # Now we have to sort out what kind of entity reference this is + if match[2] == 'SYSTEM' + # External reference + match[3] = match[3][1..-2] # PUBID + match.delete_at(4) if match.size > 4 # Chop out NDATA decl + # match is [ :entity, name, SYSTEM, pubid(, ndata)? ] + elsif match[2] == 'PUBLIC' + # External reference + match[3] = match[3][1..-2] # PUBID + match[4] = match[4][1..-2] # HREF + match.delete_at(5) if match.size > 5 # Chop out NDATA decl + # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ] + else + match[2] = match[2][1..-2] + match.pop if match.size == 4 + # match is [ :entity, name, value ] + end + match << '%' if ref + return match + when ATTLISTDECL_START + md = @source.match( ATTLISTDECL_PATTERN, true ) + raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil? + element = md[1] + contents = md[0] + + pairs = {} + values = md[0].scan( ATTDEF_RE ) + values.each do |attdef| + unless attdef[3] == "#IMPLIED" + attdef.compact! 
+ val = attdef[3] + val = attdef[4] if val == "#FIXED " + pairs[attdef[0]] = val + if attdef[0] =~ /^xmlns:(.*)/ + @nsstack[0] << $1 + end + end + end + return [ :attlistdecl, element, pairs, contents ] + when NOTATIONDECL_START + md = nil + if @source.match( PUBLIC ) + md = @source.match( PUBLIC, true ) + vals = [md[1],md[2],md[4],md[6]] + elsif @source.match( SYSTEM ) + md = @source.match( SYSTEM, true ) + vals = [md[1],md[2],nil,md[4]] + else + raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source ) + end + return [ :notationdecl, *vals ] + when DOCTYPE_END + @document_status = :after_doctype + @source.match( DOCTYPE_END, true ) + return [ :end_doctype ] + end + end + begin + if @source.buffer[0] == ?< + if @source.buffer[1] == ?/ + @nsstack.shift + last_tag = @tags.pop + #md = @source.match_to_consume( '>', CLOSE_MATCH) + md = @source.match( CLOSE_MATCH, true ) + raise REXML::ParseException.new( "Missing end tag for "+ + "'#{last_tag}' (got \"#{md[1]}\")", + @source) unless last_tag == md[1] + return [ :end_element, last_tag ] + elsif @source.buffer[1] == ?! + md = @source.match(/\A(\s*[^>]*>)/um) + #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}" + raise REXML::ParseException.new("Malformed node", @source) unless md + if md[0][2] == ?- + md = @source.match( COMMENT_PATTERN, true ) + + case md[1] + when /--/, /-\z/ + raise REXML::ParseException.new("Malformed comment", @source) + end + + return [ :comment, md[1] ] if md + else + md = @source.match( CDATA_PATTERN, true ) + return [ :cdata, md[1] ] if md + end + raise REXML::ParseException.new( "Declarations can only occur "+ + "in the doctype declaration.", @source) + elsif @source.buffer[1] == ?? 
+ md = @source.match( INSTRUCTION_PATTERN, true ) + return [ :processing_instruction, md[1], md[2] ] if md + raise REXML::ParseException.new( "Bad instruction declaration", + @source) + else + # Get the next tag + md = @source.match(TAG_MATCH, true) + unless md + # Check for missing attribute quotes + raise REXML::ParseException.new("missing attribute quote", @source) if @source.match(MISSING_ATTRIBUTE_QUOTES ) + raise REXML::ParseException.new("malformed XML: missing tag start", @source) + end + attributes = {} + prefixes = Set.new + prefixes << md[2] if md[2] + @nsstack.unshift(curr_ns=Set.new) + if md[4].size > 0 + attrs = md[4].scan( ATTRIBUTE_PATTERN ) + raise REXML::ParseException.new( "error parsing attributes: [#{attrs.join ', '}], excess = \"#$'\"", @source) if $' and $'.strip.size > 0 + attrs.each do |attr_name, prefix, local_part, quote, value| + if prefix == "xmlns" + if local_part == "xml" + if value != "http://www.w3.org/XML/1998/namespace" + msg = "The 'xml' prefix must not be bound to any other namespace "+ + "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" + raise REXML::ParseException.new( msg, @source, self ) + end + elsif local_part == "xmlns" + msg = "The 'xmlns' prefix must not be declared "+ + "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" + raise REXML::ParseException.new( msg, @source, self) + end + curr_ns << local_part + elsif prefix + prefixes << prefix unless prefix == "xml" + end + + if attributes.has_key?(attr_name) + msg = "Duplicate attribute #{attr_name.inspect}" + raise REXML::ParseException.new(msg, @source, self) + end + + attributes[attr_name] = value + end + end + + # Verify that all of the prefixes have been defined + for prefix in prefixes + unless @nsstack.find{|k| k.member?(prefix)} + raise UndefinedNamespaceException.new(prefix,@source,self) + end + end + + if md[6] + @closed = md[1] + @nsstack.shift + else + @tags.push( md[1] ) + end + return [ :start_element, md[1], attributes ] + end + else + md = @source.match( 
TEXT_PATTERN, true ) + if md[0].length == 0 + @source.match( /(\s+)/, true ) + end + #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0 + #return [ :text, "" ] if md[0].length == 0 + # unnormalized = Text::unnormalize( md[1], self ) + # return PullEvent.new( :text, md[1], unnormalized ) + return [ :text, md[1] ] + end + rescue REXML::UndefinedNamespaceException + raise + rescue REXML::ParseException + raise + rescue Exception, NameError => error + raise REXML::ParseException.new( "Exception parsing", + @source, self, (error ? error : $!) ) + end + return [ :dummy ] + end + private :pull_event + + def entity( reference, entities ) + value = nil + value = entities[ reference ] if entities + if not value + value = DEFAULT_ENTITIES[ reference ] + value = value[2] if value + end + unnormalize( value, entities ) if value + end + + # Escapes all possible entities + def normalize( input, entities=nil, entity_filter=nil ) + copy = input.clone + # Doing it like this rather than in a loop improves the speed + copy.gsub!( EREFERENCE, '&' ) + entities.each do |key, value| + copy.gsub!( value, "&#{key};" ) unless entity_filter and + entity_filter.include?(entity) + end if entities + copy.gsub!( EREFERENCE, '&' ) + DEFAULT_ENTITIES.each do |key, value| + copy.gsub!( value[3], value[1] ) + end + copy + end + + # Unescapes all possible entities + def unnormalize( string, entities=nil, filter=nil ) + rv = string.clone + rv.gsub!( /\r\n?/, "\n" ) + matches = rv.scan( REFERENCE_RE ) + return rv if matches.size == 0 + rv.gsub!( /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) { + m=$1 + m = "0#{m}" if m[0] == ?x + [Integer(m)].pack('U*') + } + matches.collect!{|x|x[0]}.compact! 
+ if matches.size > 0 + matches.each do |entity_reference| + unless filter and filter.include?(entity_reference) + entity_value = entity( entity_reference, entities ) + if entity_value + re = /&#{entity_reference};/ + rv.gsub!( re, entity_value ) + else + er = DEFAULT_ENTITIES[entity_reference] + rv.gsub!( er[0], er[2] ) if er + end + end + end + rv.gsub!( /&/, '&' ) + end + rv + end + + private + def need_source_encoding_update?(xml_declaration_encoding) + return false if xml_declaration_encoding.nil? + return false if /\AUTF-16\z/i =~ xml_declaration_encoding + true + end + end + end +end + +=begin + case event[0] + when :start_element + when :text + when :end_element + when :processing_instruction + when :cdata + when :comment + when :xmldecl + when :start_doctype + when :end_doctype + when :externalentity + when :elementdecl + when :entity + when :attlistdecl + when :notationdecl + when :end_doctype + end +=end diff --git a/jni/ruby/lib/rexml/parsers/lightparser.rb b/jni/ruby/lib/rexml/parsers/lightparser.rb new file mode 100644 index 0000000..8104168 --- /dev/null +++ b/jni/ruby/lib/rexml/parsers/lightparser.rb @@ -0,0 +1,58 @@ +require 'rexml/parsers/streamparser' +require 'rexml/parsers/baseparser' +require 'rexml/light/node' + +module REXML + module Parsers + class LightParser + def initialize stream + @stream = stream + @parser = REXML::Parsers::BaseParser.new( stream ) + end + + def add_listener( listener ) + @parser.add_listener( listener ) + end + + def rewind + @stream.rewind + @parser.stream = @stream + end + + def parse + root = context = [ :document ] + while true + event = @parser.pull + case event[0] + when :end_document + break + when :start_element, :start_doctype + new_node = event + context << new_node + new_node[1,0] = [context] + context = new_node + when :end_element, :end_doctype + context = context[1] + else + new_node = event + context << new_node + new_node[1,0] = [context] + end + end + root + end + end + + # An element is an array. 
The array contains: + # 0 The parent element + # 1 The tag name + # 2 A hash of attributes + # 3..-1 The child elements + # An element is an array of size > 3 + # Text is a String + # PIs are [ :processing_instruction, target, data ] + # Comments are [ :comment, data ] + # DocTypes are DocType structs + # The root is an array with XMLDecls, Text, DocType, Array, Text + end +end diff --git a/jni/ruby/lib/rexml/parsers/pullparser.rb b/jni/ruby/lib/rexml/parsers/pullparser.rb new file mode 100644 index 0000000..68a4ff7 --- /dev/null +++ b/jni/ruby/lib/rexml/parsers/pullparser.rb @@ -0,0 +1,196 @@ +require 'forwardable' + +require 'rexml/parseexception' +require 'rexml/parsers/baseparser' +require 'rexml/xmltokens' + +module REXML + module Parsers + # = Using the Pull Parser + # <em>This API is experimental, and subject to change.</em> + # parser = PullParser.new( "<a>text<b att='val'/>txet</a>" ) + # while parser.has_next? + # res = parser.next + # puts res[1]['att'] if res.start_tag? and res[0] == 'b' + # end + # See the PullEvent class for information on the content of the results. + # The data is identical to the arguments passed for the various events to + # the StreamListener API. + # + # Notice that: + # parser = PullParser.new( "<a>BAD DOCUMENT" ) + # while parser.has_next? + # res = parser.next + # raise res[1] if res.error? + # end + # + # Nat Price gave me some good ideas for the API. + class PullParser + include XMLTokens + extend Forwardable + + def_delegators( :@parser, :has_next? ) + def_delegators( :@parser, :entity ) + def_delegators( :@parser, :empty? ) + def_delegators( :@parser, :source ) + + def initialize stream + @entities = {} + @listeners = nil + @parser = BaseParser.new( stream ) + @my_stack = [] + end + + def add_listener( listener ) + @listeners = [] unless @listeners + @listeners << listener + end + + def each + while has_next? 
+ yield self.pull + end + end + + def peek depth=0 + if @my_stack.length <= depth + (depth - @my_stack.length + 1).times { + e = PullEvent.new(@parser.pull) + @my_stack.push(e) + } + end + @my_stack[depth] + end + + def pull + return @my_stack.shift if @my_stack.length > 0 + + event = @parser.pull + case event[0] + when :entitydecl + @entities[ event[1] ] = + event[2] unless event[2] =~ /PUBLIC|SYSTEM/ + when :text + unnormalized = @parser.unnormalize( event[1], @entities ) + event << unnormalized + end + PullEvent.new( event ) + end + + def unshift token + @my_stack.unshift token + end + end + + # A parsing event. The contents of the event are accessed as an +Array?, + # and the type is given either by the ...? methods, or by accessing the + # +type+ accessor. The contents of this object vary from event to event, + # but are identical to the arguments passed to +StreamListener+s for each + # event. + class PullEvent + # The type of this event. Will be one of :tag_start, :tag_end, :text, + # :processing_instruction, :comment, :doctype, :attlistdecl, :entitydecl, + # :notationdecl, :entity, :cdata, :xmldecl, or :error. + def initialize(arg) + @contents = arg + end + + def []( start, endd=nil) + if start.kind_of? Range + @contents.slice( start.begin+1 .. start.end ) + elsif start.kind_of? Numeric + if endd.nil? + @contents.slice( start+1 ) + else + @contents.slice( start+1, endd ) + end + else + raise "Illegal argument #{start.inspect} (#{start.class})" + end + end + + def event_type + @contents[0] + end + + # Content: [ String tag_name, Hash attributes ] + def start_element? + @contents[0] == :start_element + end + + # Content: [ String tag_name ] + def end_element? + @contents[0] == :end_element + end + + # Content: [ String raw_text, String unnormalized_text ] + def text? + @contents[0] == :text + end + + # Content: [ String text ] + def instruction? + @contents[0] == :processing_instruction + end + + # Content: [ String text ] + def comment? 
+ @contents[0] == :comment + end + + # Content: [ String name, String pub_sys, String long_name, String uri ] + def doctype? + @contents[0] == :start_doctype + end + + # Content: [ String text ] + def attlistdecl? + @contents[0] == :attlistdecl + end + + # Content: [ String text ] + def elementdecl? + @contents[0] == :elementdecl + end + + # Due to the wonders of DTDs, an entity declaration can be just about + # anything. There's no way to normalize it; you'll have to interpret the + # content yourself. However, the following is true: + # + # * If the entity declaration is an internal entity: + # [ String name, String value ] + # Content: [ String text ] + def entitydecl? + @contents[0] == :entitydecl + end + + # Content: [ String text ] + def notationdecl? + @contents[0] == :notationdecl + end + + # Content: [ String text ] + def entity? + @contents[0] == :entity + end + + # Content: [ String text ] + def cdata? + @contents[0] == :cdata + end + + # Content: [ String version, String encoding, String standalone ] + def xmldecl? + @contents[0] == :xmldecl + end + + def error? 
+ @contents[0] == :error + end + + def inspect + @contents[0].to_s + ": " + @contents[1..-1].inspect + end + end + end +end diff --git a/jni/ruby/lib/rexml/parsers/sax2parser.rb b/jni/ruby/lib/rexml/parsers/sax2parser.rb new file mode 100644 index 0000000..a72c0a7 --- /dev/null +++ b/jni/ruby/lib/rexml/parsers/sax2parser.rb @@ -0,0 +1,272 @@ +require 'rexml/parsers/baseparser' +require 'rexml/parseexception' +require 'rexml/namespace' +require 'rexml/text' + +module REXML + module Parsers + # SAX2Parser + class SAX2Parser + def initialize source + @parser = BaseParser.new(source) + @listeners = [] + @procs = [] + @namespace_stack = [] + @has_listeners = false + @tag_stack = [] + @entities = {} + end + + def source + @parser.source + end + + def add_listener( listener ) + @parser.add_listener( listener ) + end + + # Listen arguments: + # + # Symbol, Array, Block + # Listen to Symbol events on Array elements + # Symbol, Block + # Listen to Symbol events + # Array, Listener + # Listen to all events on Array elements + # Array, Block + # Listen to :start_element events on Array elements + # Listener + # Listen to All events + # + # Symbol can be one of: :start_element, :end_element, + # :start_prefix_mapping, :end_prefix_mapping, :characters, + # :processing_instruction, :doctype, :attlistdecl, :elementdecl, + # :entitydecl, :notationdecl, :cdata, :xmldecl, :comment + # + # There is an additional symbol that can be listened for: :progress. + # This will be called for every event generated, passing in the current + # stream position. + # + # Array contains regular expressions or strings which will be matched + # against fully qualified element names. + # + # Listener must implement the methods in SAX2Listener + # + # Block will be passed the same arguments as a SAX2Listener method would + # be, where the method name is the same as the matched Symbol. + # See the SAX2Listener for more information. + def listen( *args, &blok ) + if args[0].kind_of? 
Symbol + if args.size == 2 + args[1].each { |match| @procs << [args[0], match, blok] } + else + add( [args[0], nil, blok] ) + end + elsif args[0].kind_of? Array + if args.size == 2 + args[0].each { |match| add( [nil, match, args[1]] ) } + else + args[0].each { |match| add( [ :start_element, match, blok ] ) } + end + else + add([nil, nil, args[0]]) + end + end + + def deafen( listener=nil, &blok ) + if listener + @listeners.delete_if {|item| item[-1] == listener } + @has_listeners = false if @listeners.size == 0 + else + @procs.delete_if {|item| item[-1] == blok } + end + end + + def parse + @procs.each { |sym,match,block| block.call if sym == :start_document } + @listeners.each { |sym,match,block| + block.start_document if sym == :start_document or sym.nil? + } + context = [] + while true + event = @parser.pull + case event[0] + when :end_document + handle( :end_document ) + break + when :start_doctype + handle( :doctype, *event[1..-1]) + when :end_doctype + context = context[1] + when :start_element + @tag_stack.push(event[1]) + # find the observers for namespaces + procs = get_procs( :start_prefix_mapping, event[1] ) + listeners = get_listeners( :start_prefix_mapping, event[1] ) + if procs or listeners + # break out the namespace declarations + # The attributes live in event[2] + event[2].each {|n, v| event[2][n] = @parser.normalize(v)} + nsdecl = event[2].find_all { |n, value| n =~ /^xmlns(:|$)/ } + nsdecl.collect! 
{ |n, value| [ n[6..-1], value ] } + @namespace_stack.push({}) + nsdecl.each do |n,v| + @namespace_stack[-1][n] = v + # notify observers of namespaces + procs.each { |ob| ob.call( n, v ) } if procs + listeners.each { |ob| ob.start_prefix_mapping(n, v) } if listeners + end + end + event[1] =~ Namespace::NAMESPLIT + prefix = $1 + local = $2 + uri = get_namespace(prefix) + # find the observers for start_element + procs = get_procs( :start_element, event[1] ) + listeners = get_listeners( :start_element, event[1] ) + # notify observers + procs.each { |ob| ob.call( uri, local, event[1], event[2] ) } if procs + listeners.each { |ob| + ob.start_element( uri, local, event[1], event[2] ) + } if listeners + when :end_element + @tag_stack.pop + event[1] =~ Namespace::NAMESPLIT + prefix = $1 + local = $2 + uri = get_namespace(prefix) + # find the observers for start_element + procs = get_procs( :end_element, event[1] ) + listeners = get_listeners( :end_element, event[1] ) + # notify observers + procs.each { |ob| ob.call( uri, local, event[1] ) } if procs + listeners.each { |ob| + ob.end_element( uri, local, event[1] ) + } if listeners + + namespace_mapping = @namespace_stack.pop + # find the observers for namespaces + procs = get_procs( :end_prefix_mapping, event[1] ) + listeners = get_listeners( :end_prefix_mapping, event[1] ) + if procs or listeners + namespace_mapping.each do |ns_prefix, ns_uri| + # notify observers of namespaces + procs.each { |ob| ob.call( ns_prefix ) } if procs + listeners.each { |ob| ob.end_prefix_mapping(ns_prefix) } if listeners + end + end + when :text + #normalized = @parser.normalize( event[1] ) + #handle( :characters, normalized ) + copy = event[1].clone + + esub = proc { |match| + if @entities.has_key?($1) + @entities[$1].gsub(Text::REFERENCE, &esub) + else + match + end + } + + copy.gsub!( Text::REFERENCE, &esub ) + copy.gsub!( Text::NUMERICENTITY ) {|m| + m=$1 + m = "0#{m}" if m[0] == ?x + [Integer(m)].pack('U*') + } + handle( :characters, copy 
) + when :entitydecl + handle_entitydecl( event ) + when :processing_instruction, :comment, :attlistdecl, + :elementdecl, :cdata, :notationdecl, :xmldecl + handle( *event ) + end + handle( :progress, @parser.position ) + end + end + + private + def handle( symbol, *arguments ) + tag = @tag_stack[-1] + procs = get_procs( symbol, tag ) + listeners = get_listeners( symbol, tag ) + # notify observers + procs.each { |ob| ob.call( *arguments ) } if procs + listeners.each { |l| + l.send( symbol.to_s, *arguments ) + } if listeners + end + + def handle_entitydecl( event ) + @entities[ event[1] ] = event[2] if event.size == 3 + parameter_reference_p = false + case event[2] + when "SYSTEM" + if event.size == 5 + if event.last == "%" + parameter_reference_p = true + else + event[4, 0] = "NDATA" + end + end + when "PUBLIC" + if event.size == 6 + if event.last == "%" + parameter_reference_p = true + else + event[5, 0] = "NDATA" + end + end + else + parameter_reference_p = (event.size == 4) + end + event[1, 0] = event.pop if parameter_reference_p + handle( event[0], event[1..-1] ) + end + + # The following methods are duplicates, but it is faster than using + # a helper + def get_procs( symbol, name ) + return nil if @procs.size == 0 + @procs.find_all do |sym, match, block| + ( + (sym.nil? or symbol == sym) and + ((name.nil? and match.nil?) or match.nil? or ( + (name == match) or + (match.kind_of? Regexp and name =~ match) + ) + ) + ) + end.collect{|x| x[-1]} + end + def get_listeners( symbol, name ) + return nil if @listeners.size == 0 + @listeners.find_all do |sym, match, block| + ( + (sym.nil? or symbol == sym) and + ((name.nil? and match.nil?) or match.nil? or ( + (name == match) or + (match.kind_of? Regexp and name =~ match) + ) + ) + ) + end.collect{|x| x[-1]} + end + + def add( pair ) + if pair[-1].respond_to? :call + @procs << pair unless @procs.include? pair + else + @listeners << pair unless @listeners.include? 
pair + @has_listeners = true + end + end + + def get_namespace( prefix ) + uris = (@namespace_stack.find_all { |ns| not ns[prefix].nil? }) || + (@namespace_stack.find { |ns| not ns[nil].nil? }) + uris[-1][prefix] unless uris.nil? or 0 == uris.size + end + end + end +end diff --git a/jni/ruby/lib/rexml/parsers/streamparser.rb b/jni/ruby/lib/rexml/parsers/streamparser.rb new file mode 100644 index 0000000..9ea65ed --- /dev/null +++ b/jni/ruby/lib/rexml/parsers/streamparser.rb @@ -0,0 +1,52 @@ +require "rexml/parsers/baseparser" + +module REXML + module Parsers + class StreamParser + def initialize source, listener + @listener = listener + @parser = BaseParser.new( source ) + end + + def add_listener( listener ) + @parser.add_listener( listener ) + end + + def parse + # entity string + while true + event = @parser.pull + case event[0] + when :end_document + return + when :start_element + attrs = event[2].each do |n, v| + event[2][n] = @parser.unnormalize( v ) + end + @listener.tag_start( event[1], attrs ) + when :end_element + @listener.tag_end( event[1] ) + when :text + normalized = @parser.unnormalize( event[1] ) + @listener.text( normalized ) + when :processing_instruction + @listener.instruction( *event[1,2] ) + when :start_doctype + @listener.doctype( *event[1..-1] ) + when :end_doctype + # FIXME: remove this condition for milestone:3.2 + @listener.doctype_end if @listener.respond_to? 
:doctype_end + when :comment, :attlistdecl, :cdata, :xmldecl, :elementdecl + @listener.send( event[0].to_s, *event[1..-1] ) + when :entitydecl, :notationdecl + @listener.send( event[0].to_s, event[1..-1] ) + when :externalentity + entity_reference = event[1] + content = entity_reference.gsub(/\A%|;\z/, "") + @listener.entity(content) + end + end + end + end + end +end diff --git a/jni/ruby/lib/rexml/parsers/treeparser.rb b/jni/ruby/lib/rexml/parsers/treeparser.rb new file mode 100644 index 0000000..68edb77 --- /dev/null +++ b/jni/ruby/lib/rexml/parsers/treeparser.rb @@ -0,0 +1,100 @@ +require 'rexml/validation/validationexception' +require 'rexml/undefinednamespaceexception' + +module REXML + module Parsers + class TreeParser + def initialize( source, build_context = Document.new ) + @build_context = build_context + @parser = Parsers::BaseParser.new( source ) + end + + def add_listener( listener ) + @parser.add_listener( listener ) + end + + def parse + tag_stack = [] + in_doctype = false + entities = nil + begin + while true + event = @parser.pull + #STDERR.puts "TREEPARSER GOT #{event.inspect}" + case event[0] + when :end_document + unless tag_stack.empty? + raise ParseException.new("No close tag for #{@build_context.xpath}", + @parser.source, @parser) + end + return + when :start_element + tag_stack.push(event[1]) + el = @build_context = @build_context.add_element( event[1] ) + event[2].each do |key, value| + el.attributes[key]=Attribute.new(key,value,self) + end + when :end_element + tag_stack.pop + @build_context = @build_context.parent + when :text + if not in_doctype + if @build_context[-1].instance_of? 
Text + @build_context[-1] << event[1] + else + @build_context.add( + Text.new(event[1], @build_context.whitespace, nil, true) + ) unless ( + @build_context.ignore_whitespace_nodes and + event[1].strip.size==0 + ) + end + end + when :comment + c = Comment.new( event[1] ) + @build_context.add( c ) + when :cdata + c = CData.new( event[1] ) + @build_context.add( c ) + when :processing_instruction + @build_context.add( Instruction.new( event[1], event[2] ) ) + when :end_doctype + in_doctype = false + entities.each { |k,v| entities[k] = @build_context.entities[k].value } + @build_context = @build_context.parent + when :start_doctype + doctype = DocType.new( event[1..-1], @build_context ) + @build_context = doctype + entities = {} + in_doctype = true + when :attlistdecl + n = AttlistDecl.new( event[1..-1] ) + @build_context.add( n ) + when :externalentity + n = ExternalEntity.new( event[1] ) + @build_context.add( n ) + when :elementdecl + n = ElementDecl.new( event[1] ) + @build_context.add(n) + when :entitydecl + entities[ event[1] ] = event[2] unless event[2] =~ /PUBLIC|SYSTEM/ + @build_context.add(Entity.new(event)) + when :notationdecl + n = NotationDecl.new( *event[1..-1] ) + @build_context.add( n ) + when :xmldecl + x = XMLDecl.new( event[1], event[2], event[3] ) + @build_context.add( x ) + end + end + rescue REXML::Validation::ValidationException + raise + rescue REXML::ParseException + raise + rescue + raise ParseException.new( $!.message, @parser.source, @parser, $! 
) + end + end + end + end +end diff --git a/jni/ruby/lib/rexml/parsers/ultralightparser.rb b/jni/ruby/lib/rexml/parsers/ultralightparser.rb new file mode 100644 index 0000000..4e2d7a8 --- /dev/null +++ b/jni/ruby/lib/rexml/parsers/ultralightparser.rb @@ -0,0 +1,56 @@ +require 'rexml/parsers/streamparser' +require 'rexml/parsers/baseparser' + +module REXML + module Parsers + class UltraLightParser + def initialize stream + @stream = stream + @parser = REXML::Parsers::BaseParser.new( stream ) + end + + def add_listener( listener ) + @parser.add_listener( listener ) + end + + def rewind + @stream.rewind + @parser.stream = @stream + end + + def parse + root = context = [] + while true + event = @parser.pull + case event[0] + when :end_document + break + when :end_doctype + context = context[1] + when :start_element, :start_doctype + context << event + event[1,0] = [context] + context = event + when :end_element + context = context[1] + else + context << event + end + end + root + end + end + + # An element is an array. The array contains: + # 0 The parent element + # 1 The tag name + # 2 A hash of attributes + # 3..-1 The child elements + # An element is an array of size > 3 + # Text is a String + # PIs are [ :processing_instruction, target, data ] + # Comments are [ :comment, data ] + # DocTypes are DocType structs + # The root is an array with XMLDecls, Text, DocType, Array, Text + end +end diff --git a/jni/ruby/lib/rexml/parsers/xpathparser.rb b/jni/ruby/lib/rexml/parsers/xpathparser.rb new file mode 100644 index 0000000..57767fb --- /dev/null +++ b/jni/ruby/lib/rexml/parsers/xpathparser.rb @@ -0,0 +1,656 @@ +require 'rexml/namespace' +require 'rexml/xmltokens' + +module REXML + module Parsers + # You don't want to use this class. Really. Use XPath, which is a wrapper + # for this class. Believe me. You don't want to poke around in here. + # There is strange, dark magic at work in this code. Beware. Go back! Go + # back while you still can! 
+ class XPathParser + include XMLTokens + LITERAL = /^'([^']*)'|^"([^"]*)"/u + + def namespaces=( namespaces ) + Functions::namespace_context = namespaces + @namespaces = namespaces + end + + def parse path + path = path.dup + path.gsub!(/([\(\[])\s+/, '\1') # Strip ignorable spaces + path.gsub!( /\s+([\]\)])/, '\1') + parsed = [] + OrExpr(path, parsed) + parsed + end + + def predicate path + parsed = [] + Predicate( "[#{path}]", parsed ) + parsed + end + + def abbreviate( path ) + path = path.kind_of?(String) ? parse( path ) : path + string = "" + document = false + while path.size > 0 + op = path.shift + case op + when :node + when :attribute + string << "/" if string.size > 0 + string << "@" + when :child + string << "/" if string.size > 0 + when :descendant_or_self + string << "/" + when :self + string << "." + when :parent + string << ".." + when :any + string << "*" + when :text + string << "text()" + when :following, :following_sibling, + :ancestor, :ancestor_or_self, :descendant, + :namespace, :preceding, :preceding_sibling + string << "/" unless string.size == 0 + string << op.to_s.tr("_", "-") + string << "::" + when :qname + prefix = path.shift + name = path.shift + string << prefix+":" if prefix.size > 0 + string << name + when :predicate + string << '[' + string << predicate_to_string( path.shift ) {|x| abbreviate( x ) } + string << ']' + when :document + document = true + when :function + string << path.shift + string << "( " + string << predicate_to_string( path.shift[0] ) {|x| abbreviate( x )} + string << " )" + when :literal + string << %Q{ "#{path.shift}" } + else + string << "/" unless string.size == 0 + string << "UNKNOWN(" + string << op.inspect + string << ")" + end + end + string = "/"+string if document + return string + end + + def expand( path ) + path = path.kind_of?(String) ? 
parse( path ) : path + string = "" + document = false + while path.size > 0 + op = path.shift + case op + when :node + string << "node()" + when :attribute, :child, :following, :following_sibling, + :ancestor, :ancestor_or_self, :descendant, :descendant_or_self, + :namespace, :preceding, :preceding_sibling, :self, :parent + string << "/" unless string.size == 0 + string << op.to_s.tr("_", "-") + string << "::" + when :any + string << "*" + when :qname + prefix = path.shift + name = path.shift + string << prefix+":" if prefix.size > 0 + string << name + when :predicate + string << '[' + string << predicate_to_string( path.shift ) { |x| expand(x) } + string << ']' + when :document + document = true + else + string << "/" unless string.size == 0 + string << "UNKNOWN(" + string << op.inspect + string << ")" + end + end + string = "/"+string if document + return string + end + + def predicate_to_string( path, &block ) + string = "" + case path[0] + when :and, :or, :mult, :plus, :minus, :neq, :eq, :lt, :gt, :lteq, :gteq, :div, :mod, :union + op = path.shift + case op + when :eq + op = "=" + when :lt + op = "<" + when :gt + op = ">" + when :lteq + op = "<=" + when :gteq + op = ">=" + when :neq + op = "!=" + when :union + op = "|" + end + left = predicate_to_string( path.shift, &block ) + right = predicate_to_string( path.shift, &block ) + string << " " + string << left + string << " " + string << op.to_s + string << " " + string << right + string << " " + when :function + path.shift + name = path.shift + string << name + string << "( " + string << predicate_to_string( path.shift, &block ) + string << " )" + when :literal + path.shift + string << " " + string << path.shift.inspect + string << " " + else + string << " " + string << yield( path ) + string << " " + end + return string.squeeze(" ") + end + + private + #LocationPath + # | RelativeLocationPath + # | '/' RelativeLocationPath? 
+ # | '//' RelativeLocationPath + def LocationPath path, parsed + path = path.strip + if path[0] == ?/ + parsed << :document + if path[1] == ?/ + parsed << :descendant_or_self + parsed << :node + path = path[2..-1] + else + path = path[1..-1] + end + end + return RelativeLocationPath( path, parsed ) if path.size > 0 + end + + #RelativeLocationPath + # | Step + # | (AXIS_NAME '::' | '@' | '') AxisSpecifier + # NodeTest + # Predicate + # | '.' | '..' AbbreviatedStep + # | RelativeLocationPath '/' Step + # | RelativeLocationPath '//' Step + AXIS = /^(ancestor|ancestor-or-self|attribute|child|descendant|descendant-or-self|following|following-sibling|namespace|parent|preceding|preceding-sibling|self)::/ + def RelativeLocationPath path, parsed + while path.size > 0 + # (axis or @ or <child::>) nodetest predicate > + # OR > / Step + # (. or ..) > + if path[0] == ?. + if path[1] == ?. + parsed << :parent + parsed << :node + path = path[2..-1] + else + parsed << :self + parsed << :node + path = path[1..-1] + end + else + if path[0] == ?@ + parsed << :attribute + path = path[1..-1] + # Goto Nodetest + elsif path =~ AXIS + parsed << $1.tr('-','_').intern + path = $' + # Goto Nodetest + else + parsed << :child + end + + n = [] + path = NodeTest( path, n) + + if path[0] == ?[ + path = Predicate( path, n ) + end + + parsed.concat(n) + end + + if path.size > 0 + if path[0] == ?/ + if path[1] == ?/ + parsed << :descendant_or_self + parsed << :node + path = path[2..-1] + else + path = path[1..-1] + end + else + return path + end + end + end + return path + end + + # Returns a 1-1 map of the nodeset + # The contents of the resulting array are either: + # true/false, if a positive match + # String, if a name match + #NodeTest + # | ('*' | NCNAME ':' '*' | QNAME) NameTest + # | NODE_TYPE '(' ')' NodeType + # | PI '(' LITERAL ')' PI + # | '[' expr ']' Predicate + NCNAMETEST= /^(#{NCNAME_STR}):\*/u + QNAME = Namespace::NAMESPLIT + NODE_TYPE = /^(comment|text|node)\(\s*\)/m + PI = 
/^processing-instruction\(/ + def NodeTest path, parsed + case path + when /^\*/ + path = $' + parsed << :any + when NODE_TYPE + type = $1 + path = $' + parsed << type.tr('-', '_').intern + when PI + path = $' + literal = nil + if path !~ /^\s*\)/ + path =~ LITERAL + literal = $1 + path = $' + raise ParseException.new("Missing ')' after processing instruction") if path[0] != ?) + path = path[1..-1] + end + parsed << :processing_instruction + parsed << (literal || '') + when NCNAMETEST + prefix = $1 + path = $' + parsed << :namespace + parsed << prefix + when QNAME + prefix = $1 + name = $2 + path = $' + prefix = "" unless prefix + parsed << :qname + parsed << prefix + parsed << name + end + return path + end + + # Filters the supplied nodeset on the predicate(s) + def Predicate path, parsed + return nil unless path[0] == ?[ + predicates = [] + while path[0] == ?[ + path, expr = get_group(path) + predicates << expr[1..-2] if expr + end + predicates.each{ |pred| + preds = [] + parsed << :predicate + parsed << preds + OrExpr(pred, preds) + } + path + end + + # The following return arrays of true/false, a 1-1 mapping of the + # supplied nodeset, except for axe(), which returns a filtered + # nodeset + + #| OrExpr S 'or' S AndExpr + #| AndExpr + def OrExpr path, parsed + n = [] + rest = AndExpr( path, n ) + if rest != path + while rest =~ /^\s*( or )/ + n = [ :or, n, [] ] + rest = AndExpr( $', n[-1] ) + end + end + if parsed.size == 0 and n.size != 0 + parsed.replace(n) + elsif n.size > 0 + parsed << n + end + rest + end + + #| AndExpr S 'and' S EqualityExpr + #| EqualityExpr + def AndExpr path, parsed + n = [] + rest = EqualityExpr( path, n ) + if rest != path + while rest =~ /^\s*( and )/ + n = [ :and, n, [] ] + rest = EqualityExpr( $', n[-1] ) + end + end + if parsed.size == 0 and n.size != 0 + parsed.replace(n) + elsif n.size > 0 + parsed << n + end + rest + end + + #| EqualityExpr ('=' | '!=') RelationalExpr + #| RelationalExpr + def EqualityExpr path, parsed + n = 
[] + rest = RelationalExpr( path, n ) + if rest != path + while rest =~ /^\s*(!?=)\s*/ + if $1[0] == ?! + n = [ :neq, n, [] ] + else + n = [ :eq, n, [] ] + end + rest = RelationalExpr( $', n[-1] ) + end + end + if parsed.size == 0 and n.size != 0 + parsed.replace(n) + elsif n.size > 0 + parsed << n + end + rest + end + + #| RelationalExpr ('<' | '>' | '<=' | '>=') AdditiveExpr + #| AdditiveExpr + def RelationalExpr path, parsed + n = [] + rest = AdditiveExpr( path, n ) + if rest != path + while rest =~ /^\s*([<>]=?)\s*/ + if $1[0] == ?< + sym = "lt" + else + sym = "gt" + end + sym << "eq" if $1[-1] == ?= + n = [ sym.intern, n, [] ] + rest = AdditiveExpr( $', n[-1] ) + end + end + if parsed.size == 0 and n.size != 0 + parsed.replace(n) + elsif n.size > 0 + parsed << n + end + rest + end + + #| AdditiveExpr ('+' | S '-') MultiplicativeExpr + #| MultiplicativeExpr + def AdditiveExpr path, parsed + n = [] + rest = MultiplicativeExpr( path, n ) + if rest != path + while rest =~ /^\s*(\+| -)\s*/ + if $1[0] == ?+ + n = [ :plus, n, [] ] + else + n = [ :minus, n, [] ] + end + rest = MultiplicativeExpr( $', n[-1] ) + end + end + if parsed.size == 0 and n.size != 0 + parsed.replace(n) + elsif n.size > 0 + parsed << n + end + rest + end + + #| MultiplicativeExpr ('*' | S ('div' | 'mod') S) UnaryExpr + #| UnaryExpr + def MultiplicativeExpr path, parsed + n = [] + rest = UnaryExpr( path, n ) + if rest != path + while rest =~ /^\s*(\*| div | mod )\s*/ + if $1[0] == ?* + n = [ :mult, n, [] ] + elsif $1.include?( "div" ) + n = [ :div, n, [] ] + else + n = [ :mod, n, [] ] + end + rest = UnaryExpr( $', n[-1] ) + end + end + if parsed.size == 0 and n.size != 0 + parsed.replace(n) + elsif n.size > 0 + parsed << n + end + rest + end + + #| '-' UnaryExpr + #| UnionExpr + def UnaryExpr path, parsed + path =~ /^(\-*)/ + path = $' + if $1 and (($1.size % 2) != 0) + mult = -1 + else + mult = 1 + end + parsed << :neg if mult < 0 + + n = [] + path = UnionExpr( path, n ) + parsed.concat( n ) + 
path + end + + #| UnionExpr '|' PathExpr + #| PathExpr + def UnionExpr path, parsed + n = [] + rest = PathExpr( path, n ) + if rest != path + while rest =~ /^\s*(\|)\s*/ + n = [ :union, n, [] ] + rest = PathExpr( $', n[-1] ) + end + end + if parsed.size == 0 and n.size != 0 + parsed.replace( n ) + elsif n.size > 0 + parsed << n + end + rest + end + + #| LocationPath + #| FilterExpr ('/' | '//') RelativeLocationPath + def PathExpr path, parsed + path =~ /^\s*/ + path = $' + n = [] + rest = FilterExpr( path, n ) + if rest != path + if rest and rest[0] == ?/ + return RelativeLocationPath(rest, n) + end + end + rest = LocationPath(rest, n) if rest =~ /\A[\/\.\@\[\w*]/ + parsed.concat(n) + return rest + end + + #| FilterExpr Predicate + #| PrimaryExpr + def FilterExpr path, parsed + n = [] + path = PrimaryExpr( path, n ) + path = Predicate(path, n) if path and path[0] == ?[ + parsed.concat(n) + path + end + + #| VARIABLE_REFERENCE + #| '(' expr ')' + #| LITERAL + #| NUMBER + #| FunctionCall + VARIABLE_REFERENCE = /^\$(#{NAME_STR})/u + NUMBER = /^(\d*\.?\d+)/ + NT = /^comment|text|processing-instruction|node$/ + def PrimaryExpr path, parsed + case path + when VARIABLE_REFERENCE + varname = $1 + path = $' + parsed << :variable + parsed << varname + #arry << @variables[ varname ] + when /^(\w[-\w]*)(?:\()/ + fname = $1 + tmp = $' + return path if fname =~ NT + path = tmp + parsed << :function + parsed << fname + path = FunctionCall(path, parsed) + when NUMBER + varname = $1.nil? ? $2 : $1 + path = $' + parsed << :literal + parsed << (varname.include?('.') ? varname.to_f : varname.to_i) + when LITERAL + varname = $1.nil? ? $2 : $1 + path = $' + parsed << :literal + parsed << varname + when /^\(/ #/ + path, contents = get_group(path) + contents = contents[1..-2] + n = [] + OrExpr( contents, n ) + parsed.concat(n) + end + path + end + + #| FUNCTION_NAME '(' ( expr ( ',' expr )* )? 
')' + def FunctionCall rest, parsed + path, arguments = parse_args(rest) + argset = [] + for argument in arguments + args = [] + OrExpr( argument, args ) + argset << args + end + parsed << argset + path + end + + # get_group( '[foo]bar' ) -> ['bar', '[foo]'] + def get_group string + ind = 0 + depth = 0 + st = string[0,1] + en = (st == "(" ? ")" : "]") + begin + case string[ind,1] + when st + depth += 1 + when en + depth -= 1 + end + ind += 1 + end while depth > 0 and ind < string.length + return nil unless depth==0 + [string[ind..-1], string[0..ind-1]] + end + + def parse_args( string ) + arguments = [] + ind = 0 + inquot = false + inapos = false + depth = 1 + begin + case string[ind] + when ?" + inquot = !inquot unless inapos + when ?' + inapos = !inapos unless inquot + else + unless inquot or inapos + case string[ind] + when ?( + depth += 1 + if depth == 1 + string = string[1..-1] + ind -= 1 + end + when ?) + depth -= 1 + if depth == 0 + s = string[0,ind].strip + arguments << s unless s == "" + string = string[ind+1..-1] + end + when ?, + if depth == 1 + s = string[0,ind].strip + arguments << s unless s == "" + string = string[ind+1..-1] + ind = -1 + end + end + end + end + ind += 1 + end while depth > 0 and ind < string.length + return nil unless depth==0 + [string,arguments] + end + end + end +end diff --git a/jni/ruby/lib/rexml/quickpath.rb b/jni/ruby/lib/rexml/quickpath.rb new file mode 100644 index 0000000..9bec215 --- /dev/null +++ b/jni/ruby/lib/rexml/quickpath.rb @@ -0,0 +1,265 @@ +require 'rexml/functions' +require 'rexml/xmltokens' + +module REXML + class QuickPath + include Functions + include XMLTokens + + # A base Hash object to be used when initializing a + # default empty namespaces set. 
+ EMPTY_HASH = {} + + def QuickPath::first element, path, namespaces=EMPTY_HASH + match(element, path, namespaces)[0] + end + + def QuickPath::each element, path, namespaces=EMPTY_HASH, &block + path = "*" unless path + match(element, path, namespaces).each( &block ) + end + + def QuickPath::match element, path, namespaces=EMPTY_HASH + raise "nil is not a valid xpath" unless path + results = nil + Functions::namespace_context = namespaces + case path + when /^\/([^\/]|$)/u + # match on root + path = path[1..-1] + return [element.root.parent] if path == '' + results = filter([element.root], path) + when /^[-\w]*::/u + results = filter([element], path) + when /^\*/u + results = filter(element.to_a, path) + when /^[\[!\w:]/u + # match on child + children = element.to_a + results = filter(children, path) + else + results = filter([element], path) + end + return results + end + + # Given an array of nodes it filters the array based on the path. The + # result is that when this method returns, the array will contain elements + # which match the path + def QuickPath::filter elements, path + return elements if path.nil? or path == '' or elements.size == 0 + case path + when /^\/\//u # Descendant + return axe( elements, "descendant-or-self", $' ) + when /^\/?\b(\w[-\w]*)\b::/u # Axe + return axe( elements, $1, $' ) + when /^\/(?=\b([:!\w][-\.\w]*:)?[-!\*\.\w]*\b([^:(]|$)|\*)/u # Child + rest = $' + results = [] + elements.each do |element| + results |= filter( element.to_a, rest ) + end + return results + when /^\/?(\w[-\w]*)\(/u # / Function + return function( elements, $1, $' ) + when Namespace::NAMESPLIT # Element name + name = $2 + ns = $1 + rest = $' + elements.delete_if do |element| + !(element.kind_of? 
# Applies an XPath axis to +elements+ and filters the resulting nodes with
# the remainder of the path.
#
# elements:: the current node list
# axe_name:: the axis name ("child", "descendant-or-self", "ancestor", ...)
# rest::     the path text that follows the axis
#
# Returns the matching nodes with duplicates removed.
def QuickPath::axe( elements, axe_name, rest )
  matches = []
  # The "-or-self" variants start from the context nodes themselves.
  matches = filter( elements.dup, rest ) if axe_name =~ /-or-self$/u
  case axe_name
  when /^descendant/u
    elements.each do |element|
      matches |= filter( element.to_a, "descendant-or-self::#{rest}" ) if element.kind_of? Element
    end
  when /^ancestor/u
    elements.each do |element|
      # Walk up the parent chain, collecting every ancestor.
      while element.parent
        matches << element.parent
        element = element.parent
      end
    end
    matches = filter( matches, rest )
  when "self"
    matches = filter( elements, rest )
  when "child"
    elements.each do |element|
      matches |= filter( element.to_a, rest ) if element.kind_of? Element
    end
  when "attribute"
    elements.each do |element|
      matches << element.attributes[ rest ] if element.kind_of? Element
    end
  when "parent"
    matches = filter( elements.collect { |element| element.parent }.uniq, rest )
  when "following-sibling"
    matches = filter( elements.collect { |element| element.next_sibling }.uniq, rest )
  when "previous-sibling"
    matches = filter( elements.collect { |element| element.previous_sibling }.uniq, rest )
  end
  return matches.uniq
end

OPERAND_ = '((?=(?:(?!and|or).)*[^\s<>=])[^\s<>=]+)'

# A predicate filters a node-set with respect to an axis to produce a
# new node-set.  For each node in the node-set to be filtered, the
# PredicateExpr is evaluated with that node as the context node, with
# the number of nodes in the node-set as the context size, and with the
# proximity position of the node in the node-set with respect to the
# axis as the context position; if PredicateExpr evaluates to true for
# that node, the node is included in the new node-set; otherwise, it is
# not included.
#
# If the result is a number, the node is kept only when the number equals
# the context position, so para[3] is equivalent to para[position()=3].
#
# elements:: the node list to filter
# path::     path text starting with the opening "[" of the predicate
#
# Raises RuntimeError when the opening bracket is never closed.
def QuickPath::predicate( elements, path )
  # Find the bracket that closes the predicate opened at path[0].
  ind = 1
  bcount = 1
  while bcount > 0
    # Without this guard an unbalanced "[" loops forever, because
    # path[ind] is nil past the end and bcount never reaches zero.
    raise "Unbalanced '[' in predicate: #{path}" if ind >= path.length
    bcount += 1 if path[ind] == ?[
    bcount -= 1 if path[ind] == ?]
    ind += 1
  end
  ind -= 1
  predicate = path[1..ind-1]
  rest = path[ind+1..-1]

  # have to change 'a [=<>] b [=<>] c' into 'a [=<>] b and b [=<>] c'
  predicate.gsub!(
    /#{OPERAND_}\s*([<>=])\s*#{OPERAND_}\s*([<>=])\s*#{OPERAND_}/u,
    '\1 \2 \3 and \3 \4 \5' )
  # Rewrite the XPath expression into Ruby source so it can be evaluated.
  predicate.gsub!( /&/u, "&&" )
  predicate.gsub!( /=/u, "==" )
  predicate.gsub!( /@(\w[-\w.]*)/u, 'attribute("\1")' )
  predicate.gsub!( /\bmod\b/u, "%" )
  predicate.gsub!( /\b(\w[-\w.]*\()/u ) {
    fname = $1
    fname.gsub( /-/u, "_" )
  }

  Functions.pair = [ 0, elements.size ]
  results = []
  elements.each do |element|
    Functions.pair[0] += 1
    Functions.node = element
    # SECURITY NOTE(review): the rewritten predicate is executed as Ruby
    # code.  Never pass path expressions from untrusted input here.
    res = eval( predicate )
    case res
    when true
      results << element
    when Integer # Fixnum was folded into Integer and removed in Ruby 3.2
      results << element if Functions.pair[0] == res
    when String
      results << element
    end
  end
  return filter( results, rest )
end

# Predicate helper: the named attribute of the current context node, or nil
# when the context node is not an Element.
def QuickPath::attribute( name )
  return Functions.node.attributes[name] if Functions.node.kind_of? Element
end

# Predicate helper: the name of the current context node, or nil when the
# context node is not an Element.
def QuickPath::name()
  return Functions.node.name if Functions.node.kind_of? Element
end

# Forwards unknown calls (made from evaluated predicates) to Functions and
# re-raises any failure as a RuntimeError naming the call.
# NOTE(review): this rescues Exception, which also wraps signals and exits;
# kept as-is because callers may depend on the RuntimeError.
def QuickPath::method_missing( id, *args )
  begin
    Functions.send( id.id2name, *args )
  rescue Exception
    raise "METHOD: #{id.id2name}(#{args.join ', '})\n#{$!.message}"
  end
end

# Calls the XPath function +fname+ once per element, with arguments parsed
# from +rest+.  An element is kept when the function returns true, or
# returns an integer equal to the element's 1-based position.
def QuickPath::function( elements, fname, rest )
  args = parse_args( elements, rest )
  Functions.pair = [0, elements.size]
  results = []
  elements.each do |element|
    Functions.pair[0] += 1
    Functions.node = element
    res = Functions.send( fname, *args )
    case res
    when true
      results << element
    when Integer # see predicate: Fixnum no longer exists
      results << element if Functions.pair[0] == res
    end
  end
  return results
end

# Consumes an argument list from the front of +string+ (which is modified in
# place) up to the closing ")" and returns the collected arguments.
# NOTE(review): when the input ends without ")" this returns "" rather than
# the argument array; preserved from the original.
def QuickPath::parse_args( element, string )
  arguments = []
  buffer = ""
  while string and string != ""
    # slice! always removes one character.  The previous sub!(/^./, "")
    # left a leading newline in place and looped forever.
    c = string.slice!( 0, 1 )
    case c
    when ?,
      # end of the current argument
      arguments << evaluate( buffer )
    when ?(
      # start a new method call
      function( element, buffer, string )
      buffer = ""
    when ?)
      # close the method call and return arguments
      return arguments
    else
      buffer << c
    end
  end
  ""
end
# Release metadata for the REXML toolkit.
module REXML
  # Copyright notice for the library.
  COPYRIGHT = "Copyright © 2001-2008 Sean Russell <ser@germane-software.com>"
  # Release date string, as recorded by the author.
  DATE = "2008/019"
  # Library version.
  VERSION = "3.1.7.3"
  # Second word of the expanded SVN keyword, or '' when it is absent.
  REVISION = %w$Revision: 39528 $[1] || ''

  # CamelCase aliases for the two constants above.
  Copyright = COPYRIGHT
  Version = VERSION
end
module REXML
  # A template for SAX2 parser listeners.  Every callback is a no-op; include
  # the module and override the ones you need.
  #
  # Declarations (attlistdecl, elementdecl, etc) are only trivially
  # processed, so their content must be parsed by the listener.
  # The SAX2 method ignorable_whitespace is not provided.
  #
  # +WARNING+ The extension callbacks (start_document, end_document, doctype,
  # elementdecl, attlistdecl, entitydecl, notationdecl, cdata, xmldecl,
  # comment) are likely to change until DTDs are fully supported.
  module SAX2Listener
    def start_document
    end

    def end_document
    end

    def start_prefix_mapping(prefix, uri)
    end

    def end_prefix_mapping(prefix)
    end

    def start_element(uri, localname, qname, attributes)
    end

    def end_element(uri, localname, qname)
    end

    def characters(text)
    end

    def processing_instruction(target, data)
    end

    # Handles a doctype declaration. Any attributes of the doctype which are
    # not supplied will be nil.  EG, <!DOCTYPE me PUBLIC "foo" "bar">
    # @p name the name of the doctype; EG, "me"
    # @p pub_sys "PUBLIC", "SYSTEM", or nil.  EG, "PUBLIC"
    # @p long_name the supplied long name, or nil.  EG, "foo"
    # @p uri the uri of the doctype, or nil.  EG, "bar"
    def doctype(name, pub_sys, long_name, uri)
    end

    # Called for each ATTLIST declaration found in a doctype.
    # EG, for <!ATTLIST el attr CDATA #REQUIRED> the unparsed declaration
    # text is "el attr CDATA #REQUIRED".
    # @p element, pairs, contents the three values supplied by the parser
    #    (presumably element name, attribute pairs and raw text - confirm
    #    against the parser)
    def attlistdecl(element, pairs, contents)
    end

    # <!ELEMENT ...>
    def elementdecl(content)
    end

    # <!ENTITY ...>
    # The argument passed to this method is an array of the entity
    # declaration.  It can be in a number of formats, but in general it
    # returns (example, result):
    #  <!ENTITY % YN '"Yes"'>
    #  ["%", "YN", "\"Yes\""]
    #  <!ENTITY % YN 'Yes'>
    #  ["%", "YN", "Yes"]
    #  <!ENTITY WhatHeSaid "He said %YN;">
    #  ["WhatHeSaid", "He said %YN;"]
    #  <!ENTITY open-hatch SYSTEM "http://www.textuality.com/boilerplate/OpenHatch.xml">
    #  ["open-hatch", "SYSTEM", "http://www.textuality.com/boilerplate/OpenHatch.xml"]
    #  <!ENTITY open-hatch PUBLIC "-//Textuality//TEXT Standard open-hatch boilerplate//EN" "http://www.textuality.com/boilerplate/OpenHatch.xml">
    #  ["open-hatch", "PUBLIC", "-//Textuality//TEXT Standard open-hatch boilerplate//EN", "http://www.textuality.com/boilerplate/OpenHatch.xml"]
    #  <!ENTITY hatch-pic SYSTEM "../grafix/OpenHatch.gif" NDATA gif>
    #  ["hatch-pic", "SYSTEM", "../grafix/OpenHatch.gif", "NDATA", "gif"]
    def entitydecl(declaration)
    end

    # <!NOTATION ...>
    def notationdecl(name, public_or_system, public_id, system_id)
    end

    # Called when <![CDATA[ ... ]]> is encountered in a document.
    # @p content "..."
    def cdata(content)
    end

    # Called when an XML PI is encountered in the document.
    # EG: <?xml version="1.0" encoding="utf"?>
    # @p version the version attribute value.  EG, "1.0"
    # @p encoding the encoding attribute value, or nil.  EG, "utf"
    # @p standalone the standalone attribute value, or nil.  EG, nil
    def xmldecl(version, encoding, standalone)
    end

    # Called when a comment is encountered.
    # @p comment The content of the comment
    def comment(comment)
    end

    def progress(position)
    end
  end
end
module REXML
  # Process-wide limits that guard against entity-expansion attacks.
  module Security
    # Limit on entity expansions.  Defaults to 10000.
    @@entity_expansion_limit = 10_000

    # Limit on expanded entity text; compared against byte sizes by the
    # code that reads it.  Defaults to 10240.
    @@entity_expansion_text_limit = 10_240

    # Set the entity expansion limit. By default the limit is set to 10000.
    def self.entity_expansion_limit=(val)
      @@entity_expansion_limit = val
    end

    # Get the entity expansion limit. By default the limit is set to 10000.
    def self.entity_expansion_limit
      @@entity_expansion_limit
    end

    # Set the entity expansion text limit. By default the limit is set to
    # 10240.
    def self.entity_expansion_text_limit=(val)
      @@entity_expansion_text_limit = val
    end

    # Get the entity expansion text limit. By default the limit is set to
    # 10240.
    def self.entity_expansion_text_limit
      @@entity_expansion_text_limit
    end
  end
end
module REXML
  # Generates Source-s.  USE THIS CLASS.
  class SourceFactory
    # Generates a Source object
    # @param arg Either a String, or an IO
    # @return a Source
    # @raise RuntimeError if +arg+ is none of the supported kinds
    def SourceFactory::create_from(arg)
      if arg.respond_to? :read and
          arg.respond_to? :readline and
          arg.respond_to? :nil? and
          arg.respond_to? :eof?
        IOSource.new(arg)
      elsif arg.respond_to? :to_str
        require 'stringio'
        IOSource.new(StringIO.new(arg))
      elsif arg.kind_of? Source
        arg
      else
        raise "#{arg.class} is not a valid input stream. It must walk \n"+
          "like either a String, an IO, or a Source."
      end
    end
  end

  # A Source can be searched for patterns, and wraps buffers and other
  # objects and provides consumption of text
  class Source
    include Encoding
    # The current buffer (what we're going to read next)
    attr_reader :buffer
    # The line number of the last consumed text
    attr_reader :line
    attr_reader :encoding

    # Constructor
    # @param arg must be a String, and should be a valid XML document
    # @param encoding if non-null, sets the encoding of the source to this
    # value, overriding all encoding detection
    def initialize(arg, encoding=nil)
      @orig = @buffer = arg
      if encoding
        self.encoding = encoding
      else
        detect_encoding
      end
      @line = 0
    end

    # Inherited from Encoding
    # Overridden to support optimized en/decoding
    def encoding=(enc)
      return unless super
      encoding_updated
    end

    # Scans the buffer for +pattern+ using String#scan.
    # @param pattern must be a Regexp, and must be in the form of
    # /^\s*(#{your pattern, with no groups})(.*)/.
    # @param cons if true, and something matched, the buffer is replaced by
    # the text following the last match.
    # @return the String#scan result, or nil if the buffer is nil.
    def scan(pattern, cons=false)
      return nil if @buffer.nil?
      rv = @buffer.scan(pattern)
      @buffer = $' if cons and rv.size>0
      rv
    end

    # No-op for in-memory sources; IOSource overrides it to pull more data.
    def read
    end

    # Drops everything up to and including the first match of +pattern+.
    def consume( pattern )
      @buffer = $' if pattern.match( @buffer )
    end

    # Matches +pattern+ against the buffer; +char+ is not used here.
    def match_to( char, pattern )
      return pattern.match(@buffer)
    end

    # Matches +pattern+ and keeps only the text after the match.
    # NOTE(review): when nothing matches, $' is nil and the buffer becomes
    # nil; preserved because scan explicitly tolerates a nil buffer.
    def match_to_consume( char, pattern )
      md = pattern.match(@buffer)
      @buffer = $'
      return md
    end

    # Matches +pattern+; consumes through the match when +cons+ is true.
    def match(pattern, cons=false)
      md = pattern.match(@buffer)
      @buffer = $' if cons and md
      return md
    end

    # @return true if the Source is exhausted
    def empty?
      @buffer == ""
    end

    # @return the index of the remaining buffer within the original text
    def position
      @orig.index( @buffer )
    end

    # @return the current line in the source
    def current_line
      lines = @orig.split
      res = lines.grep @buffer[0..30]
      res = res[-1] if res.kind_of? Array
      lines.index( res ) if res
    end

    private
    # Strips a leading byte-order mark, if any, and sets the encoding from
    # it; UTF-8 is assumed when there is no BOM.
    def detect_encoding
      buffer_encoding = @buffer.encoding
      detected_encoding = "UTF-8"
      begin
        # Compare raw bytes, then restore the buffer's own encoding.
        @buffer.force_encoding("ASCII-8BIT")
        if @buffer[0, 2] == "\xfe\xff"
          @buffer[0, 2] = ""
          detected_encoding = "UTF-16BE"
        elsif @buffer[0, 2] == "\xff\xfe"
          @buffer[0, 2] = ""
          detected_encoding = "UTF-16LE"
        elsif @buffer[0, 3] == "\xef\xbb\xbf"
          @buffer[0, 3] = ""
          detected_encoding = "UTF-8"
        end
      ensure
        @buffer.force_encoding(buffer_encoding)
      end
      self.encoding = detected_encoding
    end

    # Re-decodes the buffer after the encoding has changed.
    def encoding_updated
      if @encoding != 'UTF-8'
        @buffer = decode(@buffer)
        @to_utf = true
      else
        @to_utf = false
        @buffer.force_encoding ::Encoding::UTF_8
      end
    end
  end

  # A Source that wraps an IO.  See the Source class for method
  # documentation
  class IOSource < Source
    #attr_reader :block_size

    # block_size has been deprecated
    def initialize(arg, block_size=500, encoding=nil)
      @er_source = @source = arg
      @to_utf = false
      @pending_buffer = nil

      if encoding
        super("", encoding)
      else
        # Read just enough bytes to detect a byte-order mark.
        super(@source.read(3) || "")
      end

      if !@to_utf and
          @buffer.respond_to?(:force_encoding) and
          @source.respond_to?(:external_encoding) and
          @source.external_encoding != ::Encoding::UTF_8
        @force_utf8 = true
      else
        @force_utf8 = false
      end
    end

    # Like Source#scan, but keeps reading from the IO until the pattern
    # matches or the IO can no longer be read.
    def scan(pattern, cons=false)
      rv = super
      # Very similar to the loop in match(), but a touch faster to do it
      # this way with scan(); enough faster to warrant duplicating some code
      if rv.size == 0
        until @buffer =~ pattern or @source.nil?
          begin
            @buffer << readline
          rescue StandardError => e
            # Iconv was removed from Ruby; naming the constant directly in a
            # rescue clause turned every read failure into a NameError.
            # Still re-raise its errors where the library exists.
            raise if defined?(::Iconv::IllegalSequence) and
              e.kind_of?(::Iconv::IllegalSequence)
            @source = nil
          end
        end
        rv = super
      end
      # Object#taint no longer exists on Ruby >= 3.2.
      rv.taint if rv.respond_to?(:taint)
      rv
    end

    # Appends the next chunk of the IO to the buffer; any failure marks the
    # IO as finished.
    # NOTE(review): rescuing Exception also swallows signals; preserved.
    def read
      begin
        @buffer << readline
      rescue Exception
        @source = nil
      end
    end

    def consume( pattern )
      match( pattern, true )
    end

    # Like Source#match, but reads more input while there is no match and
    # the IO is still available.
    def match( pattern, cons=false )
      rv = pattern.match(@buffer)
      @buffer = $' if cons and rv
      while !rv and @source
        begin
          @buffer << readline
          rv = pattern.match(@buffer)
          @buffer = $' if cons and rv
        rescue
          @source = nil
        end
      end
      rv.taint if rv.respond_to?(:taint)
      rv
    end

    def empty?
      super and ( @source.nil? || @source.eof? )
    end

    def position
      @er_source.pos rescue 0
    end

    # @return [pos, lineno, line] for the current position in the source;
    # pos and line are -1 when the IO raises IOError.  The IO is rewound and
    # re-read to count lines.
    def current_line
      begin
        pos = @er_source.pos        # The byte position in the source
        lineno = @er_source.lineno  # The XML < position in the source
        @er_source.rewind
        line = 0                    # The \r\n position in the source
        begin
          while @er_source.pos < pos
            @er_source.readline
            line += 1
          end
        rescue
        end
      rescue IOError
        pos = -1
        line = -1
      end
      [pos, lineno, line]
    end

    private
    # Reads up to the next encoded ">" from the IO, prepending any bytes
    # held back when the encoding was switched, and decodes the result.
    def readline
      str = @source.readline(@line_break)
      if @pending_buffer
        if str.nil?
          str = @pending_buffer
        else
          str = @pending_buffer + str
        end
        @pending_buffer = nil
      end
      return nil if str.nil?

      if @to_utf
        decode(str)
      else
        str.force_encoding(::Encoding::UTF_8) if @force_utf8
        str
      end
    end

    # Reconfigures the IO for the new encoding and moves the current buffer
    # into @pending_buffer so it is re-read with that encoding.
    def encoding_updated
      case @encoding
      when "UTF-16BE", "UTF-16LE"
        @source.binmode
        @source.set_encoding(@encoding, @encoding)
      end
      @line_break = encode(">")
      @pending_buffer, @buffer = @buffer, ""
      @pending_buffer.force_encoding(@encoding)
      super
    end
  end
end
module REXML
  # A template for stream parser listeners.  Every callback is a no-op;
  # include the module and override the ones you need.
  # Declarations (attlistdecl, elementdecl, etc) are only trivially
  # processed, so their content must be parsed by the listener.
  module StreamListener
    # Called when a tag is encountered.
    # @p name the tag name
    # @p attrs an array of arrays of attribute/value pairs, suitable for
    # use with assoc or rassoc.  IE, <tag attr1="value1" attr2="value2">
    # will result in
    # tag_start( "tag", [["attr1","value1"],["attr2","value2"]])
    def tag_start(name, attrs)
    end

    # Called when the end tag is reached.  In the case of <tag/>, tag_end
    # will be called immediately after tag_start
    # @p name the name of the tag
    def tag_end(name)
    end

    # Called when text is encountered in the document
    # @p text the text content.
    def text(text)
    end

    # Called when an instruction is encountered.  EG: <?xsl sheet='foo'?>
    # @p name the instruction name; in the example, "xsl"
    # @p instruction the rest of the instruction.  In the example,
    # "sheet='foo'"
    def instruction(name, instruction)
    end

    # Called when a comment is encountered.
    # @p comment The content of the comment
    def comment(comment)
    end

    # Handles a doctype declaration. Any attributes of the doctype which are
    # not supplied will be nil.  EG, <!DOCTYPE me PUBLIC "foo" "bar">
    # @p name the name of the doctype; EG, "me"
    # @p pub_sys "PUBLIC", "SYSTEM", or nil.  EG, "PUBLIC"
    # @p long_name the supplied long name, or nil.  EG, "foo"
    # @p uri the uri of the doctype, or nil.  EG, "bar"
    def doctype(name, pub_sys, long_name, uri)
    end

    # Called when the doctype is done
    def doctype_end
    end

    # Called for each ATTLIST declaration found in a doctype.
    # EG, for <!ATTLIST el attr CDATA #REQUIRED> the unparsed declaration
    # text is "el attr CDATA #REQUIRED".
    def attlistdecl(element_name, attributes, raw_content)
    end

    # <!ELEMENT ...>
    def elementdecl(content)
    end

    # <!ENTITY ...>
    # The argument passed to this method is an array of the entity
    # declaration.  It can be in a number of formats, but in general it
    # returns (example, result):
    #  <!ENTITY % YN '"Yes"'>
    #  ["YN", "\"Yes\"", "%"]
    #  <!ENTITY % YN 'Yes'>
    #  ["YN", "Yes", "%"]
    #  <!ENTITY WhatHeSaid "He said %YN;">
    #  ["WhatHeSaid", "He said %YN;"]
    #  <!ENTITY open-hatch SYSTEM "http://www.textuality.com/boilerplate/OpenHatch.xml">
    #  ["open-hatch", "SYSTEM", "http://www.textuality.com/boilerplate/OpenHatch.xml"]
    #  <!ENTITY open-hatch PUBLIC "-//Textuality//TEXT Standard open-hatch boilerplate//EN" "http://www.textuality.com/boilerplate/OpenHatch.xml">
    #  ["open-hatch", "PUBLIC", "-//Textuality//TEXT Standard open-hatch boilerplate//EN", "http://www.textuality.com/boilerplate/OpenHatch.xml"]
    #  <!ENTITY hatch-pic SYSTEM "../grafix/OpenHatch.gif" NDATA gif>
    #  ["hatch-pic", "SYSTEM", "../grafix/OpenHatch.gif", "gif"]
    def entitydecl(content)
    end

    # <!NOTATION ...>
    def notationdecl(content)
    end

    # Called when %foo; is encountered in a doctype declaration.
    # @p content "foo"
    def entity(content)
    end

    # Called when <![CDATA[ ... ]]> is encountered in a document.
    # @p content "..."
    def cdata(content)
    end

    # Called when an XML PI is encountered in the document.
    # EG: <?xml version="1.0" encoding="utf"?>
    # @p version the version attribute value.  EG, "1.0"
    # @p encoding the encoding attribute value, or nil.  EG, "utf"
    # @p standalone the standalone attribute value, or nil.  EG, nil
    def xmldecl(version, encoding, standalone)
    end
  end
end
module REXML
  # Walks several indexable collections in parallel, yielding one row (an
  # array holding the i-th item of every collection) per step.
  class SyncEnumerator
    include Enumerable

    # Creates a new SyncEnumerator which enumerates rows of given
    # Enumerable objects.  Each object must respond to +size+ and +[]+.
    def initialize(*enums)
      @gens = enums
      # max is nil when no collections are given; treat that as zero rows
      # so #each does not fail on nil.times.
      @length = @gens.collect { |x| x.size }.max || 0
    end

    # Returns the number of enumerated Enumerable objects, i.e. the size
    # of each row.
    def size
      @gens.size
    end

    # Returns the number of enumerated Enumerable objects, i.e. the size
    # of each row.
    def length
      @gens.length
    end

    # Enumerates rows of the Enumerable objects.  There are as many rows as
    # the longest collection has items; shorter collections contribute
    # whatever their +[]+ returns past the end (nil for arrays).
    #
    # Returns self when a block is given, otherwise an Enumerator.
    def each
      return enum_for(:each) unless block_given?

      @length.times { |i|
        yield @gens.collect { |x| x[i] }
      }
      self
    end
  end
end
module REXML
  # Represents text nodes in an XML document
  class Text < Child
    include Comparable
    # The order in which the substitutions occur
    SPECIALS = [ /&(?!#?[\w-]+;)/u, /</u, />/u, /"/u, /'/u, /\r/u ]
    SUBSTITUTES = ['&amp;', '&lt;', '&gt;', '&quot;', '&apos;', '&#13;']
    # Characters which are substituted in written strings
    SLAICEPS = [ '<', '>', '"', "'", '&' ]
    SETUTITSBUS = [ /&lt;/u, /&gt;/u, /&quot;/u, /&apos;/u, /&amp;/u ]

    # If +raw+ is true, then REXML leaves the value alone
    attr_accessor :raw

    NEEDS_A_SECOND_CHECK = /(<|&((#{Entity::NAME});|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));)?)/um
    NUMERICENTITY = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
    # Code points (single values and ranges) accepted by Text.check.
    VALID_CHAR = [
      0x9, 0xA, 0xD,
      (0x20..0xD7FF),
      (0xE000..0xFFFD),
      (0x10000..0x10FFFF)
    ]

    if String.method_defined? :encode
      # Build one character class out of VALID_CHAR.
      VALID_XML_CHARS = Regexp.new('^['+
        VALID_CHAR.map { |item|
          case item
          when Integer # Fixnum was removed in Ruby 3.2
            [item].pack('U').force_encoding('utf-8')
          when Range
            [item.first, '-'.ord, item.last].pack('UUU').force_encoding('utf-8')
          end
        }.join +
      ']*$')
    else
      VALID_XML_CHARS = /^(
           [\x09\x0A\x0D\x20-\x7E]            # ASCII
         | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
         |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
         | [\xE1-\xEC\xEE][\x80-\xBF]{2}      # straight 3-byte
         |  \xEF[\x80-\xBE]{2}                #
         |  \xEF\xBF[\x80-\xBD]               # excluding U+fffe and U+ffff
         |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
         |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
         | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
         |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
       )*$/nx;
    end

    # Constructor
    # +arg+ if a String, the content is set to the String.  If a Text,
    # the object is shallowly cloned.
    #
    # +respect_whitespace+ (boolean, false) if true, whitespace is
    # respected; otherwise runs of spaces, newlines and tabs are squeezed.
    #
    # +parent+ (nil) if this is a Parent object, the parent
    # will be set to this.
    #
    # +raw+ (nil) If true, the value is expected to contain no unescaped
    # XML markup and is checked but not changed.  If false, the string may
    # contain any characters and is escaped on output.  If nil (the
    # default), the raw value of the parent is used, or false when there is
    # no parent.
    #   Text.new( "<&", false, nil, false ) #-> "&lt;&amp;"
    #   Text.new( "<&", false, nil, true )  #-> Parse exception
    #   Text.new( "&lt;&amp;", false, nil, true )  #-> "&lt;&amp;"
    #
    # +entity_filter+ (nil) This can be an array of entities to match in the
    # supplied text.  This argument is only useful if +raw+ is set to false.
    #
    # +illegal+ INTERNAL USE ONLY
    #
    # Raises RuntimeError when +arg+ is neither a String nor a Text.
    def initialize(arg, respect_whitespace=false, parent=nil, raw=nil,
      entity_filter=nil, illegal=NEEDS_A_SECOND_CHECK )

      @raw = false
      @parent = nil

      if parent
        super( parent )
        @raw = parent.raw
      end

      @raw = raw unless raw.nil?
      @entity_filter = entity_filter
      clear_cache

      if arg.kind_of? String
        @string = arg.dup
        @string.squeeze!(" \n\t") unless respect_whitespace
      elsif arg.kind_of? Text
        @string = arg.to_s
        @raw = arg.raw
      else
        # Was a condition-less "elsif" calling the long-removed Object#type,
        # which produced a NoMethodError instead of this message.
        raise "Illegal argument of type #{arg.class} for Text constructor (#{arg})"
      end

      @string.gsub!( /\r\n?/, "\n" )

      Text.check(@string, illegal, doctype) if @raw
    end

    # Re-parents the node and re-validates raw content against the new
    # document context.
    def parent= parent
      super(parent)
      Text.check(@string, NEEDS_A_SECOND_CHECK, doctype) if @raw and @parent
    end

    # check for illegal characters
    # Raises RuntimeError on a character outside VALID_CHAR, or on a
    # +pattern+ match that is not a well-formed reference.
    def Text.check string, pattern, doctype

      # illegal anywhere
      if string !~ VALID_XML_CHARS
        if String.method_defined? :encode
          string.chars.each do |c|
            case c.ord
            when *VALID_CHAR
            else
              raise "Illegal character #{c.inspect} in raw string \"#{string}\""
            end
          end
        else
          string.scan(/[\x00-\x7F]|[\x80-\xBF][\xC0-\xF0]*|[\xC0-\xF0]/n) do |c|
            case c.unpack('U')
            when *VALID_CHAR
            else
              raise "Illegal character #{c.inspect} in raw string \"#{string}\""
            end
          end
        end
      end

      # context sensitive
      string.scan(pattern) do
        if $1[-1] != ?;
          raise "Illegal character '#{$1}' in raw string \"#{string}\""
        elsif $1[0] == ?&
          if $5 and $5[0] == ?#
            case ($5[1] == ?x ? $5[2..-1].to_i(16) : $5[1..-1].to_i)
            when *VALID_CHAR
            else
              raise "Illegal character '#{$1}' in raw string \"#{string}\""
            end
          # FIXME: below can't work but this needs API change.
          # elsif @parent and $3 and !SUBSTITUTES.include?($1)
          #   if !doctype or !doctype.entities.has_key?($3)
          #     raise "Undeclared entity '#{$1}' in raw string \"#{string}\""
          #   end
          end
        end
      end
    end

    def node_type
      :text
    end

    def empty?
      @string.size==0
    end

    def clone
      return Text.new(self)
    end

    # Appends text to this text node.  The text is appended in the +raw+ mode
    # of this text node.
    #
    # +returns+ the text itself to enable method chain like
    # 'text << "XXX" << "YYY"'.
    def <<( to_append )
      @string << to_append.gsub( /\r\n?/, "\n" )
      clear_cache
      self
    end

    # +other+ a String or a Text
    # +returns+ the result of (to_s <=> arg.to_s)
    def <=>( other )
      to_s() <=> other.to_s
    end

    # The doctype of the owning document, or nil when the node has no
    # parent or the parent has no document.
    def doctype
      if @parent
        doc = @parent.document
        doc.doctype if doc
      end
    end

    REFERENCE = /#{Entity::REFERENCE}/
    # Returns the string value of this text node.  Raw nodes return their
    # string untouched; otherwise the escaped form produced by
    # Text::normalize is computed once and cached.  The entity filter set
    # in the constructor is respected.
    def to_s
      return @string if @raw
      return @normalized if @normalized

      @normalized = Text::normalize( @string, doctype, @entity_filter )
    end

    def inspect
      @string.inspect
    end

    # Returns the string value of this text with entity references expanded
    # by Text::unnormalize, as it might be used programmatically or printed
    # to the console.  This ignores the 'raw' attribute setting, and any
    # entity_filter.  The result is cached.
    def value
      return @unnormalized if @unnormalized
      @unnormalized = Text::unnormalize( @string, doctype )
    end

    # Sets the contents of this text node.  This expects the text to be
    # unnormalized, and turns raw mode off.
    #
    #   e = Element.new( "a" )
    #   e.add_text( "foo" )   # <a>foo</a>
    #   e[0].value = "bar"    # <a>bar</a>
    #   e[0].value = "<a>"    # <a>&lt;a&gt;</a>
    def value=( val )
      @string = val.gsub( /\r\n?/, "\n" )
      clear_cache
      @raw = false
    end

    # Recursively wraps +string+ at +width+, breaking at the last space at
    # or before the cutoff.  With +addnewline+ a newline is also put in
    # front of the first chunk.
    def wrap(string, width, addnewline=false)
      return string if string.length <= width
      # Last ' ' before the cutoff.  rindex returns nil when the leading
      # word is longer than width, which used to raise TypeError below; in
      # that case break at the next space, or leave the string alone.
      place = string.rindex(' ', width) || string.index(' ', width)
      return string unless place
      if addnewline then
        return "\n" + string[0,place] + "\n" + wrap(string[place+1..-1], width)
      else
        return string[0,place] + "\n" + wrap(string[place+1..-1], width)
      end
    end

    # Prefixes every line of +string+ with +style+ repeated +level+ times
    # and strips trailing whitespace from each line.  When +indentfirstline+
    # is false the whole result is stripped.
    def indent_text(string, level=1, style="\t", indentfirstline=true)
      return string if level < 0
      new_string = ''
      string.each_line { |line|
        indent_string = style * level
        new_line = (indent_string + line).sub(/[\s]+$/,'')
        new_string << new_line
      }
      new_string.strip! unless indentfirstline
      return new_string
    end

    # == DEPRECATED
    # See REXML::Formatters
    #
    def write( writer, indent=-1, transitive=false, ie_hack=false )
      Kernel.warn("#{self.class.name}.write is deprecated.  See REXML::Formatters")
      formatter = if indent > -1
          REXML::Formatters::Pretty.new( indent )
        else
          REXML::Formatters::Default.new
        end
      formatter.write( self, writer )
    end

    # FIXME
    # This probably won't work properly
    def xpath
      path = @parent.xpath
      path += "/text()"
      return path
    end

    # Writes out text, substituting special characters beforehand.
    # +out+ A String, IO, or any other object supporting <<( String )
    # +input+ the text to substitute and then write out
    def write_with_substitution out, input
      copy = input.clone
      # Doing it like this rather than in a loop improves the speed
      copy.gsub!( SPECIALS[0], SUBSTITUTES[0] )
      copy.gsub!( SPECIALS[1], SUBSTITUTES[1] )
      copy.gsub!( SPECIALS[2], SUBSTITUTES[2] )
      copy.gsub!( SPECIALS[3], SUBSTITUTES[3] )
      copy.gsub!( SPECIALS[4], SUBSTITUTES[4] )
      copy.gsub!( SPECIALS[5], SUBSTITUTES[5] )
      out << copy
    end

    private
    # Drops the cached results of to_s and value.
    def clear_cache
      @normalized = nil
      @unnormalized = nil
    end

    # Reads text, substituting entities
    # Raises ParseException when +illegal+ is given and matches the input.
    def Text::read_with_substitution( input, illegal=nil )
      copy = input.clone

      if copy =~ illegal
        raise ParseException.new( "malformed text: Illegal character #$& in \"#{copy}\"" )
      end if illegal

      copy.gsub!( /\r\n?/, "\n" )
      if copy.include? ?&
        copy.gsub!( SETUTITSBUS[0], SLAICEPS[0] )
        copy.gsub!( SETUTITSBUS[1], SLAICEPS[1] )
        copy.gsub!( SETUTITSBUS[2], SLAICEPS[2] )
        copy.gsub!( SETUTITSBUS[3], SLAICEPS[3] )
        copy.gsub!( SETUTITSBUS[4], SLAICEPS[4] )
        copy.gsub!( /&#0*((?:\d+)|(?:x[a-f0-9]+));/ ) {
          m=$1
          #m='0' if m==''
          # "x41" becomes "0x41" so Integer() parses it as hexadecimal.
          m = "0#{m}" if m[0] == ?x
          [Integer(m)].pack('U*')
        }
      end
      copy
    end

    EREFERENCE = /&(?!#{Entity::NAME};)/
    # Escapes all possible entities
    def Text::normalize( input, doctype=nil, entity_filter=nil )
      copy = input.to_s
      # Doing it like this rather than in a loop improves the speed
      #copy = copy.gsub( EREFERENCE, '&amp;' )
      copy = copy.gsub( "&", "&amp;" )
      if doctype
        # Substitute the doctype's entities, skipping filtered names
        doctype.entities.each_value do |entity|
          copy = copy.gsub( entity.value,
            "&#{entity.name};" ) if entity.value and
              not( entity_filter and entity_filter.include?(entity.name) )
        end
      else
        # No doctype: fall back to the default entity set
        DocType::DEFAULT_ENTITIES.each_value do |entity|
          copy = copy.gsub(entity.value, "&#{entity.name};" )
        end
      end
      copy
    end

    # Unescapes all possible entities
    # Raises RuntimeError once the expanded text exceeds
    # Security.entity_expansion_text_limit bytes.
    def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil )
      sum = 0
      string.gsub( /\r\n?/, "\n" ).gsub( REFERENCE ) {
        s = Text.expand($&, doctype, filter)
        if sum + s.bytesize > Security.entity_expansion_text_limit
          raise "entity expansion has grown too large"
        else
          sum += s.bytesize
        end
        s
      }
    end

    # Expands one reference ("&#65;", "&#x41;", "&amp;", "&name;").
    # Filtered and unknown names are returned unchanged.
    def Text.expand(ref, doctype, filter)
      if ref[1] == ?#
        if ref[2] == ?x
          [ref[3...-1].to_i(16)].pack('U*')
        else
          [ref[2...-1].to_i].pack('U*')
        end
      elsif ref == '&amp;'
        '&'
      elsif filter and filter.include?( ref[1...-1] )
        ref
      elsif doctype
        doctype.entity( ref[1...-1] ) or ref
      else
        entity_value = DocType::DEFAULT_ENTITIES[ ref[1...-1] ]
        entity_value ? entity_value.value : ref
      end
    end
  end
end
entity_value.value : ref + end + end + end +end diff --git a/jni/ruby/lib/rexml/undefinednamespaceexception.rb b/jni/ruby/lib/rexml/undefinednamespaceexception.rb new file mode 100644 index 0000000..8ebfdfd --- /dev/null +++ b/jni/ruby/lib/rexml/undefinednamespaceexception.rb @@ -0,0 +1,8 @@ +require 'rexml/parseexception' +module REXML + class UndefinedNamespaceException < ParseException + def initialize( prefix, source, parser ) + super( "Undefined prefix #{prefix} found" ) + end + end +end diff --git a/jni/ruby/lib/rexml/validation/relaxng.rb b/jni/ruby/lib/rexml/validation/relaxng.rb new file mode 100644 index 0000000..370efd5 --- /dev/null +++ b/jni/ruby/lib/rexml/validation/relaxng.rb @@ -0,0 +1,538 @@ +require "rexml/validation/validation" +require "rexml/parsers/baseparser" + +module REXML + module Validation + # Implemented: + # * empty + # * element + # * attribute + # * text + # * optional + # * choice + # * oneOrMore + # * zeroOrMore + # * group + # * value + # * interleave + # * mixed + # * ref + # * grammar + # * start + # * define + # + # Not implemented: + # * data + # * param + # * include + # * externalRef + # * notAllowed + # * anyName + # * nsName + # * except + # * name + class RelaxNG + include Validator + + INFINITY = 1.0 / 0.0 + EMPTY = Event.new( nil ) + TEXT = [:start_element, "text"] + attr_accessor :current + attr_accessor :count + attr_reader :references + + # FIXME: Namespaces + def initialize source + parser = REXML::Parsers::BaseParser.new( source ) + + @count = 0 + @references = {} + @root = @current = Sequence.new(self) + @root.previous = true + states = [ @current ] + begin + event = parser.pull + case event[0] + when :start_element + case event[1] + when "empty" + when "element", "attribute", "text", "value" + states[-1] << event + when "optional" + states << Optional.new( self ) + states[-2] << states[-1] + when "choice" + states << Choice.new( self ) + states[-2] << states[-1] + when "oneOrMore" + states << OneOrMore.new( self 
) + states[-2] << states[-1] + when "zeroOrMore" + states << ZeroOrMore.new( self ) + states[-2] << states[-1] + when "group" + states << Sequence.new( self ) + states[-2] << states[-1] + when "interleave" + states << Interleave.new( self ) + states[-2] << states[-1] + when "mixed" + states << Interleave.new( self ) + states[-2] << states[-1] + states[-1] << TEXT + when "define" + states << [ event[2]["name"] ] + when "ref" + states[-1] << Ref.new( event[2]["name"] ) + when "anyName" + states << AnyName.new( self ) + states[-2] << states[-1] + when "nsName" + when "except" + when "name" + when "data" + when "param" + when "include" + when "grammar" + when "start" + when "externalRef" + when "notAllowed" + end + when :end_element + case event[1] + when "element", "attribute" + states[-1] << event + when "zeroOrMore", "oneOrMore", "choice", "optional", + "interleave", "group", "mixed" + states.pop + when "define" + ref = states.pop + @references[ ref.shift ] = ref + #when "empty" + end + when :end_document + states[-1] << event + when :text + states[-1] << event + end + end while event[0] != :end_document + end + + def receive event + validate( event ) + end + end + + class State + def initialize( context ) + @previous = [] + @events = [] + @current = 0 + @count = context.count += 1 + @references = context.references + @value = false + end + + def reset + return if @current == 0 + @current = 0 + @events.each {|s| s.reset if s.kind_of? State } + end + + def previous=( previous ) + @previous << previous + end + + def next( event ) + #print "In next with #{event.inspect}. " + #p @previous + return @previous.pop.next( event ) if @events[@current].nil? + expand_ref_in( @events, @current ) if @events[@current].class == Ref + if ( @events[@current].kind_of? State ) + @current += 1 + @events[@current-1].previous = self + return @events[@current-1].next( event ) + end + if ( @events[@current].matches?(event) ) + @current += 1 + if @events[@current].nil? 
+ return @previous.pop + elsif @events[@current].kind_of? State + @current += 1 + @events[@current-1].previous = self + return @events[@current-1] + else + return self + end + else + return nil + end + end + + def to_s + # Abbreviated: + self.class.name =~ /(?:::)(\w)\w+$/ + # Full: + #self.class.name =~ /(?:::)(\w+)$/ + "#$1.#@count" + end + + def inspect + "< #{to_s} #{@events.collect{|e| + pre = e == @events[@current] ? '#' : '' + pre + e.inspect unless self == e + }.join(', ')} >" + end + + def expected + return [@events[@current]] + end + + def <<( event ) + add_event_to_arry( @events, event ) + end + + + protected + def expand_ref_in( arry, ind ) + new_events = [] + @references[ arry[ind].to_s ].each{ |evt| + add_event_to_arry(new_events,evt) + } + arry[ind,1] = new_events + end + + def add_event_to_arry( arry, evt ) + evt = generate_event( evt ) + if evt.kind_of? String + arry[-1].event_arg = evt if arry[-1].kind_of? Event and @value + @value = false + else + arry << evt + end + end + + def generate_event( event ) + return event if event.kind_of? 
State or event.class == Ref + evt = nil + arg = nil + case event[0] + when :start_element + case event[1] + when "element" + evt = :start_element + arg = event[2]["name"] + when "attribute" + evt = :start_attribute + arg = event[2]["name"] + when "text" + evt = :text + when "value" + evt = :text + @value = true + end + when :text + return event[1] + when :end_document + return Event.new( event[0] ) + else # then :end_element + case event[1] + when "element" + evt = :end_element + when "attribute" + evt = :end_attribute + end + end + return Event.new( evt, arg ) + end + end + + + class Sequence < State + def matches?(event) + @events[@current].matches?( event ) + end + end + + + class Optional < State + def next( event ) + if @current == 0 + rv = super + return rv if rv + @prior = @previous.pop + return @prior.next( event ) + end + super + end + + def matches?(event) + @events[@current].matches?(event) || + (@current == 0 and @previous[-1].matches?(event)) + end + + def expected + return [ @prior.expected, @events[0] ].flatten if @current == 0 + return [@events[@current]] + end + end + + + class ZeroOrMore < Optional + def next( event ) + expand_ref_in( @events, @current ) if @events[@current].class == Ref + if ( @events[@current].matches?(event) ) + @current += 1 + if @events[@current].nil? + @current = 0 + return self + elsif @events[@current].kind_of? 
State + @current += 1 + @events[@current-1].previous = self + return @events[@current-1] + else + return self + end + else + @prior = @previous.pop + return @prior.next( event ) if @current == 0 + return nil + end + end + + def expected + return [ @prior.expected, @events[0] ].flatten if @current == 0 + return [@events[@current]] + end + end + + + class OneOrMore < State + def initialize context + super + @ord = 0 + end + + def reset + super + @ord = 0 + end + + def next( event ) + expand_ref_in( @events, @current ) if @events[@current].class == Ref + if ( @events[@current].matches?(event) ) + @current += 1 + @ord += 1 + if @events[@current].nil? + @current = 0 + return self + elsif @events[@current].kind_of? State + @current += 1 + @events[@current-1].previous = self + return @events[@current-1] + else + return self + end + else + return @previous.pop.next( event ) if @current == 0 and @ord > 0 + return nil + end + end + + def matches?( event ) + @events[@current].matches?(event) || + (@current == 0 and @ord > 0 and @previous[-1].matches?(event)) + end + + def expected + if @current == 0 and @ord > 0 + return [@previous[-1].expected, @events[0]].flatten + else + return [@events[@current]] + end + end + end + + + class Choice < State + def initialize context + super + @choices = [] + end + + def reset + super + @events = [] + @choices.each { |c| c.each { |s| s.reset if s.kind_of? State } } + end + + def <<( event ) + add_event_to_arry( @choices, event ) + end + + def next( event ) + # Make the choice if we haven't + if @events.size == 0 + c = 0 ; max = @choices.size + while c < max + if @choices[c][0].class == Ref + expand_ref_in( @choices[c], 0 ) + @choices += @choices[c] + @choices.delete( @choices[c] ) + max -= 1 + else + c += 1 + end + end + @events = @choices.find { |evt| evt[0].matches? 
event } + # Remove the references + # Find the events + end + unless @events + @events = [] + return nil + end + super + end + + def matches?( event ) + return @events[@current].matches?( event ) if @events.size > 0 + !@choices.find{|evt| evt[0].matches?(event)}.nil? + end + + def expected + return [@events[@current]] if @events.size > 0 + return @choices.collect do |x| + if x[0].kind_of? State + x[0].expected + else + x[0] + end + end.flatten + end + + def inspect + "< #{to_s} #{@choices.collect{|e| e.collect{|f|f.to_s}.join(', ')}.join(' or ')} >" + end + + protected + def add_event_to_arry( arry, evt ) + if evt.kind_of? State or evt.class == Ref + arry << [evt] + elsif evt[0] == :text + if arry[-1] and + arry[-1][-1].kind_of?( Event ) and + arry[-1][-1].event_type == :text and @value + + arry[-1][-1].event_arg = evt[1] + @value = false + end + else + arry << [] if evt[0] == :start_element + arry[-1] << generate_event( evt ) + end + end + end + + + class Interleave < Choice + def initialize context + super + @choice = 0 + end + + def reset + @choice = 0 + end + + def next_current( event ) + # Expand references + c = 0 ; max = @choices.size + while c < max + if @choices[c][0].class == Ref + expand_ref_in( @choices[c], 0 ) + @choices += @choices[c] + @choices.delete( @choices[c] ) + max -= 1 + else + c += 1 + end + end + @events = @choices[@choice..-1].find { |evt| evt[0].matches? event } + @current = 0 + if @events + # reorder the choices + old = @choices[@choice] + idx = @choices.index( @events ) + @choices[@choice] = @events + @choices[idx] = old + @choice += 1 + end + + @events = [] unless @events + end + + + def next( event ) + # Find the next series + next_current(event) unless @events[@current] + return nil unless @events[@current] + + expand_ref_in( @events, @current ) if @events[@current].class == Ref + if ( @events[@current].kind_of? 
State ) + @current += 1 + @events[@current-1].previous = self + return @events[@current-1].next( event ) + end + return @previous.pop.next( event ) if @events[@current].nil? + if ( @events[@current].matches?(event) ) + @current += 1 + if @events[@current].nil? + return self unless @choices[@choice].nil? + return @previous.pop + elsif @events[@current].kind_of? State + @current += 1 + @events[@current-1].previous = self + return @events[@current-1] + else + return self + end + else + return nil + end + end + + def matches?( event ) + return @events[@current].matches?( event ) if @events[@current] + !@choices[@choice..-1].find{|evt| evt[0].matches?(event)}.nil? + end + + def expected + return [@events[@current]] if @events[@current] + return @choices[@choice..-1].collect do |x| + if x[0].kind_of? State + x[0].expected + else + x[0] + end + end.flatten + end + + def inspect + "< #{to_s} #{@choices.collect{|e| e.collect{|f|f.to_s}.join(', ')}.join(' and ')} >" + end + end + + class Ref + def initialize value + @value = value + end + def to_s + @value + end + def inspect + "{#{to_s}}" + end + end + end +end diff --git a/jni/ruby/lib/rexml/validation/validation.rb b/jni/ruby/lib/rexml/validation/validation.rb new file mode 100644 index 0000000..bab7f22 --- /dev/null +++ b/jni/ruby/lib/rexml/validation/validation.rb @@ -0,0 +1,143 @@ +require 'rexml/validation/validationexception' + +module REXML + module Validation + module Validator + NILEVENT = [ nil ] + def reset + @current = @root + @root.reset + @root.previous = true + @attr_stack = [] + self + end + def dump + puts @root.inspect + end + def validate( event ) + @attr_stack = [] unless defined? @attr_stack + match = @current.next(event) + raise ValidationException.new( "Validation error. 
# A single expected validation event: a type symbol plus an optional
# argument (the element/attribute name, or a required text value).
#
# Incoming events are arrays whose first entry is the type and whose second
# entry, when relevant, is the name or text to compare against.
class Event
  attr_reader :event_type
  attr_accessor :event_arg

  def initialize(event_type, event_arg=nil )
    @event_type = event_type
    @event_arg = event_arg
    # Nothing in this class ever marks an event as done; initialise the
    # flag so done? returns a real boolean instead of reading an unset ivar.
    @done = false
  end

  def done?
    @done
  end

  # True for every event type except :start_element and :start_attribute.
  def single?
    @event_type != :start_element && @event_type != :start_attribute
  end

  # Returns true when +event+ (an array) satisfies this expectation and
  # false otherwise. Start events must also agree on the name; a :text
  # event matches any text unless a specific value was recorded in
  # event_arg. Event types not listed here never match.
  def matches?( event )
    return false unless event[0] == @event_type
    case event[0]
    when nil, :end_element, :end_attribute, :end_document
      true
    when :start_element, :start_attribute
      # Compare explicitly so a name mismatch yields false, not nil.
      event[1] == @event_arg
    when :text
      @event_arg.nil? || @event_arg == event[1]
    else
      false
    end
  end

  def ==( other )
    return false unless other.kind_of? Event
    @event_type == other.event_type and @event_arg == other.event_arg
  end

  def to_s
    inspect
  end

  def inspect
    "#{@event_type.inspect}( #@event_arg )"
  end
end
+ end + + def clone + XMLDecl.new(self) + end + + # indent:: + # Ignored. There must be no whitespace before an XML declaration + # transitive:: + # Ignored + # ie_hack:: + # Ignored + def write(writer, indent=-1, transitive=false, ie_hack=false) + return nil unless @writethis or writer.kind_of? Output + writer << START.sub(/\\/u, '') + writer << " #{content encoding}" + writer << STOP.sub(/\\/u, '') + end + + def ==( other ) + other.kind_of?(XMLDecl) and + other.version == @version and + other.encoding == self.encoding and + other.standalone == @standalone + end + + def xmldecl version, encoding, standalone + @version = version + self.encoding = encoding + @standalone = standalone + end + + def node_type + :xmldecl + end + + alias :stand_alone? :standalone + alias :old_enc= :encoding= + + def encoding=( enc ) + if enc.nil? + self.old_enc = "UTF-8" + @writeencoding = false + else + self.old_enc = enc + @writeencoding = true + end + self.dowrite + end + + # Only use this if you do not want the XML declaration to be written; + # this object is ignored by the XML writer. Otherwise, instantiate your + # own XMLDecl and add it to the document. + # + # Note that XML 1.1 documents *must* include an XML declaration + def XMLDecl.default + rv = XMLDecl.new( "1.0" ) + rv.nowrite + rv + end + + def nowrite + @writethis = false + end + + def dowrite + @writethis = true + end + + def inspect + START.sub(/\\/u, '') + " ... " + STOP.sub(/\\/u, '') + end + + private + def content(enc) + rv = "version='#@version'" + rv << " encoding='#{enc}'" if @writeencoding || enc !~ /\Autf-8\z/i + rv << " standalone='#@standalone'" if @standalone + rv + end + end +end diff --git a/jni/ruby/lib/rexml/xmltokens.rb b/jni/ruby/lib/rexml/xmltokens.rb new file mode 100644 index 0000000..4d4dd27 --- /dev/null +++ b/jni/ruby/lib/rexml/xmltokens.rb @@ -0,0 +1,84 @@ +module REXML + # Defines a number of tokens used for parsing XML. Not for general + # consumption. 
# String fragments for building regular expressions that recognise XML
# names, name tokens and references. Not for general consumption.
module XMLTokens
  # From http://www.w3.org/TR/REC-xml/#sec-common-syn
  #
  #   [4] NameStartChar ::=
  #         ":" |
  #         [A-Z] |
  #         "_" |
  #         [a-z] |
  #         [#xC0-#xD6] |
  #         [#xD8-#xF6] |
  #         [#xF8-#x2FF] |
  #         [#x370-#x37D] |
  #         [#x37F-#x1FFF] |
  #         [#x200C-#x200D] |
  #         [#x2070-#x218F] |
  #         [#x2C00-#x2FEF] |
  #         [#x3001-#xD7FF] |
  #         [#xF900-#xFDCF] |
  #         [#xFDF0-#xFFFD] |
  #         [#x10000-#xEFFFF]
  #
  # Single-quoted on purpose: the backslash escapes must reach the regexp
  # engine untouched.
  name_start_chars = [
    ':',
    'A-Z',
    '_',
    'a-z',
    '\u00C0-\u00D6',
    '\u00D8-\u00F6',
    '\u00F8-\u02FF',
    '\u0370-\u037D',
    '\u037F-\u1FFF',
    '\u200C-\u200D',
    '\u2070-\u218F',
    '\u2C00-\u2FEF',
    '\u3001-\uD7FF',
    '\uF900-\uFDCF',
    '\uFDF0-\uFFFD',
    '\u{10000}-\u{EFFFF}',
  ]

  # From http://www.w3.org/TR/REC-xml/#sec-common-syn
  #
  #   [4a] NameChar ::=
  #         NameStartChar |
  #         "-" |
  #         "." |
  #         [0-9] |
  #         #xB7 |
  #         [#x0300-#x036F] |
  #         [#x203F-#x2040]
  extra_name_chars = [
    '\-',
    '\.',
    '0-9',
    '\u00B7',
    '\u0300-\u036F',
    '\u203F-\u2040',
  ]
  name_chars = name_start_chars + extra_name_chars

  NAME_START_CHAR = "[#{name_start_chars.join('')}]"
  NAME_CHAR = "[#{name_chars.join('')}]"
  NAMECHAR = NAME_CHAR # deprecated. Use NAME_CHAR instead.

  # From http://www.w3.org/TR/xml-names11/#NT-NCName
  #
  #   [6] NCNameStartChar ::= NameStartChar - ':'
  #   [5] NCNameChar      ::= NameChar - ':'
  ncname_start_chars = name_start_chars.reject { |fragment| fragment == ':' }
  ncname_chars = name_chars.reject { |fragment| fragment == ':' }
  NCNAME_STR = "[#{ncname_start_chars.join('')}][#{ncname_chars.join('')}]*"
  NAME_STR = "(?:#{NCNAME_STR}:)?#{NCNAME_STR}"

  NAME = "(#{NAME_START_CHAR}#{NAME_CHAR}*)"
  NMTOKEN = "(?:#{NAME_CHAR})+"
  NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*"
  REFERENCE = "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)"

  #REFERENCE = "(?:#{ENTITYREF}|#{CHARREF})"
  #ENTITYREF = "&#{NAME};"
  #CHARREF = "&#\\d+;|&#x[0-9a-fA-F]+;"
end
+ # + # XPath.first( node ) + # XPath.first( doc, "//b"} ) + # XPath.first( node, "a/x:b", { "x"=>"http://doofus" } ) + # XPath.first( node, '/book/publisher/text()=$publisher', {}, {"publisher"=>"O'Reilly"}) + def XPath::first element, path=nil, namespaces=nil, variables={} + raise "The namespaces argument, if supplied, must be a hash object." unless namespaces.nil? or namespaces.kind_of?(Hash) + raise "The variables argument, if supplied, must be a hash object." unless variables.kind_of?(Hash) + parser = XPathParser.new + parser.namespaces = namespaces + parser.variables = variables + path = "*" unless path + element = [element] unless element.kind_of? Array + parser.parse(path, element).flatten[0] + end + + # Iterates over nodes that match the given path, calling the supplied + # block with the match. + # element:: + # The context element + # path:: + # The xpath to search for. If not supplied or nil, defaults to '*' + # namespaces:: + # If supplied, a Hash which defines a namespace mapping + # variables:: + # If supplied, a Hash which maps $variables in the query + # to values. This can be used to avoid XPath injection attacks + # or to automatically handle escaping string values. + # + # XPath.each( node ) { |el| ... } + # XPath.each( node, '/*[@attr='v']' ) { |el| ... } + # XPath.each( node, 'ancestor::x' ) { |el| ... } + # XPath.each( node, '/book/publisher/text()=$publisher', {}, {"publisher"=>"O'Reilly"}) \ + # {|el| ... } + def XPath::each element, path=nil, namespaces=nil, variables={}, &block + raise "The namespaces argument, if supplied, must be a hash object." unless namespaces.nil? or namespaces.kind_of?(Hash) + raise "The variables argument, if supplied, must be a hash object." unless variables.kind_of?(Hash) + parser = XPathParser.new + parser.namespaces = namespaces + parser.variables = variables + path = "*" unless path + element = [element] unless element.kind_of? 
# Core-class extensions giving REXML::XPathParser one uniform deep-copy
# call, +dclone+, that works on every value it may find in a parsed path.

class Object
  # Default: a plain +clone+.
  def dclone
    clone
  end
end
class Symbol
  # Symbols are returned as-is rather than cloned.
  def dclone ; self ; end
end
# Integer rather than the legacy Fixnum constant: Fixnum was folded into
# Integer in Ruby 2.4 and the constant is gone in current Rubies, where
# reopening it would only define an unrelated empty class. On older Rubies
# Integer is the superclass of Fixnum, so the method is still inherited.
class Integer
  # Integers are returned as-is rather than cloned.
  def dclone ; self ; end
end
class Float
  # Floats are returned as-is rather than cloned.
  def dclone ; self ; end
end
class Array
  # Deep copy: starts from a clone of the receiver, empties it, then
  # appends the +dclone+ of every element in order.
  def dclone
    klone = self.clone
    klone.clear
    self.each{|v| klone << v.dclone}
    klone
  end
end
# Performs a depth-first (document order) XPath search, and returns the
# first match.  This is the fastest, lightest way to return a single result.
#
# path_stack:: parsed path as a flat array of step symbols; a :qname step is
#              followed by two entries, the second of which is the name
#              compared against node.name
# node::       context node; must respond to +children+ and +name+ for the
#              steps that use them
#
# Returns the matching node, or nil when nothing matches or the path is
# exhausted without ending on a :qname step.
#
# FIXME: This method is incomplete! Only the steps listed below are handled.
def first( path_stack, node )
  # The parameter is path_stack; the body used to refer to an undefined
  # local called +path+, which raised NameError on every call.
  return nil if path_stack.size == 0

  case path_stack[0]
  when :document, :node, :any
    # These steps do not filter; continue with the rest of the path.
    return first( path_stack[1..-1], node )
  when :child
    node.children.each do |c|
      r = first( path_stack[1..-1], c )
      return r if r
    end
  when :qname
    name = path_stack[2]
    if node.name == name
      return node if path_stack.size == 3
      return first( path_stack[3..-1], node )
    else
      return nil
    end
  when :descendant_or_self
    # Try the node itself, then retry the same path on each child.
    r = first( path_stack[1..-1], node )
    return r if r
    node.children.each do |c|
      r = first( path_stack, c )
      return r if r
    end
  end
  return nil
end
# Expr takes a stack of path elements and a set of nodes (either a Parent
# or an Array) and returns an Array of matching nodes.
#
# path_stack:: parsed XPath as an array; it is consumed (shifted) while it
#              is evaluated, so callers pass a copy when they need it again
# nodeset::    the current node set; some steps filter it in place
# context::    optional Hash handed through to operator and function
#              evaluation
#
# Axis and node-test steps narrow or replace +nodeset+ and the loop goes on
# with the next step. Literal, variable, operator and function steps return
# their value immediately, which need not be an Array.
ALL = [ :attribute, :element, :text, :processing_instruction, :comment ]
ELEMENTS = [ :element ]
def expr( path_stack, nodeset, context=nil )
  node_types = ELEMENTS
  return nodeset if path_stack.length == 0 || nodeset.length == 0
  while path_stack.length > 0
    if nodeset.length == 0
      path_stack.clear
      return []
    end
    case (op = path_stack.shift)
    when :document
      nodeset = [ nodeset[0].root_node ]

    when :qname
      # A :qname step is followed by the prefix and then the local name.
      prefix = path_stack.shift
      name = path_stack.shift
      nodeset.delete_if do |node|
        # FIXME: This DOUBLES the time XPath searches take
        ns = get_namespace( node, prefix )
        !(node.node_type == :element and
          node.name == name and
          node.namespace == ns )
      end
      node_types = ELEMENTS

    when :any
      nodeset.delete_if { |node| !node_types.include?(node.node_type) }

    when :self
      # This space left intentionally blank

    when :processing_instruction
      target = path_stack.shift
      nodeset.delete_if do |node|
        (node.node_type != :processing_instruction) or
        ( target!='' and ( node.target != target ) )
      end

    when :text
      nodeset.delete_if { |node| node.node_type != :text }

    when :comment
      nodeset.delete_if { |node| node.node_type != :comment }

    when :node
      # This space left intentionally blank
      node_types = ALL

    when :child
      new_nodeset = []
      nt = nil
      nodeset.each do |node|
        nt = node.node_type
        new_nodeset += node.children if nt == :element or nt == :document
      end
      nodeset = new_nodeset
      node_types = ELEMENTS

    when :literal
      return path_stack.shift

    when :attribute
      new_nodeset = []
      case path_stack.shift
      when :qname
        prefix = path_stack.shift
        name = path_stack.shift
        nodeset.each do |element|
          if element.node_type == :element
            attrib = element.attribute( name, get_namespace(element, prefix) )
            new_nodeset << attrib if attrib
          end
        end
      when :any
        nodeset.each do |element|
          if element.node_type == :element
            new_nodeset += element.attributes.to_a
          end
        end
      end
      nodeset = new_nodeset

    when :parent
      nodeset = nodeset.collect{|n| n.parent}.compact
      #nodeset = expr(path_stack.dclone, nodeset.collect{|n| n.parent}.compact)
      node_types = ELEMENTS

    when :ancestor
      new_nodeset = []
      nodeset.each do |node|
        while node.parent
          node = node.parent
          new_nodeset << node unless new_nodeset.include? node
        end
      end
      nodeset = new_nodeset
      node_types = ELEMENTS

    when :ancestor_or_self
      new_nodeset = []
      nodeset.each do |node|
        if node.node_type == :element
          new_nodeset << node
          while ( node.parent )
            node = node.parent
            new_nodeset << node unless new_nodeset.include? node
          end
        end
      end
      nodeset = new_nodeset
      node_types = ELEMENTS

    when :predicate
      new_nodeset = []
      subcontext = { :size => nodeset.size }
      pred = path_stack.shift
      nodeset.each_with_index { |node, index|
        subcontext[ :node ] = node
        subcontext[ :index ] = index+1
        # Evaluation consumes the stack, so each node gets its own copy.
        pc = pred.dclone
        result = expr( pc, [node], subcontext )
        result = result[0] if result.kind_of? Array and result.length == 1
        if result.kind_of? Numeric
          # A numeric predicate is a test against the 1-based position.
          new_nodeset << node if result == (index+1)
        elsif result.instance_of? Array
          if result.size > 0 and result.inject(false) {|k,s| s or k}
            new_nodeset << node if result.size > 0
          end
        else
          new_nodeset << node if result
        end
      }
      nodeset = new_nodeset
      # Earlier implementation, kept for reference:
      #   predicate = path_stack.shift
      #   ns = nodeset.clone
      #   result = expr( predicate, ns )
      #   if result.kind_of? Array
      #     nodeset = result.zip(ns).collect{|m,n| n if m}.compact
      #   else
      #     nodeset = result ? nodeset : []
      #   end

    when :descendant_or_self
      # The helper evaluates the rest of the path itself, so the stack is
      # emptied afterwards to end the loop.
      rv = descendant_or_self( path_stack, nodeset )
      path_stack.clear
      nodeset = rv
      node_types = ELEMENTS

    when :descendant
      results = []
      nt = nil
      nodeset.each do |node|
        nt = node.node_type
        results += expr( path_stack.dclone.unshift( :descendant_or_self ),
          node.children ) if nt == :element or nt == :document
      end
      nodeset = results
      node_types = ELEMENTS

    when :following_sibling
      results = []
      nodeset.each do |node|
        next if node.parent.nil?
        all_siblings = node.parent.children
        current_index = all_siblings.index( node )
        following_siblings = all_siblings[ current_index+1 .. -1 ]
        results += expr( path_stack.dclone, following_siblings )
      end
      nodeset = results

    when :preceding_sibling
      results = []
      nodeset.each do |node|
        next if node.parent.nil?
        all_siblings = node.parent.children
        current_index = all_siblings.index( node )
        preceding_siblings = all_siblings[ 0, current_index ].reverse
        results += preceding_siblings
      end
      nodeset = results
      node_types = ELEMENTS

    when :preceding
      new_nodeset = []
      nodeset.each do |node|
        new_nodeset += preceding( node )
      end
      nodeset = new_nodeset
      node_types = ELEMENTS

    when :following
      new_nodeset = []
      nodeset.each do |node|
        new_nodeset += following( node )
      end
      nodeset = new_nodeset
      node_types = ELEMENTS

    when :namespace
      new_nodeset = []
      prefix = path_stack.shift
      nodeset.each do |node|
        if (node.node_type == :element or node.node_type == :attribute)
          if @namespaces
            namespaces = @namespaces
          elsif (node.node_type == :element)
            namespaces = node.namespaces
          else
            # Attribute node: use the mapping of its owning element.
            # (This call was misspelled "namesapces" and raised
            # NoMethodError.)
            namespaces = node.element.namespaces
          end
          if (node.namespace == namespaces[prefix])
            new_nodeset << node
          end
        end
      end
      nodeset = new_nodeset

    when :variable
      var_name = path_stack.shift
      return @variables[ var_name ]

    # :and, :or, :eq, :neq, :lt, :lteq, :gt, :gteq
    # TODO: Special case for :or and :and -- not evaluate the right
    # operand if the left alone determines result (i.e. is true for
    # :or and false for :and).
    when :eq, :neq, :lt, :lteq, :gt, :gteq, :or
      left = expr( path_stack.shift, nodeset.dup, context )
      right = expr( path_stack.shift, nodeset.dup, context )
      res = equality_relational_compare( left, op, right )
      return res

    when :and
      left = expr( path_stack.shift, nodeset.dup, context )
      return [] unless left
      if left.respond_to?(:inject) and !left.inject(false) {|a,b| a | b}
        return []
      end
      right = expr( path_stack.shift, nodeset.dup, context )
      res = equality_relational_compare( left, op, right )
      return res

    when :div
      left = Functions::number(expr(path_stack.shift, nodeset, context)).to_f
      right = Functions::number(expr(path_stack.shift, nodeset, context)).to_f
      return (left / right)

    when :mod
      left = Functions::number(expr(path_stack.shift, nodeset, context )).to_f
      right = Functions::number(expr(path_stack.shift, nodeset, context )).to_f
      return (left % right)

    when :mult
      left = Functions::number(expr(path_stack.shift, nodeset, context )).to_f
      right = Functions::number(expr(path_stack.shift, nodeset, context )).to_f
      return (left * right)

    when :plus
      left = Functions::number(expr(path_stack.shift, nodeset, context )).to_f
      right = Functions::number(expr(path_stack.shift, nodeset, context )).to_f
      return (left + right)

    when :minus
      left = Functions::number(expr(path_stack.shift, nodeset, context )).to_f
      right = Functions::number(expr(path_stack.shift, nodeset, context )).to_f
      return (left - right)

    when :union
      left = expr( path_stack.shift, nodeset, context )
      right = expr( path_stack.shift, nodeset, context )
      return (left | right)

    when :neg
      res = expr( path_stack, nodeset, context )
      return -(res.to_f)

    when :not
    when :function
      # Function names use "-" in XPath; they are mapped to "_" before
      # being dispatched to Functions.
      func_name = path_stack.shift.tr('-','_')
      arguments = path_stack.shift
      subcontext = context ? nil : { :size => nodeset.size }

      res = []
      cont = context
      nodeset.each_with_index { |n, i|
        if subcontext
          subcontext[:node] = n
          subcontext[:index] = i
          cont = subcontext
        end
        arg_clone = arguments.dclone
        args = arg_clone.collect { |arg|
          expr( arg, [n], cont )
        }
        Functions.context = cont
        res << Functions.send( func_name, *args )
      }
      return res

    end
  end # while
  return nodeset
end
node.element : node + while np.parent and np.parent.node_type == :element + node_idx << np.parent.index( np ) + np = np.parent + end + new_arry << [ node_idx.reverse, node ] + } + new_arry.sort{ |s1, s2| s1[0] <=> s2[0] }.collect{ |s| s[1] } + end + + + def recurse( nodeset, &block ) + for node in nodeset + yield node + recurse( node, &block ) if node.node_type == :element + end + end + + + + # Builds a nodeset of all of the preceding nodes of the supplied node, + # in reverse document order + # preceding:: includes every element in the document that precedes this node, + # except for ancestors + def preceding( node ) + ancestors = [] + p = node.parent + while p + ancestors << p + p = p.parent + end + + acc = [] + p = preceding_node_of( node ) + while p + if ancestors.include? p + ancestors.delete(p) + else + acc << p + end + p = preceding_node_of( p ) + end + acc + end + + def preceding_node_of( node ) + psn = node.previous_sibling_node + if psn.nil? + if node.parent.nil? or node.parent.class == Document + return nil + end + return node.parent + #psn = preceding_node_of( node.parent ) + end + while psn and psn.kind_of? Element and psn.children.size > 0 + psn = psn.children[-1] + end + psn + end + + def following( node ) + acc = [] + p = next_sibling_node( node ) + while p + acc << p + p = following_node_of( p ) + end + acc + end + + def following_node_of( node ) + if node.kind_of? Element and node.children.size > 0 + return node.children[0] + end + return next_sibling_node(node) + end + + def next_sibling_node(node) + psn = node.next_sibling_node + while psn.nil? + if node.parent.nil? 
or node.parent.class == Document + return nil + end + node = node.parent + psn = node.next_sibling_node + end + return psn + end + + def norm b + case b + when true, false + return b + when 'true', 'false' + return Functions::boolean( b ) + when /^\d+(\.\d+)?$/ + return Functions::number( b ) + else + return Functions::string( b ) + end + end + + def equality_relational_compare( set1, op, set2 ) + if set1.kind_of? Array and set2.kind_of? Array + if set1.size == 1 and set2.size == 1 + set1 = set1[0] + set2 = set2[0] + elsif set1.size == 0 or set2.size == 0 + nd = set1.size==0 ? set2 : set1 + rv = nd.collect { |il| compare( il, op, nil ) } + return rv + else + res = [] + SyncEnumerator.new( set1, set2 ).each { |i1, i2| + i1 = norm( i1 ) + i2 = norm( i2 ) + res << compare( i1, op, i2 ) + } + return res + end + end + # If one is nodeset and other is number, compare number to each item + # in nodeset s.t. number op number(string(item)) + # If one is nodeset and other is string, compare string to each item + # in nodeset s.t. string op string(item) + # If one is nodeset and other is boolean, compare boolean to each item + # in nodeset s.t. boolean op boolean(item) + if set1.kind_of? Array or set2.kind_of? Array + if set1.kind_of? 
Array + a = set1 + b = set2 + else + a = set2 + b = set1 + end + + case b + when true, false + return a.collect {|v| compare( Functions::boolean(v), op, b ) } + when Numeric + return a.collect {|v| compare( Functions::number(v), op, b )} + when /^\d+(\.\d+)?$/ + b = Functions::number( b ) + return a.collect {|v| compare( Functions::number(v), op, b )} + else + b = Functions::string( b ) + return a.collect { |v| compare( Functions::string(v), op, b ) } + end + else + # If neither is nodeset, + # If op is = or != + # If either boolean, convert to boolean + # If either number, convert to number + # Else, convert to string + # Else + # Convert both to numbers and compare + s1 = set1.to_s + s2 = set2.to_s + if s1 == 'true' or s1 == 'false' or s2 == 'true' or s2 == 'false' + set1 = Functions::boolean( set1 ) + set2 = Functions::boolean( set2 ) + else + if op == :eq or op == :neq + if s1 =~ /^\d+(\.\d+)?$/ or s2 =~ /^\d+(\.\d+)?$/ + set1 = Functions::number( s1 ) + set2 = Functions::number( s2 ) + else + set1 = Functions::string( set1 ) + set2 = Functions::string( set2 ) + end + else + set1 = Functions::number( set1 ) + set2 = Functions::number( set2 ) + end + end + return compare( set1, op, set2 ) + end + return false + end + + def compare a, op, b + case op + when :eq + a == b + when :neq + a != b + when :lt + a < b + when :lteq + a <= b + when :gt + a > b + when :gteq + a >= b + when :and + a and b + when :or + a or b + else + false + end + end + end +end |