diff options
author | Jari Vetoniemi <jari.vetoniemi@indooratlas.com> | 2020-03-16 18:49:26 +0900 |
---|---|---|
committer | Jari Vetoniemi <jari.vetoniemi@indooratlas.com> | 2020-03-30 00:39:06 +0900 |
commit | fcbf63e62c627deae76c1b8cb8c0876c536ed811 (patch) | |
tree | 64cb17de3f41a2b6fef2368028fbd00349946994 /jni/ruby/lib/rexml/source.rb |
Fresh start
Diffstat (limited to 'jni/ruby/lib/rexml/source.rb')
-rw-r--r-- | jni/ruby/lib/rexml/source.rb | 296 |
1 files changed, 296 insertions, 0 deletions
diff --git a/jni/ruby/lib/rexml/source.rb b/jni/ruby/lib/rexml/source.rb new file mode 100644 index 0000000..cfafdbd --- /dev/null +++ b/jni/ruby/lib/rexml/source.rb @@ -0,0 +1,296 @@ +# coding: US-ASCII +require 'rexml/encoding' + +module REXML + # Generates Source-s. USE THIS CLASS. + class SourceFactory + # Generates a Source object + # @param arg Either a String, or an IO + # @return a Source, or nil if a bad argument was given + def SourceFactory::create_from(arg) + if arg.respond_to? :read and + arg.respond_to? :readline and + arg.respond_to? :nil? and + arg.respond_to? :eof? + IOSource.new(arg) + elsif arg.respond_to? :to_str + require 'stringio' + IOSource.new(StringIO.new(arg)) + elsif arg.kind_of? Source + arg + else + raise "#{arg.class} is not a valid input stream. It must walk \n"+ + "like either a String, an IO, or a Source." + end + end + end + + # A Source can be searched for patterns, and wraps buffers and other + # objects and provides consumption of text + class Source + include Encoding + # The current buffer (what we're going to read next) + attr_reader :buffer + # The line number of the last consumed text + attr_reader :line + attr_reader :encoding + + # Constructor + # @param arg must be a String, and should be a valid XML document + # @param encoding if non-null, sets the encoding of the source to this + # value, overriding all encoding detection + def initialize(arg, encoding=nil) + @orig = @buffer = arg + if encoding + self.encoding = encoding + else + detect_encoding + end + @line = 0 + end + + + # Inherited from Encoding + # Overridden to support optimized en/decoding + def encoding=(enc) + return unless super + encoding_updated + end + + # Scans the source for a given pattern. Note, that this is not your + # usual scan() method. For one thing, the pattern argument has some + # requirements; for another, the source can be consumed. You can easily + # confuse this method. Originally, the patterns were easier + # to construct and this method more robust, because this method + # generated search regexps on the fly; however, this was + # computationally expensive and slowed down the entire REXML package + # considerably, since this is by far the most commonly called method. + # @param pattern must be a Regexp, and must be in the form of + # /^\s*(#{your pattern, with no groups})(.*)/. The first group + # will be returned; the second group is used if the consume flag is + # set. + # @param consume if true, the pattern returned will be consumed, leaving + # everything after it in the Source. + # @return the pattern, if found, or nil if the Source is empty or the + # pattern is not found. + def scan(pattern, cons=false) + return nil if @buffer.nil? + rv = @buffer.scan(pattern) + @buffer = $' if cons and rv.size>0 + rv + end + + def read + end + + def consume( pattern ) + @buffer = $' if pattern.match( @buffer ) + end + + def match_to( char, pattern ) + return pattern.match(@buffer) + end + + def match_to_consume( char, pattern ) + md = pattern.match(@buffer) + @buffer = $' + return md + end + + def match(pattern, cons=false) + md = pattern.match(@buffer) + @buffer = $' if cons and md + return md + end + + # @return true if the Source is exhausted + def empty? + @buffer == "" + end + + def position + @orig.index( @buffer ) + end + + # @return the current line in the source + def current_line + lines = @orig.split + res = lines.grep @buffer[0..30] + res = res[-1] if res.kind_of? Array + lines.index( res ) if res + end + + private + def detect_encoding + buffer_encoding = @buffer.encoding + detected_encoding = "UTF-8" + begin + @buffer.force_encoding("ASCII-8BIT") + if @buffer[0, 2] == "\xfe\xff" + @buffer[0, 2] = "" + detected_encoding = "UTF-16BE" + elsif @buffer[0, 2] == "\xff\xfe" + @buffer[0, 2] = "" + detected_encoding = "UTF-16LE" + elsif @buffer[0, 3] == "\xef\xbb\xbf" + @buffer[0, 3] = "" + detected_encoding = "UTF-8" + end + ensure + @buffer.force_encoding(buffer_encoding) + end + self.encoding = detected_encoding + end + + def encoding_updated + if @encoding != 'UTF-8' + @buffer = decode(@buffer) + @to_utf = true + else + @to_utf = false + @buffer.force_encoding ::Encoding::UTF_8 + end + end + end + + # A Source that wraps an IO. See the Source class for method + # documentation + class IOSource < Source + #attr_reader :block_size + + # block_size has been deprecated + def initialize(arg, block_size=500, encoding=nil) + @er_source = @source = arg + @to_utf = false + @pending_buffer = nil + + if encoding + super("", encoding) + else + super(@source.read(3) || "") + end + + if !@to_utf and + @buffer.respond_to?(:force_encoding) and + @source.respond_to?(:external_encoding) and + @source.external_encoding != ::Encoding::UTF_8 + @force_utf8 = true + else + @force_utf8 = false + end + end + + def scan(pattern, cons=false) + rv = super + # You'll notice that this next section is very similar to the same + # section in match(), but just a liiittle different. This is + # because it is a touch faster to do it this way with scan() + # than the way match() does it; enough faster to warrent duplicating + # some code + if rv.size == 0 + until @buffer =~ pattern or @source.nil? + begin + @buffer << readline + rescue Iconv::IllegalSequence + raise + rescue + @source = nil + end + end + rv = super + end + rv.taint + rv + end + + def read + begin + @buffer << readline + rescue Exception, NameError + @source = nil + end + end + + def consume( pattern ) + match( pattern, true ) + end + + def match( pattern, cons=false ) + rv = pattern.match(@buffer) + @buffer = $' if cons and rv + while !rv and @source + begin + @buffer << readline + rv = pattern.match(@buffer) + @buffer = $' if cons and rv + rescue + @source = nil + end + end + rv.taint + rv + end + + def empty? + super and ( @source.nil? || @source.eof? ) + end + + def position + @er_source.pos rescue 0 + end + + # @return the current line in the source + def current_line + begin + pos = @er_source.pos # The byte position in the source + lineno = @er_source.lineno # The XML < position in the source + @er_source.rewind + line = 0 # The \r\n position in the source + begin + while @er_source.pos < pos + @er_source.readline + line += 1 + end + rescue + end + rescue IOError + pos = -1 + line = -1 + end + [pos, lineno, line] + end + + private + def readline + str = @source.readline(@line_break) + if @pending_buffer + if str.nil? + str = @pending_buffer + else + str = @pending_buffer + str + end + @pending_buffer = nil + end + return nil if str.nil? + + if @to_utf + decode(str) + else + str.force_encoding(::Encoding::UTF_8) if @force_utf8 + str + end + end + + def encoding_updated + case @encoding + when "UTF-16BE", "UTF-16LE" + @source.binmode + @source.set_encoding(@encoding, @encoding) + end + @line_break = encode(">") + @pending_buffer, @buffer = @buffer, "" + @pending_buffer.force_encoding(@encoding) + super + end + end +end |