From fcbf63e62c627deae76c1b8cb8c0876c536ed811 Mon Sep 17 00:00:00 2001
From: Jari Vetoniemi
Date: Mon, 16 Mar 2020 18:49:26 +0900
Subject: Fresh start

---
 jni/ruby/lib/rdoc/markup/parser.rb | 558 +++++++++++++++++++++++++++++++++++++
 1 file changed, 558 insertions(+)
 create mode 100644 jni/ruby/lib/rdoc/markup/parser.rb

(limited to 'jni/ruby/lib/rdoc/markup/parser.rb')

diff --git a/jni/ruby/lib/rdoc/markup/parser.rb b/jni/ruby/lib/rdoc/markup/parser.rb
new file mode 100644
index 0000000..cc828a4
--- /dev/null
+++ b/jni/ruby/lib/rdoc/markup/parser.rb
@@ -0,0 +1,558 @@
+require 'strscan'
+
+##
+# A recursive-descent parser for RDoc markup.
+#
+# The parser tokenizes an input string then parses the tokens into a Document.
+# Documents can be converted into output formats by writing a visitor like
+# RDoc::Markup::ToHTML.
+#
+# The parser only handles the block-level constructs Paragraph, List,
+# ListItem, Heading, Verbatim, BlankLine and Rule. Inline markup such as
+# \+blah\+ is handled separately by RDoc::Markup::AttributeManager.
+#
+# To see what markup the Parser implements read RDoc. To see how to use
+# RDoc markup to format text in your program read RDoc::Markup.
+
+class RDoc::Markup::Parser
+
+  include RDoc::Text
+
+  ##
+  # List token types
+
+  LIST_TOKENS = [
+    :BULLET,
+    :LABEL,
+    :LALPHA,
+    :NOTE,
+    :NUMBER,
+    :UALPHA,
+  ]
+
+  ##
+  # Parser error subclass
+
+  class Error < RuntimeError; end
+
+  ##
+  # Raised when the parser is unable to handle the given markup
+
+  class ParseError < Error; end
+
+  ##
+  # Enables display of debugging information
+
+  attr_accessor :debug
+
+  ##
+  # Token accessor
+
+  attr_reader :tokens
+
+  ##
+  # Parses +str+ into a Document.
+  #
+  # Use RDoc::Markup#parse instead of this method.
+
+  def self.parse str
+    parser = new
+    parser.tokenize str
+    doc = RDoc::Markup::Document.new
+    parser.parse doc
+  end
+
+  ##
+  # Returns a token stream for +str+, for testing
+
+  def self.tokenize str
+    parser = new
+    parser.tokenize str
+    parser.tokens
+  end
+
+  ##
+  # Creates a new Parser. See also ::parse
+
+  def initialize
+    @binary_input = nil
+    @current_token = nil
+    @debug = false
+    @have_encoding = Object.const_defined? :Encoding
+    @have_byteslice = ''.respond_to? :byteslice
+    @input = nil
+    @input_encoding = nil
+    @line = 0
+    @line_pos = 0
+    @s = nil
+    @tokens = []
+  end
+
+  ##
+  # Builds a Heading of +level+
+
+  def build_heading level
+    type, text, = get
+
+    text = case type
+           when :TEXT then
+             skip :NEWLINE
+             text
+           else
+             unget
+             ''
+           end
+
+    RDoc::Markup::Heading.new level, text
+  end
+
+  ##
+  # Builds a List flush to +margin+
+
+  def build_list margin
+    p :list_start => margin if @debug
+
+    list = RDoc::Markup::List.new
+    label = nil
+
+    until @tokens.empty? do
+      type, data, column, = get
+
+      case type
+      when *LIST_TOKENS then
+        if column < margin || (list.type && list.type != type) then
+          unget
+          break
+        end
+
+        list.type = type
+        peek_type, _, column, = peek_token
+
+        case type
+        when :NOTE, :LABEL then
+          label = [] unless label
+
+          if peek_type == :NEWLINE then
+            # description not on the same line as LABEL/NOTE
+            # skip the trailing newline & any blank lines below
+            while peek_type == :NEWLINE
+              get
+              peek_type, _, column, = peek_token
+            end
+
+            # we may be:
+            #   - at end of stream
+            #   - at a column < margin:
+            #         [text]
+            #       blah blah blah
+            #   - at the same column, but with a different type of list item
+            #       [text]
+            #     * blah blah
+            #   - at the same column, with the same type of list item
+            #       [one]
+            #       [two]
+            # In all cases, we have an empty description.
+            # In the last case only, we continue.
+            if peek_type.nil? || column < margin then
+              empty = true
+            elsif column == margin then
+              case peek_type
+              when type
+                empty = :continue
+              when *LIST_TOKENS
+                empty = true
+              else
+                empty = false
+              end
+            else
+              empty = false
+            end
+
+            if empty then
+              label << data
+              next if empty == :continue
+              break
+            end
+          end
+        else
+          data = nil
+        end
+
+        if label then
+          data = label << data
+          label = nil
+        end
+
+        list_item = RDoc::Markup::ListItem.new data
+        parse list_item, column
+        list << list_item
+
+      else
+        unget
+        break
+      end
+    end
+
+    p :list_end => margin if @debug
+
+    if list.empty? then
+      return nil unless label
+      return nil unless [:LABEL, :NOTE].include? list.type
+
+      list_item = RDoc::Markup::ListItem.new label, RDoc::Markup::BlankLine.new
+      list << list_item
+    end
+
+    list
+  end
+
+  ##
+  # Builds a Paragraph that is flush to +margin+
+
+  def build_paragraph margin
+    p :paragraph_start => margin if @debug
+
+    paragraph = RDoc::Markup::Paragraph.new
+
+    until @tokens.empty? do
+      type, data, column, = get
+
+      if type == :TEXT and column == margin then
+        paragraph << data
+
+        break if peek_token.first == :BREAK
+
+        data << ' ' if skip :NEWLINE
+      else
+        unget
+        break
+      end
+    end
+
+    paragraph.parts.last.sub!(/ \z/, '') # cleanup
+
+    p :paragraph_end => margin if @debug
+
+    paragraph
+  end
+
+  ##
+  # Builds a Verbatim that is indented from +margin+.
+  #
+  # The verbatim block is shifted left (the least indented lines start in
+  # column 0). Each part of the verbatim is one line of text, always
+  # terminated by a newline. Blank lines always consist of a single newline
+  # character, and there is never a single newline at the end of the verbatim.
+
+  def build_verbatim margin
+    p :verbatim_begin => margin if @debug
+    verbatim = RDoc::Markup::Verbatim.new
+
+    min_indent = nil
+    generate_leading_spaces = true
+    line = ''
+
+    until @tokens.empty? do
+      type, data, column, = get
+
+      if type == :NEWLINE then
+        line << data
+        verbatim << line
+        line = ''
+        generate_leading_spaces = true
+        next
+      end
+
+      if column <= margin
+        unget
+        break
+      end
+
+      if generate_leading_spaces then
+        indent = column - margin
+        line << ' ' * indent
+        min_indent = indent if min_indent.nil? || indent < min_indent
+        generate_leading_spaces = false
+      end
+
+      case type
+      when :HEADER then
+        line << '=' * data
+        _, _, peek_column, = peek_token
+        peek_column ||= column + data
+        indent = peek_column - column - data
+        line << ' ' * indent
+      when :RULE then
+        width = 2 + data
+        line << '-' * width
+        _, _, peek_column, = peek_token
+        peek_column ||= column + width
+        indent = peek_column - column - width
+        line << ' ' * indent
+      when :BREAK, :TEXT then
+        line << data
+      else # *LIST_TOKENS
+        list_marker = case type
+                      when :BULLET then data
+                      when :LABEL then "[#{data}]"
+                      when :NOTE then "#{data}::"
+                      else # :LALPHA, :NUMBER, :UALPHA
+                        "#{data}."
+                      end
+        line << list_marker
+        peek_type, _, peek_column = peek_token
+        unless peek_type == :NEWLINE then
+          peek_column ||= column + list_marker.length
+          indent = peek_column - column - list_marker.length
+          line << ' ' * indent
+        end
+      end
+
+    end
+
+    verbatim << line << "\n" unless line.empty?
+    verbatim.parts.each { |p| p.slice!(0, min_indent) unless p == "\n" } if min_indent > 0
+    verbatim.normalize
+
+    p :verbatim_end => margin if @debug
+
+    verbatim
+  end
+
+  ##
+  # The character offset for the input string at the given +byte_offset+
+
+  def char_pos byte_offset
+    if @have_byteslice then
+      @input.byteslice(0, byte_offset).length
+    elsif @have_encoding then
+      matched = @binary_input[0, byte_offset]
+      matched.force_encoding @input_encoding
+      matched.length
+    else
+      byte_offset
+    end
+  end
+
+  ##
+  # Pulls the next token from the stream.
+
+  def get
+    @current_token = @tokens.shift
+    p :get => @current_token if @debug
+    @current_token
+  end
+
+  ##
+  # Parses the tokens into an array of RDoc::Markup::XXX objects,
+  # and appends them to the passed +parent+ RDoc::Markup::YYY object.
+  #
+  # Exits at the end of the token stream, or when it encounters a token
+  # in a column less than +indent+ (unless it is a NEWLINE).
+  #
+  # Returns +parent+.
+
+  def parse parent, indent = 0
+    p :parse_start => indent if @debug
+
+    until @tokens.empty? do
+      type, data, column, = get
+
+      case type
+      when :BREAK then
+        parent << RDoc::Markup::BlankLine.new
+        skip :NEWLINE, false
+        next
+      when :NEWLINE then
+        # trailing newlines are skipped below, so this is a blank line
+        parent << RDoc::Markup::BlankLine.new
+        skip :NEWLINE, false
+        next
+      end
+
+      # indentation change: break or verbatim
+      if column < indent then
+        unget
+        break
+      elsif column > indent then
+        unget
+        parent << build_verbatim(indent)
+        next
+      end
+
+      # indentation is the same
+      case type
+      when :HEADER then
+        parent << build_heading(data)
+      when :RULE then
+        parent << RDoc::Markup::Rule.new(data)
+        skip :NEWLINE
+      when :TEXT then
+        unget
+        parse_text parent, indent
+      when *LIST_TOKENS then
+        unget
+        parent << build_list(indent)
+      else
+        type, data, column, line = @current_token
+        raise ParseError, "Unhandled token #{type} (#{data.inspect}) at #{line}:#{column}"
+      end
+    end
+
+    p :parse_end => indent if @debug
+
+    parent
+
+  end
+
+  ##
+  # Small hook that is overridden by RDoc::TomDoc
+
+  def parse_text parent, indent # :nodoc:
+    parent << build_paragraph(indent)
+  end
+
+  ##
+  # Returns the next token on the stream without modifying the stream
+
+  def peek_token
+    token = @tokens.first || []
+    p :peek => token if @debug
+    token
+  end
+
+  ##
+  # Creates the StringScanner
+
+  def setup_scanner input
+    @line = 0
+    @line_pos = 0
+    @input = input.dup
+
+    if @have_encoding and not @have_byteslice then
+      @input_encoding = @input.encoding
+      @binary_input = @input.force_encoding Encoding::BINARY
+    end
+
+    @s = StringScanner.new input
+  end
+
+  ##
+  # Skips the next token if its type is +token_type+.
+  #
+  # Optionally raises an error if the next token is not of the expected type.
+
+  def skip token_type, error = true
+    type, = get
+    return unless type # end of stream
+    return @current_token if token_type == type
+    unget
+    raise ParseError, "expected #{token_type} got #{@current_token.inspect}" if error
+  end
+
+  ##
+  # Turns text +input+ into a stream of tokens
+
+  def tokenize input
+    setup_scanner input
+
+    until @s.eos? do
+      pos = @s.pos
+
+      # leading spaces will be reflected by the column of the next token
+      # the only thing we loose are trailing spaces at the end of the file
+      next if @s.scan(/ +/)
+
+      # note: after BULLET, LABEL, etc.,
+      # indent will be the column of the next non-newline token
+
+      @tokens << case
+                 # [CR]LF => :NEWLINE
+                 when @s.scan(/\r?\n/) then
+                   token = [:NEWLINE, @s.matched, *token_pos(pos)]
+                   @line_pos = char_pos @s.pos
+                   @line += 1
+                   token
+                 # === text => :HEADER then :TEXT
+                 when @s.scan(/(=+)(\s*)/) then
+                   level = @s[1].length
+                   header = [:HEADER, level, *token_pos(pos)]
+
+                   if @s[2] =~ /^\r?\n/ then
+                     @s.pos -= @s[2].length
+                     header
+                   else
+                     pos = @s.pos
+                     @s.scan(/.*/)
+                     @tokens << header
+                     [:TEXT, @s.matched.sub(/\r$/, ''), *token_pos(pos)]
+                   end
+                 # --- (at least 3) and nothing else on the line => :RULE
+                 when @s.scan(/(-{3,}) *\r?$/) then
+                   [:RULE, @s[1].length - 2, *token_pos(pos)]
+                 # * or - followed by white space and text => :BULLET
+                 when @s.scan(/([*-]) +(\S)/) then
+                   @s.pos -= @s[2].bytesize # unget \S
+                   [:BULLET, @s[1], *token_pos(pos)]
+                 # A. text, a. text, 12. text => :UALPHA, :LALPHA, :NUMBER
+                 when @s.scan(/([a-z]|\d+)\. +(\S)/i) then
+                   # FIXME if tab(s), the column will be wrong
+                   # either support tabs everywhere by first expanding them to
+                   # spaces, or assume that they will have been replaced
+                   # before (and provide a check for that at least in debug
+                   # mode)
+                   list_label = @s[1]
+                   @s.pos -= @s[2].bytesize # unget \S
+                   list_type =
+                     case list_label
+                     when /[a-z]/ then :LALPHA
+                     when /[A-Z]/ then :UALPHA
+                     when /\d/ then :NUMBER
+                     else
+                       raise ParseError, "BUG token #{list_label}"
+                     end
+                   [list_type, list_label, *token_pos(pos)]
+                 # [text] followed by spaces or end of line => :LABEL
+                 when @s.scan(/\[(.*?)\]( +|\r?$)/) then
+                   [:LABEL, @s[1], *token_pos(pos)]
+                 # text:: followed by spaces or end of line => :NOTE
+                 when @s.scan(/(.*?)::( +|\r?$)/) then
+                   [:NOTE, @s[1], *token_pos(pos)]
+                 # anything else: :TEXT
+                 else @s.scan(/(.*?)( )?\r?$/)
+                   token = [:TEXT, @s[1], *token_pos(pos)]
+
+                   if @s[2] then
+                     @tokens << token
+                     [:BREAK, @s[2], *token_pos(pos + @s[1].length)]
+                   else
+                     token
+                   end
+                 end
+    end
+
+    self
+  end
+
+  ##
+  # Calculates the column (by character) and line of the current token based
+  # on +byte_offset+.
+
+  def token_pos byte_offset
+    offset = char_pos byte_offset
+
+    [offset - @line_pos, @line]
+  end
+
+  ##
+  # Returns the current token to the token stream
+
+  def unget
+    token = @current_token
+    p :unget => token if @debug
+    raise Error, 'too many #ungets' if token == @tokens.first
+    @tokens.unshift token if token
+  end
+
+end
+
-- cgit v1.2.3
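
For reviewers who want to exercise the file this patch adds, here is a minimal usage sketch. It is not part of the patch: it assumes a stock Ruby where require 'rdoc' makes RDoc::Markup::Parser loadable, and it uses only the two class-level entry points defined above (::tokenize for the raw token stream, ::parse for the block-level Document).

require 'rdoc'

text = "= Heading\n\nSome paragraph text.\n\n* first bullet\n* second bullet\n"

# Raw token stream, e.g. [:HEADER, 1, 0, 0] followed by [:TEXT, "Heading", 2, 0].
RDoc::Markup::Parser.tokenize(text).each do |token|
  p token
end

# Block-level parse; the parts should come out roughly as
# [Heading, BlankLine, Paragraph, BlankLine, List].
doc = RDoc::Markup::Parser.parse text
p doc.parts.map(&:class)

Note that inline markup inside the resulting blocks is still untouched at this stage; as the class comment explains, that is handled separately by RDoc::Markup::AttributeManager.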