summaryrefslogtreecommitdiff
path: root/jni/ruby/tool/transcode-tblgen.rb
diff options
context:
space:
mode:
authorJari Vetoniemi <jari.vetoniemi@indooratlas.com>2020-03-16 18:49:26 +0900
committerJari Vetoniemi <jari.vetoniemi@indooratlas.com>2020-03-30 00:39:06 +0900
commitfcbf63e62c627deae76c1b8cb8c0876c536ed811 (patch)
tree64cb17de3f41a2b6fef2368028fbd00349946994 /jni/ruby/tool/transcode-tblgen.rb
Fresh start
Diffstat (limited to 'jni/ruby/tool/transcode-tblgen.rb')
-rw-r--r--jni/ruby/tool/transcode-tblgen.rb1074
1 files changed, 1074 insertions, 0 deletions
diff --git a/jni/ruby/tool/transcode-tblgen.rb b/jni/ruby/tool/transcode-tblgen.rb
new file mode 100644
index 0000000..832d9a4
--- /dev/null
+++ b/jni/ruby/tool/transcode-tblgen.rb
@@ -0,0 +1,1074 @@
+require 'optparse'
+require 'erb'
+require 'fileutils'
+require 'pp'
+
+class Array
+ unless [].respond_to? :product
+ def product(*args)
+ if args.empty?
+ self.map {|e| [e] }
+ else
+ result = []
+ self.each {|e0|
+ result.concat args.first.product(*args[1..-1]).map {|es| [e0, *es] }
+ }
+ result
+ end
+ end
+ end
+end
+
+class String
+ unless "".respond_to? :start_with?
+ def start_with?(*prefixes)
+ prefixes.each {|prefix|
+ return true if prefix.length <= self.length && prefix == self[0, prefix.length]
+ }
+ false
+ end
+ end
+end
+
+NUM_ELEM_BYTELOOKUP = 2
+
+C_ESC = {
+ "\\" => "\\\\",
+ '"' => '\"',
+ "\n" => '\n',
+}
+
+0x00.upto(0x1f) {|ch| C_ESC[[ch].pack("C")] ||= "\\%03o" % ch }
+0x7f.upto(0xff) {|ch| C_ESC[[ch].pack("C")] = "\\%03o" % ch }
+C_ESC_PAT = Regexp.union(*C_ESC.keys)
+
+def c_esc(str)
+ '"' + str.gsub(C_ESC_PAT) { C_ESC[$&] } + '"'
+end
+
+HEX2 = /(?:[0-9A-Fa-f]{2})/
+
+class ArrayCode
+ def initialize(type, name)
+ @type = type
+ @name = name
+ @len = 0;
+ @content = ''
+ end
+
+ def length
+ @len
+ end
+
+ def insert_at_last(num, str)
+ newnum = self.length + num
+ @content << str
+ @len += num
+ end
+
+ def to_s
+ <<"End"
+static const #{@type}
+#{@name}[#{@len}] = {
+#{@content}};
+End
+ end
+end
+
+class Action
+ def initialize(value)
+ @value = value
+ end
+ attr_reader :value
+
+ def hash
+ @value.hash
+ end
+
+ def eql?(other)
+ self.class == other.class &&
+ @value == other.value
+ end
+ alias == eql?
+end
+
+class Branch
+ def initialize(byte_min, byte_max, child_tree)
+ @byte_min = byte_min
+ @byte_max = byte_max
+ @child_tree = child_tree
+ @hash = byte_min.hash ^ byte_max.hash ^ child_tree.hash
+ end
+ attr_reader :byte_min, :byte_max, :child_tree, :hash
+
+ def eql?(other)
+ self.class == other.class &&
+ @hash == other.hash &&
+ @byte_min == other.byte_min &&
+ @byte_max == other.byte_max &&
+ @child_tree == other.child_tree
+ end
+ alias == eql?
+end
+
+class ActionMap
+ def self.parse_to_rects(mapping)
+ rects = []
+ n = 0
+ mapping.each {|pat, action|
+ pat = pat.to_s
+ if /\A\s*\(empset\)\s*\z/ =~ pat
+ next
+ elsif /\A\s*\(empstr\)\s*\z/ =~ pat
+ rects << ['', '', action]
+ n += 1
+ elsif /\A\s*(#{HEX2}+)\s*\z/o =~ pat
+ hex = $1.upcase
+ rects << [hex, hex, action]
+ elsif /\A\s*((#{HEX2}|\{#{HEX2}(?:-#{HEX2})?(,#{HEX2}(?:-#{HEX2})?)*\})+(\s+|\z))*\z/o =~ pat
+ pat = pat.upcase
+ pat.scan(/\S+/) {
+ pat1 = $&
+ ranges_list = []
+ pat1.scan(/#{HEX2}|\{([^\}]*)\}/o) {
+ ranges_list << []
+ if !$1
+ ranges_list.last << [$&,$&]
+ else
+ set = {}
+ $1.scan(/(#{HEX2})(?:-(#{HEX2}))?/o) {
+ if !$2
+ c = $1.to_i(16)
+ set[c] = true
+ else
+ b = $1.to_i(16)
+ e = $2.to_i(16)
+ b.upto(e) {|c| set[c] = true }
+ end
+ }
+ i = nil
+ 0.upto(256) {|j|
+ if set[j]
+ if !i
+ i = j
+ end
+ if !set[j+1]
+ ranges_list.last << ["%02X" % i, "%02X" % j]
+ i = nil
+ end
+ end
+ }
+ end
+ }
+ first_ranges = ranges_list.shift
+ first_ranges.product(*ranges_list).each {|range_list|
+ min = range_list.map {|x, y| x }.join
+ max = range_list.map {|x, y| y }.join
+ rects << [min, max, action]
+ }
+ }
+ else
+ raise ArgumentError, "invalid pattern: #{pat.inspect}"
+ end
+ }
+ rects
+ end
+
+ def self.unambiguous_action(actions0)
+ actions = actions0.uniq
+ if actions.length == 1
+ actions[0]
+ else
+ actions.delete(:nomap0)
+ if actions.length == 1
+ actions[0]
+ else
+ raise ArgumentError, "ambiguous actions: #{actions0.inspect}"
+ end
+ end
+ end
+
+ def self.build_tree(rects)
+ expand(rects) {|prefix, actions|
+ unambiguous_action(actions)
+ }
+ end
+
+ def self.parse(mapping)
+ rects = parse_to_rects(mapping)
+ tree = build_tree(rects)
+ self.new(tree)
+ end
+
+ def self.merge_rects(*rects_list)
+ if rects_list.length < 2
+ raise ArgumentError, "not enough arguments"
+ end
+
+ all_rects = []
+ rects_list.each_with_index {|rects, i|
+ all_rects.concat rects.map {|min, max, action| [min, max, [i, action]] }
+ }
+
+ tree = expand(all_rects) {|prefix, actions|
+ args = Array.new(rects_list.length) { [] }
+ actions.each {|i, action|
+ args[i] << action
+ }
+ yield(prefix, *args)
+ }
+
+ self.new(tree)
+ end
+
+ def self.merge(*mappings, &block)
+ merge_rects(*mappings.map {|m| parse_to_rects(m) }, &block)
+ end
+
+ def self.merge2(map1, map2, &block)
+ rects1 = parse_to_rects(map1)
+ rects2 = parse_to_rects(map2)
+
+ actions = []
+ all_rects = []
+
+ rects1.each {|rect|
+ min, max, action = rect
+ rect[2] = actions.length
+ actions << action
+ all_rects << rect
+ }
+
+ boundary = actions.length
+
+ rects2.each {|rect|
+ min, max, action = rect
+ rect[2] = actions.length
+ actions << action
+ all_rects << rect
+ }
+
+ tree = expand(all_rects) {|prefix, as0|
+ as1 = []
+ as2 = []
+ as0.each {|i|
+ if i < boundary
+ as1 << actions[i]
+ else
+ as2 << actions[i]
+ end
+ }
+ yield(prefix, as1, as2)
+ }
+
+ self.new(tree)
+ end
+
+ def self.expand(rects, &block)
+ #numsing = numreg = 0
+ #rects.each {|min, max, action| if min == max then numsing += 1 else numreg += 1 end }
+ #puts "#{numsing} singleton mappings and #{numreg} region mappings."
+ singleton_rects = []
+ region_rects = []
+ rects.each {|rect|
+ min, max, action = rect
+ if min == max
+ singleton_rects << rect
+ else
+ region_rects << rect
+ end
+ }
+ @singleton_rects = singleton_rects.sort_by {|min, max, action| min }
+ @singleton_rects.reverse!
+ ret = expand_rec("", region_rects, &block)
+ @singleton_rects = nil
+ ret
+ end
+
+ TMPHASH = {}
+ def self.expand_rec(prefix, region_rects, &block)
+ return region_rects if region_rects.empty? && !((s_rect = @singleton_rects.last) && s_rect[0].start_with?(prefix))
+ if region_rects.empty? ? s_rect[0].length == prefix.length : region_rects[0][0].empty?
+ h = TMPHASH
+ while (s_rect = @singleton_rects.last) && s_rect[0].start_with?(prefix)
+ min, max, action = @singleton_rects.pop
+ raise ArgumentError, "ambiguous pattern: #{prefix}" if min.length != prefix.length
+ h[action] = true
+ end
+ region_rects.each {|min, max, action|
+ raise ArgumentError, "ambiguous pattern: #{prefix}" if !min.empty?
+ h[action] = true
+ }
+ tree = Action.new(block.call(prefix, h.keys))
+ h.clear
+ else
+ tree = []
+ each_firstbyte_range(prefix, region_rects) {|byte_min, byte_max, r_rects2|
+ if byte_min == byte_max
+ prefix2 = prefix + "%02X" % byte_min
+ else
+ prefix2 = prefix + "{%02X-%02X}" % [byte_min, byte_max]
+ end
+ child_tree = expand_rec(prefix2, r_rects2, &block)
+ tree << Branch.new(byte_min, byte_max, child_tree)
+ }
+ end
+ return tree
+ end
+
+ def self.each_firstbyte_range(prefix, region_rects)
+ index_from = TMPHASH
+
+ region_ary = []
+ region_rects.each {|min, max, action|
+ raise ArgumentError, "ambiguous pattern: #{prefix}" if min.empty?
+ min_firstbyte = min[0,2].to_i(16)
+ min_rest = min[2..-1]
+ max_firstbyte = max[0,2].to_i(16)
+ max_rest = max[2..-1]
+ region_ary << [min_firstbyte, max_firstbyte, [min_rest, max_rest, action]]
+ index_from[min_firstbyte] = true
+ index_from[max_firstbyte+1] = true
+ }
+
+ byte_from = Array.new(index_from.size)
+ bytes = index_from.keys
+ bytes.sort!
+ bytes.reverse!
+ bytes.each_with_index {|byte, i|
+ index_from[byte] = i
+ byte_from[i] = byte
+ }
+
+ region_rects_ary = Array.new(index_from.size) { [] }
+ region_ary.each {|min_firstbyte, max_firstbyte, rest_elt|
+ index_from[min_firstbyte].downto(index_from[max_firstbyte+1]+1) {|i|
+ region_rects_ary[i] << rest_elt
+ }
+ }
+
+ index_from.clear
+
+ r_rects = region_rects_ary.pop
+ region_byte = byte_from.pop
+ prev_r_start = region_byte
+ prev_r_rects = []
+ while r_rects && (s_rect = @singleton_rects.last) && (seq = s_rect[0]).start_with?(prefix)
+ singleton_byte = seq[prefix.length, 2].to_i(16)
+ min_byte = singleton_byte < region_byte ? singleton_byte : region_byte
+ if prev_r_start < min_byte && !prev_r_rects.empty?
+ yield prev_r_start, min_byte-1, prev_r_rects
+ end
+ if region_byte < singleton_byte
+ prev_r_start = region_byte
+ prev_r_rects = r_rects
+ r_rects = region_rects_ary.pop
+ region_byte = byte_from.pop
+ elsif region_byte > singleton_byte
+ yield singleton_byte, singleton_byte, prev_r_rects
+ prev_r_start = singleton_byte+1
+ else # region_byte == singleton_byte
+ prev_r_start = region_byte+1
+ prev_r_rects = r_rects
+ r_rects = region_rects_ary.pop
+ region_byte = byte_from.pop
+ yield singleton_byte, singleton_byte, prev_r_rects
+ end
+ end
+
+ while r_rects
+ if prev_r_start < region_byte && !prev_r_rects.empty?
+ yield prev_r_start, region_byte-1, prev_r_rects
+ end
+ prev_r_start = region_byte
+ prev_r_rects = r_rects
+ r_rects = region_rects_ary.pop
+ region_byte = byte_from.pop
+ end
+
+ while (s_rect = @singleton_rects.last) && (seq = s_rect[0]).start_with?(prefix)
+ singleton_byte = seq[prefix.length, 2].to_i(16)
+ yield singleton_byte, singleton_byte, []
+ end
+ end
+
+ def initialize(tree)
+ @tree = tree
+ end
+
+ def inspect
+ "\#<#{self.class}:" +
+ @tree.inspect +
+ ">"
+ end
+
+ def max_input_length_rec(tree)
+ case tree
+ when Action
+ 0
+ else
+ tree.map {|branch|
+ max_input_length_rec(branch.child_tree)
+ }.max + 1
+ end
+ end
+
+ def max_input_length
+ max_input_length_rec(@tree)
+ end
+
+ def empty_action
+ if @tree.kind_of? Action
+ @tree.value
+ else
+ nil
+ end
+ end
+
+ OffsetsMemo = {}
+ InfosMemo = {}
+
+ def format_offsets(min, max, offsets)
+ offsets = offsets[min..max]
+ code = "%d, %d,\n" % [min, max]
+ 0.step(offsets.length-1,16) {|i|
+ code << " "
+ code << offsets[i,8].map {|off| "%3d," % off.to_s }.join('')
+ if i+8 < offsets.length
+ code << " "
+ code << offsets[i+8,8].map {|off| "%3d," % off.to_s }.join('')
+ end
+ code << "\n"
+ }
+ code
+ end
+
+ UsedName = {}
+
+ StrMemo = {}
+
+ def str_name(bytes)
+ size = @bytes_code.length
+ rawbytes = [bytes].pack("H*")
+
+ n = nil
+ if !n && !(suf = rawbytes.gsub(/[^A-Za-z0-9_]/, '')).empty? && !UsedName[nn = "str1_" + suf] then n = nn end
+ if !n && !UsedName[nn = "str1_" + bytes] then n = nn end
+ n ||= "str1s_#{size}"
+
+ StrMemo[bytes] = n
+ UsedName[n] = true
+ n
+ end
+
+ def gen_str(bytes)
+ if n = StrMemo[bytes]
+ n
+ else
+ len = bytes.length/2
+ size = @bytes_code.length
+ n = str_name(bytes)
+ @bytes_code.insert_at_last(1 + len,
+ "\#define #{n} makeSTR1(#{size})\n" +
+ " makeSTR1LEN(#{len})," + bytes.gsub(/../, ' 0x\&,') + "\n\n")
+ n
+ end
+ end
+
+ def generate_info(info)
+ case info
+ when :nomap, :nomap0
+ # :nomap0 is low priority. it never collides.
+ "NOMAP"
+ when :undef
+ "UNDEF"
+ when :invalid
+ "INVALID"
+ when :func_ii
+ "FUNii"
+ when :func_si
+ "FUNsi"
+ when :func_io
+ "FUNio"
+ when :func_so
+ "FUNso"
+ when /\A(#{HEX2})\z/o
+ "o1(0x#$1)"
+ when /\A(#{HEX2})(#{HEX2})\z/o
+ "o2(0x#$1,0x#$2)"
+ when /\A(#{HEX2})(#{HEX2})(#{HEX2})\z/o
+ "o3(0x#$1,0x#$2,0x#$3)"
+ when /funsio\((\d+)\)/
+ "funsio(#{$1})"
+ when /\A(#{HEX2})(3[0-9])(#{HEX2})(3[0-9])\z/o
+ "g4(0x#$1,0x#$2,0x#$3,0x#$4)"
+ when /\A(f[0-7])(#{HEX2})(#{HEX2})(#{HEX2})\z/o
+ "o4(0x#$1,0x#$2,0x#$3,0x#$4)"
+ when /\A(#{HEX2}){4,259}\z/o
+ gen_str(info.upcase)
+ when /\A\/\*BYTE_LOOKUP\*\// # pointer to BYTE_LOOKUP structure
+ $'.to_s
+ else
+ raise "unexpected action: #{info.inspect}"
+ end
+ end
+
+ def format_infos(infos)
+ infos = infos.map {|info| generate_info(info) }
+ maxlen = infos.map {|info| info.length }.max
+ columns = maxlen <= 16 ? 4 : 2
+ code = ""
+ 0.step(infos.length-1, columns) {|i|
+ code << " "
+ is = infos[i,columns]
+ is.each {|info|
+ code << sprintf(" %#{maxlen}s,", info)
+ }
+ code << "\n"
+ }
+ code
+ end
+
+ def generate_lookup_node(name, table)
+ bytes_code = @bytes_code
+ words_code = @words_code
+ offsets = []
+ infos = []
+ infomap = {}
+ min = max = nil
+ table.each_with_index {|action, byte|
+ action ||= :invalid
+ if action != :invalid
+ min = byte if !min
+ max = byte
+ end
+ unless o = infomap[action]
+ infomap[action] = o = infos.length
+ infos[o] = action
+ end
+ offsets[byte] = o
+ }
+ infomap.clear
+ if !min
+ min = max = 0
+ end
+
+ offsets_key = [min, max, offsets[min..max]]
+ if n = OffsetsMemo[offsets_key]
+ offsets_name = n
+ else
+ offsets_name = "#{name}_offsets"
+ OffsetsMemo[offsets_key] = offsets_name
+ size = bytes_code.length
+ bytes_code.insert_at_last(2+max-min+1,
+ "\#define #{offsets_name} #{size}\n" +
+ format_offsets(min,max,offsets) + "\n")
+ end
+
+ if n = InfosMemo[infos]
+ infos_name = n
+ else
+ infos_name = "#{name}_infos"
+ InfosMemo[infos] = infos_name
+
+ size = words_code.length
+ words_code.insert_at_last(infos.length,
+ "\#define #{infos_name} WORDINDEX2INFO(#{size})\n" +
+ format_infos(infos) + "\n")
+ end
+
+ size = words_code.length
+ words_code.insert_at_last(NUM_ELEM_BYTELOOKUP,
+ "\#define #{name} WORDINDEX2INFO(#{size})\n" +
+ <<"End" + "\n")
+ #{offsets_name},
+ #{infos_name},
+End
+ end
+
+ PreMemo = {}
+ NextName = "a"
+
+ def generate_node(name_hint=nil)
+ if n = PreMemo[@tree]
+ return n
+ end
+
+ table = Array.new(0x100, :invalid)
+ @tree.each {|branch|
+ byte_min, byte_max, child_tree = branch.byte_min, branch.byte_max, branch.child_tree
+ rest = ActionMap.new(child_tree)
+ if a = rest.empty_action
+ table.fill(a, byte_min..byte_max)
+ else
+ name_hint2 = nil
+ if name_hint
+ name_hint2 = "#{name_hint}_#{byte_min == byte_max ? '%02X' % byte_min : '%02Xto%02X' % [byte_min, byte_max]}"
+ end
+ v = "/*BYTE_LOOKUP*/" + rest.gennode(@bytes_code, @words_code, name_hint2)
+ table.fill(v, byte_min..byte_max)
+ end
+ }
+
+ if !name_hint
+ name_hint = "fun_" + NextName
+ NextName.succ!
+ end
+
+ PreMemo[@tree] = name_hint
+
+ generate_lookup_node(name_hint, table)
+ name_hint
+ end
+
+ def gennode(bytes_code, words_code, name_hint=nil)
+ @bytes_code = bytes_code
+ @words_code = words_code
+ name = generate_node(name_hint)
+ @bytes_code = nil
+ @words_code = nil
+ return name
+ end
+end
+
+def citrus_mskanji_cstomb(csid, index)
+ case csid
+ when 0
+ index
+ when 1
+ index + 0x80
+ when 2, 3
+ row = index >> 8
+ raise "invalid byte sequence" if row < 0x21
+ if csid == 3
+ if row <= 0x2F
+ offset = (row == 0x22 || row >= 0x26) ? 0xED : 0xF0
+ elsif row >= 0x4D && row <= 0x7E
+ offset = 0xCE
+ else
+ raise "invalid byte sequence"
+ end
+ else
+ raise "invalid byte sequence" if row > 0x97
+ offset = (row < 0x5F) ? 0x81 : 0xC1
+ end
+ col = index & 0xFF
+ raise "invalid byte sequence" if (col < 0x21 || col > 0x7E)
+
+ row -= 0x21
+ col -= 0x21
+ if (row & 1) == 0
+ col += 0x40
+ col += 1 if (col >= 0x7F)
+ else
+ col += 0x9F;
+ end
+ row = row / 2 + offset
+ (row << 8) | col
+ end.to_s(16)
+end
+
+def citrus_euc_cstomb(csid, index)
+ case csid
+ when 0x0000
+ index
+ when 0x8080
+ index | 0x8080
+ when 0x0080
+ index | 0x8E80
+ when 0x8000
+ index | 0x8F8080
+ end.to_s(16)
+end
+
+def citrus_stateless_iso_cstomb(csid, index)
+ (index | 0x8080 | (csid << 16)).to_s(16)
+end
+
+def citrus_cstomb(ces, csid, index)
+ case ces
+ when 'mskanji'
+ citrus_mskanji_cstomb(csid, index)
+ when 'euc'
+ citrus_euc_cstomb(csid, index)
+ when 'stateless_iso'
+ citrus_stateless_iso_cstomb(csid, index)
+ end
+end
+
+SUBDIR = %w/APPLE AST BIG5 CNS CP EBCDIC EMOJI GB GEORGIAN ISO646 ISO-8859 JIS KAZAKH KOI KS MISC TCVN/
+
+
+def citrus_decode_mapsrc(ces, csid, mapsrcs)
+ table = []
+ mapsrcs.split(',').each do |mapsrc|
+ path = [$srcdir]
+ mode = nil
+ if mapsrc.rindex(/UCS(?:@[A-Z]+)?/, 0)
+ mode = :from_ucs
+ from = mapsrc[$&.size+1..-1]
+ path << SUBDIR.find{|x| from.rindex(x, 0) }
+ else
+ mode = :to_ucs
+ path << SUBDIR.find{|x| mapsrc.rindex(x, 0) }
+ end
+ if /\bUCS@(BMP|SMP|SIP|TIP|SSP)\b/ =~ mapsrc
+ plane = {"BMP"=>0, "SMP"=>1, "SIP"=>2, "TIP"=>3, "SSP"=>14}[$1]
+ else
+ plane = 0
+ end
+ plane <<= 16
+ path << mapsrc.gsub(':', '@')
+ path = File.join(*path)
+ path << ".src"
+ path[path.rindex('/')] = '%'
+ STDERR.puts 'load mapsrc %s' % path if VERBOSE_MODE
+ open(path) do |f|
+ f.each_line do |l|
+ break if /^BEGIN_MAP/ =~ l
+ end
+ f.each_line do |l|
+ next if /^\s*(?:#|$)/ =~ l
+ break if /^END_MAP/ =~ l
+ case mode
+ when :from_ucs
+ case l
+ when /0x(\w+)\s*-\s*0x(\w+)\s*=\s*INVALID/
+ # Citrus OOB_MODE
+ when /(0x\w+)\s*=\s*(0x\w+)/
+ table.push << [plane | $1.hex, citrus_cstomb(ces, csid, $2.hex)]
+ else
+ raise "unknown notation '%s'"% l
+ end
+ when :to_ucs
+ case l
+ when /(0x\w+)\s*=\s*(0x\w+)/
+ table.push << [citrus_cstomb(ces, csid, $1.hex), plane | $2.hex]
+ else
+ raise "unknown notation '%s'"% l
+ end
+ end
+ end
+ end
+ end
+ return table
+end
+
+def import_ucm(path)
+ to_ucs = []
+ from_ucs = []
+ File.foreach(File.join($srcdir, "ucm", path)) do |line|
+ uc, bs, fb = nil
+ if /^<U([0-9a-fA-F]+)>\s*([\+0-9a-fA-Fx\\]+)\s*\|(\d)/ =~ line
+ uc = $1.hex
+ bs = $2.delete('x\\')
+ fb = $3.to_i
+ next if uc < 128 && uc == bs.hex
+ elsif /^([<U0-9a-fA-F>+]+)\s*([\+0-9a-fA-Fx\\]+)\s*\|(\d)/ =~ line
+ uc = $1.scan(/[0-9a-fA-F]+>/).map(&:hex).pack("U*").unpack("H*")[0]
+ bs = $2.delete('x\\')
+ fb = $3.to_i
+ end
+ to_ucs << [bs, uc] if fb == 0 || fb == 3
+ from_ucs << [uc, bs] if fb == 0 || fb == 1
+ end
+ [to_ucs, from_ucs]
+end
+
+def encode_utf8(map)
+ r = []
+ map.each {|k, v|
+ # integer means UTF-8 encoded sequence.
+ k = [k].pack("U").unpack("H*")[0].upcase if Integer === k
+ v = [v].pack("U").unpack("H*")[0].upcase if Integer === v
+ r << [k,v]
+ }
+ r
+end
+
+UnspecifiedValidEncoding = Object.new
+
+def transcode_compile_tree(name, from, map, valid_encoding)
+ map = encode_utf8(map)
+ h = {}
+ map.each {|k, v|
+ h[k] = v unless h[k] # use first mapping
+ }
+ if valid_encoding.equal? UnspecifiedValidEncoding
+ valid_encoding = ValidEncoding.fetch(from)
+ end
+ if valid_encoding
+ am = ActionMap.merge2(h, {valid_encoding => :undef}) {|prefix, as1, as2|
+ a1 = as1.empty? ? nil : ActionMap.unambiguous_action(as1)
+ a2 = as2.empty? ? nil : ActionMap.unambiguous_action(as2)
+ if !a2
+ raise "invalid mapping: #{prefix}"
+ end
+ a1 || a2
+ }
+ else
+ am = ActionMap.parse(h)
+ end
+ h.clear
+
+ max_input = am.max_input_length
+ defined_name = am.gennode(TRANSCODE_GENERATED_BYTES_CODE, TRANSCODE_GENERATED_WORDS_CODE, name)
+ return defined_name, max_input
+end
+
+TRANSCODERS = []
+TRANSCODE_GENERATED_TRANSCODER_CODE = ''
+
+def transcode_tbl_only(from, to, map, valid_encoding=UnspecifiedValidEncoding)
+ if VERBOSE_MODE
+ if from.empty? || to.empty?
+ STDERR.puts "converter for #{from.empty? ? to : from}"
+ else
+ STDERR.puts "converter from #{from} to #{to}"
+ end
+ end
+ id_from = from.tr('^0-9A-Za-z', '_')
+ id_to = to.tr('^0-9A-Za-z', '_')
+ if from == "UTF-8"
+ tree_name = "to_#{id_to}"
+ elsif to == "UTF-8"
+ tree_name = "from_#{id_from}"
+ else
+ tree_name = "from_#{id_from}_to_#{id_to}"
+ end
+ real_tree_name, max_input = transcode_compile_tree(tree_name, from, map, valid_encoding)
+ return map, tree_name, real_tree_name, max_input
+end
+
+def transcode_tblgen(from, to, map, valid_encoding=UnspecifiedValidEncoding)
+ map, tree_name, real_tree_name, max_input = transcode_tbl_only(from, to, map, valid_encoding)
+ transcoder_name = "rb_#{tree_name}"
+ TRANSCODERS << transcoder_name
+ input_unit_length = UnitLength[from]
+ max_output = map.map {|k,v| String === v ? v.length/2 : 1 }.max
+ transcoder_code = <<"End"
+static const rb_transcoder
+#{transcoder_name} = {
+ #{c_esc from}, #{c_esc to}, #{real_tree_name},
+ TRANSCODE_TABLE_INFO,
+ #{input_unit_length}, /* input_unit_length */
+ #{max_input}, /* max_input */
+ #{max_output}, /* max_output */
+ asciicompat_converter, /* asciicompat_type */
+ 0, NULL, NULL, /* state_size, state_init, state_fini */
+ NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL
+};
+End
+ TRANSCODE_GENERATED_TRANSCODER_CODE << transcoder_code
+ ''
+end
+
+def transcode_generate_node(am, name_hint=nil)
+ STDERR.puts "converter for #{name_hint}" if VERBOSE_MODE
+ name = am.gennode(TRANSCODE_GENERATED_BYTES_CODE, TRANSCODE_GENERATED_WORDS_CODE, name_hint)
+ ''
+end
+
+def transcode_generated_code
+ TRANSCODE_GENERATED_BYTES_CODE.to_s +
+ TRANSCODE_GENERATED_WORDS_CODE.to_s +
+ "\#define TRANSCODE_TABLE_INFO " +
+ "#{OUTPUT_PREFIX}byte_array, #{TRANSCODE_GENERATED_BYTES_CODE.length}, " +
+ "#{OUTPUT_PREFIX}word_array, #{TRANSCODE_GENERATED_WORDS_CODE.length}, " +
+ "((int)sizeof(unsigned int))\n" +
+ TRANSCODE_GENERATED_TRANSCODER_CODE
+end
+
+def transcode_register_code
+ code = ''
+ TRANSCODERS.each {|transcoder_name|
+ code << " rb_register_transcoder(&#{transcoder_name});\n"
+ }
+ code
+end
+
+UnitLength = {
+ 'UTF-16BE' => 2,
+ 'UTF-16LE' => 2,
+ 'UTF-32BE' => 4,
+ 'UTF-32LE' => 4,
+}
+UnitLength.default = 1
+
+ValidEncoding = {
+ '1byte' => '{00-ff}',
+ '2byte' => '{00-ff}{00-ff}',
+ '4byte' => '{00-ff}{00-ff}{00-ff}{00-ff}',
+ 'US-ASCII' => '{00-7f}',
+ 'UTF-8' => '{00-7f}
+ {c2-df}{80-bf}
+ e0{a0-bf}{80-bf}
+ {e1-ec}{80-bf}{80-bf}
+ ed{80-9f}{80-bf}
+ {ee-ef}{80-bf}{80-bf}
+ f0{90-bf}{80-bf}{80-bf}
+ {f1-f3}{80-bf}{80-bf}{80-bf}
+ f4{80-8f}{80-bf}{80-bf}',
+ 'UTF-16BE' => '{00-d7,e0-ff}{00-ff}
+ {d8-db}{00-ff}{dc-df}{00-ff}',
+ 'UTF-16LE' => '{00-ff}{00-d7,e0-ff}
+ {00-ff}{d8-db}{00-ff}{dc-df}',
+ 'UTF-32BE' => '0000{00-d7,e0-ff}{00-ff}
+ 00{01-10}{00-ff}{00-ff}',
+ 'UTF-32LE' => '{00-ff}{00-d7,e0-ff}0000
+ {00-ff}{00-ff}{01-10}00',
+ 'EUC-JP' => '{00-7f}
+ {a1-fe}{a1-fe}
+ 8e{a1-fe}
+ 8f{a1-fe}{a1-fe}',
+ 'CP51932' => '{00-7f}
+ {a1-fe}{a1-fe}
+ 8e{a1-fe}',
+ 'EUC-JIS-2004' => '{00-7f}
+ {a1-fe}{a1-fe}
+ 8e{a1-fe}
+ 8f{a1-fe}{a1-fe}',
+ 'Shift_JIS' => '{00-7f}
+ {81-9f,e0-fc}{40-7e,80-fc}
+ {a1-df}',
+ 'EUC-KR' => '{00-7f}
+ {a1-fe}{a1-fe}',
+ 'CP949' => '{00-7f}
+ {81-fe}{41-5a,61-7a,81-fe}',
+ 'Big5' => '{00-7f}
+ {81-fe}{40-7e,a1-fe}',
+ 'EUC-TW' => '{00-7f}
+ {a1-fe}{a1-fe}
+ 8e{a1-b0}{a1-fe}{a1-fe}',
+ 'GBK' => '{00-80}
+ {81-fe}{40-7e,80-fe}',
+ 'GB18030' => '{00-7f}
+ {81-fe}{40-7e,80-fe}
+ {81-fe}{30-39}{81-fe}{30-39}',
+}
+
+def ValidEncoding(enc)
+ ValidEncoding.fetch(enc)
+end
+
+def set_valid_byte_pattern(encoding, pattern_or_label)
+ pattern =
+ if ValidEncoding[pattern_or_label]
+ ValidEncoding[pattern_or_label]
+ else
+ pattern_or_label
+ end
+ if ValidEncoding[encoding] and ValidEncoding[encoding]!=pattern
+ raise ArgumentError, "trying to change valid byte pattern for encoding #{encoding} from #{ValidEncoding[encoding]} to #{pattern}"
+ end
+ ValidEncoding[encoding] = pattern
+end
+
+# the following may be used in different places, so keep them here for the moment
+set_valid_byte_pattern 'ASCII-8BIT', '1byte'
+set_valid_byte_pattern 'Windows-31J', 'Shift_JIS'
+set_valid_byte_pattern 'eucJP-ms', 'EUC-JP'
+
+def make_signature(filename, src)
+ "src=#{filename.dump}, len=#{src.length}, checksum=#{src.sum}"
+end
+
+if __FILE__ == $0
+ start_time = Time.now
+
+ output_filename = nil
+ verbose_mode = false
+ force_mode = false
+
+ op = OptionParser.new
+ op.def_option("--help", "show help message") { puts op; exit 0 }
+ op.def_option("--verbose", "verbose mode") { verbose_mode = true }
+ op.def_option("--force", "force table generation") { force_mode = true }
+ op.def_option("--output=FILE", "specify output file") {|arg| output_filename = arg }
+ op.parse!
+
+ VERBOSE_MODE = verbose_mode
+
+ OUTPUT_FILENAME = output_filename
+ OUTPUT_PREFIX = output_filename ? File.basename(output_filename)[/\A[A-Za-z0-9_]*/] : ""
+ OUTPUT_PREFIX.sub!(/\A_+/, '')
+ OUTPUT_PREFIX.sub!(/_*\z/, '_')
+
+ TRANSCODE_GENERATED_BYTES_CODE = ArrayCode.new("unsigned char", "#{OUTPUT_PREFIX}byte_array")
+ TRANSCODE_GENERATED_WORDS_CODE = ArrayCode.new("unsigned int", "#{OUTPUT_PREFIX}word_array")
+
+ arg = ARGV.shift
+ $srcdir = File.dirname(arg)
+ $:.unshift $srcdir unless $:.include? $srcdir
+ src = File.read(arg)
+ src.force_encoding("ascii-8bit") if src.respond_to? :force_encoding
+ this_script = File.read(__FILE__)
+ this_script.force_encoding("ascii-8bit") if this_script.respond_to? :force_encoding
+
+ base_signature = "/* autogenerated. */\n"
+ base_signature << "/* #{make_signature(File.basename(__FILE__), this_script)} */\n"
+ base_signature << "/* #{make_signature(File.basename(arg), src)} */\n"
+
+ if !force_mode && output_filename && File.readable?(output_filename)
+ old_signature = File.open(output_filename) {|f| f.gets("").chomp }
+ chk_signature = base_signature.dup
+ old_signature.each_line {|line|
+ if %r{/\* src="([0-9a-z_.-]+)",} =~ line
+ name = $1
+ next if name == File.basename(arg) || name == File.basename(__FILE__)
+ path = File.join($srcdir, name)
+ if File.readable? path
+ chk_signature << "/* #{make_signature(name, File.read(path))} */\n"
+ end
+ end
+ }
+ if old_signature == chk_signature
+ now = Time.now
+ File.utime(now, now, output_filename)
+ STDERR.puts "already up-to-date: #{output_filename}" if VERBOSE_MODE
+ exit
+ end
+ end
+
+ if VERBOSE_MODE
+ if output_filename
+ STDERR.puts "generating #{output_filename} ..."
+ end
+ end
+
+ libs1 = $".dup
+ erb = ERB.new(src, nil, '%')
+ erb.filename = arg
+ erb_result = erb.result(binding)
+ libs2 = $".dup
+
+ libs = libs2 - libs1
+ lib_sigs = ''
+ libs.each {|lib|
+ lib = File.basename(lib)
+ path = File.join($srcdir, lib)
+ if File.readable? path
+ lib_sigs << "/* #{make_signature(lib, File.read(path))} */\n"
+ end
+ }
+
+ result = ''
+ result << base_signature
+ result << lib_sigs
+ result << "\n"
+ result << erb_result
+ result << "\n"
+
+ if output_filename
+ new_filename = output_filename + ".new"
+ FileUtils.mkdir_p(File.dirname(output_filename))
+ File.open(new_filename, "wb") {|f| f << result }
+ File.rename(new_filename, output_filename)
+ tms = Process.times
+ elapsed = Time.now - start_time
+ STDERR.puts "done. (#{'%.2f' % tms.utime}user #{'%.2f' % tms.stime}system #{'%.2f' % elapsed}elapsed)" if VERBOSE_MODE
+ else
+ print result
+ end
+end