From fcbf63e62c627deae76c1b8cb8c0876c536ed811 Mon Sep 17 00:00:00 2001
From: Jari Vetoniemi <jari.vetoniemi@indooratlas.com>
Date: Mon, 16 Mar 2020 18:49:26 +0900
Subject: Fresh start

---
 jni/ruby/tool/enc-unicode.rb | 371 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 371 insertions(+)
 create mode 100755 jni/ruby/tool/enc-unicode.rb

(limited to 'jni/ruby/tool/enc-unicode.rb')

diff --git a/jni/ruby/tool/enc-unicode.rb b/jni/ruby/tool/enc-unicode.rb
new file mode 100755
index 0000000..38140ab
--- /dev/null
+++ b/jni/ruby/tool/enc-unicode.rb
@@ -0,0 +1,371 @@
+#!/usr/bin/env ruby
+
+# Creates the data structures needed by Onigurma to map Unicode codepoints to
+# property names and POSIX character classes
+#
+# To use this, get UnicodeData.txt, Scripts.txt, PropList.txt,
+# PropertyAliases.txt, PropertyValueAliases.txt, DerivedCoreProperties.txt,
+# DerivedAge.txt and Blocks.txt  from unicode.org.
+# (http://unicode.org/Public/UNIDATA/) And run following command.
+# ruby1.9 tool/enc-unicode.rb data_dir > enc/unicode/name2ctype.kwd
+# You can get source file for gperf.  After this, simply make ruby.
+
+unless ARGV.size == 1
+  $stderr.puts "Usage: #{$0} data_directory"
+  exit(1)
+end
+
+POSIX_NAMES = %w[NEWLINE Alpha Blank Cntrl Digit Graph Lower Print Punct Space Upper XDigit Word Alnum ASCII]
+
+def pair_codepoints(codepoints)
+
+  # We have a sorted Array of codepoints that we wish to partition into
+  # ranges such that the start- and endpoints form an inclusive set of
+  # codepoints with property _property_. Note: It is intended that some ranges
+  # will begin with the value with  which they end, e.g. 0x0020 -> 0x0020
+
+  codepoints.sort!
+  last_cp = codepoints.first
+  pairs = [[last_cp, nil]]
+  codepoints[1..-1].each do |codepoint|
+    next if last_cp == codepoint
+
+    # If the current codepoint does not follow directly on from the last
+    # codepoint, the last codepoint represents the end of the current range,
+    # and the current codepoint represents the start of the next range.
+    if last_cp.next != codepoint
+      pairs[-1][-1] = last_cp
+      pairs << [codepoint, nil]
+    end
+    last_cp = codepoint
+  end
+
+  # The final pair has as its endpoint the last codepoint for this property
+  pairs[-1][-1] = codepoints.last
+  pairs
+end
+
+def parse_unicode_data(file)
+  last_cp = 0
+  data = {'Any' => (0x0000..0x10ffff).to_a, 'Assigned' => [],
+    'ASCII' => (0..0x007F).to_a, 'NEWLINE' => [0x0a], 'Cn' => []}
+  beg_cp = nil
+  IO.foreach(file) do |line|
+    fields = line.split(';')
+    cp = fields[0].to_i(16)
+
+    case fields[1]
+    when /\A<(.*),\s*First>\z/
+      beg_cp = cp
+      next
+    when /\A<(.*),\s*Last>\z/
+      cps = (beg_cp..cp).to_a
+    else
+      beg_cp = cp
+      cps = [cp]
+    end
+
+    # The Cn category represents unassigned characters. These are not listed in
+    # UnicodeData.txt so we must derive them by looking for 'holes' in the range
+    # of listed codepoints. We increment the last codepoint seen and compare it
+    # with the current codepoint. If the current codepoint is less than
+    # last_cp.next we have found a hole, so we add the missing codepoint to the
+    # Cn category.
+    data['Cn'].concat((last_cp.next...beg_cp).to_a)
+
+    # Assigned - Defined in unicode.c; interpreted as every character in the
+    # Unicode range minus the unassigned characters
+    data['Assigned'].concat(cps)
+
+    # The third field denotes the 'General' category, e.g. Lu
+    (data[fields[2]] ||= []).concat(cps)
+
+    # The 'Major' category is the first letter of the 'General' category, e.g.
+    # 'Lu' -> 'L'
+    (data[fields[2][0,1]] ||= []).concat(cps)
+    last_cp = cp
+  end
+
+  # The last Cn codepoint should be 0x10ffff. If it's not, append the missing
+  # codepoints to Cn and C
+  cn_remainder = (last_cp.next..0x10ffff).to_a
+  data['Cn'] += cn_remainder
+  data['C'] += data['Cn']
+
+  # Special case for LC (Cased_Letter). LC = Ll + Lt + Lu
+  data['LC'] = data['Ll'] + data['Lt'] + data['Lu']
+
+  # Define General Category properties
+  gcps = data.keys.sort - POSIX_NAMES
+
+  # Returns General Category Property names and the data
+  [gcps, data]
+end
+
+def define_posix_props(data)
+  # We now derive the character classes (POSIX brackets), e.g. [[:alpha:]]
+  #
+
+  data['Alpha'] = data['Alphabetic']
+  data['Upper'] = data['Uppercase']
+  data['Lower'] = data['Lowercase']
+  data['Punct'] = data['Punctuation']
+  data['Digit'] = data['Decimal_Number']
+  data['XDigit'] = (0x0030..0x0039).to_a + (0x0041..0x0046).to_a +
+                   (0x0061..0x0066).to_a
+  data['Alnum'] = data['Alpha'] + data['Digit']
+  data['Space'] = data['White_Space']
+  data['Blank'] = data['Space_Separator'] + [0x0009]
+  data['Cntrl'] = data['Cc']
+  data['Word'] = data['Alpha'] + data['Mark'] + data['Digit'] + data['Connector_Punctuation']
+  data['Graph'] = data['Any'] - data['Space'] - data['Cntrl'] -
+    data['Surrogate'] - data['Unassigned']
+  data['Print'] = data['Graph'] + data['Space_Separator']
+end
+
+def parse_scripts(data, categories)
+  files = [
+    {:fn => 'DerivedCoreProperties.txt', :title => 'Derived Property'},
+    {:fn => 'Scripts.txt', :title => 'Script'},
+    {:fn => 'PropList.txt', :title => 'Binary Property'}
+  ]
+  current = nil
+  cps = []
+  names = {}
+  files.each do |file|
+    IO.foreach(get_file(file[:fn])) do |line|
+      if /^# Total code points: / =~ line
+        data[current] = cps
+        categories[current] = file[:title]
+        (names[file[:title]] ||= []) << current
+        cps = []
+      elsif /^([0-9a-fA-F]+)(?:..([0-9a-fA-F]+))?\s*;\s*(\w+)/ =~ line
+        current = $3
+        $2 ? cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16))
+      end
+    end
+  end
+  #  All code points not explicitly listed for Script
+  #  have the value Unknown (Zzzz).
+  data['Unknown'] =  (0..0x10ffff).to_a - data.values_at(*names['Script']).flatten
+  categories['Unknown'] = 'Script'
+  names.values.flatten << 'Unknown'
+end
+
+def parse_aliases(data)
+  kv = {}
+  IO.foreach(get_file('PropertyAliases.txt')) do |line|
+    next unless /^(\w+)\s*; (\w+)/ =~ line
+    data[$1] = data[$2]
+    kv[normalize_propname($1)] = normalize_propname($2)
+  end
+  IO.foreach(get_file('PropertyValueAliases.txt')) do |line|
+    next unless /^(sc|gc)\s*; (\w+)\s*; (\w+)(?:\s*; (\w+))?/ =~ line
+    if $1 == 'gc'
+      data[$3] = data[$2]
+      data[$4] = data[$2]
+      kv[normalize_propname($3)] = normalize_propname($2)
+      kv[normalize_propname($4)] = normalize_propname($2) if $4
+    else
+      data[$2] = data[$3]
+      data[$4] = data[$3]
+      kv[normalize_propname($2)] = normalize_propname($3)
+      kv[normalize_propname($4)] = normalize_propname($3) if $4
+    end
+  end
+  kv
+end
+
+# According to Unicode6.0.0/ch03.pdf, Section 3.1, "An update version
+# never involves any additions to the character repertoire." Versions
+# in DerivedAge.txt should always be /\d+\.\d+/
+def parse_age(data)
+  current = nil
+  last_constname = nil
+  cps = []
+  ages = []
+  IO.foreach(get_file('DerivedAge.txt')) do |line|
+    if /^# Total code points: / =~ line
+      constname = constantize_agename(current)
+      # each version matches all previous versions
+      cps.concat(data[last_constname]) if last_constname
+      data[constname] = cps
+      make_const(constname, cps, "Derived Age #{current}")
+      ages << current
+      last_constname = constname
+      cps = []
+    elsif /^([0-9a-fA-F]+)(?:..([0-9a-fA-F]+))?\s*;\s*(\d+\.\d+)/ =~ line
+      current = $3
+      $2 ? cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16))
+    end
+  end
+  ages
+end
+
+def parse_block(data)
+  current = nil
+  last_constname = nil
+  cps = []
+  blocks = []
+  IO.foreach(get_file('Blocks.txt')) do |line|
+    if /^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+);\s*(.*)/ =~ line
+      cps = ($1.to_i(16)..$2.to_i(16)).to_a
+      constname = constantize_blockname($3)
+      data[constname] = cps
+      make_const(constname, cps, "Block")
+      blocks << constname
+    end
+  end
+
+  # All code points not belonging to any of the named blocks
+  # have the value No_Block.
+  no_block = (0..0x10ffff).to_a - data.values_at(*blocks).flatten
+  constname = constantize_blockname("No_Block")
+  make_const(constname, no_block, "Block")
+  blocks << constname
+end
+
+# shim for Ruby 1.8
+unless {}.respond_to?(:key)
+  class Hash
+    alias key index
+  end
+end
+
+$const_cache = {}
+# make_const(property, pairs, name): Prints a 'static const' structure for a
+# given property, group of paired codepoints, and a human-friendly name for
+# the group
+def make_const(prop, data, name)
+  puts "\n/* '#{prop}': #{name} */"
+  if origprop = $const_cache.key(data)
+    puts "#define CR_#{prop} CR_#{origprop}"
+  else
+    $const_cache[prop] = data
+    pairs = pair_codepoints(data)
+    puts "static const OnigCodePoint CR_#{prop}[] = {"
+    # The first element of the constant is the number of pairs of codepoints
+    puts "\t#{pairs.size},"
+    pairs.each do |pair|
+      pair.map! { |c|  c == 0 ? '0x0000' : sprintf("%0#6x", c) }
+      puts "\t#{pair.first}, #{pair.last},"
+    end
+    puts "}; /* CR_#{prop} */"
+  end
+end
+
+def normalize_propname(name)
+  name = name.downcase
+  name.delete!('- _')
+  name
+end
+
+def constantize_agename(name)
+  "Age_#{name.sub(/\./, '_')}"
+end
+
+def constantize_blockname(name)
+  "In_#{name.gsub(/\W/, '_')}"
+end
+
+def get_file(name)
+  File.join(ARGV[0], name)
+end
+
+
+# Write Data
+puts '%{'
+puts '#define long size_t'
+props, data = parse_unicode_data(get_file('UnicodeData.txt'))
+categories = {}
+props.concat parse_scripts(data, categories)
+aliases = parse_aliases(data)
+define_posix_props(data)
+POSIX_NAMES.each do |name|
+  make_const(name, data[name], "[[:#{name}:]]")
+end
+print "\n#ifdef USE_UNICODE_PROPERTIES"
+props.each do |name|
+  category = categories[name] ||
+    case name.size
+    when 1 then 'Major Category'
+    when 2 then 'General Category'
+    else        '-'
+    end
+  make_const(name, data[name], category)
+end
+print "\n#ifdef USE_UNICODE_AGE_PROPERTIES"
+ages = parse_age(data)
+puts "#endif /* USE_UNICODE_AGE_PROPERTIES */"
+blocks = parse_block(data)
+puts '#endif /* USE_UNICODE_PROPERTIES */'
+puts(<<'__HEREDOC')
+
+static const OnigCodePoint* const CodeRanges[] = {
+__HEREDOC
+POSIX_NAMES.each{|name|puts"  CR_#{name},"}
+puts "#ifdef USE_UNICODE_PROPERTIES"
+props.each{|name| puts"  CR_#{name},"}
+puts "#ifdef USE_UNICODE_AGE_PROPERTIES"
+ages.each{|name|  puts"  CR_#{constantize_agename(name)},"}
+puts "#endif /* USE_UNICODE_AGE_PROPERTIES */"
+blocks.each{|name|puts"  CR_#{name},"}
+
+puts(<<'__HEREDOC')
+#endif /* USE_UNICODE_PROPERTIES */
+};
+struct uniname2ctype_struct {
+  int name, ctype;
+};
+
+static const struct uniname2ctype_struct *uniname2ctype_p(const char *, unsigned int);
+%}
+struct uniname2ctype_struct;
+%%
+__HEREDOC
+i = -1
+name_to_index = {}
+POSIX_NAMES.each do |name|
+  i += 1
+  next if name == 'NEWLINE'
+  name = normalize_propname(name)
+  name_to_index[name] = i
+  puts"%-40s %3d" % [name + ',', i]
+end
+puts "#ifdef USE_UNICODE_PROPERTIES"
+props.each do |name|
+  i += 1
+  name = normalize_propname(name)
+  name_to_index[name] = i
+  puts "%-40s %3d" % [name + ',', i]
+end
+aliases.each_pair do |k, v|
+  next if name_to_index[k]
+  next unless v = name_to_index[v]
+  puts "%-40s %3d" % [k + ',', v]
+end
+puts "#ifdef USE_UNICODE_AGE_PROPERTIES"
+ages.each do |name|
+  i += 1
+  name = "age=#{name}"
+  name_to_index[name] = i
+  puts "%-40s %3d" % [name + ',', i]
+end
+puts "#endif /* USE_UNICODE_AGE_PROPERTIES */"
+blocks.each do |name|
+  i += 1
+  name = normalize_propname(name)
+  name_to_index[name] = i
+  puts "%-40s %3d" % [name + ',', i]
+end
+puts(<<'__HEREDOC')
+#endif /* USE_UNICODE_PROPERTIES */
+%%
+static int
+uniname2ctype(const UChar *name, unsigned int len)
+{
+  const struct uniname2ctype_struct *p = uniname2ctype_p((const char *)name, len);
+  if (p) return p->ctype;
+  return -1;
+}
+__HEREDOC
-- 
cgit v1.2.3