From fcbf63e62c627deae76c1b8cb8c0876c536ed811 Mon Sep 17 00:00:00 2001
From: Jari Vetoniemi <jari.vetoniemi@indooratlas.com>
Date: Mon, 16 Mar 2020 18:49:26 +0900
Subject: Fresh start

---
 jni/ruby/template/unicode_norm_gen.tmpl | 225 ++++++++++++++++++++++++++++++++
 1 file changed, 225 insertions(+)
 create mode 100644 jni/ruby/template/unicode_norm_gen.tmpl

(limited to 'jni/ruby/template/unicode_norm_gen.tmpl')

diff --git a/jni/ruby/template/unicode_norm_gen.tmpl b/jni/ruby/template/unicode_norm_gen.tmpl
new file mode 100644
index 0000000..955c858
--- /dev/null
+++ b/jni/ruby/template/unicode_norm_gen.tmpl
@@ -0,0 +1,225 @@
+%# -*- mode: ruby; coding: utf-8 -*-
+<%
+# Copyright Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
+
+# Script to generate Ruby data structures used in implementing
+# String#unicode_normalize,...
+
+# Constant for the input data directory (first command-line argument, else the default below)
+InputDataDir = ARGV[0] || 'enc/unicode/data'
+unicode_version = InputDataDir[/[\d.]+\z/] # trailing run of digits and dots in the path; nil if there is none
+
+# convenience methods
+class Integer # reopened to add the escape helper that the table output below calls on code points
+  def to_UTF8() # convert to string, taking legibility into account
+    if self>0xFFFF # above U+FFFF: braces form with uppercase hex, e.g. \u{1D15E}
+      "\\u{#{to_s(16).upcase}}"
+    elsif self>0x7f # above U+007F: exactly four uppercase hex digits, zero-padded, e.g. \u00C0
+      "\\u#{to_s(16).upcase.rjust(4, '0')}"
+    else # remaining values: the character itself, with a backslash put before a backslash or double quote
+      chr.sub(/[\\\"]/, "\\\\\\\&")
+    end
+  end
+end
+
+module Enumerable
+  unless method_defined?(:each_slice) # fallback only; skipped when each_slice is already defined
+    def each_slice(n) # yields consecutive arrays of up to n elements, then returns self
+      ary = []
+      each do |i|
+        ary << i
+        if ary.size >= n
+          yield ary
+          ary = [] # start a fresh array so the one just yielded is not appended to later
+        end
+      end
+      yield ary unless ary.empty? # final, shorter group if elements are left over
+      self
+    end
+  end
+end
+
+class Array
+  def to_UTF8() collect {|c| c.to_UTF8}.join('') end # concatenation of every element's to_UTF8 result
+
+  def each_regexp_chars(n = 8) # converts an array of Integers to character ranges
+    sort.inject([]) do |ranges, value| # collect [first, last] pairs from the sorted values
+      if ranges.last and ranges.last[1]+1>=value # value repeats or directly follows the end of the current range
+        ranges.last[1] = value
+        ranges
+      else
+        ranges << [value, value] # gap (or very first value): open a new one-element range
+      end
+    end.collect do |first, last| # render each pair as character-class text
+      case last-first
+      when 0
+        first.to_UTF8
+      when 1
+        first.to_UTF8 + last.to_UTF8 # two neighbouring characters: list both, no hyphen
+      else
+        first.to_UTF8 + '-' + last.to_UTF8
+      end
+    end.each_slice(n) do |slice| # hand the rendered pieces to the caller n at a time
+      yield slice.join('')
+    end
+  end
+end
+
+# read the file 'CompositionExclusions.txt': check the version named in its first line, then collect the code points
+composition_exclusions = vpath.open("#{InputDataDir}/CompositionExclusions.txt") {|f| # NOTE(review): vpath is not defined in this file; presumably provided by whatever renders the template
+  base = Regexp.quote(File.basename(f.path, '.*'))
+  ext = Regexp.quote(File.extname(f.path))
+  version = (line = f.gets)[/^# *#{base}-([\d.]+)#{ext}\s*$/, 1] or
+    abort "No file version in #{f.path}: #{line}"
+  (unicode_version ||= version) == version or
+    abort "Unicode version of directory (#{unicode_version}) and file (#{version}) mismatch"
+  f.grep(/^[A-Z0-9]{4,5}/) {|code| code.hex} # block value: Integers parsed from the leading hex digits of each matching line
+}
+
+decomposition_table = {} # code point => untagged (canonical) decomposition, as an array of code points
+kompatible_table = {} # code point => tagged (compatibility) decomposition, with the tag word removed
+combining_class = {}  # code point => combining class taken from field 3; entries exist only for non-zero classes
+
+# read the file 'UnicodeData.txt'
+vpath.foreach("#{InputDataDir}/UnicodeData.txt") do |line|
+  codepoint, name, _2, char_class, _4, decomposition, *_rest = line.split(";") # only fields 0, 1, 3 and 5 are used
+
+  case decomposition
+  when /^[0-9A-F]/
+    decomposition_table[codepoint.hex] = decomposition.split(' ').collect {|w| w.hex}
+  when /^</
+    kompatible_table[codepoint.hex] = decomposition.split(' ')[1..-1].collect {|w| w.hex} # [1..-1] skips the leading tag word
+  end
+  combining_class[codepoint.hex] = char_class.to_i if char_class != "0"
+
+  if name=~/(First|Last)>$/ and (char_class!="0" or decomposition!="") # entries whose name ends in First> or Last>
+    warn "Unexpected: Character range with data relevant to normalization!"
+  end
+end
+
+# calculate compositions from decompositions
+composition_table = decomposition_table.reject do |character, decomposition|
+  composition_exclusions.member? character or # predefined composition exclusion
+    decomposition.length<=1 or                # Singleton Decomposition
+    combining_class[character] or             # character is not a Starter
+    combining_class[decomposition.first]      # decomposition begins with a character that is not a Starter
+end.invert # flip the remaining pairs: decomposition array => composed code point
+
+# recalculate composition_exclusions: every code point with a decomposition that is not a composition result
+composition_exclusions = decomposition_table.keys - composition_table.values
+
+accent_array = combining_class.keys + composition_table.keys.collect {|key| key.last} # non-starters plus the last element of each composable sequence
+
+composition_starters = composition_table.keys.collect {|key| key.first} # first element of each composable sequence
+
+hangul_no_trailing = []
+0xAC00.step(0xD7A3, 28) {|c| hangul_no_trailing << c} # every 28th code point from U+AC00 up to U+D7A3
+
+# expand decomposition table values: keep replacing elements that have a decomposition of their own
+decomposition_table.each do |key, value|
+  position = 0
+  while position < value.length
+    if decomposition = decomposition_table[value[position]]
+      decomposition_table[key] = value = value.dup # avoid overwriting composition_table key
+      value[position, 1] = decomposition # position is not advanced, so the spliced-in elements are examined next
+    else
+      position += 1
+    end
+  end
+end
+
+# deal with relationship between canonical and kompatibility decompositions
+decomposition_table.each do |key, value|
+  value = value.dup # work on a copy; decomposition_table is only read in this loop
+  expanded = false
+  position = 0
+  while position < value.length
+    if decomposition = kompatible_table[value[position]]
+      value[position, 1] = decomposition
+      expanded = true
+    else
+      position += 1
+    end
+  end
+  kompatible_table[key] = value if expanded # add an entry only when at least one element was replaced
+end
+
+while kompatible_table.any? {|key, value| # repeat until a complete pass finds nothing left to expand
+        expanded = value.map {|v| kompatible_table[v] || v}.flatten
+        kompatible_table[key] = expanded unless value == expanded # truthy when an entry changed, so any? stops at the first change
+      }
+end
+
+# generate normalization tables file
+%># coding: us-ascii
+%# >
+
+# automatically generated by template/unicode_norm_gen.tmpl
+
+module UnicodeNormalize
+  UNICODE_VERSION = "<%=unicode_version%>".freeze
+
+  accents = "" \
+    "[<% accent_array.each_regexp_chars do |rx|%><%=rx%>" \
+    "<% end%>]" \
+  "".freeze
+  ACCENTS = accents
+  REGEXP_D_STRING = "#{''  # composition starters and composition exclusions
+    }" \
+    "[<% (composition_table.values+composition_exclusions).each_regexp_chars do |rx|%><%=rx%>" \
+    "<% end%>]#{accents}*" \
+    "|#{''  # characters that can be the result of a composition, except composition starters
+    }" \
+    "[<% (composition_starters-composition_table.values).each_regexp_chars do |rx|%><%=rx%>" \
+    "<% end%>]?#{accents}+" \
+    "|#{''  # precomposed Hangul syllables
+    }" \
+    "[\u{AC00}-\u{D7A4}]" \
+  "".freeze
+  REGEXP_C_STRING = "#{''  # composition exclusions
+    }" \
+    "[<% composition_exclusions.each_regexp_chars do |rx|%><%=rx%>" \
+    "<% end%>]#{accents}*" \
+    "|#{''  # composition starters and characters that can be the result of a composition
+    }" \
+    "[<% (composition_starters+composition_table.values).each_regexp_chars do |rx|%><%=rx%>" \
+    "<% end%>]?#{accents}+" \
+    "|#{''  # Hangul syllables with separate trailer
+    }" \
+    "[<% hangul_no_trailing.each_regexp_chars do |rx|%><%=rx%>" \
+    "<% end%>][\u11A8-\u11C2]" \
+    "|#{''  # decomposed Hangul syllables
+    }" \
+    "[\u1100-\u1112][\u1161-\u1175][\u11A8-\u11C2]?" \
+  "".freeze
+  REGEXP_K_STRING = "" \
+    "[<% kompatible_table.keys.each_regexp_chars do |rx|%><%=rx%>" \
+    "<%end%>]" \
+  "".freeze
+
+  class_table = {
+% combining_class.each_slice(8) do |slice| # up to eight "char"=>class entries per generated line
+   <% slice.each do |key, value|%> "<%=key.to_UTF8%>"=><%=value%><%=%>,<% end%>
+% end
+  }
+  class_table.default = 0
+  CLASS_TABLE = class_table.freeze
+
+  DECOMPOSITION_TABLE = {
+% decomposition_table.each_slice(8) do |slice| # up to eight "char"=>"decomposition" entries per generated line
+   <% slice.each do |key, value|%> "<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>".freeze<%=%>,<% end%>
+% end
+  }.freeze
+
+  KOMPATIBLE_TABLE = {
+% kompatible_table.each_slice(8) do |slice| # up to eight "char"=>"decomposition" entries per generated line
+   <% slice.each do |key, value|%> "<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>".freeze<%=%>,<% end%>
+% end
+  }.freeze
+
+  COMPOSITION_TABLE = {
+% composition_table.each_slice(8) do |slice| # up to eight "sequence"=>"composed char" entries per generated line
+   <% slice.each do |key, value|%> "<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>".freeze<%=%>,<% end%>
+% end
+  }.freeze
+end
-- 
cgit v1.2.3