From fcbf63e62c627deae76c1b8cb8c0876c536ed811 Mon Sep 17 00:00:00 2001
From: Jari Vetoniemi
Date: Mon, 16 Mar 2020 18:49:26 +0900
Subject: Fresh start

---
 jni/ruby/template/unicode_norm_gen.tmpl | 225 ++++++++++++++++++++++++++++++++
 1 file changed, 225 insertions(+)
 create mode 100644 jni/ruby/template/unicode_norm_gen.tmpl

(limited to 'jni/ruby/template/unicode_norm_gen.tmpl')

diff --git a/jni/ruby/template/unicode_norm_gen.tmpl b/jni/ruby/template/unicode_norm_gen.tmpl
new file mode 100644
index 0000000..955c858
--- /dev/null
+++ b/jni/ruby/template/unicode_norm_gen.tmpl
@@ -0,0 +1,225 @@
+%# -*- mode: ruby; coding: utf-8 -*-
+<%
+# Copyright Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
+
+# Script to generate Ruby data structures used in implementing
+# String#unicode_normalize,...
+
+# Constants for input and output directory
+InputDataDir = ARGV[0] || 'enc/unicode/data'
+unicode_version = InputDataDir[/[\d.]+\z/]  # trailing version number of the directory name, nil if absent
+
+# convenience methods
+class Integer
+  def to_UTF8()  # convert to string, taking legibility into account
+    if self>0xFFFF
+      "\\u{#{to_s(16).upcase}}"
+    elsif self>0x7f
+      "\\u#{to_s(16).upcase.rjust(4, '0')}"
+    else
+      chr.sub(/[\\\"]/, "\\\\\\\&")
+    end
+  end
+end
+
+module Enumerable
+  unless method_defined?(:each_slice)
+    def each_slice(n)
+      ary = []
+      each do |i|
+        ary << i
+        if ary.size >= n
+          yield ary
+          ary = []
+        end
+      end
+      yield ary unless ary.empty?
+      self
+    end
+  end
+end
+
+class Array
+  def to_UTF8()  collect {|c| c.to_UTF8}.join('')  end
+
+  def each_regexp_chars(n = 8) # converts an array of Integers to character ranges
+    sort.inject([]) do |ranges, value|
+      if ranges.last and ranges.last[1]+1>=value
+        ranges.last[1] = value
+        ranges
+      else
+        ranges << [value, value]
+      end
+    end.collect do |first, last|
+      case last-first
+      when 0
+        first.to_UTF8
+      when 1
+        first.to_UTF8 + last.to_UTF8
+      else
+        first.to_UTF8 + '-' + last.to_UTF8
+      end
+    end.each_slice(n) do |slice|
+      yield slice.join('')
+    end
+  end
+end
+
+# read the file 'CompositionExclusions.txt'
+composition_exclusions = vpath.open("#{InputDataDir}/CompositionExclusions.txt") {|f|
+  base = Regexp.quote(File.basename(f.path, '.*'))
+  ext = Regexp.quote(File.extname(f.path))
+  version = (line = f.gets)[/^# *#{base}-([\d.]+)#{ext}\s*$/, 1] or
+    abort "No file version in #{f.path}: #{line}"
+  (unicode_version ||= version) == version or
+    abort "Unicode version of directory (#{unicode_version}) and file (#{version}) mismatch"
+  f.grep(/^[A-Z0-9]{4,5}/) {|code| code.hex}  # String#hex reads only the leading code point of the line
+}
+
+decomposition_table = {}
+kompatible_table = {}
+combining_class = {}  # constant to allow use in Integer#to_UTF8
+
+# read the file 'UnicodeData.txt' (field 3: canonical combining class, field 5: decomposition mapping)
+vpath.foreach("#{InputDataDir}/UnicodeData.txt") do |line|
+  codepoint, name, _2, char_class, _4, decomposition, *_rest = line.split(";")
+
+  case decomposition
+  when /^[0-9A-F]/
+    decomposition_table[codepoint.hex] = decomposition.split(' ').collect {|w| w.hex}
+  when /^</  # compatibility mapping, written as "<tag> XXXX ..."
+    kompatible_table[codepoint.hex] = decomposition.split(' ').drop(1).collect {|w| w.hex}  # drop(1) skips the <tag>
+  end
+  combining_class[codepoint.hex] = char_class.to_i if char_class != "0"  # only non-zero classes (non-starters) are stored
+
+  if name=~/(First|Last)>$/ and (char_class!="0" or decomposition!="")  # range marker entries must carry no such data
+    warn "Unexpected: Character range with data relevant to normalization!"
+  end
+end
+
+# calculate compositions from decompositions
+composition_table = decomposition_table.reject do |character, decomposition|
+  composition_exclusions.member? character or # predefined composition exclusion
+  decomposition.length<=1 or                  # Singleton Decomposition
+  combining_class[character] or               # character is not a Starter
+  combining_class[decomposition.first]        # decomposition begins with a character that is not a Starter
+end.invert
+
+# recalculate composition_exclusions
+composition_exclusions = decomposition_table.keys - composition_table.values
+
+accent_array = combining_class.keys + composition_table.keys.collect {|key| key.last}
+
+composition_starters = composition_table.keys.collect {|key| key.first}
+
+hangul_no_trailing = []
+0xAC00.step(0xD7A3, 28) {|c| hangul_no_trailing << c}  # every 28th precomposed syllable
+
+# expand decomposition table values
+decomposition_table.each do |key, value|
+  position = 0
+  while position < value.length
+    if decomposition = decomposition_table[value[position]]
+      decomposition_table[key] = value = value.dup  # avoid overwriting composition_table key
+      value[position, 1] = decomposition
+    else
+      position += 1
+    end
+  end
+end
+
+# deal with relationship between canonical and kompatibility decompositions
+decomposition_table.each do |key, value|
+  value = value.dup
+  expanded = false
+  position = 0
+  while position < value.length
+    if decomposition = kompatible_table[value[position]]
+      value[position, 1] = decomposition
+      expanded = true
+    else
+      position += 1
+    end
+  end
+  kompatible_table[key] = value if expanded
+end
+
+while kompatible_table.any? {|key, value|
+    expanded = value.map {|v| kompatible_table[v] || v}.flatten
+    kompatible_table[key] = expanded unless value == expanded
+  }
+end
+
+# generate normalization tables file
+%># coding: us-ascii
+%# >
+
+# automatically generated by template/unicode_norm_gen.tmpl
+
+module UnicodeNormalize
+  UNICODE_VERSION = "<%=unicode_version%>".freeze
+
+  accents = "" \
+    "[<% accent_array.each_regexp_chars do |rx|%><%=rx%>" \
+    "<% end%>]" \
+    "".freeze
+  ACCENTS = accents
+  REGEXP_D_STRING = "#{'' # composition starters and composition exclusions
+    }" \
+    "[<% (composition_table.values+composition_exclusions).each_regexp_chars do |rx|%><%=rx%>" \
+    "<% end%>]#{accents}*" \
+    "|#{'' # characters that can be the result of a composition, except composition starters
+    }" \
+    "[<% (composition_starters-composition_table.values).each_regexp_chars do |rx|%><%=rx%>" \
+    "<% end%>]?#{accents}+" \
+    "|#{'' # precomposed Hangul syllables
+    }" \
+    "[\u{AC00}-\u{D7A4}]" \
+    "".freeze
+  REGEXP_C_STRING = "#{'' # composition exclusions
+    }" \
+    "[<% composition_exclusions.each_regexp_chars do |rx|%><%=rx%>" \
+    "<% end%>]#{accents}*" \
+    "|#{'' # composition starters and characters that can be the result of a composition
+    }" \
+    "[<% (composition_starters+composition_table.values).each_regexp_chars do |rx|%><%=rx%>" \
+    "<% end%>]?#{accents}+" \
+    "|#{'' # Hangul syllables with separate trailer
+    }" \
+    "[<% hangul_no_trailing.each_regexp_chars do |rx|%><%=rx%>" \
+    "<% end%>][\u11A8-\u11C2]" \
+    "|#{'' # decomposed Hangul syllables
+    }" \
+    "[\u1100-\u1112][\u1161-\u1175][\u11A8-\u11C2]?" \
+    "".freeze
+  REGEXP_K_STRING = "" \
+    "[<% kompatible_table.keys.each_regexp_chars do |rx|%><%=rx%>" \
+    "<%end%>]" \
+    "".freeze
+
+  class_table = {
+% combining_class.each_slice(8) do |slice|
+  <% slice.each do |key, value|%> "<%=key.to_UTF8%>"=><%=value%><%=%>,<% end%>
+% end
+  }
+  class_table.default = 0
+  CLASS_TABLE = class_table.freeze
+
+  DECOMPOSITION_TABLE = {
+% decomposition_table.each_slice(8) do |slice|
+  <% slice.each do |key, value|%> "<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>".freeze<%=%>,<% end%>
+% end
+  }.freeze
+
+  KOMPATIBLE_TABLE = {
+% kompatible_table.each_slice(8) do |slice|
+  <% slice.each do |key, value|%> "<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>".freeze<%=%>,<% end%>
+% end
+  }.freeze
+
+  COMPOSITION_TABLE = {
+% composition_table.each_slice(8) do |slice|
+  <% slice.each do |key, value|%> "<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>".freeze<%=%>,<% end%>
+% end
+  }.freeze
+end
-- 
cgit v1.2.3