#!/usr/bin/env ruby

# Source: jni/ruby/tool/enc-unicode.rb, as added by commit
# fcbf63e62c627deae76c1b8cb8c0876c536ed811 ("Fresh start").
#
# Creates the data structures needed by Oniguruma to map Unicode codepoints to
# property names and POSIX character classes.
#
# To use this, get UnicodeData.txt, Scripts.txt, PropList.txt,
# PropertyAliases.txt, PropertyValueAliases.txt, DerivedCoreProperties.txt,
# DerivedAge.txt and Blocks.txt from unicode.org
# (http://unicode.org/Public/UNIDATA/) and run the following command:
#   ruby1.9 tool/enc-unicode.rb data_dir > enc/unicode/name2ctype.kwd
# This produces the source file for gperf. After this, simply make ruby.

unless ARGV.size == 1
  $stderr.puts "Usage: #{$0} data_directory"
  exit(1)
end

# Names of the POSIX bracket classes. Their order defines the first entries of
# the generated CodeRanges table and of the name -> index list.
POSIX_NAMES = %w[NEWLINE Alpha Blank Cntrl Digit Graph Lower Print Punct Space Upper XDigit Word Alnum ASCII]

# Partitions an Array of codepoints into inclusive [first, last] ranges.
#
# codepoints - Array of Integer codepoints; it may contain duplicates and is
#              sorted IN PLACE (callers share these arrays through the data
#              hash, so the sorted order is visible to them afterwards).
#
# Returns an Array of two-element Arrays. A range may begin with the value
# with which it ends, e.g. 0x0020 -> 0x0020. An empty input yields an empty
# Array.
def pair_codepoints(codepoints)
  # [][1..-1] is nil, so an empty property would otherwise raise NoMethodError.
  return [] if codepoints.empty?

  codepoints.sort!
  last_cp = codepoints.first
  pairs = [[last_cp, nil]]
  codepoints[1..-1].each do |codepoint|
    # Skip duplicates.
    next if last_cp == codepoint

    # If the current codepoint does not follow directly on from the last
    # codepoint, the last codepoint represents the end of the current range,
    # and the current codepoint represents the start of the next range.
    if last_cp.next != codepoint
      pairs[-1][-1] = last_cp
      pairs << [codepoint, nil]
    end
    last_cp = codepoint
  end

  # The final pair has as its endpoint the last codepoint for this property
  pairs[-1][-1] = codepoints.last
  pairs
end

# Reads UnicodeData.txt and collects codepoints per General Category.
#
# file - path of UnicodeData.txt.
#
# Returns [gcps, data]: gcps is the sorted list of keys of data that are not
# POSIX class names; data maps a property name to its Array of codepoints.
def parse_unicode_data(file)
  last_cp = 0
  data = {'Any' => (0x0000..0x10ffff).to_a, 'Assigned' => [],
          'ASCII' => (0..0x007F).to_a, 'NEWLINE' => [0x0a], 'Cn' => []}
  beg_cp = nil
  IO.foreach(file) do |line|
    fields = line.split(';')
    cp = fields[0].to_i(16)

    # "<Name, First>" / "<Name, Last>" line pairs describe a whole range of
    # codepoints; remember the start and expand once the end is seen.
    case fields[1]
    when /\A<(.*),\s*First>\z/
      beg_cp = cp
      next
    when /\A<(.*),\s*Last>\z/
      cps = (beg_cp..cp).to_a
    else
      beg_cp = cp
      cps = [cp]
    end

    # The Cn category represents unassigned characters. These are not listed in
    # UnicodeData.txt so we must derive them by looking for 'holes' in the range
    # of listed codepoints. If the first codepoint of the current entry is
    # greater than last_cp.next we have found a hole, so we add the missing
    # codepoints (last_cp.next up to, but excluding, beg_cp) to the Cn category.
    data['Cn'].concat((last_cp.next...beg_cp).to_a)

    # Assigned - Defined in unicode.c; interpreted as every character in the
    # Unicode range minus the unassigned characters
    data['Assigned'].concat(cps)

    # The third field denotes the 'General' category, e.g. Lu
    (data[fields[2]] ||= []).concat(cps)

    # The 'Major' category is the first letter of the 'General' category, e.g.
    # 'Lu' -> 'L'
    (data[fields[2][0,1]] ||= []).concat(cps)
    last_cp = cp
  end

  # The last Cn codepoint should be 0x10ffff. If it's not, append the missing
  # codepoints to Cn and C
  cn_remainder = (last_cp.next..0x10ffff).to_a
  data['Cn'] += cn_remainder
  data['C'] += data['Cn']

  # Special case for LC (Cased_Letter). LC = Ll + Lt + Lu
  data['LC'] = data['Ll'] + data['Lt'] + data['Lu']

  # Define General Category properties
  gcps = data.keys.sort - POSIX_NAMES

  # Returns General Category Property names and the data
  [gcps, data]
end

# Derives the character classes (POSIX brackets), e.g. [[:alpha:]], from
# properties already present in data and stores them under the POSIX names.
#
# data - property name => codepoint Array hash; it must already contain the
#        long property names used below (they are filled in by parse_scripts
#        and parse_aliases in the driver at the bottom of this file).
def define_posix_props(data)
  data['Alpha'] = data['Alphabetic']
  data['Upper'] = data['Uppercase']
  data['Lower'] = data['Lowercase']
  data['Punct'] = data['Punctuation']
  data['Digit'] = data['Decimal_Number']
  data['XDigit'] = (0x0030..0x0039).to_a + (0x0041..0x0046).to_a +
                   (0x0061..0x0066).to_a
  data['Alnum'] = data['Alpha'] + data['Digit']
  data['Space'] = data['White_Space']
  data['Blank'] = data['Space_Separator'] + [0x0009]
  data['Cntrl'] = data['Cc']
  data['Word'] = data['Alpha'] + data['Mark'] + data['Digit'] + data['Connector_Punctuation']
  data['Graph'] = data['Any'] - data['Space'] - data['Cntrl'] -
                  data['Surrogate'] - data['Unassigned']
  data['Print'] = data['Graph'] + data['Space_Separator']
end

# Reads DerivedCoreProperties.txt, Scripts.txt and PropList.txt.
#
# data       - property name => codepoint Array hash; receives one entry per
#              property found, plus 'Unknown'.
# categories - hash; receives property name => human-friendly title.
#
# Returns the Array of all property names read, followed by 'Unknown'.
def parse_scripts(data, categories)
  files = [
    {:fn => 'DerivedCoreProperties.txt', :title => 'Derived Property'},
    {:fn => 'Scripts.txt', :title => 'Script'},
    {:fn => 'PropList.txt', :title => 'Binary Property'}
  ]
  current = nil
  cps = []
  names = {}
  files.each do |file|
    IO.foreach(get_file(file[:fn])) do |line|
      if /^# Total code points: / =~ line
        # This line closes the section of the property named by the preceding
        # data lines; store what was collected and start afresh.
        data[current] = cps
        categories[current] = file[:title]
        (names[file[:title]] ||= []) << current
        cps = []
      elsif /^([0-9a-fA-F]+)(?:\.\.([0-9a-fA-F]+))?\s*;\s*(\w+)/ =~ line
        # Either a single codepoint ("XXXX ; Name") or a range
        # ("XXXX..YYYY ; Name").
        current = $3
        $2 ? cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16))
      end
    end
  end
  # All code points not explicitly listed for Script
  # have the value Unknown (Zzzz).
  data['Unknown'] = (0..0x10ffff).to_a - data.values_at(*names['Script']).flatten
  categories['Unknown'] = 'Script'
  names.values.flatten << 'Unknown'
end

# Reads PropertyAliases.txt and the sc/gc lines of PropertyValueAliases.txt,
# making every alias in data refer to the same codepoint Array as the name it
# stands for.
#
# data - property name => codepoint Array hash (modified).
#
# Returns a hash mapping normalized alias name => normalized target name.
def parse_aliases(data)
  kv = {}
  IO.foreach(get_file('PropertyAliases.txt')) do |line|
    next unless /^(\w+)\s*; (\w+)/ =~ line
    data[$1] = data[$2]
    kv[normalize_propname($1)] = normalize_propname($2)
  end
  IO.foreach(get_file('PropertyValueAliases.txt')) do |line|
    next unless /^(sc|gc)\s*; (\w+)\s*; (\w+)(?:\s*; (\w+))?/ =~ line
    # For gc lines the data is keyed by the first name ($2); for sc lines it is
    # keyed by the second name ($3). The third name ($4) is optional, so it is
    # only registered when present (otherwise a nil key would end up in data).
    if $1 == 'gc'
      data[$3] = data[$2]
      data[$4] = data[$2] if $4
      kv[normalize_propname($3)] = normalize_propname($2)
      kv[normalize_propname($4)] = normalize_propname($2) if $4
    else
      data[$2] = data[$3]
      data[$4] = data[$3] if $4
      kv[normalize_propname($2)] = normalize_propname($3)
      kv[normalize_propname($4)] = normalize_propname($3) if $4
    end
  end
  kv
end

# According to Unicode6.0.0/ch03.pdf, Section 3.1, "An update version
# never involves any additions to the character repertoire." Versions
# in DerivedAge.txt should always be /\d+\.\d+/
#
# Reads DerivedAge.txt and prints one constant per version via make_const.
#
# data - property name => codepoint Array hash; receives one "Age_X_Y" entry
#        per version.
#
# Returns the Array of version strings in file order.
def parse_age(data)
  current = nil
  last_constname = nil
  cps = []
  ages = []
  IO.foreach(get_file('DerivedAge.txt')) do |line|
    if /^# Total code points: / =~ line
      constname = constantize_agename(current)
      # each version matches all previous versions
      cps.concat(data[last_constname]) if last_constname
      data[constname] = cps
      make_const(constname, cps, "Derived Age #{current}")
      ages << current
      last_constname = constname
      cps = []
    elsif /^([0-9a-fA-F]+)(?:\.\.([0-9a-fA-F]+))?\s*;\s*(\d+\.\d+)/ =~ line
      current = $3
      $2 ? cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16))
    end
  end
  ages
end

# Reads Blocks.txt and prints one constant per block via make_const, followed
# by a constant for the codepoints that belong to no named block.
#
# data - property name => codepoint Array hash; receives one "In_Name" entry
#        per named block.
#
# Returns the Array of block constant names, ending with the No_Block one.
def parse_block(data)
  blocks = []
  IO.foreach(get_file('Blocks.txt')) do |line|
    if /^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+);\s*(.*)/ =~ line
      cps = ($1.to_i(16)..$2.to_i(16)).to_a
      constname = constantize_blockname($3)
      data[constname] = cps
      make_const(constname, cps, "Block")
      blocks << constname
    end
  end

  # All code points not belonging to any of the named blocks
  # have the value No_Block.
  no_block = (0..0x10ffff).to_a - data.values_at(*blocks).flatten
  constname = constantize_blockname("No_Block")
  make_const(constname, no_block, "Block")
  blocks << constname
end

# shim for Ruby 1.8
unless {}.respond_to?(:key)
  class Hash
    alias key index
  end
end

# Maps the name of every property already printed to its codepoint Array, so
# that a later property with equal data becomes a #define alias.
$const_cache = {}

# make_const(prop, data, name): Prints a 'static const' structure for a
# given property, its Array of codepoints, and a human-friendly name for
# the group. When an earlier property had equal data, a '#define' referring to
# that earlier constant is printed instead.
def make_const(prop, data, name)
  puts "\n/* '#{prop}': #{name} */"
  if origprop = $const_cache.key(data)
    puts "#define CR_#{prop} CR_#{origprop}"
  else
    $const_cache[prop] = data
    pairs = pair_codepoints(data)
    puts "static const OnigCodePoint CR_#{prop}[] = {"
    # The first element of the constant is the number of pairs of codepoints
    puts "\t#{pairs.size},"
    pairs.each do |pair|
      # Zero is special-cased to '0x0000' rather than going through sprintf.
      pair.map! { |c| c == 0 ? '0x0000' : sprintf("%0#6x", c) }
      puts "\t#{pair.first}, #{pair.last},"
    end
    puts "}; /* CR_#{prop} */"
  end
end

# Returns name in lower case with every '-', ' ' and '_' removed.
def normalize_propname(name)
  name = name.downcase
  name.delete!('- _')
  name
end

# Returns the constant name for a version string, e.g. "1.1" -> "Age_1_1".
def constantize_agename(name)
  "Age_#{name.sub(/\./, '_')}"
end

# Returns the constant name for a block name: "In_" followed by the name with
# every non-word character replaced by '_'.
def constantize_blockname(name)
  "In_#{name.gsub(/\W/, '_')}"
end

# Returns the path of the data file called name inside the directory given on
# the command line.
def get_file(name)
  File.join(ARGV[0], name)
end


# Write Data
puts '%{'
puts '#define long size_t'
props, data = parse_unicode_data(get_file('UnicodeData.txt'))
categories = {}
props.concat parse_scripts(data, categories)
aliases = parse_aliases(data)
define_posix_props(data)
POSIX_NAMES.each do |name|
  make_const(name, data[name], "[[:#{name}:]]")
end
print "\n#ifdef USE_UNICODE_PROPERTIES"
props.each do |name|
  # Properties without a recorded category are General Categories; tell major
  # ('L') from general ('Lu') by the length of the name.
  category = categories[name] ||
    case name.size
    when 1 then 'Major Category'
    when 2 then 'General Category'
    else '-'
    end
  make_const(name, data[name], category)
end
print "\n#ifdef USE_UNICODE_AGE_PROPERTIES"
ages = parse_age(data)
puts "#endif /* USE_UNICODE_AGE_PROPERTIES */"
blocks = parse_block(data)
puts '#endif /* USE_UNICODE_PROPERTIES */'
puts(<<'__HEREDOC')

static const OnigCodePoint* const CodeRanges[] = {
__HEREDOC
POSIX_NAMES.each { |name| puts " CR_#{name}," }
puts "#ifdef USE_UNICODE_PROPERTIES"
props.each { |name| puts " CR_#{name}," }
puts "#ifdef USE_UNICODE_AGE_PROPERTIES"
ages.each { |name| puts " CR_#{constantize_agename(name)}," }
puts "#endif /* USE_UNICODE_AGE_PROPERTIES */"
blocks.each { |name| puts " CR_#{name}," }

puts(<<'__HEREDOC')
#endif /* USE_UNICODE_PROPERTIES */
};
struct uniname2ctype_struct {
  int name, ctype;
};

static const struct uniname2ctype_struct *uniname2ctype_p(const char *, unsigned int);
%}
struct uniname2ctype_struct;
%%
__HEREDOC

# Keyword section: one "name, index" line per property. The index is the
# position of the property's entry in CodeRanges, so the counter must advance
# in exactly the order the table above was printed.
i = -1
name_to_index = {}
POSIX_NAMES.each do |name|
  i += 1
  # NEWLINE occupies a CodeRanges slot but gets no keyword entry.
  next if name == 'NEWLINE'
  name = normalize_propname(name)
  name_to_index[name] = i
  puts "%-40s %3d" % [name + ',', i]
end
puts "#ifdef USE_UNICODE_PROPERTIES"
props.each do |name|
  i += 1
  name = normalize_propname(name)
  name_to_index[name] = i
  puts "%-40s %3d" % [name + ',', i]
end
aliases.each_pair do |k, v|
  # Skip aliases that already have an entry of their own, and aliases whose
  # target was never printed.
  next if name_to_index[k]
  next unless v = name_to_index[v]
  puts "%-40s %3d" % [k + ',', v]
end
puts "#ifdef USE_UNICODE_AGE_PROPERTIES"
ages.each do |name|
  i += 1
  name = "age=#{name}"
  name_to_index[name] = i
  puts "%-40s %3d" % [name + ',', i]
end
puts "#endif /* USE_UNICODE_AGE_PROPERTIES */"
blocks.each do |name|
  i += 1
  name = normalize_propname(name)
  name_to_index[name] = i
  puts "%-40s %3d" % [name + ',', i]
end
puts(<<'__HEREDOC')
#endif /* USE_UNICODE_PROPERTIES */
%%
static int
uniname2ctype(const UChar *name, unsigned int len)
{
  const struct uniname2ctype_struct *p = uniname2ctype_p((const char *)name, len);
  if (p) return p->ctype;
  return -1;
}
__HEREDOC