jni/ruby/lib/unicode_normalize/normalize.rb


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168

# coding: utf-8

# Copyright Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)

require 'unicode_normalize/tables.rb'


module UnicodeNormalize
  ## Constant for max hash capacity to avoid DoS attack
  # Upper bound used by the NF_HASH_* caches below: once a cache holds more
  # than this many entries, one entry is dropped before a new one is stored.
  MAX_HASH_LENGTH = 18000 # enough for all test cases, otherwise tests get slow

  ## Regular Expressions and Hash Constants
  # The REGEXP_*_STRING sources are not defined in this file (presumably they
  # come from the required tables file -- confirm there). They are compiled
  # in extended mode, so whitespace inside the pattern sources is ignored.
  REGEXP_D = Regexp.compile(REGEXP_D_STRING, Regexp::EXTENDED)
  REGEXP_C = Regexp.compile(REGEXP_C_STRING, Regexp::EXTENDED)
  REGEXP_K = Regexp.compile(REGEXP_K_STRING, Regexp::EXTENDED)
  # Each NF_HASH_* is a memoizing cache: looking up a key that is not present
  # runs the default block, which computes the value with the matching *_one
  # method and stores it under that key. These hashes are passed to
  # String#gsub as the replacement table in normalize.
  # NF_HASH_D caches nfd_one results.
  NF_HASH_D = Hash.new do |hash, key|
                         hash.shift if hash.length>MAX_HASH_LENGTH # evict one entry to bound memory (prevent DoS attack)
                         hash[key] = nfd_one(key)
                       end
  # NF_HASH_C caches nfc_one results.
  NF_HASH_C = Hash.new do |hash, key|
                         hash.shift if hash.length>MAX_HASH_LENGTH # evict one entry to bound memory (prevent DoS attack)
                         hash[key] = nfc_one(key)
                       end
  # NF_HASH_K caches nfkd_one results.
  NF_HASH_K = Hash.new do |hash, key|
                         hash.shift if hash.length>MAX_HASH_LENGTH # evict one entry to bound memory (prevent DoS attack)
                         hash[key] = nfkd_one(key)
                       end

  ## Constants For Hangul
  # for details such as the meaning of the identifiers below, please see
  # http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf, pp. 144/145
  # *BASE values are code points used as offsets by hangul_decomp_one and
  # hangul_comp_one; *COUNT values are the sizes of the corresponding ranges.
  SBASE = 0xAC00
  LBASE = 0x1100
  VBASE = 0x1161
  TBASE = 0x11A7
  LCOUNT = 19
  VCOUNT = 21
  TCOUNT = 28
  NCOUNT = VCOUNT * TCOUNT # 588
  SCOUNT = LCOUNT * NCOUNT # 11172

  # Unicode-based encodings (except UTF-8)
  # Strings in one of these encodings are transcoded to UTF-8, processed,
  # and (for normalize) transcoded back; see normalize and normalized?.
  UNICODE_ENCODINGS = [Encoding::UTF_16BE, Encoding::UTF_16LE, Encoding::UTF_32BE, Encoding::UTF_32LE,
                       Encoding::GB18030, Encoding::UCS_2BE, Encoding::UCS_4BE]

  ## Hangul Algorithm
  def self.hangul_decomp_one(target)
    syllable_index = target.ord - SBASE
    return target if syllable_index < 0 || syllable_index >= SCOUNT
    l = LBASE + syllable_index / NCOUNT
    v = VBASE + (syllable_index % NCOUNT) / TCOUNT
    t = TBASE + syllable_index % TCOUNT
    (t==TBASE ? [l, v] : [l, v, t]).pack('U*') + target[1..-1]
  end

  def self.hangul_comp_one(string)
    length = string.length
    if length>1 and 0 <= (lead =string[0].ord-LBASE) and lead  < LCOUNT and
                    0 <= (vowel=string[1].ord-VBASE) and vowel < VCOUNT
      lead_vowel = SBASE + (lead * VCOUNT + vowel) * TCOUNT
      if length>2 and 0 <= (trail=string[2].ord-TBASE) and trail < TCOUNT
        (lead_vowel + trail).chr(Encoding::UTF_8) + string[3..-1]
      else
        lead_vowel.chr(Encoding::UTF_8) + string[2..-1]
      end
    else
      string
    end
  end

  ## Canonical Ordering
  def self.canonical_ordering_one(string)
    sorting = string.each_char.collect { |c| [c, CLASS_TABLE[c]] }
    (sorting.length-2).downto(0) do |i| # almost, but not exactly bubble sort
      (0..i).each do |j|
        later_class = sorting[j+1].last
        if 0<later_class and later_class<sorting[j].last
          sorting[j], sorting[j+1] = sorting[j+1], sorting[j]
        end
      end
    end
    return sorting.collect(&:first).join('')
  end

  ## Normalization Forms for Patterns (not whole Strings)
  def self.nfd_one(string)
    string = string.chars.map! {|c| DECOMPOSITION_TABLE[c] || c}.join('')
    canonical_ordering_one(hangul_decomp_one(string))
  end

  def self.nfkd_one(string)
    string.chars.map! {|c| KOMPATIBLE_TABLE[c] || c}.join('')
  end

  def self.nfc_one(string)
    nfd_string = nfd_one string
    start = nfd_string[0]
    last_class = CLASS_TABLE[start]-1
    accents = ''
    nfd_string[1..-1].each_char do |accent|
      accent_class = CLASS_TABLE[accent]
      if last_class<accent_class and composite = COMPOSITION_TABLE[start+accent]
        start = composite
      else
        accents << accent
        last_class = accent_class
      end
    end
    hangul_comp_one(start+accents)
  end

  def self.normalize(string, form = :nfc)
    encoding = string.encoding
    case encoding
    when Encoding::UTF_8
      case form
      when :nfc then
        string.gsub REGEXP_C, NF_HASH_C
      when :nfd then
        string.gsub REGEXP_D, NF_HASH_D
      when :nfkc then
        string.gsub(REGEXP_K, NF_HASH_K).gsub REGEXP_C, NF_HASH_C
      when :nfkd then
        string.gsub(REGEXP_K, NF_HASH_K).gsub REGEXP_D, NF_HASH_D
      else
        raise ArgumentError, "Invalid normalization form #{form}."
      end
    when Encoding::US_ASCII
      string
    when *UNICODE_ENCODINGS
      normalize(string.encode(Encoding::UTF_8), form).encode(encoding)
    else
      raise Encoding::CompatibilityError, "Unicode Normalization not appropriate for #{encoding}"
    end
  end

  def self.normalized?(string, form = :nfc)
    encoding = string.encoding
    case encoding
    when Encoding::UTF_8
      case form
      when :nfc then
        string.scan REGEXP_C do |match|
          return false  if NF_HASH_C[match] != match
        end
        true
      when :nfd then
        string.scan REGEXP_D do |match|
          return false  if NF_HASH_D[match] != match
        end
        true
      when :nfkc then
        normalized?(string, :nfc) and string !~ REGEXP_K
      when :nfkd then
        normalized?(string, :nfd) and string !~ REGEXP_K
      else
        raise ArgumentError, "Invalid normalization form #{form}."
      end
    when Encoding::US_ASCII
      true
    when *UNICODE_ENCODINGS
      normalized? string.encode(Encoding::UTF_8), form
    else
      raise Encoding::CompatibilityError, "Unicode Normalization not appropriate for #{encoding}"
    end
  end

end # module