| #!/usr/bin/env ruby |
| # |
| # This script has been updated to accept more command-line arguments: |
| # |
| # -u, --url URL to process |
| # -m, --machine Machine name |
| # -p, --properties Properties to add to the machine |
| # -o, --output Write output to file |
| # |
| # Updated by: Marty Schoch <marty.schoch@gmail.com> |
| # |
| # This script uses the unicode spec to generate a Ragel state machine |
| # that recognizes unicode alphanumeric characters. It generates 5 |
| # character classes: uupper, ulower, ualpha, udigit, and ualnum. |
| # Currently supported encodings are UTF-8 [default] and UCS-4. |
| # |
| # Usage: unicode2ragel.rb [options] |
| # -e, --encoding [ucs4 | utf8] Data encoding |
| # -h, --help Show this message |
| # |
| # This script was originally written as part of the Ferret search |
| # engine library. |
| # |
| # Author: Rakan El-Khalil <rakan@well.com> |
| |
| require 'optparse' |
| require 'open-uri' |
| |
###
# Supported input encodings, and the Ragel alphtype announced in the
# generated header for each of them.  Frozen so the shared tables cannot
# be modified by accident.

ENCODINGS = [:utf8, :ucs4].freeze
ALPHTYPES = { :utf8 => "byte", :ucs4 => "rune" }.freeze

###
# Values used when the matching command-line option is not given.

DEFAULT_CHART_URL = "http://www.unicode.org/Public/5.1.0/ucd/DerivedCoreProperties.txt".freeze
DEFAULT_MACHINE_NAME = "WChar".freeze

###
# Display vars & default option

# TOTAL_WIDTH and RANGE_WIDTH together determine how much room is left for
# the description comment printed next to each generated range.
TOTAL_WIDTH = 80
RANGE_WIDTH = 23

@encoding    = :utf8                 # one of ENCODINGS
@chart_url   = DEFAULT_CHART_URL     # document the properties are read from
machine_name = DEFAULT_MACHINE_NAME  # name used in the "machine ...;" statement
properties   = []                    # property names to generate machines for
@output      = $stdout               # IO the generated Ragel source is written to
| |
###
# Option parsing
#
# Fills in @encoding, @chart_url, machine_name, properties and @output from
# the command line.  Usage errors (unknown option, missing argument, invalid
# encoding) are reported on stderr together with the usage text and end the
# script with a non-zero exit status.

# Path given with -o/--output.  The file is only opened after all arguments
# have been validated, so a rejected invocation never truncates it.
output_path = nil

cli_opts = OptionParser.new do |opts|
  opts.on("-e", "--encoding [ucs4 | utf8]", "Data encoding") do |o|
    # The bracketed argument is optional, so a bare "-e" passes nil here.
    # to_s turns that into an empty name, which the validation below rejects
    # with a proper message instead of crashing on nil.downcase.
    @encoding = o.to_s.downcase.to_sym
  end
  opts.on("-h", "--help", "Show this message") do
    puts opts
    exit
  end
  opts.on("-u", "--url URL", "URL to process") do |o|
    @chart_url = o
  end
  opts.on("-m", "--machine MACHINE_NAME", "Machine name") do |o|
    machine_name = o
  end
  opts.on("-p", "--properties x,y,z", Array, "Properties to add to machine") do |o|
    properties = o
  end
  opts.on("-o", "--output FILE", "output file") do |o|
    output_path = o
  end
end

begin
  cli_opts.parse(ARGV)
rescue OptionParser::ParseError => e
  $stderr.puts e.message
  $stderr.puts cli_opts
  exit 1
end

unless ENCODINGS.member? @encoding
  $stderr.puts "Invalid encoding: #{@encoding}"
  $stderr.puts cli_opts
  exit 1
end

# "w+" creates the file or truncates an existing one.
@output = File.new(output_path, "w+") if output_path
| |
##
# Reads the document at +url+ and yields, for every data line that carries
# +property+, the codepoint range and its description.
#
# url::      anything URI.open accepts.  With open-uri loaded, URL strings
#            are fetched over the network; other names are handed on to
#            Kernel#open, so a local path works as well.
# property:: property name as it appears between the ";" and the "#" of a
#            data line.  It is matched literally.
#
# Yields an Integer Range (start..stop, both parsed from hex) and the text
# that follows the last "#" on the line, stripped of surrounding whitespace.

def each_alpha(url, property)
  # Built once: the property is escaped so regex metacharacters in it
  # cannot change what is matched.
  wanted = /; #{Regexp.escape(property.to_s)} #/

  # Kernel#open stopped opening URLs in Ruby 3.0; URI.open is the
  # supported entry point.
  URI.open(url) do |file|
    file.each_line do |line|
      next if line.start_with?("#")
      next unless line =~ wanted

      range, description = line.split(/;/)
      # Non-bang calls: the bang variants return nil when nothing changes,
      # which makes chaining them unsafe.
      description = description.gsub(/.*#/, '').strip

      # A single codepoint ("00AA") is reported as a one-element range.
      start, stop = range.strip.split('..')
      stop ||= start

      yield((start.hex..stop.hex), description)
    end
  end
end
| |
###
# Renders +n+ in uppercase hex using a whole number of bytes: a single
# leading "0" is added whenever the digit count would otherwise be odd.

def to_hex(n)
  digits = format("%0X", n)
  digits.size.odd? ? "0#{digits}" : digits
end
| |
###
# UCS4 needs no re-encoding: the codepoints themselves are written out in
# hex.  Returns a one-element array holding either "0xAA" (single
# codepoint) or "0xAA..0xBB" (proper range).

def to_ucs4(range)
  first = range.begin
  last  = range.end
  return ["0x#{to_hex(first)}"] if first == last

  ["0x#{to_hex(first)}..0x#{to_hex(last)}"]
end
| |
##
# UTF-8 byte layouts by codepoint range:
#
# 0x00 - 0x7f -> 0zzzzzzz[7]
# 0x80 - 0x7ff -> 110yyyyy[5] 10zzzzzz[6]
# 0x800 - 0xffff -> 1110xxxx[4] 10yyyyyy[6] 10zzzzzz[6]
# 0x010000 - 0x10ffff -> 11110www[3] 10xxxxxx[6] 10yyyyyy[6] 10zzzzzz[6]

# Largest codepoint that fits in 1, 2, 3 and 4 bytes respectively.
UTF8_BOUNDARIES = [0x7f, 0x7ff, 0xffff, 0x10ffff]

##
# Encodes codepoint +n+ as UTF-8 and returns the bytes as one hex string
# (two digits per byte, via to_hex).  Codepoints above 0x10ffff are not
# encoded; they produce "00".

def to_utf8_enc(n)
  return to_hex(n) if n <= 0x7f

  # [largest codepoint, lead-byte marker bits, number of continuation bytes]
  layouts = [[0x7ff, 0xc0, 1], [0xffff, 0xe0, 2], [0x10ffff, 0xf0, 3]]
  layout = layouts.find { |limit, _marker, _tail| n <= limit }
  return to_hex(0) unless layout

  _limit, marker, tail = layout

  # Lead byte first, then one 10zzzzzz byte per remaining 6-bit group,
  # most significant group first.
  packed = marker | (n >> (6 * tail))
  (tail - 1).downto(0) do |group|
    packed = (packed << 8) | 0x80 | ((n >> (6 * group)) & 0x3f)
  end

  to_hex(packed)
end
| |
##
# Inverse of to_utf8_enc: takes the hex string of a UTF-8 byte sequence
# (e.g. "E282AC") and returns the codepoint it encodes.
#
# The sequence length is inferred from the magnitude of the packed value.
# Only the payload bits of each byte are kept, so a lone continuation or
# lead byte such as "BF" simply yields its low payload bits (0x3f).
# Values above 0xf7ffffff are not decoded and give 0.

def from_utf8_enc( n )
  n = n.hex
  if n <= 0x7f
    n
  elsif n <= 0xdfff
    # 110yyyyy 10zzzzzz
    y = (n >> 8) & 0x1f
    z = n & 0x3f
    (y << 6) | z
  elsif n <= 0xefffff
    # 1110xxxx 10yyyyyy 10zzzzzz
    x = (n >> 16) & 0x0f
    y = (n >> 8) & 0x3f
    z = n & 0x3f
    # x sits above two 6-bit groups, hence the shift by 12 (not 10).
    (x << 12) | (y << 6) | z
  elsif n <= 0xf7ffffff
    # 11110www 10xxxxxx 10yyyyyy 10zzzzzz
    w = (n >> 24) & 0x07
    x = (n >> 16) & 0x3f
    y = (n >> 8) & 0x3f
    z = n & 0x3f
    (w << 18) | (x << 12) | (y << 6) | z
  else
    0
  end
end
| |
###
# Splits +range+ into sub-ranges that each stay within one UTF-8 encoding
# length, cutting at UTF8_BOUNDARIES.  Eg: 0x00 .. 0xff => [0x00..0x7f,
# 0x80..0xff].  A range that already fits is returned as the only element.
# Whatever lies beyond the last boundary is left out of the result.
#
# This is not strictly needed since the current [5.1] unicode standard
# doesn't have ranges that straddle utf8 boundaries.  It is included for
# completeness as there is no telling if that will ever change.

def utf8_ranges(range)
  pieces = []
  remaining = range

  UTF8_BOUNDARIES.each do |limit|
    # Starts beyond this encoding length: try the next one.
    next if remaining.begin > limit

    # Fits entirely: nothing left to split.
    return pieces << remaining if remaining.end <= limit

    pieces << (remaining.begin..limit)
    remaining = (limit + 1)..remaining.end
  end

  pieces
end
| |
###
# Turns a pair of equally long hex byte strings (+start+ .. +stop+, two
# digits per byte) into Ragel alternatives.  Each returned string is a
# space-terminated sequence of byte literals ("0xC3 ") and byte ranges
# ("0x80..0xBF ") that together cover the span from start to stop.
#
# Works recursively on the leading byte; trailing bytes of a partial span
# are filled with FF / 00 and the middle part uses the full 0x00..0xFF span.

def build_range(start, stop)
  byte_count = start.size / 2
  return [""] if byte_count < 1

  trailing = byte_count - 1
  lead_lo = start[0..1]
  lead_hi = stop[0..1]

  ###
  # Shared prefix: emit the byte once and recurse on the remainder.

  if lead_lo == lead_hi
    prefix = "0x#{lead_lo} "
    return build_range(start[2..-1], stop[2..-1]).map { |suffix| prefix + suffix }
  end

  ###
  # Unshared prefix, end of run: a plain byte range.

  return ["0x#{lead_lo}..0x#{lead_hi} "] if trailing.zero?

  ###
  # Unshared prefix, not end of run
  # Range can be 0x123456..0x56789A
  # Which is equivalent to:
  # 0x123456 .. 0x12FFFF
  # 0x130000 .. 0x55FFFF
  # 0x560000 .. 0x56789A

  pieces = []
  pieces.concat(build_range(start, lead_lo + "FF" * trailing))

  ###
  # Only generate middle range if need be.

  unless lead_lo.hex + 1 == lead_hi.hex
    upper = lead_hi == "FF" ? "FF" : to_hex(lead_hi.hex - 1)
    pieces << ("0x#{to_hex(lead_lo.hex + 1)}..0x#{upper} " + "0x00..0xFF " * trailing)
  end

  ###
  # No last range when the upper lead byte is FF: the middle range then
  # already runs up to FF.

  pieces.concat(build_range(lead_hi + "00" * trailing, stop)) unless lead_hi == "FF"
  pieces
end
| |
##
# Converts a codepoint +range+ into the list of Ragel byte-sequence
# alternatives that match its UTF-8 encoding.
#
# The range is first cut at the UTF-8 length boundaries (utf8_ranges); the
# encoded end points of every piece are then expanded by build_range.
# Always returns an Array.  It is empty when utf8_ranges yields no pieces;
# the previous map/flatten! form returned nil in that case.

def to_utf8( range )
  utf8_ranges( range ).flat_map do |piece|
    build_range( to_utf8_enc(piece.begin), to_utf8_enc(piece.end) )
  end
end
| |
##
# Validation helpers: perform a 3-way comparison of the number of
# codepoints advertised by the unicode spec for the given range, the
# originally parsed range, and the resulting encoded range.
#
# count_codepoints returns how many sequences one generated alternative
# (+code+, e.g. "0xC3 0x80..0xBF ") matches: the product of the sizes of
# all "0xAA..0xBB" ranges in it.  Plain byte literals count as one.  With
# the utf8 encoding the range bounds are decoded with from_utf8_enc before
# being subtracted; otherwise their hex values are used directly.

def count_codepoints(code)
  code.split(' ').reduce(1) do |product, token|
    next product unless token =~ /0x(.+)\.\.0x(.+)/

    low, high = $1, $2
    span = if @encoding == :utf8
             from_utf8_enc(high) - from_utf8_enc(low)
           else
             high.hex - low.hex
           end

    product * (span + 1)
  end
end
| |
##
# Checks that the generated +codes+ cover exactly as many codepoints as
# both the "[N]" count in the spec description +desc+ (1 when the
# description has no such count) and the size of the parsed +range+.

def is_valid?(range, desc, codes)
  advertised = (desc =~ /\[(\d+)\]/) ? $1.to_i : 1
  parsed = range.end - range.begin + 1

  encoded = 0
  codes.each { |code| encoded += count_codepoints(code) }

  encoded == advertised && encoded == parsed
end
| |
##
# Writes one Ragel machine definition to @output.
#
# name::     identifier the definition is assigned to
# property:: unicode property whose ranges are fetched from @chart_url
#
# Every range becomes one or more alternatives (encoded according to
# @encoding).  The first alternative of a range carries the range's
# description as a trailing comment, shortened with "..." when it does not
# fit the space left by TOTAL_WIDTH and the range column.

def generate_machine(name, property)
  separator = " " # the very first alternative has no leading "|"
  @output.puts " #{name} = "

  each_alpha(@chart_url, property) do |range, desc|
    codes = if @encoding == :ucs4
              to_ucs4(range)
            else
              to_utf8(range)
            end

    #raise "Invalid encoding of range #{range}: #{codes.inspect}" unless
    # is_valid? range, desc, codes

    # The range column is at least RANGE_WIDTH wide; whatever it needs
    # beyond that is taken away from the description.
    widest = codes.map { |code| code.size }.max
    column = widest < RANGE_WIDTH ? RANGE_WIDTH : widest
    room = TOTAL_WIDTH - column - 11

    desc = desc[0..room - 4] + "..." if desc.size > room

    codes.each_with_index do |code, idx|
      comment = idx.zero? ? desc : ""
      @output.puts " #{separator} #{code.ljust(column)} ##{comment}"
      separator = "|"
    end
  end

  @output.puts " ;"
  @output.puts ""
end
| |
###
# Emit the generated Ragel file: a header comment, the opening of the
# machine block, one definition per requested property, and the closing
# of the block.

# The property names are joined explicitly; interpolating the Array itself
# would print its inspect form (e.g. ["Alphabetic", "Lowercase"]).
@output.puts <<EOF
# The following Ragel file was autogenerated with #{$0}
# from: #{@chart_url}
#
# It defines #{properties.join(', ')}.
#
# To use this, make sure that your alphtype is set to #{ALPHTYPES[@encoding]},
# and that your input is in #{@encoding}.

%%{
machine #{machine_name};

EOF

# Each property is emitted as a definition named after the property itself.
properties.each { |property| generate_machine( property, property ) }

@output.puts <<EOF
}%%
EOF