hclsyntax/unicode2ragel.rb - hashicorp/hcl/v2 - Git at Google

 #!/usr/bin/env ruby
 #
 # This script has been updated to accept more command-line arguments:
 #
 #    -u, --url                        URL to process
 #    -m, --machine                    Machine name
 #    -p, --properties                 Properties to add to the machine
 #    -o, --output                     Write output to file
 #
 # Updated by: Marty Schoch <marty.schoch@gmail.com>
 #
 # This script uses the unicode spec to generate a Ragel state machine
 # that recognizes unicode alphanumeric characters.  It generates 5
 # character classes: uupper, ulower, ualpha, udigit, and ualnum.
 # Currently supported encodings are UTF-8 [default] and UCS-4.
 #
 # Usage: unicode2ragel.rb [options]
 #    -e, --encoding [ucs4 | utf8]     Data encoding
 #    -h, --help                       Show this message
 #
 # This script was originally written as part of the Ferret search
 # engine library.
 #
 # Author: Rakan El-Khalil <rakan@well.com>

 require 'optparse'
 require 'open-uri'

 # Encodings this script can emit, and the Ragel alphtype reported for each
 # one in the generated file's banner.
 ENCODINGS = [ :utf8, :ucs4 ].freeze
 ALPHTYPES = { :utf8 => "byte", :ucs4 => "rune" }.freeze
 DEFAULT_CHART_URL = "http://www.unicode.org/Public/5.1.0/ucd/DerivedCoreProperties.txt".freeze
 DEFAULT_MACHINE_NAME = "WChar".freeze

 ###
 # Display vars & default option

 TOTAL_WIDTH = 80  # width budget used to size the description column
 RANGE_WIDTH = 23  # minimum width of the byte/codepoint range column
 @encoding = :utf8
 @chart_url = DEFAULT_CHART_URL
 machine_name = DEFAULT_MACHINE_NAME
 properties = []
 @output = $stdout

 ###
 # Option parsing

 cli_opts = OptionParser.new do |opts|
   opts.on("-e", "--encoding [ucs4 | utf8]", "Data encoding") do |o|
     # The argument is declared optional, so a bare "-e" hands nil to this
     # block; keep the current encoding instead of calling nil.downcase.
     @encoding = o.downcase.to_sym if o
   end
   opts.on("-h", "--help", "Show this message") do
     puts opts
     exit
   end
   opts.on("-u", "--url URL", "URL to process") do |o|
     @chart_url = o
   end
   opts.on("-m", "--machine MACHINE_NAME", "Machine name") do |o|
     machine_name = o
   end
   opts.on("-p", "--properties x,y,z", Array, "Properties to add to machine") do |o|
     properties = o
   end
   opts.on("-o", "--output FILE", "output file") do |o|
     @output = File.new(o, "w+")
   end
 end

 cli_opts.parse(ARGV)

 # Reject unknown encodings: report on stderr and exit with a failing
 # status so calling scripts can detect the bad invocation.
 unless ENCODINGS.member? @encoding
   $stderr.puts "Invalid encoding: #{@encoding}"
   $stderr.puts cli_opts
   exit 1
 end

 ##
 # Reads the Unicode property chart at +url+ and yields one
 # (codepoint range, description) pair for every data line that assigns
 # +property+.
 #
 # url      - location passed to URI.open (provided by open-uri).
 # property - property name; it is interpolated into a regexp that must
 #            match "; <property> #" somewhere on the line.
 #
 # Yields an inclusive Range of Integer codepoints (a lone codepoint is
 # yielded as a one-element range) and the stripped text that follows the
 # last "#" in the line's second ";"-separated field.

 def each_alpha( url, property )
   # Loop-invariant, so build the matcher once instead of once per line.
   wanted = /; #{property} #/

   # Kernel#open no longer opens URLs on Ruby 3.0+; URI.open is the
   # open-uri entry point and still falls back to plain files.
   URI.open( url ) do |file|
     file.each_line do |line|
       next if line.start_with?("#")
       next unless line =~ wanted

       range, description = line.split(/;/)
       bounds = range.strip
       # Non-bang calls: the bang variants return nil when nothing changes.
       description = description.gsub(/.*#/, '').strip

       start, stop = bounds.include?('..') ? bounds.split('..') : [bounds, bounds]

       yield start.hex .. stop.hex, description
     end
   end
 end

 ###
 # Renders +n+ as uppercase hexadecimal with an even number of digits,
 # prepending one "0" when needed so the string always describes whole
 # bytes (for example 0x100 becomes "0100").

 def to_hex( n )
   digits = format("%0X", n)
   digits.length.even? ? digits : "0#{digits}"
 end

 ###
 # UCS4 needs no byte-level encoding: the range is written directly as
 # hex codepoints.  Returns a one-element Array holding "0xAA" for a
 # single codepoint or "0xAA..0xBB" for a span.

 def to_ucs4( range )
   first, last = range.begin, range.end
   text = "0x#{to_hex(first)}"
   text += "..0x#{to_hex(last)}" unless first == last
   [ text ]
 end

 ##
 # UTF-8 byte layouts by codepoint range:
 #
 # 0x00     - 0x7f     -> 0zzzzzzz[7]
 # 0x80     - 0x7ff    -> 110yyyyy[5] 10zzzzzz[6]
 # 0x800    - 0xffff   -> 1110xxxx[4] 10yyyyyy[6] 10zzzzzz[6]
 # 0x010000 - 0x10ffff -> 11110www[3] 10xxxxxx[6] 10yyyyyy[6] 10zzzzzz[6]
 #
 # UTF8_BOUNDARIES holds the upper bound of each row above, in ascending
 # order.  It is frozen because it is shared, read-only data.

 UTF8_BOUNDARIES = [0x7f, 0x7ff, 0xffff, 0x10ffff].freeze

 # Encodes codepoint +n+ as UTF-8 and returns the encoded bytes packed
 # into one number, formatted by to_hex (e.g. 0x20AC -> "E282AC").
 # Codepoints above 0x10ffff are not encoded and come back as "00".

 def to_utf8_enc( n )
   # Continuation byte carrying the six bits of n that start at +shift+.
   trail = lambda { |shift| 0x80 | ((n >> shift) & 0x3f) }

   bytes =
     if    n <= 0x7f     then [n]
     elsif n <= 0x7ff    then [0xc0 | (n >> 6), trail.call(0)]
     elsif n <= 0xffff   then [0xe0 | (n >> 12), trail.call(6), trail.call(0)]
     elsif n <= 0x10ffff then [0xf0 | (n >> 18), trail.call(12), trail.call(6), trail.call(0)]
     else [0]
     end

   # Pack the bytes big-endian into a single integer before formatting.
   to_hex(bytes.inject(0) { |packed, byte| (packed << 8) | byte })
 end

 # Inverse of to_utf8_enc: takes a hex string of packed UTF-8 bytes
 # (e.g. "E282AC") and returns the decoded value as an Integer.
 #
 # The byte count is inferred from the magnitude of the packed number.
 # Packed values above 0xf7ffffff match no branch and decode to 0.

 def from_utf8_enc( n )
   n = n.hex
   r = 0
   if n <= 0x7f
     r = n
   elsif n <= 0xdfff
     y = (n >> 8) & 0x1f
     z =  n       & 0x3f
     r = y << 6 | z
   elsif n <= 0xefffff
     x = (n >> 16) & 0x0f
     y = (n >>  8) & 0x3f
     z =  n        & 0x3f
     # Two 6-bit continuation bytes follow the lead byte, so its bits sit
     # 12 positions up (the previous shift of 10 overlapped them with y).
     r = x << 12 | y << 6 | z
   elsif n <= 0xf7ffffff
     w = (n >> 24) & 0x07
     x = (n >> 16) & 0x3f
     y = (n >>  8) & 0x3f
     z =  n        & 0x3f
     r = w << 18 | x << 12 | y << 6 | z
   end
   r
 end

 ###
 # Splits +range+ wherever it crosses a UTF8_BOUNDARIES entry, so that
 # every returned sub-range encodes to a single UTF-8 byte length.
 # Eg: 0x00 .. 0xff => [0x00..0x7f, 0x80..0xff]
 #
 # Returns an Array of ranges.  A range that starts above the last
 # boundary yields an empty Array, and any portion that extends past the
 # last boundary is dropped.

 def utf8_ranges( range )
   UTF8_BOUNDARIES.each_with_object([]) do |limit, pieces|
     # This boundary lies entirely below what is left of the range.
     next if range.begin > limit

     # The remainder fits under this boundary: it is the final piece.
     return pieces << range if range.end <= limit

     pieces << (range.begin .. limit)
     range = (limit + 1) .. range.end
   end
 end

 # Expands the span between two packed hex strings into Ragel byte-range
 # alternatives.  +start+ and +stop+ are read two hex digits (one byte) at
 # a time; the callers in this file pass to_utf8_enc output of equal
 # length.
 #
 # Returns an Array of Strings; each one is a space-terminated sequence of
 # "0xNN " or "0xNN..0xMM " terms, one term per byte.

 def build_range( start, stop )
   byte_count = start.size / 2
   return [""] if byte_count < 1

   remaining = byte_count - 1
   head = start[0, 2]
   tail = stop[0, 2]

   ###
   # Shared prefix: emit the byte literally and recurse on the rest.

   if head == tail
     prefix = "0x#{head} "
     return build_range(start[2..-1], stop[2..-1]).map { |rest| prefix + rest }
   end

   ###
   # Unshared prefix, end of run

   return ["0x#{head}..0x#{tail} "] if remaining.zero?

   ###
   # Unshared prefix, not end of run
   # Range can be 0x123456..0x56789A
   # Which is equivalent to:
   #     0x123456 .. 0x12FFFF
   #     0x130000 .. 0x55FFFF
   #     0x560000 .. 0x56789A

   tail_is_ff = (tail == "FF")
   pieces = build_range(start, head + "FF" * remaining)

   ###
   # Only generate middle range if need be.  When the last lead byte is
   # FF the middle range runs through FF itself.

   unless head.hex + 1 == tail.hex
     upper = tail_is_ff ? "FF" : to_hex(tail.hex - 1)
     pieces << "0x#{to_hex(head.hex + 1)}..0x#{upper} " + "0x00..0xFF " * remaining
   end

   ###
   # Don't generate a last range when the final lead byte is FF.

   pieces.concat(build_range(tail + "00" * remaining, stop)) unless tail_is_ff
   pieces
 end

 # Converts a codepoint +range+ into Ragel byte-range strings for UTF-8.
 # The range is split by utf8_ranges, each piece's endpoints are encoded
 # with to_utf8_enc, and build_range expands every encoded pair.
 #
 # Always returns an Array.  flat_map is used because map followed by
 # flatten! returns nil when there is nothing to flatten, which happened
 # whenever utf8_ranges produced no pieces.

 def to_utf8( range )
   utf8_ranges( range ).flat_map do |piece|
     build_range(to_utf8_enc(piece.begin), to_utf8_enc(piece.end))
   end
 end

 ##
 # Perform a 3-way comparison of the number of codepoints advertised by
 # the unicode spec for the given range, the originally parsed range,
 # and the resulting utf8 encoded range.

 # Counts what one generated code string covers: the product of the
 # sizes of all "0xAA..0xBB" terms in +code+.  Terms without ".."
 # contribute a factor of one.  Under :utf8 each endpoint is passed
 # through from_utf8_enc before subtracting; otherwise the endpoints are
 # read as plain hex.

 def count_codepoints( code )
   decode =
     if @encoding == :utf8
       lambda { |hex| from_utf8_enc(hex) }
     else
       lambda { |hex| hex.hex }
     end

   code.split(' ').reduce(1) do |product, term|
     bounds = /0x(.+)\.\.0x(.+)/.match(term)
     next product unless bounds

     product * (decode.call(bounds[2]) - decode.call(bounds[1]) + 1)
   end
 end

 # Returns true only when three counts agree: the "[N]" count found in
 # +desc+ (1 when there is none), the size of +range+, and the total
 # that count_codepoints reports across +codes+.

 def is_valid?( range, desc, codes )
   bracketed  = desc[/\[(\d+)\]/, 1]
   advertised = bracketed ? bracketed.to_i : 1
   span       = range.end - range.begin + 1
   encoded    = codes.map { |code| count_codepoints(code) }.inject(0, :+)

   encoded == advertised && encoded == span
 end

 ##
 # Writes one Ragel machine definition to @output.
 #
 # name     - identifier emitted on the left of " = ".
 # property - chart property forwarded to each_alpha (read from @chart_url).
 #
 # Every range yielded by each_alpha is encoded according to @encoding and
 # written as one alternative per code string.  The first alternative of
 # the machine is introduced by a space and all later ones by "|".  Only
 # the first line of each range carries the description comment.

 def generate_machine( name, property )
   lead = " "
   @output.puts "    #{name} = "
   each_alpha( @chart_url, property ) do |range, desc|
     codes = @encoding == :ucs4 ? to_ucs4(range) : to_utf8(range)

     #raise "Invalid encoding of range #{range}: #{codes.inspect}" unless
     #  is_valid? range, desc, codes

     # The code column is as wide as the longest code, never narrower
     # than RANGE_WIDTH; the description gets whatever room is left.
     widest     = codes.map(&:size).max
     code_width = widest < RANGE_WIDTH ? RANGE_WIDTH : widest
     desc_width = TOTAL_WIDTH - code_width - 11
     label      = desc.size > desc_width ? desc[0..desc_width - 4] + "..." : desc

     codes.each_with_index do |code, idx|
       note = idx.zero? ? label : ""
       @output.puts "      #{lead} #{code.ljust(code_width)} ##{note}"
       lead = "|"
     end
   end
   @output.puts "      ;"
   @output.puts ""
 end

 # Write the generated file to @output in three steps.  First the banner
 # (generator name, chart URL, the requested properties, the alphtype
 # matching @encoding) and the opening of the Ragel block with the
 # machine name:
 @output.puts <<EOF
 # The following Ragel file was autogenerated with #{$0}
 # from: #{@chart_url}
 #
 # It defines #{properties}.
 #
 # To use this, make sure that your alphtype is set to #{ALPHTYPES[@encoding]},
 # and that your input is in #{@encoding}.

 %%{
     machine #{machine_name};

 EOF

 # Then one machine definition per requested property; the property name
 # doubles as the machine definition's name.
 properties.each { |x| generate_machine( x, x ) }

 # Finally close the Ragel block.
 @output.puts <<EOF
 }%%
 EOF
	#!/usr/bin/env ruby
	#
	# This script has been updated to accept more command-line arguments:
	#
	# -u, --url URL to process
	# -m, --machine Machine name
	# -p, --properties Properties to add to the machine
	# -o, --output Write output to file
	#
	# Updated by: Marty Schoch <marty.schoch@gmail.com>
	#
	# This script uses the unicode spec to generate a Ragel state machine
	# that recognizes unicode alphanumeric characters. It generates 5
	# character classes: uupper, ulower, ualpha, udigit, and ualnum.
	# Currently supported encodings are UTF-8 [default] and UCS-4.
	#
	# Usage: unicode2ragel.rb [options]
	# -e, --encoding [ucs4 | utf8] Data encoding
	# -h, --help Show this message
	#
	# This script was originally written as part of the Ferret search
	# engine library.
	#
	# Author: Rakan El-Khalil <rakan@well.com>

	require 'optparse'
	require 'open-uri'

	# Encodings this script can emit, and the Ragel alphtype reported for each
	# one in the generated file's banner.
	ENCODINGS = [ :utf8, :ucs4 ].freeze
	ALPHTYPES = { :utf8 => "byte", :ucs4 => "rune" }.freeze
	DEFAULT_CHART_URL = "http://www.unicode.org/Public/5.1.0/ucd/DerivedCoreProperties.txt".freeze
	DEFAULT_MACHINE_NAME = "WChar".freeze

	###
	# Display vars & default option

	TOTAL_WIDTH = 80  # width budget used to size the description column
	RANGE_WIDTH = 23  # minimum width of the byte/codepoint range column
	@encoding = :utf8
	@chart_url = DEFAULT_CHART_URL
	machine_name = DEFAULT_MACHINE_NAME
	properties = []
	@output = $stdout

	###
	# Option parsing (block parameters restored from the escaped "\|" form,
	# which is not valid Ruby)

	cli_opts = OptionParser.new do |opts|
	  opts.on("-e", "--encoding [ucs4 | utf8]", "Data encoding") do |o|
	    # The argument is declared optional, so a bare "-e" hands nil to this
	    # block; keep the current encoding instead of calling nil.downcase.
	    @encoding = o.downcase.to_sym if o
	  end
	  opts.on("-h", "--help", "Show this message") do
	    puts opts
	    exit
	  end
	  opts.on("-u", "--url URL", "URL to process") do |o|
	    @chart_url = o
	  end
	  opts.on("-m", "--machine MACHINE_NAME", "Machine name") do |o|
	    machine_name = o
	  end
	  opts.on("-p", "--properties x,y,z", Array, "Properties to add to machine") do |o|
	    properties = o
	  end
	  opts.on("-o", "--output FILE", "output file") do |o|
	    @output = File.new(o, "w+")
	  end
	end

	cli_opts.parse(ARGV)

	# Reject unknown encodings: report on stderr and exit with a failing
	# status so calling scripts can detect the bad invocation.
	unless ENCODINGS.member? @encoding
	  $stderr.puts "Invalid encoding: #{@encoding}"
	  $stderr.puts cli_opts
	  exit 1
	end

	##
	# Reads the Unicode property chart at +url+ and yields one
	# (codepoint range, description) pair for every data line that assigns
	# +property+.
	#
	# url      - location passed to URI.open (provided by open-uri).
	# property - property name; it is interpolated into a regexp that must
	#            match "; <property> #" somewhere on the line.
	#
	# Yields an inclusive Range of Integer codepoints (a lone codepoint is
	# yielded as a one-element range) and the stripped text that follows the
	# last "#" in the line's second ";"-separated field.

	def each_alpha( url, property )
	  # Loop-invariant, so build the matcher once instead of once per line.
	  wanted = /; #{property} #/

	  # Kernel#open no longer opens URLs on Ruby 3.0+; URI.open is the
	  # open-uri entry point and still falls back to plain files.
	  URI.open( url ) do |file|
	    file.each_line do |line|
	      next if line.start_with?("#")
	      next unless line =~ wanted

	      range, description = line.split(/;/)
	      bounds = range.strip
	      # Non-bang calls: the bang variants return nil when nothing changes.
	      description = description.gsub(/.*#/, '').strip

	      start, stop = bounds.include?('..') ? bounds.split('..') : [bounds, bounds]

	      yield start.hex .. stop.hex, description
	    end
	  end
	end

	###
	# Renders +n+ as uppercase hexadecimal with an even number of digits,
	# prepending one "0" when needed so the string always describes whole
	# bytes (for example 0x100 becomes "0100").

	def to_hex( n )
	  digits = format("%0X", n)
	  digits.length.even? ? digits : "0#{digits}"
	end

	###
	# UCS4 needs no byte-level encoding: the range is written directly as
	# hex codepoints.  Returns a one-element Array holding "0xAA" for a
	# single codepoint or "0xAA..0xBB" for a span.

	def to_ucs4( range )
	  first, last = range.begin, range.end
	  text = "0x#{to_hex(first)}"
	  text += "..0x#{to_hex(last)}" unless first == last
	  [ text ]
	end

	##
	# UTF-8 byte layouts by codepoint range:
	#
	# 0x00     - 0x7f     -> 0zzzzzzz[7]
	# 0x80     - 0x7ff    -> 110yyyyy[5] 10zzzzzz[6]
	# 0x800    - 0xffff   -> 1110xxxx[4] 10yyyyyy[6] 10zzzzzz[6]
	# 0x010000 - 0x10ffff -> 11110www[3] 10xxxxxx[6] 10yyyyyy[6] 10zzzzzz[6]
	#
	# UTF8_BOUNDARIES holds the upper bound of each row above, in ascending
	# order.  It is frozen because it is shared, read-only data.

	UTF8_BOUNDARIES = [0x7f, 0x7ff, 0xffff, 0x10ffff].freeze

	# Encodes codepoint +n+ as UTF-8 and returns the encoded bytes packed
	# into one number, formatted by to_hex (e.g. 0x20AC -> "E282AC").
	# Codepoints above 0x10ffff are not encoded and come back as "00".
	# (Rewritten with real "|" operators; the escaped "\|" form does not
	# parse as Ruby.)

	def to_utf8_enc( n )
	  # Continuation byte carrying the six bits of n that start at +shift+.
	  trail = lambda { |shift| 0x80 | ((n >> shift) & 0x3f) }

	  bytes =
	    if    n <= 0x7f     then [n]
	    elsif n <= 0x7ff    then [0xc0 | (n >> 6), trail.call(0)]
	    elsif n <= 0xffff   then [0xe0 | (n >> 12), trail.call(6), trail.call(0)]
	    elsif n <= 0x10ffff then [0xf0 | (n >> 18), trail.call(12), trail.call(6), trail.call(0)]
	    else [0]
	    end

	  # Pack the bytes big-endian into a single integer before formatting.
	  to_hex(bytes.inject(0) { |packed, byte| (packed << 8) | byte })
	end

	# Inverse of to_utf8_enc: takes a hex string of packed UTF-8 bytes
	# (e.g. "E282AC") and returns the decoded value as an Integer.
	#
	# The byte count is inferred from the magnitude of the packed number.
	# Packed values above 0xf7ffffff match no branch and decode to 0.

	def from_utf8_enc( n )
	  n = n.hex
	  r = 0
	  if n <= 0x7f
	    r = n
	  elsif n <= 0xdfff
	    y = (n >> 8) & 0x1f
	    z =  n       & 0x3f
	    r = y << 6 | z
	  elsif n <= 0xefffff
	    x = (n >> 16) & 0x0f
	    y = (n >>  8) & 0x3f
	    z =  n        & 0x3f
	    # Two 6-bit continuation bytes follow the lead byte, so its bits sit
	    # 12 positions up (the previous shift of 10 overlapped them with y).
	    r = x << 12 | y << 6 | z
	  elsif n <= 0xf7ffffff
	    w = (n >> 24) & 0x07
	    x = (n >> 16) & 0x3f
	    y = (n >>  8) & 0x3f
	    z =  n        & 0x3f
	    r = w << 18 | x << 12 | y << 6 | z
	  end
	  r
	end

	###
	# Splits +range+ wherever it crosses a UTF8_BOUNDARIES entry, so that
	# every returned sub-range encodes to a single UTF-8 byte length.
	# Eg: 0x00 .. 0xff => [0x00..0x7f, 0x80..0xff]
	#
	# Returns an Array of ranges.  A range that starts above the last
	# boundary yields an empty Array, and any portion that extends past the
	# last boundary is dropped.

	def utf8_ranges( range )
	  UTF8_BOUNDARIES.each_with_object([]) do |limit, pieces|
	    # This boundary lies entirely below what is left of the range.
	    next if range.begin > limit

	    # The remainder fits under this boundary: it is the final piece.
	    return pieces << range if range.end <= limit

	    pieces << (range.begin .. limit)
	    range = (limit + 1) .. range.end
	  end
	end

	# Expands the span between two packed hex strings into Ragel byte-range
	# alternatives.  +start+ and +stop+ are read two hex digits (one byte) at
	# a time; the callers in this file pass to_utf8_enc output of equal
	# length.
	#
	# Returns an Array of Strings; each one is a space-terminated sequence of
	# "0xNN " or "0xNN..0xMM " terms, one term per byte.

	def build_range( start, stop )
	  byte_count = start.size / 2
	  return [""] if byte_count < 1

	  remaining = byte_count - 1
	  head = start[0, 2]
	  tail = stop[0, 2]

	  ###
	  # Shared prefix: emit the byte literally and recurse on the rest.

	  if head == tail
	    prefix = "0x#{head} "
	    return build_range(start[2..-1], stop[2..-1]).map { |rest| prefix + rest }
	  end

	  ###
	  # Unshared prefix, end of run

	  return ["0x#{head}..0x#{tail} "] if remaining.zero?

	  ###
	  # Unshared prefix, not end of run
	  # Range can be 0x123456..0x56789A
	  # Which is equivalent to:
	  #     0x123456 .. 0x12FFFF
	  #     0x130000 .. 0x55FFFF
	  #     0x560000 .. 0x56789A

	  tail_is_ff = (tail == "FF")
	  pieces = build_range(start, head + "FF" * remaining)

	  ###
	  # Only generate middle range if need be.  When the last lead byte is
	  # FF the middle range runs through FF itself.

	  unless head.hex + 1 == tail.hex
	    upper = tail_is_ff ? "FF" : to_hex(tail.hex - 1)
	    pieces << "0x#{to_hex(head.hex + 1)}..0x#{upper} " + "0x00..0xFF " * remaining
	  end

	  ###
	  # Don't generate a last range when the final lead byte is FF.

	  pieces.concat(build_range(tail + "00" * remaining, stop)) unless tail_is_ff
	  pieces
	end

	# Converts a codepoint +range+ into Ragel byte-range strings for UTF-8.
	# The range is split by utf8_ranges, each piece's endpoints are encoded
	# with to_utf8_enc, and build_range expands every encoded pair.
	#
	# Always returns an Array.  flat_map is used because map followed by
	# flatten! returns nil when there is nothing to flatten, which happened
	# whenever utf8_ranges produced no pieces.

	def to_utf8( range )
	  utf8_ranges( range ).flat_map do |piece|
	    build_range(to_utf8_enc(piece.begin), to_utf8_enc(piece.end))
	  end
	end

	##
	# Perform a 3-way comparison of the number of codepoints advertised by
	# the unicode spec for the given range, the originally parsed range,
	# and the resulting utf8 encoded range.

	# Counts what one generated code string covers: the product of the
	# sizes of all "0xAA..0xBB" terms in +code+.  Terms without ".."
	# contribute a factor of one.  Under :utf8 each endpoint is passed
	# through from_utf8_enc before subtracting; otherwise the endpoints are
	# read as plain hex.  (Block parameters restored from the escaped "\|"
	# form, which is not valid Ruby.)

	def count_codepoints( code )
	  decode =
	    if @encoding == :utf8
	      lambda { |hex| from_utf8_enc(hex) }
	    else
	      lambda { |hex| hex.hex }
	    end

	  code.split(' ').reduce(1) do |product, term|
	    bounds = /0x(.+)\.\.0x(.+)/.match(term)
	    next product unless bounds

	    product * (decode.call(bounds[2]) - decode.call(bounds[1]) + 1)
	  end
	end

	# Returns true only when three counts agree: the "[N]" count found in
	# +desc+ (1 when there is none), the size of +range+, and the total
	# that count_codepoints reports across +codes+.  (Block parameters
	# restored from the escaped "\|" form, which is not valid Ruby.)

	def is_valid?( range, desc, codes )
	  bracketed  = desc[/\[(\d+)\]/, 1]
	  advertised = bracketed ? bracketed.to_i : 1
	  span       = range.end - range.begin + 1
	  encoded    = codes.map { |code| count_codepoints(code) }.inject(0, :+)

	  encoded == advertised && encoded == span
	end

	##
	# Writes one Ragel machine definition to @output.
	#
	# name     - identifier emitted on the left of " = ".
	# property - chart property forwarded to each_alpha (read from @chart_url).
	#
	# Every range yielded by each_alpha is encoded according to @encoding and
	# written as one alternative per code string.  The first alternative of
	# the machine is introduced by a space and all later ones by "|".  Only
	# the first line of each range carries the description comment.
	# (Block parameters restored from the escaped "\|" form, which is not
	# valid Ruby; the emitted strings are kept exactly as written here.)

	def generate_machine( name, property )
	  lead = " "
	  @output.puts " #{name} = "
	  each_alpha( @chart_url, property ) do |range, desc|
	    codes = @encoding == :ucs4 ? to_ucs4(range) : to_utf8(range)

	    #raise "Invalid encoding of range #{range}: #{codes.inspect}" unless
	    # is_valid? range, desc, codes

	    # The code column is as wide as the longest code, never narrower
	    # than RANGE_WIDTH; the description gets whatever room is left.
	    widest     = codes.map(&:size).max
	    code_width = widest < RANGE_WIDTH ? RANGE_WIDTH : widest
	    desc_width = TOTAL_WIDTH - code_width - 11
	    label      = desc.size > desc_width ? desc[0..desc_width - 4] + "..." : desc

	    codes.each_with_index do |code, idx|
	      note = idx.zero? ? label : ""
	      @output.puts " #{lead} #{code.ljust(code_width)} ##{note}"
	      lead = "|"
	    end
	  end
	  @output.puts " ;"
	  @output.puts ""
	end

	# Write the generated file to @output in three steps.  First the banner
	# (generator name, chart URL, the requested properties, the alphtype
	# matching @encoding) and the opening of the Ragel block with the
	# machine name:
	@output.puts <<EOF
	# The following Ragel file was autogenerated with #{$0}
	# from: #{@chart_url}
	#
	# It defines #{properties}.
	#
	# To use this, make sure that your alphtype is set to #{ALPHTYPES[@encoding]},
	# and that your input is in #{@encoding}.

	%%{
	machine #{machine_name};

	EOF

	# Then one machine definition per requested property; the property name
	# doubles as the machine definition's name.  (Block parameter restored
	# from the escaped "\|x\|" form, which is not valid Ruby.)
	properties.each { |x| generate_machine( x, x ) }

	# Finally close the Ragel block.
	@output.puts <<EOF
	}%%
	EOF