google3/third_party/grte/v5_src/glibc-2.27/localedata/unicode-gen/utf8_compatibility.py - GRTEv5 - Git at Google

 #!/usr/bin/python3
 # -*- coding: utf-8 -*-
 # Copyright (C) 2014-2018 Free Software Foundation, Inc.
 # This file is part of the GNU C Library.
 #
 # The GNU C Library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
 # License as published by the Free Software Foundation; either
 # version 2.1 of the License, or (at your option) any later version.
 #
 # The GNU C Library is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 # Lesser General Public License for more details.
 #
 # You should have received a copy of the GNU Lesser General Public
 # License along with the GNU C Library; if not, see
 # <http://www.gnu.org/licenses/>.

 '''
 This script is useful for checking the backward compatibility of a
 newly generated UTF-8 file from the utf8_gen.py script.

 To see how this script is used, call it with the “-h” option:

     $ ./utf8_compatibility.py -h
     … prints usage message …
 '''

 import sys
 import re
 import argparse
 import unicode_utils

 def create_charmap_dictionary(file_name):
     '''Create a dictionary for all code points found in the CHARMAP
     section of a file

     Every line between the “CHARMAP” and “END CHARMAP” markers of the
     form “<Uxxxx> /xNN…” or “<Uxxxx>..<Uyyyy> /xNN…” contributes one
     entry per code point of the (inclusive) range.  Lines starting with
     “%” and lines which do not have this form are skipped.

     Returns a dictionary mapping each integer code point to the “/xNN…”
     byte sequence string exactly as it is written in the file.  If the
     “CHARMAP” or the “END CHARMAP” marker is missing, a message is
     written to stderr and the script exits with status 1.
     '''
     # Built once, outside the loop.  The optional range part is a real
     # non-capturing group “(?:…)” and the end point accepts hex digits
     # only, so a malformed end point makes the line be skipped instead
     # of reaching int() and raising ValueError.
     entry_pattern = re.compile(
         r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
         r'(?:\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
         r'\s+(?P<hexutf8>(/x[0-9a-f]{2}){1,4})')
     with open(file_name, mode='r') as utf8_file:
         charmap_dictionary = {}
         # Skip everything up to and including the “CHARMAP” line.
         for line in utf8_file:
             if line.startswith('CHARMAP'):
                 break
         # Continue on the same iterator, i.e. right after “CHARMAP”.
         for line in utf8_file:
             if line.startswith('END CHARMAP'):
                 return charmap_dictionary
             if line.startswith('%'):
                 continue
             match = entry_pattern.match(line)
             if not match:
                 continue
             first = match.group('codepoint1')
             # A single code point is treated as a range of length one.
             last = match.group('codepoint2') or first
             hexutf8 = match.group('hexutf8')
             for code_point in range(int(first, 16), int(last, 16) + 1):
                 charmap_dictionary[code_point] = hexutf8
     # Only reached when a marker is missing.
     sys.stderr.write('No “CHARMAP” or no “END CHARMAP” found in %s\n'
                      %file_name)
     # sys.exit() instead of exit(): the latter is only injected by the
     # site module and is not guaranteed to exist.
     sys.exit(1)

 def check_charmap(original_file_name, new_file_name):
     '''Print a report on how the CHARMAP section of the new file differs
     from the one of the old file

     Three totals are always printed: code points removed, code points
     whose byte sequence changed, and code points added.  The individual
     code points are listed only when the matching ARGS.show_… option is
     set.
     '''
     rule = '------------------------------------------------------------'

     def name_of(code_point):
         '''Character name from UNICODE_ATTRIBUTES, or 'None' if unknown'''
         attributes = unicode_utils.UNICODE_ATTRIBUTES
         if code_point in attributes:
             return attributes[code_point]['name']
         return 'None'

     print('************************************************************')
     print('Report on CHARMAP:')
     old_map = create_charmap_dictionary(original_file_name)
     new_map = create_charmap_dictionary(new_file_name)

     print(rule)
     removed = sorted(set(old_map) - set(new_map))
     print('Total removed characters in newly generated CHARMAP: %d'
           % len(removed))
     if ARGS.show_missing_characters:
         for code_point in removed:
             print('removed: {:s}     {:s} {:s}'.format(
                 unicode_utils.ucs_symbol(code_point),
                 old_map[code_point],
                 name_of(code_point)))

     print(rule)
     # Code points present in both files whose byte sequence differs.
     changed = sorted(
         code_point for code_point in set(old_map) & set(new_map)
         if old_map[code_point] != new_map[code_point])
     print('Total changed characters in newly generated CHARMAP: %d'
           % len(changed))
     if ARGS.show_changed_characters:
         for code_point in changed:
             print('changed: {:s}     {:s}->{:s} {:s}'.format(
                 unicode_utils.ucs_symbol(code_point),
                 old_map[code_point],
                 new_map[code_point],
                 name_of(code_point)))

     print(rule)
     added = sorted(set(new_map) - set(old_map))
     print('Total added characters in newly generated CHARMAP: %d'
           % len(added))
     if ARGS.show_added_characters:
         for code_point in added:
             print('added: {:s}     {:s} {:s}'.format(
                 unicode_utils.ucs_symbol(code_point),
                 new_map[code_point],
                 name_of(code_point)))

 def create_width_dictionary(file_name):
     '''Create a dictionary for all code points found in the WIDTH
     section of a file

     Every line between the “WIDTH” and “END WIDTH” markers of the form
     “<Uxxxx> W” or “<Uxxxx>...<Uyyyy> W”, where W is 0 or 2, contributes
     one entry per code point of the (inclusive) range.  All other lines
     are skipped.

     Returns a dictionary mapping each integer code point to its integer
     width.  If the “WIDTH” or the “END WIDTH” marker is missing, a
     message is written to stderr and the script exits with status 1,
     like create_charmap_dictionary() does for CHARMAP.
     '''
     # Built once, outside the loop.  The optional range part is a real
     # non-capturing group “(?:…)” and the end point accepts hex digits
     # only, so a malformed end point makes the line be skipped instead
     # of reaching int() and raising ValueError.
     entry_pattern = re.compile(
         r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
         r'(?:\.\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
         r'\s+(?P<width>[02])')
     with open(file_name, mode='r') as utf8_file:
         width_dictionary = {}
         # Skip everything up to and including the “WIDTH” line.
         for line in utf8_file:
             if line.startswith('WIDTH'):
                 break
         # Continue on the same iterator, i.e. right after “WIDTH”.
         for line in utf8_file:
             if line.startswith('END WIDTH'):
                 return width_dictionary
             match = entry_pattern.match(line)
             if not match:
                 continue
             first = match.group('codepoint1')
             # A single code point is treated as a range of length one.
             last = match.group('codepoint2') or first
             width = int(match.group('width'))
             for code_point in range(int(first, 16), int(last, 16) + 1):
                 width_dictionary[code_point] = width
     # Only reached when a marker is missing.  The message used to be
     # formatted with the undefined name “file”, which raised NameError.
     sys.stderr.write('No “WIDTH” or no “END WIDTH” found in %s\n'
                      %file_name)
     # Do not fall through and return None: the caller needs a dictionary.
     sys.exit(1)

 def check_width(original_file_name, new_file_name):
     '''Print a report on how the WIDTH section of the new file differs
     from the one of the old file

     Three totals are always printed: code points removed, code points
     whose width changed, and code points added.  The individual code
     points are listed only when the matching ARGS.show_… option is set.
     '''
     rule = '------------------------------------------------------------'

     def describe(code_point):
         '''Return the “eaw=… category=… bidi=… name=…” tail of a detail
         line; every value which cannot be looked up is shown as 'None'
         '''
         widths = unicode_utils.EAST_ASIAN_WIDTHS
         attributes = unicode_utils.UNICODE_ATTRIBUTES
         return (
             'eaw={:s} '.format(
                 widths[code_point] if code_point in widths else 'None')
             + 'category={:2s} '.format(
                 attributes[code_point]['category']
                 if code_point in attributes else 'None')
             + 'bidi={:3s} '.format(
                 attributes[code_point]['bidi']
                 if code_point in attributes else 'None')
             + 'name={:s}'.format(
                 attributes[code_point]['name']
                 if code_point in attributes else 'None'))

     print('************************************************************')
     print('Report on WIDTH:')
     old_widths = create_width_dictionary(original_file_name)
     new_widths = create_width_dictionary(new_file_name)

     print(rule)
     removed = sorted(set(old_widths) - set(new_widths))
     print('Total removed characters in newly generated WIDTH: %d'
           % len(removed))
     print('(Characters not in WIDTH get width 1 by default, '
           + 'i.e. these have width 1 now.)')
     if ARGS.show_missing_characters:
         for code_point in removed:
             print('removed: {:s} '.format(
                 unicode_utils.ucs_symbol(code_point))
                   + '{:d} : '.format(old_widths[code_point])
                   + describe(code_point))

     print(rule)
     # Code points present in both files whose width differs.
     changed = sorted(
         code_point for code_point in set(old_widths) & set(new_widths)
         if old_widths[code_point] != new_widths[code_point])
     print('Total changed characters in newly generated WIDTH: %d'
           % len(changed))
     if ARGS.show_changed_characters:
         for code_point in changed:
             print('changed width: {:s} '.format(
                 unicode_utils.ucs_symbol(code_point))
                   + '{:d}->{:d} : '.format(old_widths[code_point],
                                           new_widths[code_point])
                   + describe(code_point))

     print(rule)
     added = sorted(set(new_widths) - set(old_widths))
     print('Total added characters in newly generated WIDTH: %d'
           % len(added))
     print('(Characters not in WIDTH get width 1 by default, '
           + 'i.e. these had width 1 before.)')
     if ARGS.show_added_characters:
         for code_point in added:
             print('added: {:s} '.format(
                 unicode_utils.ucs_symbol(code_point))
                   + '{:d} : '.format(new_widths[code_point])
                   + describe(code_point))

 if __name__ == "__main__":
     PARSER = argparse.ArgumentParser(
         description='''
         Compare the contents of LC_CTYPE in two files and check for errors.
         ''')
     # Options taking a file name: (short, long, required, help text).
     # They are registered in the order in which “-h” lists them.
     for _short, _long, _required, _help in (
             ('-o', '--old_utf8_file', True,
              'The old UTF-8 file.'),
             ('-n', '--new_utf8_file', True,
              'The new UTF-8 file.'),
             ('-u', '--unicode_data_file', False,
              'The UnicodeData.txt file to read.'),
             ('-e', '--east_asian_width_file', False,
              'The EastAsianWidth.txt file to read.')):
         PARSER.add_argument(
             _short, _long,
             nargs='?',
             required=_required,
             type=str,
             help=_help)
     # Boolean switches selecting which detail listings are printed.
     for _short, _long, _help in (
             ('-a', '--show_added_characters',
              'Show characters which were added in detail.'),
             ('-m', '--show_missing_characters',
              'Show characters which were removed in detail.'),
             ('-c', '--show_changed_characters',
              'Show characters whose width was changed in detail.')):
         PARSER.add_argument(
             _short, _long,
             action='store_true',
             help=_help)
     # ARGS is read as a module level global by the check_…() functions.
     ARGS = PARSER.parse_args()

     # The Unicode data files are optional; they are only loaded when
     # the corresponding option was given.
     if ARGS.unicode_data_file:
         unicode_utils.fill_attributes(ARGS.unicode_data_file)
     if ARGS.east_asian_width_file:
         unicode_utils.fill_east_asian_widths(ARGS.east_asian_width_file)
     check_charmap(ARGS.old_utf8_file, ARGS.new_utf8_file)
     check_width(ARGS.old_utf8_file, ARGS.new_utf8_file)
	#!/usr/bin/python3
	# -*- coding: utf-8 -*-
	# Copyright (C) 2014-2018 Free Software Foundation, Inc.
	# This file is part of the GNU C Library.
	#
	# The GNU C Library is free software; you can redistribute it and/or
	# modify it under the terms of the GNU Lesser General Public
	# License as published by the Free Software Foundation; either
	# version 2.1 of the License, or (at your option) any later version.
	#
	# The GNU C Library is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	# Lesser General Public License for more details.
	#
	# You should have received a copy of the GNU Lesser General Public
	# License along with the GNU C Library; if not, see
	# <http://www.gnu.org/licenses/>.

	'''
	This script is useful for checking the backward compatibility of a
	newly generated UTF-8 file from the utf8_gen.py script.

	To see how this script is used, call it with the “-h” option:

	$ ./utf8_compatibility.py -h
	… prints usage message …
	'''

	import sys
	import re
	import argparse
	import unicode_utils

	def create_charmap_dictionary(file_name):
	    '''Create a dictionary for all code points found in the CHARMAP
	    section of a file

	    Every line between the “CHARMAP” and “END CHARMAP” markers of the
	    form “<Uxxxx> /xNN…” or “<Uxxxx>..<Uyyyy> /xNN…” contributes one
	    entry per code point of the (inclusive) range.  Lines starting with
	    “%” and lines which do not have this form are skipped.

	    Returns a dictionary mapping each integer code point to the “/xNN…”
	    byte sequence string exactly as it is written in the file.  If the
	    “CHARMAP” or the “END CHARMAP” marker is missing, a message is
	    written to stderr and the script exits with status 1.
	    '''
	    # Built once, outside the loop.  The optional range part is a real
	    # non-capturing group “(?:…)” and the end point accepts hex digits
	    # only, so a malformed end point makes the line be skipped instead
	    # of reaching int() and raising ValueError.
	    entry_pattern = re.compile(
	        r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
	        r'(?:\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
	        r'\s+(?P<hexutf8>(/x[0-9a-f]{2}){1,4})')
	    with open(file_name, mode='r') as utf8_file:
	        charmap_dictionary = {}
	        # Skip everything up to and including the “CHARMAP” line.
	        for line in utf8_file:
	            if line.startswith('CHARMAP'):
	                break
	        # Continue on the same iterator, i.e. right after “CHARMAP”.
	        for line in utf8_file:
	            if line.startswith('END CHARMAP'):
	                return charmap_dictionary
	            if line.startswith('%'):
	                continue
	            match = entry_pattern.match(line)
	            if not match:
	                continue
	            first = match.group('codepoint1')
	            # A single code point is treated as a range of length one.
	            last = match.group('codepoint2') or first
	            hexutf8 = match.group('hexutf8')
	            for code_point in range(int(first, 16), int(last, 16) + 1):
	                charmap_dictionary[code_point] = hexutf8
	    # Only reached when a marker is missing.
	    sys.stderr.write('No “CHARMAP” or no “END CHARMAP” found in %s\n'
	                     %file_name)
	    # sys.exit() instead of exit(): the latter is only injected by the
	    # site module and is not guaranteed to exist.
	    sys.exit(1)

	def check_charmap(original_file_name, new_file_name):
	    '''Report differences in the CHARMAP section between the old and the
	    new file

	    Three totals are always printed: code points removed, code points
	    whose byte sequence changed, and code points added.  The individual
	    code points are listed only when the matching ARGS.show_… option is
	    set.
	    '''
	    rule = '------------------------------------------------------------'

	    def name_of(code_point):
	        '''Character name from UNICODE_ATTRIBUTES, or 'None' if unknown'''
	        attributes = unicode_utils.UNICODE_ATTRIBUTES
	        if code_point in attributes:
	            return attributes[code_point]['name']
	        return 'None'

	    print('************************************************************')
	    print('Report on CHARMAP:')
	    ocharmap = create_charmap_dictionary(original_file_name)
	    ncharmap = create_charmap_dictionary(new_file_name)

	    print(rule)
	    # Each difference is computed once and used for count and listing.
	    removed = sorted(set(ocharmap) - set(ncharmap))
	    print('Total removed characters in newly generated CHARMAP: %d'
	          % len(removed))
	    if ARGS.show_missing_characters:
	        for key in removed:
	            print('removed: {:s} {:s} {:s}'.format(
	                unicode_utils.ucs_symbol(key),
	                ocharmap[key],
	                name_of(key)))

	    print(rule)
	    # Code points present in both files whose byte sequence differs.
	    changed = sorted(
	        key for key in set(ocharmap) & set(ncharmap)
	        if ocharmap[key] != ncharmap[key])
	    print('Total changed characters in newly generated CHARMAP: %d'
	          % len(changed))
	    if ARGS.show_changed_characters:
	        for key in changed:
	            print('changed: {:s} {:s}->{:s} {:s}'.format(
	                unicode_utils.ucs_symbol(key),
	                ocharmap[key],
	                ncharmap[key],
	                name_of(key)))

	    print(rule)
	    added = sorted(set(ncharmap) - set(ocharmap))
	    print('Total added characters in newly generated CHARMAP: %d'
	          % len(added))
	    if ARGS.show_added_characters:
	        for key in added:
	            print('added: {:s} {:s} {:s}'.format(
	                unicode_utils.ucs_symbol(key),
	                ncharmap[key],
	                name_of(key)))

	def create_width_dictionary(file_name):
	    '''Create a dictionary for all code points found in the WIDTH
	    section of a file

	    Every line between the “WIDTH” and “END WIDTH” markers of the form
	    “<Uxxxx> W” or “<Uxxxx>...<Uyyyy> W”, where W is 0 or 2, contributes
	    one entry per code point of the (inclusive) range.  All other lines
	    are skipped.

	    Returns a dictionary mapping each integer code point to its integer
	    width.  If the “WIDTH” or the “END WIDTH” marker is missing, a
	    message is written to stderr and the script exits with status 1,
	    like create_charmap_dictionary() does for CHARMAP.
	    '''
	    # Built once, outside the loop.  The optional range part is a real
	    # non-capturing group “(?:…)” and the end point accepts hex digits
	    # only, so a malformed end point makes the line be skipped instead
	    # of reaching int() and raising ValueError.
	    entry_pattern = re.compile(
	        r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
	        r'(?:\.\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
	        r'\s+(?P<width>[02])')
	    with open(file_name, mode='r') as utf8_file:
	        width_dictionary = {}
	        # Skip everything up to and including the “WIDTH” line.
	        for line in utf8_file:
	            if line.startswith('WIDTH'):
	                break
	        # Continue on the same iterator, i.e. right after “WIDTH”.
	        for line in utf8_file:
	            if line.startswith('END WIDTH'):
	                return width_dictionary
	            match = entry_pattern.match(line)
	            if not match:
	                continue
	            first = match.group('codepoint1')
	            # A single code point is treated as a range of length one.
	            last = match.group('codepoint2') or first
	            width = int(match.group('width'))
	            for code_point in range(int(first, 16), int(last, 16) + 1):
	                width_dictionary[code_point] = width
	    # Only reached when a marker is missing.  The message used to be
	    # formatted with the undefined name “file”, which raised NameError.
	    sys.stderr.write('No “WIDTH” or no “END WIDTH” found in %s\n'
	                     %file_name)
	    # Do not fall through and return None: the caller needs a dictionary.
	    sys.exit(1)

	def check_width(original_file_name, new_file_name):
	    '''Report differences in the WIDTH section between the old and the new
	    file

	    Three totals are always printed: code points removed, code points
	    whose width changed, and code points added.  The individual code
	    points are listed only when the matching ARGS.show_… option is set.
	    '''
	    rule = '------------------------------------------------------------'

	    def describe(key):
	        '''Return the “eaw=… category=… bidi=… name=…” tail of a detail
	        line; every value which cannot be looked up is shown as 'None'
	        '''
	        widths = unicode_utils.EAST_ASIAN_WIDTHS
	        attributes = unicode_utils.UNICODE_ATTRIBUTES
	        return (
	            'eaw={:s} '.format(
	                widths[key] if key in widths else 'None')
	            + 'category={:2s} '.format(
	                attributes[key]['category']
	                if key in attributes else 'None')
	            + 'bidi={:3s} '.format(
	                attributes[key]['bidi']
	                if key in attributes else 'None')
	            + 'name={:s}'.format(
	                attributes[key]['name']
	                if key in attributes else 'None'))

	    print('************************************************************')
	    print('Report on WIDTH:')
	    owidth = create_width_dictionary(original_file_name)
	    nwidth = create_width_dictionary(new_file_name)

	    print(rule)
	    # Each difference is computed once and used for count and listing.
	    removed = sorted(set(owidth) - set(nwidth))
	    print('Total removed characters in newly generated WIDTH: %d'
	          % len(removed))
	    print('(Characters not in WIDTH get width 1 by default, '
	          + 'i.e. these have width 1 now.)')
	    if ARGS.show_missing_characters:
	        for key in removed:
	            print('removed: {:s} '.format(unicode_utils.ucs_symbol(key))
	                  + '{:d} : '.format(owidth[key])
	                  + describe(key))

	    print(rule)
	    # Code points present in both files whose width differs.
	    changed = sorted(
	        key for key in set(owidth) & set(nwidth)
	        if owidth[key] != nwidth[key])
	    print('Total changed characters in newly generated WIDTH: %d'
	          % len(changed))
	    if ARGS.show_changed_characters:
	        for key in changed:
	            print('changed width: {:s} '.format(
	                unicode_utils.ucs_symbol(key))
	                  + '{:d}->{:d} : '.format(owidth[key], nwidth[key])
	                  + describe(key))

	    print(rule)
	    added = sorted(set(nwidth) - set(owidth))
	    print('Total added characters in newly generated WIDTH: %d'
	          % len(added))
	    print('(Characters not in WIDTH get width 1 by default, '
	          + 'i.e. these had width 1 before.)')
	    if ARGS.show_added_characters:
	        for key in added:
	            print('added: {:s} '.format(unicode_utils.ucs_symbol(key))
	                  + '{:d} : '.format(nwidth[key])
	                  + describe(key))

	if __name__ == "__main__":
	    PARSER = argparse.ArgumentParser(
	        description='''
	Compare the contents of LC_CTYPE in two files and check for errors.
	''')
	    # The two files to compare; both options are required.
	    PARSER.add_argument(
	        '-o', '--old_utf8_file',
	        nargs='?',
	        required=True,
	        type=str,
	        help='The old UTF-8 file.')
	    PARSER.add_argument(
	        '-n', '--new_utf8_file',
	        nargs='?',
	        required=True,
	        type=str,
	        help='The new UTF-8 file.')
	    # Optional Unicode data files, loaded below only when given.
	    PARSER.add_argument(
	        '-u', '--unicode_data_file',
	        nargs='?',
	        type=str,
	        help='The UnicodeData.txt file to read.')
	    PARSER.add_argument(
	        '-e', '--east_asian_width_file',
	        nargs='?',
	        type=str,
	        help='The EastAsianWidth.txt file to read.')
	    # Boolean switches selecting which detail listings are printed.
	    PARSER.add_argument(
	        '-a', '--show_added_characters',
	        action='store_true',
	        help='Show characters which were added in detail.')
	    PARSER.add_argument(
	        '-m', '--show_missing_characters',
	        action='store_true',
	        help='Show characters which were removed in detail.')
	    PARSER.add_argument(
	        '-c', '--show_changed_characters',
	        action='store_true',
	        help='Show characters whose width was changed in detail.')
	    # ARGS is read as a module level global by the check_…() functions.
	    ARGS = PARSER.parse_args()

	    if ARGS.unicode_data_file:
	        unicode_utils.fill_attributes(ARGS.unicode_data_file)
	    if ARGS.east_asian_width_file:
	        unicode_utils.fill_east_asian_widths(ARGS.east_asian_width_file)
	    check_charmap(ARGS.old_utf8_file, ARGS.new_utf8_file)
	    check_width(ARGS.old_utf8_file, ARGS.new_utf8_file)