| #!/usr/bin/perl |
| # Uniset -- Unicode subset manager -- Markus Kuhn |
| # http://www.cl.cam.ac.uk/~mgk25/download/uniset.tar.gz |
| |
| # slightly modified for R to produce single-column output in |
| # the style previously used (l/c a-f, no leading zeroes). |
| |
| require 5.014; |
| use open ':utf8'; |
| use FindBin qw($RealBin); # to find directory where this file is located |
| |
| binmode(STDOUT, ":utf8"); |
| binmode(STDIN, ":utf8"); |
| |
| my (%name, %invname, %category, %comment); |
| |
| print <<End if $#ARGV < 0; |
| Uniset -- Unicode subset manager -- Markus Kuhn |
| |
| Uniset merges and subtracts Unicode subsets. It can output and |
| analyse the resulting character set in various formats. |
| |
| Uniset understand the following command-line arguments: |
| |
| Commands to define a set of characters: |
| |
| + filename add the character set described in the file to the set |
| - filename remove the character set described in the file from the set |
| +: filename add the characters in the UTF-8 file to the set |
| -: filename remove the characters in the UTF-8 file from the set |
| +xxxx..yyyy add the range to the set (xxxx and yyyy are hex numbers) |
| -xxxx..yyyy remove the range from the set (xxxx and yyyy are hex numbers) |
| +cat=Xx add all Unicode characters with category code Xx |
| -cat=Xx remove all Unicode characters with category code Xx |
| -cat!=Xx remove all Unicode characters without category code Xx |
| clean remove any elements that do not appear in the Unicode database |
| unknown remove any elements that do appear in the Unicode database |
| |
| Command to output descriptions of the constructed set of characters: |
| |
| table write a full table with one line per character |
| compact output the set in compact MES format |
| c output the set as C interval array |
| nr output the number of characters |
| sources output a table that shows the number of characters contributed |
| by the various combinations of input sets added with +. |
| utf8-list output a list of all characters encoded in UTF-8 |
| |
| Commands to tailor the following output commands: |
| |
| html write HTML tables instead of plain text |
| ucs add the unicode character itself to the table (UTF-8 in |
| plain table, numeric character reference in HTML) |
| |
| Formats of character set input files read by the + and - command: |
| |
| Empty lines, white space at the start and end of the line and any |
| comment text following a \# are ignored. The following formats are |
| recognized |
| |
| xx yyyy xx is the hex code in an 8-bit character set and yyyy |
| is the corresponding Unicode value. Both can optionally |
| be prefixed by 0x. This is the format used in the |
| files on <ftp://ftp.unicode.org/Public/MAPPINGS/>. |
| |
| yyyy yyyy (optionally prefixed with 0x) is a Unicode character |
| belonging to the specified subset. |
| |
| yyyy-yyyy a range of Unicode characters belonging to |
| yyyy..yyyy the specified subset. |
| |
| xx yy yy yy-yy yy xx denotes a row (high-byte) and the yy specify |
| corresponding low bytes or with a hyphen also ranges of |
| low bytes in the Unicode values that belong to this |
| subset. This is also the format that is generated by |
| the compact command. |
| End |
| exit 1 if $#ARGV < 0; |
| |
| |
| # Subroutine to identify whether the ISO 10646/Unicode character code |
| # ucs belongs into the East Asian Wide (W) or East Asian FullWidth |
| # (F) category as defined in Unicode Technical Report #11. |
| |
| sub iswide ($) { |
| my $ucs = shift(@_); |
| |
| return ($ucs >= 0x1100 && |
| ($ucs <= 0x115f || # Hangul Jamo |
| $ucs == 0x2329 || $ucs == 0x232a || |
| ($ucs >= 0x2e80 && $ucs <= 0xa4cf && |
| $ucs != 0x303f) || # CJK .. Yi |
| ($ucs >= 0xac00 && $ucs <= 0xd7a3) || # Hangul Syllables |
| ($ucs >= 0xf900 && $ucs <= 0xfaff) || # CJK Comp. Ideographs |
| ($ucs >= 0xfe30 && $ucs <= 0xfe6f) || # CJK Comp. Forms |
| ($ucs >= 0xff00 && $ucs <= 0xff60) || # Fullwidth Forms |
| ($ucs >= 0xffe0 && $ucs <= 0xffe6) || |
| ($ucs >= 0x20000 && $ucs <= 0x2fffd) || |
| ($ucs >= 0x30000 && $ucs <= 0x3fffd))); |
| } |
| |
| # Return the Unicode name that belongs to a given character code |
| |
| # Jamo short names, see Unicode 3.0, table 4-4, page 86 |
| |
| my @lname = ('G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', |
| 'J', 'JJ', 'C', 'K', 'T', 'P', 'H'); # 1100..1112 |
| my @vname = ('A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', |
| 'WA', 'WAE', 'OE', 'YO', 'U', 'WEO', 'WE', 'WI', 'YU', |
| 'EU', 'YI', 'I'); # 1161..1175 |
| my @tname = ('G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM', |
| 'LB', 'LS', 'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', |
| 'NG', 'J', 'C', 'K', 'T', 'P', 'H'); # 11a8..11c2 |
| |
| sub name { |
| my $ucs = shift(@_); |
| |
| # The intervals used here reflect Unicode Version 3.2 |
| if (($ucs >= 0x3400 && $ucs <= 0x4db5) || |
| ($ucs >= 0x4e00 && $ucs <= 0x9fa5) || |
| ($ucs >= 0x20000 && $ucs <= 0x2a6d6)) { |
| return "CJK UNIFIED IDEOGRAPH-" . sprintf("%04X", $ucs); |
| } |
| |
| if ($ucs >= 0xac00 && $ucs <= 0xd7a3) { |
| my $s = $ucs - 0xac00; |
| my $l = 0x1100 + int($s / (21 * 28)); |
| my $v = 0x1161 + int(($s % (21 * 28)) / 28); |
| my $t = 0x11a7 + $s % 28; |
| return "HANGUL SYLLABLE " . |
| ($lname[int($s / (21 * 28))] . |
| $vname[int(($s % (21 * 28)) / 28)] . |
| $tname[$s % 28 - 1]); |
| } |
| |
| return $name{$ucs}; |
| } |
| |
| sub is_unicode { |
| my $ucs = shift(@_); |
| |
| # The intervals used here reflect Unicode Version 3.2 |
| if (($ucs >= 0x3400 && $ucs <= 0x4db5) || |
| ($ucs >= 0x4e00 && $ucs <= 0x9fa5) || |
| ($ucs >= 0xac00 && $ucs <= 0xd7a3) || |
| ($ucs >= 0x20000 && $ucs <= 0x2a6d6)) { |
| return 1; |
| } |
| |
| return exists $name{$ucs}; |
| } |
| |
| my @search_path = (); |
| if ($RealBin =~ m|^(.*)/bin\z| && -d "$1/share/uniset") { |
| push @search_path, "$1/share/uniset"; |
| } else { |
| push @search_path, $RealBin; |
| } |
| |
| sub search_open { |
| my ($mode, $fn) = @_; |
| my $file; |
| return $file if open($file, $mode, $fn); |
| return undef if $fn =~ m|/|; |
| for my $path (@search_path) { |
| return $file if open($file, $mode, "$path/$fn"); |
| } |
| return undef; |
| } |
| |
| my $html = 0; |
| my $image = 0; |
| my $adducs = 0; |
| my $unicodedata = "UnicodeData.txt"; |
| my $blockdata = "Blocks.txt"; |
| |
| # read list of all Unicode names |
| my $data = search_open('<', $unicodedata); |
| unless ($data) { |
| die ("Can't open Unicode database '$unicodedata':\n$!\n\n" . |
| "Please make sure that you have downloaded the file\n" . |
| "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n"); |
| } |
| while (<$data>) { |
| if (/^([0-9,A-F]{4,8});([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*)$/) { |
| next if $2 ne '<control>' && substr($2, 0, 1) eq '<'; |
| $ucs = hex($1); |
| $name{$ucs} = $2; |
| $invname{$2} = $ucs; |
| $category{$ucs} = $3; |
| $comment{$ucs} = $12; |
| } else { |
| die("Syntax error in line '$_' in file '$unicodedata'\n"); |
| } |
| } |
| close($data); |
| |
| # read list of all Unicode blocks |
| $data = search_open('<', $blockdata); |
| unless ($data) { |
| die ("Can't open Unicode blockname list '$blockdata':\n$!\n\n" . |
| "Please make sure that you have downloaded the file\n" . |
| "http://www.unicode.org/Public/UNIDATA/Blocks.txt\n"); |
| } |
| my $blocks = 0; |
| my (@blockstart, @blockend, @blockname); |
| while (<$data>) { |
| if (/^\s*([0-9,A-F]{4,8})\s*\.\.\s*([0-9,A-F]{4,8})\s*;\s*(.*)$/) { |
| $blockstart[$blocks] = hex($1); |
| $blockend [$blocks] = hex($2); |
| $blockname [$blocks] = $3; |
| $blocks++; |
| } elsif (/^\s*\#/ || /^\s*$/) { |
| # ignore comments and empty lines |
| } else { |
| die("Syntax error in line '$_' in file '$blockdata'\n"); |
| } |
| } |
| close($data); |
| if ($blockend[$blocks-1] < 0x110000) { |
| $blockstart[$blocks] = 0x110000; |
| $blockend [$blocks] = 0x7FFFFFFF; |
| $blockname [$blocks] = "Beyond Plane 16"; |
| $blocks++; |
| } |
| |
| # process command line arguments |
| while ($_ = shift(@ARGV)) { |
| if (/^html$/) { |
| $html = 1; |
| } elsif (/^ucs$/) { |
| $adducs = 1; |
| } elsif (/^img$/) { |
| $html = 1; |
| $image = 1; |
| } elsif (/^template$/) { |
| $template = shift(@ARGV); |
| open(TEMPLATE, $template) || die("Can't open template file '$template': $!\n"); |
| while (<TEMPLATE>) { |
| if (/^\#\s*include\s+\"([^\"]*)\"\s*$/) { |
| open(INCLUDE, $1) || die("Can't open template include file '$1': $!\n"); |
| while (<INCLUDE>) { |
| print $_; |
| } |
| close(INCLUDE); |
| } elsif (/^\#\s*quote\s+\"([^\"]*)\"\s*$/) { |
| open(INCLUDE, $1) || die("Can't open template include file '$1': $!\n"); |
| while (<INCLUDE>) { |
| s/&/&/g; |
| s/</</g; |
| print $_; |
| } |
| close(INCLUDE); |
| } else { |
| print $_; |
| } |
| } |
| close(TEMPLATE); |
| } elsif (/^\+cat=(.+)$/) { |
| # add characters with given category |
| $cat = $1; |
| for $i (keys(%category)) { |
| $used{$i} = "[${cat}]" if $category{$i} eq $cat; |
| } |
| } elsif (/^\-cat=(.+)$/) { |
| # remove characters with given category |
| $cat = $1; |
| for $i (keys(%category)) { |
| delete $used{$i} if $category{$i} eq $cat; |
| } |
| } elsif (/^\-cat!=(.+)$/) { |
| # remove characters without given category |
| $cat = $1; |
| for $i (keys(%category)) { |
| delete $used{$i} unless $category{$i} eq $cat; |
| } |
| } elsif (/^([+-]):(.*)/) { |
| $remove = $1 eq "-"; |
| $setfile = $2; |
| $setfile = shift(@ARGV) if $setfile eq ""; |
| push(@SETS, $setfile); |
| open(SET, $setfile) || die("Can't open set file '$setfile': $!\n"); |
| $setname = $setfile; |
| while (<SET>) { |
| while ($_) { |
| $i = ord($_); |
| $used{$i} .= "[${setname}]" unless $remove; |
| delete $used{$i} if $remove; |
| $_ = substr($_, 1); |
| } |
| } |
| close SET; |
| } elsif (/^([+-])(.*)/) { |
| $remove = $1 eq "-"; |
| $setfile = $2; |
| $setfile = "$setfile..$setfile" if $setfile =~ /^([0-9A-Fa-f]{4,8})$/; |
| if ($setfile =~ /^([0-9A-Fa-f]{4,8})(-|\.\.)([0-9A-Fa-f]{4,8})$/) { |
| # handle intervall specification on command line |
| $first = hex($1); |
| $last = hex($3); |
| for ($i = $first; $i <= $last; $i++) { |
| $used{$i} .= "[ARG]" unless $remove; |
| delete $used{$i} if $remove; |
| } |
| next; |
| } |
| $setfile = shift(@ARGV) if $setfile eq ""; |
| push(@SETS, $setfile); |
| my $setf = search_open('<', $setfile); |
| die("Can't open set file '$setfile': $!\n") unless $setf; |
| $cedf = ($setfile =~ /cedf/); # detect Kosta Kosti's trans CEDF format by path name |
| $setname = $setfile; |
| $setname =~ s/([^.\[\]]*)\..*/$1/; |
| while (<$setf>) { |
| if (/^<code_set_name>/) { |
| # handle ISO 15897 (POSIX registry) charset mapping format |
| undef $comment_char; |
| undef $escape_char; |
| while (<$setf>) { |
| if ($comment_char && /^$comment_char/) { |
| # remove comments |
| $_ = $`; |
| } |
| next if (/^\032?\s*$/); # skip empty lines |
| if (/^<comment_char> (\S)$/) { |
| $comment_char = $1; |
| } elsif (/^<escape_char> (\S)$/) { |
| $escape_char = $1; |
| } elsif (/^(END )?CHARMAP$/) { |
| #ignore |
| } elsif (/^<.*>\s*\/x([0-9A-F]{2})\s*<U([0-9A-F]{4,8})>/) { |
| $used{hex($2)} .= "[${setname}{$1}]" unless $remove; |
| delete $used{hex($2)} if $remove; |
| } else { |
| die("Syntax error in line $. in file '$setfile':\n'$_'\n"); |
| } |
| } |
| next; |
| } elsif (/^STARTFONT /) { |
| # handle X11 BDF file |
| while (<$setf>) { |
| if (/^ENCODING\s+([0-9]+)/) { |
| $used{$1} .= "[${setname}]" unless $remove; |
| delete $used{$1} if $remove; |
| } |
| } |
| next; |
| } |
| tr/a-z/A-Z/; # make input uppercase |
| if ($cedf) { |
| if ($. > 4) { |
| if (/^([0-9A-F]{2})\t.?\t(.*)$/) { |
| # handle Kosta Kosti's trans CEDF format |
| next if (hex($1) < 32 || (hex($1) > 0x7e && hex($1) < 0xa0)); |
| $ucs = $invname{$2}; |
| die "unknown ISO 10646 name '$2' in '$setfile' line $..\n" if ! $ucs; |
| $used{$ucs} .= "[${setname}{$1}]" unless $remove; |
| delete $used{$ucs} if $remove; |
| } else { |
| die("Syntax error in line $. in CEDF file '$setfile':\n'$_'\n"); |
| } |
| } |
| next; |
| } |
| if (/^\s*(0X|U\+|U-)?([0-9A-F]{2})\s+\#\s*UNDEFINED\s*$/) { |
| # ignore ftp.unicode.org mapping file lines with #UNDEFINED |
| next; |
| } |
| s/^([^\#]*)\#.*$/$1/; # remove comments |
| next if (/^\032?\s*$/); # skip empty lines |
| if (/^\s*(0X)?([0-9A-F-]{2})\s+(0X|U\+|U-)?([0-9A-F]{4,8})\s*$/) { |
| # handle entry from a ftp.unicode.org mapping file |
| $used{hex($4)} .= "[${setname}{$2}]" unless $remove; |
| delete $used{hex($4)} if $remove; |
| } elsif (/^\s*(0X|U\+|U-)?([0-9A-F]{4,8})(\s*-\s*|\s*\.\.\s*|\s+)(0X|U\+|U-)?([0-9A-F]{4,8})\s*$/) { |
| # handle interval specification |
| $first = hex($2); |
| $last = hex($5); |
| for ($i = $first; $i <= $last; $i++) { |
| $used{$i} .= "[${setname}]" unless $remove; |
| delete $used{$i} if $remove; |
| } |
| } elsif (/^\s*([0-9A-F]{2,6})(\s+[0-9A-F]{2},?|\s+[0-9A-F]{2}-[0-9A-F]{2},?)+/) { |
| # handle lines from P10 MES draft |
| $row = $1; |
| $cols = $_; |
| $cols =~ s/^\s*([0-9A-F]{2,6})\s*(.*)\s*$/$2/; |
| $cols =~ tr/,//d; |
| @cols = split(/\s+/, $cols); |
| for (@cols) { |
| if (/^(..)$/) { |
| $first = hex("$row$1"); |
| $last = $first; |
| } elsif (/^(..)-(..)$/) { |
| $first = hex("$row$1"); |
| $last = hex("$row$2"); |
| } else { |
| die ("this should never happen '$_'"); |
| } |
| for ($i = $first; $i <= $last; $i++) { |
| $used{$i} .= "[${setname}]" unless $remove; |
| delete $used{$i} if $remove; |
| } |
| } |
| } elsif (/^\s*(0X|U\+|U-)?([0-9A-F]{4,8})\s*/) { |
| # handle single character |
| $used{hex($2)} .= "[${setname}]" unless $remove; |
| delete $used{hex($2)} if $remove; |
| } else { |
| die("Syntax error in line $. in file '$setfile':\n'$_'\n") unless /^\s*(\#.*)?$/; |
| } |
| } |
| close $setf; |
| } elsif (/^loadimages$/ || /^loadbigimages$/) { |
| if (/^loadimages$/) { |
| $prefix = "Small.Glyphs"; |
| } else { |
| $prefix = "Glyphs"; |
| } |
| $total = 0; |
| for $i (keys(%used)) { |
| next if ($name{$i} eq "<control>"); |
| $total++; |
| } |
| $count = 0; |
| $| = 1; |
| for $i (sort({$a <=> $b} keys(%used))) { |
| next if ($name{$i} eq "<control>"); |
| $count++; |
| $j = sprintf("%04X", $i); |
| $j =~ /(..)(..)/; |
| $gif = "http://charts.unicode.org/Unicode.charts/$prefix/$1/U$j.gif"; |
| print("\r$count/$total: $gif"); |
| system("mkdir -p $prefix/$1; cd $prefix/$1; webcopy -u -s $gif &"); |
| select(undef, undef, undef, 0.2); |
| } |
| print("\n"); |
| exit 0; |
| } elsif (/^giftable/) { |
| # form a table of glyphs (requires pbmtools installed) |
| $count = 0; |
| for $i (keys(%used)) { |
| $count++ unless $name{$i} eq "<control>"; |
| } |
| $width = int(sqrt($count/sqrt(2)) + 0.5); |
| $width = $1 if /^giftable([0-9]+)$/; |
| system("rm -f tmp-*.pnm table.pnm~ table.pnm"); |
| $col = 0; |
| $row = 0; |
| for $i (sort({$a <=> $b} keys(%used))) { |
| next if ($name{$i} eq "<control>"); |
| $j = sprintf("%04X", $i); |
| $j =~ /(..)(..)/; |
| $gif = "Small.Glyphs/$1/U$j.gif"; |
| $pnm = sprintf("tmp-%02x.pnm", $col); |
| $fallback = "Small.Glyphs/FF/UFFFD.gif"; |
| system("giftopnm $gif >$pnm || { rm $pnm ; giftopnm $fallback >$pnm ; }"); |
| if (++$col == $width) { |
| system("pnmcat -lr tmp-*.pnm | cat >tmp-row.pnm"); |
| if ($row == 0) { |
| system("mv tmp-row.pnm table.pnm"); |
| } else { |
| system("mv table.pnm table.pnm~; pnmcat -tb table.pnm~ tmp-row.pnm >table.pnm"); |
| } |
| $row++; |
| $col = 0; |
| system("rm -f tmp-*.pnm table.pnm~"); |
| } |
| } |
| if ($col > 0) { |
| system("pnmcat -lr tmp-*.pnm | cat >tmp-row.pnm"); |
| if ($row == 0) { |
| system("mv tmp-row.pnm table.pnm"); |
| } else { |
| system("mv table.pnm table.pnm~; pnmcat -tb -jleft -black table.pnm~ tmp-row.pnm >table.pnm"); |
| } |
| } |
| system("rm -f table.gif ; ppmtogif table.pnm > table.gif"); |
| system("rm -f tmp-*.pnm table.pnm~ table.pnm"); |
| } elsif (/^table$/) { |
| # go through all used names to print full table |
| print "<TABLE border=2>\n" if $html; |
| for $i (sort({$a <=> $b} keys(%used))) { |
| next if ($name{$i} eq "<control>"); |
| if ($html) { |
| $sources = $used{$i}; |
| $sources =~ s/\]\[/, /g; |
| $sources =~ s/^\[//g; |
| $sources =~ s/\]$//g; |
| $sources =~ s/\{(..)\}/<SUB>$1<\/SUB>/g; |
| $j = sprintf("%04X", $i); |
| $j =~ /(..)(..)/; |
| $gif = "Small.Glyphs/$1/U$j.gif"; |
| print "<TR>"; |
| print "<TD><img width=32 height=32 src=\"$gif\">" if $image; |
| printf("<TD>&#%d;", $i) if $adducs; |
| print "<TD><SAMP>$j</SAMP><TD><SAMP>" . name($i); |
| print " ($comment{$i})" if $comment{$i}; |
| print "</SAMP><TD><SMALL>$sources</SMALL>\n"; |
| } else { |
| printf("%04X \# ", $i); |
| print pack("U", $i) . " " if $adducs; |
| print name($i) ."\n"; |
| } |
| } |
| print "</TABLE>\n" if $html; |
| } elsif (/^imgblock$/) { |
| $width = 16; |
| $width = $1 if /giftable([0-9]+)/; |
| $col = 0; |
| $subline = ""; |
| print "\n<P><TABLE cellspacing=0 cellpadding=0>"; |
| for $i (sort({$a <=> $b} keys(%used))) { |
| print "<TR>" if $col == 0; |
| $j = sprintf("%04X", $i); |
| $j =~ /(..)(..)/; |
| $gif = "Small.Glyphs/$1/U$j.gif"; |
| $alt = name($i); |
| print "<TD><img width=32 height=32 src=\"$gif\" alt=\"$alt\">"; |
| $subline .= "<TD><SMALL><SAMP>$j</SAMP></SMALL>"; |
| if (++$col == $width) { |
| print "<TR align=center>$subline"; |
| $col = 0; |
| $subline = ""; |
| } |
| } |
| print "<TR align=center>$subline" if ($col > 0); |
| print "</TABLE>\n"; |
| } elsif (/^sources$/) { |
| # count how many characters are attributed to the various source set combinations |
| print "<P>Number of occurences of source character set combinations:\n<TABLE border=2>" if $html; |
| for $i (keys(%used)) { |
| next if ($name{$i} eq "<control>"); |
| $sources = $used{$i}; |
| $sources =~ s/\]\[/, /g; |
| $sources =~ s/^\[//g; |
| $sources =~ s/\]$//g; |
| $sources =~ s/\{(..)\}//g; |
| $contribs{$sources} += 1; |
| } |
| for $j (keys(%contribs)) { |
| print "<TR><TD>$contribs{$j}<TD>$j\n" if $html; |
| } |
| print "</TABLE>\n" if $html; |
| } elsif (/^compact$/) { |
| # print compact table in P10 MES format |
| print "<P>Compact representation of this character set:\n<TABLE border=2>" if $html; |
| print "<TR><TD><B>Rows</B><TD><B>Positions (Cells)</B>" if $html; |
| print "\n# Plane 00\n# Rows\tPositions (Cells)\n" unless $html; |
| $current_row = ''; |
| $start_col = ''; |
| $last_col = ''; |
| for $i (sort({$a <=> $b} keys(%used))) { |
| next if ($name{$i} eq "<control>"); |
| $row = sprintf("%02X", $i >> 8); |
| $col = sprintf("%02X", $i & 0xff); |
| if ($row ne $current_row) { |
| if (($last_col ne '') and ($last_col ne $start_col)) { |
| print "-$last_col"; |
| print "</SAMP>" if $html; |
| } |
| print "<TR><TD><SAMP>$row</SAMP><TD><SAMP>" if $html; |
| print "\n $row\t" unless $html; |
| $len = 0; |
| $current_row = $row; |
| $start_col = ''; |
| } |
| if ($start_col eq '') { |
| print "$col"; |
| $len += 2; |
| $start_col = $col; |
| $last_col = $col; |
| } elsif (hex($col) == hex($last_col) + 1) { |
| $last_col = $col; |
| } else { |
| if ($last_col ne $start_col) { |
| print "-$last_col"; |
| $len += 3; |
| } |
| if ($len > 60 && !$html) { |
| print "\n $row\t"; |
| $len = 0; |
| }; |
| print " " if $len; |
| print "$col"; |
| $len += 2 + !! $len; |
| $start_col = $col; |
| $last_col = $col; |
| } |
| } |
| if (($last_col ne '') and ($last_col ne $start_col)) { |
| print "-$last_col"; |
| print "</SAMP>" if $html; |
| } |
| print "\n" if ($current_row ne ''); |
| print "</TABLE>\n" if $html; |
| print "\n"; |
| } elsif (/^c$/) { |
| # print table as C interval array |
| print "{"; |
| $last_i = ''; |
| $columns = 1; |
| $col = $columns; |
| for $i (sort({$a <=> $b} keys(%used))) { |
| next if ($name{$i} eq "<control>"); |
| if ($last_i eq '') { |
| if (++$col > $columns) { $col = 1; print "\n "; } |
| printf(" { 0x%x, ", $i); |
| $last_i = $i; |
| } elsif ($i == $last_i + 1) { |
| $last_i = $i; |
| } else { |
| printf("0x%x },", $last_i); |
| if (++$col > $columns) { $col = 1; print "\n "; } |
| printf(" { 0x%x, ", $i); |
| $last_i = $i; |
| } |
| } |
| if ($last_i ne '') { |
| printf("0x%x }", $last_i); |
| } |
| print "\n};\n"; |
| } elsif (/^utf8-list$/) { |
| $col = 0; |
| $block = 0; |
| $last = -1; |
| for $i (sort({$a <=> $b} keys(%used))) { |
| next if ($name{$i} eq "<control>"); |
| while ($blockend[$block] < $i && $block < $blocks - 1) { |
| $block++; |
| } |
| if ($last <= $blockend[$block-1] && |
| $i < $blockstart[$block]) { |
| print "\n" if ($col); |
| printf "\nFree block (U+%04X-U+%04X):\n\n", |
| $blockend[$block-1] + 1, $blockstart[$block] - 1; |
| $col = 0; |
| } |
| if ($last < $blockstart[$block] && $i >= $blockstart[$block]) { |
| print "\n" if ($col); |
| printf "\n$blockname[$block] (U+%04X-U+%04X):\n\n", |
| $blockstart[$block], $blockend[$block]; |
| $col = 0; |
| } |
| if ($category{$i} eq 'Mn') { |
| # prefix non-spacing character with U+25CC DOTTED CIRCLE |
| print "\x{25CC}"; |
| } elsif ($category{$i} eq 'Me') { |
| # prefix enclosing non-spacing character with space |
| print " "; |
| } |
| print pack("U", $i); |
| $col += 1 + iswide($i); |
| if ($col >= 64) { |
| print "\n"; |
| $col = 0; |
| } |
| $last = $i; |
| } |
| print "\n" if ($col); |
| } elsif (/^collections$/) { |
| $block = 0; |
| $last = -1; |
| for $i (sort({$a <=> $b} keys(%used))) { |
| next if ($name{$i} eq "<control>"); |
| while ($blockend[$block] < $i && $block < $blocks - 1) { |
| $block++; |
| } |
| if ($last < $blockstart[$block] && $i >= $blockstart[$block]) { |
| print $blockname[$block], |
| " " x (40 - length($blockname[$block])); |
| printf "%04X-%04X\n", |
| $blockstart[$block], $blockend[$block]; |
| } |
| $last = $i; |
| } |
| } elsif (/^nr$/) { |
| print "<P>" if $html; |
| print "# " unless $html; |
| print "Number of characters in above table: "; |
| $count = 0; |
| for $i (keys(%used)) { |
| $count++ unless $name{$i} eq "<control>"; |
| } |
| print $count; |
| print "\n"; |
| } elsif (/^clean$/) { |
| # remove characters from set that are not in $unicodedata |
| for $i (keys(%used)) { |
| delete $used{$i} unless is_unicode($i); |
| } |
| } elsif (/^unknown$/) { |
| # remove characters from set that are in $unicodedata |
| for $i (keys(%used)) { |
| delete $used{$i} if is_unicode($i); |
| } |
| } else { |
| die("Unknown command line command '$_'\n"); |
| }; |
| } |