7 use open OUT => ':utf8', ':std';
10 our $VERSION = '1.02';
12 # expect input data source at command line
13 @ARGV or die "Specify input source file or - for STDIN\n";
15 # skip everything until a character indented by 1 space (table start)
18 defined or die "Premature input end";
21 my @t = $_; # add first line (already read, assume it's ok)
23 # read the rest of the character table
24 while ($_ = readline) {
25 # check for table end (chapter 4)
28 # parse table lines (ignore (unindented) page break)
34 # continuation line (add to last entry)
43 # create a hash of desired input
46 my ($mnem, $char, $name) = split / +/, $_, 3;
47 next if length $mnem != 2;
48 $di{$mnem} = hex $char;
53 0xE001 => 0, # join lines: not accepted
54 0xE004 => 0, # umlaut is no different from diaeresis 0x0308
55 0xE005 => 0x0344, # discouraged
77 0xE01B => 0x03D0, # middle beta = curled beta?
81 0xE01F => 0x33C2, # am, compatibility char
82 0xE020 => 0x33D8, # pm, compatibility char
85 0xE023 => 0, # dutch guilder 0192 is already encoded, and not very useful anyway
87 0xE025 => 0x20D7, # also 20D1; non-spacing
90 0xE028 => 0x01F0, #but uppercase
94 $_ = $trans{$_} if defined $trans{$_};
99 if (-r 'shiar.inc.txt') {
100 open my $include, '<:utf8', 'shiar.inc.txt';
101 for (readline $include) {
102 m{^(\$?[!"%'-Z_a-z]{2}) (.)} or next;
103 warn("$1 already defined"), next if defined $di{$1};
110 $di{chr $_} = $_ for 32 .. 126;
111 $di{'\\'.$_} = delete $di{$_} for '{', '}', '\\';
113 # optionally get unicode character information
115 require Unicode::UCD;
117 $_ => Unicode::UCD::charinfo($di{$_})
118 || { block => '?', category => 'Xn', name => '', script => '' }
122 # add custom categories for certain blocks
124 $_->{category} .= ' Xa' if $_->{block} eq 'Basic Latin';
125 $_->{category} .= ' Xl' if $_->{block} eq 'Latin-1 Supplement';
128 # mark unofficial extras as such
129 $info{$_}->{category} .= ' Xz' for @extra;
132 $info{$_}->{string} = chr(9676) . chr($di{$_}) if $info{$_}->{combining};
133 # find control characters (first 32 chars from 0 and 128)
134 next unless ($di{$_} & ~0b1001_1111) == 0 or $di{$_} == 127;
135 # rename to something more descriptive
136 $info{$_}->{name} = $info{$_}->{unicode10}
137 ? '<'.$info{$_}->{unicode10}.'>' # the old name was much more useful
138 : sprintf('<control U+%04X>', $di{$_}); # at least identify by value
139 # show descriptive symbols instead of control chars themselves
140 $info{$_}->{string} = $di{$_} < 32 ? chr($di{$_} + 0x2400) : chr(0xFFFD);
142 # presentational string for some control(lish) entries
143 $info{$_}->{string} = '-' for grep { $di{$_} == 0x00AD } keys %di;
144 $info{$_}->{string} = '→' for grep { $di{$_} == 0x200E } keys %di;
145 $info{$_}->{string} = '←' for grep { $di{$_} == 0x200F } keys %di;
147 # convert info hashes into arrays of strings to output in display order
148 for my $row (values %info) {
149 $row = [ map { $row->{$_} } qw/name category script string/ ];
150 # strip off trailing missing values (especially string may be unknown)
151 defined $row->[-1] ? last : pop @$row for 1 .. @$row;
154 # output perl code of hash
155 # (assume no backslashes or curlies, so we can just q{} w/o escaping)
157 printf '(map {$_=>0} qw{%s}),'."\n", join(' ',
158 map { substr($_, 1, 1).substr($_, 0, 1) } sort keys %di
160 printf "q{%s}=>[%s],\n", $_, join(',',
161 $di{$_}, # original code point
162 $info{$_} # optional additional arguments
163 ? map {"'$_'"} @{ $info{$_} }
172 mkdigraphlist - Output digraph data from RFC-1345
176 Extract digraphs from text specifications as a perl hash:
178 mkdigraphlist rfc1345.txt custom.txt > digraphs.inc.pl
180 Input can be the literal RFC (or similar) document:
182 curl http://www.ietf.org/rfc/rfc1345.txt | mkdigraphlist -
184 Test by printing the character for DO (should be a dollar sign):
186 perl -e'$di = do "digraphs.inc.pl"; print chr $di->{DO}->[0]'
190 Parses the official RFC-1345 document, searching the
191 'character mnemonic table' for all digraph definitions.
192 If successful, Perl code is output resulting in a hash
193 with character data keyed by digraph.
194 Any errors and warnings are given at STDERR.
196 The value can either be a scalar string containing another
197 digraph which can be considered identical (usually inverted),
198 or an array ref containing at least the resulting character's
199 Unicode code point value. If available, the following UCD data
200 is appended: character name, category, script, and output string.
204 AE => [198, 'LATIN CAPITAL LETTER AE', 'Lu Xl', 'Latin'],
210 Mischa POSLAWSKY <perl@shiar.org>
214 Licensed under the GNU Affero General Public License version 3.