6 use open OUT => ':utf8', ':std';
11 # determine input data source
14 # manual contents specified (either piped or filename(s) given)
18 # automatic download from official website
20 my $contents = LWP::Simple::get('http://www.ietf.org/rfc/rfc1345.txt')
21 or die "Couldn't download RFC-1345 from ietf.org";
22 open $input, '<', \$contents; # emulate file handle
25 # skip everything until a character indented by 1 space (table start)
28 defined or die "Premature input end";
31 my @t = $_; # add first line (already read, assume it's ok)
33 # read the rest of the character table
34 while ($_ = readline $input) {
35 # check for table end (chapter 4)
38 # parse table lines (ignore (unindented) page break)
44 # continuation line (add to last entry)
53 # create a hash of desired input
56 my ($mnem, $char, $name) = split / +/, $_, 3;
57 next if length $mnem != 2;
58 $di{$mnem} = hex $char;
63 0xE001 => 0, # join lines: not accepted
64 0xE004 => 0, # umlaut is no different from diaeresis 0x0308
65 0xE005 => 0x0344, # discouraged
87 0xE01B => 0x03D0, # middle beta = curled beta?
91 0xE01F => 0x33C2, # am, compatibility char
92 0xE020 => 0x33D8, # pm, compatibility char
95 0xE023 => 0, # dutch guilder 0192 is already encoded, and not very useful anyway
97 0xE025 => 0x20D7, # also 20D1; non-spacing
100 0xE028 => 0x01F0, #but uppercase
103 $_ >= 0xE000 or next;
104 $_ = $trans{$_} if defined $trans{$_};
109 if (-r 'shiar.inc.txt') {
110 open my $include, '<:utf8', 'shiar.inc.txt';
111 for (readline $include) {
112 m{^([!"%'-Z_a-z]{2}) (.)} or next;
113 warn("$1 already defined"), next if defined $di{$1};
120 $di{chr $_} = $_ for 32 .. 126;
121 $di{'\\'.$_} = delete $di{$_} for '{', '}', '\\';
123 # optionally get unicode character information
125 require Unicode::UCD;
127 $_ => Unicode::UCD::charinfo($di{$_})
128 || { block => '?', category => 'Xn', name => '', script => '' }
132 # add custom categories for certain blocks
134 $_->{category} .= ' Xa' if $_->{block} eq 'Basic Latin';
135 $_->{category} .= ' Xl' if $_->{block} eq 'Latin-1 Supplement';
138 # mark unofficial extras as such
139 $info{$_}->{category} .= ' Xz' for @extra;
142 $info{$_}->{string} = chr(9676) . chr($di{$_}) if $info{$_}->{combining};
143 # find control characters (first 32 chars from 0 and 128)
144 next unless ($di{$_} & ~0b1001_1111) == 0 or $di{$_} == 127;
145 # rename to something more descriptive
146 $info{$_}->{name} = $info{$_}->{unicode10}
147 ? '<'.$info{$_}->{unicode10}.'>' # the old name was much more useful
148 : sprintf('<control U+%04X>', $di{$_}); # at least identify by value
149 # show descriptive symbols instead of control chars themselves
150 $info{$_}->{string} = $di{$_} < 32 ? chr($di{$_} + 0x2400) : chr(0xFFFD);
153 # convert info hashes into arrays of strings to output in display order
154 for my $row (values %info) {
155 $row = [ map { $row->{$_} } qw/name category script string/ ];
156 # strip off trailing missing values (especially string may be unknown)
157 defined $row->[-1] ? last : pop @$row for 1 .. @$row;
160 # output perl code of hash
161 # (assume no backslashes or curlies, so we can just q{} w/o escaping)
163 printf '(map {$_=>0} qw{%s}),'."\n", join(' ',
164 map { substr($_, 1, 1).substr($_, 0, 1) } sort keys %di
166 printf "q{%s}=>[%s],\n", $_, join(',',
167 $di{$_}, # original code point
168 $info{$_} # optional additional arguments
169 ? map {"'$_'"} @{ $info{$_} }
178 rfc1345convert - Output digraph data from RFC-1345
182 Download and convert the digraph specification from ietf.org:
184 rfc1345convert > digraphs.inc.pl
186 Test by printing the character for DO (should be a dollar sign):
188 perl -e'$di = do "digraphs.inc.pl"; print chr $di->{DO}->[0]'
190 Manual specification of source retrieval:
192 rfc1345convert rfc1345.txt
193 curl $url | rfc1345convert -
197 Parses the official RFC-1345 document, searching the
198 'character mnemonic table' for all digraph definitions.
199 If successful, Perl code is output resulting in a hash
200 with character data keyed by digraph.
201 Any errors and warnings are given at STDERR.
203 The value can either be a scalar string containing another
204 digraph which can be considered identical (usually inverted),
205 or an array ref containing at least the resulting character's
206 Unicode code point value. If available, the following UCD data
207 is appended: character name, category, script, and output string.
211 AE => [198, 'LATIN CAPITAL LETTER AE', 'Lu Xl', 'Latin'],
217 Mischa POSLAWSKY <perl@shiar.org>
221 Licensed under the GNU Affero General Public License version 3.