7 use open OUT => ':utf8', ':std';
10 our $VERSION = '1.01';
12 # determine input data source
15 # manual contents specified (either piped or filename(s) given)
19 # automatic download from official website
21 my $contents = LWP::Simple::get('http://www.ietf.org/rfc/rfc1345.txt')
22 or die "Couldn't download RFC-1345 from ietf.org";
23 open $input, '<', \$contents; # emulate file handle
26 # skip everything until a character indented by 1 space (table start)
29 defined or die "Premature input end";
32 my @t = $_; # add first line (already read, assume it's ok)
34 # read the rest of the character table
35 while ($_ = readline $input) {
36 # check for table end (chapter 4)
39 # parse table lines (ignore (unindented) page break)
45 # continuation line (add to last entry)
54 # create a hash of desired input
57 my ($mnem, $char, $name) = split / +/, $_, 3;
58 next if length $mnem != 2;
59 $di{$mnem} = hex $char;
64 0xE001 => 0, # join lines: not accepted
65 0xE004 => 0, # umlaut is no different from diaeresis 0x0308
66 0xE005 => 0x0344, # discouraged
88 0xE01B => 0x03D0, # middle beta = curled beta?
92 0xE01F => 0x33C2, # am, compatibility char
93 0xE020 => 0x33D8, # pm, compatibility char
96 0xE023 => 0, # dutch guilder 0192 is already encoded, and not very useful anyway
98 0xE025 => 0x20D7, # also 20D1; non-spacing
101 0xE028 => 0x01F0, #but uppercase
104 $_ >= 0xE000 or next;
105 $_ = $trans{$_} if defined $trans{$_};
110 if (-r 'shiar.inc.txt') {
111 open my $include, '<:utf8', 'shiar.inc.txt';
112 for (readline $include) {
113 m{^(\$?[!"%'-Z_a-z]{2}) (.)} or next;
114 warn("$1 already defined"), next if defined $di{$1};
121 $di{chr $_} = $_ for 32 .. 126;
122 $di{'\\'.$_} = delete $di{$_} for '{', '}', '\\';
124 # optionally get unicode character information
126 require Unicode::UCD;
128 $_ => Unicode::UCD::charinfo($di{$_})
129 || { block => '?', category => 'Xn', name => '', script => '' }
133 # add custom categories for certain blocks
135 $_->{category} .= ' Xa' if $_->{block} eq 'Basic Latin';
136 $_->{category} .= ' Xl' if $_->{block} eq 'Latin-1 Supplement';
139 # mark unofficial extras as such
140 $info{$_}->{category} .= ' Xz' for @extra;
143 $info{$_}->{string} = chr(9676) . chr($di{$_}) if $info{$_}->{combining};
144 # find control characters (first 32 chars from 0 and 128)
145 next unless ($di{$_} & ~0b1001_1111) == 0 or $di{$_} == 127;
146 # rename to something more descriptive
147 $info{$_}->{name} = $info{$_}->{unicode10}
148 ? '<'.$info{$_}->{unicode10}.'>' # the old name was much more useful
149 : sprintf('<control U+%04X>', $di{$_}); # at least identify by value
150 # show descriptive symbols instead of control chars themselves
151 $info{$_}->{string} = $di{$_} < 32 ? chr($di{$_} + 0x2400) : chr(0xFFFD);
153 # presentational string for some control(lish) entries
154 $info{$_}->{string} = '-' for grep { $di{$_} == 0x00AD } keys %di;
155 $info{$_}->{string} = '→' for grep { $di{$_} == 0x200E } keys %di;
156 $info{$_}->{string} = '←' for grep { $di{$_} == 0x200F } keys %di;
158 # convert info hashes into arrays of strings to output in display order
159 for my $row (values %info) {
160 $row = [ map { $row->{$_} } qw/name category script string/ ];
161 # strip off trailing missing values (especially string may be unknown)
162 defined $row->[-1] ? last : pop @$row for 1 .. @$row;
165 # output perl code of hash
166 # (assume no backslashes or curlies, so we can just q{} w/o escaping)
168 printf '(map {$_=>0} qw{%s}),'."\n", join(' ',
169 map { substr($_, 1, 1).substr($_, 0, 1) } sort keys %di
171 printf "q{%s}=>[%s],\n", $_, join(',',
172 $di{$_}, # original code point
173 $info{$_} # optional additional arguments
174 ? map {"'$_'"} @{ $info{$_} }
183 rfc1345convert - Output digraph data from RFC-1345
187 Download and convert the digraph specification from ietf.org:
189 rfc1345convert > digraphs.inc.pl
191 Test by printing the character for DO (should be a dollar sign):
193 perl -e'$di = do "digraphs.inc.pl"; print chr $di->{DO}->[0]'
195 Manual specification of source retrieval:
197 rfc1345convert rfc1345.txt
198 curl $url | rfc1345convert -
202 Parses the official RFC-1345 document, searching the
203 'character mnemonic table' for all digraph definitions.
204 If successful, Perl code is output resulting in a hash
205 with character data keyed by digraph.
206 Any errors and warnings are given at STDERR.
208 The value can either be a scalar string containing another
209 digraph which can be considered identical (usually inverted),
210 or an array ref containing at least the resulting character's
211 Unicode code point value. If available, the following UCD data
212 is appended: character name, category, script, and output string.
216 AE => [198, 'LATIN CAPITAL LETTER AE', 'Lu Xl', 'Latin'],
222 Mischa POSLAWSKY <perl@shiar.org>
226 Licensed under the GNU Affero General Public License version 3.