10 # determine input data source
13 # manual contents specified (either piped or filename(s) given)
17 # automatic download from official website
19 my $contents = LWP::Simple::get('http://www.ietf.org/rfc/rfc1345.txt')
20 or die "Couldn't download RFC-1345 from ietf.org";
21 open $input, '<', \$contents; # emulate file handle
24 # skip everything until a character indented by 1 space (table start)
27 defined or die "Premature input end";
30 my @t = $_; # add first line (already read, assume it's ok)
32 # read the rest of the character table
33 while ($_ = readline $input) {
34 # check for table end (chapter 4)
37 # parse table lines (ignore (unindented) page break)
43 # continuation line (add to last entry)
52 # create a hash of desired input
55 my ($mnem, $char, $name) = split / +/, $_, 3;
56 next if length $mnem != 2;
57 $di{$mnem} = hex $char;
62 0xE001 => 0, # join lines: not accepted
63 0xE004 => 0, # umlaut is no different from diaeresis 0x0308
64 0xE005 => 0x0344, # discouraged
86 0xE01B => 0x03D0, # middle beta = curled beta?
90 0xE01F => 0x33C2, # am, compatibility char
91 0xE020 => 0x33D8, # pm, compatibility char
94 0xE023 => 0, # dutch guilder 0192 is already encoded, and not very useful anyway
96 0xE025 => 0x20D7, # also 20D1; non-spacing
99 0xE028 => 0x01F0, #but uppercase
102 $_ >= 0xE000 or next;
103 $_ = $trans{$_} if defined $trans{$_};
108 if (-r 'shiar.inc.txt') {
109 open my $include, '<:utf8', 'shiar.inc.txt';
110 for (readline $include) {
111 m{^([!"%'-Z_a-z]{2}) (.)} or next;
112 warn("$1 already defined"), next if defined $di{$1};
119 # optionally get unicode character information
121 require Unicode::UCD;
123 $_ => Unicode::UCD::charinfo($di{$_})
124 || { block => '?', category => 'Xn', name => '', script => '' }
128 # add custom categories for certain blocks
130 $_->{category} .= ' Xa' if $_->{block} eq 'Basic Latin';
131 $_->{category} .= ' Xl' if $_->{block} eq 'Latin-1 Supplement';
134 # mark unofficial extras as such
135 $info{$_}->{category} .= ' Xz' for @extra;
138 $info{$_}->{string} = chr(9676) . chr($di{$_}) if $info{$_}->{combining};
139 # find control characters (first 32 chars from 0 and 128)
140 next unless ($di{$_} & ~0b1001_1111) == 0 or $di{$_} == 127;
141 # rename to something more descriptive
142 $info{$_}->{name} = $info{$_}->{unicode10}
143 ? '<'.$info{$_}->{unicode10}.'>' # the old name was much more useful
144 : sprintf('<control U+%04X>', $di{$_}); # at least identify by value
145 # show descriptive symbols instead of control chars themselves
146 $info{$_}->{string} = $di{$_} < 32 ? chr($di{$_} + 0x2400) : chr(0xFFFD);
149 # convert info hashes into arrays of strings to output in display order
150 for my $row (values %info) {
151 $row = [ map { $row->{$_} } qw/name category script string/ ];
152 # strip off trailing missing values (especially string may be unknown)
153 defined $row->[-1] ? last : pop @$row for 1 .. @$row;
156 # output perl code of hash
157 # (assume no backslashes or curlies, so we can just q{} w/o escaping)
159 printf '(map {$_=>0} qw{%s}),'."\n", join(' ',
160 map { substr($_, 1, 1).substr($_, 0, 1) } sort keys %di
162 printf "q{%s}=>[%s],\n", $_, join(',',
163 $di{$_}, # original code point
164 $info{$_} # optional additional arguments
165 ? map {"'$_'"} @{ $info{$_} }
174 rfc1345convert - Output digraph data from RFC-1345
178 Download and convert the digraph specification from ietf.org:
180 rfc1345convert > digraphs.inc.pl
182 Test by printing the character for DO (should be a dollar sign):
184 perl -e'$di = do "digraphs.inc.pl"; print chr $di->{DO}->[0]'
186 Manual specification of source retrieval:
188 rfc1345convert rfc1345.txt
189 curl $url | rfc1345convert -
193 Parses the official RFC-1345 document, searching the
194 'character mnemonic table' for all digraph definitions.
195 If successful, Perl code is output resulting in a hash
196 with character data keyed by digraph.
197 Any errors and warnings are given at STDERR.
199 The value can either be a scalar string containing another
200 digraph which can be considered identical (usually inverted),
201 or an array ref containing at least the resulting character's
202 Unicode code point value. If available, the following UCD data
203 is appended: character name, category, script, and output string.
207 AE => [198, 'LATIN CAPITAL LETTER AE', 'Lu Xl', 'Latin'],
213 Mischa POSLAWSKY <perl@shiar.org>
217 Licensed under the GNU Affero General Public License version 3.