rfc1345convert

   1 #!/usr/bin/env perl
   2
   3 use strict;
   4 use warnings;
   5
   6 use open OUT => ':utf8', ':std';
   7 use Data::Dumper;
   8
   9 our $VERSION = '1.01';
  10
  11 # determine input data source
  12 my $input;
  13 if (@ARGV) {
  14         # manual contents specified (either piped or filename(s) given)
  15         $input = \*ARGV;
  16 }
  17 else {
  18         # automatic download from official website
  19         require LWP::Simple;
  20         my $contents = LWP::Simple::get('http://www.ietf.org/rfc/rfc1345.txt')
  21                 or die "Couldn't download RFC-1345 from ietf.org";
  22         open $input, '<', \$contents;  # emulate file handle
  23 }
  24
  25 # skip everything until a character indented by 1 space (table start)
  26 do {
  27         $_ = readline $input;
  28         defined or die "Premature input end";
  29 } until /^\s\S/;
  30
  31 my @t = $_;  # add first line (already read, assume it's ok)
  32
  33 # read the rest of the character table
  34 while ($_ = readline $input) {
  35         # check for table end (chapter 4)
  36         last if /^\d/;
  37
  38         # parse table lines (ignore (unindented) page break)
  39         next unless s/^ //;
  40         chomp;
  41
  42         # add the line to @t
  43         if (s/^ {15}/ /) {
  44                 # continuation line (add to last entry)
  45                 $t[-1] .= $_;
  46         }
  47         else {
  48                 # add a new entry
  49                 push @t, $_;
  50         }
  51 }
  52
  53 # create a hash of desired input
  54 my %di;
  55 for (@t) {
  56         my ($mnem, $char, $name) = split / +/, $_, 3;
  57         next if length $mnem != 2;
  58         $di{$mnem} = hex $char;
  59 }
  60
  61 # XXX
  62 my %trans = (
  63         0xE001 => 0,  # join lines: not accepted
  64         0xE004 => 0,  # umlaut is no different from diaeresis 0x0308
  65         0xE005 => 0x0344, # discouraged
  66         0xE006 => 0x0300,
  67         0xE007 => 0x0301,
  68         0xE008 => 0x0302,
  69         0xE009 => 0x0303,
  70         0xE00A => 0x0304,
  71         0xE00B => 0x0306,
  72         0xE00C => 0x0307,
  73         0xE00D => 0x0308,
  74         0xE00E => 0x030A,
  75         0xE00F => 0x030B,
  76         0xE010 => 0x030C,
  77         0xE011 => 0x0327,
  78         0xE012 => 0x0328,
  79         0xE013 => 0x0332,
  80         0xE014 => 0x0333,
  81         0xE015 => 0x0338,
  82         0xE016 => 0x0345,
  83         0xE017 => 0x0314,
  84         0xE018 => 0x0313,
  85         0xE019 => 0x1FFE,
  86         0xE01A => 0x1FBF,
  87         0xE01B => 0x03D0,  # middle beta = curled beta?
  88         0xE01C => 0x25CB,
  89         0xE01D => 0x0192,
  90         0xE01E => 0x0292,
  91         0xE01F => 0x33C2,  # am, compatibility char
  92         0xE020 => 0x33D8,  # pm, compatibility char
  93         0xE021 => 0x2121,
  94         0xE022 => 0xFE8E,
  95         0xE023 => 0,  # dutch guilder 0192 is already encoded, and not very useful anyway
  96         0xE024 => 0x0393,
  97         0xE025 => 0x20D7,  # also 20D1; non-spacing
  98         0xE026 => 0x1FEF,
  99         0xE027 => 0x1FC0,
 100         0xE028 => 0x01F0, #but uppercase
 101 );
 102 for (values %di) {
 103         $_ >= 0xE000 or next;
 104         $_ = $trans{$_} if defined $trans{$_};
 105 }
 106
 107 # personal addendums
 108 my @extra;
 109 if (-r 'shiar.inc.txt') {
 110         open my $include, '<:utf8', 'shiar.inc.txt';
 111         for (readline $include) {
 112                 m{^([!"%'-Z_a-z]{2}) (.)} or next;
 113                 warn("$1 already defined"), next if defined $di{$1};
 114                 $di{$1} = ord $2;
 115                 push @extra, $1;
 116         }
 117 }
 118 warn $@ if $@;
 119
 120 $di{chr $_} = $_ for 32 .. 126;
 121 $di{'\\'.$_} = delete $di{$_} for '{', '}', '\\';
 122
 123 # optionally get unicode character information
 124 my %info = eval {
 125         require Unicode::UCD;
 126         map {
 127                 $_ => Unicode::UCD::charinfo($di{$_})
 128                         || { block => '?', category => 'Xn', name => '', script => '' }
 129         } keys %di;
 130 };
 131
 132 # add custom categories for certain blocks
 133 for (values %info) {
 134         $_->{category} .= ' Xa' if $_->{block} eq 'Basic Latin';
 135         $_->{category} .= ' Xl' if $_->{block} eq 'Latin-1 Supplement';
 136 }
 137
 138 # mark unofficial extras as such
 139 $info{$_}->{category} .= ' Xz' for @extra;
 140
 141 for (keys %di) {
 142         $info{$_}->{string} = chr(9676) . chr($di{$_}) if $info{$_}->{combining};
 143         # find control characters (first 32 chars from 0 and 128)
 144         next unless ($di{$_} & ~0b1001_1111) == 0 or $di{$_} == 127;
 145         # rename to something more descriptive
 146         $info{$_}->{name} = $info{$_}->{unicode10}
 147                 ? '<'.$info{$_}->{unicode10}.'>'  # the old name was much more useful
 148                 : sprintf('<control U+%04X>', $di{$_});  # at least identify by value
 149         # show descriptive symbols instead of control chars themselves
 150         $info{$_}->{string} = $di{$_} < 32 ? chr($di{$_} + 0x2400) : chr(0xFFFD);
 151 }
 152
 153 # convert info hashes into arrays of strings to output in display order
 154 for my $row (values %info) {
 155         $row = [ map { $row->{$_} } qw/name category script string/ ];
 156         # strip off trailing missing values (especially string may be unknown)
 157         defined $row->[-1] ? last : pop @$row for 1 .. @$row;
 158 }
 159
 160 # output perl code of hash
 161 # (assume no backslashes or curlies, so we can just q{} w/o escaping)
 162 print "+{\n";
 163 printf '(map {$_=>0} qw{%s}),'."\n", join(' ',
 164         map { substr($_, 1, 1).substr($_, 0, 1) } sort keys %di
 165 );
 166 printf "q{%s}=>[%s],\n", $_, join(',',
 167         $di{$_},   # original code point
 168         $info{$_}  # optional additional arguments
 169                 ? map {"'$_'"} @{ $info{$_} }
 170                 : ()
 171 ) for sort keys %di;
 172 print "}\n";
 173
 174 __END__
 175
 176 =head1 NAME
 177
 178 rfc1345convert - Output digraph data from RFC-1345
 179
 180 =head1 SYNOPSIS
 181
 182 Download and convert the digraph specification from ietf.org:
 183
 184     rfc1345convert > digraphs.inc.pl
 185
 186 Test by printing the character for DO (should be a dollar sign):
 187
 188     perl -e'$di = do "digraphs.inc.pl"; print chr $di->{DO}->[0]'
 189
 190 Manual specification of source retrieval:
 191
 192     rfc1345convert rfc1345.txt
 193     curl $url | rfc1345convert -
 194
 195 =head1 DESCRIPTION
 196
 197 Parses the official RFC-1345 document, searching the
 198 'character mnemonic table' for all digraph definitions.
 199 If successful, Perl code is output resulting in a hash
 200 with character data keyed by digraph.
 201 Any errors and warnings are given at STDERR.
 202
 203 The value can either be a scalar string containing another
 204 digraph which can be considered identical (usually inverted),
 205 or an array ref containing at least the resulting character's
 206 Unicode code point value.  If available, the following UCD data
 207 is appended: character name, category, script, and output string.
 208 For example:
 209
 210  +{
 211    AE => [198, 'LATIN CAPITAL LETTER AE', 'Lu Xl', 'Latin'],
 212    EA => 'AE',
 213   }
 214
 215 =head1 AUTHOR
 216
 217 Mischa POSLAWSKY <perl@shiar.org>
 218
 219 =head1 LICENSE
 220
 221 Licensed under the GNU Affero General Public License version 3.
 222