rfc1345convert

   1 #!/usr/bin/env perl
   2
   3 use strict;
   4 use warnings;
   5
   6 use Data::Dumper;
   7
   8 our $VERSION = '1.01';
   9
  10 # determine input data source
  11 my $input;
  12 if (@ARGV) {
  13         # manual contents specified (either piped or filename(s) given)
  14         $input = \*ARGV;
  15 }
  16 else {
  17         # automatic download from official website
  18         require LWP::Simple;
  19         my $contents = LWP::Simple::get('http://www.ietf.org/rfc/rfc1345.txt')
  20                 or die "Couldn't download RFC-1345 from ietf.org";
  21         open $input, '<', \$contents;  # emulate file handle
  22 }
  23
  24 # skip everything until a character indented by 1 space (table start)
  25 do {
  26         $_ = readline $input;
  27         defined or die "Premature input end";
  28 } until /^\s\S/;
  29
  30 my @t = $_;  # add first line (already read, assume it's ok)
  31
  32 # read the rest of the character table
  33 while ($_ = readline $input) {
  34         # check for table end (chapter 4)
  35         last if /^\d/;
  36
  37         # parse table lines (ignore (unindented) page break)
  38         next unless s/^ //;
  39         chomp;
  40
  41         # add the line to @t
  42         if (s/^ {15}/ /) {
  43                 # continuation line (add to last entry)
  44                 $t[-1] .= $_;
  45         }
  46         else {
  47                 # add a new entry
  48                 push @t, $_;
  49         }
  50 }
  51
  52 # create a hash of desired input
  53 my %di;
  54 for (@t) {
  55         my ($mnem, $char, $name) = split / +/, $_, 3;
  56         next if length $mnem != 2;
  57         $di{$mnem} = hex $char;
  58 }
  59
  60 # XXX
  61 my %trans = (
  62         0xE001 => 0,  # join lines: not accepted
  63         0xE004 => 0,  # umlaut is no different from diaeresis 0x0308
  64         0xE005 => 0x0344, # discouraged
  65         0xE006 => 0x0300,
  66         0xE007 => 0x0301,
  67         0xE008 => 0x0302,
  68         0xE009 => 0x0303,
  69         0xE00A => 0x0304,
  70         0xE00B => 0x0306,
  71         0xE00C => 0x0307,
  72         0xE00D => 0x0308,
  73         0xE00E => 0x030A,
  74         0xE00F => 0x030B,
  75         0xE010 => 0x030C,
  76         0xE011 => 0x0327,
  77         0xE012 => 0x0328,
  78         0xE013 => 0x0332,
  79         0xE014 => 0x0333,
  80         0xE015 => 0x0338,
  81         0xE016 => 0x0345,
  82         0xE017 => 0x0314,
  83         0xE018 => 0x0313,
  84         0xE019 => 0x1FFE,
  85         0xE01A => 0x1FBF,
  86         0xE01B => 0x03D0,  # middle beta = curled beta?
  87         0xE01C => 0x25CB,
  88         0xE01D => 0x0192,
  89         0xE01E => 0x0292,
  90         0xE01F => 0x33C2,  # am, compatibility char
  91         0xE020 => 0x33D8,  # pm, compatibility char
  92         0xE021 => 0x2121,
  93         0xE022 => 0xFE8E,
  94         0xE023 => 0,  # dutch guilder 0192 is already encoded, and not very useful anyway
  95         0xE024 => 0x0393,
  96         0xE025 => 0x20D7,  # also 20D1; non-spacing
  97         0xE026 => 0x1FEF,
  98         0xE027 => 0x1FC0,
  99         0xE028 => 0x01F0, #but uppercase
 100 );
 101 for (values %di) {
 102         $_ >= 0xE000 or next;
 103         $_ = $trans{$_} if defined $trans{$_};
 104 }
 105
 106 # personal addendums
 107 my @extra;
 108 if (-r 'shiar.inc.txt') {
 109         open my $include, '<:utf8', 'shiar.inc.txt';
 110         for (readline $include) {
 111                 m{^([!"%'-Z_a-z]{2}) (.)} or next;
 112                 warn("$1 already defined"), next if defined $di{$1};
 113                 $di{$1} = ord $2;
 114                 push @extra, $1;
 115         }
 116 }
 117 warn $@ if $@;
 118
 119 # optionally get unicode character information
 120 my %info = eval {
 121         require Unicode::UCD;
 122         map {
 123                 $_ => Unicode::UCD::charinfo($di{$_})
 124                         || { block => '?', category => 'Xn', name => '', script => '' }
 125         } keys %di;
 126 };
 127
 128 # add custom categories for certain blocks
 129 for (values %info) {
 130         $_->{category} .= ' Xa' if $_->{block} eq 'Basic Latin';
 131         $_->{category} .= ' Xl' if $_->{block} eq 'Latin-1 Supplement';
 132 }
 133
 134 # mark unofficial extras as such
 135 $info{$_}->{category} .= ' Xz' for @extra;
 136
 137 for (keys %di) {
 138         $info{$_}->{string} = chr(9676) . chr($di{$_}) if $info{$_}->{combining};
 139         # find control characters (first 32 chars from 0 and 128)
 140         next unless ($di{$_} & ~0b1001_1111) == 0 or $di{$_} == 127;
 141         # rename to something more descriptive
 142         $info{$_}->{name} = $info{$_}->{unicode10}
 143                 ? '<'.$info{$_}->{unicode10}.'>'  # the old name was much more useful
 144                 : sprintf('<control U+%04X>', $di{$_});  # at least identify by value
 145         # show descriptive symbols instead of control chars themselves
 146         $info{$_}->{string} = $di{$_} < 32 ? chr($di{$_} + 0x2400) : chr(0xFFFD);
 147 }
 148
 149 # convert info hashes into arrays of strings to output in display order
 150 for my $row (values %info) {
 151         $row = [ map { $row->{$_} } qw/name category script string/ ];
 152         # strip off trailing missing values (especially string may be unknown)
 153         defined $row->[-1] ? last : pop @$row for 1 .. @$row;
 154 }
 155
 156 # output perl code of hash
 157 # (assume no backslashes or curlies, so we can just q{} w/o escaping)
 158 print "+{\n";
 159 printf '(map {$_=>0} qw{%s}),'."\n", join(' ',
 160         map { substr($_, 1, 1).substr($_, 0, 1) } sort keys %di
 161 );
 162 printf "q{%s}=>[%s],\n", $_, join(',',
 163         $di{$_},   # original code point
 164         $info{$_}  # optional additional arguments
 165                 ? map {"'$_'"} @{ $info{$_} }
 166                 : ()
 167 ) for sort keys %di;
 168 print "}\n";
 169
 170 __END__
 171
 172 =head1 NAME
 173
 174 rfc1345convert - Output digraph data from RFC-1345
 175
 176 =head1 SYNOPSIS
 177
 178 Download and convert the digraph specification from ietf.org:
 179
 180     rfc1345convert > digraphs.inc.pl
 181
 182 Test by printing the character for DO (should be a dollar sign):
 183
 184     perl -e'$di = do "digraphs.inc.pl"; print chr $di->{DO}->[0]'
 185
 186 Manual specification of source retrieval:
 187
 188     rfc1345convert rfc1345.txt
 189     curl $url | rfc1345convert -
 190
 191 =head1 DESCRIPTION
 192
 193 Parses the official RFC-1345 document, searching the
 194 'character mnemonic table' for all digraph definitions.
 195 If successful, Perl code is output resulting in a hash
 196 with character data keyed by digraph.
 197 Any errors and warnings are given at STDERR.
 198
 199 The value can either be a scalar string containing another
 200 digraph which can be considered identical (usually inverted),
 201 or an array ref containing at least the resulting character's
 202 Unicode code point value.  If available, the following UCD data
 203 is appended: character name, category, script, and output string.
 204 For example:
 205
 206  +{
 207    AE => [198, 'LATIN CAPITAL LETTER AE', 'Lu Xl', 'Latin'],
 208    EA => 'AE',
 209   }
 210
 211 =head1 AUTHOR
 212
 213 Mischa POSLAWSKY <perl@shiar.org>
 214
 215 =head1 LICENSE
 216
 217 Licensed under the GNU Affero General Public License version 3.
 218