6 use open OUT => ':utf8', ':std';
10 our $VERSION = '1.01';
12 # translation table for deprecated code points
14 (map {$_ => 0} 0xE000 .. 0xE03F), # omit by default if unspecified
15 0xE001 => 0, # /c join lines: not accepted
16 0xE002 => "\N{EM QUAD}", # UA unit space A
17 #> ISO-IR 008-1: unit spaces A in position 4/0 and B in
18 #> position 6/0 are fixed length spaces whereby UA ≥ UB.
19 #> […] The lengths of UA and UB are determined by the user.
20 0xE003 => "\N{EN QUAD}", # UB unit space B
21 # 0xE004 => "\N{COMBINING DIAERESIS}", # "3 combining umlaut, unified with diaeresis
22 # 0xE005 => "\N{COMBINING GREEK DIALYTIKA TONOS}", # "1 combining diaeresis+accent, unspecified
23 0xE006 => "\N{COMBINING GRAVE ACCENT}", # "! combining grave
24 0xE007 => "\N{COMBINING ACUTE ACCENT}", # "' combining acute
25 0xE008 => "\N{COMBINING CIRCUMFLEX ACCENT}", # "> combining circumflex
26 0xE009 => "\N{COMBINING TILDE}", # "? combining tilde
27 0xE00A => "\N{COMBINING MACRON}", # "- combining macron
28 0xE00B => "\N{COMBINING BREVE}", # "( combining breve
29 0xE00C => "\N{COMBINING DOT ABOVE}", # ". combining dot above
30 0xE00D => "\N{COMBINING DIAERESIS}", # ": combining diaeresis
31 0xE00E => "\N{COMBINING RING ABOVE}", # "0 combining ring above
32 0xE00F => "\N{COMBINING DOUBLE ACUTE ACCENT}", # "" combining double accute (sic)
33 0xE010 => "\N{COMBINING CARON}", # "< combining caron
34 0xE011 => "\N{COMBINING CEDILLA}", # ", combining cedilla
35 0xE012 => "\N{COMBINING OGONEK}", # "; combining ogonek
36 0xE013 => "\N{COMBINING LOW LINE}", # "_ combining low line
37 0xE014 => "\N{COMBINING DOUBLE LOW LINE}", # "= combining double low line
38 0xE015 => "\N{COMBINING LONG SOLIDUS OVERLAY}", # "/ combining long solidus
39 0xE016 => "\N{COMBINING GREEK YPOGEGRAMMENI}", # "i combining greek iota below
40 0xE017 => "\N{COMBINING REVERSED COMMA ABOVE}", # "d combining greek dasia pneumata
41 0xE018 => "\N{COMBINING COMMA ABOVE}", # "p combining greek psili pneumata
42 0xE019 => "\N{GREEK DASIA}", # ;; greek dasia pneumata
43 0xE01A => "\N{GREEK PSILI}", # ,, greek psili pneumata
44 0xE01B => "\N{GREEK BETA SYMBOL}", # b3 middle beta = curled beta?
45 0xE01C => "\N{WHITE CIRCLE}", # Ci circle
46 0xE01D => "\N{LATIN SMALL LETTER F WITH HOOK}", # f( function sign
47 0xE01E => "\N{LATIN SMALL LETTER EZH}", # ed ezh
48 0xE01F => "\N{SQUARE AM}", # am, compatibility char
49 0xE020 => "\N{SQUARE PM}", # pm, compatibility char
50 0xE021 => "\N{TELEPHONE SIGN}", # Tel, compatibility char
51 0xE022 => "\N{ARABIC LETTER ALEF FINAL FORM}", # a+: final alef compatibility
52 # 0xE023 => "\N{LATIN SMALL LETTER F WITH HOOK}", # Fl dutch guilder, unified with function sign, obsolete
53 # 0xE024 => "\N{GREEK CAPITAL LETTER GAMMA}", # GF gamma function sign, unified with letter
54 0xE025 => "\N{COMBINING RIGHT ARROW ABOVE}", # >V combining(?) rightwards vector above; also U+20D1
55 0xE026 => "\N{GREEK VARIA}", # !* greek varia
56 0xE027 => "\N{GREEK PERISPOMENI}", # ?* greek perispomeni
57 0xE028 => "J\N{COMBINING CARON}", # J< J-caron = uppercase U+01F0, no single character
60 # expect input data source at command line
61 @ARGV or die "Specify input source file or - for STDIN\n";
63 # skip everything until a character indented by 1 space (table start)
66 defined or die "Premature input end";
67 } until s/^\s(?=\S)//;
70 my @line = $_; # add first line (already read, assume it's ok)
72 # read the rest of the character table
73 while ($_ = readline) {
74 # check for table end (chapter 4)
77 # parse table lines (ignore (unindented) page break)
81 # append line contents
83 # continuation line (add to last entry)
92 # output perl code of hash
93 # (assume no backslashes or curlies, so we can just q{} w/o escaping)
94 say "# automatically generated by $0";
98 my ($mnem, $chrhex, $name) = split / +/, $_, 3;
99 next if length $mnem != 2;
100 my $chrnum = hex $chrhex;
101 my $chr = $replace{$chrnum} // chr $chrnum or next;
102 my $chrstr = pp $chr;
103 say "q{$mnem} => $chrstr, # $name";
111 mkdigraphs-rfc - Output digraph data from RFC-1345
115 Extract digraphs from text specifications as a perl hash:
117 mkdigraphs-rfc rfc1345.txt >digraphs-rfc.inc.pl
119 Input can be the literal RFC (or similar) document:
121 curl http://www.ietf.org/rfc/rfc1345.txt | mkdigraphlist -
123 Test by printing the character for DO (should be a dollar sign):
125 perl -e'$di = do "digraphs-rfc.inc.pl"; print chr $di->{DO}'
129 Parses the official RFC-1345 document, searching the
130 'character mnemonic table' for all digraph definitions.
131 If successful, Perl code is output resulting in a hash
132 with Unicode code points keyed by digraph.
133 Obsolete values (references to private use area)
134 are converted to modern alternatives.
135 Any errors and warnings are given at STDERR.
139 Mischa POSLAWSKY <perl@shiar.org>
143 Licensed under the GNU Affero General Public License version 3.