5 use open OUT => ':utf8', ':std';
11 # translation table for deprecated code points
13 (map {$_ => 0} 0xE000 .. 0xE03F), # omit by default if unspecified
14 0xE001 => 0, # /c join lines: not accepted
15 0xE002 => "\N{EM QUAD}", # UA unit space A
16 #> ISO-IR 008-1: unit spaces A in position 4/0 and B in
17 #> position 6/0 are fixed length spaces whereby UA ≥ UB.
18 #> […] The lengths of UA and UB are determined by the user.
19 0xE003 => "\N{EN QUAD}", # UB unit space B
20 # 0xE004 => "\N{COMBINING DIAERESIS}", # "3 combining umlaut, unified with diaeresis
21 # 0xE005 => "\N{COMBINING GREEK DIALYTIKA TONOS}", # "1 combining diaeresis+accent, unspecified
22 0xE006 => "\N{COMBINING GRAVE ACCENT}", # "! combining grave
23 0xE007 => "\N{COMBINING ACUTE ACCENT}", # "' combining acute
24 0xE008 => "\N{COMBINING CIRCUMFLEX ACCENT}", # "> combining circumflex
25 0xE009 => "\N{COMBINING TILDE}", # "? combining tilde
26 0xE00A => "\N{COMBINING MACRON}", # "- combining macron
27 0xE00B => "\N{COMBINING BREVE}", # "( combining breve
28 0xE00C => "\N{COMBINING DOT ABOVE}", # ". combining dot above
29 0xE00D => "\N{COMBINING DIAERESIS}", # ": combining diaeresis
30 0xE00E => "\N{COMBINING RING ABOVE}", # "0 combining ring above
31 0xE00F => "\N{COMBINING DOUBLE ACUTE ACCENT}", # "" combining double accute (sic)
32 0xE010 => "\N{COMBINING CARON}", # "< combining caron
33 0xE011 => "\N{COMBINING CEDILLA}", # ", combining cedilla
34 0xE012 => "\N{COMBINING OGONEK}", # "; combining ogonek
35 0xE013 => "\N{COMBINING LOW LINE}", # "_ combining low line
36 0xE014 => "\N{COMBINING DOUBLE LOW LINE}", # "= combining double low line
37 0xE015 => "\N{COMBINING LONG SOLIDUS OVERLAY}", # "/ combining long solidus
38 0xE016 => "\N{COMBINING GREEK YPOGEGRAMMENI}", # "i combining greek iota below
39 0xE017 => "\N{COMBINING REVERSED COMMA ABOVE}", # "d combining greek dasia pneumata
40 0xE018 => "\N{COMBINING COMMA ABOVE}", # "p combining greek psili pneumata
41 0xE019 => "\N{GREEK DASIA}", # ;; greek dasia pneumata
42 0xE01A => "\N{GREEK PSILI}", # ,, greek psili pneumata
43 0xE01B => "\N{GREEK BETA SYMBOL}", # b3 middle beta = curled beta?
44 0xE01C => "\N{WHITE CIRCLE}", # Ci circle
45 0xE01D => "\N{LATIN SMALL LETTER F WITH HOOK}", # f( function sign
46 0xE01E => "\N{LATIN SMALL LETTER EZH}", # ed ezh
47 0xE01F => "\N{SQUARE AM}", # am, compatibility char
48 0xE020 => "\N{SQUARE PM}", # pm, compatibility char
49 0xE021 => "\N{TELEPHONE SIGN}", # Tel, compatibility char
50 0xE022 => "\N{ARABIC LETTER ALEF FINAL FORM}", # a+: final alef compatibility
51 # 0xE023 => "\N{LATIN SMALL LETTER F WITH HOOK}", # Fl dutch guilder, unified with function sign, obsolete
52 # 0xE024 => "\N{GREEK CAPITAL LETTER GAMMA}", # GF gamma function sign, unified with letter
53 0xE025 => "\N{COMBINING RIGHT ARROW ABOVE}", # >V combining(?) rightwards vector above; also U+20D1
54 0xE026 => "\N{GREEK VARIA}", # !* greek varia
55 0xE027 => "\N{GREEK PERISPOMENI}", # ?* greek perispomeni
56 0xE028 => "J\N{COMBINING CARON}", # J< J-caron = uppercase U+01F0, no single character
59 # expect input data source at command line
60 @ARGV or die "Specify input source file or - for STDIN\n";
62 # skip everything until a character indented by 1 space (table start)
65 defined or die "Premature input end";
66 } until s/^\s(?=\S)//;
69 my @line = $_; # add first line (already read, assume it's ok)
71 # read the rest of the character table
72 while ($_ = readline) {
73 # check for table end (chapter 4)
76 # parse table lines (ignore (unindented) page break)
80 # append line contents
82 # continuation line (add to last entry)
91 # output perl code of hash
92 # (assume no backslashes or curlies, so we can just q{} w/o escaping)
93 say "# automatically generated by $0";
97 my ($mnem, $chrhex, $name) = split / +/, $_, 3;
98 next if length $mnem != 2;
99 my $chrnum = hex $chrhex;
100 my $chr = $replace{$chrnum} // chr $chrnum or next;
101 my $chrstr = pp $chr;
102 say "q{$mnem} => $chrstr, # $name";
110 mkdigraphs-rfc - Output digraph data from RFC-1345
114 Extract digraphs from text specifications as a perl hash:
116 mkdigraphs-rfc rfc1345.txt >digraphs-rfc.inc.pl
118 Input can be the literal RFC (or similar) document:
120 curl http://www.ietf.org/rfc/rfc1345.txt | mkdigraphlist -
122 Test by printing the character for DO (should be a dollar sign):
124 perl -e'$di = do "digraphs-rfc.inc.pl"; print chr $di->{DO}'
128 Parses the official RFC-1345 document, searching the
129 'character mnemonic table' for all digraph definitions.
130 If successful, Perl code is output resulting in a hash
131 with Unicode code points keyed by digraph.
132 Obsolete values (references to private use area)
133 are converted to modern alternatives.
134 Any errors and warnings are given at STDERR.
138 Mischa POSLAWSKY <perl@shiar.org>
142 Licensed under the GNU Affero General Public License version 3.