#
# UniConv.pm -- Perl subroutines for best-effort conversion
# of UTF-8 Unicode strings into equivalent LaTeX and ASCII
# representations.
#
# Markus Kuhn

package UniConv;

use strict;
use warnings;
use utf8;   # this file is UTF-8 encoded
use Unicode::Normalize;
use charnames ':full';
use Exporter 'import';

our @EXPORT_OK = qw(utf8_to_LaTeX utf8_to_ascii);

# Runs of superscript / subscript digits 0-9.  The superscript digits are
# not contiguous in Unicode: 1, 2 and 3 live in Latin-1 (U+00B9, U+00B2,
# U+00B3), the others at U+2070 and U+2074..U+2079.
my $SUPERSCRIPT_RUN = qr/[\x{2070}\x{00B9}\x{00B2}\x{00B3}\x{2074}-\x{2079}]+/;
my $SUBSCRIPT_RUN   = qr/[\x{2080}-\x{2089}]+/;

# ²³->23
#
# Returns a copy of $s in which every superscript digit has been replaced
# by the corresponding ASCII digit; all other characters are untouched.
sub desuperscript {
    my ($s) = @_;
    $s =~ tr/\x{2070}\x{00B9}\x{00B2}\x{00B3}\x{2074}-\x{2079}/0-9/;
    return $s;
}

# ₂₃->23
#
# Returns a copy of $s in which every subscript digit has been replaced
# by the corresponding ASCII digit; all other characters are untouched.
sub desubscript {
    my ($s) = @_;
    $s =~ tr/\x{2080}-\x{2089}/0-9/;
    return $s;
}

# Returns the line of $s that contains the character ending at offset $pos
# (used to give context in error messages).
sub _context_line {
    my ($s, $pos) = @_;
    my $start = $pos > 0 ? rindex($s, "\n", $pos - 1) + 1 : 0;
    my $end   = index($s, "\n", $pos);
    $end = length($s) if $end < 0;
    return substr($s, $start, $end - $start);
}

# Dies with the standard "Can't convert" message for character $u, which
# ends at offset $pos of $s.  $target is 'LaTeX' or 'ASCII'.  The character
# and the context line are passed as sprintf arguments (not interpolated
# into the format), so a '%' in the input cannot corrupt the message.
sub _die_unconvertible {
    my ($s, $pos, $u, $target) = @_;
    die(sprintf("Can't convert '%s' (U+%04X %s) to %s in:\n%s\n",
                $u, ord($u), charnames::viacode(ord($u)) // '',
                $target, _context_line($s, $pos)));
}

# Substitution table used by utf8_to_LaTeX: one (NFD-normalized) character
# => LaTeX source text.  The COMBINING entries are accent macros that are
# placed in front of the base character.
my %tolatex = (
    # ASCII characters with special TeX/LaTeX semantics
    '\\' => '$\\backslash$',
    "\#" => "\\\#",
    '$'  => '\\$',
    '%'  => '\\%',
    '&'  => '\\&',
    '_'  => '\\_',
    '^'  => '\\^{}',
    '~'  => '\\~{}',
    '{'  => '$\\{$',
    '}'  => '$\\}$',
    '<'  => '$<$',
    '>'  => '$>$',
    '|'  => '$|$',
    '?'  => '?',
    '!'  => '!',
    '-'  => '-',
    '`'  => '\\`{}',
    # non-ASCII characters
    "\N{MICRO SIGN}" => "\$\\mu\$",
    "\N{LATIN SMALL LIGATURE OE}" => "\{\\oe\}",
    "\N{LATIN CAPITAL LIGATURE OE}" => "\{\\OE\}",
    "\N{LATIN SMALL LETTER AE}" => "\{\\ae\}",
    "\N{LATIN CAPITAL LETTER AE}" => "\{\\AE\}",
    # (after NFD these two arrive as A/a + COMBINING RING ABOVE, see below)
    "\N{LATIN SMALL LETTER A WITH RING ABOVE}" => "\{\\aa\}",
    "\N{LATIN CAPITAL LETTER A WITH RING ABOVE}" => "\{\\AA\}",
    "\N{LATIN SMALL LETTER O WITH STROKE}" => "\{\\o\}",
    "\N{LATIN CAPITAL LETTER O WITH STROKE}" => "\{\\O\}",
    "\N{LATIN SMALL LETTER L WITH STROKE}" => "\{\\l\}",
    "\N{LATIN CAPITAL LETTER L WITH STROKE}" => "\{\\L\}",
    "\N{LATIN SMALL LETTER SHARP S}" => "\{\\ss\}",
    # the TeX ligatures for the inverted marks use a backquote: ?` and !`
    "\N{INVERTED QUESTION MARK}" => "?`",
    "\N{INVERTED EXCLAMATION MARK}" => "!`",
    "\N{DAGGER}" => "\{\\dag\}",
    "\N{DOUBLE DAGGER}" => "\{\\ddag\}",
    "\N{BULLET}" => "\$\\bullet\$",
    "\N{SECTION SIGN}" => "\{\\S\}",
    "\N{PILCROW SIGN}" => "\{\\P\}",
    "\N{COPYRIGHT SIGN}" => "\{\\copyright\}",
    "\N{POUND SIGN}" => "\{\\pounds\}",
    "\N{PLUS-MINUS SIGN}" => "\$\\pm\$",
    "\N{NO-BREAK SPACE}" => "\~",
    "\N{PARAGRAPH SEPARATOR}" => "\n\n",
    "\N{GREEK CAPITAL LETTER ALPHA}" => 'A',
    "\N{GREEK CAPITAL LETTER BETA}" => 'B',
    "\N{GREEK CAPITAL LETTER GAMMA}" => '$\\Gamma$',
    "\N{GREEK CAPITAL LETTER DELTA}" => '$\\Delta$',
    "\N{GREEK CAPITAL LETTER EPSILON}" => 'E',
    "\N{GREEK CAPITAL LETTER ZETA}" => 'Z',
    "\N{GREEK CAPITAL LETTER ETA}" => 'H',
    "\N{GREEK CAPITAL LETTER THETA}" => '$\\Theta$',
    "\N{GREEK CAPITAL LETTER IOTA}" => 'I',
    "\N{GREEK CAPITAL LETTER KAPPA}" => 'K',
    "\N{GREEK CAPITAL LETTER LAMDA}" => '$\\Lambda$',
    "\N{GREEK CAPITAL LETTER MU}" => 'M',
    "\N{GREEK CAPITAL LETTER NU}" => 'N',
    "\N{GREEK CAPITAL LETTER XI}" => '$\\Xi$',
    "\N{GREEK CAPITAL LETTER OMICRON}" => 'O',
    "\N{GREEK CAPITAL LETTER PI}" => '$\\Pi$',
    "\N{GREEK CAPITAL LETTER RHO}" => 'P',
    "\N{GREEK CAPITAL LETTER SIGMA}" => '$\\Sigma$',
    "\N{GREEK CAPITAL LETTER TAU}" => 'T',
    "\N{GREEK CAPITAL LETTER UPSILON}" => '$\\Upsilon$',
    "\N{GREEK CAPITAL LETTER PHI}" => '$\\Phi$',
    "\N{GREEK CAPITAL LETTER CHI}" => 'X',
    "\N{GREEK CAPITAL LETTER PSI}" => '$\\Psi$',
    "\N{GREEK CAPITAL LETTER OMEGA}" => '$\\Omega$',
    "\N{GREEK SMALL LETTER ALPHA}" => '$\\alpha$',
    "\N{GREEK SMALL LETTER BETA}" => '$\\beta$',
    "\N{GREEK SMALL LETTER GAMMA}" => '$\\gamma$',
    "\N{GREEK SMALL LETTER DELTA}" => '$\\delta$',
    "\N{GREEK SMALL LETTER EPSILON}" => '$\\epsilon$',
    "\N{GREEK SMALL LETTER ZETA}" => '$\\zeta$',
    "\N{GREEK SMALL LETTER ETA}" => '$\\eta$',
    "\N{GREEK SMALL LETTER THETA}" => '$\\theta$',
    "\N{GREEK SMALL LETTER IOTA}" => '$\\iota$',
    "\N{GREEK SMALL LETTER KAPPA}" => '$\\kappa$',
    "\N{GREEK SMALL LETTER LAMDA}" => '$\\lambda$',
    "\N{GREEK SMALL LETTER MU}" => '$\\mu$',
    "\N{GREEK SMALL LETTER NU}" => '$\\nu$',
    "\N{GREEK SMALL LETTER XI}" => '$\\xi$',
    "\N{GREEK SMALL LETTER OMICRON}" => '$o$',
    "\N{GREEK SMALL LETTER PI}" => '$\\pi$',
    "\N{GREEK SMALL LETTER RHO}" => '$\\rho$',
    "\N{GREEK SMALL LETTER FINAL SIGMA}" => '$\\sigma$',
    "\N{GREEK SMALL LETTER SIGMA}" => '$\\sigma$',
    "\N{GREEK SMALL LETTER TAU}" => '$\\tau$',
    "\N{GREEK SMALL LETTER UPSILON}" => '$\\upsilon$',
    "\N{GREEK SMALL LETTER PHI}" => '$\\phi$',
    "\N{GREEK SMALL LETTER CHI}" => '$\\chi$',
    "\N{GREEK SMALL LETTER PSI}" => '$\\psi$',
    "\N{GREEK SMALL LETTER OMEGA}" => '$\\omega$',
    "\N{MINUS SIGN}" => "\$-\$",
    "\N{ASTERISK OPERATOR}" => "\$\\ast\$",
    "\N{RING OPERATOR}" => "\$\\circ\$",
    "\N{WHITE SQUARE}" => "\$\\Box\$",
    "\N{WHITE MEDIUM SQUARE}" => "\$\\Box\$",
    "\N{LESS-THAN OR EQUAL TO}" => "\$\\le\$",
    "\N{GREATER-THAN OR EQUAL TO}" => "\$\\ge\$",
    "\N{MUCH GREATER-THAN}" => "\$\\gg\$",
    "\N{WHITE DIAMOND}" => "\$\\Diamond\$",
    "\N{WHITE DIAMOND SUIT}" => "\$\\diamondsuit\$",
    "\N{RIGHT TACK}" => "\$\\vdash\$",
    "\N{LEFT TACK}" => "\$\\dashv\$",
    "\N{UP TACK}" => "\$\\bot\$",
    "\N{DOWN TACK}" => "\$\\top\$",
    "\N{EM DASH}" => "{---}",
    "\N{EN DASH}" => "{--}",
    "\N{LEFT DOUBLE QUOTATION MARK}" => "``",
    "\N{RIGHT DOUBLE QUOTATION MARK}" => "''",
    "\N{LEFT SINGLE QUOTATION MARK}" => "\`",
    "\N{RIGHT SINGLE QUOTATION MARK}" => "\'",
    "\N{PRIME}" => "\$\'\$",
    "\N{RIGHTWARDS ARROW}" => "\$\\rightarrow\$",
    "\N{FOR ALL}" => "\$\\forall\$",
    "\N{DEGREE SIGN}" => "\$\^\\circ\$",
    "\N{MIDDLE DOT}" => '$\cdot$',
    "\N{FUNCTION APPLICATION}" => "",
    "\N{MATHEMATICAL ITALIC SMALL A}" => '$a$',
    "\N{MATHEMATICAL ITALIC SMALL B}" => '$b$',
    "\N{MATHEMATICAL ITALIC SMALL C}" => '$c$',
    "\N{MATHEMATICAL ITALIC SMALL D}" => '$d$',
    "\N{MATHEMATICAL ITALIC SMALL E}" => '$e$',
    "\N{MATHEMATICAL ITALIC SMALL F}" => '$f$',
    "\N{MATHEMATICAL ITALIC SMALL G}" => '$g$',
    "\N{PLANCK CONSTANT}" => '$h$',
    "\N{MATHEMATICAL ITALIC SMALL I}" => '$i$',
    "\N{MATHEMATICAL ITALIC SMALL J}" => '$j$',
    "\N{MATHEMATICAL ITALIC SMALL K}" => '$k$',
    "\N{MATHEMATICAL ITALIC SMALL L}" => '$l$',
    "\N{MATHEMATICAL ITALIC SMALL M}" => '$m$',
    "\N{MATHEMATICAL ITALIC SMALL N}" => '$n$',
    "\N{MATHEMATICAL ITALIC SMALL O}" => '$o$',
    "\N{MATHEMATICAL ITALIC SMALL P}" => '$p$',
    "\N{MATHEMATICAL ITALIC SMALL Q}" => '$q$',
    "\N{MATHEMATICAL ITALIC SMALL R}" => '$r$',
    "\N{MATHEMATICAL ITALIC SMALL S}" => '$s$',
    "\N{MATHEMATICAL ITALIC SMALL T}" => '$t$',
    "\N{MATHEMATICAL ITALIC SMALL U}" => '$u$',
    "\N{MATHEMATICAL ITALIC SMALL V}" => '$v$',
    "\N{MATHEMATICAL ITALIC SMALL W}" => '$w$',
    "\N{MATHEMATICAL ITALIC SMALL X}" => '$x$',
    "\N{MATHEMATICAL ITALIC SMALL Y}" => '$y$',
    "\N{MATHEMATICAL ITALIC SMALL Z}" => '$z$',
    "\N{MODIFIER LETTER SMALL C}" => '$^c$',
    "\N{COMBINING GRAVE ACCENT}" => '\`',
    "\N{COMBINING ACUTE ACCENT}" => "\\'",
    "\N{COMBINING CIRCUMFLEX ACCENT}" => '\^',
    "\N{COMBINING TILDE}" => '\~',
    "\N{COMBINING MACRON}" => '\=',
    "\N{COMBINING OVERLINE}" => '\=',
    "\N{COMBINING BREVE}" => '\u ',
    "\N{COMBINING DOT ABOVE}" => '\.',
    "\N{COMBINING DIAERESIS}" => '\"',
    # NFD turns A/a WITH RING ABOVE into base letter + this mark
    "\N{COMBINING RING ABOVE}" => '\r ',
    "\N{COMBINING DOUBLE ACUTE ACCENT}" => '\H ',
    "\N{COMBINING CARON}" => '\v ',
    "\N{COMBINING CEDILLA}" => '\c ',
    "\N{COMBINING DOT BELOW}" => '\d ',
    "\N{COMBINING MACRON BELOW}" => '\b ',
);

# Translates a run of combining marks into the matching LaTeX accent
# macros, outermost (last) mark first.  Dies if a mark is not in %tolatex.
sub _latex_accents {
    my ($marks) = @_;
    return join('', map($tolatex{$_} // die("combining $_"),
                        reverse(split('', $marks))));
}

# Convert Unicode strings to the equivalent LaTeX representation
# (to be extended as the need arises)
#
# $s       is the string to convert; undef is returned unchanged.
# $linesep is an optional parameter for the amount of vspace
#          inserted by a LINE SEPARATOR that is not preceded by punctuation
#          (used to set titles)
#
# Returns the LaTeX source text.  Dies with a "Can't convert ..." message
# on a character that has no conversion yet, and with "combining ..." on
# an unsupported combining mark.
sub utf8_to_LaTeX {
    my ($s, $linesep) = @_;
    return $s unless defined $s;
    $s = NFD($s);  # Normalization Form D
    my @s;
    # Tokenize from left to right; every branch matches at \G and uses /gc
    # so that pos($s) only ever moves forward.
    while ($s !~ /\G\z/) {
        if ($s =~ /\G([\n 0-9a-z\@A-Z\.,;:\(\)\[\]\+\*=\/]+)(?!\pM)/gc) {
            # pass through ASCII letters, digits, most punctuation and
            # LF but not any metacharacters or punctuation characters
            # that appear in ligatures, and no character followed by a
            # combining character
            push @s, $1;
        } elsif ($s =~ /\Gi(\pM+)/gc) {
            # handle combining characters after 'i' (uses dotless \i)
            my $marks = $1;
            push @s, '{', _latex_accents($marks), '\i}';
        } elsif ($s =~ /\G(\PM)(\pM+)/gc) {
            # handle other combining characters; copy the captures before
            # recursing, since the nested call runs its own matches
            my ($base, $marks) = ($1, $2);
            push @s, '{', _latex_accents($marks), utf8_to_LaTeX($base), '}';
        } elsif ($s =~ /\G(-+)/gc) {
            # disarm ASCII ligatures --, ---
            push @s, join('{}', split('', $1));
        } elsif ($s =~ /\G(.)/ && defined(my $subst = $tolatex{$1})) {
            # table-based substitution (peeked above without /g, so now
            # consume the character)
            $s =~ /\G(.)/gc;
            push @s, $subst;
        } elsif ($s =~ /\G($SUPERSCRIPT_RUN)/gc) {
            # prefix a sequence of superscripts with ^, as in $2^{32}$
            push @s, '$^{', desuperscript($1), '}$';
        } elsif ($s =~ /\G($SUBSCRIPT_RUN)/gc) {
            # prefix a sequence of subscripts with _, as in $a_{12}$
            push @s, '$_{', desubscript($1), '}$';
        } elsif ($s =~ /\G(?<=[\p{Pd}\.:,;?!])\N{LINE SEPARATOR}/gc) {
            # line break directly after punctuation: never add extra space
            push @s, "\\\\\\relax\n";
        } elsif ($s =~ /\G\N{LINE SEPARATOR}/gc) {
            if ($linesep) {
                push @s, '\\\\[' . $linesep . "]\n";
            } else {
                push @s, "\\\\\\relax\n";
            }
        } elsif ($s =~ /\G(.)/gc) {
            # encountered a Unicode character not yet implemented
            _die_unconvertible($s, pos($s), $1, 'LaTeX');
        } else {
            # this should never happen
            die("utf8_to_LaTeX failed after\n", @s);
        }
    }
    $s = join('', @s);
    # some optimizations:
    # remove $$ unless this might extend macronames
    # NOTE(review): the substitution that implemented this appears to have
    # been commented out, and its text was lost from the copy of the file
    # this was restored from; it is therefore not applied here.  Confirm
    # against the upstream UniConv.pm.
    return $s;
}

# Substitution table used by utf8_to_ascii: one (NFD-normalized) character
# => ASCII fallback text.
my %toascii = (
    # NOTE(review): this key was lost from the copy of the file this was
    # restored from; POUND SIGN is inferred from the value -- confirm.
    "\N{POUND SIGN}" => "GBP",
    "\N{MICRO SIGN}" => "u",
    "\N{LATIN SMALL LIGATURE OE}" => "oe",
    "\N{LATIN CAPITAL LIGATURE OE}" => "OE",
    "\N{LATIN SMALL LETTER AE}" => "ae",
    "\N{LATIN CAPITAL LETTER AE}" => "AE",
    "\N{LATIN SMALL LETTER A WITH RING ABOVE}" => "aa",
    "\N{LATIN CAPITAL LETTER A WITH RING ABOVE}" => "AA",
    "\N{LATIN SMALL LETTER O WITH STROKE}" => "o",
    "\N{LATIN CAPITAL LETTER O WITH STROKE}" => "O",
    "\N{LATIN SMALL LETTER L WITH STROKE}" => "l",
    "\N{LATIN CAPITAL LETTER L WITH STROKE}" => "L",
    "\N{LATIN SMALL LETTER SHARP S}" => "ss",
    "\N{INVERTED QUESTION MARK}" => "\?",
    "\N{INVERTED EXCLAMATION MARK}" => "\!",
    "\N{DAGGER}" => "\+",
    "\N{DOUBLE DAGGER}" => "\+\+",
    "\N{BULLET}" => "*",
    "\N{MIDDLE DOT}" => "*",
    "\N{COPYRIGHT SIGN}" => "(c)",
    "\N{PLUS-MINUS SIGN}" => "+\/-",
    "\N{NO-BREAK SPACE}" => " ",
    "\N{LINE SEPARATOR}" => "\n",
    "\N{PARAGRAPH SEPARATOR}" => "\n\n",
    "\N{GREEK CAPITAL LETTER ALPHA}" => "A",
    "\N{GREEK CAPITAL LETTER BETA}" => "B",
    "\N{GREEK CAPITAL LETTER GAMMA}" => "Gamma",
    "\N{GREEK CAPITAL LETTER DELTA}" => "Delta",
    "\N{GREEK CAPITAL LETTER EPSILON}" => "E",
    "\N{GREEK CAPITAL LETTER ZETA}" => "Z",
    "\N{GREEK CAPITAL LETTER ETA}" => "H",
    "\N{GREEK CAPITAL LETTER THETA}" => "Theta",
    "\N{GREEK CAPITAL LETTER IOTA}" => "I",
    "\N{GREEK CAPITAL LETTER KAPPA}" => "K",
    "\N{GREEK CAPITAL LETTER LAMDA}" => "Lambda",
    "\N{GREEK CAPITAL LETTER MU}" => "M",
    "\N{GREEK CAPITAL LETTER NU}" => "N",
    "\N{GREEK CAPITAL LETTER XI}" => "Xi",
    "\N{GREEK CAPITAL LETTER OMICRON}" => "O",
    "\N{GREEK CAPITAL LETTER PI}" => "Pi",
    "\N{GREEK CAPITAL LETTER RHO}" => "P",
    "\N{GREEK CAPITAL LETTER SIGMA}" => "Sigma",
    "\N{GREEK CAPITAL LETTER TAU}" => "T",
    "\N{GREEK CAPITAL LETTER UPSILON}" => "Y",
    "\N{GREEK CAPITAL LETTER PHI}" => "Phi",
    "\N{GREEK CAPITAL LETTER CHI}" => "X",
    "\N{GREEK CAPITAL LETTER PSI}" => "Psi",
    "\N{GREEK CAPITAL LETTER OMEGA}" => "Omega",
    "\N{GREEK SMALL LETTER ALPHA}" => "alpha",
    "\N{GREEK SMALL LETTER BETA}" => "beta",
    "\N{GREEK SMALL LETTER GAMMA}" => "gamma",
    "\N{GREEK SMALL LETTER DELTA}" => "delta",
    "\N{GREEK SMALL LETTER EPSILON}" => "epsilon",
    "\N{GREEK SMALL LETTER ZETA}" => "zeta",
    "\N{GREEK SMALL LETTER ETA}" => "eta",
    "\N{GREEK SMALL LETTER THETA}" => "theta",
    "\N{GREEK SMALL LETTER IOTA}" => "iota",
    "\N{GREEK SMALL LETTER KAPPA}" => "kappa",
    "\N{GREEK SMALL LETTER LAMDA}" => "lambda",
    "\N{GREEK SMALL LETTER MU}" => "mu",
    "\N{GREEK SMALL LETTER NU}" => "nu",
    "\N{GREEK SMALL LETTER XI}" => "xi",
    "\N{GREEK SMALL LETTER OMICRON}" => "o",
    "\N{GREEK SMALL LETTER PI}" => "pi",
    "\N{GREEK SMALL LETTER RHO}" => "rho",
    "\N{GREEK SMALL LETTER FINAL SIGMA}" => "sigma",
    "\N{GREEK SMALL LETTER SIGMA}" => "sigma",
    "\N{GREEK SMALL LETTER TAU}" => "tau",
    "\N{GREEK SMALL LETTER UPSILON}" => "upsilon",
    "\N{GREEK SMALL LETTER PHI}" => "phi",
    "\N{GREEK SMALL LETTER CHI}" => "chi",
    "\N{GREEK SMALL LETTER PSI}" => "psi",
    "\N{GREEK SMALL LETTER OMEGA}" => "omega",
    "\N{MINUS SIGN}" => "-",
    "\N{ASTERISK OPERATOR}" => "\*",
    "\N{RING OPERATOR}" => "o",
    "\N{LESS-THAN OR EQUAL TO}" => "<=",
    "\N{GREATER-THAN OR EQUAL TO}" => ">=",
    "\N{MUCH GREATER-THAN}" => ">>",
    "\N{WHITE DIAMOND}" => "<>",
    "\N{WHITE DIAMOND SUIT}" => "<>",
    "\N{WHITE SQUARE}" => "[]",
    "\N{WHITE MEDIUM SQUARE}" => "[]",
    "\N{RIGHT TACK}" => "|-",
    "\N{LEFT TACK}" => "-|",
    "\N{UP TACK}" => "_|_",
    "\N{DOWN TACK}" => "T",
    "\N{EM DASH}" => "--",
    "\N{EN DASH}" => "-",
    "\N{LEFT DOUBLE QUOTATION MARK}" => "\"",
    "\N{RIGHT DOUBLE QUOTATION MARK}" => "\"",
    "\N{LEFT SINGLE QUOTATION MARK}" => "\'",
    "\N{RIGHT SINGLE QUOTATION MARK}" => "\'",
    "\N{PRIME}" => "\'",
    "\N{RIGHTWARDS ARROW}" => "->",
    "\N{FOR ALL}" => "forall",
    "\N{DEGREE SIGN}" => "deg",
    "\N{FUNCTION APPLICATION}" => "",
    "\N{MATHEMATICAL ITALIC SMALL A}" => 'a',
    "\N{MATHEMATICAL ITALIC SMALL B}" => 'b',
    "\N{MATHEMATICAL ITALIC SMALL C}" => 'c',
    "\N{MATHEMATICAL ITALIC SMALL D}" => 'd',
    "\N{MATHEMATICAL ITALIC SMALL E}" => 'e',
    "\N{MATHEMATICAL ITALIC SMALL F}" => 'f',
    "\N{MATHEMATICAL ITALIC SMALL G}" => 'g',
    "\N{PLANCK CONSTANT}" => 'h',
    "\N{MATHEMATICAL ITALIC SMALL I}" => 'i',
    "\N{MATHEMATICAL ITALIC SMALL J}" => 'j',
    "\N{MATHEMATICAL ITALIC SMALL K}" => 'k',
    "\N{MATHEMATICAL ITALIC SMALL L}" => 'l',
    "\N{MATHEMATICAL ITALIC SMALL M}" => 'm',
    "\N{MATHEMATICAL ITALIC SMALL N}" => 'n',
    "\N{MATHEMATICAL ITALIC SMALL O}" => 'o',
    "\N{MATHEMATICAL ITALIC SMALL P}" => 'p',
    "\N{MATHEMATICAL ITALIC SMALL Q}" => 'q',
    "\N{MATHEMATICAL ITALIC SMALL R}" => 'r',
    "\N{MATHEMATICAL ITALIC SMALL S}" => 's',
    "\N{MATHEMATICAL ITALIC SMALL T}" => 't',
    "\N{MATHEMATICAL ITALIC SMALL U}" => 'u',
    "\N{MATHEMATICAL ITALIC SMALL V}" => 'v',
    "\N{MATHEMATICAL ITALIC SMALL W}" => 'w',
    "\N{MATHEMATICAL ITALIC SMALL X}" => 'x',
    "\N{MATHEMATICAL ITALIC SMALL Y}" => 'y',
    "\N{MATHEMATICAL ITALIC SMALL Z}" => 'z',
    "\N{MODIFIER LETTER SMALL C}" => '^c',
);

# Convert Unicode strings to an equivalent ASCII fallback representation
# (to be extended as the need arises)
#
# $s is the string to convert; undef is returned unchanged.
#
# Returns the ASCII text.  Dies with a "Can't convert ..." message on a
# character that has no conversion yet.
sub utf8_to_ascii {
    my ($s) = @_;
    return $s unless defined $s;
    $s = NFD($s);  # Normalization Form D
    my @s;
    # Same left-to-right \G tokenizer as in utf8_to_LaTeX.
    while ($s !~ /\G\z/) {
        if ($s =~ /\G([ -~\n]+)/gc) {
            # pass through printable ASCII and LF
            push @s, $1;
        } elsif ($s =~ /\G(.)/ && defined(my $subst = $toascii{$1})) {
            # table-based substitution (peeked above without /g, so now
            # consume the character)
            $s =~ /\G(.)/gc;
            push @s, $subst;
        } elsif ($s =~ /\G[\p{Mn}\p{Me}]+/gc) {
            # remove combining characters
        } elsif ($s =~ /\G($SUPERSCRIPT_RUN)/gc) {
            # prefix a sequence of superscripts with ^, as in "2^32"
            push @s, '^', desuperscript($1);
        } elsif ($s =~ /\G($SUBSCRIPT_RUN)/gc) {
            # prefix a sequence of subscripts with _, as in "a_12"
            push @s, '_', desubscript($1);
        } elsif ($s =~ /\G(.)/gc) {
            # encountered a Unicode character not yet implemented
            _die_unconvertible($s, pos($s), $1, 'ASCII');
        } else {
            # this should never happen
            die("utf8_to_ascii failed after\n", @s);
        }
    }
    return join('', @s);
}

1;