#!/usr/bin/perl
# Emacs: -*- mode: perl; perl-indent-level: 8; indent-tabs-mode: t; -*-
#
# Convert a text file to (basic) HTML.
#
# Usage:
# convert.pl [options] [source-pattern... dest-pattern]
# Options:
# -charset charset-id
# Causes a <meta> tag to be added to the HTML header
# indicating the given character set for the file.
# -font font-family
# Causes the given font family to be included as the font
# family in the CSS style for the <body> tag. Multiple fonts
# can be separated with commas, e.g.:
# -font 'Times New Roman, serif'
# -fontsize size
# Causes the given size in points to be specified in the CSS
# style for the <body> tag. If omitted, the font size style
# rule will be omitted.
# -fontmono font-family
# Causes the given font family to be included as the font
# family in the CSS style for monospace text (within |...|
# quotes). Multiple fonts can be separated with commas.
# -fontmonosize size
# Causes the given size in points to be specified in the CSS
# style for monospace text. If omitted, the font size style
# rule will be omitted.
# -japanese
# Assume Japanese EUC-JP input, and process furigana
# constructs "{basetext:furigana}" using the XHTML 1.1
# <ruby> tag. Generates XHTML 1.1 instead of HTML.
# -title title-string
# Specifies the title to use for the HTML file. Can include
# any of the following tags:
# %t The title given in the file, if any.
# %i The base input filename.
# %I The input filename, including any path.
# %o The base output filename.
# %O The output filename, including any path.
# The default is "%t".
# -notitle
# Specifies that the HTML file should not be given a title.
# -header header-string
# -trailer trailer-string
# Specifies the HTML code to insert:
# -header: after the opening <body> tag and before the title
# -trailer: after the text and before the closing </body> tag
# Can include any of the following tags:
# %p The base name of the previous output file,
# or empty if this is the first file.
# %P(...) Include the text "..." iff %p is not empty.
# Parentheses in "..." must be escaped
# with "\".
# %n The base name of the next output file, or
# empty if this is the last file.
# %N(...) Include the text "..." iff %n is not empty.
# The default for both options is "" (the empty string).
# -noheader
# -notrailer
# Specifies that no header or trailer should be written (the
# default).
# -separator separator-string
# Specifies the HTML code to insert for a section separator.
# The default is "* * *" followed by a newline.
# -regex match-regex
# Specifies that the dest-pattern is a substitution string
# for the given match-regex.
# -inplace
# Allow in-place file modification. The default is to not
# allow it.
# -blockquote
# Allow block-quote recognition (set off by larger indents
# than the first paragraph indent and maybe blank lines).
# -touch
# Copy the source file's modification time to the output
# file.
# -merge
# Output the full text as a single HTML file, rather than a
# separate output file for each input file.
#
# Normal operation is to read stdin and write to stdout. If one or more
# source-patterns and a dest-pattern are given, they specify that, instead,
# text should be read from each file matching each source-pattern
# (standard shell wildcards; can also be plain filenames, meaning shell
# globbing is fine too) and placed in a file whose name is taken from
# applying the dest-pattern to each source filename. The dest-pattern can
# contain any of the following tags if -regex is not given:
# %f The source filename (everything after the last "/", or the
# whole string if there is no "/").
# %p The path to the source file (everything up to and including
# the final "/", or empty if there is no "/").
# %n The name (without extension) of the source file (everything
# in %f up to but not including the last ".", or the whole
# string if there is no ".").
# %e The extension of the source file (everything in %f after
# the last ".", or empty if there is no ".").
# If the -regex option is given, the destination filename is computed as
# "s/match-regex/dest-pattern/" and the tags above are ignored.
#
# If a file ".convert" is present in the current directory, options will
# be read from it for the defaults, one option (and argument) per line
# (arguments do not need to be quoted even if they contain spaces). They
# can be overridden by the command line. #'s in the file must be escaped
# or they will be interpreted as comments. Newlines can be included in
# arguments by ending a line with a \ (backslash).
#
###########################################################################
# Built-in defaults for every option that has one.  These are package
# globals: the option parser and the subs below read and assign them.
$title = "%t";
$header = "";
$trailer = "";
# NOTE(review): this literal is just "* * *" plus a newline; any HTML
# markup it may once have carried is not present in this copy -- confirm
# against an intact copy of the script.
$separator = "* * *
";
$font = undef;
$fontsize = undef;
$fontmono = undef;
$fontmonosize = undef;
$jp_mode = 0;

# Read default options from ./.convert, if it exists.  Each logical line
# holds one option and, optionally, its argument; a trailing backslash
# joins the next physical line with an embedded newline.  A missing or
# unreadable file is silently ignored.
if (open(my $conf, "<", ".convert")) {
	my @conf_args = ();
	while (defined(my $line = <$conf>)) {
		# chomp, not chop: a final line without a newline must not lose
		# its last character.
		chomp($line);
		# Continuation lines: drop the backslash and append the next
		# physical line.  At end of file only the backslash is dropped.
		while ($line =~ /\\\z/) {
			chop($line);
			my $more = <$conf>;
			last unless defined $more;
			chomp($more);
			$line .= "\n" . $more;
		}
		# Strip a comment: the first "#" not preceded by a backslash,
		# together with the whitespace in front of it.  (The character
		# before the "#" is no longer swallowed when it is not blank.)
		# NOTE(review): the backslash of an escaped "\#" is kept in the
		# value, as before -- confirm whether it should be removed.
		$line =~ s/\s*(?<!\\)#.*//;
		$line =~ s/^\s+//;
		next unless $line;
		# Split into the option word and the rest of the line.
		my ($opt, $gap, $arg) = $line =~ /\A(\S+)([ \t]*)(.*)\z/s;
		push(@conf_args, $opt);
		# A bare flag contributes only itself; pushing it twice (as the
		# old code did) is unnecessary.  "opt " with nothing after the
		# blank still yields an empty-string argument.
		push(@conf_args, $arg) if length($gap) || length($arg);
	}
	close($conf);
	# Put them in front of the real command line, in file order, so that
	# command-line options are parsed later and therefore win.
	unshift(@ARGV, @conf_args);
}
# Parse command line
#
# Leading arguments that start with "-" are options.  On exit $i is the
# index of the first non-option argument (or $#ARGV + 1 if there is none);
# the code that follows relies on that value.
#
# Options that take a value, mapped to the global that receives it.
my %value_option = (
	"-charset"      => \$charset,
	"-font"         => \$font,
	"-fontsize"     => \$fontsize,
	"-fontmono"     => \$fontmono,
	"-fontmonosize" => \$fontmonosize,
	"-title"        => \$title,
	"-header"       => \$header,
	"-trailer"      => \$trailer,
	"-separator"    => \$separator,
	"-regex"        => \$match_regex,
);
# Options that take no value, mapped to the action they perform.
my %flag_option = (
	"-japanese"   => sub { $jp_mode = 1; $charset = "EUC-JP"; },
	"-notitle"    => sub { $title = ""; },
	"-noheader"   => sub { $header = ""; },
	"-notrailer"  => sub { $trailer = ""; },
	"-inplace"    => sub { $allow_inplace = 1; },
	"-blockquote" => sub { $allow_blockquote = 1; },
	"-touch"      => sub { $touch = 1; },
	"-merge"      => sub { $merge = 1; },
);
$i = 0;
while ($i <= $#ARGV && $ARGV[$i] =~ /^-/) {
	my $option = $ARGV[$i];
	if (exists $value_option{$option}) {
		# Previously a value option given as the very last argument
		# silently stored undef and the script dropped into stdin mode.
		die "Option $option requires an argument\n" if $i >= $#ARGV;
		${ $value_option{$option} } = $ARGV[++$i];
		# -regex also switches destination naming to regex mode.
		$is_regex = 1 if $option eq "-regex";
	} elsif (exists $flag_option{$option}) {
		$flag_option{$option}->();
	} else {
		die "Unknown option $option\n";
	}
	$i++;
}
# Expand the navigation tags of a -header/-trailer string.
#   $text  the header or trailer template
#   $prev  base name of the previous output file, or "" if there is none
#   $next  base name of the next output file, or "" if there is none
# Returns the expanded string.  %P(...) / %N(...) are replaced by their
# contents (with "\(" and "\)" unescaped) when the corresponding name is
# non-empty and removed otherwise; %p / %n are then replaced by the names.
# Each conditional is handled on its own, so processing one never disturbs
# the escapes inside another, and both "\(" and "\)" are accepted inside.
sub expand_nav_tags
{
	my ($text, $prev, $next) = @_;
	$prev = "" unless defined $prev;
	$next = "" unless defined $next;
	my %name_for = (p => $prev, n => $next);
	my $conditional = sub {
		my ($name, $body) = @_;
		return "" if $name eq "";
		$body =~ s/\\([()])/$1/g;
		return $body;
	};
	# Inside the parentheses: anything except ")" and "\", or a backslash
	# followed by any character.
	$text =~ s/%([PN])\(((?:[^)\\]|\\.)*)\)/$conditional->($name_for{lc $1}, $2)/gse;
	# Single pass, so a file name that itself contains "%n" is not
	# expanded a second time.
	$text =~ s/%([pn])/$name_for{$1}/g;
	return $text;
}

# Main dispatch.  $i is the index of the first non-option argument.
if ($i < $#ARGV) {
	# File mode: every argument but the last is a source pattern, the
	# last one is the destination pattern.
	$srcpatt = join(" ", @ARGV[$i .. $#ARGV - 1]);
	$destpatt = $ARGV[$#ARGV];
	# glob() splits its argument on whitespace, so all patterns can be
	# expanded in one call (patterns containing blanks are not supported).
	@files = glob($srcpatt);
	for ($i = 0; $i <= $#files; $i++) {
		my ($h, $t);
		if ($merge) {
			print STDERR "Converting $files[$i]\n";
			# parse() is told where it is in the merged output through
			# $dest: the literal output name for the first file, "" for
			# the last one and undef for those in between.
			# NOTE(review): with a single input file $dest is the output
			# name, not "", so parse() never sees the "last file" value
			# -- confirm that the trailer is still written in that case.
			$dest = ($i==0 ? $destpatt : $i==$#files ? "" : undef);
			# There are no neighbouring files in a merged document, so
			# the navigation tags expand to nothing.  Only the first
			# file carries the header and only the last the trailer.
			$h = $i == 0       ? expand_nav_tags($header, "", "")  : "";
			$t = $i == $#files ? expand_nav_tags($trailer, "", "") : "";
		} else {
			$dest = make_dest($files[$i], $destpatt, $is_regex);
			if (!$allow_inplace && $dest eq $files[$i]) {
				die "$files[$i]: source and destination filenames are the same\n";
			}
			print STDERR "Converting $files[$i] to $dest\n";
			# Base names of the neighbouring output files, or "".
			my ($prev, $next) = ("", "");
			if ($i > 0) {
				$prev = make_dest($files[$i-1], $destpatt, $is_regex);
				$prev =~ s{.*/}{};
			}
			if ($i < $#files) {
				$next = make_dest($files[$i+1], $destpatt, $is_regex);
				$next =~ s{.*/}{};
			}
			$h = expand_nav_tags($header, $prev, $next);
			$t = expand_nav_tags($trailer, $prev, $next);
		}
		parse($title, $h, $t, $separator, $files[$i], $dest);
	}
} elsif ($i == $#ARGV) {
	die "Destination pattern required if source pattern given\n";
} else {
	# Filter mode (stdin to stdout): there is no previous or next file, so
	# the navigation tags are stripped -- from the header as well as the
	# trailer.
	parse($title, expand_nav_tags($header, "", ""),
	      expand_nav_tags($trailer, "", ""), $separator);
}
###########################################################################
# Compute the destination filename for one source file.
#   $file      source filename, possibly with a path
#   $destpatt  destination pattern given on the command line
#   $is_regex  true if -regex was given
# Returns the destination filename.
#
# Without -regex the tags %p (path up to and including the last "/", or
# empty), %f (filename), %n (filename without its last extension) and %e
# (last extension, or empty) are expanded in $destpatt.
#
# With -regex the source name is matched against the global $match_regex
# and $destpatt is then interpolated like a double-quoted string, so it can
# refer to capture groups as $1 or \1.
sub make_dest
{
	my ($file,$destpatt,$is_regex) = @_;
	my $dest;
	if ($is_regex) {
		# Accept sed-style \1..\9 as well as $1..$9 -- every occurrence,
		# not just the first one.
		$destpatt =~ s/\\([0-9])/\$$1/g;
		$destpatt = '"'.$destpatt.'"';
		# NOTE(review): if the match fails, the capture variables seen by
		# the eval below are stale; the result is then meaningless.
		$file =~ /$match_regex/;
		# NOTE(review): string eval of a command-line/.convert value.
		# This is the user's own input, but it executes arbitrary Perl
		# and breaks on an unescaped double quote in the pattern.
		$dest = eval $destpatt;
	} else {
		# Split into directory part (with trailing "/") and file name.
		my ($path, $base) = $file =~ m{\A(.*/)?(.*)\z}s;
		$path = "" unless defined $path;
		# Split the file name at its last ".".
		my ($name, $ext) = ($base, "");
		if ($base =~ /\A(.*)\.(.*)\z/s) {
			($name, $ext) = ($1, $2);
		}
		my %value_for = (p => $path, f => $base, n => $name, e => $ext);
		# One pass over the pattern, so tag-like text inside a
		# substituted value is left alone.
		$dest = $destpatt;
		$dest =~ s/%([pfne])/$value_for{$1}/g;
	}
	return $dest;
}
###########################################################################
# Convert one input text to HTML.
#
# Arguments:
#   $title      title template; when the first line is recognized as a
#               title, the tags %t %i %o %I %O are expanded in it
#   $header     HTML printed right after the document header, if non-empty
#   $trailer    HTML printed after the text, if non-empty
#   $separator  HTML printed for a section separator
#   $infile     input filename; if false, the stdin branch below is taken
#   $outfile    output filename, or "-" for stdout
#
# Reads the globals $touch, $merge and $jp_mode, and assigns several
# undeclared (package) variables such as $mtime, $oldhandle and $print_sep.
#
# NOTE(review): this copy of the sub is damaged and will not compile:
#   - both "@file = ;" assignments have lost their right-hand side
#     (presumably a read of all lines from IN and from stdin);
#   - the substitutions under "Add HTML escapes" and "HTMLize punctuation"
#     and several print statements look as if their HTML markup and
#     entities were stripped;
#   - the furigana substitution is cut off in mid-statement, and everything
#     between it and the trailer output (the rest of the paragraph
#     handling and the end of the main loop) is missing.
# Restore these parts from an intact copy before changing any logic here.
sub parse
{
my ($title,$header,$trailer,$separator,$infile,$outfile) = @_;
my (@file,$inbase,$outbase);
if ($infile) {
open(IN, $infile) || die "$infile: $!\n";
# Remember the source's modification time; it is copied to the output
# file at the end of this sub when -touch is in effect.
if ($touch) {
$mtime = (stat(IN))[9];
}
@file = ;
close(IN);
$inbase = $infile;
$inbase =~ s#.*/(.*)#$1#;
if ($outfile) {
$outbase = $outfile;
$outbase =~ s#.*/(.*)#$1#;
if ($outfile eq "-") {
open(OUT, ">&STDOUT") || die "STDOUT: $!\n";
} else {
open(OUT, ">$outfile") || die "$outfile: $!\n";
}
}
# OUT is selected even when no $outfile was given.  In merge mode it is
# presumably still open from an earlier call, which leaves it open on
# purpose (see the end of this sub).
$oldhandle = select(OUT);
} else {
@file = ;
$oldhandle = select(STDOUT);
}
# Parse text. Rules:
# A paragraph begins with at least one space or a preceding
# blank line.
# A section ends with at least one more blank line than between
# paragraphs, or at a line containing only whitespace and any of the
# characters `~' `*' `-' `_' `=' `.' (but at least one non-whitespace
# character).
# Emphasis is marked by text surrounded by _underscores_ or
# *asterisks*. Emphasized text is printed in italics, except when in
# italicized text, in which case it is printed un-italicized.
# Text to be italicized is surrounded by /slashes/ or . Italicized sections cannot be nested.
# Text surrounded by |vertical bars| is rendered in a monospace
# font.
# In Japanese mode only, furigana is indicated by putting it
# after its base text separated by a colon and surrounding the
# entire construct by {curly brackets}: {basetext:furigana}
# Text (within a paragraph) surrounded by *** will be displayed
# in red (e.g. for editing).
# If the second line consists of only whitespace and (dashes or
# equals signs), the first line is a title.
# If block quote parsing is enabled, a block quote is recognized
# by zero or more extra blank lines, followed by one or more
# paragraphs with a larger indent than the first paragraph, followed
# by zero or more extra blank lines, followed by a paragraph with
# normal or less indenting.
# In Japanese mode, blank lines that do not end paragraphs are
# deleted.
#
# NOTE(review): the second alternative for italics in the rule above has
# been lost from the comment.  Also, the title test in the loop below
# accepts only whitespace and dashes on the second line, not equals
# signs -- confirm which of the two is intended.
my $between_para = -1; # Number of blank lines between paragraphs
my $blank_count = 0; # Number of consecutive blank lines seen
my $para = ""; # Current paragraph text
my $line = 0; # Line number
my $found_body = 0; # Have we found body text yet?
my $in_blockquote = 0; # Are we in a block quote?
my $normal_indent = -1; # Size of normal indent
my $block_indent = -1; # Block quote indent string
my $initspace = '\s'; # Regex for initial spaces (different for JP)
my $do_last_line = 1;
$initspace = '(\s|\xA1\xA1)' if $jp_mode;
# $do_last_line makes the loop body run once more after the input is
# exhausted, with $_ false -- presumably so that the final paragraph is
# flushed.
while (($_ = $file[$line]) || $do_last_line--) {
$line++;
# If this is the second line, we know whether the first
# line was a title or not, and we can output the header.
if ($line == 2) {
if (!(/[^ \r\t\n\f\-]/)) {
chop($para);
$para =~ s/^\s*//;
$para =~ s/\s*$//;
$title =~ s/%t/$para/;
$title =~ s/%i/$inbase/;
$title =~ s/%o/$outbase/;
$title =~ s/%I/$infile/;
$title =~ s/%O/$outfile/;
$title =~ s/^\s*//;
$title =~ s/\s*$//;
if ($outfile || !$merge) {
&print_html_header($title);
print "\n";
print "$header\n" if $header;
print "\n";
print "$para
\n\n";
} else {
print "$para
\n\n";
}
$para = "";
$found_body = 0;
$normal_indent = -1;
next;
} else {
$title =~ s/^\s*//;
$title =~ s/\s*$//;
if ($outfile || !$merge) {
&print_html_header($title);
print "\n";
print "$header\n" if $header;
} else {
print "";
}
}
}
# Separator line
if ($line > 1 && /\S/ && !/[^\s~*\-_=.]/) {
if ($between_para >= 0 &&
$blank_count <= $between_para) {
$print_sep = 2;
$blank_count = $between_para+1;
}
# End of a paragraph
} elsif ($line > 1 && (!(/\S/) || (/^${initspace}/ &&
(!$in_blockquote || /^$block_indent/)))) {
$input = $_; # save it for later
if ($para =~ /\S/) {
# Print separator first if needed
print "$separator\n" if $print_sep == 1
&& !$in_blockquote;
$print_sep-- if $print_sep > 0;
# Fix up punctuation
$para =~ s/\s+$//; # zaps newline too
$para =~ s/--\s/--/g;
$para =~ s/\s--/--/g;
$para =~ s/---+/--/g;
$para .= "\n"; # put newline back
$para =~ s/\.\.\.(["\n])/ . . .$1/g;
$para =~ s/\.\.\.(?!\.)/ . . . /g;
$para =~ s/ (\. \. \.\W*)$/ $1/g;
$para =~ s/\.([^0-9 \t\r\n\f\"\'&])/. $1/g;
# Add HTML escapes
$para =~ s/&(?!nbsp;)/&/g;
$para =~ s/</g;
$para =~ s/>/>/g;
# HTMLize punctuation
$para =~ s/--/—/g;
$para =~ s/`/‘/g;
$para =~ s/'/’/g;
$para =~ s/"([^"]*)"/“$1”/g;
$para =~ s/"/“/g;
# Do italics (must precede any other tag-inserting operations)
@para = split(/\//, $para);
$ital_on = -1;
$para = "";
foreach (@para) {
if ($ital_on > 0) {
$para .= "";
$on = "";
$off = "";
} else {
if ($ital_on < 0) {
$ital_on = 0;
} else {
$para .= "";
}
$on = "";
$off = "";
}
s/[_*]([^_*]*)[_*]/$on$1$off/g;
$para .= $_;
$ital_on = !$ital_on;
}
if (!$ital_on) {
chop($para);
$para .= "\n";
}
# Do monospace
$para =~ s/\|([^|]*)(?:$|\|)/$1<\/span>/g;
# Furigana (Japanese mode only)
if ($jp_mode) {
while ($para =~ s/\{([^:\}]+):([^:\}]+)\}/$1<\/rb>\n";
# The trailer and the closing output are written unless merging; when
# merging, only for a call whose $outfile is defined but empty.
if (!$merge || (defined($outfile) && $outfile eq "")) {
print "$trailer\n" if $trailer;
print "\n\n";
}
# All done!
if ($outfile) {
if (!$merge) { # save filehandle for later if merging
close(OUT);
}
# utime takes (atime, mtime, files): the access time becomes "now" and
# the modification time is the one saved from the source file.
if ($touch) {
utime time(), $mtime, $outfile;
}
}
select($oldhandle);
}
###########################################################################
# Print the document header to the currently selected output handle.
#   $title  document title; printed (HTML-escaped) only if it is true
# Reads the globals $jp_mode and $charset.
#
# NOTE(review): apart from the title line, every print below emits only
# newlines in this copy; the markup these literals presumably carried
# (document type, head section, charset declaration, title element) is
# missing and must be restored from an intact copy of the script.  The
# literals are left exactly as found.
sub print_html_header
{
	my ($title) = @_;
	if ($jp_mode) {
		print "\n";
		print "\n";
		print "\n";
	} else {
		print "\n";
		print "\n";
	}
	print "\n";
	if ($charset && !$jp_mode) {
		print "\n";
	}
	print "\n";
	if ($title) {
		# Escape the characters that are special in HTML text.  "&" must
		# be handled first so that the entities produced for "<" and ">"
		# are not escaped again.
		$title =~ s/&/&amp;/g;
		$title =~ s/</&lt;/g;
		$title =~ s/>/&gt;/g;
		print "$title\n";
	}
	print "\n";
	print "\n\n";
}