#!/usr/bin/perl
# Emacs: -*- mode: perl; perl-indent-level: 4; -*-
#
# Convert a text file to (basic) HTML.
#
# Usage:
#     convert.pl [options] [source-pattern... dest-pattern]
# Options:
#     -charset charset-id
#         Causes a <meta http-equiv="Content-Type"> tag to be added to
#         the HTML header indicating the given character set for the
#         file.
#     -font font-family
#         Causes the given font family to be included as the font
#         family in the CSS style for the <body> tag.  Multiple fonts
#         can be separated with commas, e.g.:
#             -font 'Times New Roman, serif'
#     -japanese
#         Assume Japanese EUC-JP input, and process furigana
#         constructs "{basetext:furigana}" using the XHTML 1.1
#         <ruby> tag.  Generates XHTML 1.1 instead of HTML 4.01.
#     -title title-string
#         Specifies the title to use for the HTML file.  Can include
#         any of the following tags:
#             %t  The title given in the file, if any.
#             %i  The base input filename.
#             %I  The input filename, including any path.
#             %o  The base output filename.
#             %O  The output filename, including any path.
#         The default is "%t".
#     -notitle
#         Specifies that the HTML file should not be given a title.
#     -header header-string
#     -trailer trailer-string
#         Specifies the HTML code to insert:
#             -header:  after the opening <body> and before the title
#             -trailer: after the text and before the closing </body>
#         Can include any of the following tags:
#             %p       The base name of the previous output file,
#                      or empty if this is the first file.
#             %P(...)  Include the text "..." iff %p is not empty.
#                      Parentheses in "..." must be escaped
#                      with "\".
#             %n       The base name of the next output file, or
#                      empty if this is the last file.
#             %N(...)  Include the text "..." iff %n is not empty.
#         The default for both options is "" (the empty string).
#     -noheader
#     -notrailer
#         Specifies that no header or trailer should be written (the
#         default).
#     -separator separator-string
#         Specifies the HTML code to insert for a section separator.
#         The default is a centered "* * *" paragraph.
#     -regex match-regex
#         Specifies that the dest-pattern is a substitution string
#         for the given match-regex.
#     -inplace
#         Allow in-place file modification.  The default is to not
#         allow it.
#     -blockquote
#         Allow block-quote recognition (set off by larger indents
#         than the first paragraph indent and maybe blank lines).
#     -touch
#         Copy the source file's modification time to the output
#         file.
#     -merge
#         Output the full text as a single HTML file, rather than a
#         separate output file for each input file.
#
# Normal operation is to read stdin and write to stdout.  If one or more
# source-patterns and a dest-pattern are given, they specify that, instead,
# text should be read from each file matching each source-pattern
# (standard shell wildcards; can also be plain filenames, meaning shell
# globbing is fine too) and placed in a file whose name is taken from
# applying the dest-pattern to each source filename.  The dest-pattern can
# contain any of the following tags if -regex is not given:
#     %f  The source filename (everything after the last "/", or the
#         whole string if there is no "/").
#     %p  The path to the source file (everything up to and including
#         the final "/", or empty if there is no "/").
#     %n  The name (without extension) of the source file (everything
#         in %f up to but not including the last ".", or the whole
#         string if there is no ".").
#     %e  The extension of the source file (everything in %f after
#         the last ".", or empty if there is no ".").
# If the -regex option is given, the destination filename is computed as
# "s/match-regex/dest-pattern/" and the tags above are ignored.  The
# dest-pattern may then refer to captured groups as \1..\9 or $1..$9
# (and to the whole match as $&).
#
# If a file ".convert" is present in the current directory, options will
# be read from it for the defaults, one option (and argument) per line
# (arguments do not need to be quoted even if they contain spaces).  They
# can be overridden by the command line.  #'s in the file must be escaped
# (as "\#") or they will be interpreted as comments.  Newlines can be
# included in arguments by ending a line with a \ (backslash).
#
# NOTE(review): this file was rebuilt from a copy in which every HTML tag
# and entity had been stripped from the source text.  The markup emitted
# below (default separator, <body>/<h1>/<p>/<blockquote>/<i>/<ruby> tags,
# DOCTYPE lines, &nbsp;/&mdash;/quote entities) is a best-effort
# reconstruction from context -- confirm against a pristine copy.
#
###########################################################################

use strict;
use warnings;
use File::Glob qw(bsd_glob);

###########################################################################
# Option state: filled in from ".convert" and the command line by
# parse_options(), read by the subs below.

my $title     = "%t";
my $header    = "";
my $trailer   = "";
my $separator = '<p align="center">*&nbsp;&nbsp;&nbsp;*&nbsp;&nbsp;&nbsp;*</p>';
my $jp_mode   = 0;
my $charset   = "";
my $font      = "";
my $is_regex  = 0;
my $match_regex;
my $allow_inplace    = 0;
my $allow_blockquote = 0;
my $touch = 0;
my $merge = 0;

# Text allowed inside %P(...) / %N(...): anything except ")" and "\",
# or a backslash followed by any character (so both "\(" and "\)" work).
my $inside_paren = qr/(?:[^)\\]|\\.)*/s;

# Output state shared between calls to parse().  With -merge the output
# handle stays open from the first input file to the last.
my $out_fh;
my $out_name;
my $out_mtime;

# Pending-separator counter.  Kept at file scope because the original
# script kept it in a global that survives from one input file to the next.
my $print_sep = 0;

###########################################################################

# Read default options from the file $path (normally ".convert").
# Returns a flat list of options and arguments in file order, or an empty
# list if the file cannot be opened.
sub read_config_options {
    my ($path) = @_;
    my @options;

    open(my $conf, '<', $path) or return @options;
    while (defined(my $line = <$conf>)) {
        chomp($line);
        # A trailing backslash continues the argument on the next line;
        # the newline itself is kept as part of the argument.
        while ($line =~ /\\$/) {
            chop($line);
            my $more = <$conf>;
            $more = "" unless defined($more);
            chomp($more);
            $line .= "\n" . $more;
        }
        $line =~ s/(?<!\\)#.*//;        # strip comment (unescaped "#")
        $line =~ s/\\#/#/g;             # "\#" is a literal "#"
        $line =~ s/^\s+//;
        $line =~ s/\s+$//;
        next if $line eq "";

        my ($opt, $arg) = $line =~ /^(\S+)\s*(.*)$/s;
        push(@options, $opt);
        # Only pass an argument along when one was actually given, so a
        # bare flag does not inject a stray word into @ARGV.
        push(@options, $arg) if length($arg);
    }
    close($conf);
    return @options;
}

# Parse the leading "-option" words of @ARGV into the option variables.
# Returns the index of the first non-option argument.  Dies on an unknown
# option or on an option that is missing its argument.
sub parse_options {
    my $i = 0;
    my $value = sub {
        my ($opt) = @_;
        die "Option $opt requires an argument\n" if $i + 1 > $#ARGV;
        return $ARGV[++$i];
    };

    while ($i <= $#ARGV && $ARGV[$i] =~ /^-/) {
        my $opt = $ARGV[$i];
        if ($opt eq "-charset") {
            $charset = $value->($opt);
        } elsif ($opt eq "-font") {
            $font = $value->($opt);
        } elsif ($opt eq "-japanese") {
            $jp_mode = 1;
            $charset = "EUC-JP";
        } elsif ($opt eq "-title") {
            $title = $value->($opt);
        } elsif ($opt eq "-notitle") {
            $title = "";
        } elsif ($opt eq "-header") {
            $header = $value->($opt);
        } elsif ($opt eq "-noheader") {
            $header = "";
        } elsif ($opt eq "-trailer") {
            $trailer = $value->($opt);
        } elsif ($opt eq "-notrailer") {
            $trailer = "";
        } elsif ($opt eq "-separator") {
            $separator = $value->($opt);
        } elsif ($opt eq "-regex") {
            $is_regex = 1;
            $match_regex = $value->($opt);
        } elsif ($opt eq "-inplace") {
            $allow_inplace = 1;
        } elsif ($opt eq "-blockquote") {
            $allow_blockquote = 1;
        } elsif ($opt eq "-touch") {
            $touch = 1;
        } elsif ($opt eq "-merge") {
            $merge = 1;
        } else {
            die "Unknown option $opt\n";
        }
        $i++;
    }
    return $i;
}

# Expand one family of navigation tags in a header/trailer string.
#   $text  the header or trailer text
#   $tag   "p" (previous file) or "n" (next file)
#   $name  base name of the neighbouring output file, or undef if there
#          is none, in which case %P(...)/%N(...) and %p/%n are removed
# Returns the expanded text.
sub expand_nav_tags {
    my ($text, $tag, $name) = @_;
    my $block = uc($tag);

    if (defined($name)) {
        $text =~ s/%$block\(($inside_paren)\)/$1/g;
        # As in the original, escaped parentheses are unescaped across
        # the whole string once the conditional text has been kept.
        $text =~ s/\\\(/(/g;
        $text =~ s/\\\)/)/g;
        $text =~ s/%$tag/$name/g;
    } else {
        $text =~ s/%$block\($inside_paren\)//g;
        $text =~ s/%$tag//g;
    }
    return $text;
}

# Return $path with everything up to and including the last "/" removed.
sub base_name {
    my ($path) = @_;
    $path =~ s{.*/}{};
    return $path;
}

# Escape the characters that are special in HTML text.
sub escape_html {
    my ($text) = @_;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    return $text;
}

# Program entry point: merge ".convert" defaults into @ARGV, parse the
# options, then convert either the named files or stdin.
sub main {
    # Defaults go first so the real command line can override them.
    unshift(@ARGV, read_config_options(".convert"));
    my $first = parse_options();

    if ($first < $#ARGV) {
        my $destpatt = $ARGV[$#ARGV];
        # Glob each pattern on its own so names containing spaces are
        # not split into several patterns.
        my @files = map { bsd_glob($_) } @ARGV[$first .. $#ARGV - 1];

        for my $n (0 .. $#files) {
            my $file    = $files[$n];
            my $is_last = ($n == $#files) ? 1 : 0;
            my ($dest, $h, $t);

            if ($merge) {
                print STDERR "Converting $file\n";
                # First file opens the output; "" marks the last file;
                # undef means "keep writing to the open output".
                $dest = $n == 0 ? $destpatt : $is_last ? "" : undef;
                $h = expand_nav_tags(expand_nav_tags($header, 'p', undef),
                                     'n', undef);
                $t = expand_nav_tags(expand_nav_tags($trailer, 'p', undef),
                                     'n', undef);
                $h = "" if $n > 0;
                $t = "" if !$is_last;
            } else {
                $dest = make_dest($file, $destpatt, $is_regex);
                if (!$allow_inplace && $dest eq $file) {
                    die "$file: source and destination filenames are the same\n";
                }
                print STDERR "Converting $file to $dest\n";
                my $prev = $n > 0
                    ? base_name(make_dest($files[$n - 1], $destpatt, $is_regex))
                    : undef;
                my $next = !$is_last
                    ? base_name(make_dest($files[$n + 1], $destpatt, $is_regex))
                    : undef;
                $h = expand_nav_tags(expand_nav_tags($header, 'p', $prev),
                                     'n', $next);
                $t = expand_nav_tags(expand_nav_tags($trailer, 'p', $prev),
                                     'n', $next);
            }
            parse($title, $h, $t, $separator, $file, $dest, $is_last);
        }
    } elsif ($first == $#ARGV) {
        die "Destination pattern required if source pattern given\n";
    } else {
        # stdin -> stdout: there are no neighbouring files, so all
        # navigation tags are removed from both header and trailer.
        my $h = expand_nav_tags(expand_nav_tags($header, 'p', undef),
                                'n', undef);
        my $t = expand_nav_tags(expand_nav_tags($trailer, 'p', undef),
                                'n', undef);
        parse($title, $h, $t, $separator);
    }
}

###########################################################################

# Compute the destination filename for a source file.
#   $file      source filename (may include a path)
#   $destpatt  destination pattern
#   $is_regex  true to treat $destpatt as the replacement half of
#              s/REGEX/$destpatt/, false to expand %p %f %n %e tags
#   $regex     (optional) match regex; defaults to the -regex option
# Returns the destination filename.  In regex mode a non-matching
# filename is returned unchanged.
sub make_dest {
    my ($file, $destpatt, $is_regex, $regex) = @_;
    $regex = $match_regex unless defined($regex);

    if ($is_regex) {
        my $dest = $file;
        return $dest unless defined($regex) && $dest =~ /$regex/;

        # Copy the match data before running any further regex.
        my ($start, $end) = ($-[0], $+[0]);
        my @group;
        for my $g (0 .. $#+) {
            push(@group, defined($-[$g])
                         ? substr($dest, $-[$g], $+[$g] - $-[$g])
                         : "");
        }

        # Expand group references by hand rather than eval'ing the
        # user-supplied pattern as Perl code.
        my $replacement = $destpatt;
        $replacement =~ s/(?:\\([0-9])|\$([0-9])|\$\{([0-9])\}|\$(&))/
            my $n = defined($1) ? $1 : defined($2) ? $2 : defined($3) ? $3 : 0;
            defined($group[$n]) ? $group[$n] : ""
        /gex;

        substr($dest, $start, $end - $start) = $replacement;
        return $dest;
    }

    my ($path, $base) = $file =~ m{\A(.*/)?(.*)\z}s;
    $path = "" unless defined($path);
    my ($name, $ext) = ($base, "");
    if ($base =~ /\A(.*)\.(.*)\z/s) {
        ($name, $ext) = ($1, $2);
    }

    # Single pass, so text substituted for one tag is never re-expanded.
    my %value = (p => $path, f => $base, n => $name, e => $ext);
    my $dest = $destpatt;
    $dest =~ s/%([pfne])/$value{$1}/g;
    return $dest;
}

###########################################################################

# Expand the -title tags (%t %i %o %I %O) and trim surrounding whitespace.
# Undefined values expand to the empty string.
sub expand_title_tags {
    my ($title, $text, $inbase, $outbase, $infile, $outfile) = @_;
    my %value = (
        t => $text,
        i => $inbase,
        o => $outbase,
        I => $infile,
        O => $outfile,
    );
    $title =~ s/%([tioIO])/defined($value{$1}) ? $value{$1} : ""/ge;
    $title =~ s/^\s*//;
    $title =~ s/\s*$//;
    return $title;
}

# Return the opening <body> tag, carrying the -font family if one is set.
sub body_open_tag {
    return "<body>" if $font eq "";
    my $family = escape_html($font);
    $family =~ s/"/&quot;/g;
    return "<body style=\"font-family: $family\">";
}

# Turn the raw text of one paragraph into HTML (without the <p> wrapper).
#   $para       raw paragraph text
#   $initspace  regex source matching one leading-space character
#   $line       current input line number, used in warnings only
sub format_paragraph {
    my ($para, $initspace, $line) = @_;

    # Fix up punctuation
    $para =~ s/\s+$//;                  # zaps newline too
    $para =~ s/--\s/--/g;
    $para =~ s/\s--/--/g;
    $para =~ s/---+/--/g;
    $para .= "\n";                      # put newline back
    # NOTE(review): the exact placement of &nbsp; in the next three rules
    # is reconstructed.
    $para =~ s/\.\.\.(["\n])/ .&nbsp;.&nbsp;.$1/g;
    $para =~ s/\.\.\.(?!\.)/ .&nbsp;.&nbsp;. /g;
    $para =~ s/ (\.&nbsp;\.&nbsp;\.\W*)$/&nbsp;$1/g;
    $para =~ s/\.([^0-9 \t\r\n\f\"\'&])/. $1/g;

    # Add HTML escapes (leaving the &nbsp; entities inserted above alone)
    $para =~ s/&(?!nbsp;)/&amp;/g;
    $para =~ s/</&lt;/g;
    $para =~ s/>/&gt;/g;

    # HTMLize punctuation
    $para =~ s/--/&mdash;/g;
    $para =~ s/`/&lsquo;/g;
    $para =~ s/'/&rsquo;/g;
    $para =~ s/"([^"]*)"/&ldquo;$1&rdquo;/g;
    $para =~ s/"/&ldquo;/g;

    # Do italics.  Pieces alternate between plain and /italic/; emphasis
    # inside an italic piece switches italics off rather than on.
    my @pieces = split(/\//, $para);
    my $ital_on = -1;
    $para = "";
    for my $piece (@pieces) {
        my ($on, $off);
        if ($ital_on > 0) {
            $para .= "<i>";
            ($on, $off) = ("</i>", "<i>");
        } else {
            if ($ital_on < 0) {
                $ital_on = 0;
            } else {
                $para .= "</i>";
            }
            ($on, $off) = ("<i>", "</i>");
        }
        $piece =~ s/[_*]([^_*]*)[_*]/$on$1$off/g;
        $para .= $piece;
        $ital_on = $ital_on ? 0 : 1;
    }
    if (!$ital_on) {
        # Unbalanced slash: close the italics before the final newline.
        chop($para);
        $para .= "</i>\n";
    }

    # Furigana (Japanese mode only).  \xA1\xCA and \xA1\xCB are the
    # EUC-JP full-width parentheses shown by browsers without ruby support.
    if ($jp_mode) {
        while ($para =~ s/\{([^:\}]+):([^:\}]+)\}/<ruby><rb>$1<\/rb><rp>\xA1\xCA<\/rp><rt>$2<\/rt><rp>\xA1\xCB<\/rp><\/ruby>/) {
            if (length($1) > 16 || length($2) > 32) {
                print STDERR "Warning: long furigana around line $line\n";
            }
        }
    }

    # Redness: the text has been escaped by now, so the <<< >>> markers
    # appear as entities.
    $para =~ s/&lt;&lt;&lt;([\0-\377]*?)&gt;&gt;&gt;/<font color="red">$1<\/font>/g;

    # Last-minute whitespace fixing
    $para =~ s/^${initspace}+//;
    $para =~ s/\s+$//;
    $para =~ s/[ \t]+\n/\n/g;
    if ($jp_mode) {
        $para =~ s/\n+//g;
    }
    return $para;
}

# Return the width of the leading whitespace of $text (tabs count as 8)
# followed by the leading whitespace itself.
sub leading_indent {
    my ($text, $initspace) = @_;
    my ($lead) = ($text =~ /^(${initspace}+)/);
    $lead = "" unless defined($lead);
    my $expanded = $lead;
    # NOTE(review): tab width of 8 is reconstructed; the damaged source
    # showed a single space here.
    $expanded =~ s/\t/        /g;
    return (length($expanded), $lead);
}

# Convert one text file (or stdin) to HTML.
#   $title      title template (see -title)
#   $header     header HTML, navigation tags already expanded
#   $trailer    trailer HTML, navigation tags already expanded
#   $separator  section separator HTML
#   $infile     input filename; stdin is read if this is false
#   $outfile    output filename, "-" for stdout; with -merge it is false
#               for every file after the first ("" for the last one)
#   $is_last    (optional) true if this is the last file of a -merge run
sub parse {
    my ($title, $header, $trailer, $separator, $infile, $outfile, $is_last) = @_;
    my @file;
    my $inbase  = "";
    my $outbase = "";
    my $oldhandle;

    if ($infile) {
        open(my $in, '<', $infile) or die "$infile: $!\n";
        my $mtime = $touch ? (stat($in))[9] : undef;
        # The whole input is read before the output is opened, which is
        # what makes -inplace safe.
        @file = <$in>;
        close($in);
        $inbase = base_name($infile);
        if ($outfile) {
            $outbase = base_name($outfile);
            if ($outfile eq "-") {
                open($out_fh, '>&', \*STDOUT) or die "STDOUT: $!\n";
            } else {
                open($out_fh, '>', $outfile) or die "$outfile: $!\n";
            }
            $out_name  = $outfile;
            $out_mtime = $mtime;
        }
        defined($out_fh) or die "$infile: no output file is open\n";
        $oldhandle = select($out_fh);
    } else {
        @file = <STDIN>;
        $oldhandle = select(STDOUT);
    }

    # Parse text.  Rules:
    #    A paragraph begins with at least one space or a preceding
    # blank line.
    #    A section ends with at least one more blank line than between
    # paragraphs, or at a line containing only whitespace and any of the
    # characters `~' `*' `-' `_' `=' `.' (but at least one non-whitespace
    # character).
    #    Emphasis is marked by text surrounded by _underscores_ or
    # *asterisks*.  Emphasized text is printed in italics, except when in
    # italicized text, in which case it is printed un-italicized.
    #    Text to be italicized is surrounded by /slashes/.  Italicized
    # sections cannot be nested.
    #    In Japanese mode only, furigana is indicated by putting it
    # after its base text separated by a colon and surrounding the
    # entire construct by {curly brackets}: {basetext:furigana}
    #    Text (within a paragraph) surrounded by <<< and >>> will be
    # displayed in red (e.g. for editing).
    #    If the second line consists of only whitespace and (dashes or
    # equals signs), the first line is a title.
    #    If block quote parsing is enabled, a block quote is recognized
    # by zero or more extra blank lines, followed by one or more
    # paragraphs with a larger indent than the first paragraph, followed
    # by zero or more extra blank lines, followed by a paragraph with
    # normal or less indenting.
    #    In Japanese mode, blank lines that do not end paragraphs are
    # deleted.

    my $between_para  = -1; # Number of blank lines between paragraphs
    my $blank_count   = 0;  # Number of consecutive blank lines seen
    my $para          = ""; # Current paragraph text
    my $line          = 0;  # Line number
    my $found_body    = 0;  # Have we found body text yet?
    my $in_blockquote = 0;  # Are we in a block quote?
    my $normal_indent = -1; # Size of normal indent
    my $block_indent  = ""; # Block quote indent string
    # Regex for initial spaces (EUC-JP full-width space too in JP mode)
    my $initspace = $jp_mode ? '(\s|\xA1\xA1)' : '\s';

    # One extra pass with an empty line after the last real line flushes
    # the final paragraph.
    for my $index (0 .. scalar(@file)) {
        my $flushing = $index > $#file;
        my $cur = $flushing ? "" : $file[$index];
        $line = $index + 1;

        # If this is the second line, we know whether the first
        # line was a title or not, and we can output the header.
        if ($line == 2) {
            my $is_title = ($cur !~ /[^ \r\t\n\f\-=]/);
            my $heading = "";
            if ($is_title) {
                chomp($para);
                $para =~ s/^\s*//;
                $para =~ s/\s*$//;
                $heading = $para;
            }
            my $doc_title = expand_title_tags($title, $heading, $inbase,
                                              $outbase, $infile, $outfile);
            if ($outfile || !$merge) {
                print_html_header($doc_title);
                my $body_tag = body_open_tag();
                print "$body_tag\n";
                print "$header\n" if $header;
            } elsif (!$is_title) {
                # Later file of a -merge run without its own title.
                # NOTE(review): divider markup is reconstructed.
                print $jp_mode ? "<hr />\n" : "<hr>\n";
            }
            if ($is_title) {
                print "<h1>", escape_html($heading), "</h1>\n\n";
                $para = "";
                $found_body = 0;
                $normal_indent = -1;
                next;
            }
        }

        # Separator line
        if ($line > 1 && $cur =~ /\S/ && $cur !~ /[^\s~*\-_=.]/) {
            if ($between_para >= 0 && $blank_count <= $between_para) {
                $print_sep = 2;
                $blank_count = $between_para + 1;
            }

        # End of a paragraph
        } elsif ($line > 1
                 && ($cur !~ /\S/
                     || ($cur =~ /^${initspace}/
                         && (!$in_blockquote
                             || $cur =~ /^\Q$block_indent\E/)))) {
            my $input = $cur;   # becomes the start of the next paragraph
            if ($para =~ /\S/) {
                # Print separator first if needed
                print "$separator\n" if $print_sep == 1 && !$in_blockquote;
                $print_sep-- if $print_sep > 0;
                # And put it inside a paragraph tag
                print "<p>", format_paragraph($para, $initspace, $line),
                      "</p>\n";
            }
            if ($input =~ /\S/ && $para) {
                $between_para = 0 if $between_para < 0;
                $blank_count = 0;
            } elsif (!$flushing) {
                if ($blank_count == $between_para) {
                    $print_sep++ if $print_sep <= 0;
                }
                $blank_count++;
                # While still unknown, the gap size is counted downwards
                # from -1 and flipped positive further below.
                $between_para-- if $between_para < 0 && $found_body;
            }
            $para = $input;

            my ($width, $lead) = leading_indent($input, $initspace);
            if ($para =~ /\S/ && $normal_indent < 0) {
                $normal_indent = $width;
            } elsif ($normal_indent >= 0 && $allow_blockquote) {
                if ($width > $normal_indent && !$in_blockquote) {
                    $in_blockquote = 1;
                    $print_sep = 0;
                    print "<blockquote>\n";
                    $block_indent = $lead;
                } elsif ($width <= $normal_indent && $in_blockquote) {
                    $in_blockquote = 0;
                    print "</blockquote>\n";
                    $print_sep = 0;
                }
            }

        # Not end of a paragraph (first line always comes here)
        } else {
            $blank_count = 0;
            $para .= $cur;
            if ($para =~ /\S/ && $normal_indent < 0) {
                ($normal_indent) = leading_indent($cur, $initspace);
            }
            $found_body = -1 if !$found_body;
        }

        # If this is the second paragraph, we now know how many
        # blank lines there are between a paragraph.
        if ($found_body > 0 && $para && $between_para < -1) {
            $between_para = -$between_para - 1;
        }
        $found_body = 1 if $found_body < 0;
    } # for each line

    # A document is finished after every file, except with -merge, where
    # it is finished only after the last file.
    my $finished = !$merge || $is_last
                   || (defined($outfile) && $outfile eq "");

    # Print trailer
    if ($finished) {
        print "$trailer\n" if $trailer;
        print "</body>\n</html>\n";
    }

    # All done!
    select($oldhandle);
    if ($infile && defined($out_fh) && $finished) {
        close($out_fh) or die "$out_name: $!\n";
        undef $out_fh;
        # Done after the close so later writes cannot undo it.
        if ($touch && defined($out_mtime) && $out_name ne "-") {
            utime(time(), $out_mtime, $out_name)
                or warn "$out_name: cannot set modification time: $!\n";
        }
    }
}

###########################################################################

# Print the document prologue and the <head> section to the currently
# selected filehandle.  $title is plain text and is escaped here; no
# <title> element is written if it is empty.
sub print_html_header {
    my ($title) = @_;

    # NOTE(review): DOCTYPE and <meta> lines are reconstructed.
    if ($jp_mode) {
        print "<?xml version=\"1.0\" encoding=\"$charset\"?>\n";
        print "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\n";
        print "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"ja\">\n";
    } else {
        print "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">\n";
        print "<html>\n";
    }
    print "<head>\n";
    if ($charset && !$jp_mode) {
        print "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=$charset\">\n";
    }
    if ($title) {
        $title = escape_html($title);
        print "<title>$title</title>\n";
    }
    print "</head>\n";
    print "\n";
}

###########################################################################

# Run the converter only when executed as a script, so the subs above can
# be loaded (e.g. by a test) without touching stdin or @ARGV.
main() unless caller;

1;