gitweb.ps.run Git - ps-cgit/blob - filters/html-converters/resources/markdown.pl

   1 #!/usr/bin/perl
   2
   3 #
   4 # Markdown -- A text-to-HTML conversion tool for web writers
   5 #
   6 # Copyright (c) 2004 John Gruber
   7 # <http://daringfireball.net/projects/markdown/>
   8 #
   9
  10
  11 package Markdown;
  12 require 5.006_000;
  13 use strict;
  14 use warnings;
  15
  16 use Digest::MD5 qw(md5_hex);
  17 use vars qw($VERSION);
  18 $VERSION = '1.0.1';
  19 # Tue 14 Dec 2004
  20
  21
  22 #
  23 # Global default settings:
  24 #
  25 my $g_empty_element_suffix = " />";     # Change to ">" for HTML output
  26 my $g_tab_width = 4;
  27
  28
  29 #
  30 # Globals:
  31 #
  32
  33 # Regex to match balanced [brackets]. See Friedl's
  34 # "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
  35 my $g_nested_brackets;
  36 $g_nested_brackets = qr{
  37         (?>                                                             # Atomic matching
  38            [^\[\]]+                                                     # Anything other than brackets
  39          |
  40            \[
  41                  (??{ $g_nested_brackets })             # Recursive set of nested brackets
  42            \]
  43         )*
  44 }x;
  45
  46
  47 # Table of hash values for escaped characters:
  48 my %g_escape_table;
  49 foreach my $char (split //, '\\`*_{}[]()>#+-.!') {
  50         $g_escape_table{$char} = md5_hex($char);
  51 }
  52
  53
  54 # Global hashes, used by various utility routines
  55 my %g_urls;
  56 my %g_titles;
  57 my %g_html_blocks;
  58
  59 # Used to track when we're inside an ordered or unordered list
  60 # (see _ProcessListItems() for details):
  61 my $g_list_level = 0;
  62
  63
  64 #### Blosxom plug-in interface ##########################################
  65
  66 # Set $g_blosxom_use_meta to 1 to use Blosxom's meta plug-in to determine
  67 # which posts Markdown should process, using a "meta-markup: markdown"
  68 # header. If it's set to 0 (the default), Markdown will process all
  69 # entries.
  70 my $g_blosxom_use_meta = 0;
  71
  72 sub start { 1; }
  73 sub story {
  74         my($pkg, $path, $filename, $story_ref, $title_ref, $body_ref) = @_;
  75
  76         if ( (! $g_blosxom_use_meta) or
  77              (defined($meta::markup) and ($meta::markup =~ /^\s*markdown\s*$/i))
  78              ){
  79                         $$body_ref  = Markdown($$body_ref);
  80      }
  81      1;
  82 }
  83
  84
  85 #### Movable Type plug-in interface #####################################
  86 eval {require MT};  # Test to see if we're running in MT.
  87 unless ($@) {
  88     require MT;
  89     import  MT;
  90     require MT::Template::Context;
  91     import  MT::Template::Context;
  92
  93         eval {require MT::Plugin};  # Test to see if we're running >= MT 3.0.
  94         unless ($@) {
  95                 require MT::Plugin;
  96                 import  MT::Plugin;
  97                 my $plugin = new MT::Plugin({
  98                         name => "Markdown",
  99                         description => "A plain-text-to-HTML formatting plugin. (Version: $VERSION)",
 100                         doc_link => 'http://daringfireball.net/projects/markdown/'
 101                 });
 102                 MT->add_plugin( $plugin );
 103         }
 104
 105         MT::Template::Context->add_container_tag(MarkdownOptions => sub {
 106                 my $ctx  = shift;
 107                 my $args = shift;
 108                 my $builder = $ctx->stash('builder');
 109                 my $tokens = $ctx->stash('tokens');
 110
 111                 if (defined ($args->{'output'}) ) {
 112                         $ctx->stash('markdown_output', lc $args->{'output'});
 113                 }
 114
 115                 defined (my $str = $builder->build($ctx, $tokens) )
 116                         or return $ctx->error($builder->errstr);
 117                 $str;           # return value
 118         });
 119
 120         MT->add_text_filter('markdown' => {
 121                 label     => 'Markdown',
 122                 docs      => 'http://daringfireball.net/projects/markdown/',
 123                 on_format => sub {
 124                         my $text = shift;
 125                         my $ctx  = shift;
 126                         my $raw  = 0;
 127                     if (defined $ctx) {
 128                         my $output = $ctx->stash('markdown_output');
 129                                 if (defined $output  &&  $output =~ m/^html/i) {
 130                                         $g_empty_element_suffix = ">";
 131                                         $ctx->stash('markdown_output', '');
 132                                 }
 133                                 elsif (defined $output  &&  $output eq 'raw') {
 134                                         $raw = 1;
 135                                         $ctx->stash('markdown_output', '');
 136                                 }
 137                                 else {
 138                                         $raw = 0;
 139                                         $g_empty_element_suffix = " />";
 140                                 }
 141                         }
 142                         $text = $raw ? $text : Markdown($text);
 143                         $text;
 144                 },
 145         });
 146
 147         # If SmartyPants is loaded, add a combo Markdown/SmartyPants text filter:
 148         my $smartypants;
 149
 150         {
 151                 no warnings "once";
 152                 $smartypants = $MT::Template::Context::Global_filters{'smarty_pants'};
 153         }
 154
 155         if ($smartypants) {
 156                 MT->add_text_filter('markdown_with_smartypants' => {
 157                         label     => 'Markdown With SmartyPants',
 158                         docs      => 'http://daringfireball.net/projects/markdown/',
 159                         on_format => sub {
 160                                 my $text = shift;
 161                                 my $ctx  = shift;
 162                                 if (defined $ctx) {
 163                                         my $output = $ctx->stash('markdown_output');
 164                                         if (defined $output  &&  $output eq 'html') {
 165                                                 $g_empty_element_suffix = ">";
 166                                         }
 167                                         else {
 168                                                 $g_empty_element_suffix = " />";
 169                                         }
 170                                 }
 171                                 $text = Markdown($text);
 172                                 $text = $smartypants->($text, '1');
 173                         },
 174                 });
 175         }
 176 }
 177 else {
 178 #### BBEdit/command-line text filter interface ##########################
 179 # Needs to be hidden from MT (and Blosxom when running in static mode).
 180
 181     # We're only using $blosxom::version once; tell Perl not to warn us:
 182         no warnings 'once';
 183     unless ( defined($blosxom::version) ) {
 184                 use warnings;
 185
 186                 #### Check for command-line switches: #################
 187                 my %cli_opts;
 188                 use Getopt::Long;
 189                 Getopt::Long::Configure('pass_through');
 190                 GetOptions(\%cli_opts,
 191                         'version',
 192                         'shortversion',
 193                         'html4tags',
 194                 );
 195                 if ($cli_opts{'version'}) {             # Version info
 196                         print "\nThis is Markdown, version $VERSION.\n";
 197                         print "Copyright 2004 John Gruber\n";
 198                         print "http://daringfireball.net/projects/markdown/\n\n";
 199                         exit 0;
 200                 }
 201                 if ($cli_opts{'shortversion'}) {                # Just the version number string.
 202                         print $VERSION;
 203                         exit 0;
 204                 }
 205                 if ($cli_opts{'html4tags'}) {                   # Use HTML tag style instead of XHTML
 206                         $g_empty_element_suffix = ">";
 207                 }
 208
 209
 210                 #### Process incoming text: ###########################
 211                 my $text;
 212                 {
 213                         local $/;               # Slurp the whole file
 214                         $text = <>;
 215                 }
 216         print <<'EOT';
 217 <style>
 218 .markdown-body {
 219     font-size: 14px;
 220     line-height: 1.6;
 221     overflow: hidden;
 222 }
 223 .markdown-body>*:first-child {
 224     margin-top: 0 !important;
 225 }
 226 .markdown-body>*:last-child {
 227     margin-bottom: 0 !important;
 228 }
 229 .markdown-body a.absent {
 230     color: #c00;
 231 }
 232 .markdown-body a.anchor {
 233     display: block;
 234     padding-left: 30px;
 235     margin-left: -30px;
 236     cursor: pointer;
 237     position: absolute;
 238     top: 0;
 239     left: 0;
 240     bottom: 0;
 241 }
 242 .markdown-body h1, .markdown-body h2, .markdown-body h3, .markdown-body h4, .markdown-body h5, .markdown-body h6 {
 243     margin: 20px 0 10px;
 244     padding: 0;
 245     font-weight: bold;
 246     -webkit-font-smoothing: antialiased;
 247     cursor: text;
 248     position: relative;
 249 }
 250 .markdown-body h1 .mini-icon-link, .markdown-body h2 .mini-icon-link, .markdown-body h3 .mini-icon-link, .markdown-body h4 .mini-icon-link, .markdown-body h5 .mini-icon-link, .markdown-body h6 .mini-icon-link {
 251     display: none;
 252     color: #000;
 253 }
 254 .markdown-body h1:hover a.anchor, .markdown-body h2:hover a.anchor, .markdown-body h3:hover a.anchor, .markdown-body h4:hover a.anchor, .markdown-body h5:hover a.anchor, .markdown-body h6:hover a.anchor {
 255     text-decoration: none;
 256     line-height: 1;
 257     padding-left: 0;
 258     margin-left: -22px;
 259     top: 15%}
 260 .markdown-body h1:hover a.anchor .mini-icon-link, .markdown-body h2:hover a.anchor .mini-icon-link, .markdown-body h3:hover a.anchor .mini-icon-link, .markdown-body h4:hover a.anchor .mini-icon-link, .markdown-body h5:hover a.anchor .mini-icon-link, .markdown-body h6:hover a.anchor .mini-icon-link {
 261     display: inline-block;
 262 }
 263 .markdown-body h1 tt, .markdown-body h1 code, .markdown-body h2 tt, .markdown-body h2 code, .markdown-body h3 tt, .markdown-body h3 code, .markdown-body h4 tt, .markdown-body h4 code, .markdown-body h5 tt, .markdown-body h5 code, .markdown-body h6 tt, .markdown-body h6 code {
 264     font-size: inherit;
 265 }
 266 .markdown-body h1 {
 267     font-size: 28px;
 268     color: #000;
 269 }
 270 .markdown-body h2 {
 271     font-size: 24px;
 272     border-bottom: 1px solid #ccc;
 273     color: #000;
 274 }
 275 .markdown-body h3 {
 276     font-size: 18px;
 277 }
 278 .markdown-body h4 {
 279     font-size: 16px;
 280 }
 281 .markdown-body h5 {
 282     font-size: 14px;
 283 }
 284 .markdown-body h6 {
 285     color: #777;
 286     font-size: 14px;
 287 }
 288 .markdown-body p, .markdown-body blockquote, .markdown-body ul, .markdown-body ol, .markdown-body dl, .markdown-body table, .markdown-body pre {
 289     margin: 15px 0;
 290 }
 291 .markdown-body hr {
 292     background: transparent url("/dirty-shade.png") repeat-x 0 0;
 293     border: 0 none;
 294     color: #ccc;
 295     height: 4px;
 296     padding: 0;
 297 }
 298 .markdown-body>h2:first-child, .markdown-body>h1:first-child, .markdown-body>h1:first-child+h2, .markdown-body>h3:first-child, .markdown-body>h4:first-child, .markdown-body>h5:first-child, .markdown-body>h6:first-child {
 299     margin-top: 0;
 300     padding-top: 0;
 301 }
 302 .markdown-body a:first-child h1, .markdown-body a:first-child h2, .markdown-body a:first-child h3, .markdown-body a:first-child h4, .markdown-body a:first-child h5, .markdown-body a:first-child h6 {
 303     margin-top: 0;
 304     padding-top: 0;
 305 }
 306 .markdown-body h1+p, .markdown-body h2+p, .markdown-body h3+p, .markdown-body h4+p, .markdown-body h5+p, .markdown-body h6+p {
 307     margin-top: 0;
 308 }
 309 .markdown-body li p.first {
 310     display: inline-block;
 311 }
 312 .markdown-body ul, .markdown-body ol {
 313     padding-left: 30px;
 314 }
 315 .markdown-body ul.no-list, .markdown-body ol.no-list {
 316     list-style-type: none;
 317     padding: 0;
 318 }
 319 .markdown-body ul li>:first-child, .markdown-body ul li ul:first-of-type, .markdown-body ul li ol:first-of-type, .markdown-body ol li>:first-child, .markdown-body ol li ul:first-of-type, .markdown-body ol li ol:first-of-type {
 320     margin-top: 0px;
 321 }
 322 .markdown-body ul li p:last-of-type, .markdown-body ol li p:last-of-type {
 323     margin-bottom: 0;
 324 }
 325 .markdown-body ul ul, .markdown-body ul ol, .markdown-body ol ol, .markdown-body ol ul {
 326     margin-bottom: 0;
 327 }
 328 .markdown-body dl {
 329     padding: 0;
 330 }
 331 .markdown-body dl dt {
 332     font-size: 14px;
 333     font-weight: bold;
 334     font-style: italic;
 335     padding: 0;
 336     margin: 15px 0 5px;
 337 }
 338 .markdown-body dl dt:first-child {
 339     padding: 0;
 340 }
 341 .markdown-body dl dt>:first-child {
 342     margin-top: 0px;
 343 }
 344 .markdown-body dl dt>:last-child {
 345     margin-bottom: 0px;
 346 }
 347 .markdown-body dl dd {
 348     margin: 0 0 15px;
 349     padding: 0 15px;
 350 }
 351 .markdown-body dl dd>:first-child {
 352     margin-top: 0px;
 353 }
 354 .markdown-body dl dd>:last-child {
 355     margin-bottom: 0px;
 356 }
 357 .markdown-body blockquote {
 358     border-left: 4px solid #DDD;
 359     padding: 0 15px;
 360     color: #777;
 361 }
 362 .markdown-body blockquote>:first-child {
 363     margin-top: 0px;
 364 }
 365 .markdown-body blockquote>:last-child {
 366     margin-bottom: 0px;
 367 }
 368 .markdown-body table th {
 369     font-weight: bold;
 370 }
 371 .markdown-body table th, .markdown-body table td {
 372     border: 1px solid #ccc;
 373     padding: 6px 13px;
 374 }
 375 .markdown-body table tr {
 376     border-top: 1px solid #ccc;
 377     background-color: #fff;
 378 }
 379 .markdown-body table tr:nth-child(2n) {
 380     background-color: #f8f8f8;
 381 }
 382 .markdown-body img {
 383     max-width: 100%;
 384     -moz-box-sizing: border-box;
 385     box-sizing: border-box;
 386 }
 387 .markdown-body span.frame {
 388     display: block;
 389     overflow: hidden;
 390 }
 391 .markdown-body span.frame>span {
 392     border: 1px solid #ddd;
 393     display: block;
 394     float: left;
 395     overflow: hidden;
 396     margin: 13px 0 0;
 397     padding: 7px;
 398     width: auto;
 399 }
 400 .markdown-body span.frame span img {
 401     display: block;
 402     float: left;
 403 }
 404 .markdown-body span.frame span span {
 405     clear: both;
 406     color: #333;
 407     display: block;
 408     padding: 5px 0 0;
 409 }
 410 .markdown-body span.align-center {
 411     display: block;
 412     overflow: hidden;
 413     clear: both;
 414 }
 415 .markdown-body span.align-center>span {
 416     display: block;
 417     overflow: hidden;
 418     margin: 13px auto 0;
 419     text-align: center;
 420 }
 421 .markdown-body span.align-center span img {
 422     margin: 0 auto;
 423     text-align: center;
 424 }
 425 .markdown-body span.align-right {
 426     display: block;
 427     overflow: hidden;
 428     clear: both;
 429 }
 430 .markdown-body span.align-right>span {
 431     display: block;
 432     overflow: hidden;
 433     margin: 13px 0 0;
 434     text-align: right;
 435 }
 436 .markdown-body span.align-right span img {
 437     margin: 0;
 438     text-align: right;
 439 }
 440 .markdown-body span.float-left {
 441     display: block;
 442     margin-right: 13px;
 443     overflow: hidden;
 444     float: left;
 445 }
 446 .markdown-body span.float-left span {
 447     margin: 13px 0 0;
 448 }
 449 .markdown-body span.float-right {
 450     display: block;
 451     margin-left: 13px;
 452     overflow: hidden;
 453     float: right;
 454 }
 455 .markdown-body span.float-right>span {
 456     display: block;
 457     overflow: hidden;
 458     margin: 13px auto 0;
 459     text-align: right;
 460 }
 461 .markdown-body code, .markdown-body tt {
 462     margin: 0 2px;
 463     padding: 0px 5px;
 464     border: 1px solid #eaeaea;
 465     background-color: #f8f8f8;
 466     border-radius: 3px;
 467 }
 468 .markdown-body code {
 469     white-space: nowrap;
 470 }
 471 .markdown-body pre>code {
 472     margin: 0;
 473     padding: 0;
 474     white-space: pre;
 475     border: none;
 476     background: transparent;
 477 }
 478 .markdown-body .highlight pre, .markdown-body pre {
 479     background-color: #f8f8f8;
 480     border: 1px solid #ccc;
 481     font-size: 13px;
 482     line-height: 19px;
 483     overflow: auto;
 484     padding: 6px 10px;
 485     border-radius: 3px;
 486 }
 487 .markdown-body pre code, .markdown-body pre tt {
 488     margin: 0;
 489     padding: 0;
 490     background-color: transparent;
 491     border: none;
 492 }
 493 </style>
 494 EOT
 495         print "<div class='markdown-body'>";
 496         print Markdown($text);
 497         print "</div>";
 498     }
 499 }
 500
 501
 502
 503 sub Markdown {
 504 #
 505 # Main function. The order in which other subs are called here is
 506 # essential. Link and image substitutions need to happen before
 507 # _EscapeSpecialChars(), so that any *'s or _'s in the <a>
 508 # and <img> tags get encoded.
 509 #
 510         my $text = shift;
 511
 512         # Clear the global hashes. If we don't clear these, you get conflicts
 513         # from other articles when generating a page which contains more than
 514         # one article (e.g. an index page that shows the N most recent
 515         # articles):
 516         %g_urls = ();
 517         %g_titles = ();
 518         %g_html_blocks = ();
 519
 520
 521         # Standardize line endings:
 522         $text =~ s{\r\n}{\n}g;  # DOS to Unix
 523         $text =~ s{\r}{\n}g;    # Mac to Unix
 524
 525         # Make sure $text ends with a couple of newlines:
 526         $text .= "\n\n";
 527
 528         # Convert all tabs to spaces.
 529         $text = _Detab($text);
 530
 531         # Strip any lines consisting only of spaces and tabs.
 532         # This makes subsequent regexen easier to write, because we can
 533         # match consecutive blank lines with /\n+/ instead of something
 534         # contorted like /[ \t]*\n+/ .
 535         $text =~ s/^[ \t]+$//mg;
 536
 537         # Turn block-level HTML blocks into hash entries
 538         $text = _HashHTMLBlocks($text);
 539
 540         # Strip link definitions, store in hashes.
 541         $text = _StripLinkDefinitions($text);
 542
 543         $text = _RunBlockGamut($text);
 544
 545         $text = _UnescapeSpecialChars($text);
 546
 547         return $text . "\n";
 548 }
 549
 550
 551 sub _StripLinkDefinitions {
 552 #
 553 # Strips link definitions from text, stores the URLs and titles in
 554 # hash references.
 555 #
 556         my $text = shift;
 557         my $less_than_tab = $g_tab_width - 1;
 558
 559         # Link defs are in the form: ^[id]: url "optional title"
 560         while ($text =~ s{
 561                                                 ^[ ]{0,$less_than_tab}\[(.+)\]: # id = $1
 562                                                   [ \t]*
 563                                                   \n?                           # maybe *one* newline
 564                                                   [ \t]*
 565                                                 <?(\S+?)>?                      # url = $2
 566                                                   [ \t]*
 567                                                   \n?                           # maybe one newline
 568                                                   [ \t]*
 569                                                 (?:
 570                                                         (?<=\s)                 # lookbehind for whitespace
 571                                                         ["(]
 572                                                         (.+?)                   # title = $3
 573                                                         [")]
 574                                                         [ \t]*
 575                                                 )?      # title is optional
 576                                                 (?:\n+|\Z)
 577                                         }
 578                                         {}mx) {
 579                 $g_urls{lc $1} = _EncodeAmpsAndAngles( $2 );    # Link IDs are case-insensitive
 580                 if ($3) {
 581                         $g_titles{lc $1} = $3;
 582                         $g_titles{lc $1} =~ s/"/&quot;/g;
 583                 }
 584         }
 585
 586         return $text;
 587 }
 588
 589
 590 sub _HashHTMLBlocks {
 591         my $text = shift;
 592         my $less_than_tab = $g_tab_width - 1;
 593
 594         # Hashify HTML blocks:
 595         # We only want to do this for block-level HTML tags, such as headers,
 596         # lists, and tables. That's because we still want to wrap <p>s around
 597         # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
 598         # phrase emphasis, and spans. The list of tags we're looking for is
 599         # hard-coded:
 600         my $block_tags_a = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del/;
 601         my $block_tags_b = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/;
 602
 603         # First, look for nested blocks, e.g.:
 604         #       <div>
 605         #               <div>
 606         #               tags for inner block must be indented.
 607         #               </div>
 608         #       </div>
 609         #
 610         # The outermost tags must start at the left margin for this to match, and
 611         # the inner nested divs must be indented.
 612         # We need to do this before the next, more liberal match, because the next
 613         # match will start at the first `<div>` and stop at the first `</div>`.
 614         $text =~ s{
 615                                 (                                               # save in $1
 616                                         ^                                       # start of line  (with /m)
 617                                         <($block_tags_a)        # start tag = $2
 618                                         \b                                      # word break
 619                                         (.*\n)*?                        # any number of lines, minimally matching
 620                                         </\2>                           # the matching end tag
 621                                         [ \t]*                          # trailing spaces/tabs
 622                                         (?=\n+|\Z)      # followed by a newline or end of document
 623                                 )
 624                         }{
 625                                 my $key = md5_hex($1);
 626                                 $g_html_blocks{$key} = $1;
 627                                 "\n\n" . $key . "\n\n";
 628                         }egmx;
 629
 630
 631         #
 632         # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
 633         #
 634         $text =~ s{
 635                                 (                                               # save in $1
 636                                         ^                                       # start of line  (with /m)
 637                                         <($block_tags_b)        # start tag = $2
 638                                         \b                                      # word break
 639                                         (.*\n)*?                        # any number of lines, minimally matching
 640                                         .*</\2>                         # the matching end tag
 641                                         [ \t]*                          # trailing spaces/tabs
 642                                         (?=\n+|\Z)      # followed by a newline or end of document
 643                                 )
 644                         }{
 645                                 my $key = md5_hex($1);
 646                                 $g_html_blocks{$key} = $1;
 647                                 "\n\n" . $key . "\n\n";
 648                         }egmx;
 649         # Special case just for <hr />. It was easier to make a special case than
 650         # to make the other regex more complicated.
 651         $text =~ s{
 652                                 (?:
 653                                         (?<=\n\n)               # Starting after a blank line
 654                                         |                               # or
 655                                         \A\n?                   # the beginning of the doc
 656                                 )
 657                                 (                                               # save in $1
 658                                         [ ]{0,$less_than_tab}
 659                                         <(hr)                           # start tag = $2
 660                                         \b                                      # word break
 661                                         ([^<>])*?                       #
 662                                         /?>                                     # the matching end tag
 663                                         [ \t]*
 664                                         (?=\n{2,}|\Z)           # followed by a blank line or end of document
 665                                 )
 666                         }{
 667                                 my $key = md5_hex($1);
 668                                 $g_html_blocks{$key} = $1;
 669                                 "\n\n" . $key . "\n\n";
 670                         }egx;
 671
 672         # Special case for standalone HTML comments:
 673         $text =~ s{
 674                                 (?:
 675                                         (?<=\n\n)               # Starting after a blank line
 676                                         |                               # or
 677                                         \A\n?                   # the beginning of the doc
 678                                 )
 679                                 (                                               # save in $1
 680                                         [ ]{0,$less_than_tab}
 681                                         (?s:
 682                                                 <!
 683                                                 (--.*?--\s*)+
 684                                                 >
 685                                         )
 686                                         [ \t]*
 687                                         (?=\n{2,}|\Z)           # followed by a blank line or end of document
 688                                 )
 689                         }{
 690                                 my $key = md5_hex($1);
 691                                 $g_html_blocks{$key} = $1;
 692                                 "\n\n" . $key . "\n\n";
 693                         }egx;
 694
 695
 696         return $text;
 697 }
 698
 699
 700 sub _RunBlockGamut {
 701 #
 702 # These are all the transformations that form block-level
 703 # tags like paragraphs, headers, and list items.
 704 #
 705         my $text = shift;
 706
 707         $text = _DoHeaders($text);
 708
 709         # Do Horizontal Rules:
 710         $text =~ s{^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx;
 711         $text =~ s{^[ ]{0,2}([ ]? -[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx;
 712         $text =~ s{^[ ]{0,2}([ ]? _[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx;
 713
 714         $text = _DoLists($text);
 715
 716         $text = _DoCodeBlocks($text);
 717
 718         $text = _DoBlockQuotes($text);
 719
 720         # We already ran _HashHTMLBlocks() before, in Markdown(), but that
 721         # was to escape raw HTML in the original Markdown source. This time,
 722         # we're escaping the markup we've just created, so that we don't wrap
 723         # <p> tags around block-level tags.
 724         $text = _HashHTMLBlocks($text);
 725
 726         $text = _FormParagraphs($text);
 727
 728         return $text;
 729 }
 730
 731
 732 sub _RunSpanGamut {
 733 #
 734 # These are all the transformations that occur *within* block-level
 735 # tags like paragraphs, headers, and list items.
 736 #
 737         my $text = shift;
 738
 739         $text = _DoCodeSpans($text);
 740
 741         $text = _EscapeSpecialChars($text);
 742
 743         # Process anchor and image tags. Images must come first,
 744         # because ![foo][f] looks like an anchor.
 745         $text = _DoImages($text);
 746         $text = _DoAnchors($text);
 747
 748         # Make links out of things like `<http://example.com/>`
 749         # Must come after _DoAnchors(), because you can use < and >
 750         # delimiters in inline links like [this](<url>).
 751         $text = _DoAutoLinks($text);
 752
 753         $text = _EncodeAmpsAndAngles($text);
 754
 755         $text = _DoItalicsAndBold($text);
 756
 757         # Do hard breaks:
 758         $text =~ s/ {2,}\n/ <br$g_empty_element_suffix\n/g;
 759
 760         return $text;
 761 }
 762
 763
 764 sub _EscapeSpecialChars {
 765         my $text = shift;
 766         my $tokens ||= _TokenizeHTML($text);
 767
 768         $text = '';   # rebuild $text from the tokens
 769 #       my $in_pre = 0;  # Keep track of when we're inside <pre> or <code> tags.
 770 #       my $tags_to_skip = qr!<(/?)(?:pre|code|kbd|script|math)[\s>]!;
 771
 772         foreach my $cur_token (@$tokens) {
 773                 if ($cur_token->[0] eq "tag") {
 774                         # Within tags, encode * and _ so they don't conflict
 775                         # with their use in Markdown for italics and strong.
 776                         # We're replacing each such character with its
 777                         # corresponding MD5 checksum value; this is likely
 778                         # overkill, but it should prevent us from colliding
 779                         # with the escape values by accident.
 780                         $cur_token->[1] =~  s! \* !$g_escape_table{'*'}!gx;
 781                         $cur_token->[1] =~  s! _  !$g_escape_table{'_'}!gx;
 782                         $text .= $cur_token->[1];
 783                 } else {
 784                         my $t = $cur_token->[1];
 785                         $t = _EncodeBackslashEscapes($t);
 786                         $text .= $t;
 787                 }
 788         }
 789         return $text;
 790 }
 791
 792
 793 sub _DoAnchors {
 794 #
 795 # Turn Markdown link shortcuts into XHTML <a> tags.
 796 #
 797         my $text = shift;
 798
 799         #
 800         # First, handle reference-style links: [link text] [id]
 801         #
 802         $text =~ s{
 803                 (                                       # wrap whole match in $1
 804                   \[
 805                     ($g_nested_brackets)        # link text = $2
 806                   \]
 807
 808                   [ ]?                          # one optional space
 809                   (?:\n[ ]*)?           # one optional newline followed by spaces
 810
 811                   \[
 812                     (.*?)               # id = $3
 813                   \]
 814                 )
 815         }{
 816                 my $result;
 817                 my $whole_match = $1;
 818                 my $link_text   = $2;
 819                 my $link_id     = lc $3;
 820
 821                 if ($link_id eq "") {
 822                         $link_id = lc $link_text;     # for shortcut links like [this][].
 823                 }
 824
 825                 if (defined $g_urls{$link_id}) {
 826                         my $url = $g_urls{$link_id};
 827                         $url =~ s! \* !$g_escape_table{'*'}!gx;         # We've got to encode these to avoid
 828                         $url =~ s!  _ !$g_escape_table{'_'}!gx;         # conflicting with italics/bold.
 829                         $result = "<a href=\"$url\"";
 830                         if ( defined $g_titles{$link_id} ) {
 831                                 my $title = $g_titles{$link_id};
 832                                 $title =~ s! \* !$g_escape_table{'*'}!gx;
 833                                 $title =~ s!  _ !$g_escape_table{'_'}!gx;
 834                                 $result .=  " title=\"$title\"";
 835                         }
 836                         $result .= ">$link_text</a>";
 837                 }
 838                 else {
 839                         $result = $whole_match;
 840                 }
 841                 $result;
 842         }xsge;
 843
 844         #
 845         # Next, inline-style links: [link text](url "optional title")
 846         #
 847         $text =~ s{
 848                 (                               # wrap whole match in $1
 849                   \[
 850                     ($g_nested_brackets)        # link text = $2
 851                   \]
 852                   \(                    # literal paren
 853                         [ \t]*
 854                         <?(.*?)>?       # href = $3
 855                         [ \t]*
 856                         (                       # $4
 857                           (['"])        # quote char = $5
 858                           (.*?)         # Title = $6
 859                           \5            # matching quote
 860                         )?                      # title is optional
 861                   \)
 862                 )
 863         }{
 864                 my $result;
 865                 my $whole_match = $1;
 866                 my $link_text   = $2;
 867                 my $url                 = $3;
 868                 my $title               = $6;
 869
 870                 $url =~ s! \* !$g_escape_table{'*'}!gx;         # We've got to encode these to avoid
 871                 $url =~ s!  _ !$g_escape_table{'_'}!gx;         # conflicting with italics/bold.
 872                 $result = "<a href=\"$url\"";
 873
 874                 if (defined $title) {
 875                         $title =~ s/"/&quot;/g;
 876                         $title =~ s! \* !$g_escape_table{'*'}!gx;
 877                         $title =~ s!  _ !$g_escape_table{'_'}!gx;
 878                         $result .=  " title=\"$title\"";
 879                 }
 880
 881                 $result .= ">$link_text</a>";
 882
 883                 $result;
 884         }xsge;
 885
 886         return $text;
 887 }
 888
 889
 890 sub _DoImages {
 891 #
 892 # Turn Markdown image shortcuts into <img> tags.
 893 #
 894         my $text = shift;
 895
 896         #
 897         # First, handle reference-style labeled images: ![alt text][id]
 898         #
 899         $text =~ s{
 900                 (                               # wrap whole match in $1
 901                   !\[
 902                     (.*?)               # alt text = $2
 903                   \]
 904
 905                   [ ]?                          # one optional space
 906                   (?:\n[ ]*)?           # one optional newline followed by spaces
 907
 908                   \[
 909                     (.*?)               # id = $3
 910                   \]
 911
 912                 )
 913         }{
 914                 my $result;
 915                 my $whole_match = $1;
 916                 my $alt_text    = $2;
 917                 my $link_id     = lc $3;
 918
 919                 if ($link_id eq "") {
 920                         $link_id = lc $alt_text;     # for shortcut links like ![this][].
 921                 }
 922
 923                 $alt_text =~ s/"/&quot;/g;
 924                 if (defined $g_urls{$link_id}) {
 925                         my $url = $g_urls{$link_id};
 926                         $url =~ s! \* !$g_escape_table{'*'}!gx;         # We've got to encode these to avoid
 927                         $url =~ s!  _ !$g_escape_table{'_'}!gx;         # conflicting with italics/bold.
 928                         $result = "<img src=\"$url\" alt=\"$alt_text\"";
 929                         if (defined $g_titles{$link_id}) {
 930                                 my $title = $g_titles{$link_id};
 931                                 $title =~ s! \* !$g_escape_table{'*'}!gx;
 932                                 $title =~ s!  _ !$g_escape_table{'_'}!gx;
 933                                 $result .=  " title=\"$title\"";
 934                         }
 935                         $result .= $g_empty_element_suffix;
 936                 }
 937                 else {
 938                         # If there's no such link ID, leave intact:
 939                         $result = $whole_match;
 940                 }
 941
 942                 $result;
 943         }xsge;
 944
 945         #
 946         # Next, handle inline images:  ![alt text](url "optional title")
 947         # Don't forget: encode * and _
 948
 949         $text =~ s{
 950                 (                               # wrap whole match in $1
 951                   !\[
 952                     (.*?)               # alt text = $2
 953                   \]
 954                   \(                    # literal paren
 955                         [ \t]*
 956                         <?(\S+?)>?      # src url = $3
 957                         [ \t]*
 958                         (                       # $4
 959                           (['"])        # quote char = $5
 960                           (.*?)         # title = $6
 961                           \5            # matching quote
 962                           [ \t]*
 963                         )?                      # title is optional
 964                   \)
 965                 )
 966         }{
 967                 my $result;
 968                 my $whole_match = $1;
 969                 my $alt_text    = $2;
 970                 my $url                 = $3;
 971                 my $title               = '';
 972                 if (defined($6)) {
 973                         $title          = $6;
 974                 }
 975
 976                 $alt_text =~ s/"/&quot;/g;
 977                 $title    =~ s/"/&quot;/g;
 978                 $url =~ s! \* !$g_escape_table{'*'}!gx;         # We've got to encode these to avoid
 979                 $url =~ s!  _ !$g_escape_table{'_'}!gx;         # conflicting with italics/bold.
 980                 $result = "<img src=\"$url\" alt=\"$alt_text\"";
 981                 if (defined $title) {
 982                         $title =~ s! \* !$g_escape_table{'*'}!gx;
 983                         $title =~ s!  _ !$g_escape_table{'_'}!gx;
 984                         $result .=  " title=\"$title\"";
 985                 }
 986                 $result .= $g_empty_element_suffix;
 987
 988                 $result;
 989         }xsge;
 990
 991         return $text;
 992 }
 993
 994
 995 sub _DoHeaders {
 996         my $text = shift;
 997
 998         # Setext-style headers:
 999         #         Header 1
1000         #         ========
1001         #
1002         #         Header 2
1003         #         --------
1004         #
1005         $text =~ s{ ^(.+)[ \t]*\n=+[ \t]*\n+ }{
1006                 "<h1>"  .  _RunSpanGamut($1)  .  "</h1>\n\n";
1007         }egmx;
1008
1009         $text =~ s{ ^(.+)[ \t]*\n-+[ \t]*\n+ }{
1010                 "<h2>"  .  _RunSpanGamut($1)  .  "</h2>\n\n";
1011         }egmx;
1012
1013
1014         # atx-style headers:
1015         #       # Header 1
1016         #       ## Header 2
1017         #       ## Header 2 with closing hashes ##
1018         #       ...
1019         #       ###### Header 6
1020         #
1021         $text =~ s{
1022                         ^(\#{1,6})      # $1 = string of #'s
1023                         [ \t]*
1024                         (.+?)           # $2 = Header text
1025                         [ \t]*
1026                         \#*                     # optional closing #'s (not counted)
1027                         \n+
1028                 }{
1029                         my $h_level = length($1);
1030                         "<h$h_level>"  .  _RunSpanGamut($2)  .  "</h$h_level>\n\n";
1031                 }egmx;
1032
1033         return $text;
1034 }
1035
1036
1037 sub _DoLists {
1038 #
1039 # Form HTML ordered (numbered) and unordered (bulleted) lists.
1040 #
1041         my $text = shift;
1042         my $less_than_tab = $g_tab_width - 1;
1043
1044         # Re-usable patterns to match list item bullets and number markers:
1045         my $marker_ul  = qr/[*+-]/;
1046         my $marker_ol  = qr/\d+[.]/;
1047         my $marker_any = qr/(?:$marker_ul|$marker_ol)/;
1048
1049         # Re-usable pattern to match any entirel ul or ol list:
1050         my $whole_list = qr{
1051                 (                                                               # $1 = whole list
1052                   (                                                             # $2
1053                         [ ]{0,$less_than_tab}
1054                         (${marker_any})                         # $3 = first list item marker
1055                         [ \t]+
1056                   )
1057                   (?s:.+?)
1058                   (                                                             # $4
1059                           \z
1060                         |
1061                           \n{2,}
1062                           (?=\S)
1063                           (?!                                           # Negative lookahead for another list item marker
1064                                 [ \t]*
1065                                 ${marker_any}[ \t]+
1066                           )
1067                   )
1068                 )
1069         }mx;
1070
1071         # We use a different prefix before nested lists than top-level lists.
1072         # See extended comment in _ProcessListItems().
1073         #
1074         # Note: There's a bit of duplication here. My original implementation
1075         # created a scalar regex pattern as the conditional result of the test on
1076         # $g_list_level, and then only ran the $text =~ s{...}{...}egmx
1077         # substitution once, using the scalar as the pattern. This worked,
1078         # everywhere except when running under MT on my hosting account at Pair
1079         # Networks. There, this caused all rebuilds to be killed by the reaper (or
1080         # perhaps they crashed, but that seems incredibly unlikely given that the
1081         # same script on the same server ran fine *except* under MT. I've spent
1082         # more time trying to figure out why this is happening than I'd like to
1083         # admit. My only guess, backed up by the fact that this workaround works,
1084         # is that Perl optimizes the substition when it can figure out that the
1085         # pattern will never change, and when this optimization isn't on, we run
1086         # afoul of the reaper. Thus, the slightly redundant code to that uses two
1087         # static s/// patterns rather than one conditional pattern.
1088
1089         if ($g_list_level) {
1090                 $text =~ s{
1091                                 ^
1092                                 $whole_list
1093                         }{
1094                                 my $list = $1;
1095                                 my $list_type = ($3 =~ m/$marker_ul/) ? "ul" : "ol";
1096                                 # Turn double returns into triple returns, so that we can make a
1097                                 # paragraph for the last item in a list, if necessary:
1098                                 $list =~ s/\n{2,}/\n\n\n/g;
1099                                 my $result = _ProcessListItems($list, $marker_any);
1100                                 $result = "<$list_type>\n" . $result . "</$list_type>\n";
1101                                 $result;
1102                         }egmx;
1103         }
1104         else {
1105                 $text =~ s{
1106                                 (?:(?<=\n\n)|\A\n?)
1107                                 $whole_list
1108                         }{
1109                                 my $list = $1;
1110                                 my $list_type = ($3 =~ m/$marker_ul/) ? "ul" : "ol";
1111                                 # Turn double returns into triple returns, so that we can make a
1112                                 # paragraph for the last item in a list, if necessary:
1113                                 $list =~ s/\n{2,}/\n\n\n/g;
1114                                 my $result = _ProcessListItems($list, $marker_any);
1115                                 $result = "<$list_type>\n" . $result . "</$list_type>\n";
1116                                 $result;
1117                         }egmx;
1118         }
1119
1120
1121         return $text;
1122 }
1123
1124
1125 sub _ProcessListItems {
1126 #
1127 #       Process the contents of a single ordered or unordered list, splitting it
1128 #       into individual list items.
1129 #
1130
1131         my $list_str = shift;
1132         my $marker_any = shift;
1133
1134
1135         # The $g_list_level global keeps track of when we're inside a list.
1136         # Each time we enter a list, we increment it; when we leave a list,
1137         # we decrement. If it's zero, we're not in a list anymore.
1138         #
1139         # We do this because when we're not inside a list, we want to treat
1140         # something like this:
1141         #
1142         #               I recommend upgrading to version
1143         #               8. Oops, now this line is treated
1144         #               as a sub-list.
1145         #
1146         # As a single paragraph, despite the fact that the second line starts
1147         # with a digit-period-space sequence.
1148         #
1149         # Whereas when we're inside a list (or sub-list), that line will be
1150         # treated as the start of a sub-list. What a kludge, huh? This is
1151         # an aspect of Markdown's syntax that's hard to parse perfectly
1152         # without resorting to mind-reading. Perhaps the solution is to
1153         # change the syntax rules such that sub-lists must start with a
1154         # starting cardinal number; e.g. "1." or "a.".
1155
1156         $g_list_level++;
1157
1158         # trim trailing blank lines:
1159         $list_str =~ s/\n{2,}\z/\n/;
1160
1161
1162         $list_str =~ s{
1163                 (\n)?                                                   # leading line = $1
1164                 (^[ \t]*)                                               # leading whitespace = $2
1165                 ($marker_any) [ \t]+                    # list marker = $3
1166                 ((?s:.+?)                                               # list item text   = $4
1167                 (\n{1,2}))
1168                 (?= \n* (\z | \2 ($marker_any) [ \t]+))
1169         }{
1170                 my $item = $4;
1171                 my $leading_line = $1;
1172                 my $leading_space = $2;
1173
1174                 if ($leading_line or ($item =~ m/\n{2,}/)) {
1175                         $item = _RunBlockGamut(_Outdent($item));
1176                 }
1177                 else {
1178                         # Recursion for sub-lists:
1179                         $item = _DoLists(_Outdent($item));
1180                         chomp $item;
1181                         $item = _RunSpanGamut($item);
1182                 }
1183
1184                 "<li>" . $item . "</li>\n";
1185         }egmx;
1186
1187         $g_list_level--;
1188         return $list_str;
1189 }
1190
1191
1192
1193 sub _DoCodeBlocks {
1194 #
1195 #       Process Markdown `<pre><code>` blocks.
1196 #
1197
1198         my $text = shift;
1199
1200         $text =~ s{
1201                         (?:\n\n|\A)
1202                         (                   # $1 = the code block -- one or more lines, starting with a space/tab
1203                           (?:
1204                             (?:[ ]{$g_tab_width} | \t)  # Lines must start with a tab or a tab-width of spaces
1205                             .*\n+
1206                           )+
1207                         )
1208                         ((?=^[ ]{0,$g_tab_width}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
1209                 }{
1210                         my $codeblock = $1;
1211                         my $result; # return value
1212
1213                         $codeblock = _EncodeCode(_Outdent($codeblock));
1214                         $codeblock = _Detab($codeblock);
1215                         $codeblock =~ s/\A\n+//; # trim leading newlines
1216                         $codeblock =~ s/\s+\z//; # trim trailing whitespace
1217
1218                         $result = "\n\n<pre><code>" . $codeblock . "\n</code></pre>\n\n";
1219
1220                         $result;
1221                 }egmx;
1222
1223         return $text;
1224 }
1225
1226
1227 sub _DoCodeSpans {
1228 #
1229 #       *       Backtick quotes are used for <code></code> spans.
1230 #
1231 #       *       You can use multiple backticks as the delimiters if you want to
1232 #               include literal backticks in the code span. So, this input:
1233 #
1234 #         Just type ``foo `bar` baz`` at the prompt.
1235 #
1236 #       Will translate to:
1237 #
1238 #         <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
1239 #
1240 #               There's no arbitrary limit to the number of backticks you
1241 #               can use as delimters. If you need three consecutive backticks
1242 #               in your code, use four for delimiters, etc.
1243 #
1244 #       *       You can use spaces to get literal backticks at the edges:
1245 #
1246 #         ... type `` `bar` `` ...
1247 #
1248 #       Turns to:
1249 #
1250 #         ... type <code>`bar`</code> ...
1251 #
1252
1253         my $text = shift;
1254
1255         $text =~ s@
1256                         (`+)            # $1 = Opening run of `
1257                         (.+?)           # $2 = The code block
1258                         (?<!`)
1259                         \1                      # Matching closer
1260                         (?!`)
1261                 @
1262                         my $c = "$2";
1263                         $c =~ s/^[ \t]*//g; # leading whitespace
1264                         $c =~ s/[ \t]*$//g; # trailing whitespace
1265                         $c = _EncodeCode($c);
1266                         "<code>$c</code>";
1267                 @egsx;
1268
1269         return $text;
1270 }
1271
1272
1273 sub _EncodeCode {
1274 #
1275 # Encode/escape certain characters inside Markdown code runs.
1276 # The point is that in code, these characters are literals,
1277 # and lose their special Markdown meanings.
1278 #
1279     local $_ = shift;
1280
1281         # Encode all ampersands; HTML entities are not
1282         # entities within a Markdown code span.
1283         s/&/&amp;/g;
1284
1285         # Encode $'s, but only if we're running under Blosxom.
1286         # (Blosxom interpolates Perl variables in article bodies.)
1287         {
1288                 no warnings 'once';
1289         if (defined($blosxom::version)) {
1290                 s/\$/&#036;/g;
1291         }
1292     }
1293
1294
1295         # Do the angle bracket song and dance:
1296         s! <  !&lt;!gx;
1297         s! >  !&gt;!gx;
1298
1299         # Now, escape characters that are magic in Markdown:
1300         s! \* !$g_escape_table{'*'}!gx;
1301         s! _  !$g_escape_table{'_'}!gx;
1302         s! {  !$g_escape_table{'{'}!gx;
1303         s! }  !$g_escape_table{'}'}!gx;
1304         s! \[ !$g_escape_table{'['}!gx;
1305         s! \] !$g_escape_table{']'}!gx;
1306         s! \\ !$g_escape_table{'\\'}!gx;
1307
1308         return $_;
1309 }
1310
1311
1312 sub _DoItalicsAndBold {
1313         my $text = shift;
1314
1315         # <strong> must go first:
1316         $text =~ s{ (\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 }
1317                 {<strong>$2</strong>}gsx;
1318
1319         $text =~ s{ (\*|_) (?=\S) (.+?) (?<=\S) \1 }
1320                 {<em>$2</em>}gsx;
1321
1322         return $text;
1323 }
1324
1325
1326 sub _DoBlockQuotes {
1327         my $text = shift;
1328
1329         $text =~ s{
1330                   (                                                             # Wrap whole match in $1
1331                         (
1332                           ^[ \t]*>[ \t]?                        # '>' at the start of a line
1333                             .+\n                                        # rest of the first line
1334                           (.+\n)*                                       # subsequent consecutive lines
1335                           \n*                                           # blanks
1336                         )+
1337                   )
1338                 }{
1339                         my $bq = $1;
1340                         $bq =~ s/^[ \t]*>[ \t]?//gm;    # trim one level of quoting
1341                         $bq =~ s/^[ \t]+$//mg;                  # trim whitespace-only lines
1342                         $bq = _RunBlockGamut($bq);              # recurse
1343
1344                         $bq =~ s/^/  /g;
1345                         # These leading spaces screw with <pre> content, so we need to fix that:
1346                         $bq =~ s{
1347                                         (\s*<pre>.+?</pre>)
1348                                 }{
1349                                         my $pre = $1;
1350                                         $pre =~ s/^  //mg;
1351                                         $pre;
1352                                 }egsx;
1353
1354                         "<blockquote>\n$bq\n</blockquote>\n\n";
1355                 }egmx;
1356
1357
1358         return $text;
1359 }
1360
1361
1362 sub _FormParagraphs {
1363 #
1364 #       Params:
1365 #               $text - string to process with html <p> tags
1366 #
1367         my $text = shift;
1368
1369         # Strip leading and trailing lines:
1370         $text =~ s/\A\n+//;
1371         $text =~ s/\n+\z//;
1372
1373         my @grafs = split(/\n{2,}/, $text);
1374
1375         #
1376         # Wrap <p> tags.
1377         #
1378         foreach (@grafs) {
1379                 unless (defined( $g_html_blocks{$_} )) {
1380                         $_ = _RunSpanGamut($_);
1381                         s/^([ \t]*)/<p>/;
1382                         $_ .= "</p>";
1383                 }
1384         }
1385
1386         #
1387         # Unhashify HTML blocks
1388         #
1389         foreach (@grafs) {
1390                 if (defined( $g_html_blocks{$_} )) {
1391                         $_ = $g_html_blocks{$_};
1392                 }
1393         }
1394
1395         return join "\n\n", @grafs;
1396 }
1397
1398
1399 sub _EncodeAmpsAndAngles {
1400 # Smart processing for ampersands and angle brackets that need to be encoded.
1401
1402         my $text = shift;
1403
1404         # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
1405         #   http://bumppo.net/projects/amputator/
1406         $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;
1407
1408         # Encode naked <'s
1409         $text =~ s{<(?![a-z/?\$!])}{&lt;}gi;
1410
1411         return $text;
1412 }
1413
1414
1415 sub _EncodeBackslashEscapes {
1416 #
1417 #   Parameter:  String.
1418 #   Returns:    The string, with after processing the following backslash
1419 #               escape sequences.
1420 #
1421     local $_ = shift;
1422
1423     s! \\\\  !$g_escape_table{'\\'}!gx;         # Must process escaped backslashes first.
1424     s! \\`   !$g_escape_table{'`'}!gx;
1425     s! \\\*  !$g_escape_table{'*'}!gx;
1426     s! \\_   !$g_escape_table{'_'}!gx;
1427     s! \\\{  !$g_escape_table{'{'}!gx;
1428     s! \\\}  !$g_escape_table{'}'}!gx;
1429     s! \\\[  !$g_escape_table{'['}!gx;
1430     s! \\\]  !$g_escape_table{']'}!gx;
1431     s! \\\(  !$g_escape_table{'('}!gx;
1432     s! \\\)  !$g_escape_table{')'}!gx;
1433     s! \\>   !$g_escape_table{'>'}!gx;
1434     s! \\\#  !$g_escape_table{'#'}!gx;
1435     s! \\\+  !$g_escape_table{'+'}!gx;
1436     s! \\\-  !$g_escape_table{'-'}!gx;
1437     s! \\\.  !$g_escape_table{'.'}!gx;
1438     s{ \\!  }{$g_escape_table{'!'}}gx;
1439
1440     return $_;
1441 }
1442
1443
1444 sub _DoAutoLinks {
1445         my $text = shift;
1446
1447         $text =~ s{<((https?|ftp):[^'">\s]+)>}{<a href="$1">$1</a>}gi;
1448
1449         # Email addresses: <address@domain.foo>
1450         $text =~ s{
1451                 <
1452         (?:mailto:)?
1453                 (
1454                         [-.\w]+
1455                         \@
1456                         [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
1457                 )
1458                 >
1459         }{
1460                 _EncodeEmailAddress( _UnescapeSpecialChars($1) );
1461         }egix;
1462
1463         return $text;
1464 }
1465
1466
1467 sub _EncodeEmailAddress {
1468 #
1469 #       Input: an email address, e.g. "foo@example.com"
1470 #
1471 #       Output: the email address as a mailto link, with each character
1472 #               of the address encoded as either a decimal or hex entity, in
1473 #               the hopes of foiling most address harvesting spam bots. E.g.:
1474 #
1475 #         <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
1476 #       x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
1477 #       &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
1478 #
1479 #       Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
1480 #       mailing list: <http://tinyurl.com/yu7ue>
1481 #
1482
1483         my $addr = shift;
1484
1485         srand;
1486         my @encode = (
1487                 sub { '&#' .                 ord(shift)   . ';' },
1488                 sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' },
1489                 sub {                            shift          },
1490         );
1491
1492         $addr = "mailto:" . $addr;
1493
1494         $addr =~ s{(.)}{
1495                 my $char = $1;
1496                 if ( $char eq '@' ) {
1497                         # this *must* be encoded. I insist.
1498                         $char = $encode[int rand 1]->($char);
1499                 } elsif ( $char ne ':' ) {
1500                         # leave ':' alone (to spot mailto: later)
1501                         my $r = rand;
1502                         # roughly 10% raw, 45% hex, 45% dec
1503                         $char = (
1504                                 $r > .9   ?  $encode[2]->($char)  :
1505                                 $r < .45  ?  $encode[1]->($char)  :
1506                                                          $encode[0]->($char)
1507                         );
1508                 }
1509                 $char;
1510         }gex;
1511
1512         $addr = qq{<a href="$addr">$addr</a>};
1513         $addr =~ s{">.+?:}{">}; # strip the mailto: from the visible part
1514
1515         return $addr;
1516 }
1517
1518
1519 sub _UnescapeSpecialChars {
1520 #
1521 # Swap back in all the special characters we've hidden.
1522 #
1523         my $text = shift;
1524
1525         while( my($char, $hash) = each(%g_escape_table) ) {
1526                 $text =~ s/$hash/$char/g;
1527         }
1528     return $text;
1529 }
1530
1531
1532 sub _TokenizeHTML {
1533 #
1534 #   Parameter:  String containing HTML markup.
1535 #   Returns:    Reference to an array of the tokens comprising the input
1536 #               string. Each token is either a tag (possibly with nested,
1537 #               tags contained therein, such as <a href="<MTFoo>">, or a
1538 #               run of text between tags. Each element of the array is a
1539 #               two-element array; the first is either 'tag' or 'text';
1540 #               the second is the actual value.
1541 #
1542 #
1543 #   Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin.
1544 #       <http://www.bradchoate.com/past/mtregex.php>
1545 #
1546
1547     my $str = shift;
1548     my $pos = 0;
1549     my $len = length $str;
1550     my @tokens;
1551
1552     my $depth = 6;
1553     my $nested_tags = join('|', ('(?:<[a-z/!$](?:[^<>]') x $depth) . (')*>)' x  $depth);
1554     my $match = qr/(?s: <! ( -- .*? -- \s* )+ > ) |  # comment
1555                    (?s: <\? .*? \?> ) |              # processing instruction
1556                    $nested_tags/ix;                   # nested tags
1557
1558     while ($str =~ m/($match)/g) {
1559         my $whole_tag = $1;
1560         my $sec_start = pos $str;
1561         my $tag_start = $sec_start - length $whole_tag;
1562         if ($pos < $tag_start) {
1563             push @tokens, ['text', substr($str, $pos, $tag_start - $pos)];
1564         }
1565         push @tokens, ['tag', $whole_tag];
1566         $pos = pos $str;
1567     }
1568     push @tokens, ['text', substr($str, $pos, $len - $pos)] if $pos < $len;
1569     \@tokens;
1570 }
1571
1572
1573 sub _Outdent {
1574 #
1575 # Remove one level of line-leading tabs or spaces
1576 #
1577         my $text = shift;
1578
1579         $text =~ s/^(\t|[ ]{1,$g_tab_width})//gm;
1580         return $text;
1581 }
1582
1583
1584 sub _Detab {
1585 #
1586 # Cribbed from a post by Bart Lateur:
1587 # <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>
1588 #
1589         my $text = shift;
1590
1591         $text =~ s{(.*?)\t}{$1.(' ' x ($g_tab_width - length($1) % $g_tab_width))}ge;
1592         return $text;
1593 }
1594
1595
1596 1;
1597
1598 __END__
1599
1600
1601 =pod
1602
1603 =head1 NAME
1604
1605 B<Markdown>
1606
1607
1608 =head1 SYNOPSIS
1609
1610 B<Markdown.pl> [ B<--html4tags> ] [ B<--version> ] [ B<-shortversion> ]
1611     [ I<file> ... ]
1612
1613
1614 =head1 DESCRIPTION
1615
1616 Markdown is a text-to-HTML filter; it translates an easy-to-read /
1617 easy-to-write structured text format into HTML. Markdown's text format
1618 is most similar to that of plain text email, and supports features such
1619 as headers, *emphasis*, code blocks, blockquotes, and links.
1620
1621 Markdown's syntax is designed not as a generic markup language, but
1622 specifically to serve as a front-end to (X)HTML. You can  use span-level
1623 HTML tags anywhere in a Markdown document, and you can use block level
1624 HTML tags (like <div> and <table> as well).
1625
1626 For more information about Markdown's syntax, see:
1627
1628     http://daringfireball.net/projects/markdown/
1629
1630
1631 =head1 OPTIONS
1632
1633 Use "--" to end switch parsing. For example, to open a file named "-z", use:
1634
1635         Markdown.pl -- -z
1636
1637 =over 4
1638
1639
1640 =item B<--html4tags>
1641
1642 Use HTML 4 style for empty element tags, e.g.:
1643
1644     <br>
1645
1646 instead of Markdown's default XHTML style tags, e.g.:
1647
1648     <br />
1649
1650
1651 =item B<-v>, B<--version>
1652
1653 Display Markdown's version number and copyright information.
1654
1655
1656 =item B<-s>, B<--shortversion>
1657
1658 Display the short-form version number.
1659
1660
1661 =back
1662
1663
1664
1665 =head1 BUGS
1666
1667 To file bug reports or feature requests (other than topics listed in the
1668 Caveats section above) please send email to:
1669
1670     support@daringfireball.net
1671
1672 Please include with your report: (1) the example input; (2) the output
1673 you expected; (3) the output Markdown actually produced.
1674
1675
1676 =head1 VERSION HISTORY
1677
1678 See the readme file for detailed release notes for this version.
1679
1680 1.0.1 - 14 Dec 2004
1681
1682 1.0 - 28 Aug 2004
1683
1684
1685 =head1 AUTHOR
1686
1687     John Gruber
1688     http://daringfireball.net
1689
1690     PHP port and other contributions by Michel Fortin
1691     http://michelf.com
1692
1693
1694 =head1 COPYRIGHT AND LICENSE
1695
1696 Copyright (c) 2003-2004 John Gruber
1697 <http://daringfireball.net/>
1698 All rights reserved.
1699
1700 Redistribution and use in source and binary forms, with or without
1701 modification, are permitted provided that the following conditions are
1702 met:
1703
1704 * Redistributions of source code must retain the above copyright notice,
1705   this list of conditions and the following disclaimer.
1706
1707 * Redistributions in binary form must reproduce the above copyright
1708   notice, this list of conditions and the following disclaimer in the
1709   documentation and/or other materials provided with the distribution.
1710
1711 * Neither the name "Markdown" nor the names of its contributors may
1712   be used to endorse or promote products derived from this software
1713   without specific prior written permission.
1714
1715 This software is provided by the copyright holders and contributors "as
1716 is" and any express or implied warranties, including, but not limited
1717 to, the implied warranties of merchantability and fitness for a
1718 particular purpose are disclaimed. In no event shall the copyright owner
1719 or contributors be liable for any direct, indirect, incidental, special,
1720 exemplary, or consequential damages (including, but not limited to,
1721 procurement of substitute goods or services; loss of use, data, or
1722 profits; or business interruption) however caused and on any theory of
1723 liability, whether in contract, strict liability, or tort (including
1724 negligence or otherwise) arising in any way out of the use of this
1725 software, even if advised of the possibility of such damage.
1726
1727 =cut