gitweb.ps.run Git - ps-cgit/blob - filters/html-converters/resources/markdown.pl

   1 #!/usr/bin/perl
   2
   3 #
   4 # Markdown -- A text-to-HTML conversion tool for web writers
   5 #
   6 # Copyright (c) 2004 John Gruber
   7 # <http://daringfireball.net/projects/markdown/>
   8 #
   9
  10
  11 package Markdown;
  12 require 5.006_000;
  13 use strict;
  14 use warnings;
  15
  16 use Digest::MD5 qw(md5_hex);
  17 use vars qw($VERSION);
  18 $VERSION = '1.0.1';
  19 # Tue 14 Dec 2004
  20
  21 ## Disabled; causes problems under Perl 5.6.1:
  22 use utf8;
  23 binmode( STDOUT, ":utf8" );  # c.f.: http://acis.openlib.org/dev/perl-unicode-struggle.html
  24
  25
  26 #
  27 # Global default settings:
  28 #
  29 my $g_empty_element_suffix = " />";     # Change to ">" for HTML output
  30 my $g_tab_width = 4;
  31
  32
  33 #
  34 # Globals:
  35 #
  36
  37 # Regex to match balanced [brackets]. See Friedl's
  38 # "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
  39 my $g_nested_brackets;
  40 $g_nested_brackets = qr{
  41         (?>                                                             # Atomic matching
  42            [^\[\]]+                                                     # Anything other than brackets
  43          |
  44            \[
  45                  (??{ $g_nested_brackets })             # Recursive set of nested brackets
  46            \]
  47         )*
  48 }x;
  49
  50
  51 # Table of hash values for escaped characters:
  52 my %g_escape_table;
  53 foreach my $char (split //, '\\`*_{}[]()>#+-.!') {
  54         $g_escape_table{$char} = md5_hex($char);
  55 }
  56
  57
  58 # Global hashes, used by various utility routines
  59 my %g_urls;
  60 my %g_titles;
  61 my %g_html_blocks;
  62
  63 # Used to track when we're inside an ordered or unordered list
  64 # (see _ProcessListItems() for details):
  65 my $g_list_level = 0;
  66
  67
  68 #### Blosxom plug-in interface ##########################################
  69
  70 # Set $g_blosxom_use_meta to 1 to use Blosxom's meta plug-in to determine
  71 # which posts Markdown should process, using a "meta-markup: markdown"
  72 # header. If it's set to 0 (the default), Markdown will process all
  73 # entries.
  74 my $g_blosxom_use_meta = 0;
  75
  76 sub start { 1; }
  77 sub story {
  78         my($pkg, $path, $filename, $story_ref, $title_ref, $body_ref) = @_;
  79
  80         if ( (! $g_blosxom_use_meta) or
  81              (defined($meta::markup) and ($meta::markup =~ /^\s*markdown\s*$/i))
  82              ){
  83                         $$body_ref  = Markdown($$body_ref);
  84      }
  85      1;
  86 }
  87
  88
  89 #### Movable Type plug-in interface #####################################
  90 eval {require MT};  # Test to see if we're running in MT.
  91 unless ($@) {
  92     require MT;
  93     import  MT;
  94     require MT::Template::Context;
  95     import  MT::Template::Context;
  96
  97         eval {require MT::Plugin};  # Test to see if we're running >= MT 3.0.
  98         unless ($@) {
  99                 require MT::Plugin;
 100                 import  MT::Plugin;
 101                 my $plugin = new MT::Plugin({
 102                         name => "Markdown",
 103                         description => "A plain-text-to-HTML formatting plugin. (Version: $VERSION)",
 104                         doc_link => 'http://daringfireball.net/projects/markdown/'
 105                 });
 106                 MT->add_plugin( $plugin );
 107         }
 108
 109         MT::Template::Context->add_container_tag(MarkdownOptions => sub {
 110                 my $ctx  = shift;
 111                 my $args = shift;
 112                 my $builder = $ctx->stash('builder');
 113                 my $tokens = $ctx->stash('tokens');
 114
 115                 if (defined ($args->{'output'}) ) {
 116                         $ctx->stash('markdown_output', lc $args->{'output'});
 117                 }
 118
 119                 defined (my $str = $builder->build($ctx, $tokens) )
 120                         or return $ctx->error($builder->errstr);
 121                 $str;           # return value
 122         });
 123
 124         MT->add_text_filter('markdown' => {
 125                 label     => 'Markdown',
 126                 docs      => 'http://daringfireball.net/projects/markdown/',
 127                 on_format => sub {
 128                         my $text = shift;
 129                         my $ctx  = shift;
 130                         my $raw  = 0;
 131                     if (defined $ctx) {
 132                         my $output = $ctx->stash('markdown_output');
 133                                 if (defined $output  &&  $output =~ m/^html/i) {
 134                                         $g_empty_element_suffix = ">";
 135                                         $ctx->stash('markdown_output', '');
 136                                 }
 137                                 elsif (defined $output  &&  $output eq 'raw') {
 138                                         $raw = 1;
 139                                         $ctx->stash('markdown_output', '');
 140                                 }
 141                                 else {
 142                                         $raw = 0;
 143                                         $g_empty_element_suffix = " />";
 144                                 }
 145                         }
 146                         $text = $raw ? $text : Markdown($text);
 147                         $text;
 148                 },
 149         });
 150
 151         # If SmartyPants is loaded, add a combo Markdown/SmartyPants text filter:
 152         my $smartypants;
 153
 154         {
 155                 no warnings "once";
 156                 $smartypants = $MT::Template::Context::Global_filters{'smarty_pants'};
 157         }
 158
 159         if ($smartypants) {
 160                 MT->add_text_filter('markdown_with_smartypants' => {
 161                         label     => 'Markdown With SmartyPants',
 162                         docs      => 'http://daringfireball.net/projects/markdown/',
 163                         on_format => sub {
 164                                 my $text = shift;
 165                                 my $ctx  = shift;
 166                                 if (defined $ctx) {
 167                                         my $output = $ctx->stash('markdown_output');
 168                                         if (defined $output  &&  $output eq 'html') {
 169                                                 $g_empty_element_suffix = ">";
 170                                         }
 171                                         else {
 172                                                 $g_empty_element_suffix = " />";
 173                                         }
 174                                 }
 175                                 $text = Markdown($text);
 176                                 $text = $smartypants->($text, '1');
 177                         },
 178                 });
 179         }
 180 }
 181 else {
 182 #### BBEdit/command-line text filter interface ##########################
 183 # Needs to be hidden from MT (and Blosxom when running in static mode).
 184
 185     # We're only using $blosxom::version once; tell Perl not to warn us:
 186         no warnings 'once';
 187     unless ( defined($blosxom::version) ) {
 188                 use warnings;
 189
 190                 #### Check for command-line switches: #################
 191                 my %cli_opts;
 192                 use Getopt::Long;
 193                 Getopt::Long::Configure('pass_through');
 194                 GetOptions(\%cli_opts,
 195                         'version',
 196                         'shortversion',
 197                         'html4tags',
 198                 );
 199                 if ($cli_opts{'version'}) {             # Version info
 200                         print "\nThis is Markdown, version $VERSION.\n";
 201                         print "Copyright 2004 John Gruber\n";
 202                         print "http://daringfireball.net/projects/markdown/\n\n";
 203                         exit 0;
 204                 }
 205                 if ($cli_opts{'shortversion'}) {                # Just the version number string.
 206                         print $VERSION;
 207                         exit 0;
 208                 }
 209                 if ($cli_opts{'html4tags'}) {                   # Use HTML tag style instead of XHTML
 210                         $g_empty_element_suffix = ">";
 211                 }
 212
 213
 214                 #### Process incoming text: ###########################
 215                 my $text;
 216                 {
 217                         local $/;               # Slurp the whole file
 218                         $text = <>;
 219                 }
 220         print <<'EOT';
 221 <style>
 222 .markdown-body {
 223     font-size: 14px;
 224     line-height: 1.6;
 225     overflow: hidden;
 226 }
 227 .markdown-body>*:first-child {
 228     margin-top: 0 !important;
 229 }
 230 .markdown-body>*:last-child {
 231     margin-bottom: 0 !important;
 232 }
 233 .markdown-body a.absent {
 234     color: #c00;
 235 }
 236 .markdown-body a.anchor {
 237     display: block;
 238     padding-left: 30px;
 239     margin-left: -30px;
 240     cursor: pointer;
 241     position: absolute;
 242     top: 0;
 243     left: 0;
 244     bottom: 0;
 245 }
 246 .markdown-body h1, .markdown-body h2, .markdown-body h3, .markdown-body h4, .markdown-body h5, .markdown-body h6 {
 247     margin: 20px 0 10px;
 248     padding: 0;
 249     font-weight: bold;
 250     -webkit-font-smoothing: antialiased;
 251     cursor: text;
 252     position: relative;
 253 }
 254 .markdown-body h1 .mini-icon-link, .markdown-body h2 .mini-icon-link, .markdown-body h3 .mini-icon-link, .markdown-body h4 .mini-icon-link, .markdown-body h5 .mini-icon-link, .markdown-body h6 .mini-icon-link {
 255     display: none;
 256     color: #000;
 257 }
 258 .markdown-body h1:hover a.anchor, .markdown-body h2:hover a.anchor, .markdown-body h3:hover a.anchor, .markdown-body h4:hover a.anchor, .markdown-body h5:hover a.anchor, .markdown-body h6:hover a.anchor {
 259     text-decoration: none;
 260     line-height: 1;
 261     padding-left: 0;
 262     margin-left: -22px;
 263     top: 15%}
 264 .markdown-body h1:hover a.anchor .mini-icon-link, .markdown-body h2:hover a.anchor .mini-icon-link, .markdown-body h3:hover a.anchor .mini-icon-link, .markdown-body h4:hover a.anchor .mini-icon-link, .markdown-body h5:hover a.anchor .mini-icon-link, .markdown-body h6:hover a.anchor .mini-icon-link {
 265     display: inline-block;
 266 }
 267 .markdown-body h1 tt, .markdown-body h1 code, .markdown-body h2 tt, .markdown-body h2 code, .markdown-body h3 tt, .markdown-body h3 code, .markdown-body h4 tt, .markdown-body h4 code, .markdown-body h5 tt, .markdown-body h5 code, .markdown-body h6 tt, .markdown-body h6 code {
 268     font-size: inherit;
 269 }
 270 .markdown-body h1 {
 271     font-size: 28px;
 272     color: #000;
 273 }
 274 .markdown-body h2 {
 275     font-size: 24px;
 276     border-bottom: 1px solid #ccc;
 277     color: #000;
 278 }
 279 .markdown-body h3 {
 280     font-size: 18px;
 281 }
 282 .markdown-body h4 {
 283     font-size: 16px;
 284 }
 285 .markdown-body h5 {
 286     font-size: 14px;
 287 }
 288 .markdown-body h6 {
 289     color: #777;
 290     font-size: 14px;
 291 }
 292 .markdown-body p, .markdown-body blockquote, .markdown-body ul, .markdown-body ol, .markdown-body dl, .markdown-body table, .markdown-body pre {
 293     margin: 15px 0;
 294 }
 295 .markdown-body hr {
 296     background: transparent url("/dirty-shade.png") repeat-x 0 0;
 297     border: 0 none;
 298     color: #ccc;
 299     height: 4px;
 300     padding: 0;
 301 }
 302 .markdown-body>h2:first-child, .markdown-body>h1:first-child, .markdown-body>h1:first-child+h2, .markdown-body>h3:first-child, .markdown-body>h4:first-child, .markdown-body>h5:first-child, .markdown-body>h6:first-child {
 303     margin-top: 0;
 304     padding-top: 0;
 305 }
 306 .markdown-body a:first-child h1, .markdown-body a:first-child h2, .markdown-body a:first-child h3, .markdown-body a:first-child h4, .markdown-body a:first-child h5, .markdown-body a:first-child h6 {
 307     margin-top: 0;
 308     padding-top: 0;
 309 }
 310 .markdown-body h1+p, .markdown-body h2+p, .markdown-body h3+p, .markdown-body h4+p, .markdown-body h5+p, .markdown-body h6+p {
 311     margin-top: 0;
 312 }
 313 .markdown-body li p.first {
 314     display: inline-block;
 315 }
 316 .markdown-body ul, .markdown-body ol {
 317     padding-left: 30px;
 318 }
 319 .markdown-body ul.no-list, .markdown-body ol.no-list {
 320     list-style-type: none;
 321     padding: 0;
 322 }
 323 .markdown-body ul li>:first-child, .markdown-body ul li ul:first-of-type, .markdown-body ul li ol:first-of-type, .markdown-body ol li>:first-child, .markdown-body ol li ul:first-of-type, .markdown-body ol li ol:first-of-type {
 324     margin-top: 0px;
 325 }
 326 .markdown-body ul li p:last-of-type, .markdown-body ol li p:last-of-type {
 327     margin-bottom: 0;
 328 }
 329 .markdown-body ul ul, .markdown-body ul ol, .markdown-body ol ol, .markdown-body ol ul {
 330     margin-bottom: 0;
 331 }
 332 .markdown-body dl {
 333     padding: 0;
 334 }
 335 .markdown-body dl dt {
 336     font-size: 14px;
 337     font-weight: bold;
 338     font-style: italic;
 339     padding: 0;
 340     margin: 15px 0 5px;
 341 }
 342 .markdown-body dl dt:first-child {
 343     padding: 0;
 344 }
 345 .markdown-body dl dt>:first-child {
 346     margin-top: 0px;
 347 }
 348 .markdown-body dl dt>:last-child {
 349     margin-bottom: 0px;
 350 }
 351 .markdown-body dl dd {
 352     margin: 0 0 15px;
 353     padding: 0 15px;
 354 }
 355 .markdown-body dl dd>:first-child {
 356     margin-top: 0px;
 357 }
 358 .markdown-body dl dd>:last-child {
 359     margin-bottom: 0px;
 360 }
 361 .markdown-body blockquote {
 362     border-left: 4px solid #DDD;
 363     padding: 0 15px;
 364     color: #777;
 365 }
 366 .markdown-body blockquote>:first-child {
 367     margin-top: 0px;
 368 }
 369 .markdown-body blockquote>:last-child {
 370     margin-bottom: 0px;
 371 }
 372 .markdown-body table th {
 373     font-weight: bold;
 374 }
 375 .markdown-body table th, .markdown-body table td {
 376     border: 1px solid #ccc;
 377     padding: 6px 13px;
 378 }
 379 .markdown-body table tr {
 380     border-top: 1px solid #ccc;
 381     background-color: #fff;
 382 }
 383 .markdown-body table tr:nth-child(2n) {
 384     background-color: #f8f8f8;
 385 }
 386 .markdown-body img {
 387     max-width: 100%;
 388     -moz-box-sizing: border-box;
 389     box-sizing: border-box;
 390 }
 391 .markdown-body span.frame {
 392     display: block;
 393     overflow: hidden;
 394 }
 395 .markdown-body span.frame>span {
 396     border: 1px solid #ddd;
 397     display: block;
 398     float: left;
 399     overflow: hidden;
 400     margin: 13px 0 0;
 401     padding: 7px;
 402     width: auto;
 403 }
 404 .markdown-body span.frame span img {
 405     display: block;
 406     float: left;
 407 }
 408 .markdown-body span.frame span span {
 409     clear: both;
 410     color: #333;
 411     display: block;
 412     padding: 5px 0 0;
 413 }
 414 .markdown-body span.align-center {
 415     display: block;
 416     overflow: hidden;
 417     clear: both;
 418 }
 419 .markdown-body span.align-center>span {
 420     display: block;
 421     overflow: hidden;
 422     margin: 13px auto 0;
 423     text-align: center;
 424 }
 425 .markdown-body span.align-center span img {
 426     margin: 0 auto;
 427     text-align: center;
 428 }
 429 .markdown-body span.align-right {
 430     display: block;
 431     overflow: hidden;
 432     clear: both;
 433 }
 434 .markdown-body span.align-right>span {
 435     display: block;
 436     overflow: hidden;
 437     margin: 13px 0 0;
 438     text-align: right;
 439 }
 440 .markdown-body span.align-right span img {
 441     margin: 0;
 442     text-align: right;
 443 }
 444 .markdown-body span.float-left {
 445     display: block;
 446     margin-right: 13px;
 447     overflow: hidden;
 448     float: left;
 449 }
 450 .markdown-body span.float-left span {
 451     margin: 13px 0 0;
 452 }
 453 .markdown-body span.float-right {
 454     display: block;
 455     margin-left: 13px;
 456     overflow: hidden;
 457     float: right;
 458 }
 459 .markdown-body span.float-right>span {
 460     display: block;
 461     overflow: hidden;
 462     margin: 13px auto 0;
 463     text-align: right;
 464 }
 465 .markdown-body code, .markdown-body tt {
 466     margin: 0 2px;
 467     padding: 0px 5px;
 468     border: 1px solid #eaeaea;
 469     background-color: #f8f8f8;
 470     border-radius: 3px;
 471 }
 472 .markdown-body code {
 473     white-space: nowrap;
 474 }
 475 .markdown-body pre>code {
 476     margin: 0;
 477     padding: 0;
 478     white-space: pre;
 479     border: none;
 480     background: transparent;
 481 }
 482 .markdown-body .highlight pre, .markdown-body pre {
 483     background-color: #f8f8f8;
 484     border: 1px solid #ccc;
 485     font-size: 13px;
 486     line-height: 19px;
 487     overflow: auto;
 488     padding: 6px 10px;
 489     border-radius: 3px;
 490 }
 491 .markdown-body pre code, .markdown-body pre tt {
 492     margin: 0;
 493     padding: 0;
 494     background-color: transparent;
 495     border: none;
 496 }
 497 </style>
 498 EOT
 499         print "<div class='markdown-body'>";
 500         print Markdown($text);
 501         print "</div>";
 502     }
 503 }
 504
 505
 506
 507 sub Markdown {
 508 #
 509 # Main function. The order in which other subs are called here is
 510 # essential. Link and image substitutions need to happen before
 511 # _EscapeSpecialChars(), so that any *'s or _'s in the <a>
 512 # and <img> tags get encoded.
 513 #
 514         my $text = shift;
 515
 516         # Clear the global hashes. If we don't clear these, you get conflicts
 517         # from other articles when generating a page which contains more than
 518         # one article (e.g. an index page that shows the N most recent
 519         # articles):
 520         %g_urls = ();
 521         %g_titles = ();
 522         %g_html_blocks = ();
 523
 524
 525         # Standardize line endings:
 526         $text =~ s{\r\n}{\n}g;  # DOS to Unix
 527         $text =~ s{\r}{\n}g;    # Mac to Unix
 528
 529         # Make sure $text ends with a couple of newlines:
 530         $text .= "\n\n";
 531
 532         # Convert all tabs to spaces.
 533         $text = _Detab($text);
 534
 535         # Strip any lines consisting only of spaces and tabs.
 536         # This makes subsequent regexen easier to write, because we can
 537         # match consecutive blank lines with /\n+/ instead of something
 538         # contorted like /[ \t]*\n+/ .
 539         $text =~ s/^[ \t]+$//mg;
 540
 541         # Turn block-level HTML blocks into hash entries
 542         $text = _HashHTMLBlocks($text);
 543
 544         # Strip link definitions, store in hashes.
 545         $text = _StripLinkDefinitions($text);
 546
 547         $text = _RunBlockGamut($text);
 548
 549         $text = _UnescapeSpecialChars($text);
 550
 551         return $text . "\n";
 552 }
 553
 554
 555 sub _StripLinkDefinitions {
 556 #
 557 # Strips link definitions from text, stores the URLs and titles in
 558 # hash references.
 559 #
 560         my $text = shift;
 561         my $less_than_tab = $g_tab_width - 1;
 562
 563         # Link defs are in the form: ^[id]: url "optional title"
 564         while ($text =~ s{
 565                                                 ^[ ]{0,$less_than_tab}\[(.+)\]: # id = $1
 566                                                   [ \t]*
 567                                                   \n?                           # maybe *one* newline
 568                                                   [ \t]*
 569                                                 <?(\S+?)>?                      # url = $2
 570                                                   [ \t]*
 571                                                   \n?                           # maybe one newline
 572                                                   [ \t]*
 573                                                 (?:
 574                                                         (?<=\s)                 # lookbehind for whitespace
 575                                                         ["(]
 576                                                         (.+?)                   # title = $3
 577                                                         [")]
 578                                                         [ \t]*
 579                                                 )?      # title is optional
 580                                                 (?:\n+|\Z)
 581                                         }
 582                                         {}mx) {
 583                 $g_urls{lc $1} = _EncodeAmpsAndAngles( $2 );    # Link IDs are case-insensitive
 584                 if ($3) {
 585                         $g_titles{lc $1} = $3;
 586                         $g_titles{lc $1} =~ s/"/&quot;/g;
 587                 }
 588         }
 589
 590         return $text;
 591 }
 592
 593
 594 sub _HashHTMLBlocks {
 595         my $text = shift;
 596         my $less_than_tab = $g_tab_width - 1;
 597
 598         # Hashify HTML blocks:
 599         # We only want to do this for block-level HTML tags, such as headers,
 600         # lists, and tables. That's because we still want to wrap <p>s around
 601         # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
 602         # phrase emphasis, and spans. The list of tags we're looking for is
 603         # hard-coded:
 604         my $block_tags_a = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del/;
 605         my $block_tags_b = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/;
 606
 607         # First, look for nested blocks, e.g.:
 608         #       <div>
 609         #               <div>
 610         #               tags for inner block must be indented.
 611         #               </div>
 612         #       </div>
 613         #
 614         # The outermost tags must start at the left margin for this to match, and
 615         # the inner nested divs must be indented.
 616         # We need to do this before the next, more liberal match, because the next
 617         # match will start at the first `<div>` and stop at the first `</div>`.
 618         $text =~ s{
 619                                 (                                               # save in $1
 620                                         ^                                       # start of line  (with /m)
 621                                         <($block_tags_a)        # start tag = $2
 622                                         \b                                      # word break
 623                                         (.*\n)*?                        # any number of lines, minimally matching
 624                                         </\2>                           # the matching end tag
 625                                         [ \t]*                          # trailing spaces/tabs
 626                                         (?=\n+|\Z)      # followed by a newline or end of document
 627                                 )
 628                         }{
 629                                 my $key = md5_hex($1);
 630                                 $g_html_blocks{$key} = $1;
 631                                 "\n\n" . $key . "\n\n";
 632                         }egmx;
 633
 634
 635         #
 636         # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
 637         #
 638         $text =~ s{
 639                                 (                                               # save in $1
 640                                         ^                                       # start of line  (with /m)
 641                                         <($block_tags_b)        # start tag = $2
 642                                         \b                                      # word break
 643                                         (.*\n)*?                        # any number of lines, minimally matching
 644                                         .*</\2>                         # the matching end tag
 645                                         [ \t]*                          # trailing spaces/tabs
 646                                         (?=\n+|\Z)      # followed by a newline or end of document
 647                                 )
 648                         }{
 649                                 my $key = md5_hex($1);
 650                                 $g_html_blocks{$key} = $1;
 651                                 "\n\n" . $key . "\n\n";
 652                         }egmx;
 653         # Special case just for <hr />. It was easier to make a special case than
 654         # to make the other regex more complicated.
 655         $text =~ s{
 656                                 (?:
 657                                         (?<=\n\n)               # Starting after a blank line
 658                                         |                               # or
 659                                         \A\n?                   # the beginning of the doc
 660                                 )
 661                                 (                                               # save in $1
 662                                         [ ]{0,$less_than_tab}
 663                                         <(hr)                           # start tag = $2
 664                                         \b                                      # word break
 665                                         ([^<>])*?                       #
 666                                         /?>                                     # the matching end tag
 667                                         [ \t]*
 668                                         (?=\n{2,}|\Z)           # followed by a blank line or end of document
 669                                 )
 670                         }{
 671                                 my $key = md5_hex($1);
 672                                 $g_html_blocks{$key} = $1;
 673                                 "\n\n" . $key . "\n\n";
 674                         }egx;
 675
 676         # Special case for standalone HTML comments:
 677         $text =~ s{
 678                                 (?:
 679                                         (?<=\n\n)               # Starting after a blank line
 680                                         |                               # or
 681                                         \A\n?                   # the beginning of the doc
 682                                 )
 683                                 (                                               # save in $1
 684                                         [ ]{0,$less_than_tab}
 685                                         (?s:
 686                                                 <!
 687                                                 (--.*?--\s*)+
 688                                                 >
 689                                         )
 690                                         [ \t]*
 691                                         (?=\n{2,}|\Z)           # followed by a blank line or end of document
 692                                 )
 693                         }{
 694                                 my $key = md5_hex($1);
 695                                 $g_html_blocks{$key} = $1;
 696                                 "\n\n" . $key . "\n\n";
 697                         }egx;
 698
 699
 700         return $text;
 701 }
 702
 703
 704 sub _RunBlockGamut {
 705 #
 706 # These are all the transformations that form block-level
 707 # tags like paragraphs, headers, and list items.
 708 #
 709         my $text = shift;
 710
 711         $text = _DoHeaders($text);
 712
 713         # Do Horizontal Rules:
 714         $text =~ s{^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx;
 715         $text =~ s{^[ ]{0,2}([ ]? -[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx;
 716         $text =~ s{^[ ]{0,2}([ ]? _[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx;
 717
 718         $text = _DoLists($text);
 719
 720         $text = _DoCodeBlocks($text);
 721
 722         $text = _DoBlockQuotes($text);
 723
 724         # We already ran _HashHTMLBlocks() before, in Markdown(), but that
 725         # was to escape raw HTML in the original Markdown source. This time,
 726         # we're escaping the markup we've just created, so that we don't wrap
 727         # <p> tags around block-level tags.
 728         $text = _HashHTMLBlocks($text);
 729
 730         $text = _FormParagraphs($text);
 731
 732         return $text;
 733 }
 734
 735
 736 sub _RunSpanGamut {
 737 #
 738 # These are all the transformations that occur *within* block-level
 739 # tags like paragraphs, headers, and list items.
 740 #
 741         my $text = shift;
 742
 743         $text = _DoCodeSpans($text);
 744
 745         $text = _EscapeSpecialChars($text);
 746
 747         # Process anchor and image tags. Images must come first,
 748         # because ![foo][f] looks like an anchor.
 749         $text = _DoImages($text);
 750         $text = _DoAnchors($text);
 751
 752         # Make links out of things like `<http://example.com/>`
 753         # Must come after _DoAnchors(), because you can use < and >
 754         # delimiters in inline links like [this](<url>).
 755         $text = _DoAutoLinks($text);
 756
 757         $text = _EncodeAmpsAndAngles($text);
 758
 759         $text = _DoItalicsAndBold($text);
 760
 761         # Do hard breaks:
 762         $text =~ s/ {2,}\n/ <br$g_empty_element_suffix\n/g;
 763
 764         return $text;
 765 }
 766
 767
 768 sub _EscapeSpecialChars {
 769         my $text = shift;
 770         my $tokens ||= _TokenizeHTML($text);
 771
 772         $text = '';   # rebuild $text from the tokens
 773 #       my $in_pre = 0;  # Keep track of when we're inside <pre> or <code> tags.
 774 #       my $tags_to_skip = qr!<(/?)(?:pre|code|kbd|script|math)[\s>]!;
 775
 776         foreach my $cur_token (@$tokens) {
 777                 if ($cur_token->[0] eq "tag") {
 778                         # Within tags, encode * and _ so they don't conflict
 779                         # with their use in Markdown for italics and strong.
 780                         # We're replacing each such character with its
 781                         # corresponding MD5 checksum value; this is likely
 782                         # overkill, but it should prevent us from colliding
 783                         # with the escape values by accident.
 784                         $cur_token->[1] =~  s! \* !$g_escape_table{'*'}!gx;
 785                         $cur_token->[1] =~  s! _  !$g_escape_table{'_'}!gx;
 786                         $text .= $cur_token->[1];
 787                 } else {
 788                         my $t = $cur_token->[1];
 789                         $t = _EncodeBackslashEscapes($t);
 790                         $text .= $t;
 791                 }
 792         }
 793         return $text;
 794 }
 795
 796
 797 sub _DoAnchors {
 798 #
 799 # Turn Markdown link shortcuts into XHTML <a> tags.
 800 #
 801         my $text = shift;
 802
 803         #
 804         # First, handle reference-style links: [link text] [id]
 805         #
 806         $text =~ s{
 807                 (                                       # wrap whole match in $1
 808                   \[
 809                     ($g_nested_brackets)        # link text = $2
 810                   \]
 811
 812                   [ ]?                          # one optional space
 813                   (?:\n[ ]*)?           # one optional newline followed by spaces
 814
 815                   \[
 816                     (.*?)               # id = $3
 817                   \]
 818                 )
 819         }{
 820                 my $result;
 821                 my $whole_match = $1;
 822                 my $link_text   = $2;
 823                 my $link_id     = lc $3;
 824
 825                 if ($link_id eq "") {
 826                         $link_id = lc $link_text;     # for shortcut links like [this][].
 827                 }
 828
 829                 if (defined $g_urls{$link_id}) {
 830                         my $url = $g_urls{$link_id};
 831                         $url =~ s! \* !$g_escape_table{'*'}!gx;         # We've got to encode these to avoid
 832                         $url =~ s!  _ !$g_escape_table{'_'}!gx;         # conflicting with italics/bold.
 833                         $result = "<a href=\"$url\"";
 834                         if ( defined $g_titles{$link_id} ) {
 835                                 my $title = $g_titles{$link_id};
 836                                 $title =~ s! \* !$g_escape_table{'*'}!gx;
 837                                 $title =~ s!  _ !$g_escape_table{'_'}!gx;
 838                                 $result .=  " title=\"$title\"";
 839                         }
 840                         $result .= ">$link_text</a>";
 841                 }
 842                 else {
 843                         $result = $whole_match;
 844                 }
 845                 $result;
 846         }xsge;
 847
 848         #
 849         # Next, inline-style links: [link text](url "optional title")
 850         #
 851         $text =~ s{
 852                 (                               # wrap whole match in $1
 853                   \[
 854                     ($g_nested_brackets)        # link text = $2
 855                   \]
 856                   \(                    # literal paren
 857                         [ \t]*
 858                         <?(.*?)>?       # href = $3
 859                         [ \t]*
 860                         (                       # $4
 861                           (['"])        # quote char = $5
 862                           (.*?)         # Title = $6
 863                           \5            # matching quote
 864                         )?                      # title is optional
 865                   \)
 866                 )
 867         }{
 868                 my $result;
 869                 my $whole_match = $1;
 870                 my $link_text   = $2;
 871                 my $url                 = $3;
 872                 my $title               = $6;
 873
 874                 $url =~ s! \* !$g_escape_table{'*'}!gx;         # We've got to encode these to avoid
 875                 $url =~ s!  _ !$g_escape_table{'_'}!gx;         # conflicting with italics/bold.
 876                 $result = "<a href=\"$url\"";
 877
 878                 if (defined $title) {
 879                         $title =~ s/"/&quot;/g;
 880                         $title =~ s! \* !$g_escape_table{'*'}!gx;
 881                         $title =~ s!  _ !$g_escape_table{'_'}!gx;
 882                         $result .=  " title=\"$title\"";
 883                 }
 884
 885                 $result .= ">$link_text</a>";
 886
 887                 $result;
 888         }xsge;
 889
 890         return $text;
 891 }
 892
 893
 894 sub _DoImages {
 895 #
 896 # Turn Markdown image shortcuts into <img> tags.
 897 #
 898         my $text = shift;
 899
 900         #
 901         # First, handle reference-style labeled images: ![alt text][id]
 902         #
 903         $text =~ s{
 904                 (                               # wrap whole match in $1
 905                   !\[
 906                     (.*?)               # alt text = $2
 907                   \]
 908
 909                   [ ]?                          # one optional space
 910                   (?:\n[ ]*)?           # one optional newline followed by spaces
 911
 912                   \[
 913                     (.*?)               # id = $3
 914                   \]
 915
 916                 )
 917         }{
 918                 my $result;
 919                 my $whole_match = $1;
 920                 my $alt_text    = $2;
 921                 my $link_id     = lc $3;
 922
 923                 if ($link_id eq "") {
 924                         $link_id = lc $alt_text;     # for shortcut links like ![this][].
 925                 }
 926
 927                 $alt_text =~ s/"/&quot;/g;
 928                 if (defined $g_urls{$link_id}) {
 929                         my $url = $g_urls{$link_id};
 930                         $url =~ s! \* !$g_escape_table{'*'}!gx;         # We've got to encode these to avoid
 931                         $url =~ s!  _ !$g_escape_table{'_'}!gx;         # conflicting with italics/bold.
 932                         $result = "<img src=\"$url\" alt=\"$alt_text\"";
 933                         if (defined $g_titles{$link_id}) {
 934                                 my $title = $g_titles{$link_id};
 935                                 $title =~ s! \* !$g_escape_table{'*'}!gx;
 936                                 $title =~ s!  _ !$g_escape_table{'_'}!gx;
 937                                 $result .=  " title=\"$title\"";
 938                         }
 939                         $result .= $g_empty_element_suffix;
 940                 }
 941                 else {
 942                         # If there's no such link ID, leave intact:
 943                         $result = $whole_match;
 944                 }
 945
 946                 $result;
 947         }xsge;
 948
 949         #
 950         # Next, handle inline images:  ![alt text](url "optional title")
 951         # Don't forget: encode * and _
 952
 953         $text =~ s{
 954                 (                               # wrap whole match in $1
 955                   !\[
 956                     (.*?)               # alt text = $2
 957                   \]
 958                   \(                    # literal paren
 959                         [ \t]*
 960                         <?(\S+?)>?      # src url = $3
 961                         [ \t]*
 962                         (                       # $4
 963                           (['"])        # quote char = $5
 964                           (.*?)         # title = $6
 965                           \5            # matching quote
 966                           [ \t]*
 967                         )?                      # title is optional
 968                   \)
 969                 )
 970         }{
 971                 my $result;
 972                 my $whole_match = $1;
 973                 my $alt_text    = $2;
 974                 my $url                 = $3;
 975                 my $title               = '';
 976                 if (defined($6)) {
 977                         $title          = $6;
 978                 }
 979
 980                 $alt_text =~ s/"/&quot;/g;
 981                 $title    =~ s/"/&quot;/g;
 982                 $url =~ s! \* !$g_escape_table{'*'}!gx;         # We've got to encode these to avoid
 983                 $url =~ s!  _ !$g_escape_table{'_'}!gx;         # conflicting with italics/bold.
 984                 $result = "<img src=\"$url\" alt=\"$alt_text\"";
 985                 if (defined $title) {
 986                         $title =~ s! \* !$g_escape_table{'*'}!gx;
 987                         $title =~ s!  _ !$g_escape_table{'_'}!gx;
 988                         $result .=  " title=\"$title\"";
 989                 }
 990                 $result .= $g_empty_element_suffix;
 991
 992                 $result;
 993         }xsge;
 994
 995         return $text;
 996 }
 997
 998
 999 sub _DoHeaders {
1000         my $text = shift;
1001
1002         # Setext-style headers:
1003         #         Header 1
1004         #         ========
1005         #
1006         #         Header 2
1007         #         --------
1008         #
1009         $text =~ s{ ^(.+)[ \t]*\n=+[ \t]*\n+ }{
1010                 "<h1>"  .  _RunSpanGamut($1)  .  "</h1>\n\n";
1011         }egmx;
1012
1013         $text =~ s{ ^(.+)[ \t]*\n-+[ \t]*\n+ }{
1014                 "<h2>"  .  _RunSpanGamut($1)  .  "</h2>\n\n";
1015         }egmx;
1016
1017
1018         # atx-style headers:
1019         #       # Header 1
1020         #       ## Header 2
1021         #       ## Header 2 with closing hashes ##
1022         #       ...
1023         #       ###### Header 6
1024         #
1025         $text =~ s{
1026                         ^(\#{1,6})      # $1 = string of #'s
1027                         [ \t]*
1028                         (.+?)           # $2 = Header text
1029                         [ \t]*
1030                         \#*                     # optional closing #'s (not counted)
1031                         \n+
1032                 }{
1033                         my $h_level = length($1);
1034                         "<h$h_level>"  .  _RunSpanGamut($2)  .  "</h$h_level>\n\n";
1035                 }egmx;
1036
1037         return $text;
1038 }
1039
1040
1041 sub _DoLists {
1042 #
1043 # Form HTML ordered (numbered) and unordered (bulleted) lists.
1044 #
1045         my $text = shift;
1046         my $less_than_tab = $g_tab_width - 1;
1047
1048         # Re-usable patterns to match list item bullets and number markers:
1049         my $marker_ul  = qr/[*+-]/;
1050         my $marker_ol  = qr/\d+[.]/;
1051         my $marker_any = qr/(?:$marker_ul|$marker_ol)/;
1052
1053         # Re-usable pattern to match any entirel ul or ol list:
1054         my $whole_list = qr{
1055                 (                                                               # $1 = whole list
1056                   (                                                             # $2
1057                         [ ]{0,$less_than_tab}
1058                         (${marker_any})                         # $3 = first list item marker
1059                         [ \t]+
1060                   )
1061                   (?s:.+?)
1062                   (                                                             # $4
1063                           \z
1064                         |
1065                           \n{2,}
1066                           (?=\S)
1067                           (?!                                           # Negative lookahead for another list item marker
1068                                 [ \t]*
1069                                 ${marker_any}[ \t]+
1070                           )
1071                   )
1072                 )
1073         }mx;
1074
1075         # We use a different prefix before nested lists than top-level lists.
1076         # See extended comment in _ProcessListItems().
1077         #
1078         # Note: There's a bit of duplication here. My original implementation
1079         # created a scalar regex pattern as the conditional result of the test on
1080         # $g_list_level, and then only ran the $text =~ s{...}{...}egmx
1081         # substitution once, using the scalar as the pattern. This worked,
1082         # everywhere except when running under MT on my hosting account at Pair
1083         # Networks. There, this caused all rebuilds to be killed by the reaper (or
1084         # perhaps they crashed, but that seems incredibly unlikely given that the
1085         # same script on the same server ran fine *except* under MT. I've spent
1086         # more time trying to figure out why this is happening than I'd like to
1087         # admit. My only guess, backed up by the fact that this workaround works,
1088         # is that Perl optimizes the substition when it can figure out that the
1089         # pattern will never change, and when this optimization isn't on, we run
1090         # afoul of the reaper. Thus, the slightly redundant code to that uses two
1091         # static s/// patterns rather than one conditional pattern.
1092
1093         if ($g_list_level) {
1094                 $text =~ s{
1095                                 ^
1096                                 $whole_list
1097                         }{
1098                                 my $list = $1;
1099                                 my $list_type = ($3 =~ m/$marker_ul/) ? "ul" : "ol";
1100                                 # Turn double returns into triple returns, so that we can make a
1101                                 # paragraph for the last item in a list, if necessary:
1102                                 $list =~ s/\n{2,}/\n\n\n/g;
1103                                 my $result = _ProcessListItems($list, $marker_any);
1104                                 $result = "<$list_type>\n" . $result . "</$list_type>\n";
1105                                 $result;
1106                         }egmx;
1107         }
1108         else {
1109                 $text =~ s{
1110                                 (?:(?<=\n\n)|\A\n?)
1111                                 $whole_list
1112                         }{
1113                                 my $list = $1;
1114                                 my $list_type = ($3 =~ m/$marker_ul/) ? "ul" : "ol";
1115                                 # Turn double returns into triple returns, so that we can make a
1116                                 # paragraph for the last item in a list, if necessary:
1117                                 $list =~ s/\n{2,}/\n\n\n/g;
1118                                 my $result = _ProcessListItems($list, $marker_any);
1119                                 $result = "<$list_type>\n" . $result . "</$list_type>\n";
1120                                 $result;
1121                         }egmx;
1122         }
1123
1124
1125         return $text;
1126 }
1127
1128
1129 sub _ProcessListItems {
1130 #
1131 #       Process the contents of a single ordered or unordered list, splitting it
1132 #       into individual list items.
1133 #
1134
1135         my $list_str = shift;
1136         my $marker_any = shift;
1137
1138
1139         # The $g_list_level global keeps track of when we're inside a list.
1140         # Each time we enter a list, we increment it; when we leave a list,
1141         # we decrement. If it's zero, we're not in a list anymore.
1142         #
1143         # We do this because when we're not inside a list, we want to treat
1144         # something like this:
1145         #
1146         #               I recommend upgrading to version
1147         #               8. Oops, now this line is treated
1148         #               as a sub-list.
1149         #
1150         # As a single paragraph, despite the fact that the second line starts
1151         # with a digit-period-space sequence.
1152         #
1153         # Whereas when we're inside a list (or sub-list), that line will be
1154         # treated as the start of a sub-list. What a kludge, huh? This is
1155         # an aspect of Markdown's syntax that's hard to parse perfectly
1156         # without resorting to mind-reading. Perhaps the solution is to
1157         # change the syntax rules such that sub-lists must start with a
1158         # starting cardinal number; e.g. "1." or "a.".
1159
1160         $g_list_level++;
1161
1162         # trim trailing blank lines:
1163         $list_str =~ s/\n{2,}\z/\n/;
1164
1165
1166         $list_str =~ s{
1167                 (\n)?                                                   # leading line = $1
1168                 (^[ \t]*)                                               # leading whitespace = $2
1169                 ($marker_any) [ \t]+                    # list marker = $3
1170                 ((?s:.+?)                                               # list item text   = $4
1171                 (\n{1,2}))
1172                 (?= \n* (\z | \2 ($marker_any) [ \t]+))
1173         }{
1174                 my $item = $4;
1175                 my $leading_line = $1;
1176                 my $leading_space = $2;
1177
1178                 if ($leading_line or ($item =~ m/\n{2,}/)) {
1179                         $item = _RunBlockGamut(_Outdent($item));
1180                 }
1181                 else {
1182                         # Recursion for sub-lists:
1183                         $item = _DoLists(_Outdent($item));
1184                         chomp $item;
1185                         $item = _RunSpanGamut($item);
1186                 }
1187
1188                 "<li>" . $item . "</li>\n";
1189         }egmx;
1190
1191         $g_list_level--;
1192         return $list_str;
1193 }
1194
1195
1196
1197 sub _DoCodeBlocks {
1198 #
1199 #       Process Markdown `<pre><code>` blocks.
1200 #
1201
1202         my $text = shift;
1203
1204         $text =~ s{
1205                         (?:\n\n|\A)
1206                         (                   # $1 = the code block -- one or more lines, starting with a space/tab
1207                           (?:
1208                             (?:[ ]{$g_tab_width} | \t)  # Lines must start with a tab or a tab-width of spaces
1209                             .*\n+
1210                           )+
1211                         )
1212                         ((?=^[ ]{0,$g_tab_width}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
1213                 }{
1214                         my $codeblock = $1;
1215                         my $result; # return value
1216
1217                         $codeblock = _EncodeCode(_Outdent($codeblock));
1218                         $codeblock = _Detab($codeblock);
1219                         $codeblock =~ s/\A\n+//; # trim leading newlines
1220                         $codeblock =~ s/\s+\z//; # trim trailing whitespace
1221
1222                         $result = "\n\n<pre><code>" . $codeblock . "\n</code></pre>\n\n";
1223
1224                         $result;
1225                 }egmx;
1226
1227         return $text;
1228 }
1229
1230
1231 sub _DoCodeSpans {
1232 #
1233 #       *       Backtick quotes are used for <code></code> spans.
1234 #
1235 #       *       You can use multiple backticks as the delimiters if you want to
1236 #               include literal backticks in the code span. So, this input:
1237 #
1238 #         Just type ``foo `bar` baz`` at the prompt.
1239 #
1240 #       Will translate to:
1241 #
1242 #         <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
1243 #
1244 #               There's no arbitrary limit to the number of backticks you
1245 #               can use as delimters. If you need three consecutive backticks
1246 #               in your code, use four for delimiters, etc.
1247 #
1248 #       *       You can use spaces to get literal backticks at the edges:
1249 #
1250 #         ... type `` `bar` `` ...
1251 #
1252 #       Turns to:
1253 #
1254 #         ... type <code>`bar`</code> ...
1255 #
1256
1257         my $text = shift;
1258
1259         $text =~ s@
1260                         (`+)            # $1 = Opening run of `
1261                         (.+?)           # $2 = The code block
1262                         (?<!`)
1263                         \1                      # Matching closer
1264                         (?!`)
1265                 @
1266                         my $c = "$2";
1267                         $c =~ s/^[ \t]*//g; # leading whitespace
1268                         $c =~ s/[ \t]*$//g; # trailing whitespace
1269                         $c = _EncodeCode($c);
1270                         "<code>$c</code>";
1271                 @egsx;
1272
1273         return $text;
1274 }
1275
1276
1277 sub _EncodeCode {
1278 #
1279 # Encode/escape certain characters inside Markdown code runs.
1280 # The point is that in code, these characters are literals,
1281 # and lose their special Markdown meanings.
1282 #
1283     local $_ = shift;
1284
1285         # Encode all ampersands; HTML entities are not
1286         # entities within a Markdown code span.
1287         s/&/&amp;/g;
1288
1289         # Encode $'s, but only if we're running under Blosxom.
1290         # (Blosxom interpolates Perl variables in article bodies.)
1291         {
1292                 no warnings 'once';
1293         if (defined($blosxom::version)) {
1294                 s/\$/&#036;/g;
1295         }
1296     }
1297
1298
1299         # Do the angle bracket song and dance:
1300         s! <  !&lt;!gx;
1301         s! >  !&gt;!gx;
1302
1303         # Now, escape characters that are magic in Markdown:
1304         s! \* !$g_escape_table{'*'}!gx;
1305         s! _  !$g_escape_table{'_'}!gx;
1306         s! {  !$g_escape_table{'{'}!gx;
1307         s! }  !$g_escape_table{'}'}!gx;
1308         s! \[ !$g_escape_table{'['}!gx;
1309         s! \] !$g_escape_table{']'}!gx;
1310         s! \\ !$g_escape_table{'\\'}!gx;
1311
1312         return $_;
1313 }
1314
1315
1316 sub _DoItalicsAndBold {
1317         my $text = shift;
1318
1319         # <strong> must go first:
1320         $text =~ s{ (\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 }
1321                 {<strong>$2</strong>}gsx;
1322
1323         $text =~ s{ (\*|_) (?=\S) (.+?) (?<=\S) \1 }
1324                 {<em>$2</em>}gsx;
1325
1326         return $text;
1327 }
1328
1329
1330 sub _DoBlockQuotes {
1331         my $text = shift;
1332
1333         $text =~ s{
1334                   (                                                             # Wrap whole match in $1
1335                         (
1336                           ^[ \t]*>[ \t]?                        # '>' at the start of a line
1337                             .+\n                                        # rest of the first line
1338                           (.+\n)*                                       # subsequent consecutive lines
1339                           \n*                                           # blanks
1340                         )+
1341                   )
1342                 }{
1343                         my $bq = $1;
1344                         $bq =~ s/^[ \t]*>[ \t]?//gm;    # trim one level of quoting
1345                         $bq =~ s/^[ \t]+$//mg;                  # trim whitespace-only lines
1346                         $bq = _RunBlockGamut($bq);              # recurse
1347
1348                         $bq =~ s/^/  /g;
1349                         # These leading spaces screw with <pre> content, so we need to fix that:
1350                         $bq =~ s{
1351                                         (\s*<pre>.+?</pre>)
1352                                 }{
1353                                         my $pre = $1;
1354                                         $pre =~ s/^  //mg;
1355                                         $pre;
1356                                 }egsx;
1357
1358                         "<blockquote>\n$bq\n</blockquote>\n\n";
1359                 }egmx;
1360
1361
1362         return $text;
1363 }
1364
1365
1366 sub _FormParagraphs {
1367 #
1368 #       Params:
1369 #               $text - string to process with html <p> tags
1370 #
1371         my $text = shift;
1372
1373         # Strip leading and trailing lines:
1374         $text =~ s/\A\n+//;
1375         $text =~ s/\n+\z//;
1376
1377         my @grafs = split(/\n{2,}/, $text);
1378
1379         #
1380         # Wrap <p> tags.
1381         #
1382         foreach (@grafs) {
1383                 unless (defined( $g_html_blocks{$_} )) {
1384                         $_ = _RunSpanGamut($_);
1385                         s/^([ \t]*)/<p>/;
1386                         $_ .= "</p>";
1387                 }
1388         }
1389
1390         #
1391         # Unhashify HTML blocks
1392         #
1393         foreach (@grafs) {
1394                 if (defined( $g_html_blocks{$_} )) {
1395                         $_ = $g_html_blocks{$_};
1396                 }
1397         }
1398
1399         return join "\n\n", @grafs;
1400 }
1401
1402
1403 sub _EncodeAmpsAndAngles {
1404 # Smart processing for ampersands and angle brackets that need to be encoded.
1405
1406         my $text = shift;
1407
1408         # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
1409         #   http://bumppo.net/projects/amputator/
1410         $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;
1411
1412         # Encode naked <'s
1413         $text =~ s{<(?![a-z/?\$!])}{&lt;}gi;
1414
1415         return $text;
1416 }
1417
1418
1419 sub _EncodeBackslashEscapes {
1420 #
1421 #   Parameter:  String.
1422 #   Returns:    The string, with after processing the following backslash
1423 #               escape sequences.
1424 #
1425     local $_ = shift;
1426
1427     s! \\\\  !$g_escape_table{'\\'}!gx;         # Must process escaped backslashes first.
1428     s! \\`   !$g_escape_table{'`'}!gx;
1429     s! \\\*  !$g_escape_table{'*'}!gx;
1430     s! \\_   !$g_escape_table{'_'}!gx;
1431     s! \\\{  !$g_escape_table{'{'}!gx;
1432     s! \\\}  !$g_escape_table{'}'}!gx;
1433     s! \\\[  !$g_escape_table{'['}!gx;
1434     s! \\\]  !$g_escape_table{']'}!gx;
1435     s! \\\(  !$g_escape_table{'('}!gx;
1436     s! \\\)  !$g_escape_table{')'}!gx;
1437     s! \\>   !$g_escape_table{'>'}!gx;
1438     s! \\\#  !$g_escape_table{'#'}!gx;
1439     s! \\\+  !$g_escape_table{'+'}!gx;
1440     s! \\\-  !$g_escape_table{'-'}!gx;
1441     s! \\\.  !$g_escape_table{'.'}!gx;
1442     s{ \\!  }{$g_escape_table{'!'}}gx;
1443
1444     return $_;
1445 }
1446
1447
1448 sub _DoAutoLinks {
1449         my $text = shift;
1450
1451         $text =~ s{<((https?|ftp):[^'">\s]+)>}{<a href="$1">$1</a>}gi;
1452
1453         # Email addresses: <address@domain.foo>
1454         $text =~ s{
1455                 <
1456         (?:mailto:)?
1457                 (
1458                         [-.\w]+
1459                         \@
1460                         [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
1461                 )
1462                 >
1463         }{
1464                 _EncodeEmailAddress( _UnescapeSpecialChars($1) );
1465         }egix;
1466
1467         return $text;
1468 }
1469
1470
1471 sub _EncodeEmailAddress {
1472 #
1473 #       Input: an email address, e.g. "foo@example.com"
1474 #
1475 #       Output: the email address as a mailto link, with each character
1476 #               of the address encoded as either a decimal or hex entity, in
1477 #               the hopes of foiling most address harvesting spam bots. E.g.:
1478 #
1479 #         <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
1480 #       x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
1481 #       &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
1482 #
1483 #       Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
1484 #       mailing list: <http://tinyurl.com/yu7ue>
1485 #
1486
1487         my $addr = shift;
1488
1489         srand;
1490         my @encode = (
1491                 sub { '&#' .                 ord(shift)   . ';' },
1492                 sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' },
1493                 sub {                            shift          },
1494         );
1495
1496         $addr = "mailto:" . $addr;
1497
1498         $addr =~ s{(.)}{
1499                 my $char = $1;
1500                 if ( $char eq '@' ) {
1501                         # this *must* be encoded. I insist.
1502                         $char = $encode[int rand 1]->($char);
1503                 } elsif ( $char ne ':' ) {
1504                         # leave ':' alone (to spot mailto: later)
1505                         my $r = rand;
1506                         # roughly 10% raw, 45% hex, 45% dec
1507                         $char = (
1508                                 $r > .9   ?  $encode[2]->($char)  :
1509                                 $r < .45  ?  $encode[1]->($char)  :
1510                                                          $encode[0]->($char)
1511                         );
1512                 }
1513                 $char;
1514         }gex;
1515
1516         $addr = qq{<a href="$addr">$addr</a>};
1517         $addr =~ s{">.+?:}{">}; # strip the mailto: from the visible part
1518
1519         return $addr;
1520 }
1521
1522
1523 sub _UnescapeSpecialChars {
1524 #
1525 # Swap back in all the special characters we've hidden.
1526 #
1527         my $text = shift;
1528
1529         while( my($char, $hash) = each(%g_escape_table) ) {
1530                 $text =~ s/$hash/$char/g;
1531         }
1532     return $text;
1533 }
1534
1535
1536 sub _TokenizeHTML {
1537 #
1538 #   Parameter:  String containing HTML markup.
1539 #   Returns:    Reference to an array of the tokens comprising the input
1540 #               string. Each token is either a tag (possibly with nested,
1541 #               tags contained therein, such as <a href="<MTFoo>">, or a
1542 #               run of text between tags. Each element of the array is a
1543 #               two-element array; the first is either 'tag' or 'text';
1544 #               the second is the actual value.
1545 #
1546 #
1547 #   Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin.
1548 #       <http://www.bradchoate.com/past/mtregex.php>
1549 #
1550
1551     my $str = shift;
1552     my $pos = 0;
1553     my $len = length $str;
1554     my @tokens;
1555
1556     my $depth = 6;
1557     my $nested_tags = join('|', ('(?:<[a-z/!$](?:[^<>]') x $depth) . (')*>)' x  $depth);
1558     my $match = qr/(?s: <! ( -- .*? -- \s* )+ > ) |  # comment
1559                    (?s: <\? .*? \?> ) |              # processing instruction
1560                    $nested_tags/ix;                   # nested tags
1561
1562     while ($str =~ m/($match)/g) {
1563         my $whole_tag = $1;
1564         my $sec_start = pos $str;
1565         my $tag_start = $sec_start - length $whole_tag;
1566         if ($pos < $tag_start) {
1567             push @tokens, ['text', substr($str, $pos, $tag_start - $pos)];
1568         }
1569         push @tokens, ['tag', $whole_tag];
1570         $pos = pos $str;
1571     }
1572     push @tokens, ['text', substr($str, $pos, $len - $pos)] if $pos < $len;
1573     \@tokens;
1574 }
1575
1576
1577 sub _Outdent {
1578 #
1579 # Remove one level of line-leading tabs or spaces
1580 #
1581         my $text = shift;
1582
1583         $text =~ s/^(\t|[ ]{1,$g_tab_width})//gm;
1584         return $text;
1585 }
1586
1587
1588 sub _Detab {
1589 #
1590 # Cribbed from a post by Bart Lateur:
1591 # <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>
1592 #
1593         my $text = shift;
1594
1595         $text =~ s{(.*?)\t}{$1.(' ' x ($g_tab_width - length($1) % $g_tab_width))}ge;
1596         return $text;
1597 }
1598
1599
1600 1;
1601
1602 __END__
1603
1604
1605 =pod
1606
1607 =head1 NAME
1608
1609 B<Markdown>
1610
1611
1612 =head1 SYNOPSIS
1613
1614 B<Markdown.pl> [ B<--html4tags> ] [ B<--version> ] [ B<-shortversion> ]
1615     [ I<file> ... ]
1616
1617
1618 =head1 DESCRIPTION
1619
1620 Markdown is a text-to-HTML filter; it translates an easy-to-read /
1621 easy-to-write structured text format into HTML. Markdown's text format
1622 is most similar to that of plain text email, and supports features such
1623 as headers, *emphasis*, code blocks, blockquotes, and links.
1624
1625 Markdown's syntax is designed not as a generic markup language, but
1626 specifically to serve as a front-end to (X)HTML. You can  use span-level
1627 HTML tags anywhere in a Markdown document, and you can use block level
1628 HTML tags (like <div> and <table> as well).
1629
1630 For more information about Markdown's syntax, see:
1631
1632     http://daringfireball.net/projects/markdown/
1633
1634
1635 =head1 OPTIONS
1636
1637 Use "--" to end switch parsing. For example, to open a file named "-z", use:
1638
1639         Markdown.pl -- -z
1640
1641 =over 4
1642
1643
1644 =item B<--html4tags>
1645
1646 Use HTML 4 style for empty element tags, e.g.:
1647
1648     <br>
1649
1650 instead of Markdown's default XHTML style tags, e.g.:
1651
1652     <br />
1653
1654
1655 =item B<-v>, B<--version>
1656
1657 Display Markdown's version number and copyright information.
1658
1659
1660 =item B<-s>, B<--shortversion>
1661
1662 Display the short-form version number.
1663
1664
1665 =back
1666
1667
1668
1669 =head1 BUGS
1670
1671 To file bug reports or feature requests (other than topics listed in the
1672 Caveats section above) please send email to:
1673
1674     support@daringfireball.net
1675
1676 Please include with your report: (1) the example input; (2) the output
1677 you expected; (3) the output Markdown actually produced.
1678
1679
1680 =head1 VERSION HISTORY
1681
1682 See the readme file for detailed release notes for this version.
1683
1684 1.0.1 - 14 Dec 2004
1685
1686 1.0 - 28 Aug 2004
1687
1688
1689 =head1 AUTHOR
1690
1691     John Gruber
1692     http://daringfireball.net
1693
1694     PHP port and other contributions by Michel Fortin
1695     http://michelf.com
1696
1697
1698 =head1 COPYRIGHT AND LICENSE
1699
1700 Copyright (c) 2003-2004 John Gruber
1701 <http://daringfireball.net/>
1702 All rights reserved.
1703
1704 Redistribution and use in source and binary forms, with or without
1705 modification, are permitted provided that the following conditions are
1706 met:
1707
1708 * Redistributions of source code must retain the above copyright notice,
1709   this list of conditions and the following disclaimer.
1710
1711 * Redistributions in binary form must reproduce the above copyright
1712   notice, this list of conditions and the following disclaimer in the
1713   documentation and/or other materials provided with the distribution.
1714
1715 * Neither the name "Markdown" nor the names of its contributors may
1716   be used to endorse or promote products derived from this software
1717   without specific prior written permission.
1718
1719 This software is provided by the copyright holders and contributors "as
1720 is" and any express or implied warranties, including, but not limited
1721 to, the implied warranties of merchantability and fitness for a
1722 particular purpose are disclaimed. In no event shall the copyright owner
1723 or contributors be liable for any direct, indirect, incidental, special,
1724 exemplary, or consequential damages (including, but not limited to,
1725 procurement of substitute goods or services; loss of use, data, or
1726 profits; or business interruption) however caused and on any theory of
1727 liability, whether in contract, strict liability, or tort (including
1728 negligence or otherwise) arising in any way out of the use of this
1729 software, even if advised of the possibility of such damage.
1730
1731 =cut