if not modules then modules = { } end modules ['m-markdown'] = {
    version   = 1.002,
    comment   = "companion to m-markdown.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "see below",
    license   = "see context related readme files"
}

-- Copyright (C) 2009-... John MacFarlane & Hans Hagen

-- At some point I ran into the \LUA\ parser for markdown but it was quite slow and
-- could crash on memory usage. Eventually I could speed up the parser (a factor 20)
-- so instead of a complete rewrite (I might do that) the original was adapted and
-- sort of kept. I'm not sure if pandoc still uses Lua.
--
-- The code here is mostly meant for processing snippets embedded in context
-- documents and is no replacement for pandoc at all. Therefore an alternative is to
-- use pandoc in combination with Aditya's filter module.
--
-- This is a second rewrite. The mentioned speed gain largely depended on the kind of
-- content: blocks, references and items can be rather demanding. Also, there were
-- some limitations with respect to the captures. So, table storage has been removed in
-- favor of strings, and nesting has been simplified. The first example at the end of this
-- file now takes .33 seconds for 567KB code (resulting in over 1MB) so we're getting there.
--
-- There might be a third rewrite eventually.
--
-- todo: we have better quote and tag scanners in ctx
-- todo: provide an xhtml mapping
-- todo: add a couple of extensions
-- todo: check patches to the real peg
-- todo: more closures so less issues with number of locals
--
-- Because I don't want to waste time on checking with \MKIV\ we use the "lmt" suffix
-- to indicate that we aim at lmtx (although there is nothing specific here currently).
--
-- There might be styling directives that we need to support but these can just
-- be mapped into regular context features. There will be no manipulation of content
-- at this end as that makes no sense!
local type, next, tonumber = type, next, tonumber local lower, upper, gsub, format, length = string.lower, string.upper, string.gsub, string.format, string.len local concat = table.concat local P, R, S, V, C, Ct, Cg, Cb, Cmt, Cc, Cf, Cs = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.C, lpeg.Ct, lpeg.Cg, lpeg.Cb, lpeg.Cmt, lpeg.Cc, lpeg.Cf, lpeg.Cs local lpegmatch = lpeg.match local utfbyte, utfchar = utf.byte, utf.char local formatters = string.formatters -- we can use the unicode lower and upper if needed moduledata = moduledata or { } moduledata.markdown = moduledata.markdown or { } local markdown = moduledata.markdown local nofruns, nofbytes, nofhtmlblobs = 0, 0, 0 --------------------------------------------------------------------------------------------- local nestedparser local syntax nestedparser = function(str) return lpegmatch(syntax,str) end --------------------------------------------------------------------------------------------- local asterisk = P("*") local dash = P("-") local plus = P("+") local underscore = P("_") local period = P(".") local hash = P("#") local ampersand = P("&") local backtick = P("`") local less = P("<") local more = P(">") local space = P(" ") local squote = P("'") local dquote = P('"') local lparent = P("(") local rparent = P(")") local lbracket = P("[") local rbracket = P("]") local slash = P("/") local equal = P("=") local colon = P(":") local semicolon = P(";") local exclamation = P("!") local digit = R("09") local hexdigit = R("09","af","AF") local alphanumeric = R("AZ","az","09") local doubleasterisks = P("**") local doubleunderscores = P("__") local fourspaces = P(" ") local any = P(1) local always = P("") local tab = P("\t") local spacechar = S("\t ") local spacing = S(" \n\r\t") -- local newline = P("\r")^-1 * P("\n") local newline = lpeg.patterns.newline local spaceornewline = spacechar + newline local nonspacechar = any - spaceornewline local optionalspace = spacechar^0 local spaces = spacechar^1 local eof = - any local 
nonindentspace = space^-3 local blankline = optionalspace * C(newline) local blanklines = blankline^0 local skipblanklines = (optionalspace * newline)^0 local linechar = P(1 - newline) local indent = fourspaces + (nonindentspace * tab) / "" local indentedline = indent /"" * C(linechar^1 * (newline + eof)) local optionallyindentedline = indent^-1 /"" * C(linechar^1 * (newline + eof)) local spnl = optionalspace * (newline * optionalspace)^-1 local specialchar = S("*_`*&[] -- [3]:http://example.com/ (Optional Title Here) -- [2]: http://example.com/ 'Optional Title Here' -- [a]: http://example.com/ "Optional *oeps* Title Here" -- ]] -- -- local linktest = [[ -- [This link] (http://example.net/) -- [an example] (http://example.com/ "Title") -- [an example][1] -- [an example] [2] -- ]] -- -- lpeg.match((define_reference_parser+1)^0,reftest) -- -- inspect(references) -- -- lpeg.match((direct_link_parser/print + indirect_link_parser/print + 1)^0,linktest) --------------------------------------------------------------------------------------------- local blocktags = table.tohash { "address", "blockquote" , "center", "dir", "div", "p", "pre", "li", "ol", "ul", "dl", "dd", "form", "fieldset", "isindex", "menu", "noframes", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "ht", "script", "noscript", "table", "tbody", "tfoot", "thead", "th", "td", "tr", } ----- htmlattributevalue = squote * C((any - (blankline + squote))^0) * squote ----- + dquote * C((any - (blankline + dquote))^0) * dquote ----- + (any - S("\t >"))^1 -- any - tab - space - more ----- htmlattribute = (alphanumeric + S("_-"))^1 * spnl * (equal * spnl * htmlattributevalue)^-1 * spnl ----- htmlcomment = P(""))^0 * P("-->") ----- htmltag = less * spnl * slash^-1 * alphanumeric^1 * spnl * htmlattribute^0 * slash^-1 * spnl * more ----- ----- blocktag = Cmt(C(alphanumeric^1), function(s,i,a) return blocktags[lower(a)] and i, a end) ----- ----- openblocktag = less * Cg(blocktag, "opentag") * spnl * 
htmlattribute^0 * more ----- closeblocktag = less * slash * Cmt(C(alphanumeric^1) * Cb("opentag"), function(s,i,a,b) return lower(a) == lower(b) and i end) * spnl * more ----- selfclosingblocktag = less * blocktag * spnl * htmlattribute^0 * slash * more ----- ----- displayhtml = Cs { "HtmlBlock", ----- InBlockTags = openblocktag * (V("HtmlBlock") + (any - closeblocktag))^0 * closeblocktag, ----- HtmlBlock = C(V("InBlockTags") + selfclosingblocktag + htmlcomment), ----- } ----- ----- inlinehtml = Cs(htmlcomment + htmltag) -- There is no reason to support crappy html, so we expect proper attributes. local htmlattributevalue = squote * C((any - (blankline + squote))^0) * squote + dquote * C((any - (blankline + dquote))^0) * dquote local htmlattribute = (alphanumeric + S("_-"))^1 * spnl * equal * spnl * htmlattributevalue * spnl local htmlcomment = P(""))^0 * P("-->") local htmlinstruction = P("" ))^0 * P("?>" ) -- We don't care too much about matching elements and there is no reason why display elements could not -- have inline elements so the above should be patched then. Well, markdown mixed with html is not meant -- for anything else than webpages anyway. 
-- Succeeds only when the captured element name (compared in lowercase) is listed
-- in blocktags; the name itself is passed on as a capture value.
local blocktag = Cmt(C(alphanumeric^1), function(s,i,a) return blocktags[lower(a)] and i, a end)

local openelement  = less * alphanumeric^1 * spnl * htmlattribute^0 * more
local closeelement = less * slash * alphanumeric^1 * spnl * more
local emptyelement = less * alphanumeric^1 * spnl * htmlattribute^0 * slash * more

local displaytext  = (any - less)^1
local inlinetext   = displaytext / nestedparser -- text inside inline html is parsed as markdown again

-- Display html: the lookahead demands that the first element is a block tag, after
-- that any properly nested elements are accepted and the text is passed through as is.
local displayhtml = #(less * blocktag * spnl * htmlattribute^0 * more) * Cs { "HtmlBlock",
    InBlockTags = openelement * (V("HtmlBlock") + displaytext)^0 * closeelement,
    HtmlBlock   = (V("InBlockTags") + emptyelement + htmlcomment + htmlinstruction),
}

-- Inline html: same structure, but no block tag is required and the text between
-- the tags goes through the nested parser.
local inlinehtml = Cs { "HtmlBlock",
    InBlockTags = openelement * (V("HtmlBlock") + inlinetext)^0 * closeelement,
    HtmlBlock   = (V("InBlockTags") + emptyelement + htmlcomment + htmlinstruction),
}

---------------------------------------------------------------------------------------------

-- Each entity pattern captures only the part between the ampersand (plus optional
-- hash marker) and the semicolon.
local hexentity = ampersand * hash * S("Xx") * C(hexdigit    ^1) * semicolon
local decentity = ampersand * hash           * C(digit       ^1) * semicolon
local tagentity = ampersand                  * C(alphanumeric^1) * semicolon

---------------------------------------------------------------------------------------------

-- The context mapping. The marker below is a plain line comment, so this section is
-- active; it is closed by a matching marker further down.

-- --[[

-- Characters that get replaced by a \char<code>{} sequence. The keys must be single
-- characters because c_string looks up one character at a time; the percent sign used
-- to be keyed as a two character string and was therefore never escaped.
local escaped = {
    ["{" ] = "",
    ["}" ] = "",
    ["$" ] = "",
    ["&" ] = "",
    ["#" ] = "",
    ["~" ] = "",
    ["|" ] = "",
    ["%" ] = "",
    ["\\"] = "",
}

-- Only existing keys get a new value here, which is permitted while traversing.
for k, v in next, escaped do
    escaped[k] = "\\char" .. utfbyte(k) .. "{}"
end

-- Returns s with every character that has an entry in 'escaped' replaced; all other
-- characters are kept. The parentheses drop the substitution count of gsub.
local function c_string(s) -- has to be done more often
    return (gsub(s,".",escaped))
end

-- Placeholder that starts every generated command; convert() replaces it by a
-- backslash once the complete result has been assembled.
local c_trigger   = utfchar(3)

local c_linebreak = "\\crlf\n" -- is this ok?
local c_space     = " "

-- Paragraphs are just terminated by an empty line.
local function c_paragraph(c)
    return c .. "\n\n" -- { "\\startparagraph ", c, " \\stopparagraph\n" }
end

local f_listitem = formatters[
    c_trigger .. "startitem\n%s\n" .. c_trigger .. "stopitem\n"
]

-- The raw item text is converted by the nested parser before it gets wrapped.
local function listitem(c)
    return f_listitem(nestedparser(c))
end

local c_tightbulletlist = formatters[
    "\n" .. c_trigger .. "startmarkdownitemize[packed]\n%s\n" .. c_trigger .. "stopmarkdownitemize\n"
]

local c_loosebulletlist = formatters[
    "\n" .. c_trigger .. "startmarkdownitemize\n%s\n" .. c_trigger .. "stopmarkdownitemize\n"
]

local c_tightorderedlist = formatters[
    "\n" .. c_trigger .. "startmarkdownitemize[n,packed]\n%s\n" .. c_trigger .. "stopmarkdownitemize\n"
]

-- There was a stray ">" after the stop command here that ended up in the output.
local c_looseorderedlist = formatters[
    "\n" .. c_trigger .. "startmarkdownitemize[n]\n%s\n" .. c_trigger .. "stopmarkdownitemize\n"
]

local f_inlinehtml = formatters[
    c_trigger .. "markdowninlinehtml{%s}"
]

local f_displayhtml = formatters[
    c_trigger .. "startmarkdowndisplayhtml\n%s\n" .. c_trigger .. "stopmarkdowndisplayhtml"
]

-- Both html emitters also count the blob for the statistics.
local function c_inline_html(content)
    nofhtmlblobs = nofhtmlblobs + 1
    return f_inlinehtml(content)
end

local function c_display_html(content)
    nofhtmlblobs = nofhtmlblobs + 1
    return f_displayhtml(content)
end

local c_emphasis = string.formatters[
    c_trigger .. "markdownemphasis{%s}"
]

local c_strong = string.formatters[
    c_trigger .. "markdownstrong{%s}"
]

local f_blockquote = string.formatters [
    c_trigger .. "startmarkdownblockquote\n%s" .. c_trigger .. "stopmarkdownblockquote\n"
]

-- The quoted text is converted by the nested parser before it gets wrapped.
local function c_blockquote(c)
    return f_blockquote(nestedparser(c))
end

local c_verbatim = string.formatters[
    c_trigger .. "startmarkdowntyping\n%s" .. c_trigger .. "stopmarkdowntyping\n"
]

local c_code = string.formatters[
    c_trigger .. "markdowntype{%s}"
]

-- Per heading level either an empty string or the stop command of the structure
-- level that is currently open at that depth.
local levels = { "", "", "", "", "", "" }

-- Resets the heading bookkeeping; contributes nothing to the output.
local function c_start_document()
    levels = { "", "", "", "", "", "" }
    return ""
end

-- Returns the stop commands of all levels that are still open.
local function c_stop_document()
    return concat(levels,"\n") or ""
end

-- This formatter used to be called f_heading and was shadowed by the function of
-- that name defined below; it has its own name now.
local f_structurelevel = formatters [
    "%s" .. c_trigger .. "startstructurelevel[title={%s}]\n"
]

local s_heading = c_trigger .. "stopstructurelevel"

-- Starts a structure level with title c. The level is clipped to the number of
-- supported levels, whatever is open at this level or deeper is closed first, and
-- the new level is registered as open.
local function c_heading(level,c)
    if level > #levels then
        level = #levels
    end
    local finish = concat(levels,"\n",level) or ""
    for i=level+1,#levels do
        levels[i] = ""
    end
    levels[level] = s_heading
    return f_structurelevel(finish,c)
end

-- Same as c_heading but with the arguments swapped (title first, level second).
local function f_heading(c,n)
    return c_heading(n,c)
end

local c_hrule = formatters [
    c_trigger .. "markdownrule\n"
]

local f_link = formatters [
    c_trigger .. "goto{%s}[url(%s)]"
]

local f_image = formatters [
    c_trigger .. "externalfigure[%s]"
]

local f_email_link = formatters [
    c_trigger .. "goto{%s}[url(mailto:%s)]"
]

local f_url_link = formatters [
    c_trigger .. "goto{%s}[url(%s)]"
]

-- The title (tit) is accepted but not used by the link and image emitters.
local c_link = function(lab,src,tit)
    return f_link(nestedparser(lab),src)
end

local c_image = function(lab,src,tit)
    return f_image(src)
end

-- Auto links show the escaped address as label and use the raw address as target.
local c_email_link = function(address)
    return f_email_link(c_string(address),address)
end

local c_url_link = function(url)
    return f_url_link(c_string(url),url)
end

-- Numeric entities become the character with that code; named entities are
-- reduced to their bare name for now.
local function c_hex_entity(s)
    return utfchar(tonumber(s,16))
end

local function c_dec_entity(s)
    return utfchar(tonumber(s))
end

local function c_tag_entity(s)
    return s
end -- todo: use the default resolver

--]]

---------------------------------------------------------------------------------------------

-- NOTE(review): the disabled html mapping that starts below looks damaged (its
-- entity and tag strings appear to be lost), so it has to be restored before it
-- can be enabled.

--[[

local escaped = {
    ["<"] = "<",
    [">"] = ">",
    ["&"] = "&",
    ['"'] = """,
}

local function c_string(s) -- has to be done more often
    return (gsub(s,".",escaped))
end

local c_linebreak = "
" local c_space = " " local function c_paragraph(c) return format("

%s

\n", c) end local function listitem(c) return format("
  • %s
  • ",nestedparser(c)) end local function c_tightbulletlist(c) return format("\n",c) end local function c_loosebulletlist(c) return format("\n",c) end local function c_tightorderedlist(c) return format("
      \n%s\n
    \n",c) end local function c_looseorderedlist(c) return format("
      \n%s\n
    \n",c) end local function c_inline_html(content) nofhtmlblobs = nofhtmlblobs + 1 return content end local function c_display_html(content) nofhtmlblobs = nofhtmlblobs + 1 return format("\n%s\n",content) end local function c_emphasis(c) return format("%s",c) end local function c_strong(c) return format("%s",c) end local function c_blockquote(c) return format("
    \n%s\n
    ",nestedparser(c)) end local function c_verbatim(c) return format("
    %s
    ",c) end local function c_code(c) return format("%s",c) end local c_start_document = "" local c_stop_document = "" local function c_heading(level,c) return format("%s\n",level,c,level) end local function c_hrule() return "
    \n" end local function c_link(lab,src,tit) local titattr = #tit > 0 and format(" title=%q",tit) or "" return format("%s",src,titattr,nestedparser(lab)) end local function c_image(lab,src,tit) return format("%s",src,tit,nestedparser(lab)) end local function c_email_link(address) return format("%s","mailto:",address,c_escape(address)) end local function c_url_link(url) return format("%s",url,c_string(url)) end local function f_heading(c,n) return c_heading(n,c) end local function c_hex_entity(s) return utfchar(tonumber(s,16)) end local function c_dec_entity(s) return utfchar(tonumber(s)) end local function c_tag_entity(s) return format("&%s;",s) end --]] --------------------------------------------------------------------------------------------- local Str = normalchar^1 / c_string local Space = spacechar^1 / c_space local Symbol = specialchar / c_string local Code = inticks / c_code local HeadingStart = C(hash * hash^-5) / length local HeadingStop = optionalspace * hash^0 * optionalspace * newline * blanklines local HeadingLevel = equal^3 * Cc(1) + dash ^3 * Cc(2) local NormalEndline = optionalspace * newline * -( blankline + more + HeadingStart + ( line * (P("===")^3 + P("---")^3) * newline ) ) / c_space -- hm, two spaces: local LineBreak = P(" ") * NormalEndline / c_linebreak local TerminalEndline = optionalspace * newline * eof / "" local Endline = LineBreak + TerminalEndline + NormalEndline local AutoLinkUrl = less * C(alphanumeric^1 * P("://") * (any - (newline + more))^1) * more / c_url_link local AutoLinkEmail = less * C((alphanumeric + S("-_+"))^1 * P("@") * (any - (newline + more))^1) * more / c_email_link local DirectLink = direct_link_parser / c_link local IndirectLink = indirect_link_parser / c_link local ImageLink = exclamation * (direct_link_parser + indirect_link_parser) / c_image -- we can combine this with image ... 
smaller lpeg local UlOrStarLine = asterisk^4 + underscore^4 + (spaces * S("*_")^1 * #spaces) / c_string local EscapedChar = P("\\") * C(P(1 - newline)) / c_string local InlineHtml = inlinehtml / c_inline_html local DisplayHtml = displayhtml / c_display_html local HtmlEntity = hexentity / c_hex_entity + decentity / c_dec_entity + tagentity / c_tag_entity local NestedList = Cs(optionallyindentedline - (bullet + enumerator))^1 / nestedparser local ListBlockLine = -blankline * -(indent^-1 * (bullet + enumerator)) * optionallyindentedline local Verbatim = Cs(blanklines * (indentedline - blankline)^1) / c_verbatim * (blankline^1 + eof) -- not really needed, probably capture trailing? we can do that beforehand local Blockquote = Cs(( ((nonindentspace * more * space^-1)/"" * linechar^0 * newline)^1 * ((linechar - blankline)^1 * newline)^0 * blankline^0 )^1) / c_blockquote local HorizontalRule = (lineof_asterisks + lineof_dashes + lineof_underscores) / c_hrule local Reference = define_reference_parser / "" -- could be a mini grammar local ListBlock = line * ListBlockLine^0 local ListContinuationBlock = blanklines * indent * ListBlock local ListItem = Cs(ListBlock * (NestedList + ListContinuationBlock^0)) / listitem ---- LeadingLines = blankline^0 / "" ---- TrailingLines = blankline^1 * #(any) / "\n" syntax = Cs { "Document", Document = V("Display")^0, Display = blankline -- ^1/"\n" + Blockquote + Verbatim + Reference + HorizontalRule + HeadingStart * optionalspace * Cs((V("Inline") - HeadingStop)^1) * HeadingStop / c_heading + Cs((V("Inline") - Endline)^1) * newline * HeadingLevel * newline * blanklines / f_heading + Cs((bullet /"" * ListItem)^1) * blanklines * -bullet / c_tightbulletlist + Cs((bullet /"" * ListItem * C(blanklines))^1) / c_loosebulletlist + Cs((enumerator /"" * ListItem)^1) * blanklines * -enumerator / c_tightorderedlist + Cs((enumerator /"" * ListItem * C(blanklines))^1) / c_looseorderedlist + DisplayHtml + nonindentspace * Cs(V("Inline")^1)* newline * 
blankline^1 / c_paragraph + V("Inline")^1, Inline = Str + Space + Endline + UlOrStarLine -- still needed ? + doubleasterisks * -spaceornewline * Cs((V("Inline") - doubleasterisks )^1) * doubleasterisks / c_strong + doubleunderscores * -spaceornewline * Cs((V("Inline") - doubleunderscores)^1) * doubleunderscores / c_strong + asterisk * -spaceornewline * Cs((V("Inline") - asterisk )^1) * asterisk / c_emphasis + underscore * -spaceornewline * Cs((V("Inline") - underscore )^1) * underscore / c_emphasis + ImageLink + DirectLink + IndirectLink + AutoLinkUrl + AutoLinkEmail + Code + InlineHtml + HtmlEntity + EscapedChar + Symbol, } --------------------------------------------------------------------------------------------- local function convert(str) nofruns = nofruns + 1 nofbytes = nofbytes + #str statistics.starttiming(markdown) referenceparser(str) local result = c_start_document() .. nestedparser(str) .. c_stop_document() -- We had to get around nested \\ that got messed up so we protected them -- but now we need to revert them: result = gsub(result,c_trigger,"\\") -- statistics.stoptiming(markdown) return result end markdown.convert = convert if context then local viafile = context.viafile local getcontent = buffers.getcontent local loaddata = io.loaddata local function typesetstring(data) if data and data ~= "" then viafile(convert(data)) end end markdown.typesetstring = typesetstring function markdown.typesetbuffer(name) typesetstring(getcontent(name)) end function markdown.typesetfile(name) local fullname = resolvers.findctxfile(name) if fullname and fullname ~= "" then typesetstring(loaddata(fullname)) end end statistics.register("markdown",function() if nofruns > 0 then return format("%s bytes converted, %s runs, %s html blobs, %s seconds used", nofbytes, nofruns, nofhtmlblobs, statistics.elapsedtime(markdown)) end end) end --------------------------------------------------------------------------------------------- -- if not tex or not tex.jobname then -- -- 
local one = [[ -- Test *123* -- ========== -- -- BOLD *BOLD* BOLD -- --
    PRE PRE PRE
    -- -- -- * Test -- ** Test -- * Test1 -- * Test2 -- * Test -- -- Test -- ==== -- -- > test -- > test **123** *123* -- > test `code` -- -- test -- -- Test -- ==== -- -- > test -- > test -- > test -- -- test -- oeps -- -- more -- -- code -- code -- -- oeps -- -- [an example][a] -- -- [an example] [2] -- -- [a]: http://example.com/ "Optional *oeps* Title Here" -- [2]: http://example.com/ 'Optional Title Here' -- [3]: http://example.com/ (Optional Title Here) -- -- [an example][a] -- -- [an example] [2] -- -- [an [tricky] example](http://example.com/ "Title") -- -- [This **xx** link](http://example.net/) -- ]] -- -- -- This snippet takes some 4 seconds in the original parser (the one that is -- -- a bit clearer from the perspective of grammars but somewhat messy with -- -- respect to the captures. In the above parser it takes .1 second. Also, -- -- in the later case only memory is the limit. -- -- local two = [[ -- Test -- ==== -- * Test -- * Test -- * Test -- * Test -- * Test -- -- Test -- ==== -- -- > test -- > test -- > test -- -- test -- -- Test -- ==== -- -- > test -- > test -- > test -- -- test -- ]] -- -- local function test(str) -- local n = 1 -- 000 -- local t = os.clock() -- local one = convert(str) -- -- print("runtime",1,#str,#one,os.clock()-t) -- str = string.rep(str,n) -- local t = os.clock() -- local two = convert(str) -- print(two) -- -- print("runtime",n,#str,#two,os.clock()-t) -- -- print(format("==============\n%s\n==============",one)) -- end -- -- -- test(one) -- -- test(two) -- -- print(two) -- -- test(io.read("*all")) -- -- end