if not modules then modules = { } end modules ['m-markdown'] = {
version = 1.002,
comment = "companion to m-markdown.mkiv",
author = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
copyright = "see below",
license = "see context related readme files"
}
-- Copyright (C) 2009-... John MacFarlane & Hans Hagen
-- At some point I ran into the \LUA\ parser for markdown but it was quite slow and
-- could crash on memory usage. Eventually I could speed up the parser (a factor 20)
-- so instead of a complete rewrite (I might do that) the original was adapted and sort
-- of kept. I'm not sure if pandoc still uses Lua.
--
-- The code here is mostly meant for processing snippets embedded in context
-- documents and is no replacement for pandoc at all. Therefore an alternative is to
-- use pandoc in combination with Aditya's filter module.
--
-- This is a second rewrite. The mentioned speed gain largely depended on the kind of
-- content: blocks, references and items can be rather demanding. Also, there were
-- some limitations with respect to the captures. So, table storage has been removed in
-- favor of strings, and nesting has been simplified. The first example at the end of this
-- file now takes .33 seconds for 567KB code (resulting in over 1MB) so we're getting there.
--
-- There might be a third rewrite eventually.
--
-- todo: we have better quote and tag scanners in ctx
-- todo: provide an xhtml mapping
-- todo: add a couple of extensions
-- todo: check patches to the real peg
-- todo: more closures so less issues with number of locals
--
-- Because I don't want to waste time on checking with \MKIV\ we use the "lmt" suffix
-- to indicate that we aim at lmtx (although there is nothing specific here currently).
--
-- There might be styling directives that we need to support but these can just
-- be mapped into regular context features. There will be no manipulation of content
-- at this end as that makes no sense!
local type, next, tonumber = type, next, tonumber
local lower, upper, gsub, format, length = string.lower, string.upper, string.gsub, string.format, string.len
local concat = table.concat
local P, R, S, V, C, Ct, Cg, Cb, Cmt, Cc, Cf, Cs = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.C, lpeg.Ct, lpeg.Cg, lpeg.Cb, lpeg.Cmt, lpeg.Cc, lpeg.Cf, lpeg.Cs
local lpegmatch = lpeg.match
local utfbyte, utfchar = utf.byte, utf.char
local formatters = string.formatters
-- we can use the unicode lower and upper if needed
moduledata = moduledata or { }
moduledata.markdown = moduledata.markdown or { }
local markdown = moduledata.markdown
local nofruns, nofbytes, nofhtmlblobs = 0, 0, 0
---------------------------------------------------------------------------------------------
-- The grammar itself is assigned to 'syntax' later on; it is declared here so that
-- the converters below can already recurse into it through nestedparser.
local syntax

-- Runs the (later defined) top level grammar on a fragment and returns whatever
-- that match returns.
local function nestedparser(str)
    return lpegmatch(syntax, str)
end
---------------------------------------------------------------------------------------------
-- Literal lpeg patterns: each one matches exactly the character (or character
-- pair) shown. They are the building blocks for the rules defined further down.
local asterisk = P("*")
local dash = P("-")
local plus = P("+")
local underscore = P("_")
local period = P(".")
local hash = P("#")
local ampersand = P("&")
local backtick = P("`")
local less = P("<")
local more = P(">")
local space = P(" ")
local squote = P("'")
local dquote = P('"')
local lparent = P("(")
local rparent = P(")")
local lbracket = P("[")
local rbracket = P("]")
local slash = P("/")
local equal = P("=")
local colon = P(":")
local semicolon = P(";")
local exclamation = P("!")
-- Character classes (ascii ranges only).
local digit = R("09")
local hexdigit = R("09","af","AF")
local alphanumeric = R("AZ","az","09")
-- Two-character markers.
local doubleasterisks = P("**")
local doubleunderscores = P("__")
-- Exactly four spaces. Together with a tab preceded by at most three spaces (see
-- 'indent') this marks one level of indentation. Matching a single space here
-- would make every line that starts with a space count as indented.
local fourspaces = P("    ")
-- Generic helpers: 'any' matches one character, 'always' matches the empty string
-- (and therefore always succeeds).
local any = P(1)
local always = P("")
local tab = P("\t")
local spacechar = S("\t ")
local spacing = S(" \n\r\t")
-- local newline = P("\r")^-1 * P("\n")
local newline = lpeg.patterns.newline
local spaceornewline = spacechar + newline
local nonspacechar = any - spaceornewline
local optionalspace = spacechar^0
local spaces = spacechar^1
-- Succeeds only when no character is left.
local eof = - any
-- At most three spaces: less than what counts as an indent.
local nonindentspace = space^-3
-- A line holding only spaces/tabs; the newline itself is captured.
local blankline = optionalspace * C(newline)
local blanklines = blankline^0
-- Same shape as blanklines but without capturing the newlines.
local skipblanklines = (optionalspace * newline)^0
local linechar = P(1 - newline)
-- One indent level: four spaces, or a tab preceded by at most three spaces. Note
-- that '/' binds tighter than '+', so the / "" here applies to the tab branch only.
local indent = fourspaces + (nonindentspace * tab) / ""
-- A non-empty line (up to and including its newline, or up to the end of input)
-- that follows an indent; the indent is replaced by "" and the line is captured.
local indentedline = indent /"" * C(linechar^1 * (newline + eof))
-- As indentedline, but the indent may be absent.
local optionallyindentedline = indent^-1 /"" * C(linechar^1 * (newline + eof))
-- Optional spaces, optionally followed by one newline and more optional spaces.
local spnl = optionalspace * (newline * optionalspace)^-1
local specialchar = S("*_`*&[]
-- [3]:http://example.com/ (Optional Title Here)
-- [2]: http://example.com/ 'Optional Title Here'
-- [a]: http://example.com/ "Optional *oeps* Title Here"
-- ]]
--
-- local linktest = [[
-- [This link] (http://example.net/)
-- [an example] (http://example.com/ "Title")
-- [an example][1]
-- [an example] [2]
-- ]]
--
-- lpeg.match((define_reference_parser+1)^0,reftest)
--
-- inspect(references)
--
-- lpeg.match((direct_link_parser/print + indirect_link_parser/print + 1)^0,linktest)
---------------------------------------------------------------------------------------------
-- Names of the html elements that are treated as block (display) elements. The
-- table is indexed by lowercased element name in 'blocktag' below, so table.tohash
-- presumably turns this list into a name -> true set.
-- NOTE(review): "ht" is not an html element name; possibly "dt" was meant - confirm.
local blocktags = table.tohash {
"address", "blockquote" , "center", "dir", "div", "p", "pre",
"li", "ol", "ul", "dl", "dd",
"form", "fieldset", "isindex", "menu", "noframes", "frameset",
"h1", "h2", "h3", "h4", "h5", "h6",
"hr", "ht", "script", "noscript",
"table", "tbody", "tfoot", "thead", "th", "td", "tr",
}
----- htmlattributevalue = squote * C((any - (blankline + squote))^0) * squote
----- + dquote * C((any - (blankline + dquote))^0) * dquote
----- + (any - S("\t >"))^1 -- any - tab - space - more
----- htmlattribute = (alphanumeric + S("_-"))^1 * spnl * (equal * spnl * htmlattributevalue)^-1 * spnl
----- htmlcomment = P("<!--") * (any - P("-->"))^0 * P("-->")
----- htmltag = less * spnl * slash^-1 * alphanumeric^1 * spnl * htmlattribute^0 * slash^-1 * spnl * more
-----
----- blocktag = Cmt(C(alphanumeric^1), function(s,i,a) return blocktags[lower(a)] and i, a end)
-----
----- openblocktag = less * Cg(blocktag, "opentag") * spnl * htmlattribute^0 * more
----- closeblocktag = less * slash * Cmt(C(alphanumeric^1) * Cb("opentag"), function(s,i,a,b) return lower(a) == lower(b) and i end) * spnl * more
----- selfclosingblocktag = less * blocktag * spnl * htmlattribute^0 * slash * more
-----
----- displayhtml = Cs { "HtmlBlock",
----- InBlockTags = openblocktag * (V("HtmlBlock") + (any - closeblocktag))^0 * closeblocktag,
----- HtmlBlock = C(V("InBlockTags") + selfclosingblocktag + htmlcomment),
----- }
-----
----- inlinehtml = Cs(htmlcomment + htmltag)
-- There is no reason to support crappy html, so we expect proper attributes.
-- A quoted attribute value: single or double quotes, with the content captured.
-- The value may not run across a blank line.
local htmlattributevalue = squote * C((any - (blankline + squote))^0) * squote
                         + dquote * C((any - (blankline + dquote))^0) * dquote

-- name = "value", with optional whitespace (including one newline) around the
-- equal sign and after the value. A value is mandatory.
local htmlattribute      = (alphanumeric + S("_-"))^1 * spnl * equal * spnl * htmlattributevalue * spnl

-- An html comment: everything from the opening marker up to and including the
-- first closing marker.
local htmlcomment        = P("<!--") * (any - P("-->"))^0 * P("-->")

-- A processing instruction: everything from "<?" up to and including the first "?>".
local htmlinstruction    = P("<?")   * (any - P("?>" ))^0 * P("?>" )
-- We don't care too much about matching elements and there is no reason why display elements could not
-- have inline elements so the above should be patched then. Well, markdown mixed with html is not meant
-- for anything else than webpages anyway.
-- Matches an element name and succeeds only when its lowercased form is a key in
-- blocktags; on success the name is produced as capture value.
local blocktag = Cmt(C(alphanumeric^1), function(s,i,a) return blocktags[lower(a)] and i, a end)
-- <name attr="..." ...>, </name> and <name attr="..." .../>. The names of opening
-- and closing elements are not checked against each other.
local openelement = less * alphanumeric^1 * spnl * htmlattribute^0 * more
local closeelement = less * slash * alphanumeric^1 * spnl * more
local emptyelement = less * alphanumeric^1 * spnl * htmlattribute^0 * slash * more
-- Text between elements: everything up to the next "<".
local displaytext = (any - less)^1
-- Same text, but fed through nestedparser and replaced by its result.
local inlinetext = displaytext / nestedparser
-- Display html: only tried when the input starts with an opening element whose
-- name is in blocktags (the lookahead consumes nothing). Text inside is kept as is.
local displayhtml = #(less * blocktag * spnl * htmlattribute^0 * more)
* Cs { "HtmlBlock",
InBlockTags = openelement * (V("HtmlBlock") + displaytext)^0 * closeelement,
HtmlBlock = (V("InBlockTags") + emptyelement + htmlcomment + htmlinstruction),
}
-- Inline html: same grammar without the block tag lookahead; text inside elements
-- goes through nestedparser (via inlinetext).
local inlinehtml = Cs { "HtmlBlock",
InBlockTags = openelement * (V("HtmlBlock") + inlinetext)^0 * closeelement,
HtmlBlock = (V("InBlockTags") + emptyelement + htmlcomment + htmlinstruction),
}
---------------------------------------------------------------------------------------------
-- Entities; in each case only the part between the prefix and the semicolon is
-- captured: hex digits for &#x...; decimal digits for &#...; the name for &...;
local hexentity = ampersand * hash * S("Xx") * C(hexdigit ^1) * semicolon
local decentity = ampersand * hash * C(digit ^1) * semicolon
local tagentity = ampersand * C(alphanumeric^1) * semicolon
---------------------------------------------------------------------------------------------
-- --[[
-- Characters that get replaced by a "\char<number>{}" sequence in the output. The
-- values are filled in by the loop below. Every key has to be a single character
-- because c_string looks up each character of its input separately; a two
-- character key (like "%%") would never be found and leave the percent sign as is.
local escaped = {
    ["{" ] = "",
    ["}" ] = "",
    ["$" ] = "",
    ["&" ] = "",
    ["#" ] = "",
    ["~" ] = "",
    ["|" ] = "",
    ["%" ] = "",
    ["\\"] = "",
}

-- Only existing fields are assigned to, which is permitted while traversing.
for k in next, escaped do
    escaped[k] = "\\char" .. utfbyte(k) .. "{}"
end

-- Returns s with every character that has an entry in 'escaped' replaced by its
-- "\char<number>{}" form; all other characters are kept. The parentheses drop the
-- replacement count that gsub returns as second value.
local function c_string(s) -- has to be done more often
    return (gsub(s,".",escaped))
end
-- The character with code 3; it is put in front of every command name in the
-- templates below. NOTE(review): presumably it stands in for the backslash and is
-- given command meaning at the TeX end - confirm in the companion mkiv file.
local c_trigger = utfchar(3)
-- Output for a forced line break and for a single space.
local c_linebreak = "\\crlf\n" -- is this ok?
local c_space = " "
-- Ends a paragraph by appending an empty line to its content. A variant that
-- wraps the content in \startparagraph ... \stopparagraph is not used.
local function c_paragraph(c)
    local separator = "\n\n"
    return c .. separator
end
-- Template for one list item: the content sits on its own line between the
-- startitem and stopitem commands.
local f_listitem = formatters[c_trigger .. "startitem\n%s\n" .. c_trigger .. "stopitem\n"]

-- Converts the raw text of a list item: the text is parsed as markdown first and
-- the result is wrapped in the item template.
local function listitem(c)
    return f_listitem(nestedparser(c))
end
-- Templates for the four list flavours. They all wrap the (already converted)
-- items in startmarkdownitemize ... stopmarkdownitemize and only differ in the
-- options: "packed" for tight lists and "n" for ordered (numbered) lists. All four
-- end with the same stop command; the loose ordered variant used to emit a stray
-- ">" after it.
local c_tightbulletlist = formatters[
    "\n" .. c_trigger .. "startmarkdownitemize[packed]\n%s\n" .. c_trigger .. "stopmarkdownitemize\n"
]
local c_loosebulletlist = formatters[
    "\n" .. c_trigger .. "startmarkdownitemize\n%s\n" .. c_trigger .. "stopmarkdownitemize\n"
]
local c_tightorderedlist = formatters[
    "\n" .. c_trigger .. "startmarkdownitemize[n,packed]\n%s\n" .. c_trigger .. "stopmarkdownitemize\n"
]
local c_looseorderedlist = formatters[
    "\n" .. c_trigger .. "startmarkdownitemize[n]\n%s\n" .. c_trigger .. "stopmarkdownitemize\n"
]
-- Templates for html that is passed on: inline html becomes the argument of one
-- command, display html is put between a start and stop command.
local f_inlinehtml  = formatters[c_trigger .. "markdowninlinehtml{%s}"]
local f_displayhtml = formatters[c_trigger .. "startmarkdowndisplayhtml\n%s\n" .. c_trigger .. "stopmarkdowndisplayhtml"]

-- Wraps an inline html fragment and counts it in nofhtmlblobs.
local function c_inline_html(content)
    nofhtmlblobs = nofhtmlblobs + 1
    return f_inlinehtml(content)
end

-- Wraps a display html fragment and counts it in nofhtmlblobs.
local function c_display_html(content)
    nofhtmlblobs = nofhtmlblobs + 1
    return f_displayhtml(content)
end
-- Inline markup: the content becomes the argument of a command.
local c_emphasis = string.formatters[c_trigger .. "markdownemphasis{%s}"]
local c_strong   = string.formatters[c_trigger .. "markdownstrong{%s}"]

-- Block quotes: the quoted text is markdown itself, so it is parsed first and the
-- result is put between the start and stop commands.
local f_blockquote = string.formatters[c_trigger .. "startmarkdownblockquote\n%s" .. c_trigger .. "stopmarkdownblockquote\n"]

local function c_blockquote(c)
    return f_blockquote(nestedparser(c))
end

-- Code: display code goes between start/stop commands, inline code becomes the
-- argument of a command. The content is inserted as given.
local c_verbatim = string.formatters[c_trigger .. "startmarkdowntyping\n%s" .. c_trigger .. "stopmarkdowntyping\n"]
local c_code     = string.formatters[c_trigger .. "markdowntype{%s}"]
-- For each of the six heading levels: the command that still has to be emitted to
-- close that level, or an empty string when the level is not open.
local levels = { "", "", "", "", "", "" }

-- Called at the start of a conversion: forgets all open levels and contributes
-- nothing to the output.
local function c_start_document()
    local fresh = { }
    for i = 1, 6 do
        fresh[i] = ""
    end
    levels = fresh
    return ""
end

-- Called at the end of a conversion: returns the closing commands of all levels,
-- separated by newlines.
local function c_stop_document()
    local pending = concat(levels, "\n")
    return pending or ""
end
-- Template for opening a heading: first whatever is needed to close levels that
-- are still open, then the start command with the title.
local f_heading = formatters [ "%s" .. c_trigger .. "startstructurelevel[title={%s}]\n" ]
-- The command that closes one level; stored in levels[] while that level is open.
local s_heading = c_trigger .. "stopstructurelevel"
-- Opens a heading with title c at the given level and returns the text to emit.
-- Levels beyond the number of entries in 'levels' are clamped to the deepest one.
local function c_heading(level,c)
if level > #levels then
level = #levels
end
-- the pending closing commands of this level and of all deeper ones
local finish = concat(levels,"\n",level) or ""
-- deeper levels are closed by the above, so forget them
for i=level+1,#levels do
levels[i] = ""
end
-- remember that this level has to be closed later
levels[level] = s_heading
-- here f_heading is still the template defined above
return f_heading(finish,c)
end
-- From here on the name f_heading refers to this function, which shadows the
-- template: same as c_heading but with the arguments swapped (content first).
-- c_heading is not affected because it was compiled before this declaration.
local function f_heading(c,n)
return c_heading(n,c)
end
-- Horizontal rule: a fixed command without arguments.
local c_hrule      = formatters[c_trigger .. "markdownrule\n"]

-- Templates for links and images.
local f_link       = formatters[c_trigger .. "goto{%s}[url(%s)]"]
local f_image      = formatters[c_trigger .. "externalfigure[%s]"]
local f_email_link = formatters[c_trigger .. "goto{%s}[url(mailto:%s)]"]
local f_url_link   = formatters[c_trigger .. "goto{%s}[url(%s)]"]

-- A labelled link: the label is parsed as markdown, the title is not used.
local function c_link(lab, src, tit)
    return f_link(nestedparser(lab), src)
end

-- An image: only the source is used, label and title are ignored.
local function c_image(lab, src, tit)
    return f_image(src)
end

-- An email address used as link: the visible text is the escaped address.
local function c_email_link(address)
    return f_email_link(c_string(address), address)
end

-- An url used as link: the visible text is the escaped url.
local function c_url_link(url)
    return f_url_link(c_string(url), url)
end

-- Numeric entities become the character with that (hexadecimal or decimal) code.
local function c_hex_entity(s)
    local code = tonumber(s, 16)
    return utfchar(code)
end

local function c_dec_entity(s)
    local code = tonumber(s)
    return utfchar(code)
end

-- Named entities: the name is passed on unchanged.
local function c_tag_entity(s) -- todo: use the default resolver
    return s
end
--]]
---------------------------------------------------------------------------------------------
--[[
local escaped = {
["<"] = "<",
[">"] = ">",
["&"] = "&",
['"'] = """,
}
local function c_string(s) -- has to be done more often
return (gsub(s,".",escaped))
end
local c_linebreak = "
"
local c_space = " "
local function c_paragraph(c)
return format("
%s
\n", c) end local function listitem(c) return format("\n%s\n",nestedparser(c)) end local function c_verbatim(c) return format("
%s
",c)
end
local function c_code(c)
return format("%s
",c)
end
local c_start_document = ""
local c_stop_document = ""
local function c_heading(level,c)
return format("PRE PRE PRE-- -- -- * Test -- ** Test -- * Test1 -- * Test2 -- * Test -- -- Test -- ==== -- -- > test -- > test **123** *123* -- > test `code` -- -- test -- -- Test -- ==== -- -- > test -- > test -- > test -- -- test -- oeps -- -- more -- -- code -- code -- -- oeps -- -- [an example][a] -- -- [an example] [2] -- -- [a]: http://example.com/ "Optional *oeps* Title Here" -- [2]: http://example.com/ 'Optional Title Here' -- [3]: http://example.com/ (Optional Title Here) -- -- [an example][a] -- -- [an example] [2] -- -- [an [tricky] example](http://example.com/ "Title") -- -- [This **xx** link](http://example.net/) -- ]] -- -- -- This snippet takes some 4 seconds in the original parser (the one that is -- -- a bit clearer from the perspective of grammars but somewhat messy with -- -- respect to the captures. In the above parser it takes .1 second. Also, -- -- in the later case only memory is the limit. -- -- local two = [[ -- Test -- ==== -- * Test -- * Test -- * Test -- * Test -- * Test -- -- Test -- ==== -- -- > test -- > test -- > test -- -- test -- -- Test -- ==== -- -- > test -- > test -- > test -- -- test -- ]] -- -- local function test(str) -- local n = 1 -- 000 -- local t = os.clock() -- local one = convert(str) -- -- print("runtime",1,#str,#one,os.clock()-t) -- str = string.rep(str,n) -- local t = os.clock() -- local two = convert(str) -- print(two) -- -- print("runtime",n,#str,#two,os.clock()-t) -- -- print(format("==============\n%s\n==============",one)) -- end -- -- -- test(one) -- -- test(two) -- -- print(two) -- -- test(io.read("*all")) -- -- end