پودمان:Wikitext Parsing

    از ویکی تراث
    نسخهٔ تاریخ ‏۴ اوت ۲۰۲۳، ساعت ۱۷:۲۱ توسط Admin (بحث | مشارکت‌ها) (۱ نسخه واردشده: الگو و پوددمانهای اصلاح شده)
    (تفاوت) → نسخهٔ قدیمی‌تر | نمایش نسخهٔ فعلی (تفاوت) | نسخهٔ جدیدتر ← (تفاوت)

    require("strict")
    
    --Helper functions
    local function startswith(text, subtext)
    	return string.sub(text, 1, #subtext) == subtext
    end
    local function endswith(text, subtext)
    	return string.sub(text, -#subtext, -1) == subtext
    end
    local function allcases(s)
    	return s:gsub("%a", function(c) 
    		return "["..c:upper()..c:lower().."]"
    	end)
    end
    local trimcache = {}
    local whitespace = {[" "]=1, ["\n"]=1, ["\t"]=1, ["\r"]=1}
    local function cheaptrim(str) --mw.text.trim is surprisingly expensive, so here's an alternative approach
    	local quick = trimcache[str]
    	if quick then
    		return quick
    	else
    		-- local out = string.gsub(str, "^%s*(.-)%s*$", "%1")
    		local lowEnd
    		for i = 1,#str do
    			if not whitespace[string.sub(str, i, i)] then
    				lowEnd = i
    				break
    			end
    		end
    		if not lowEnd then
    			trimcache[str] = ""
    			return ""
    		end
    		for i = #str,1,-1 do
    			if not whitespace[string.sub(str, i, i)] then
    				local out = string.sub(str, lowEnd, i)
    				trimcache[str] = out
    				return out
    			end
    		end
    	end
    end
    
    --[=[ Implementation notes
    ---- NORMAL HTML TAGS ----
    Tags are very strict on how they want to start, but loose on how they end.
    The start must strictly follow <[tAgNaMe](%s|>) with no room for whitespace in
    the tag's name, but may then flow as they want afterwards, making
    <div\nclass\n=\n"\nerror\n"\n> valid
    
    There's no sense of escaping < or >
    E.g.
     <div class="error\>"> will end at \> despite it being inside a quote
     <div class="<span class="error">error</span>"> will not process the larger div
    
    If a tag has no end, it will consume all text instead of not processing
    
    ---- NOPROCESSING TAGS (nowiki, pre, syntaxhighlight, source) ----
    (In most comments, <source> will not be mentioned. This is because it is the
    deprecated version of <syntaxhighlight>)
    
    No-Processing tags have some interesting differences to the above rules.
    For example, their syntax is a lot stricter. While an opening tag appears to
    follow the same set of rules, A closing tag can't have any sort of extra
    formatting period. While </div a/a> is valid, </nowiki a/a> isn't - only
    newlines and spaces are allowed in closing tags (except in <pre> tags, which
    follow the rules of a regular html tag for formatting).
    
    Both the content inside the tag pair and the content inside each side of the
    pair is not processed. E.g. <nowiki |}}>|}}</nowiki> would have both of the |}}
    escaped in practice.
    
    When something in the code is referenced to as a "Nowiki Tag", it means a tag
    which causes wiki text to not be processed, which includes <nowiki>, <pre>,
    and <syntaxhighlight>
    
    Since we only care about these tags, we can ignore the idea of an intercepting
    tag preventing processing, and just go straight for the first ending we can find
    If there is no ending to find, the tag will NOT consume the rest of the text in
    terms of processing behaviour (though <pre> will appear to have an effect).
    Even if there is no end of the tag, the content inside the opening half will
    still be unprocessed, meaning {{X20|<nowiki }}>}} wouldn't end at the first }}
    despite there being no ending to the tag.
    
    Note that there are some tags, like <math>, which also function like <nowiki>
    which are included in this aswell. Some other tags, like <ref>, have far too
    unpredictable behaviour to be handled currently (they'd have to be split and
    processed as something seperate - its complicated, but maybe not impossible.)
    I suspect that every tag listed in [[Special:Version]] may behave somewhat like
    this, but that's far too many cases worth checking for rarely used tags that may
    not even have a good reason to contain {{ or }} anyways, so we leave them alone.
    
    ---- HTML COMMENTS AND INCLUDEONLY ----
    HTML Comments are about as basic as it could get for this
    Start at <!--, end at -->, no extra conditions. Simple enough
    If a comment has no end, it will eat all text instead of not being processed
    
    includeonly tags function mostly like a regular nowiki tag, with the exception
    that the tag will actually consume all future text if not given an ending as
    opposed to simply giving up and not changing anything. Due to complications and
    the fact that this is far less likely to be present on a page, aswell as being
    something that may not want to be escaped, includeonly tags are ignored during
    our processing
    --]=]
    local validtags = {nowiki=1, pre=1, syntaxhighlight=1, source=1, math=1}
    --This function expects the string to start with the tag
    local function TestForNowikiTag(text, scanPosition)
    	local tagName = (string.match(text, "^<([^\n />]+)", scanPosition) or ""):lower()
    	if not validtags[tagName] then
    		return nil
    	end
    	local nextOpener = string.find(text, "<", scanPosition+1) or -1
    	local nextCloser = string.find(text, ">", scanPosition+1) or -1
    	if nextCloser > -1 and (nextOpener == -1 or nextCloser < nextOpener) then
    		local startingTag = string.sub(text, scanPosition, nextCloser)
    		--We have our starting tag (E.g. '<pre style="color:red">')
    		--Now find our ending...
    		if endswith(startingTag, "/>") then --self-closing tag (we are our own ending)
    			return {
    				Tag = tagName,
    				Start = startingTag,
    				Content = "", End = "",
    				Length = #startingTag
    			}
    
    		else
    			local endingTag
    			if tagName == "pre" then --Looser restrictions for <pre>
    				endingTag = --no | so we just use 2 matches
    					string.match(text, "</[Pp][Rr][Ee]>", scanPosition) or
    					string.match(text, "</[Pp][Rr][Ee][ \t\n/][^<]*>", scanPosition)
    			else
    				endingTag = string.match(text, "</"..allcases(tagName).."[ \t\n]*>", scanPosition)
    			end
    
    			if endingTag then --Regular tag formation
    				local endingTagPosition = string.find(text, endingTag, nextCloser, true)
    				local tagContent = string.sub(text, nextCloser+1, endingTagPosition-1)
    				return {
    					Tag = tagName,
    					Start = startingTag,
    					Content = tagContent,
    					End = endingTag,
    					Length = #startingTag + #tagContent + #endingTag
    				}
    
    			else --Content inside still needs escaping (also linter error!)
    				return {
    					Tag = tagName,
    					Start = startingTag,
    					Content = "", End = "",
    					Length = #startingTag
    				}
    			end
    		end
    	end
    	return nil
    end
    local function TestForComment(text, scanPosition) --Like TestForNowikiTag but for <!-- -->
    	if string.match(text, "^<!%-%-", scanPosition) then
    		local commentEnd = string.find(text, "-->", scanPosition+4, true)
    		if commentEnd then
    			return {
    				Start = "<!--", End = "-->",
    				Content = string.sub(text, scanPosition+4, commentEnd-1),
    				Length = commentEnd-scanPosition+3
    			}
    		else --Consumes all text if not given an ending
    			return {
    				Start = "<!--", End = "",
    				Content = string.sub(text, scanPosition+4),
    				Length = #text-scanPosition+1
    			}
    		end
    	end
    	return nil
    end
    
    --[[ Implementation notes
    The goal of this function is to escape all text that wouldn't be parsed if it
    was preprocessed (see above implementation notes).
    
    Using keepComments will keep all HTML comments instead of removing them. They
    will still be escaped regardless to avoid processing errors
    --]]
    local function PrepareText(text, keepComments)
    	local newtext = {}
    	local scanPosition = 1
    	while true do
    		local NextCheck = string.find(text, "<[NnSsPpMm!]", scanPosition) --Advance to the next potential tag we care about
    		if not NextCheck then --Done
    			newtext[#newtext+1] =  string.sub(text,scanPosition)
    			break
    		end
    		newtext[#newtext+1] = string.sub(text,scanPosition,NextCheck-1)
    		scanPosition = NextCheck
    		local Comment = TestForComment(text, scanPosition)
    		if Comment then
    			if keepComments then
    				newtext[#newtext+1] = Comment.Start .. mw.text.nowiki(Comment.Content) .. Comment.End
    			end
    			scanPosition = scanPosition + Comment.Length
    		else
    			local Tag = TestForNowikiTag(text, scanPosition)
    			if Tag then
    				local newTagStart = "<" .. mw.text.nowiki(string.sub(Tag.Start,2,-2)) .. ">"
    				local newTagEnd = 
    					Tag.End == "" and "" or --Respect no tag ending
    					"</" .. mw.text.nowiki(string.sub(Tag.End,3,-2)) .. ">"
    				local newContent = mw.text.nowiki(Tag.Content)
    				newtext[#newtext+1] = newTagStart .. newContent .. newTagEnd
    				scanPosition = scanPosition + Tag.Length
    			else --Nothing special, move on...
    				newtext[#newtext+1] = string.sub(text, scanPosition, scanPosition)
    				scanPosition = scanPosition + 1
    			end
    		end
    	end
    	return table.concat(newtext, "")
    end
    
    --[=[ Implementation notes
    This function is an alternative to Transcluder's getParameters which considers
    the potential for a singular { or } or other odd syntax that %b doesn't like to
    be in a parameter's value.
    
    When handling the difference between {{ and {{{, mediawiki will attempt to match
    as many sequences of {{{ as possible before matching a {{
    E.g.
     {{{{A}}}} -> { {{{A}}} }
     {{{{{{{{Text|A}}}}}}}} -> {{ {{{ {{{Text|A}}} }}} }}
    If there aren't enough triple braces on both sides, the parser will compromise
    for a template interpretation.
    E.g.
     {{{{A}} }} -> {{ {{ A }} }}
    
    While there are technically concerns about things such as wikilinks breaking
    template processing (E.g. {{[[}}]]}} doesn't stop at the first }}), it shouldn't
    be our job to process inputs perfectly when the input has garbage ({ / } isn't
    legal in titles anyways, so if something's unmatched in a wikilink, it's
    guaranteed GIGO)
    
    Setting dontEscape will prevent running the input text through EET. Avoid
    setting this to true if you don't have to set it.
    
    Returned values:
    A table of all templates. Template data goes as follows:
     Text: The raw text of the template
     Name: The name of the template
     Args: A list of arguments
     Children: A list of immediate template children
    --]=]
    --Helper functions
    local function boundlen(pair)
    	return pair.End-pair.Start+1
    end
    
    --Main function
    local function ParseTemplates(InputText, dontEscape)
    	--Setup
    	if not dontEscape then
    		InputText = PrepareText(InputText)
    	end
    	local function finalise(text)
    		if not dontEscape then
    			return mw.text.decode(text)
    		else
    			return text
    		end
    	end
    	local function CreateContainerObj(Container)
    		Container.Text = {}
    		Container.Args = {}
    		Container.ArgOrder = {}
    		Container.Children = {}
    		-- Container.Name = nil
    		-- Container.Value = nil
    		-- Container.Key = nil
    		Container.BeyondStart = false
    		Container.LastIndex = 1
    		Container.finalise = finalise
    		function Container:HandleArgInput(character, internalcall)
    			if not internalcall then
    				self.Text[#self.Text+1] = character
    			end
    			if character == "=" then
    				if self.Key then
    					self.Value[#self.Value+1] = character
    				else
    					self.Key = cheaptrim(self.Value and table.concat(self.Value, "") or "")
    					self.Value = {}
    				end
    			else --"|" or "}"
    				if not self.Name then
    					self.Name = cheaptrim(self.Value and table.concat(self.Value, "") or "")
    					self.Value = nil
    				else
    					self.Value = self.finalise(self.Value and table.concat(self.Value, "") or "")
    					if self.Key then
    						self.Key = self.finalise(self.Key)
    						self.Args[self.Key] = cheaptrim(self.Value)
    						self.ArgOrder[#self.ArgOrder+1] = self.Key
    					else
    						local Key = tostring(self.LastIndex)
    						self.Args[Key] = self.Value
    						self.ArgOrder[#self.ArgOrder+1] = Key
    						self.LastIndex = self.LastIndex + 1
    					end
    					self.Key = nil
    					self.Value = nil
    				end
    			end
    		end
    		function Container:AppendText(text, ftext)
    			self.Text[#self.Text+1] = (ftext or text)
    			if not self.Value then
    				self.Value = {}
    			end
    			self.BeyondStart = self.BeyondStart or (#table.concat(self.Text, "") > 2)
    			if self.BeyondStart then
    				self.Value[#self.Value+1] = text
    			end
    		end
    		function Container:Clean(IsTemplate)
    			self.Text = table.concat(self.Text, "")
    			if self.Value and IsTemplate then
    				self.Value = {string.sub(table.concat(self.Value, ""), 1, -3)} --Trim ending }}
    				self:HandleArgInput("|", true) --Simulate ending
    			end
    			self.Value = nil
    			self.Key = nil
    			self.BeyondStart = nil
    			self.LastIndex = nil
    			self.finalise = nil
    			self.HandleArgInput = nil
    			self.AppendText = nil
    			self.Clean = nil
    		end
    		return Container
    	end
    	
    	--Step 1: Find and escape the content of all wikilinks on the page, which are stronger than templates (see implementation notes)
    	local scannerPosition = 1
    	local wikilinks = {}
    	local openWikilinks = {}
    	while true do
    		local Position, _, Character = string.find(InputText, "([%[%]])%1", scannerPosition)
    		if not Position then --Done
    			break
    		end
    
    		scannerPosition = Position+2 --+2 to pass the [[ / ]]
    		if Character == "[" then --Add a [[ to the pending wikilink queue
    			openWikilinks[#openWikilinks+1] = Position
    		else --Pair up the ]] to any available [[
    			if #openWikilinks >= 1 then
    				local start = table.remove(openWikilinks) --Pop the latest [[
    				wikilinks[start] = {Start=start, End=Position+1, Type="Wikilink"} --Note the pair
    			end
    		end
    	end
    	
    	--Step 2: Find the bounds of every valid template and variable ({{ and {{{)
    	local scannerPosition = 1
    	local templates = {}
    	local variables = {}
    	local openBrackets = {}
    	while true do
    		local Start, _, Character = string.find(InputText, "([{}])%1", scannerPosition)
    		if not Start then --Done (both 9e9)
    			break
    		end
    		local _, End = string.find(InputText, "^"..Character.."+", Start)
    
    		scannerPosition = Start --Get to the {{ / }} set
    		if Character == "{" then --Add the {{+ set to the queue
    			openBrackets[#openBrackets+1] = {Start=Start, End=End}
    
    		else --Pair up the }} to any available {{, accounting for {{{ / }}}
    			local BracketCount = End-Start+1
    			while BracketCount >= 2 and #openBrackets >= 1 do
    				local OpenSet = table.remove(openBrackets)
    				if boundlen(OpenSet) >= 3 and BracketCount >= 3 then --We have a {{{variable}}} (both sides have 3 spare)
    					variables[OpenSet.End-2] = {Start=OpenSet.End-2, End=scannerPosition+2, Type="Variable"} --Done like this to ensure chronological order
    					BracketCount = BracketCount - 3
    					OpenSet.End = OpenSet.End - 3
    					scannerPosition = scannerPosition + 3
    
    				else --We have a {{template}} (both sides have 2 spare, but at least one side doesn't have 3 spare)
    					templates[OpenSet.End-1] = {Start=OpenSet.End-1, End=scannerPosition+1, Type="Template"} --Done like this to ensure chronological order
    					BracketCount = BracketCount - 2
    					OpenSet.End = OpenSet.End - 2
    					scannerPosition = scannerPosition + 2
    				end
    
    				if boundlen(OpenSet) >= 2 then --Still has enough data left, leave it in
    					openBrackets[#openBrackets+1] = OpenSet
    				end
    			end
    		end
    		scannerPosition = End --Now move past the bracket set
    	end
    	
    	--Step 3: Re-trace every object using their known bounds, collecting our parameters with (slight) ease
    	local scannerPosition = 1
    	local activeObjects = {}
    	local finalObjects = {}
    	while true do
    		local LatestObject = activeObjects[#activeObjects] --Commonly needed object
    		local NNC, _, Character --NNC = NextNotableCharacter
    		if LatestObject then
    			NNC, _, Character = string.find(InputText, "([{}%[%]|=])", scannerPosition)
    		else
    			NNC, _, Character = string.find(InputText, "([{}])", scannerPosition) --We are only after templates right now
    		end
    		if not NNC then
    			break
    		end
    		if NNC > scannerPosition and LatestObject then
    			local scannedContent = string.sub(InputText, scannerPosition, NNC-1)
    			LatestObject:AppendText(scannedContent, finalise(scannedContent))
    		end
    
    		scannerPosition = NNC+1
    		if Character == "{" or Character == "[" then
    			local Container = templates[NNC] or variables[NNC] or wikilinks[NNC]
    			if Container then
    				CreateContainerObj(Container)
    				if Container.Type == "Template" then
    					Container:AppendText("{{")
    					scannerPosition = NNC+2
    				elseif Container.Type == "Variable" then
    					Container:AppendText("{{{")
    					scannerPosition = NNC+3
    				else --Wikilink
    					Container:AppendText("[[")
    					scannerPosition = NNC+2
    				end
    				if LatestObject and Container.Type == "Template" then --Only templates count as children
    					LatestObject.Children[#LatestObject.Children+1] = Container
    				end
    				activeObjects[#activeObjects+1] = Container
    			elseif LatestObject then
    				LatestObject:AppendText(Character)
    			end
    
    		elseif Character == "}" or Character == "]" then
    			if LatestObject then
    				LatestObject:AppendText(Character)
    				if LatestObject.End == NNC then
    					if LatestObject.Type == "Template" then
    						LatestObject:Clean(true)
    						finalObjects[#finalObjects+1] = LatestObject
    					else
    						LatestObject:Clean(false)
    					end
    					activeObjects[#activeObjects] = nil
    					local NewLatest = activeObjects[#activeObjects]
    					if NewLatest then
    						NewLatest:AppendText(LatestObject.Text) --Append to new latest
    					end
    				end
    			end
    
    		else --| or =
    			if LatestObject then
    				LatestObject:HandleArgInput(Character)
    			end
    		end
    	end
    	
    	--Step 4: Fix the order
    	local FixedOrder = {}
    	local SortableReference = {}
    	for _,Object in next,finalObjects do
    		SortableReference[#SortableReference+1] = Object.Start
    	end
    	table.sort(SortableReference)
    	for i = 1,#SortableReference do
    		local start = SortableReference[i]
    		for n,Object in next,finalObjects do
    			if Object.Start == start then
    				finalObjects[n] = nil
    				Object.Start = nil --Final cleanup
    				Object.End = nil
    				Object.Type = nil
    				FixedOrder[#FixedOrder+1] = Object
    				break
    			end
    		end
    	end
    	
    	--Finished, return
    	return FixedOrder
    end
    
    local p = {}
    --Main entry points
    p.PrepareText = PrepareText
    p.ParseTemplates = ParseTemplates
    --Extra entry points, not really required
    p.TestForNowikiTag = TestForNowikiTag
    p.TestForComment = TestForComment
    
    return p
    
    --[==[ console tests
    
    local s = [=[Hey!{{Text|<nowiki | ||>
    Hey! }}
    A</nowiki>|<!--AAAAA|AAA-->Should see|Shouldn't see}}]=]
    local out = p.PrepareText(s)
    mw.logObject(out)
    
    local s = [=[B<!--
    Hey!
    -->A]=]
    local out = p.TestForComment(s, 2)
    mw.logObject(out); mw.log(string.sub(s, 2, out.Length))
    
    local a = p.ParseTemplates([=[
    {{User:Aidan9382/templates/dummy
    |A|B|C {{{A|B}}} { } } {
    |<nowiki>D</nowiki>
    |<pre>E
    |F</pre>
    |G|=|a=|A  =  [[{{PAGENAME}}|A=B]]{{Text|1==<nowiki>}}</nowiki>}}|A B=Success}}
    ]=])
    mw.logObject(a)
    
    ]==]