Jump to content

Module:languages/doSubstitutions

From Wiktionary
-- Converts any specified exceptions into PUA characters, to avoid having diacritics stripped. Uses the supplementary PUA planes (U+FXXXX & U+10XXXX), to ensure that any characters in the BMP (U+0XXXX) or SMP (U+1XXXX) can be round-trip converted to PUA.
-- This will need to be reviewed if any characters in the SIP (U+2XXXX) or TIP (U+3XXXX) need to be processed by it, but as these planes are exclusively CJK characters as of 2022, this is unlikely to happen for the time being. However, it is unwise to start using non-PUA codepoints in the U+4XXXX-U+EXXXX range, as support for these is completely untested, so they may result in unpredictable behaviour.
-- `text` is the string to process, `sc` a script object providing `toFixedNFD`, and `remove_exceptions` an optional list of sequences to protect.
-- With `undo` unset, each exception found in `text` is replaced by a placeholder built by shifting every codepoint up by 0xF0000; with `undo` set, the placeholders are turned back into the (NFD-normalized) exceptions.
-- Returns the resulting text. If `remove_exceptions` is nil or false, `text` is returned unchanged.
local function removeExceptions(text, sc, remove_exceptions, undo)
	-- Nothing to protect, so hand the text straight back.
	if not remove_exceptions then
		return text
	end
	local ustring = mw.ustring
	local to_char, to_codepoints = ustring.char, ustring.codepoint
	local ulen = require("Module:string utilities").len
	for _, exception in ipairs(remove_exceptions) do
		local normalized = sc:toFixedNFD(exception)
		-- Collect every codepoint of the exception, then move each one into the PUA planes.
		local shifted = {to_codepoints(normalized, 1, ulen(normalized))}
		for index = 1, #shifted do
			shifted[index] = to_char(shifted[index] + 0xF0000)
		end
		local placeholder = table.concat(shifted)
		-- Note that the first argument to gsub is treated as a Lua pattern in both directions.
		if undo then
			text = text:gsub(placeholder, normalized)
		else
			text = text:gsub(normalized, placeholder)
		end
	end
	return text
end

-- Applies the substitutions described by `substitution_data` to `text`.
--
-- `text`: the string to process.
-- `self`: the language object; only `self:getCode()` is used, and only when a module is called.
-- `sc`: the script object; `getCode`, `toFixedNFD`, `toFixedNFC` and `fixDiscouragedSequences` are used.
-- `substitution_data`: one of
--   * a table, which may contain nested data keyed by script code (`Hani` is shared by every script code beginning with "Han", and key 1 is the fallback), `from`/`to` arrays of patterns and replacements, and `remove_diacritics` (optionally with `remove_exceptions`);
--   * a string naming a module ("Module:" is prepended), whose `function_name` function is called;
--   * anything else, in which case no substitutions are made.
-- `function_name`: the function to call in a substitution module. For "tr" the text is passed as-is; otherwise it is NFD-normalized first.
-- `recursed`: set internally when processing nested data, to skip the final normalization.
--
-- Returns the processed text, followed by `fail` and `cats`. The latter two are whatever a module or nested call returned; if neither ran, they are nil and an empty table.
-- Raises an error if `substitution_data` is a string and the module cannot be loaded.
local function doSubstitutions(text, self, sc, substitution_data, function_name, recursed)
	local fail, cats = nil, {}
	if type(substitution_data) == "table" then
		-- Run any script-specific data first. Hant, Hans and Hani are usually treated the same, so `Hani` covers them all to avoid having to specify each one separately. Key 1 may be given as a fallback.
		local sc_code = sc:getCode()
		local nested = substitution_data[sc_code]
			or (sc_code:match("^Han") and substitution_data.Hani)
			or substitution_data[1]
		if nested then
			text, fail, cats = doSubstitutions(text, self, sc, nested, function_name, true)
		end

		-- Loaded on first use and shared by both steps below.
		local gsub

		-- Iterate over all strings in the "from" subtable, and gsub with the corresponding string in "to". We work with the NFD decomposed forms, as this simplifies many substitutions.
		local from_list = substitution_data.from
		if from_list then
			gsub = require("Module:string utilities").gsub
			-- A missing replacement means "delete the match"; treat a missing "to" table the same way instead of failing on a nil index.
			local to_list = substitution_data.to or {}
			for i, from in ipairs(from_list) do
				-- We normalize each loop, to ensure multi-stage substitutions work correctly.
				text = sc:toFixedNFD(text)
				-- NOTE(review): the string utilities gsub presumably falls back to string.gsub when the pattern needs no UTF-8 awareness - confirm in that module.
				text = gsub(text, sc:toFixedNFD(from), to_list[i] or "")
			end
		end

		local diacritics = substitution_data.remove_diacritics
		if diacritics then
			gsub = gsub or require("Module:string utilities").gsub
			local exceptions = substitution_data.remove_exceptions
			text = sc:toFixedNFD(text)
			-- Convert exceptions to PUA.
			text = removeExceptions(text, sc, exceptions)
			-- Strip diacritics.
			text = gsub(text, "[" .. diacritics .. "]", "")
			-- Convert exceptions back.
			text = removeExceptions(text, sc, exceptions, true)
		end
	elseif type(substitution_data) == "string" then
		-- If there is a dedicated function module, use that.
		local loaded, mod = pcall(require, "Module:" .. substitution_data)
		if not loaded then
			-- On failure `mod` holds the error raised by require. Report it along with the module title, since the module may exist but have failed to load.
			error("Substitution data does not match an existing module. Could not load \"Module:" .. substitution_data .. "\": " .. tostring(mod))
		end
		local input = text
		if function_name ~= "tr" then
			input = sc:toFixedNFD(text)
		end
		text, fail, cats = mod[function_name](input, self:getCode(), sc:getCode())
	end

	-- Don't normalize to NFC if this is the inner loop or if a module returned nil.
	if recursed or not text then
		return text, fail, cats
	end
	-- Fix any discouraged sequences created during the substitution process, and normalize into the final form.
	text = sc:fixDiscouragedSequences(text)
	return sc:toFixedNFC(text), fail, cats
end

-- This avoids calling into globals with require when the main function recurses.
-- The exported entry point forwards only the five public arguments, so callers cannot set the internal `recursed` flag of doSubstitutions.
local function export(text, self, sc, substitution_data, function_name)
	return doSubstitutions(text, self, sc, substitution_data, function_name)
end

return export