Module:languages/doSubstitutions
Appearance
You might want to create a documentation page for this Scribunto module. Editors can experiment in this module's sandbox (create | mirror) and testcases (create) pages. Please add categories to the /doc subpage. Subpages of this module. |
-- Converts any specified exceptions into PUA characters, to avoid having diacritics stripped. Uses the supplementary PUA planes (U+FXXXX & U+10XXXX), to ensure that any characters in the BMP (U+0XXXX) or SMP (U+1XXXX) can be round-trip converted to PUA.
-- This will need to be reviewed if any characters in the SIP (U+2XXXX) or TIP (U+3XXXX) need to be processed by it, but as these planes are exclusively CJK characters as of 2022, this is unlikely to happen for the time being. However, it is unwise to start using non-PUA codepoints in the U+4XXXX-U+EXXXX range, as support for these is completely untested, so they may result in unpredictable behaviour.
-- Swaps each exception string in `text` for a PUA stand-in (every codepoint shifted up by 0xF0000), or swaps the stand-ins back when `undo` is truthy.
-- text: the string to process.
-- sc: script object; only its toFixedNFD method is used here, to normalize each exception before matching.
-- remove_exceptions: list of exception strings, or nil/false to return `text` unchanged.
-- undo: if truthy, convert PUA stand-ins back into the original exceptions.
-- Returns the processed text.
local function removeExceptions(text, sc, remove_exceptions, undo)
	if not remove_exceptions then
		return text
	end
	local u, codepoint = mw.ustring.char, mw.ustring.codepoint
	local len = require("Module:string utilities").len
	-- Exceptions are literal strings, so any Lua pattern magic characters must be escaped before they are used as a gsub pattern.
	local function escapePattern(str)
		return (str:gsub("[%^%$%(%)%%%.%[%]%*%+%-%?]", "%%%0"))
	end
	for _, exception in ipairs(remove_exceptions) do
		exception = sc:toFixedNFD(exception)
		-- Build the PUA stand-in by shifting every codepoint of the exception.
		local substitute = {codepoint(exception, 1, len(exception))}
		for i, point in ipairs(substitute) do
			substitute[i] = u(point + 0xF0000)
		end
		local from, to = exception, table.concat(substitute)
		if undo then
			from, to = to, from
		end
		-- A function replacement is inserted verbatim, so a "%" in the replacement cannot be misread as a capture reference.
		text = text:gsub(escapePattern(from), function() return to end)
	end
	return text
end
-- Applies the substitutions described by `substitution_data` to `text`.
-- text: the string to process.
-- self: language object; only self:getCode() is used, when calling a dedicated module.
-- sc: script object providing getCode, toFixedNFD, toFixedNFC and fixDiscouragedSequences.
-- substitution_data: either a table (optional script-specific sub-entries keyed by script code, "Hani" or 1, plus optional `from`/`to` lists and `remove_diacritics`/`remove_exceptions`), or a string naming a module (without the "Module:" prefix) whose `function_name` function does the work.
-- function_name: name of the function to call in a dedicated module; "tr" receives the text without NFD normalization.
-- recursed: truthy on inner calls, which skips the final fix-up and NFC normalization.
-- Returns text, fail, cats. `fail` and `cats` are whatever a dedicated module returned; otherwise nil and an empty table.
local function doSubstitutions(text, self, sc, substitution_data, function_name, recursed)
	local fail, cats = nil, {}
	local data_type = type(substitution_data)
	if data_type == "table" then
		-- Pick script-specific data to run first: an exact script code match, then Hani for any Han* script (Hant, Hans and Hani are usually treated the same), then the fallback stored under key 1.
		local sc_code = sc:getCode()
		local nested = substitution_data[sc_code]
		if not nested and sc_code:match("^Han") then
			nested = substitution_data.Hani
		end
		nested = nested or substitution_data[1]
		if nested then
			text, fail, cats = doSubstitutions(text, self, sc, nested, function_name, true)
			-- A nil result (e.g. from a dedicated module) cannot be processed any further, so hand it straight back instead of feeding nil to the steps below.
			if not text then
				return text, fail, cats
			end
		end
		-- Iterate over all strings in the "from" subtable, and gsub with the corresponding string in "to". We work with the NFD decomposed forms, as this simplifies many substitutions.
		local from_list = substitution_data.from
		if from_list then
			local gsub = require("Module:string utilities").gsub
			-- A missing "to" table behaves like a missing entry: the match is deleted.
			local to_list = substitution_data.to or {}
			for i, from in ipairs(from_list) do
				-- We normalize each loop, to ensure multi-stage substitutions work correctly.
				text = sc:toFixedNFD(text)
				text = gsub(text, sc:toFixedNFD(from), to_list[i] or "")
			end
		end
		local diacritics = substitution_data.remove_diacritics
		if diacritics then
			local exceptions = substitution_data.remove_exceptions
			text = sc:toFixedNFD(text)
			-- Convert exceptions to PUA.
			text = removeExceptions(text, sc, exceptions)
			-- Strip diacritics.
			text = require("Module:string utilities").gsub(text, "[" .. diacritics .. "]", "")
			-- Convert exceptions back.
			text = removeExceptions(text, sc, exceptions, true)
		end
	elseif data_type == "string" then
		-- If there is a dedicated function module, use that.
		-- NOTE(review): pcall discards the underlying load error, so a module that exists but fails to load is reported as missing.
		local is_module, module = pcall(require, "Module:" .. substitution_data)
		if not is_module then
			error("Substitution data does not match an existing module.")
		end
		local input = text
		-- Only "tr" receives the text as-is; every other function gets the NFD form.
		if function_name ~= "tr" then
			input = sc:toFixedNFD(text)
		end
		text, fail, cats = module[function_name](input, self:getCode(), sc:getCode())
	end
	-- Don't normalize to NFC if this is the inner loop or if a module returned nil.
	if recursed or not text then
		return text, fail, cats
	end
	-- Fix any discouraged sequences created during the substitution process, and normalize into the final form.
	text = sc:fixDiscouragedSequences(text)
	return sc:toFixedNFC(text), fail, cats
end
-- The module exports a thin wrapper; recursion goes through the local doSubstitutions rather than back through require.
-- The wrapper never passes `recursed`, so a call from outside is always treated as the outermost call.
return function (text, self, sc, substitution_data, function_name)
	local result, fail, cats = doSubstitutions(text, self, sc, substitution_data, function_name)
	return result, fail, cats
end