Module:lt-common
Itsura
- This module lacks a documentation subpage. Please create it.
- Useful links: subpage list • links • transclusions • testcases • sandbox
-- Table holding this module's public functions; it is returned at the end of the file.
local export = {}
-- Local aliases for the mw.ustring functions used throughout this module.
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD
local u = mw.ustring.char
local ugsub = mw.ustring.gsub
local ulower = mw.ustring.lower
local uupper = mw.ustring.upper
-- Combining diacritics, each built from its Unicode code point.
local grave = u(0x0300)
local acute = u(0x0301)
local tilde = u(0x0303)
local macron = u(0x0304)
local dotabove = u(0x0307)
local caron = u(0x030C)
local ogonek = u(0x0328)
-- Pattern character class matching a single grave, acute or tilde mark.
-- Macron, dot above, caron and ogonek are deliberately not part of this class.
local accents = "[" .. grave .. acute .. tilde .. "]"
-- Dotless letters paired with their ordinary dotted counterparts.
local dotted_equivalents = {
	["ı"] = "i",
	["ȷ"] = "j",
}

-- Return `base` followed by `below`, swapping a dotless `base` for its dotted
-- counterpart. Any other `base` is passed through unchanged.
local function char_to_dotted_form(base, below)
	local dotted = dotted_equivalents[base]
	if dotted == nil then
		dotted = base
	end
	return dotted .. below
end
-- Drop a combining dot above that follows i, ı, j or ȷ (optionally with an
-- ogonek in between). A dotless base in such a sequence is replaced by its
-- dotted form. The caller must pass text that is already in NFD form.
-- Returns only the rewritten string (the substitution count is discarded).
local function normalize_dotted_chars(text)
	local dotted_sequence = "([iıjȷ])(" .. ogonek .. "?)" .. dotabove
	local normalized = ugsub(text, dotted_sequence, char_to_dotted_form)
	return normalized
end
-- Replacement callback used by makeDisplayText for an i/j-like base (plus an
-- optional ogonek in `below`) that is immediately followed by an accent.
local function char_to_accent_form(base, below)
	local is_dotted_base = base == "i" or base == "j"
	if not is_dotted_base then
		-- A dotless base before an accent is not expected, but if one shows up
		-- it is converted to the dotted letter so that it normalizes properly.
		return char_to_dotted_form(base, below)
	end
	-- Dotted base: insert an explicit 'dot above' after the base (and ogonek).
	return base .. below .. dotabove
end
-- Build the display form of `text`: dot-above marks and dotless letters are
-- normalized first, then a 'dot above' is placed between "i"/"j" (with an
-- optional ogonek) and a following grave, acute or tilde accent. The result is
-- returned in NFC form. `lang` and `sc` are accepted but not used here.
function export.makeDisplayText(text, lang, sc)
	-- Work on decomposed text so that accents are separate combining marks.
	local decomposed = normalize_dotted_chars(toNFD(text))
	-- The frontier (%f) checks that an accent follows without consuming it.
	local before_accent = "([iıjȷ])(" .. ogonek .. "?)%f" .. accents
	local with_dots = ugsub(decomposed, before_accent, char_to_accent_form)
	return toNFC(with_dots)
end
-- Return `text` in NFD form with grave, acute and tilde accents deleted and
-- with dot-above marks / dotless letters normalized. The result is not
-- recomposed to NFC; callers do that themselves.
local function stripped_text_form(text)
	local decomposed = toNFD(text)
	-- Delete every run of accent marks.
	local unaccented = ugsub(decomposed, accents .. "+", "")
	return normalize_dotted_chars(unaccented)
end
-- stripDiacritics handler: [[Module:languages/data/2]] registers [[Module:lt-common]]
-- for this role, and [[Module:languages]] calls it. Returns `text` without accents and
-- with dotted characters normalized, in NFC form. `lang` and `sc` are not used here.
function export.stripDiacritics(text, lang, sc)
	local stripped = stripped_text_form(text)
	return toNFC(stripped)
end
-- Replacement table used by export.makeSortKey as the gsub replacement argument.
-- Each remaining combining diacritic is mapped to a Private Use Area code point
-- (U+F000 to U+F003), and "y" is mapped to "i" followed by U+F004.
-- Characters without an entry here are left unchanged by gsub.
local sortkey_substitutes = {
[ogonek] = u(0xF000),
[caron] = u(0xF001),
[macron] = u(0xF002),
[dotabove] = u(0xF003),
["y"] = "i" .. u(0xF004),
}
-- Build the sort key for `text`: lowercase it, reduce it to the stripped-text
-- form, replace the remaining diacritics (and "y") via sortkey_substitutes so
-- that they become Private Use Area characters sorting after all other
-- characters, then uppercase and return the result in NFC form.
-- `lang` and `sc` are accepted but not used here.
function export.makeSortKey(text, lang, sc)
	local stripped = stripped_text_form(ulower(text))
	-- The byte pattern matches one UTF-8 character at a time (a lead byte plus
	-- any continuation bytes), which is then looked up in the substitution table.
	local substituted = stripped:gsub(".[\128-\191]*", sortkey_substitutes)
	return toNFC(uupper(substituted))
end
return export