Pumunta sa nilalaman

Module:lt-common

Mula Wiksiyonaryo


local export = {}

local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD
local u = mw.ustring.char
local ugsub = mw.ustring.gsub
local ulower = mw.ustring.lower
local uupper = mw.ustring.upper

-- Combining diacritical marks used throughout this module, each built from its Unicode code point.
local grave = u(0x0300) -- U+0300 COMBINING GRAVE ACCENT
local acute = u(0x0301) -- U+0301 COMBINING ACUTE ACCENT
local tilde = u(0x0303) -- U+0303 COMBINING TILDE
local macron = u(0x0304) -- U+0304 COMBINING MACRON
local dotabove = u(0x0307) -- U+0307 COMBINING DOT ABOVE
local caron = u(0x030C) -- U+030C COMBINING CARON
local ogonek = u(0x0328) -- U+0328 COMBINING OGONEK
-- Pattern character class matching any single accent mark (grave, acute or tilde). The macron, dot above, caron and
-- ogonek are deliberately not part of this class.
local accents = "[" .. grave .. acute .. tilde .. "]"

-- Maps each dotless letter to its ordinary dotted counterpart. Letters that are not keys of this table have no
-- entry, so callers fall back to the letter itself.
local dotless_to_dotted = {
	["ı"] = "i", -- U+0131 LATIN SMALL LETTER DOTLESS I
	["ȷ"] = "j", -- U+0237 LATIN SMALL LETTER DOTLESS J
}

-- Replacement callback: returns `base` converted to its dotted form (if it is one of the dotless letters listed in
-- `dotless_to_dotted`; otherwise unchanged), followed by `below`.
--   base:  the captured base letter.
--   below: the captured mark that follows the base (may be the empty string).
local function char_to_dotted_form(base, below)
	local dotted = dotless_to_dotted[base]
	if not dotted then
		-- Not a dotless letter: keep the base as it is.
		dotted = base
	end
	return dotted .. below
end

-- Drops the combining dot above that follows "i", "ı", "j" or "ȷ" (optionally with an ogonek in between), converting
-- a dotless base letter to its dotted form at the same time. On entry, text must be in NFD form.
-- Returns only the rewritten string (the substitution count is discarded).
local function normalize_dotted_chars(text)
	-- Capture 1: the base letter; capture 2: an optional ogonek. The trailing dot above is matched but not captured,
	-- so the replacement callback leaves it out.
	local dotted_letter = "([iıjȷ])(" .. ogonek .. "?)" .. dotabove
	local normalized = ugsub(text, dotted_letter, char_to_dotted_form)
	return normalized
end

-- Replacement callback for a base letter (plus optional mark below) that is about to be followed by an accent.
--   base:  the captured base letter.
--   below: the captured mark that follows the base (may be the empty string).
local function char_to_accent_form(base, below)
	if base ~= "i" and base ~= "j" then
		-- A dotless letter combining with an accent: convert it to the dotted form so that it normalizes
		-- properly. This shouldn't happen, but just in case.
		return char_to_dotted_form(base, below)
	end
	-- Ordinary "i"/"j": add a 'dot above' after the base (and after any mark below).
	return base .. below .. dotabove
end

-- Builds the display form of `text`: the string is decomposed, dot-above/dotless spellings of "i" and "j" are
-- normalized (accents are kept), a 'dot above' is inserted between "i"/"j" and a following accent, and the result
-- is recomposed to NFC. `lang` and `sc` are accepted but not used here.
function export.makeDisplayText(text, lang, sc)
	local decomposed = normalize_dotted_chars(toNFD(text))
	-- The frontier pattern (%f) requires the next character to be an accent without consuming it, so only the base
	-- letter and the optional ogonek are handed to the callback and replaced.
	local before_accent = "([iıjȷ])(" .. ogonek .. "?)%f" .. accents
	local with_dots = ugsub(decomposed, before_accent, char_to_accent_form)
	return toNFC(with_dots)
end

-- Returns `text` in its stripped form: decomposed to NFD, with every run of accent marks (grave, acute, tilde)
-- deleted and dot-above/dotless spellings of "i" and "j" normalized. The result is still decomposed; callers are
-- responsible for recomposing it.
local function stripped_text_form(text)
	local decomposed = toNFD(text)
	local unaccented = ugsub(decomposed, accents .. "+", "")
	return normalize_dotted_chars(unaccented)
end

-- Called from [[Module:languages]] since [[Module:lt-common]] is set as the stripDiacritics handler in
-- [[Module:languages/data/2]].
-- Returns `text` with accents removed and dotted characters normalized, recomposed to NFC. `lang` and `sc` are
-- accepted but not used here.
function export.stripDiacritics(text, lang, sc)
	local stripped = stripped_text_form(text)
	return toNFC(stripped)
end

-- Per-character replacements applied when building a sort key. Each remaining diacritic is swapped for a distinct
-- code point in the range U+F000..U+F003 (Private Use Area), and "y" becomes "i" followed by U+F004.
local sortkey_substitutes = {
	[ogonek] = u(0xF000),
	[caron] = u(0xF001),
	[macron] = u(0xF002),
	[dotabove] = u(0xF003),
	["y"] = "i" .. u(0xF004),
}

-- Builds the sort key for `text`. `lang` and `sc` are accepted but not used here.
-- The text is lowercased and normalized to the stripped-text form, then diacritics are converted to Private Use
-- Area characters so they sort after all other characters. The result is uppercased and recomposed to NFC.
function export.makeSortKey(text, lang, sc)
	local stripped = stripped_text_form(ulower(text))
	-- The pattern matches one byte plus any following bytes in the range 128-191 (UTF-8 continuation bytes), so
	-- each encoded character is looked up in the table as a whole; characters without an entry are kept as-is.
	local substituted = stripped:gsub(".[\128-\191]*", sortkey_substitutes)
	return toNFC(uupper(substituted))
end

return export