-- Module:Polyt-stripdiacritics
-- From Wiktionary ("Mula Wiksiyonaryo").


-- Table of functions exported by this module (returned at the end of the file).
local export = {}

-- Local aliases for the Unicode-aware mw.ustring functions used below.
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD
-- Called below as u(codepoint, ...) to build strings from codepoints
-- (presumably a UTF-8 character constructor; defined in Module:string/char).
local u = require("Module:string/char")
local ugsub = mw.ustring.gsub
local umatch = mw.ustring.match

-- Combining marks used by the patterns and replacements below.
local grave = u(0x300) -- U+0300 combining grave accent
local acute = u(0x301) -- U+0301 combining acute accent
local smooth = u(0x313) -- U+0313 combining comma above (smooth breathing)
local rough = u(0x314) -- U+0314 combining reversed comma above (rough breathing)

-- Character class for one "word" character in decomposed text: alphanumerics
-- (%w) plus the accents and breathings above, plus U+0308 (diaeresis),
-- U+0342 (perispomeni) and U+0345 (ypogegrammeni).
local word_ch = "[%w" .. grave .. acute .. smooth .. rough .. u(0x308, 0x342, 0x345) .. "]"
-- Anchored pattern: the rest of the current word, then whitespace, then the
-- first character of another word. Anything else in between (for example
-- punctuation) makes the match fail.
local following_word_pattern = "^" .. word_ch .. "*%s+" .. word_ch -- not punctuation

-- Character class matching either breathing mark.
local breathing_ch = "[" .. smooth .. rough .. "]"
-- Single-codepoint stand-in so that Ρ̓ can be listed inside a character class.
local rho_cap_smooth_sub = u(0x1FDC) -- temporary (unused) codepoint for Ρ̓, which has no atomic codepoint
-- Any rho: lower or upper case, with no breathing, smooth breathing or rough
-- breathing (precomposed forms, plus the stand-in for Ρ̓).
local rho = "[ρῤῥΡ" .. rho_cap_smooth_sub .. "Ῥ]"
-- A run of at least two consecutive rhos.
local two_or_more_rhos =  rho .. rho .. "+"
-- A rho run whose breathings are the expected ones: every rho except the last
-- is plain or smooth, and the last is plain or rough.
local expected_rho_breathings = "^[ρῤΡ" .. rho_cap_smooth_sub .. "]+[ρῥΡῬ]$"

-- stripDiacritics from Module:Grek-common; applied as the first step of
-- export.stripDiacritics.
local Grek_stripDiacritics = require("Module:Grek-common").stripDiacritics

--- Normalises polytonic Greek text.
-- Steps, in order:
--   1. apply the substitutions from Module:Grek-common's stripDiacritics;
--   2. remove length marks (macron, breve) and the double undertie;
--   3. turn a grave accent into an acute unless another word follows;
--   4. remove the breathings from runs of two or more rhos when those
--      breathings are the expected ones (e.g. "ῤῥ" becomes "ρρ").
-- @param text string to process.
-- @param lang forwarded unchanged to Module:Grek-common's stripDiacritics.
-- @param sc forwarded unchanged to Module:Grek-common's stripDiacritics.
-- @return the processed string, in NFC.
function export.stripDiacritics(text, lang, sc)
	-- Do some substitutions done for all Greek text.
	text = Grek_stripDiacritics(text, lang, sc)
	-- Remove length marks (U+0304 macron, U+0306 breve) and double undertie
	-- (U+035C). These byte patterns spell out the complete UTF-8 sequences,
	-- so the byte-oriented string.gsub is exact here.
	text = toNFD(text):gsub("\204[\132\134]", ""):gsub("\205\156", "")
	-- Convert grave to acute unless followed by another word. The "()"
	-- position capture gives the index just after the grave; the anchored
	-- pattern is tried from there against the text as it was before this
	-- substitution started.
	text = ugsub(text, grave .. "()", function(pos)
		if not umatch(text, following_word_pattern, pos) then
			return acute
		end
		-- Returning nothing keeps the grave.
	end)
	-- Convert "ῤῥ" to "ρρ". Ρ̓ is first swapped for a one-codepoint
	-- placeholder so that it can be matched by the rho character class; any
	-- placeholder left over afterwards is swapped back.
	text = ugsub(toNFC(text):gsub("Ρ̓", rho_cap_smooth_sub), two_or_more_rhos, function(rhos)
		if umatch(rhos, expected_rho_breathings) then
			-- Keep only the string result of gsub; its second result (the
			-- replacement count) must not leak into toNFD.
			local restored = rhos:gsub(rho_cap_smooth_sub, "Ρ̓")
			-- breathing_ch is a class of multi-byte characters, so it has to
			-- be applied with the Unicode-aware gsub: the byte-oriented
			-- string.gsub would treat it as a set of individual bytes.
			return (ugsub(toNFD(restored), breathing_ch, ""))
		end
		-- Unexpected breathing combinations are left untouched.
	end):gsub(rho_cap_smooth_sub, "Ρ̓")
	return toNFC(text)
end

-- Hand the export table (holding stripDiacritics) back to require().
return export