-- Module:Polyt-stripdiacritics
-- Itsura
-- - This module lacks a documentation subpage. Please create it.
-- - Useful links: subpage list • links • transclusions • testcases • sandbox
-- Table of public functions; it is returned at the end of the module.
local export = {}
-- Local aliases for the Unicode-aware helpers from mw.ustring.
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD
-- u(...) is called below with code point numbers and its result is concatenated
-- into strings (presumably the UTF-8 text for those code points -- see
-- Module:string/char to confirm).
local u = require("Module:string/char")
local ugsub = mw.ustring.gsub
local umatch = mw.ustring.match
-- Combining marks: U+0300 grave, U+0301 acute, U+0313 smooth breathing
-- (comma above), U+0314 rough breathing (reversed comma above).
local grave = u(0x300)
local acute = u(0x301)
local smooth = u(0x313)
local rough = u(0x314)
-- Character class for one "word" character: anything matched by %w, or one of
-- the combining marks above, or U+0308 (diaeresis), U+0342 (perispomeni),
-- U+0345 (ypogegrammeni).
local word_ch = "[%w" .. grave .. acute .. smooth .. rough .. u(0x308, 0x342, 0x345) .. "]"
-- Anchored pattern: the rest of the current word (possibly empty), then
-- whitespace, then the first character of a following word.
local following_word_pattern = "^" .. word_ch .. "*%s+" .. word_ch -- not punctuation
-- Character class matching either breathing mark.
local breathing_ch = "[" .. smooth .. rough .. "]"
local rho_cap_smooth_sub = u(0x1FDC) -- temporary (unused) codepoint for Ρ̓, which has no atomic codepoint
-- Character class for any rho: lower/upper case, plain, smooth or rough
-- (capital rho with smooth breathing is represented by the placeholder above).
local rho = "[ρῤῥΡ" .. rho_cap_smooth_sub .. "Ῥ]"
-- A run of at least two rhos.
local two_or_more_rhos = rho .. rho .. "+"
-- A whole run of rhos in which every rho but the last is plain or smooth and
-- the last one is plain or rough.
local expected_rho_breathings = "^[ρῤΡ" .. rho_cap_smooth_sub .. "]+[ρῥΡῬ]$"
-- Substitutions applied first by export.stripDiacritics; implemented in
-- Module:Grek-common.
local Grek_stripDiacritics = require("Module:Grek-common").stripDiacritics
-- Strip diacritics from polytonic Greek text.
--
-- Steps, in order:
--   1. apply the substitutions of Grek_stripDiacritics (Module:Grek-common);
--   2. remove length marks (macron, breve) and the double undertie;
--   3. turn a grave into an acute unless another word follows;
--   4. drop the breathings from runs of rhos such as "ῤῥ" (giving "ρρ").
--
-- Parameters:
--   text  the string to process
--   lang  passed through unchanged to Grek_stripDiacritics
--   sc    passed through unchanged to Grek_stripDiacritics
-- Returns the processed string in NFC.
function export.stripDiacritics(text, lang, sc)
	-- Do some substitutions done for all Greek text.
	text = Grek_stripDiacritics(text, lang, sc)

	-- Remove length marks and double undertie. The byte-wise string.gsub is
	-- safe on decomposed text: "\204\132" and "\204\134" are the UTF-8 bytes of
	-- U+0304 (macron) and U+0306 (breve), "\205\156" those of U+035C, and a
	-- lead byte followed by its continuation byte cannot match inside another
	-- character.
	local decomposed = toNFD(text)
	decomposed = decomposed:gsub("\204[\132\134]", "")
	decomposed = decomposed:gsub("\205\156", "")

	-- Convert grave to acute unless followed by another word. The empty
	-- capture "()" yields the position just after the grave; the anchored
	-- pattern is tried from there. Returning nil from the callback leaves the
	-- grave untouched. Replacing one combining mark by another keeps all
	-- positions valid, so matching against the pre-substitution string is fine.
	local function grave_replacement(pos)
		if umatch(decomposed, following_word_pattern, pos) then
			return nil
		end
		return acute
	end
	local regraved = ugsub(decomposed, grave .. "()", grave_replacement)

	-- Convert "ῤῥ" to "ρρ". Capital rho with smooth breathing has no
	-- precomposed form, so it is swapped for a single placeholder code point
	-- while the character-class patterns run and swapped back afterwards.
	local composed = toNFC(regraved):gsub("Ρ̓", rho_cap_smooth_sub)
	local function strip_rho_breathings(rhos)
		if not umatch(rhos, expected_rho_breathings) then
			return nil
		end
		-- Keep only the string result of gsub; previously its match count was
		-- passed along to toNFD as a stray second argument.
		local restored = rhos:gsub(rho_cap_smooth_sub, "Ρ̓")
		-- Byte-wise removal works here because the run holds only rhos and
		-- breathings, and no UTF-8 byte of ρ or Ρ occurs in either breathing.
		return (toNFD(restored):gsub(breathing_ch, ""))
	end
	local stripped = ugsub(composed, two_or_more_rhos, strip_rho_breathings)
	stripped = stripped:gsub(rho_cap_smooth_sub, "Ρ̓")

	return toNFC(stripped)
end
-- Hand the table of public functions to whoever require()s this module.
return export