Pumunta sa nilalaman

Module:fa-cls-translit

Mula Wiksiyonaryo

This module will transliterate Classical Persian text. It is also used to transliterate Mogholi. The module should preferably not be called directly from templates or other modules. To use it from a template, use {{xlit}}. Within a module, use Module:languages#Language:transliterate.

For testcases, see Module:fa-cls-translit/testcases.

Functions

[baguhin]
tr(text, lang, sc)
Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang.
When the transliteration fails, returns nil.

-- Authors: Sameerhameedy

-- Shorthand: builds a one-character string from a Unicode code point.
local U = mw.ustring.char
-- Unicode-aware substitution function from the mw.ustring library.
local gsub = mw.ustring.gsub
local unpack = unpack or table.unpack -- Lua 5.2 compatibility
-- Module table; export.tr is attached to it and it is returned at the end of the file.
local export = {}

-- Tanwin marks. The pre-processing rules rewrite each one as a short vowel
-- followed by the letter nun.
local fatHataan = U(0x64B) -- اً, tanvin-e nasb (تنوین نصب)
local Dammataan = U(0x64C) -- un
local kasrataan = U(0x64D) -- in
-- Short vowel marks; the mapping table turns them into a, i and u respectively.
local zabar = U(0x64E)
local zer = U(0x650)
local pesh = U(0x64F)
local tashdid = U(0x651) -- also called shadda
-- Marks the absence of a vowel; mapped to the empty string.
local jazm = "ْ"
local he = "ه"
-- Zero-width non-joiner; rewritten to "-" by the pre-processing rules.
local zwnj = U(0x200C)
-- U+0654; the mapping table renders it as "-yi".
local highhmz = U(0x654)
local lrm = U(0x200e) -- left-to-right mark
local rlm = U(0x200f) -- right-to-left mark
-- Extra consonant letters appended to both consonant classes below (labelled
-- "balti" in the mapping table). The final "ǩ" is a placeholder produced by the
-- pre-processing rule that rewrites "ک" followed by highhmz.
local balticons = "ڃڇڑڗݜݨݩǩ"

-- The strings below are spliced into "[...]" bracket classes when patterns are built.
local consonants = "بپتټٹثجچحخدډڈذرزژسشصضطظعغفقکگلمنؤهئء" .. balticons
local consonants2 = "ءبپتټٹثجچحخدډڈذرزژسشصضطظعغفقکگلمنوؤهیئywة" .. balticons -- including semivowels
-- Long vowels as they appear in the Latin output.
local vowels = "āēīōū"
local semivowel = "یو"
local hes = "هح"
-- NOTE(review): presumably the short vowel marks plus tashdid, jazm and dagger alif — confirm the exact contents.
local diacritics = "َُِّْٰ"
-- NOTE(review): presumably zabar, zer and pesh (hence the name) — confirm the exact contents.
local ZZP = "َُِ"
local alif_wasla = "ٱ"
-- Whitespace plus the two ASCII quote characters, as bracket-class content.
local space_like = "%s'" .. '"'
-- Complete bracket class: anything in space_like, or a ZWNJ.
local space_like_class = "[" .. space_like .. zwnj .. "]"

--- The characters ټ ٹ ډ ڈ ے are included only for Mughal Persian and Hazaragi.

-- Character-by-character lookup used by export.tr as the replacement table of
-- its final gsub(text, ".", mapping) pass, after all contextual rewrites have
-- run. A character with no entry here is left as it is.
local mapping = {
	["آ"] = "ā",
	["ب"] = "b",
	["پ"] = "p",
	["ت"] = "t",
	["ث"] = "s",
	["ج"] = "j",
	["چ"] = "č",
	["ح"] = "h",
	["خ"] = "x",
	["د"] = "d",
	["ذ"] = "z",
	["ر"] = "r",
	["ز"] = "z",
	["ژ"] = "ž",
	["س"] = "s",
	["ش"] = "š",
	["ص"] = "s",
	["ض"] = "z",
	["ط"] = "t",
	["ظ"] = "z",
	["غ"] = "ğ",
	["ف"] = "f",
	["ق"] = "q",
	["ک"] = "k",
	["گ"] = "g",
	["ل"] = "l",
	["م"] = "m",
	["ن"] = "n",
	-- values used for any و / ی still present when the lookup runs
	["و"] = "ō",
	["ی"] = "ē",
	["۔"] = ".",

	["ه"] = "h",

	-- all of these are rendered as an apostrophe
	["ع"] = "'",
	["ء"] = "'",
	["ئ"] = "'",
	["ؤ"] = "'",
	["أ"] = "'",

	-- diacritics
	[zabar] = "a",
	[zer] = "i",
	[pesh] = "u",
	[jazm] = "", -- also sukun - no vowel
	[zwnj] = "-", -- ZWNJ (zero-width non-joiner)
	[highhmz] = "-yi",

	-- ligatures
	["ﻻ"] = "lā",
	["ﷲ"] = "allāh",

	-- kashida
	["ـ"] = "-", -- kashida, no sound

	-- alif_wasla
	[alif_wasla] = "", -- nothing

	-- numerals
	["۱"] = "1",
	["۲"] = "2",
	["۳"] = "3",
	["۴"] = "4",
	["۵"] = "5",
	["۶"] = "6",
	["۷"] = "7",
	["۸"] = "8",
	["۹"] = "9",
	["۰"] = "0",

	-- punctuation (leave on separate lines)
	["؟"] = "?", -- question mark
	["،"] = ",", -- comma
	["؛"] = ";", -- semicolon
	["«"] = "“", -- quotation mark
	["»"] = "”", -- quotation mark
	["٪"] = "%", -- percent
	["؉"] = "‰", -- per mille
	["٫"] = ".", -- decimals
	["٬"] = ",", -- thousands separator

	-- regional characters (FOR VERY SPECIFIC USECASES)
	["ټ"] = "ṭ",
	["ٹ"] = "ṭ",
	["ډ"] = "ḍ",
	["ڈ"] = "ḍ",
	-- balti
	-- can't do anything about ژ because it conflicts with persian
	["ڃ"] = "ž",
	["ڇ"] = "č̣",
	["ڑ"] = "ṛ",
	["ڗ"] = "dz",
	["ݜ"] = "ṣ",
	["ݨ"] = "ng",
	["ݩ"] = "ny",
	["ھ"] = "h",
	["ے"] = "e",
}

-- Punctuation characters, written as bracket-class content: characters that are
-- magic in Lua patterns are %-escaped. export.tr wraps each of them in "#" word
-- boundaries; has_diacritics deletes them.
local punctuation = ":%(%)%[%]*&٫؛؟،ـ«\".'!»٪؉۔`,/–—%{%}"
-- Digits that the mapping table converts to 0-9.
local numbers = "۱۲۳۴۵۶۷۸۹۰"

-- Individual letters referenced by name when patterns are built.
local ain = "ع"
local alif = "ا"
local malif = "آ"
local hamza = "ء"
local ye = "ی"
local ye2 = "ئ"
local vao = "و"
-- Rewritten to zabar .. alif by the pre-processing rules.
local dagger_alif = U(0x670)
-- Rewritten to te or he by the pre-processing rules.
local marbuta = U(0x629)
local te = "ت"
-- Normalised to ye by the pre-processing rules.
local ye3 = "ے"
local laam = "ل"
-- Complete bracket class used by export.tr to detect a preceding vowel.
local vowel = "[" .. vowels .. ZZP .. jazm .. semivowel .. malif .. "]"
-- Letters for which the pre-processing rules replace a preceding "l-" (when the
-- letter carries tashdid) with a copy of the letter itself.
local sun_letters = "تثدذرزسشصضطظلن"

-- Ordered list of { pattern, replacement } pairs. export.tr applies them one
-- after another with gsub before it checks the text for diacritics.
-- The order is significant: several rules create the temporary Latin "l-"
-- marker and later rules consume it, so entries must not be reordered.
local before_diacritic_checking_subs = {
	------------ transformations prior to checking for diacritics --------------
	{ U(0x06E5), "و" },
	{ U(0x06E6), "ی" },
	{ "ہ", he }, -- get rid of balti he (allows balti to transliterate)
	{ "ک" .. highhmz, "ǩ" },
	-- put tashdid in front of an accompanying vowel mark
	{ "([" .. fatHataan .. ZZP .. dagger_alif .. "])" .. tashdid, tashdid .. "%1" },
	{ alif .. fatHataan, zabar .. "ن" },
	{ fatHataan .. alif, zabar .. "ن" },
	{ jazm .. ye .. dagger_alif, jazm .. ye .. zabar .. alif },
	{ zabar .. "[" .. ye .. vao .. "]" .. dagger_alif, zabar .. alif },
	{ ye .. dagger_alif, zabar .. alif }, -- the first letter is U+06CC
	{ ye3, ye },
	{ "[أإ]", ye2 },
	-- kashida
	{ "^" .. "ـ" .. zabar .. alif , "ـ" .. malif },
	{ "^" .. "ـ" .. "([" .. ZZP .. "])" , "ـ" .. alif .. "%1" },
	{ zabar .. dagger_alif, zabar .. alif },
	{ dagger_alif, zabar .. alif },
	{ fatHataan, zabar .. "ن" }, -- fatḥatan
	{ Dammataan, pesh .. "ن" }, -- ḍammatan
	{ kasrataan, zer .. "ن" }, -- kasratan

	-- allah ligatures and arabic al
	-- (the Latin "l-" written here is a temporary marker resolved a few rules below)
	{ alif_wasla .. laam , "l-" },
	{ alif_wasla, "" },
	{ "([" .. consonants2 .. "]" .. tashdid .. "?" .. "[" .. pesh .. zer .. "])" .. alif .. laam .. jazm .. "?" .. "([" .. consonants2 .. "])", "%1-l-%2" },
	{ "([" .. consonants2 .. "]" .. tashdid .. "?" .. "[" .. pesh .. zer .. "]" .. "[" .. vao .. ye .. "])" .. alif .. laam .. jazm .. "?" .. "([" .. consonants2 .. "])", "%1-l-%2" },
	{ "([" .. consonants2 .. "]" .. tashdid .. "?" .. "[" .. ZZP .. "]" .. space_like_class .. ")" .. alif .. laam .. jazm .. "?" .. "([" .. consonants2 .. "])", "%1l-%2" },
	{ "([" .. consonants2 .. "]" .. tashdid .. "?" .. "[" .. pesh .. zer .. "]" .. "[" .. vao .. ye .. "]" .. space_like_class .. ")" .. alif .. laam .. jazm .. "?" .. "([" .. consonants2 .. "])", "%1l-%2" },
	{ marbuta .. "([" .. ZZP .. "])" .. alif .. laam , te .. "%1-" .. laam .. "%-" },
	-- resolve the "l-" marker: replaced by the following letter when that letter
	-- is in sun_letters and carries tashdid, otherwise turned back into laam
	{ "l%-" .. "([" .. sun_letters .. "])" .. tashdid, "%1" .. jazm .. "-%1" },
	{ "l%-" .. laam .. tashdid, laam .. laam },
	{ "l%-" .. laam, laam .. laam },
	{ "l%-", laam .. "-" },
	-- remaining marbuta: te when a vowel mark or jazm follows, otherwise he
	{ marbuta .. "([" .. ZZP .. "])" .. alif, te .. "%1-" },
	{ marbuta .. "([" .. ZZP .. jazm .. "])", te .. "%1" },
	{ marbuta, he },
	{
		"(["
			.. consonants2
			.. "]["
			.. ZZP
			.. "])("
			.. space_like_class
			.. ")"
			.. alif
			.. laam
			.. "(["
			.. jazm
			.. laam
			.. "])",
		"%1%2" .. laam .. "%3",
	},
	{ laam .. laam .. tashdid, laam .. tashdid },
	-- use jazm/sukoon to prevent this conversion
	{ "(خ)" .. vao .. zabar .. alif, "%1" .. zabar .. alif },
	{ "(خ)" .. vao .. zabar, "%1" .. pesh },
	{ "(خ)" .. vao .. ye .. "([^" .. ZZP .. jazm .. "])", "%1" .. ye .. "%2" },
	-- izāfa
	{ zwnj, "-" },
	{ jazm .. alif, jazm .. "-" .. alif }, -- vowel killing, invisible ZWNJ
	{ zabar .. jazm, "-" }, -- vowel killing, invisible ZWNJ
}

-- Ordered list of { pattern, replacement } pairs used only by has_diacritics.
-- Almost every rule deletes what it matches; has_diacritics reports success
-- only when nothing at all is left after the last rule, so the order of the
-- entries matters and must be preserved.
local has_diacritics_subs = {
	-- this ensures allah ligatures and al- work
	{ "l%-", "" },
	{ "[" .. sun_letters .. "]" .. jazm .. "%-" , "" },
	{ "[" .. consonants2 .. "]" .. "([" .. ZZP .. "])" .. space_like_class .. alif .. laam , "" },
	-- remove punctuation and tashdid
	{ "[" .. punctuation .. tashdid .. highhmz .. numbers .. fatHataan .. "]", "" },
	-- a consonant at the end of the text, before a space-like character or
	-- before a hyphen is removed even though it carries no vowel mark
	{ "[" .. consonants .. "]$", "" },
	{ "[" .. consonants .. "](" .. space_like_class .. ")", "%1" },
	{ "[" .. consonants .. "]%-", "-" },
	-- these are required for arabic al- to work
	{ "[" .. consonants2 .. "]" .. "([" .. zer .. pesh .. "])" .. alif .. laam, laam },
	{ "[" .. consonants2 .. "]([" .. zer .. pesh .. "])%-" .. alif .. laam, laam },
	-- remove CV pairs
	-- consonants paired to alif
	{ "[" .. consonants2 .. "]" .. jazm, "" },
	{ "[" .. consonants2 .. "]" .. jazm .. malif, "" },
	{ "[" .. consonants2 .. "]" .. zabar .. alif, "" },
	-- consonants paired to a semivowel
	{
		"[" .. consonants .. alif .. "][" .. semivowel .. ZZP .. "]([" .. semivowel .. "])([" .. semivowel .. "])",
		"%1%2",
	},
	{ "[" .. consonants2 .. alif .. "][" .. ZZP .. "][" .. semivowel .. "]", "" },
	{ "[" .. consonants2 .. alif .. "][" .. ZZP .. jazm .. semivowel .. "]", "" },
	{ "[" .. alif .. consonants2 .. "][" .. ZZP .. "][" .. semivowel .. "]", "" },
	{ malif, "" }, -- counts as a CV pair
	{ jazm .. alif .. "[" .. ZZP .. "]", "" },
	{ "[" .. consonants2 .. alif .. "][" .. ZZP .. "]", "" },
	{ "[" .. consonants2 .. alif .. semivowel .. "][" .. semivowel .. "]", "" },
	-- remove numbers, hamzatu l-waṣl, alif madda and ZWNJ
	{ "[" .. numbers .. "ٱ" .. "آ" .. "]", "" },
	{ "%s", "" },
	{ "%-", "" },
	{ "[" .. semivowel .. "]", "" },
	{ "(" .. vowel .. ")", "" },
}

--- Reports whether `text` carries enough diacritics to be transliterated.
-- Left-to-right and right-to-left marks are stripped first (their presence is
-- recorded through the tracking module), then every rule of
-- `has_diacritics_subs` is applied in order.
-- @param text string to examine
-- @return true when the rules consume the entire string, false otherwise
local function has_diacritics(text)
	local stripped, marks_removed = gsub(text, "[" .. lrm .. rlm .. "]", "")
	if marks_removed > 0 then
		require("Module:debug").track("fa-translit/lrm or rlm")
	end
	local remainder = stripped
	for i = 1, #has_diacritics_subs do
		-- each rule is a { pattern, replacement } pair
		remainder = gsub(remainder, unpack(has_diacritics_subs[i]))
	end
	return remainder == ""
end

--- Transliterates Classical Persian text into Latin script.
-- Can be called with a plain string, or with a frame-like table whose
-- `args[1]`, `args[2]`, `args[3]` and `args[5]` supply text, lang, sc and
-- force_translit (empty strings are treated as absent).
-- @param text string to transliterate, or a table with an `args` field
-- @param lang language code; not consulted by this function
-- @param sc script code; not consulted by this function
-- @return the transliteration as an NFC-normalised string, or nil when the
--   text fails the diacritics check and force_translit was not supplied
function export.tr(text, lang, sc)
	-- Must be local: as an undeclared global the flag would survive into later
	-- calls and silently disable the diacritics check for unrelated text.
	local force_translit
	if type(text) == "table" then
		local function f(x)
			return (x ~= "") and x or nil
		end
		local args = text.args
		-- args[4] (omit_i3raab) is not used anywhere in this function, so it is not read.
		text, lang, sc, force_translit = f(args[1]), f(args[2]), f(args[3]), f(args[5])
	end
	for _, sub in ipairs(before_diacritic_checking_subs) do
		text = gsub(text, sub[1], sub[2])
	end

	if not force_translit and not has_diacritics(text) then
		require("Module:debug").track("fa-translit/lacking diacritics")
		return nil
	end

	--define the "end" of a word
	-- literal "#" characters are protected first, because "#" is used as the
	-- word-boundary marker from here on
	text = gsub(text, "#", "HASHTAG")
	text = gsub(text, "^", "#")
	text = gsub(text, "$", "#")
	text = gsub(text, " | ", "# | #")
	text = gsub(text, "%s", "# #")
	text = gsub(text, "\n", "#" .. "\n" .. "#")
	text = gsub(text, "([" .. punctuation .. "])", "#" .. "%1" .. "#")
	text = "##" .. gsub(text, " ", "# #") .. "##"
	text = gsub(text, "%-", "#-#")
	-- hashtags now mark the beginning and end of a word
	--character reformatting and exceptions
	text = gsub(text, highhmz, "#" .. highhmz .. "#")
	--this ensures "and" is transliterated as a short vowel
	text = gsub(text, "#" .. vao .. "#", "#u#")
	text = gsub(text, "#" .. vao .. jazm .. malif, "#w-" .. malif )
	-- prevent izafa from converting until later

	-- Tashdeed
	-- a letter carrying tashdid is written twice
	text = gsub(text, "([" .. consonants .. "])" .. tashdid, "%1%1")
	text = gsub(text, "([" .. consonants .. "])" .. tashdid .. "([" .. ZZP .. "])", "%1%1%2")
	text = gsub(text, "([" .. consonants .. "])" .. "([" .. ZZP .. "])" .. tashdid, "%1%1%2")
	text = gsub(text, ye .. "([" .. ZZP .. "])" .. tashdid, "yy%1")
	text = gsub(text, vao .. "([" .. ZZP .. "])" .. tashdid, "ww%1")
	text = gsub(text, ye .. tashdid .. "([" .. ZZP .. "])", "yy%1")
	text = gsub(text, vao .. tashdid .. "([" .. ZZP .. "])", "ww%1")

	-- distinguish initial alif from vowel alif
	text = gsub(text, "([" .. consonants2 .. "])" .. zabar .. alif, "%1ā")
	text = gsub(text, "([" .. consonants2 .. "])" .. alif, "%1ā")
	text = gsub(text, jazm .. malif, "'ā") -- invisible ZWNJ
	text = gsub(text, "([" .. consonants2 .. "])" .. malif, "%1'ā")
	text = gsub(text, alif .. ye, "ē")
	text = gsub(text, alif .. vao, "ō")
	text = gsub(text, alif .. zer .. ye, "ī")
	text = gsub(text, alif .. pesh .. vao, "ū")
	text = gsub(text, tashdid .. alif, tashdid .. "ā")

	-- convert semi vowels
	-- ye / vao become the consonants y / w next to a vowel or diacritic
	text = gsub(text, ye .. "ā", "yā")
	text = gsub(text, vao .. "ā", "wā")
	text = gsub(text, vao .. "([" .. diacritics .. ZZP .. "])", "w%1")
	text = gsub(text, ye .. "([" .. diacritics .. ZZP .. "])", "y%1")
	text = gsub(text, ye .. "([" .. semivowel .. "])([" .. semivowel .. "])", "ē%1%2")
	text = gsub(text, vao .. "([" .. semivowel .. "])([" .. semivowel .. "])", "ō%1%2")
	text = gsub(text, "([" .. diacritics .. ZZP .. "])" .. ye .. "([" .. semivowel .. "])", "%1y%2")
	text = gsub(text, "([" .. diacritics .. ZZP .. "])" .. vao .. "([" .. semivowel .. "])", "%1w%2")
	text = gsub(text, "([" .. consonants .. "])" .. ye .. "([" .. semivowel .. "])", "%1y%2")
	text = gsub(text, "([" .. consonants .. "])" .. vao .. "([" .. semivowel .. "])", "%1w%2")

	-- conversions for vaav/waaw/vao
	text = gsub(text, pesh .. vao, "ū")
	text = gsub(text, vao .. "([" .. diacritics .. ZZP .. "])", "w%1")
	text = gsub(text, "(" .. vowel .. ")" .. vao, "%1w")
	-- conversions for ye
	text = gsub(text, zer .. ye, "ī")
	text = gsub(text, ye .. "([" .. diacritics .. ZZP .. "])", "y%1")
	text = gsub(text, "(" .. vowel .. ")" .. ye, "%1y")

	--Alif with short vowel
	-- the alif is dropped and only the vowel mark is kept
	text = gsub(text, alif .. "([" .. ZZP .. "])", "%1")

	-- final changes
	-- izafa
	text = gsub(text, "ē" .. zer .. "#", "ē-yi#")
	text = gsub(text, zer .. "y" .. zer .. "#", "ī-yi#")
	text = gsub(text, "([^" .. consonants .. "])" .. "y" .. zer .. "#", "%1-yi#")
	text = gsub(text, "([" .. consonants2 .. "])" .. zer .. "#", "%1-i#")
	-- NOTE(review): the capture below matches the literal three-character
	-- sequence "'" rather than either quote character on its own — confirm
	-- that this is intended.
	text = gsub(text, '("\'")' .. "##" .. zer .. "#", "%1-i#")
	-- do not count zer as izafa before silent alif
	text = gsub(text, "%-i" .. "##" .. "(" .. space_like_class .. ")" .. "##" .. "([" .. sun_letters .. "]" .. jazm .. "#%-#" .. ")", "i%1%2")
	text = gsub(text, "%-i" .. "#%-#" .. "([" .. sun_letters .. "]" .. "#%-#" .. ")", "i-%1")
	-- he deletion
	-- a word-final he directly after a short vowel mark is dropped
	text = gsub(text, "([" .. ZZP .. "])" .. he .. "#" .. zwnj, "%1-")
	text = gsub(text, "([" .. ZZP .. "])" .. he .. "#", "%1#")
	-- a word-initial ain is dropped
	text = gsub(text, "#" .. ain , "#")

	-- get rid of hashtags (not needed)
	text = gsub(text, "#", "")
	-- restore the "#" characters that were present in the input
	text = gsub(text, "HASHTAG", "#")
	text = string.gsub(text, lrm, "")
	text = string.gsub(text, rlm, "")
	-- convert all characters
	-- characters without an entry in mapping are left unchanged
	text = mw.ustring.gsub(text, ".", mapping)

	-- alif
	-- Final corrections
	-- collapse vowel sequences produced by the lookup into one long vowel
	text = mw.ustring.gsub(text, "āa", "ā")
	text = mw.ustring.gsub(text, "aaa", "ā")
	text = mw.ustring.gsub(text, "āā", "ā")
	text = mw.ustring.gsub(text, "aa", "ā")
	-- a long ī / ū directly before another long vowel becomes short vowel + glide
	text = mw.ustring.gsub(text, "ī" .. "([" .. vowels .. "])", "iy%1")
	text = mw.ustring.gsub(text, "ū" .. "([" .. vowels .. "])", "uw%1")

	text = mw.ustring.toNFC(text)

	return text
end

return export