Module:Hani-sortkey
Itsura
- This module lacks a documentation subpage. Please create it.
- Useful links: subpage list • links • transclusions • testcases • sandbox
-- Table of functions returned at the end of this module.
local export = {}
local m_str_utils = require("Module:string utilities")
-- Local aliases for the library functions used by the code below.
local byte = string.byte
local codepoint = m_str_utils.codepoint
local concat = table.concat
local convert_iteration_marks = require("Module:Hani").convert_iteration_marks
local explode = m_str_utils.explode_utf8
local format = string.format
local gmatch = string.gmatch
local gsub = string.gsub
local insert = table.insert
local sub = string.sub
local u = m_str_utils.char
local ugsub = mw.ustring.gsub
local umatch = mw.ustring.match
local upper = m_str_utils.upper
-- Serialized sortkey data; getData slices two-byte entries out of it with string.sub.
local m_data = require("Module:Hani-sortkey/data/serialized")
-- Core lookup data; this module reads its ids, radicals, ranges, preconvert and unsupported fields.
local m_data_core = mw.loadData("Module:Hani-sortkey/data/core")
-- Memoizes getData results per character for makeSortKey.
local cache = {}
--[[
Finds where the ideographic description sequence (IDS) that starts at
index i ends.

text   -- table of characters, indexed by position
IDchar -- the ideographic description character (IDC) found at text[i]
i      -- index of IDchar in text

Returns the index of the last character of the IDS, or nil if any argument
is missing, IDchar is not listed in m_data_core.ids, or the text ends
before every expected component has been found. Recurses whenever a
component is itself introduced by another IDC.
]]
local function findEndOfIDS(text, IDchar, i)
	if not (text and IDchar and i) then
		return nil
	end

	-- Number of components expected after the current IDC.
	local components = m_data_core.ids[IDchar]
	if not components then
		-- Not a known IDC, so no sequence can start here.
		return nil
	end

	local j = i
	for _ = 1, components do
		j = j + 1
		local char = text[j]
		if not char then
			-- The text ran out before all components were found.
			return nil
		elseif m_data_core.ids[char] then
			-- This component is a nested IDS; skip to its last character.
			j = findEndOfIDS(text, char, j)
			if not j then
				--[[
				The nested IDS is incomplete, so the enclosing one is too.
				Returning here avoids doing arithmetic on a nil index in
				the next iteration.
				]]
				return nil
			end
		end
	end

	-- Every expected component was found; j is the end of the sequence.
	return j
end
--[[
Decodes one two-byte entry of the serialized data. The byte value of `a`
indexes m_data_core.radicals, and the byte value of `b`, minus 10, is
appended as a zero-padded two-digit number (presumably the residual stroke
count; confirm against the serializer).
]]
local function unserialize(a, b)
	local radical = m_data_core.radicals[byte(a)]
	local number = byte(b) - 10
	return radical .. format("%02d", number)
end
--[[
Returns the sortkey data for a single character, given either as a string
or as a codepoint number. Any other argument type raises an error.

The data is stored in [[Module:Hani-sortkey/data]]. It is not accessed
directly (due to the large amount of memory this would consume), but is
instead read in serialized form from
[[Module:Hani-sortkey/data/serialized]]. If the data is changed, the new
serialized data can be generated with
[[Module:Hani-sortkey/data/serializer]].

m_data_core.ranges holds pairs of (first, last) codepoints, with the number
of entries in its `n` field. The serialized string holds two bytes for each
codepoint covered by a range, in range order. A codepoint that falls in no
range is returned through u(char).
]]
function export.getData(char)
	local kind = type(char)
	if kind == "string" then
		char = codepoint(char)
	elseif kind ~= "number" then
		error("getData must operate on a single character or codepoint.")
	end

	local ranges = m_data_core.ranges
	-- Number of codepoints covered by the ranges that end below char.
	local preceding = 0
	for upper_index = 2, ranges.n, 2 do
		local first, last = ranges[upper_index - 1], ranges[upper_index]
		if char > last then
			preceding = preceding + last - first + 1
		elseif char >= first then
			-- Byte position of the second byte of this codepoint's entry.
			local finish = 2 * (preceding + char - first + 1)
			local entry = sub(m_data, finish - 1, finish)
			-- Parentheses discard the substitution count returned by gsub.
			return (gsub(entry, "(.)(.)", unserialize))
		end
	end

	return u(char)
end
--[[
Builds the sortkey for a piece of text by concatenating one piece of
sortkey data per character. The lang and sc parameters are accepted but
not used in this function.
]]
function export.makeSortKey(text, lang, sc)
	-- Expand iteration marks into full characters and drop all whitespace.
	local cleaned = ugsub(convert_iteration_marks(text), "%s+", "")
	--[[
	Punctuation is removed only if the term contains something other than
	punctuation, so that entries for punctuation characters can still be
	sorted properly.
	]]
	if not umatch(cleaned, "^%p+$") then
		cleaned = ugsub(cleaned, "%p+", "")
	end

	local chars = explode(cleaned)
	local total = #chars
	local sort = {}
	local pos = 0

	while pos < total do
		pos = pos + 1
		local char = chars[pos]

		--[[
		Characters with a preconversion are replaced in place by the
		characters of their replacement string; the first one is then
		processed as the current character.
		]]
		local replacement = m_data_core.preconvert[char]
		if replacement then
			local offset = 0
			for piece in gmatch(replacement, ".[\128-\191]*") do
				if offset == 0 then
					chars[pos] = piece
				else
					insert(chars, pos + offset, piece)
				end
				offset = offset + 1
			end
			char = chars[pos]
			total = #chars
		end

		if m_data_core.ids[char] then
			--[=[
			An ideographic description character (IDC) may begin a valid
			ideographic description sequence (IDS). If it does and
			m_data_core.unsupported lists a sortkey for that IDS, use the
			sortkey and continue after the end of the IDS. Otherwise track
			the problem, add the IDC itself to the sortkey, and continue
			with the character after the IDC.
			]=]
			local stop = findEndOfIDS(chars, char, pos)
			local IDS, data
			if stop then
				IDS = concat(chars, nil, pos, stop)
				data = m_data_core.unsupported[IDS]
			end

			if data then
				insert(sort, data)
				pos = stop
			else
				local tracking_page, message
				if IDS then
					tracking_page = "Hani-sortkey/IDS-without-sortkey"
					message = "ideographic description sequence without sortkey: '"
						.. IDS .. "'"
				else
					tracking_page = "Hani-sortkey/invalid-IDS"
					message = "invalid ideographic description sequence at the beginning of '"
						.. char .. "'"
				end
				require("Module:debug").track(tracking_page)
				mw.log(message)
				insert(sort, char)
			end
		else
			-- Ordinary character: look it up once and reuse the result.
			local key = cache[char]
			if not key then
				key = export.getData(char)
				cache[char] = key
			end
			insert(sort, key)
		end
	end

	return concat(sort)
end
return export
Mga kategorya:
- Sortkey-generating modules by script
- Katitikang Han na modyul
- Sortkey-generating modules
- Chinese Pidgin English na modyul
- Yonaguni na modyul
- Daur na modyul
- Katimugang Pinghua na modyul
- Shaojiang Min na modyul
- Jie na modyul
- Datian Min na modyul
- Central Bai na modyul
- Maramihang wika na modyul
- Tuoba na modyul
- Ai-Cham na modyul
- Shaozhou Tuhua na modyul
- Zhuang na modyul
- Gaya na modyul
- Sitsuwanes na modyul
- Northern Pinghua na modyul
- Goguryeo na modyul
- Kikai na modyul
- Viyetnamita na modyul
- Yoron na modyul
- Toisanes na modyul
- Zauzou na modyul
- Kyakala na modyul
- Sanxiang Min na modyul
- Nùng na modyul
- Hokkien na modyul
- Zakhring na modyul
- Gan na modyul
- Baekje na modyul
- Lumang Tsino na modyul
- Wuhuan na modyul
- Lama Bai na modyul
- Hakka na modyul
- Zhenan Min na modyul
- Puxian Min na modyul
- Okinoerabu na modyul
- Yemaek na modyul
- Jin na modyul
- Macau Pidgin Portuguese na modyul
- Dungan na modyul
- Tsino na modyul
- Classical Tibetan na modyul
- Northern Min na modyul
- Hapones na modyul
- Kunigami na modyul
- Leizhou Min na modyul
- Yaeyama na modyul
- Lumang Hapones na modyul
- Hachijō na modyul
- Miyako na modyul
- Caolan na modyul
- E na modyul
- Wu na modyul
- Teochew na modyul
- Hainanese na modyul
- Min Nan na modyul
- Tuyuhun na modyul
- Xianbei na modyul
- Northern Amami Ōshima na modyul
- Panyi Bai na modyul
- Bala na modyul
- Tokunoshima na modyul
- Mandarin na modyul
- Huizhou na modyul
- Central Min na modyul
- Longyan Min na modyul
- Middle Mongol na modyul
- Kantones na modyul
- Silanganang Min na modyul
- Waxiang na modyul
- Southern Amami Ōshima na modyul
- Sui na modyul
- Pampanitikang Tsino na modyul
- Tày na modyul
- Middle Vietnamese na modyul
- Alchuka na modyul
- Gitnang Tsino na modyul
- Okinawan na modyul
- Biao-Jiao Mien na modyul
- Bailang na modyul
- Xiang na modyul
- Old Uyghur na modyul
- Khitan na modyul
- Hailufeng Min na modyul
- Biyo na modyul
- Buyeo na modyul
- Bouyei na modyul
- Southern Bai na modyul
- Xiongnu na modyul
- Rouran na modyul
- Templates and modules needing documentation