Module:headword utilities
Itsura
- This module lacks a documentation subpage. Please create it.
- Useful links: subpage list • links • transclusions • testcases • sandbox
local export = {}
local fun_is_callable_module = "Module:fun/isCallable"
local languages_module = "Module:languages"
local links_module = "Module:links"
local parse_utilities_module = "Module:parse utilities"
local string_pattern_escape_module = "Module:string/patternEscape"
local string_replacement_escape_module = "Module:string/replacementEscape"
local string_utilities_module = "Module:string utilities"
local table_module = "Module:table"
local dump = mw.dumpObject
local unpack = unpack or table.unpack -- Lua 5.2 compatibility
local insert = table.insert
local concat = table.concat
local remove = table.remove
local sort = table.sort
-- Lazy loader: the first call requires `table_module`, rebinds this upvalue to its `deepEquals`
-- function, and forwards the arguments; later calls go straight to the loaded function.
local function deepEquals(...)
	local impl = require(table_module).deepEquals
	deepEquals = impl
	return impl(...)
end
-- Lazy loader: the first call requires `parse_utilities_module`, rebinds this upvalue to its
-- `escape_wikicode` function, and forwards the arguments.
local function escape_wikicode(...)
	local impl = require(parse_utilities_module).escape_wikicode
	escape_wikicode = impl
	return impl(...)
end
-- Lazy loader: the first call requires `table_module`, rebinds this upvalue to its `extend`
-- function, and forwards the arguments.
local function extend(...)
	local impl = require(table_module).extend
	extend = impl
	return impl(...)
end
-- Lazy loader: the first call requires `languages_module`, rebinds this upvalue to its `getByCode`
-- function, and forwards the arguments.
local function get_lang(...)
	local impl = require(languages_module).getByCode
	get_lang = impl
	return impl(...)
end
-- Lazy loader: the first call requires `table_module`, rebinds this upvalue to its `insertIfNot`
-- function, and forwards the arguments.
local function insert_if_not(...)
	local impl = require(table_module).insertIfNot
	insert_if_not = impl
	return impl(...)
end
-- Lazy loader: the first call requires `fun_is_callable_module` (whose return value is itself the
-- function), rebinds this upvalue to it, and forwards the arguments.
local function is_callable(...)
	local impl = require(fun_is_callable_module)
	is_callable = impl
	return impl(...)
end
-- Lazy loader: the first call requires `parse_utilities_module`, rebinds this upvalue to its
-- `parse_inline_modifiers` function, and forwards the arguments.
local function parse_inline_modifiers(...)
	local impl = require(parse_utilities_module).parse_inline_modifiers
	parse_inline_modifiers = impl
	return impl(...)
end
-- Lazy loader: the first call requires `string_pattern_escape_module` (whose return value is itself
-- the function), rebinds this upvalue to it, and forwards the arguments.
local function pattern_escape(...)
	local impl = require(string_pattern_escape_module)
	pattern_escape = impl
	return impl(...)
end
-- Lazy loader: the first call requires `string_replacement_escape_module` (whose return value is
-- itself the function), rebinds this upvalue to it, and forwards the arguments.
local function replacement_escape(...)
	local impl = require(string_replacement_escape_module)
	replacement_escape = impl
	return impl(...)
end
-- Lazy loader: the first call requires `table_module`, rebinds this upvalue to its `shallowCopy`
-- function, and forwards the arguments.
local function shallow_copy(...)
	local impl = require(table_module).shallowCopy
	shallow_copy = impl
	return impl(...)
end
-- Lazy loader: the first call requires `string_utilities_module`, rebinds this upvalue to its
-- `split` function, and forwards the arguments.
local function split(...)
	local impl = require(string_utilities_module).split
	split = impl
	return impl(...)
end
-- Lazy loader: the first call requires `parse_utilities_module`, rebinds this upvalue to its
-- `term_contains_top_level_html` function, and forwards the arguments.
local function term_contains_top_level_html(...)
	local impl = require(parse_utilities_module).term_contains_top_level_html
	term_contains_top_level_html = impl
	return impl(...)
end
-- Lazy loader: the first call requires `string_utilities_module`, rebinds this upvalue to its
-- `gsub` function, and forwards the arguments.
local function ugsub(...)
	local impl = require(string_utilities_module).gsub
	ugsub = impl
	return impl(...)
end
-- Lazy loader: the first call requires `string_utilities_module`, rebinds this upvalue to its
-- `match` function, and forwards the arguments.
local function umatch(...)
	local impl = require(string_utilities_module).match
	umatch = impl
	return impl(...)
end
-- Inline modifiers that `parse_term_with_modifiers()` recognizes by default. Each key is a modifier
-- name and each value is the modifier spec handed to `parse_inline_modifiers` (via its `param_mods`
-- field). Callers can add to or remove from this set through `include_mods` / `exclude_mods`; the
-- table itself is never mutated (a shallow copy is taken first).
local param_mods = {
id = {}, -- disabled when `is_head = true`
q = {type = "qualifier"},
qq = {type = "qualifier"},
l = {type = "labels"},
ll = {type = "labels"},
-- [[Module:headword]] expects part references in `.refs`.
ref = {item_dest = "refs", type = "references", store = "insert-flattened"},
}
-- Extra inline modifiers that are only enabled when named (as a string) in `include_mods` and
-- `is_head` is not set. Keys are modifier names; values are the specs copied into the active
-- modifier table by `parse_term_with_modifiers()`.
local optional_param_mods = {
g = {item_dest = "genders", type = "genders"},
alt = {},
lang = {type = "language"},
sc = {type = "script"},
-- `t` is stored under `gloss`, i.e. it is an alias of the `gloss` modifier below.
t = {item_dest = "gloss"},
gloss = {},
pos = {},
lit = {},
tr = {},
ts = {},
face = {},
nolinkinfl = {type = "boolean"},
}
-- Extra inline modifiers that may be named (as a string) in `include_mods` when `is_head` is set;
-- used by `parse_term_with_modifiers()` in place of `optional_param_mods` in that case.
local optional_headword_param_mods = {
sc = {type = "script"},
tr = {},
ts = {},
}
--[==[
Parse a single inflection or headword form or list of such forms. In either case, inline modifiers may be attached.
`data` is an object with the following fields:
* `val`: The raw value to parse. Required.
* `paramname`: The name of the parameter from which the value was taken; used in error messages. Required.
* `is_head`: We are parsing a headword parameter (a value which goes into the `heads` field of `data`). This changes
the allowed modifiers, disabling the `id` modifier and only allowing a subset of optional modifiers.
* `frob`: An optional function of one value to apply to the form after inline modifiers have been removed (i.e. to
apply to the `.term` field of the returned object).
* `include_mods`: List of extra inline modifiers to include, besides the default ones (see below). Each list item is
either a string specifying a recognized extra inline modifier (see `optional_param_mods` in the code), or a two-item
list of modifier name and modifier spec, where the spec should follow the syntax for modifier specs in
`parse_inline_modifiers` in [[Module:parse utilities]].
* `exclude_mods`: List of default inline modifiers to not include.
* `splitchar`: If specified, the value in `val` can be a list of forms to parse, separated by the value of `splitchar`
(which is a Lua pattern, as in `parse_inline_modifiers` in [[Module:parse utilities]]). Most commonly, `splitchar` is
a single comma and the values are comma-separated (in this case, splitting will not happen if a space follows the
comma).
* `preserve_splitchar`, `delimiter_key`, `escape_fun`, `unescape_fun`, `pre_normalize_modifiers`: As in
`parse_inline_modifiers` in [[Module:parse utilities]].
Returns an object suitable for storing as one element of one of the lists in `headdata.inflections`, where `headdata`
is the structure passed to [[Module:headword]]. If `splitchar` is specified, however, the return value is a list of such
objects.
The following default inline modifiers are currently recognized:
* `q`: Left qualifier.
* `qq`: Right qualifier.
* `l`: Comma-separated list of left labels. No space should follow the comma.
* `ll`: Comma-separated list of right labels. No space should follow the comma.
* `ref`: Reference or references. See {{tl|IPA}} for the syntax.
* `id`: Sense ID, in case there are multiple senses. See {{tl|l}}.
The following are the recognized additional inline modifiers:
* `g`: Comma-separated list of genders.
* `alt`: Display text.
* `lang`: Language code of language of the form, if different from the language of the headword.
* `sc`: Script code of script of the form. Almost never needed.
* `t`: Gloss for the form.
* `gloss`: Gloss for the form (alias for `t`).
* `pos`: Part of speech of the form.
* `lit`: Literal meaning of the form.
* `tr`: Manual transliteration of the form.
* `ts`: Transcription of the form, for languages where the transliteration differs markedly from the pronunciation.
* `face`: Face to display the form in, e.g. {"hypothetical"} for a hypothetical form (unlinkable and displayed in italics).
* `nolinkinfl`: Make the form unlinkable.
]==]
function export.parse_term_with_modifiers(data)
	local raw, post_process = data.val, data.frob
	local splitchar, is_head = data.splitchar, data.is_head

	-- Build the term object for one form, running the caller's `frob` hook (if any) on the form first.
	local function generate_obj(term, parse_err)
		if post_process then
			term = post_process(term, parse_err)
		end
		return {term = term}
	end

	-- Inline modifiers look like מרים<tr:Miryem>. We only attempt to parse them when the value contains
	-- a `<` (or when splitting is requested), the value is not top-level HTML such as <span ...> or
	-- <sup>, and (for headwords) the value does not begin with `~` (link modifier syntax).
	local should_parse = (raw:find("<", nil, true) or splitchar) and not term_contains_top_level_html(raw) and
		(not is_head or not raw:find("^~"))

	if not should_parse then
		local bare = generate_obj(raw)
		if splitchar then
			-- Callers that asked for splitting always get a list back.
			return {bare}
		end
		return bare
	end

	local include_mods, exclude_mods = data.include_mods, data.exclude_mods
	local active_mods = param_mods
	if is_head or include_mods or exclude_mods then
		-- Never modify the shared default table; work on a private copy.
		active_mods = shallow_copy(param_mods)
	end
	if is_head then
		active_mods.id = nil
	end

	if include_mods then
		local optional_mods = is_head and optional_headword_param_mods or optional_param_mods
		for _, mod in ipairs(include_mods) do
			if type(mod) == "table" then
				-- Two-item list of {name, spec}.
				if #mod ~= 2 then
					error(("Internal error: Modifier spec %s in `include_mods` should be of length 2"):format(
						dump(mod)))
				end
				active_mods[mod[1]] = mod[2]
			else
				local spec = optional_mods[mod]
				if not spec then
					error(("Internal error: Unrecognized modifier spec %s in `include_mods`"):format(
						dump(mod)))
				end
				active_mods[mod] = spec
			end
		end
	end

	if exclude_mods then
		for _, mod in ipairs(exclude_mods) do
			if not active_mods[mod] then
				error(("Internal error: Modifier spec %s in `exclude_mods` not found among existing modifiers"
					):format(dump(mod)))
			end
			active_mods[mod] = nil
		end
	end

	return parse_inline_modifiers(raw, {
		paramname = data.paramname,
		param_mods = active_mods,
		generate_obj = generate_obj,
		splitchar = splitchar,
		preserve_splitchar = data.preserve_splitchar,
		delimiter_key = data.delimiter_key,
		escape_fun = data.escape_fun,
		unescape_fun = data.unescape_fun,
		pre_normalize_modifiers = data.pre_normalize_modifiers,
	})
end
--[==[
Parse a list of inflection forms that may have inline modifiers attached. `data` is an object with the following fields:
* `forms`: The list of raw values to parse. Required.
* `paramname`: The name of the first parameter from which the value was taken; used in error messages. If this is a
two-element list, the first element is the first parameter and the second element is the prefix of the remaining
parameters. Parameter names that are numbers are handled correctly, as are those with \1 in it marking where the
parameter index goes. Required.
* `qualifiers`: If specified, a possibly gappy list of left qualifiers to add to the parsed terms (for compatibility
purposes).
* `splitchar`: As in `parse_term_with_modifiers()`. The resulting per-term lists will be flattened.
* `frob`, `include_mods`, `exclude_mods`, `is_head`, `preserve_splitchar`, `delimiter_key`, `escape_fun`,
`unescape_fun`, `pre_normalize_modifiers`: As in `parse_term_with_modifiers()`.
Returns a list of objects, suitable for storing as one of the lists in `headdata.inflections` (once a label is added),
where `headdata` is the structure passed to [[Module:headword]].
]==]
function export.parse_term_list_with_modifiers(data)
	local forms, qualifiers = data.forms, data.qualifiers
	local first_name, rest_prefix = data.paramname, data.paramname
	if type(first_name) == "table" then
		first_name, rest_prefix = first_name[1], first_name[2]
	end

	-- Name of the parameter holding the form at position `index`, for error messages: the first
	-- parameter name for index 1; otherwise a numeric prefix is offset, a prefix containing \1 has the
	-- index substituted there, and any other prefix has the index appended.
	local function param_name_for(index)
		if index == 1 and first_name then
			return first_name
		elseif type(rest_prefix) == "number" then
			return rest_prefix + index - 1
		elseif rest_prefix:find("\1", nil, true) then
			return (rest_prefix:gsub("\1", tostring(index)))
		end
		return rest_prefix .. index
	end

	-- Work on a private copy so the caller's `data` keeps its own `paramname` and `val`.
	local opts = shallow_copy(data)
	local splitting = opts.splitchar
	local terms = {}
	for i, raw in ipairs(forms) do
		opts.paramname = param_name_for(i)
		opts.val = raw
		local parsed = export.parse_term_with_modifiers(opts)
		local qualifier = qualifiers and qualifiers[i]
		if splitting then
			-- `parsed` is a list; the qualifier applies to every term in it, and the per-form lists
			-- are flattened into the result.
			if qualifier then
				for _, term in ipairs(parsed) do
					term.q = {qualifier}
				end
			end
			extend(terms, parsed)
		else
			if qualifier then
				parsed.q = {qualifier}
			end
			terms[i] = parsed
		end
	end
	return terms
end
--[==[
Check if any of a list of parsed terms (as returned by `parse_term_list_with_modifiers()`) are red links (i.e.
nonexistent pages). If so, a category such as [[Category:Spanish nouns with red links in their headword lines]] is added
to `headdata.categories`. `data` is an object with the following fields:
* `headdata`: The headword structure passed to [[Module:headword]]. Required.
* `terms`: The list of parsed terms. Required.
* `lang`: The language object for the language of the terms. Required.
* `plpos`: The plural part of speech, for the category name. Required.
]==]
function export.check_term_list_missing(data)
	local headdata, terms, lang, plpos = data.headdata, data.terms, data.lang, data.plpos
	for _, term in ipairs(terms) do
		-- Entries may be term objects or plain strings; for objects, the page name is in `.term`.
		if type(term) == "table" then
			term = term.term
		end
		if term then
			local title = mw.title.new(term)
			-- `mw.title.new` yields a falsy value for an invalid title; a valid title with no content
			-- is treated as a red link.
			if title and not title:getContent() then
				insert(headdata.categories, lang:getFullName() .. " " .. plpos ..
					" with red links in their headword lines")
				-- One red link is enough: the category is the same for every term, so stop here
				-- rather than inserting duplicates and fetching the remaining pages for nothing.
				return
			end
		end
	end
end
--[==[
Construct a link to [[Appendix:Glossary]] for `entry`. If `text` is specified, it is the display text; otherwise,
`entry` is used.
]==]
function export.glossary_link(entry, text)
	-- Fall back to the entry name as the display text.
	if not text then
		text = entry
	end
	return "[[Apendise:Glosaryo#" .. entry .. "|" .. text .. "]]" --TLCHANGE Appendix:Glossary
end
--[==[
Replace every `<<...>>` span in `label` with a glossary link built by `glossary_link()`. If the contents of a span
contain a `|`, the part before the first `|` is the glossary entry and the rest is the display text; otherwise the
contents serve as both. Labels without `<<` are returned unchanged.
]==]
function export.replace_glossary_links_in_label(label)
	if not label:find("<<", nil, true) then
		return label
	end
	-- Handle one <<...>> span at a time and look for the `|` only inside that span. Matching
	-- "<<(.-)|(.-)>>" directly against the whole label would let the first capture run across a ">>",
	-- so a label like "<<a>> and <<b|c>>" would be turned into a single garbled link.
	return (label:gsub("<<(.-)>>", function(contents)
		local entry, text = contents:match("^(.-)|(.*)$")
		if entry then
			return export.glossary_link(entry, text)
		end
		return export.glossary_link(contents)
	end))
end
--[==[
Insert a label-only inflection (one with no terms) into an `inflections` field, initializing the field if needed.
`data` is an object with the following fields:
* `headdata`: The headword structure; used as the destination when `inflobj` is not given.
* `inflobj`: The object whose `inflections` field receives the entry. Defaults to `headdata`.
* `label`: The label text; `<<...>>` spans are converted to glossary links.
* `originating_term`: Optional term object the label derives from. If given, it is reused as the inserted entry (so
  its other fields such as qualifiers, labels and references are kept), with `.term` cleared and `.label` set. Note
  that this modifies the passed-in object. It must not have an `id`.
]==]
function export.insert_fixed_inflection(data)
	local label, source_term = data.label, data.originating_term
	local target = data.inflobj or data.headdata
	target.inflections = target.inflections or {}

	local entry
	if source_term then
		if source_term.id then
			error(("It doesn't make sense to pass in an ID '%s' for label '%s' in conjunction with a term value '%s'"
				):format(source_term.id, label, source_term.term))
		end
		-- Preserve qualifiers, labels, references
		source_term.term = nil
		source_term.label = export.replace_glossary_links_in_label(label)
		entry = source_term
	else
		entry = {
			label = export.replace_glossary_links_in_label(label)
		}
	end
	insert(target.inflections, entry)
end
--[==[
Insert previously-parsed terms into an `inflections` field. The `inflections` field will be initialized if needed.
`data` is an object with the following fields:
* `headdata`: The headword structure passed to [[Module:headword]]. Required.
* `inflobj`: The object whose `inflections` field the terms are inserted into. Defaults to `headdata`. Only needs
to be set for nested inflections, which are specified for an inflection object rather than the headword data
structure as a whole.
* `terms`: The list of parsed terms. If {nil} or omitted, nothing happens unless `request` is set.
* `label`: The label that the inflections are given; any parts of the label surrounded in <<...>> are linked to the
glossary. (If the contents of <<...>> contain a | in them, they are a two-part link.) Required.
* `no_label`: If the term is {"-"} and there are no other terms, insert a fixed label with this value. Defaults to
{"no "} plus the label.
* `usually_no_label`: If the term is {"-"} and there are other terms, insert a fixed label with this value. Defaults to
{"usually no "} plus the label.
* `accel`: If specified, a full accelerator object to add to the inflections.
* `request`: If specified and no terms are given, insert a label with a request for inflections to be given.
* `enable_auto_translit`: If specified and terms are given, display automatic transliteration of the terms.
* `check_missing`: If specified, check the parsed terms for red links, and if so, add a category such as
[[Category:Spanish nouns with red links in their headword lines]] to `headdata.categories`. If this is given, so must
`lang` and `plpos`.
* `lang`: The language object for the language of the terms. Required if `check_missing` is given.
* `plpos`: The plural part of speech, for the category name. Required if `check_missing` is given.
]==]
function export.insert_inflection(data)
	local headdata, terms, label = data.headdata, data.terms, data.label
	local target = data.inflobj or headdata

	-- No terms at all: at most insert a request for inflections.
	if not (terms and terms[1]) then
		if data.request then
			target.inflections = target.inflections or {}
			insert(target.inflections, {
				label = export.replace_glossary_links_in_label(label),
				request = true,
			})
		end
		return
	end

	-- A leading "-" means "no <label>" when it stands alone, or "usually no <label>" when other terms
	-- follow; in the latter case the "-" is dropped from the list (modifying the caller's list) and the
	-- remaining terms are processed normally.
	if terms[1].term == "-" then
		local others_follow = terms[2]
		local fixed_label
		if others_follow then
			fixed_label = data.usually_no_label or "usually no " .. label
		else
			fixed_label = data.no_label or "no " .. label
		end
		export.insert_fixed_inflection {
			headdata = headdata,
			inflobj = target,
			originating_term = terms[1],
			label = fixed_label,
		}
		if not others_follow then
			return
		end
		remove(terms, 1)
	end

	if data.check_missing then
		export.check_term_list_missing {
			headdata = headdata,
			terms = terms,
			lang = data.lang,
			plpos = data.plpos,
		}
	end

	-- The list of terms itself becomes the inflection entry; its metadata goes in named fields.
	terms.label = export.replace_glossary_links_in_label(label)
	local accel = data.accel
	if accel then
		terms.accel = accel
	end
	terms.enable_auto_translit = data.enable_auto_translit
	target.inflections = target.inflections or {}
	insert(target.inflections, terms)
end
--[==[
Parse raw arguments from `forms` for inline modifiers, and insert the resulting terms (which should not require
significant additional processing) into `headdata.inflections`. `data` is an object with the following fields:
* `forms`: The list of raw values to parse. If {nil} or omitted, nothing happens.
* `headdata`: The headword structure passed to [[Module:headword]]. Required.
* `paramname`: As in `parse_term_list_with_modifiers()`. Required.
* `label`: As in `insert_inflection()`. Required.
* `qualifiers`, `frob`, `include_mods`, `exclude_mods`, `is_head`, `splitchar`, `preserve_splitchar`, `delimiter_key`,
`escape_fun`, `unescape_fun`, `pre_normalize_modifiers`: As in `parse_term_list_with_modifiers()`.
* `accel`, `check_missing`, `lang`, `plpos`: As in `insert_inflection()`.
]==]
function export.parse_and_insert_inflection(data)
	local forms = data.forms
	if not forms or not forms[1] then
		-- Nothing to parse.
		return
	end
	-- Copy so that adding `terms` does not modify the caller's table.
	local opts = shallow_copy(data)
	opts.forms = forms
	opts.terms = export.parse_term_list_with_modifiers(opts)
	export.insert_inflection(opts)
end
--[==[
Combine two sets of qualifiers or labels. If either is {nil}, just return the other, and if both are {nil}, return
{nil}.
]==]
function export.combine_qualifiers_or_labels(quals1, quals2)
	if not quals1 then
		-- Only the second list (if any) is present.
		return quals2 or nil
	end
	if not quals2 then
		return quals1
	end
	-- Both present: copy the first list and append the items of the second that are not already
	-- there, leaving both inputs untouched.
	local merged = shallow_copy(quals1)
	for _, item in ipairs(quals2) do
		insert_if_not(merged, item)
	end
	return merged
end
--[==[
Combine the qualifiers, labels, references and ID's of two term objects. `destobj` is the "destination term object" into
which the combined properties are written, and `srcobj` is the "source object" from which the properties are merged.
`destobj` is side-effected (but the lists inside of `destobj` are not); if this is undesirable, make sure to
shallow-copy `destobj` first. If both objects have values for a given qualifier, label or reference, the values of
`destobj` come first. If both objects have a value for `id`, the values must match or an error is thrown; otherwise,
the resulting value of `id` comes from whichever one is defined.
'''NOTE:''' This may not be the correct behavior when deduplicating a list of term objects. See
`insert_termobj_combining_duplicates` for a different approach.
]==]
function export.combine_termobj_qualifiers_labels(destobj, srcobj)
	-- List-valued properties are merged field by field, destination values first.
	for _, field in ipairs {"q", "qq", "l", "ll", "refs"} do
		destobj[field] = export.combine_qualifiers_or_labels(destobj[field], srcobj[field])
	end
	-- `id` is single-valued: two different values cannot be reconciled.
	local dest_id, src_id = destobj.id, srcobj.id
	if dest_id and src_id and dest_id ~= src_id then
		-- FIXME: We probably want to pass in an error function
		error(("Can't specify two different ID's %s and %s when combining objects"):format(src_id, dest_id))
	end
	destobj.id = dest_id or src_id
	return destobj
end
-- Return a truthy value if `obj` has at least one left/right qualifier, left/right label or reference.
-- The value returned is the first item of the first non-empty list among `q`, `qq`, `l`, `ll` and
-- `refs`, so callers should only rely on its truthiness.
function export.termobj_has_qualifiers_or_labels(obj)
	for _, field in ipairs {"q", "qq", "l", "ll"} do
		local first_item = obj[field] and obj[field][1]
		if first_item then
			return first_item
		end
	end
	return obj.refs and obj.refs[1]
end
-- Compare one list-valued property of two term objects. A missing list and an empty list count as
-- the same thing; two non-empty lists are compared with `deepEquals`.
local function one_ancillary_property_equal(prop1, prop2)
	local empty1 = not (prop1 and prop1[1])
	local empty2 = not (prop2 and prop2[1])
	if empty1 or empty2 then
		-- Equal only if both sides are empty.
		return empty1 and empty2
	end
	return deepEquals(prop1, prop2)
end
-- Check whether two term objects agree on all ancillary properties: the lists `q`, `qq`, `l`, `ll`
-- and `refs` (missing and empty lists being equivalent) and the `id`.
function export.termobj_ancillary_properties_equal(obj1, obj2)
	for _, field in ipairs {"q", "qq", "l", "ll", "refs"} do
		local same = one_ancillary_property_equal(obj1[field], obj2[field])
		if not same then
			return same
		end
	end
	return obj1.id == obj2.id
end
-- Convert a term object (`term`, `tr`, plus qualifiers/labels/references/ID) into a form object with
-- fields `form`, `translit` and `footnotes`. Each modifier value becomes a footnote string of the
-- shape "[prefix:value]", in the order q, qq, l, ll, ref, id; `footnotes` is nil if there are none.
-- NOTE(review): values are concatenated as strings, so this presumably expects `refs` items to be
-- strings rather than reference objects -- confirm against callers.
function export.convert_termobj_to_formobj(termobj)
	local sources = {
		{"q", termobj.q},
		{"qq", termobj.qq},
		{"l", termobj.l},
		{"ll", termobj.ll},
		{"ref", termobj.refs},
		{"id", termobj.id and {termobj.id} or nil},
	}
	local footnotes
	for _, source in ipairs(sources) do
		local prefix, values = source[1], source[2]
		if values and values[1] then
			footnotes = footnotes or {}
			for _, value in ipairs(values) do
				insert(footnotes, "[" .. prefix .. ":" .. value .. "]")
			end
		end
	end
	return {
		form = termobj.term,
		translit = termobj.tr,
		footnotes = footnotes,
	}
end
-- Footnote prefixes that map to list-valued term object fields. Key: the prefix as it appears in a
-- footnote "[prefix:value]"; value: the term object field the value is appended to (note that `ref`
-- is stored in `refs`). Used by `add_footnote_to_termobj()`.
local recognized_multi_mods = {
q = "q",
qq = "qq",
l = "l",
ll = "ll",
ref = "refs",
}
-- Footnote prefixes that map to single-valued term object fields (same key/value convention as
-- `recognized_multi_mods`). Used by `add_footnote_to_termobj()`, which refuses to set two different
-- values for such a field.
local recognized_single_mods = {
id = "id",
}
-- Store a bracketed footnote such as "[q:archaic]" on `termobj`. A recognized lowercase prefix before
-- the first colon selects the destination field (see `recognized_multi_mods` and
-- `recognized_single_mods`); a footnote with no prefix or an unrecognized one is added in full
-- (minus the brackets) to the left labels `l`. Throws an error if the footnote is not bracketed or
-- if it would set a second, different value for a single-valued field.
function export.add_footnote_to_termobj(termobj, footnote)
	local inner = footnote:match("^%[(.*)%]$")
	if not inner then
		error("Internal error: Footnote should be surrounded by brackets at this stage: " .. footnote)
	end
	local prefix, payload = inner:match("^([a-z]+):(.+)$")
	local multi_field = prefix and recognized_multi_mods[prefix]
	local single_field = not multi_field and prefix and recognized_single_mods[prefix]

	if single_field then
		local existing = termobj[single_field]
		if existing and existing ~= payload then
			error(("Can't set two values for '%s': '%s' and '%s'"):format(single_field, existing, payload))
		end
		termobj[single_field] = payload
		return
	end

	if not multi_field then
		-- No usable prefix: the whole footnote text is a left label.
		multi_field, payload = "l", inner
	end
	local values = termobj[multi_field]
	if not values then
		values = {}
		termobj[multi_field] = values
	end
	insert(values, payload)
end
-- Convert a form object (`form`, `translit`, `footnotes`) back into a term object (`term`, `tr`),
-- distributing each footnote to the appropriate field via `add_footnote_to_termobj()`.
function export.convert_formobj_to_termobj(formobj)
	local result = {
		term = formobj.form,
		tr = formobj.translit,
	}
	for _, note in ipairs(formobj.footnotes or {}) do
		export.add_footnote_to_termobj(result, note)
	end
	return result
end
-- Split a field value into its optional one-character modifier prefix (`*` or `+`, or the empty
-- string if there is none) and the remaining text. Returns both as two values.
local function extract_termobj_field_modifiers(fieldval)
	local modifier, remainder = fieldval:match("^([*+]?)(.*)$")
	return modifier, remainder
end
-- Strip the `*` / `+` modifier prefixes from the values of `q`, `qq`, `l`, `ll` and `refs` in
-- `termobj`. A field is only replaced (by a new, deduplicated list) if at least one of its values
-- actually carries a modifier; otherwise it is left as is.
function export.remove_termobj_field_modifiers(termobj)
	for _, field in ipairs {"q", "qq", "l", "ll", "refs"} do
		local values = termobj[field]
		if values and values[1] then
			local has_modifier = false
			for _, value in ipairs(values) do
				if extract_termobj_field_modifiers(value) ~= "" then
					has_modifier = true
					break
				end
			end
			if has_modifier then
				local stripped = {}
				for _, value in ipairs(values) do
					local _, bare = extract_termobj_field_modifiers(value)
					insert_if_not(stripped, bare)
				end
				termobj[field] = stripped
			end
		end
	end
end
--[==[
Insert `termobj` into the list `destobjs` unless a term object with the same `term` and `tr` is already present. In
that case nothing is inserted; instead, for each of the fields `q`, `qq`, `l`, `ll` and `refs` for which `termobj` has
values, the existing object is updated as follows:
* existing values whose modifier prefix is `*` are removed (the field becomes {nil} if nothing is left);
* values of `termobj` whose modifier prefix is `+` are appended, prefix included, unless an existing value has the same
  text once modifier prefixes are ignored.
Lists in the existing object are replaced rather than modified in place. If both objects have an `id`, the two must
match or an error is thrown; otherwise the `id` comes from whichever object has one.
]==]
function export.insert_termobj_combining_duplicates(destobjs, termobj)
	for _, destobj in ipairs(destobjs) do
		if destobj.term == termobj.term and destobj.tr == termobj.tr then
			-- Form already present; maybe combine footnotes.
			local function combine_field_values(field)
				local new_values = termobj[field]
				if not (new_values and new_values[1]) then
					return
				end

				-- Drop existing values marked with `*`. Only the modifier prefix is examined, so an
				-- asterisk elsewhere in the text does not count.
				local existing = destobj[field]
				if existing and existing[1] then
					local kept, dropped_any = {}, false
					for _, val in ipairs(existing) do
						local field_mods = extract_termobj_field_modifiers(val)
						if field_mods == "*" then
							dropped_any = true
						else
							insert(kept, val)
						end
					end
					if dropped_any then
						destobj[field] = kept[1] and kept or nil
					end
				end

				-- This flag must be a local: as a global it would stay set for later fields and later
				-- calls once any value with `+` had been seen.
				local any_values_with_plus = false
				for _, val in ipairs(new_values) do
					local field_mods = extract_termobj_field_modifiers(val)
					if field_mods == "+" then
						any_values_with_plus = true
						break
					end
				end
				if not any_values_with_plus then
					return
				end

				-- Append to a fresh list so a list shared with other objects is not modified.
				if not destobj[field] then
					destobj[field] = {}
				else
					destobj[field] = shallow_copy(destobj[field])
				end
				for _, val in ipairs(new_values) do
					local field_mods, field_without_mods = extract_termobj_field_modifiers(val)
					if field_mods == "+" then
						local already_seen = false
						for _, existing_val in ipairs(destobj[field]) do
							local _, existing_field_without_mods = extract_termobj_field_modifiers(existing_val)
							if existing_field_without_mods == field_without_mods then
								already_seen = true
								break
							end
						end
						if not already_seen then
							insert(destobj[field], val)
						end
					end
				end
			end

			combine_field_values("q")
			combine_field_values("qq")
			combine_field_values("l")
			combine_field_values("ll")
			combine_field_values("refs")
			if destobj.id and termobj.id and destobj.id ~= termobj.id then
				-- FIXME: We probably want to pass in an error function
				error(("Can't specify two different ID's %s and %s when combining objects"):format(termobj.id, destobj.id))
			end
			destobj.id = destobj.id or termobj.id
			return
		end
	end
	insert(destobjs, termobj)
end
-- Set of recognized special indicators, keyed by indicator name without the leading `+` that users
-- write in a parameter value. Consulted by `get_special_indicator()`; the names (other than "+")
-- correspond to the `special` values handled by `handle_multiword()`.
export.allowed_special_indicators = {
["first"] = true,
["first-second"] = true,
["first-last"] = true,
["second"] = true,
["last"] = true,
["each"] = true,
["+"] = true, -- requests the default behavior with preposition handling
}
--[==[
Check for special indicators (values such as {"+first"} or {"+first-last"} that are used in a `pl`, `f`, etc. argument
and indicate how to inflect a multiword term). If `form` is such an indicator, the return value is `form` minus
the initial `+` sign; otherwise, if form begins with a `+` sign, an error is thrown; otherwise the return value is nil.
]==]
function export.get_special_indicator(form, noerror)
	if not form:find("^%+") then
		-- Not an indicator at all.
		return nil
	end
	-- Drop the leading `+`.
	local indicator = form:sub(2)
	if export.allowed_special_indicators[indicator] then
		return indicator
	end
	if noerror then
		-- Caller asked for unrecognized indicators to be ignored rather than reported.
		return nil
	end
	-- List the permitted indicators, in sorted order, in the error message.
	local indicators = {}
	for indic in pairs(export.allowed_special_indicators) do
		insert(indicators, "+" .. indic)
	end
	sort(indicators)
	error("Special inflection indicator beginning with '+' can only be " ..
		mw.text.listToText(indicators) .. ": +" .. indicator)
end
-- Concatenate every base with every ending and return the results as a list, ordered by base and
-- then by ending. Either argument may be a single value instead of a list.
local function add_endings(bases, endings)
	local base_list = type(bases) == "table" and bases or {bases}
	local ending_list = type(endings) == "table" and endings or {endings}
	local combined = {}
	for _, base in ipairs(base_list) do
		for _, ending in ipairs(ending_list) do
			combined[#combined + 1] = base .. ending
		end
	end
	return combined
end
--[==[
Inflect a possibly multiword or hyphenated term `form` using the function `inflect`, which is a function of one argument
that is called on a single word to inflect and should return either the inflected word or a list of inflected words.
`special` indicates how to inflect the multiword term and should be e.g. {"first"} to inflect only the first word,
{"first-last"} to inflect the first and last words, {"each"} to inflect each word, etc. See `allowed_special_indicators`
above for the possibilities. If `special` is `+`, or is omitted and the term is multiword (i.e. containing a space
character), and `prepositions` is supplied, the function checks for multiword or hyphenated terms containing the
prepositions in `prepositions`, e.g. Italian [[senso di marcia]] or [[medaglia d'oro]] or Portuguese
[[tartaruga-do-mar]]. If such a term is found, only the first word is inflected. Otherwise, the default is
{"first-last"}. `prepositions` is a list of Lua patterns matching prepositions. The patterns will automatically have the
separator character (space or hyphen) added to the left side but not the right side, so they should contain a space
character (which will automatically be converted to the appropriate separator) on the right side unless the preposition
is joined on the right side with an apostrophe. Examples of preposition patterns for Italian are {"di "}, {"sull'"} and
{"d?all[oae] "} (which matches {"dallo "}, {"dalle "}, {"alla "}, etc.).
The return value is always either a list of inflected multiword or hyphenated terms, or nil if `special` is omitted
and `form` is not multiword. (If `special` is specified and `form` is not multiword or hyphenated, an error results.)
]==]
-- Parameters: `form` is the term to inflect; `special` is nil, "+" or one of the indicator names
-- handled below; `inflect` is called on individual words and may return a string or a list of
-- strings; `prepositions` is an optional list of Lua patterns; `sep` is an optional Lua pattern for
-- the word separator (it is passed explicitly on the recursive call at the end).
-- Returns a list of inflected terms, or nil if no special handling applies.
function export.handle_multiword(form, special, inflect, prepositions, sep)
-- `sep` is a Lua pattern: a plain space if the term contains one, otherwise an escaped hyphen.
sep = sep or form:find(" ") and " " or "%-"
-- Literal separator character, used when gluing words back together in the "each" case.
local raw_sep = sep == " " and " " or "-"
-- Used to add regex version of separator in the replacement portion of ugsub() or :gsub()
local sep_replacement = sep == " " and " " or "%%-"
-- Given a Lua pattern, replace space with the appropriate separator.
-- The patterns below are all written with spaces; for hyphenated terms each space becomes `%-`.
local function hack_re(re)
if sep == " " then
return re
end
return (re:gsub(" ", sep_replacement))
end
if special == "first" then
-- Inflect the first word; `rest` keeps its leading separator.
local first, rest = form:match(hack_re("^(.-)( .*)$"))
if not first then
error("Special indicator 'first' can only be used with a multiword term: " .. form)
end
return add_endings(inflect(first), rest)
elseif special == "second" then
-- `first` includes its trailing separator and `rest` its leading one, so only `second` is
-- passed to `inflect`. The pattern requires at least three words.
local first, second, rest = form:match(hack_re("^([^ ]+ )([^ ]+)( .*)$"))
if not first then
error("Special indicator 'second' can only be used with a term with three or more words: " .. form)
end
return add_endings(add_endings({first}, inflect(second)), rest)
elseif special == "first-second" then
-- Inflect the first two words independently and combine every pairing of the results.
local first, space, second, rest = form:match(hack_re("^([^ ]+)( )([^ ]+)( .*)$"))
if not first then
error("Special indicator 'first-second' can only be used with a term with three or more words: " .. form)
end
return add_endings(add_endings(add_endings(inflect(first), space), inflect(second)), rest)
elseif special == "each" then
local terms = split(form, sep)
if #terms < 2 then
error("Special indicator 'each' can only be used with a multiword term: " .. form)
end
-- Inflect every word; all words after the first get the literal separator prefixed so that
-- they can simply be concatenated below.
for i, term in ipairs(terms) do
terms[i] = inflect(term)
if i > 1 then
terms[i] = add_endings(raw_sep, terms[i])
end
end
-- Build up all combinations left to right, starting from the empty string.
local result = ""
for _, term in ipairs(terms) do
result = add_endings(result, term)
end
return result
elseif special == "first-last" then
-- Three or more words: `middle` is everything between the first and last word, including the
-- separators on both sides.
local first, middle, last = form:match(hack_re("^(.-)( .* )(.-)$"))
if not first then
-- Exactly two words: `middle` is just the separator.
first, middle, last = form:match(hack_re("^(.-)( )(.*)$"))
end
if not first then
error("Special indicator 'first-last' can only be used with a multiword term: " .. form)
end
return add_endings(add_endings(inflect(first), middle), inflect(last))
elseif special == "last" then
-- `rest` is everything up to and including the last separator.
local rest, last = form:match(hack_re("^(.* )(.-)$"))
if not rest then
error("Special indicator 'last' can only be used with a multiword term: " .. form)
end
return add_endings(rest, inflect(last))
elseif special and special ~= "+" then
error("Unrecognized special=" .. special)
end
-- From here on, `special` is either nil or "+".
-- Only do default behavior if special indicator '+' explicitly given or separator is space; otherwise we will
-- break existing behavior with hyphenated words.
if (special == "+" or sep == " ") and form:find(sep) then
if prepositions then
-- check for prepositions in the middle of the word; do it this way so we can handle
-- more than one word before the preposition (and usually inflect each word)
for _, prep in ipairs(prepositions) do
-- Everything before the separator + preposition is passed to `inflect` as a unit; the
-- preposition and what follows it are appended unchanged.
local first, space_prep_rest = umatch(form, hack_re("^(.-)( " .. prep .. ".*)$"))
if first then
return add_endings(inflect(first), space_prep_rest)
end
end
end
-- multiword or hyphenated expressions default to first-last; we need to pass in the separator to avoid
-- problems with multiword terms containing hyphens in the individual words
return export.handle_multiword(form, "first-last", inflect, prepositions, sep)
end
-- Single word (or hyphenated term without an explicit "+"): nothing to do.
return nil
end
-- Link one component of a split word. Uses the caller-supplied `data.link_hyphen_split_component`
-- function if there is one; otherwise wraps the word in plain double brackets.
local function link_hyphen_split_component(word, data)
	local custom_linker = data.link_hyphen_split_component
	if not custom_linker then
		return "[[" .. word .. "]]"
	end
	return custom_linker(word)
end
-- Default function to split a word on apostrophes. Don't split apostrophes at the beginning or end of a word (e.g.
-- [['ndrangheta]] or [[po']]). Handle multiple apostrophes correctly, e.g. [[l'altr'ieri]] -> [[l']][[altr']][[ieri]].
-- `word` is the word to split; `data` is passed through to `link_hyphen_split_component()` to link each piece.
-- Returns the concatenation of the linked pieces.
function export.default_split_apostrophe(word, data)
	local apostrophe_parts = split(word, "'", true, true)
	local num_parts = #apostrophe_parts
	local linked_apostrophe_parts = {}
	local apostrophes_at_beginning = ""
	local i = 1
	-- Apostrophes at beginning get attached to the first word after (which will always exist but may
	-- be blank if the word consists only of apostrophes). Only advance past blank parts: `i` must end
	-- up pointing at the first non-blank part, or that part would be dropped from the output.
	while i < num_parts and apostrophe_parts[i] == "" do -- <, not <=, in case the word consists only of apostrophes
		apostrophes_at_beginning = apostrophes_at_beginning .. "'"
		i = i + 1
	end
	apostrophe_parts[i] = apostrophes_at_beginning .. apostrophe_parts[i]
	-- Now, do the remaining parts. Every part except the last is followed by an apostrophe, which
	-- stays attached to it. A blank part indicates more than one apostrophe in a row; we join all of
	-- them to the preceding word.
	while i <= num_parts do
		local apostrophe_part = apostrophe_parts[i]
		if i == num_parts then
			-- A blank final part means the word ends in an apostrophe, which has already been
			-- attached to the preceding piece; there is nothing more to add.
			if apostrophe_part ~= "" then
				insert(linked_apostrophe_parts, apostrophe_part)
			end
		elseif apostrophe_part == "" then
			linked_apostrophe_parts[#linked_apostrophe_parts] =
				linked_apostrophe_parts[#linked_apostrophe_parts] .. "'"
		else
			insert(linked_apostrophe_parts, apostrophe_part .. "'")
		end
		i = i + 1
	end
	for j, tolink in ipairs(linked_apostrophe_parts) do
		linked_apostrophe_parts[j] = link_hyphen_split_component(tolink, data)
	end
	return concat(linked_apostrophe_parts)
end
--[=[
Auto-add links to a word that should not have spaces but may have hyphens and/or apostrophes. We split off final
punctuation, then split on hyphens if the relevant hyphen-splitting setting in `data` says so
(`data.split_hyphen_when_space` if `term_has_spaces`, else `data.split_hyphen_when_no_space`, which defaults to true),
and also split on apostrophes if `data.split_apostrophe` is given. We only split on hyphens if they are in the middle of
the word, not at the beginning or end (hyphens at the beginning or end indicate suffixes or prefixes, respectively).

`data.include_hyphen_prefixes`, if given, is a set of prefixes (not including the final hyphen) where we should include
the final hyphen in the prefix. Hence, e.g. if "anti" is in the set, a Portuguese word like [[anti-herói]] "anti-hero"
will be split [[anti-]][[herói]] (whereas a word like [[código-fonte]] "source code" will be split as
[[código]]-[[fonte]]). Likewise, `data.include_hyphen_suffixes`, if given, is a set of suffixes (not including the
initial hyphen) where the preceding hyphen is included in the link to the suffix, so that with "bar" in the set,
"foo-bar" is split as [[foo]][[-bar]]. Each hyphen of the word is output exactly once; if a suffix in the set directly
follows a prefix in the set, the hyphen is displayed by the prefix link and the suffix is linked as [[-bar|bar]].

If `data.split_apostrophe` is specified, we split on apostrophes unless `data.no_split_apostrophe_words` is given and
the word is in the specified set, such as French [[c'est]] and [[quelqu'un]]. If `data.split_apostrophe` is true, the
default algorithm applies, which splits on all apostrophes except those at the beginning and end of a word (as in
Italian [['ndrangheta]] or [[po']]), and includes the apostrophe in the link to its left (so we auto-split French
[[l'eau]] as [[l']][[eau]] and [[l'altr'ieri]] as [[l']][[altr']][[ieri]]). If `data.split_apostrophe` is specified
but not `true`, it should be a function of one argument that does custom apostrophe-splitting. The argument is the word
to split, and the return value should be the split and linked word.
]=]
local function add_single_word_links(space_word, data, term_has_spaces)
	-- Separate trailing punctuation. `data.punctuation` may be a function returning (word, punctuation) or a
	-- pattern matching a single trailing punctuation character; it defaults to [,;:?!].
	local space_word_no_punct, punct
	local punct_pattern = data.punctuation
	if punct_pattern and is_callable(punct_pattern) then
		space_word_no_punct, punct = punct_pattern(space_word)
	else
		if punct_pattern == nil then
			punct_pattern = "[,;:?!]"
		end
		space_word_no_punct, punct = umatch(space_word, "^(.*)(" .. punct_pattern .. ")$")
	end
	space_word_no_punct = space_word_no_punct or space_word
	punct = punct or ""

	local words
	if space_word_no_punct:sub(1, 1) == "-" or space_word_no_punct:sub(-1) == "-" then
		-- don't split prefixes and suffixes
		words = {space_word_no_punct}
	else
		local splitter
		if term_has_spaces then
			splitter = data.split_hyphen_when_space
		else
			splitter = data.split_hyphen_when_no_space
		end
		if is_callable(splitter) then
			words = splitter(space_word_no_punct)
			-- A string result is the finished, already-linked word.
			if type(words) == "string" then
				return words .. punct
			end
		end
	end
	if not words then
		local split_hyphen
		if term_has_spaces then
			split_hyphen = data.split_hyphen_when_space
		else
			split_hyphen = data.split_hyphen_when_no_space
			if split_hyphen == nil then -- default to true; use `false` to avoid this
				split_hyphen = true
			end
		end
		if split_hyphen then
			words = split(space_word_no_punct, "-", true, true)
		else
			words = {space_word_no_punct}
		end
	end

	-- Classify every component first, so that each hyphen between two components is emitted exactly once: inside
	-- the link of a preceding included prefix, else inside the link of a following included suffix, else as a bare
	-- hyphen between the two links.
	local num_words = #words
	local prefixes = data.include_hyphen_prefixes
	local suffixes = data.include_hyphen_suffixes
	local kinds = {}
	for j, word in ipairs(words) do
		if j < num_words and prefixes and prefixes[word] then
			kinds[j] = "prefix"
		elseif j > 1 and suffixes and suffixes[word] then
			kinds[j] = "suffix"
		else
			kinds[j] = "plain"
		end
	end

	local linked_words = {}
	for j, word in ipairs(words) do
		local kind = kinds[j]
		if kind == "prefix" then
			word = "[[" .. word .. "-]]"
		elseif kind == "suffix" then
			if kinds[j - 1] == "prefix" then
				-- The hyphen is already displayed by the preceding prefix link; link to the suffix entry but
				-- don't display a second hyphen.
				word = "[[-" .. word .. "|" .. word .. "]]"
			else
				word = "[[-" .. word .. "]]"
			end
		else
			-- Don't split on apostrophes if the word is in `no_split_apostrophe_words`.
			if (not data.no_split_apostrophe_words or not data.no_split_apostrophe_words[word]) and
				data.split_apostrophe and word:find("'", nil, true) then
				if data.split_apostrophe == true then
					word = export.default_split_apostrophe(word, data)
				else -- custom apostrophe splitter/linker
					word = data.split_apostrophe(word)
				end
			elseif word ~= "" then -- avoid -[[]]- (e.g. f--k)
				word = link_hyphen_split_component(word, data)
			end
		end
		-- Emit the hyphen following this component unless it is already part of this component's prefix link or
		-- will be part of the next component's suffix link.
		if kind ~= "prefix" and j < num_words and kinds[j + 1] ~= "suffix" then
			word = word .. "-"
		end
		insert(linked_words, word)
	end
	return concat(linked_words) .. punct
end
--[=[
Auto-add links to a multiword term. `data` contains fields customizing how to do this. By default we proceed as follows:
(1) If the term already has embedded links in it, they are left unchanged.
(2) Otherwise, if there are spaces present, we split on spaces and link each word separately.
(3) If a given space-separated component ends in punctuation (defaulting to [,;:?!]), it is separated off, the remainder
of the algorithm run, and the punctuation pasted back on.
(4) If there are hyphens in a given space-separated component, we may link each hyphenated term separately depending
on the settings in `data`. Normally the hyphens are not included in the linked terms, but this can be overridden
for specific prefixes and/or suffixes. By default, if there are spaces in the multiword term, we do not link
hyphenated components (because of cases like "boire du petit-lait" where "petit-lait" should be linked as a whole),
but do so otherwise (e.g. for "avant-avant-hier"); this can be overridden for cases like "croyez-le ou non".
Cases where only some of the hyphens should be split can always be handled by explicitly specifying the head (e.g.
"Nord-Pas-de-Calais" given as head=[[Nord]]-[[Pas-de-Calais]]).
(5) If there are apostrophes in a given component, we may link each apostrophe-separated term separately depending
on the settings in `data`, including the apostrophe in the link to its left (so we split "de l'eau" as
"[[de]] [[l']][[eau]]").
The settings in `data` are as follows:
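`split_hyphen_when_no_space`: Whether to split on hyphens when the term has no spaces. Defaults to true if set to `nil`.
This can be a function of one argument, to implement a custom splitting algorithm for hyphen-separated terms. If
this returns a string, that string is used directly as the linked form of the word (with any split-off punctuation
appended); if it returns a list of words, each word is linked separately and the words are rejoined with hyphens; if
it returns nil, the word is split on all hyphens as if `true` had been given.
`split_hyphen_when_space`: Likewise, but used when the term has spaces; if `nil`, hyphens are not split.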
If `data.split_apostrophe` is specified, we split on apostrophes unless `data.no_split_apostrophe_words` is given and
the word is in the specified set, such as French [[c'est]] and [[quelqu'un]]. If `data.split_apostrophe` is true, the
default algorithm applies, which splits on all apostrophes except those at the beginning and end of a word (as in
Italian [['ndrangheta]] or [[po']]), and includes the apostrophe in the link to its left (so we auto-split French
[[l'eau]] as [[l']][[eau]] and [[l'altr'ieri]] as [[l']][[altr']][[ieri]]). If `data.split_apostrophe` is specified
but not `true`, it should be a function of one argument that does custom apostrophe-splitting. The argument is the word
to split, and the return value should be the split and linked word.
We don't always split on hyphens because of cases like "boire du petit-lait" where "petit-lait" should be linked as a
whole, but provide the option to do it for cases like "croyez-le ou non". If there's no space, however, then it makes
sense to split on hyphens by default. `no_split_apostrophe_words` and `include_hyphen_prefixes` allow for special-case handling
of particular words and are as described in the comment above add_single_word_links().
]=]
function export.add_links_to_multiword_term(term, data)
	-- A term that already contains brackets is returned untouched.
	if term:match("[%[%]]") then
		return term
	end
	-- Link each space-separated word on its own, telling the per-word linker whether the term as a whole is
	-- multiword, then glue the results back together with single spaces.
	local space_words = split(term, " ", true, true)
	local is_multiword = #space_words > 1
	local pieces = {}
	for idx, space_word in ipairs(space_words) do
		pieces[idx] = add_single_word_links(space_word, data, is_multiword)
	end
	local linked = concat(pieces, " ")
	-- If the outcome is nothing but one link spanning the entire term, strip the brackets again.
	local sole_link_text = linked:match("^%[%[([^%[%]]*)%]%]$")
	if sole_link_text then
		return sole_link_text
	end
	return linked
end
local function canonicalize_begin_end_spec(spec)
	-- Split a FROM:TO spec at its first colon and return both halves. A spec without any colon is treated as
	-- FROM with an empty TO.
	local source, target = spec:match("^(.-):(.*)$")
	if source then
		return source, target
	end
	return spec, ""
end
--[==[
Given a `linked_term` that is the output of add_links_to_multiword_term(), apply modifications as given in
`modifier_spec` to change the link destination of subterms (normally single-word non-lemma forms; sometimes
collections of adjacent words). This is usually used to link non-lemma forms to their corresponding lemma, but can
also be used to replace a span of adjacent separately-linked words to a single multiword lemma. The format of
`modifier_spec` is one or more semicolon-separated subterm specs, where each such spec is of the form
SUBTERM:DEST, where SUBTERM is one or more words in the `linked_term` but without brackets in them, and DEST is the
corresponding link destination to link the subterm to. Any occurrence of ~ in DEST is replaced with SUBTERM.
Alternatively, a single modifier spec can be of the form BEGIN[FROM:TO], which is equivalent to writing
BEGINFROM:BEGINTO (see example below).
For example, given the source phrase [[il bue che dice cornuto all'asino]] "the pot calling the kettle black"
(literally "the ox that calls the donkey horned/cuckolded"), the result of calling add_links_to_multiword_term()
is [[il]] [[bue]] [[che]] [[dice]] [[cornuto]] [[all']][[asino]]. With a modifier_spec of 'dice:dire', the result
is [[il]] [[bue]] [[che]] [[dire|dice]] [[cornuto]] [[all']][[asino]]. Here, based on the modifier spec, the
non-lemma form [[dice]] is replaced with the two-part link [[dire|dice]].
Another example: given the source phrase [[chi semina vento raccoglie tempesta]] "sow the wind, reap the whirlwind"
(literally "(he) who sows wind gathers [the] tempest"). The result of calling add_links_to_multiword_term() is
[[chi]] [[semina]] [[vento]] [[raccoglie]] [[tempesta]], and with a modifier_spec of 'semina:~re; raccoglie:~re',
the result is [[chi]] [[seminare|semina]] [[vento]] [[raccogliere|raccoglie]] [[tempesta]]. Here we use the ~
notation to stand for the non-lemma form in the destination link.
A more complex example is [[se non hai altri moccoli puoi andare a letto al buio]], which becomes
[[se]] [[non]] [[hai]] [[altri]] [[moccoli]] [[puoi]] [[andare]] [[a]] [[letto]] [[al]] [[buio]] after calling
add_links_to_multiword_term(). With the following modifier_spec:
'hai:avere; altr[i:o]; moccol[i:o]; puoi: potere; andare a letto:~; al buio:~', the result of applying the spec is
[[se]] [[non]] [[avere|hai]] [[altro|altri]] [[moccolo|moccoli]] [[potere|puoi]] [[andare a letto]] [[al buio]].
Here, we rely on the alternative notation mentioned above for e.g. 'altr[i:o]', which is equivalent to 'altri:altro',
and link multiword subterms using e.g. 'andare a letto:~'. (The code knows how to handle multiword subexpressions
properly, and if the link text and destination are the same, only a single-part link is formed.)
]==]
function export.apply_link_modifiers(linked_term, modifier_spec, lang)
	-- `linked_term` is the bracketed term to modify; `modifier_spec` is the semicolon-separated list of subterm
	-- specs; `lang` is the language object used for links that carry an <id:...> but no language prefix. Returns
	-- the modified linked term. Throws an error if a subterm contains brackets, cannot be found in the linked term
	-- (or its replacement is identical to what is already there), or if <id:...> or a language prefix is combined
	-- with an unlinked ("_") destination.
	local split_modspecs = split(modifier_spec, "%s*;%s*")
	for j, modspec in ipairs(split_modspecs) do
		-- Peel off a trailing <id:...>, if present.
		local id
		if modspec:find("<") then
			local rest
			rest, id = modspec:match("^(.*)<id:(.-)>$")
			if rest then
				modspec = rest
			end
		end
		local subterm, dest, otherlang
		-- Try the bracketed shorthands first: [FROM:TO]REST[FROM:TO], then REST[FROM:TO], then [FROM:TO]REST.
		local begin_spec, rest, end_spec = modspec:match("^%[(.-)%]([^:]*)%[(.-)%]$")
		if begin_spec then
			local begin_from, begin_to = canonicalize_begin_end_spec(begin_spec)
			local end_from, end_to = canonicalize_begin_end_spec(end_spec)
			subterm = begin_from .. rest .. end_from
			dest = begin_to .. rest .. end_to
		end
		if not subterm then
			rest, end_spec = modspec:match("^([^:]*)%[(.-)%]$")
			if rest then
				local end_from, end_to = canonicalize_begin_end_spec(end_spec)
				subterm = rest .. end_from
				dest = rest .. end_to
			end
		end
		if not subterm then
			begin_spec, rest = modspec:match("^%[(.-)%]([^:]*)$")
			if begin_spec then
				local begin_from, begin_to = canonicalize_begin_end_spec(begin_spec)
				subterm = begin_from .. rest
				dest = begin_to .. rest
			end
		end
		-- Plain SUBTERM:DEST form.
		if not subterm then
			subterm, dest = modspec:match("^(.-)%s*:%s*(.*)$")
			if subterm and subterm ~= "^" and subterm ~= "$" then
				local langdest
				-- Parse off an initial language code (e.g. 'en:Higgs', 'la:minūtia' or 'grc:σκατός'). Also handle
				-- Wikipedia prefixes ('w:Abatemarco' or 'w:it:Colle Val d'Elsa').
				otherlang, langdest = dest:match("^([A-Za-z0-9._-]+):([^ ].*)$")
				if otherlang == "w" then
					local foreign_wikipedia, foreign_term = langdest:match("^([A-Za-z0-9._-]+):([^ ].*)$")
					if foreign_wikipedia then
						otherlang = otherlang .. ":" .. foreign_wikipedia
						langdest = foreign_term
					end
					-- Wikipedia links keep their prefix inside the destination and are not treated as a language.
					dest = ("%s:%s"):format(otherlang, langdest)
					otherlang = nil
				elseif otherlang then
					otherlang = get_lang(otherlang, true, "allow etym")
					dest = langdest
				end
			end
		end
		-- No colon and no bracket shorthand: bare spec.
		if not subterm then
			if modspec == "?" or modspec == "!" then
				subterm = "$"
				dest = modspec
			elseif modspec == "..." or modspec == "...?" then
				subterm = "$"
				dest = " " .. modspec
			elseif modspec:find("^[A-Z]$") then
				-- X, Y, etc. by themselves are unlinked, to help with snowclones
				subterm = modspec
				dest = "_"
			else
				subterm = modspec
				dest = "~"
			end
		end
		if subterm == "^" then
			-- Prepend text to the whole term; underscores stand for spaces.
			linked_term = dest:gsub("_", " ") .. linked_term
		elseif subterm == "$" then
			-- Append text to the whole term; underscores stand for spaces.
			linked_term = linked_term .. dest:gsub("_", " ")
		else
			if subterm:find("[", nil, true) then
				error(("Subterm '%s' in modifier spec '%s' cannot have brackets in it"):format(
					escape_wikicode(subterm), escape_wikicode(modspec)))
			end
			local escaped_subterm = pattern_escape(subterm)
			-- Match the subterm inside [[...]], tolerating link boundaries next to spaces, apostrophes, commas and
			-- hyphens so that a multiword subterm matches a run of separately linked words.
			local subterm_re = "%[%[" .. escaped_subterm:gsub("(%%?[ ',%-])", "%%]*%1%%[*") .. "%]%]"
			local expanded_dest
			if dest:find("~", nil, true) then
				expanded_dest = dest:gsub("~", replacement_escape(subterm))
			else
				expanded_dest = dest
			end
			local subterm_replacement
			if expanded_dest == "_" then
				-- Unlinked subterm. This test must come before the language name is appended to the destination;
				-- otherwise a spec like 'foo:la:_' would never reach the language check below.
				if id then
					error("Can't supply <id:...> with an unlinked subterm")
				end
				if otherlang then
					error("Can't supply prefixed language with an unlinked subterm")
				end
				subterm_replacement = subterm
			else
				if otherlang then
					expanded_dest = expanded_dest .. "#" .. otherlang:getCanonicalName()
				end
				if id or otherlang then
					if id and expanded_dest:find("[", nil, true) then
						error("Can't supply <id:...> with destination with embedded brackets")
					end
					subterm_replacement = require(links_module).language_link {
						lang = otherlang or lang,
						term = expanded_dest,
						alt = subterm,
						id = id,
					}
				elseif expanded_dest:find("[", nil, true) then
					-- Use the destination directly if it has brackets in it (e.g. to put brackets around parts of a word).
					subterm_replacement = expanded_dest
				elseif expanded_dest == subterm then
					subterm_replacement = "[[" .. subterm .. "]]"
				else
					subterm_replacement = "[[" .. expanded_dest .. "|" .. subterm .. "]]"
				end
			end
			local escaped_subterm_replacement = replacement_escape(subterm_replacement)
			local replaced_linked_term = ugsub(linked_term, subterm_re, escaped_subterm_replacement)
			if replaced_linked_term == linked_term then
				mw.log(("Attempted to replace %s with %s in %s"):format(subterm_re, escaped_subterm_replacement, linked_term))
				error(("Subterm '%s' could not be located in %slinked expression %s, or replacement same as subterm"):format(
					subterm, j > 1 and "intermediate " or "", escape_wikicode(linked_term)))
			else
				linked_term = replaced_linked_term
			end
		end
	end
	return linked_term
end
return export