Difference between revisions of "Module:Language/data"

From Eat Every Plant
Jump to navigation Jump to search
m (1 revision)
 
m (1 revision: From PNW foraging book - part 3)
 
(One intermediate revision by the same user not shown)
Line 1: Line 1:
 
local U = mw.ustring.char
 
local U = mw.ustring.char
  
-- diacritics
+
-- Diacritics, from the [[Combining Diacritical Marks]] block.
local grave     = U(0x300)
+
local grave       = U(0x300)
local acute     = U(0x301)
+
local acute       = U(0x301)
 +
local circumflex  = U(0x302)
 +
local tilde        = U(0x303)
 +
local macron      = U(0x304)
 +
local breve        = U(0x306)
 +
local dot          = U(0x307)
 +
local diaeresis    = U(0x308)
 
local double_acute = U(0x30B)
 
local double_acute = U(0x30B)
local tilde    = U(0x303)
+
local double_grave = U(0x30F)
local macron    = U(0x304)
+
local invbreve    = U(0x311)
local dgrave    = U(0x30F)
+
local undertie    = U(0x35C)
local invbreve  = U(0x311)
 
 
 
--[[ Name is the "canonical name" used on Wiktionary. Article is the Wikipedia article. Script is the ISO 15924 code. ]]
 
  
 +
--[[
 +
 +
This is a table of Wiktionary language codes with data belonging to them.
 +
Name is the "canonical name" used on Wiktionary.
 +
Article is the Wikipedia article.
 +
Script is the ISO 15924 code.
 +
]]
 
local data = {
 
local data = {
["ang"] = {
+
["languages"] = {
["name"] = "Old English",
+
["ab"] = {
["article"] = {"Old English"},
+
["name"] = "Abkhaz",
["scripts"] = {"Latn"},
+
},
-- Remove macrons, acutes, and overdots
+
["ang"] = {
["replacements"] = {
+
["name"] = "Old English",
["[ĀÁ]"] = "A",
+
["article"] = {"Old English"},
["[āá]"] = "a",
+
-- ["scripts"] = {"Latn"},
["[ǢǼ]"] = "Æ",
+
-- Remove macrons, acutes, and overdots
["[ǣǽ]"] = "æ",
+
["replacements"] = {
["Ċ"]    = "C",
+
decompose = true,
["ċ"]   = "c",
+
from = { "[" .. macron .. acute .. dot .. "]" },
["[ĒÉ]"] = "E",
 
["[ēé]"] = "e",
 
["Ġ"]   = "G",
 
["ġ"]    = "g",
 
["[ĪÍ]"] = "I",
 
["[īí]"] = "i",
 
["[ŌÓ]"] = "O",
 
["[ōó]"] = "o",
 
["[ŪÚ]"] = "U",
 
["[ūú]"] = "u",
 
["[ȲÝ]"] = "Y",
 
["[ȳý]"] = "y",
 
 
},
 
},
 
},
 
},
["ar"] = {
+
["ar"] = {
["name"] = "Arabic",
+
["name"] = "Arabic",
["article"] = "Arabic language",
+
["article"] = "Arabic language",
["scripts"] = { "Arab" },
+
-- ["scripts"] = { "Arab" },
--[[ ālif with wasla is replaced by ālif;
+
["direction"] = "rtl", -- Should be in the script data module.
taṭwīl, fatḥatan, ḍammatan, kasratan,
+
["replacements"] = {
fatḥa, ḍamma, kasra,
+
-- ālif with wasla is replaced by ālif;
shadda, sukūn, and superscript (dagger) ālif are removed. ]]
+
[U(0x0671)] = U(0x0627),
["direction"] = "rtl", -- Should be in the script data module.
+
-- taṭwīl, fatḥatan, ḍammatan, kasratan,
["replacements"] = {
+
-- fatḥa, ḍamma, kasra,
[U(0x0671)] = U(0x0627),
+
-- shadda, sukūn, and superscript (dagger) ālif are removed.
["["..U(0x0640)..U(0x064B)..U(0x064C)..U(0x064D)
+
["["..U(0x0640)..U(0x064B)..U(0x064C)..U(0x064D)
..U(0x064E)..U(0x064F)..U(0x0650)
+
..U(0x064E)..U(0x064F)..U(0x0650)
..U(0x0651)..U(0x0652)..U(0x0670).."]"] = "",
+
..U(0x0651)..U(0x0652)..U(0x0670).."]"] = "",
 
},
 
},
 
},
 
},
["be"] = {
+
["av"] = {
["article"] = "Belarusian language",
+
["name"] = "Avar"
["scripts"] = { "Cyrl" },
+
},
-- Combining acute accent is removed.
+
["be"] = {
["replacements"] = { [U(0x0301)] = "", }
+
["article"] = "Belarusian language",
 +
-- ["scripts"] = { "Cyrl" },
 +
-- Combining acute accent is removed.
 +
["replacements"] = { [U(0x0301)] = "", },
 +
},
 +
["bn"] = {
 +
["name"] = "Bengali",
 +
["article"] = "Bengali language",
 +
-- ["scripts"] = { "Beng" },
 
},
 
},
["bn"] = {
+
["bua"] = {
["name"] = "Bengali",
+
["name"] = "Buryat",
["article"] = "Bengali language",
 
["scripts"] = { "Beng" },
 
 
},
 
},
["cu"] = {
+
["cel-pro"] = {
["name"] = "Old Church Slavonic",
+
["name"] = "Proto-Celtic",
["article"] = "Old Church Slavonic",
+
["Wikipedia_code"] = "cel-x-proto",
["scripts"] = { "Cyrs" },
 
 
},
 
},
["de"] = {
+
["cu"] = {
["name"] = "German",
+
["name"] = "Old Church Slavonic",
["article"] = "German language",
+
["article"] = "Old Church Slavonic",
["scripts"] = { "Latn" },
+
-- ["scripts"] = { "Cyrs" },
--[[
 
["replacements"] = {
 
["ae"]    = "ä",
 
["oe"]    = "ö",
 
["ue"]    = "ü",
 
["A[Ee]"] = "Ä",
 
["O[Ee]"] = "Ö",
 
["U[Ee]"] = "Ü",
 
 
},
 
},
]]
+
["de"] = {
 +
["name"] = "German",
 +
["article"] = "German language",
 +
-- ["scripts"] = { "Latn" },
 +
--[[
 +
["replacements"] = {
 +
["ae"]    = "ä",
 +
["oe"]    = "ö",
 +
["ue"]    = "ü",
 +
["A[Ee]"] = "Ä",
 +
["O[Ee]"] = "Ö",
 +
["U[Ee]"] = "Ü",
 +
},
 +
]]
 
},
 
},
["en"] = {
+
["en"] = {
["name"] = "English",
+
["name"] = "English",
["article"] = "English language",
+
["article"] = "English language",
["scripts"] = { "Latn" },
+
-- ["scripts"] = { "Latn" },
 
},
 
},
["es"] = {
+
["es"] = {
["name"] = "Spanish",
+
["name"] = "Spanish",
["article"] = "Spanish language",
+
["article"] = "Spanish language",
["scripts"] = { "Latn" },
+
-- ["scripts"] = { "Latn" },
 
},
 
},
["fr"] = {
+
["egy"] = {
["name"] = "French",
+
["name"] = "Egyptian",
["article"] = "French language",
 
["scripts"] = { "Latn" },
 
 
},
 
},
["frm"] = {
+
["fr"] = {
["name"] = "Middle French",
+
["name"] = "French",
["article"] = "Middle French",
+
["article"] = "French language",
["scripts"] = { "Latn" },
+
-- ["scripts"] = { "Latn" },
 
},
 
},
["gem-pro"] = {
+
["frm"] = {
["name"] = "Proto-Germanic",
+
["name"] = "Middle French",
["article"] = "Proto-Germanic language",
+
["article"] = "Middle French",
["script"] = { "Latn" },
+
-- ["scripts"] = { "Latn" },
["type"] = "reconstructed",
 
["replacements"] = {},
 
 
},
 
},
["grc"] = {
+
["frp"] = {
["name"] = "Ancient Greek",
+
["name"] = "Franco-Provençal",
["article"] = "Ancient Greek",
+
},
["scripts"] = { "Grek" },
+
["ff"] = {
["replacements"] = {
+
["name"] = "Fula",
-- Vowels with macrons or breves are replaced with plain letters.
+
},
["[ᾱᾰ]"] = "α",
+
["gem-pro"] = {
["[ᾹᾸ]"] = "Α",
+
["name"] = "Proto-Germanic",
["[ῑῐ]"] = "ι",
+
["article"] = "Proto-Germanic language",
["[ῙῘ]"] = "Ι",
+
-- ["scripts"] = { "Latn" },
["[ῡῠ]"] = "υ",
+
["type"] = "reconstructed",
["[ῩῨ]"] = "Υ",
+
["replacements"] = {},
["ϐ"]   = "β",
+
["Wikipedia_code"] = "gem-x-proto",
["ϵ"]   = "ε",
+
},
["ϑ"]   = "θ",
+
["gmw-ecg"] = {
["ϰ"]   = "κ",
+
["name"] = "East Central German",
["ϱ"]   = "ρ",
+
},
["ϲ"]    = "σ",
+
["got"] = {
["ϕ"]   = "φ",
+
["name"] = "Gothic",
 +
["article"] = "Gothic language",
 +
-- ["scripts"] = { "Goth" },
 +
["replacements"] = {
 +
-- Latin to Gothic since people will not want to have to copy
 +
-- and paste Gothic letters in
 +
["[AÁaáĀā]"] = "𐌰",
 +
["[Bb]"]    = "𐌱",
 +
["[Gg]"]    = "𐌲",
 +
["[Dd]"]    = "𐌳",
 +
["[EeĒē]"]  = "𐌴",
 +
["[Qq]"]    = "𐌵",
 +
["[Zz]"]    = "𐌶",
 +
["[Hh]"]    = "𐌷",
 +
["[Þþ]"]    = "𐌸",
 +
["[IiÍí]"]  = "𐌹",
 +
["[Kk]"]    = "𐌺",
 +
["[Ll]"]    = "𐌻",
 +
["[Mm]"]    = "𐌼",
 +
["[Nn]"]    = "𐌽",
 +
["[Jj]"]     = "𐌾",
 +
["[UuÚúŪū]"] = "𐌿",
 +
["[Pp]"]     = "𐍀",
 +
["[Rr]"]     = "𐍂",
 +
["[Ss]"]     = "𐍃",
 +
["[Tt]"]     = "𐍄",
 +
["[WwYy]"]   = "𐍅",
 +
["[Ff]"]     = "𐍆",
 +
["[Xx]"]     = "𐍇",
 +
["[Ƕƕ]"]    = "𐍈", -- Not sure if "hw" and "hv" can safely be converted
 +
["[OoŌō]"]   = "𐍉",
 
},
 
},
 
},
 
},
["grk-pro"] = {
+
["gsw"] = {
["name"] = "Proto-Hellenic",
+
["name"] = "Alemannic German",
["Wikipedia_name"] = "Proto-Greek",
+
},
["article"] = "Proto-Greek language",
+
["grc"] = {
["script"] = { "Latn" },
+
["name"] = "Ancient Greek",
["type"] = "reconstructed",
+
["article"] = "Ancient Greek",
["replacements"] = {},
+
-- ["scripts"] = { "Grek" },
 +
["replacements"] = {
 +
decompose = true,
 +
from = {
 +
-- Replace variant letterforms with standard ones.
 +
"ϐ", "ϵ", "ϑ", "ϰ", "ϱ", "ϲ", "ϕ",
 +
-- Remove macrons and breves.
 +
"[" .. macron .. breve .. undertie .. "]"
 +
},
 +
to  = {
 +
"β", "ε", "θ", "κ", "ρ", "σ", "φ",
 +
}
 +
},
 +
},
 +
["grk-pro"] = {
 +
["name"] = "Proto-Hellenic",
 +
["Wikipedia_name"] = "Proto-Greek",
 +
["article"] = "Proto-Greek language",
 +
-- ["scripts"] = { "Latn" },
 +
["type"] = "reconstructed",
 +
["replacements"] = {},
 +
},
 +
["ha"] = {
 +
["name"] = "Hausa",
 +
-- remove tilde, grave, acute, macron, circumflex
 +
["replacements"] = {
 +
decompose = true,
 +
from = { "[" .. grave .. circumflex .. macron .. acute .. tilde .. "]" },
 +
},
 +
},
 +
["hi"] = {
 +
["name"] = "Hindi",
 +
["article"] = "Hindi",
 +
-- ["scripts"] = { "Deva" },
 +
},
 +
["ine-pro"] = {
 +
["name"] = "Proto-Indo-European",
 +
["article"] = "Proto-Indo-European language",
 +
-- ["scripts"] = { "Latn" },
 +
["type"] = "reconstructed",
 +
["replacements"] = {},
 +
["Wikipedia_code"] = "ine-x-proto",
 +
},
 +
["ja"] = {
 +
["name"] = "Japanese",
 +
["article"] = "Japanese language",
 +
-- ["scripts"] = { "Jpan" },
 
},
 
},
["hi"] = {
+
["jbo"] = { -- Lojban
["name"] = "Hindi",
+
["type"] = "appendix",
["article"] = "Hindi",
 
["scripts"] = { "Deva" },
 
 
},
 
},
["ine-pro"] = {
+
["la"] = {
["name"] = "Proto-Indo-European",
+
["name"] = "Latin",
["article"] = "Proto-Indo-European language",
+
["article"] = "Latin",
["script"] = { "Latn" },
+
-- ["scripts"] = { "Latn" },
["type"] = "reconstructed",
+
["replacements"] = {
["replacements"] = {},
+
-- Remove macrons, breves, and diaereses.
 +
decompose = true,
 +
from = { "[" .. macron .. breve .. diaeresis .. "]" },
 +
},
 
},
 
},
["ja"] = {
+
["lt"] = {
["name"] = "Japanese",
+
["name"] = "Lithuanian",
["article"] = "Japanese language",
+
-- remove acute, tilde, grave
["scripts"] = { "Jpan" },
+
["replacements"] = {
 +
decompose = true,
 +
from = { "[" .. acute .. tilde .. grave .. "]" },
 +
},
 
},
 
},
["la"] = {
+
["moe"] = {
["name"] = "Latin",
+
["name"] = "Cree",
["article"] = "Latin",
+
},
["scripts"] = { "Latn" },
+
["mul"] = {
["replacements"] = {
+
["name"] = "Translingual",
-- Vowels with macrons, breves, or diaereses are replaced with plain letters.
+
["article"] = "",
["[ĀĂ]"] = "A",
+
-- ["scripts"] = { "" },
["[āă]"] = "a",
+
},
["[ĒĔ]"] = "E",
+
["nci"] = {
["[ēĕë]"] = "e",
+
["name"] = "Classical Nahuatl",
["[ĪĬÏ]"] = "I",
+
["article"] = "Classical Nahuatl",
["[īĭï]"] = "i",
+
-- ["scripts"] = {"Latn"},
["[ŌŎ]"] = "O",
+
-- Remove macrons, acutes, circumflexes and graves
["[ōŏ]"]  = "o",
+
["replacements"] = {
["[ŪŬÜ]"] = "U",
+
decompose = true,
["[ūŭü]"] = "u",
+
-- Remove macrons, acutes, circumflexes, graves, and saltillo;
["Ȳ"]     = "Y",
+
-- see [[Saltillo (linguistics)]].
["ȳ"]    = "y"
+
from = { "[" .. grave .. acute .. macron .. circumflex .. "Ꞌꞌʻʼ'ʔ]" },
 
},
 
},
 
},
 
},
["mul"] = {
+
["nds-de"] = {
["name"] = "Translingual",
+
["name"] = "German Low German",
["article"] = "",
+
},
["script"] = { "" },
+
["oj"] = {
 +
["name"] = "Ojibwe",
 
},
 
},
["orv"] = {
+
["orv"] = {
["name"] = "Old East Slavic",
+
["name"] = "Old East Slavic",
["article"] = "Old East Slavic",
+
["article"] = "Old East Slavic",
["script"] = { "Cyrs" },
+
-- ["scripts"] = { "Cyrs" },
["replacements"] = {
+
["replacements"] = {
[U(0x484)] = "",
+
[U(0x484)] = "",
 
},
 
},
 
},
 
},
["pt"] = {
+
["pt"] = {
["name"] = "Portuguese",
+
["name"] = "Portuguese",
["article"] = "Portuguese language",
+
["article"] = "Portuguese language",
["scripts"] = { "Latn" },
+
-- ["scripts"] = { "Latn" },
 +
},
 +
["pa"] = {
 +
["name"] = "Punjabi",
 +
["article"] = "Punjabi language",
 +
-- ["scripts"] = { "Guru", "Arab", },
 
},
 
},
["pa"] = {
+
["ru"] = {
["name"] = "Punjabi",
+
["name"] = "Russian",
["article"] = "Punjabi language",
+
["article"] = "Russian language",
["scripts"] = { "Guru", "Arab", }
+
-- ["scripts"] = { "Cyrl" },
 +
-- Combining acute accent is removed.
 +
["replacements"] = { [U(0x0301)] = "", },
 
},
 
},
["ru"] = {
+
["rw"] = {
["name"] = "Russian",
+
["name"] = "Rwanda-Rundi",
["article"] = "Russian language",
 
["scripts"] = { "Cyrl" },
 
-- Combining acute accent is removed.
 
["replacements"] = { [U(0x0301)] = "", }
 
 
},
 
},
["sh"] = {
+
["se"] = {
["article"] = "Serbo-Croatian language",
+
["replacements"] = {
["scripts"] = { "Latn", "Cyrl" },
+
["([đflmnŋrsšŧv])'%1"] = "%1%1",
["replacements"] = {
+
},
["[ȀÀȂÁĀÃ]"] = "A",
 
["[ȁàȃáāã]"] = "a",
 
["[ȄÈȆÉĒẼ]"] = "E",
 
["[ȅèȇéēẽ]"] = "e",
 
["[ȈÌȊÍĪĨ]"] = "I",
 
["[ȉìȋíīĩ]"] = "i",
 
["[ȌÒȎÓŌÕ]"] = "O",
 
["[ȍòȏóōõ]"] = "o",
 
["[ȐȒŔ]"] = "R",
 
["[ȑȓŕ]"] = "r",
 
["[ȔÙȖÚŪŨ]"] = "U",
 
["[ȕùȗúūũ]"] = "u",
 
["Ѐ"] = "Е",
 
["ѐ"] = "е",
 
["[ӢЍ]"] = "И",
 
["[ӣѝ]"] = "и",
 
["[Ӯ]"] = "У",
 
["[ӯ]"] = "у"
 
 
},
 
},
},
+
["sh"] = {
["sla-pro"] = {
+
["article"] = "Serbo-Croatian language",
["name"] = "Proto-Slavic", -- also Common Slavic
+
-- ["scripts"] = { "Latn", "Cyrl" },
["type"] = "reconstructed",
+
["replacements"] = {
["scripts"] = { "Latn" },
+
decompose = true,
["replacements"] = {
+
from =  { "([AaEeIiOoUuRrАаЕеИиОоУуРр])[" .. double_grave
["[ÀÁÃĀȀȂ]"] = "A",
+
.. grave .. invbreve .. acute .. macron .. tilde .. "]" },
["[àáãāȁȃ]"] = "a",
+
to  = { "%1" },
["[ÈÉẼĒȄȆ]"] = "E",
+
},
["[èéẽēȅȇ]"] = "e",
+
},
["[ÌÍĨĪȈȊ]"] = "I",
+
["sl"] = {
["[ìíĩīȉȋ]"] = "i",
+
["name"] = "Slovene",
["[ÒÓÕŌȌȎŐ]"] = "O",  
+
["replacements"] = {
["[òóõōȍȏő]"] = "o",
+
-- remove tonal orthography
["[ÙÚŨŪȔȖŰ]"] = "U",
+
["[ÁÀÂȂȀ]"] = "A",
["[ùúũūȕȗű]"] = "u",
+
["[áàâȃȁ]"] = "a",
["[ỲÝỸȲ]"] = "Y",
+
["[ÉÈÊȆȄỆẸ]"] = "e",
["[ỳýỹȳ]"] = "y",
+
["[éèêȇȅệẹə]"] = "e",
["Ǭ"] = "Ǫ",
+
["[ÍÌÎȊȈ]"] = "I",
["ǭ"] = "ǫ",
+
["[íìîȋȉ]"] = "i",
["[" .. grave .. acute .. double_acute .. tilde .. macron .. dgrave .. invbreve .. "]"] = "",
+
["[ÓÒÔȎȌỘỌ]"] = "O",
 +
["[óòôȏȍộọ]"] = "o",
 +
["[ŔȒȐ]"] = "R",
 +
["[ŕȓȑ]"] = "r",
 +
["[ÚÙÛȖȔ]"] = "U",
 +
["[úùûȗȕ]"] = "u",
 +
["ł"] = "l",
 +
},
 +
},
 +
["sla-pro"] = {
 +
["name"] = "Proto-Slavic", -- also Common Slavic
 +
["type"] = "reconstructed",
 +
-- ["scripts"] = { "Latn" },
 +
["replacements"] = {
 +
["[ÀÁÃĀȀȂ]"] = "A",
 +
["[àáãāȁȃ]"] = "a",
 +
["[ÈÉẼĒȄȆ]"] = "E",
 +
["[èéẽēȅȇ]"] = "e",
 +
["[ÌÍĨĪȈȊ]"] = "I",
 +
["[ìíĩīȉȋ]"] = "i",
 +
["[ÒÓÕŌȌȎŐ]"] = "O",  
 +
["[òóõōȍȏő]"] = "o",
 +
["[ÙÚŨŪȔȖŰ]"] = "U",
 +
["[ùúũūȕȗű]"] = "u",
 +
["[ỲÝỸȲ]"] = "Y",
 +
["[ỳýỹȳ]"] = "y",
 +
["Ǭ"] = "Ǫ",
 +
["ǭ"] = "ǫ",
 +
["[" .. grave .. acute .. double_acute .. tilde .. macron .. double_grave .. invbreve .. "]"] = "",
 +
["ĭ"] = "ь",
 +
["ŭ"] = "ъ",
 
},
 
},
 
},
 
},
["uk"] = {
+
["uk"] = {
["article"] = "Ukrainian language",
+
["article"] = "Ukrainian language",
["scripts"] = { "Cyrl" },
+
-- ["scripts"] = { "Cyrl" },
-- Combining acute accent is removed.
+
-- Combining acute accent is removed.
["replacements"] = { [U(0x0301)] = "", }
+
["replacements"] = { [U(0x0301)] = "", }
 
},
 
},
["ur"] = {
+
["ur"] = {
["name"] = "Urdu",
+
["name"] = "Urdu",
["article"] = "Urdu",
+
["article"] = "Urdu",
["scripts"] = { "Arab" },
+
-- ["scripts"] = { "Arab" },
 
},
 
},
["zh"] = {
+
["zh"] = {
["name"] = "Chinese",
+
["name"] = "Chinese",
["article"] = "Chinese language",
+
["article"] = "Chinese language",
["scripts"] = { "Hani" },
+
-- ["scripts"] = { "Hani" },
 
},
 
},
["xcl"] = {
+
["xcl"] = {
["name"] = "Old Armenian",
+
["name"] = "Old Armenian",
["article"] = "Classical Armenian",
+
["article"] = "Classical Armenian",
["script"] = { "Armn" },
+
-- ["scripts"] = { "Armn" },
["replacements"] = {
+
["replacements"] = {
["[՞՜՛՟]"] = "",
+
["[՞՜՛՟]"] = "",
["և"] = "եւ",
+
["և"] = "եւ",
 
},
 
},
 
},
 
},
}
+
["xvn"] = {
 
+
["name"] = "Vandalic",
 +
["article"] = "Vandalic language",
 +
-- ["scripts"] = { "Latn" },
 +
},
 
--[[
 
--[[
 
+
[""] = {
[""] = {
+
["name"] = "",
["name"] = "",
+
["article"] = "",
["article"] = "",
+
-- ["scripts"] = { "" },
["script"] = { "" },
 
 
},
 
},
 
+
[""] = {
+
[""] = {
["name"] = "",
+
["name"] = "",
["article"] = "",
+
["article"] = "",
["script"] = { "" },
+
-- ["scripts"] = { "" },
["replacements"] = {
+
["replacements"] = {
 
},
 
},
 
},
 
},
  
 
]]
 
]]
 +
},
 +
 +
-- Here, keys (for example, "gem") are Wikipedia language codes used in
 +
-- {{lang}}, and values (for example, "gem-pro") are the equivalent Wiktionary
 +
-- code.
 +
-- Subtags are not currently supported.
 +
["redirects"] = {
 +
["aae"] = "sq",
 +
["aiq"] = "fa",
 +
["aln"] = "sq",
 +
["als"] = "sq",
 +
["azb"] = "az",
 +
["azj"] = "az",
 +
["bgn"] = "bal",
 +
["bs"] = "sh",
 +
["bxr"] = "bua",
 +
["cel-x-proto"] = "cel-pro",
 +
["ciw"] = "oj",
 +
["cnr"] = "sh",
 +
["fil"] = "tl",
 +
["fuf"] = "ff",
 +
["gem"] = "gem-pro", -- Not correct, but is commonly used.
 +
["gem-x-proto"] = "gem-pro",
 +
["hak"] = "zh",
 +
["hbo"] = "he",
 +
["hr"] = "sh",
 +
["ine"] = "ine-pro", -- Not correct, but might be commonly used.
 +
["ine-x-proto"] = "ine-pro",
 +
["nan"] = "zh",
 +
["prs"] = "fa",
 +
["rn"] = "rw",
 +
["sli"] = "gmw-ecg",
 +
["sr"] = "sh",
 +
["src"] = "sc",
 +
["sro"] = "sc",
 +
["tw"] = "ak",
 +
["wae"] = "gsw",
 +
["wep"] = "nds-de",
 +
["yue"] = "zh",
 +
["xno"] = "fro",
 +
},
 +
}
  
 
return data
 
return data

Latest revision as of 23:10, 13 August 2018

--- Language data for the Language module.
--
-- Returns a table with two sub-tables:
--   * "languages": keyed by Wiktionary language code. Each entry may carry
--     "name" (canonical Wiktionary name), "article" (Wikipedia article),
--     "type", "direction", "Wikipedia_code", "Wikipedia_name" and
--     "replacements".
--   * "redirects": maps Wikipedia language codes to the equivalent
--     Wiktionary code.
--
-- "replacements" appears in two shapes in this file:
--   1. pattern = replacement pairs, e.g. ["[Bb]"] = "x";
--   2. a table with `decompose`, `from` and (optionally) `to` fields.
-- NOTE(review): how each shape is applied is decided by the consuming module,
-- which is not visible here. Presumably a `from` pattern with no matching
-- `to` entry is deleted -- confirm against the caller.

-- Shorthand for building strings from Unicode code points.
local U = mw.ustring.char

-- Diacritics, from the Combining Diacritical Marks block.
local grave        = U(0x300)
local acute        = U(0x301)
local circumflex   = U(0x302)
local tilde        = U(0x303)
local macron       = U(0x304)
local breve        = U(0x306)
local dot          = U(0x307)
local diaeresis    = U(0x308)
local double_acute = U(0x30B)
local double_grave = U(0x30F)
local invbreve     = U(0x311)
local undertie     = U(0x35C)

--[[
This is a table of Wiktionary language codes with data belonging to them.
Name is the "canonical name" used on Wiktionary.
Article is the Wikipedia article.
Script is the ISO 15924 code.
]]
local data = {
	["languages"] = {
		["ab"] = {
			["name"] = "Abkhaz",
		},
		["ang"] = {
			["name"] = "Old English",
			-- NOTE(review): every other entry stores "article" as a plain
			-- string; this one is a table. Left as-is -- confirm with caller.
			["article"] = {"Old English"},
			-- ["scripts"] = {"Latn"},
			-- Remove macrons, acutes, and overdots
			["replacements"] = {
				decompose = true,
				from = { "[" .. macron .. acute .. dot .. "]" },
			},
		},
		["ar"] = {
			["name"] = "Arabic",
			["article"] = "Arabic language",
			-- ["scripts"] = { "Arab" },
			["direction"] = "rtl", -- Should be in the script data module.
			["replacements"] = {
				-- ālif with wasla is replaced by ālif;
				[U(0x0671)] = U(0x0627),
				-- taṭwīl, fatḥatan, ḍammatan, kasratan,
				-- fatḥa, ḍamma, kasra,
				-- shadda, sukūn, and superscript (dagger) ālif are removed.
				["["..U(0x0640)..U(0x064B)..U(0x064C)..U(0x064D)
					..U(0x064E)..U(0x064F)..U(0x0650)
					..U(0x0651)..U(0x0652)..U(0x0670).."]"] = "",
			},
		},
		["av"] = {
			["name"] = "Avar"
		},
		["be"] = {
			["article"] = "Belarusian language",
			-- ["scripts"] = { "Cyrl" },
			-- Combining acute accent is removed.
			["replacements"] = { [U(0x0301)] = "", },
		},
		["bn"] = {
			["name"] = "Bengali",
			["article"] = "Bengali language",
			-- ["scripts"] = { "Beng" },
		},
		["bua"] = {
			["name"] = "Buryat",
		},
		["cel-pro"] = {
			["name"] = "Proto-Celtic",
			["Wikipedia_code"] = "cel-x-proto",
		},
		["cu"] = {
			["name"] = "Old Church Slavonic",
			["article"] = "Old Church Slavonic",
			-- ["scripts"] = { "Cyrs" },
		},
		["de"] = {
			["name"] = "German",
			["article"] = "German language",
			-- ["scripts"] = { "Latn" },
			--[[
			["replacements"] = {
				["ae"]    = "ä",
				["oe"]    = "ö",
				["ue"]    = "ü",
				["A[Ee]"] = "Ä",
				["O[Ee]"] = "Ö",
				["U[Ee]"] = "Ü",
			},
			]]
		},
		["en"] = {
			["name"] = "English",
			["article"] = "English language",
			-- ["scripts"] = { "Latn" },
		},
		["es"] = {
			["name"] = "Spanish",
			["article"] = "Spanish language",
			-- ["scripts"] = { "Latn" },
		},
		["egy"] = {
			["name"] = "Egyptian",
		},
		["fr"] = {
			["name"] = "French",
			["article"] = "French language",
			-- ["scripts"] = { "Latn" },
		},
		["frm"] = {
			["name"] = "Middle French",
			["article"] = "Middle French",
			-- ["scripts"] = { "Latn" },
		},
		["frp"] = {
			["name"] = "Franco-Provençal",
		},
		["ff"] = {
			["name"] = "Fula",
		},
		["gem-pro"] = {
			["name"] = "Proto-Germanic",
			["article"] = "Proto-Germanic language",
			-- ["scripts"] = { "Latn" },
			["type"] = "reconstructed",
			["replacements"] = {},
			["Wikipedia_code"] = "gem-x-proto",
		},
		["gmw-ecg"] = {
			["name"] = "East Central German",
		},
		["got"] = {
			["name"] = "Gothic",
			["article"] = "Gothic language",
			-- ["scripts"] = { "Goth" },
			["replacements"] = {
				-- Latin to Gothic since people will not want to have to copy
				-- and paste Gothic letters in
				["[AÁaáĀā]"] = "𐌰",
				["[Bb]"]     = "𐌱",
				["[Gg]"]     = "𐌲",
				["[Dd]"]     = "𐌳",
				["[EeĒē]"]   = "𐌴",
				["[Qq]"]     = "𐌵",
				["[Zz]"]     = "𐌶",
				["[Hh]"]     = "𐌷",
				["[Þþ]"]     = "𐌸",
				["[IiÍí]"]   = "𐌹",
				["[Kk]"]     = "𐌺",
				["[Ll]"]     = "𐌻",
				["[Mm]"]     = "𐌼",
				["[Nn]"]     = "𐌽",
				["[Jj]"]     = "𐌾",
				["[UuÚúŪū]"] = "𐌿",
				["[Pp]"]     = "𐍀",
				["[Rr]"]     = "𐍂",
				["[Ss]"]     = "𐍃",
				["[Tt]"]     = "𐍄",
				["[WwYy]"]   = "𐍅",
				["[Ff]"]     = "𐍆",
				["[Xx]"]     = "𐍇",
				["[Ƕƕ]"]     = "𐍈", -- Not sure if "hw" and "hv" can safely be converted
				["[OoŌō]"]   = "𐍉",
			},
		},
		["gsw"] = {
			["name"] = "Alemannic German",
		},
		["grc"] = {
			["name"] = "Ancient Greek",
			["article"] = "Ancient Greek",
			-- ["scripts"] = { "Grek" },
			["replacements"] = {
				decompose = true,
				-- NOTE(review): `from` has one more entry than `to`; the
				-- trailing diacritic class presumably maps to nothing (i.e.
				-- is removed) -- confirm against the consuming module.
				from = {
					-- Replace variant letterforms with standard ones.
					"ϐ", "ϵ", "ϑ", "ϰ", "ϱ", "ϲ", "ϕ",
					-- Remove macrons and breves.
					"[" .. macron .. breve .. undertie .. "]"
				},
				to = {
					"β", "ε", "θ", "κ", "ρ", "σ", "φ",
				}
			},
		},
		["grk-pro"] = {
			["name"] = "Proto-Hellenic",
			["Wikipedia_name"] = "Proto-Greek",
			["article"] = "Proto-Greek language",
			-- ["scripts"] = { "Latn" },
			["type"] = "reconstructed",
			["replacements"] = {},
		},
		["ha"] = {
			["name"] = "Hausa",
			-- remove tilde, grave, acute, macron, circumflex
			["replacements"] = {
				decompose = true,
				from = { "[" .. grave .. circumflex .. macron .. acute .. tilde .. "]" },
			},
		},
		["hi"] = {
			["name"] = "Hindi",
			["article"] = "Hindi",
			-- ["scripts"] = { "Deva" },
		},
		["ine-pro"] = {
			["name"] = "Proto-Indo-European",
			["article"] = "Proto-Indo-European language",
			-- ["scripts"] = { "Latn" },
			["type"] = "reconstructed",
			["replacements"] = {},
			["Wikipedia_code"] = "ine-x-proto",
		},
		["ja"] = {
			["name"] = "Japanese",
			["article"] = "Japanese language",
			-- ["scripts"] = { "Jpan" },
		},
		["jbo"] = { -- Lojban
			["type"] = "appendix",
		},
		["la"] = {
			["name"] = "Latin",
			["article"] = "Latin",
			-- ["scripts"] = { "Latn" },
			["replacements"] = {
				-- Remove macrons, breves, and diaereses.
				decompose = true,
				from = { "[" .. macron .. breve .. diaeresis .. "]" },
			},
		},
		["lt"] = {
			["name"] = "Lithuanian",
			-- remove acute, tilde, grave
			["replacements"] = {
				decompose = true,
				from = { "[" .. acute .. tilde .. grave .. "]" },
			},
		},
		["moe"] = {
			["name"] = "Cree",
		},
		["mul"] = {
			["name"] = "Translingual",
			["article"] = "",
			-- ["scripts"] = { "" },
		},
		["nci"] = {
			["name"] = "Classical Nahuatl",
			["article"] = "Classical Nahuatl",
			-- ["scripts"] = {"Latn"},
			["replacements"] = {
				decompose = true,
				-- Remove macrons, acutes, circumflexes, graves, and saltillo;
				-- see the Wikipedia article "Saltillo (linguistics)".
				from = { "[" .. grave .. acute .. macron .. circumflex .. "Ꞌꞌʻʼ'ʔ]" },
			},
		},
		["nds-de"] = {
			["name"] = "German Low German",
		},
		["oj"] = {
			["name"] = "Ojibwe",
		},
		["orv"] = {
			["name"] = "Old East Slavic",
			["article"] = "Old East Slavic",
			-- ["scripts"] = { "Cyrs" },
			["replacements"] = {
				[U(0x484)] = "",
			},
		},
		["pt"] = {
			["name"] = "Portuguese",
			["article"] = "Portuguese language",
			-- ["scripts"] = { "Latn" },
		},
		["pa"] = {
			["name"] = "Punjabi",
			["article"] = "Punjabi language",
			-- ["scripts"] = { "Guru", "Arab", },
		},
		["ru"] = {
			["name"] = "Russian",
			["article"] = "Russian language",
			-- ["scripts"] = { "Cyrl" },
			-- Combining acute accent is removed.
			["replacements"] = { [U(0x0301)] = "", },
		},
		["rw"] = {
			["name"] = "Rwanda-Rundi",
		},
		["se"] = {
			["replacements"] = {
				["([đflmnŋrsšŧv])'%1"] = "%1%1",
			},
		},
		["sh"] = {
			["article"] = "Serbo-Croatian language",
			-- ["scripts"] = { "Latn", "Cyrl" },
			["replacements"] = {
				decompose = true,
				from = { "([AaEeIiOoUuRrАаЕеИиОоУуРр])[" .. double_grave
					.. grave .. invbreve .. acute .. macron .. tilde .. "]" },
				to = { "%1" },
			},
		},
		["sl"] = {
			["name"] = "Slovene",
			["replacements"] = {
				-- remove tonal orthography
				["[ÁÀÂȂȀ]"] = "A",
				["[áàâȃȁ]"] = "a",
				-- Upper-case E variants keep their case, like every other
				-- upper-case class in this table (previously mapped to "e").
				["[ÉÈÊȆȄỆẸ]"] = "E",
				["[éèêȇȅệẹə]"] = "e",
				["[ÍÌÎȊȈ]"] = "I",
				["[íìîȋȉ]"] = "i",
				["[ÓÒÔȎȌỘỌ]"] = "O",
				["[óòôȏȍộọ]"] = "o",
				["[ŔȒȐ]"] = "R",
				["[ŕȓȑ]"] = "r",
				["[ÚÙÛȖȔ]"] = "U",
				["[úùûȗȕ]"] = "u",
				["ł"] = "l",
			},
		},
		["sla-pro"] = {
			["name"] = "Proto-Slavic", -- also Common Slavic
			["type"] = "reconstructed",
			-- ["scripts"] = { "Latn" },
			["replacements"] = {
				["[ÀÁÃĀȀȂ]"] = "A",
				["[àáãāȁȃ]"] = "a",
				["[ÈÉẼĒȄȆ]"] = "E",
				["[èéẽēȅȇ]"] = "e",
				["[ÌÍĨĪȈȊ]"] = "I",
				["[ìíĩīȉȋ]"] = "i",
				["[ÒÓÕŌȌȎŐ]"] = "O",
				["[òóõōȍȏő]"] = "o",
				["[ÙÚŨŪȔȖŰ]"] = "U",
				["[ùúũūȕȗű]"] = "u",
				["[ỲÝỸȲ]"] = "Y",
				["[ỳýỹȳ]"] = "y",
				["Ǭ"] = "Ǫ",
				["ǭ"] = "ǫ",
				["[" .. grave .. acute .. double_acute .. tilde .. macron .. double_grave .. invbreve .. "]"] = "",
				["ĭ"] = "ь",
				["ŭ"] = "ъ",
			},
		},
		["uk"] = {
			["article"] = "Ukrainian language",
			-- ["scripts"] = { "Cyrl" },
			-- Combining acute accent is removed.
			["replacements"] = { [U(0x0301)] = "", }
		},
		["ur"] = {
			["name"] = "Urdu",
			["article"] = "Urdu",
			-- ["scripts"] = { "Arab" },
		},
		["zh"] = {
			["name"] = "Chinese",
			["article"] = "Chinese language",
			-- ["scripts"] = { "Hani" },
		},
		["xcl"] = {
			["name"] = "Old Armenian",
			["article"] = "Classical Armenian",
			-- ["scripts"] = { "Armn" },
			["replacements"] = {
				["[՞՜՛՟]"] = "",
				["և"] = "եւ",
			},
		},
		["xvn"] = {
			["name"] = "Vandalic",
			["article"] = "Vandalic language",
			-- ["scripts"] = { "Latn" },
		},
		-- Blank templates for adding new entries.
		--[[
		[""] = {
			["name"] = "",
			["article"] = "",
			-- ["scripts"] = { "" },
		},

		[""] = {
			["name"] = "",
			["article"] = "",
			-- ["scripts"] = { "" },
			["replacements"] = {
			},
		},

		]]
	},

	-- Here, keys (for example, "gem") are Wikipedia language codes used in
	-- {{lang}}, and values (for example, "gem-pro") are the equivalent Wiktionary
	-- code.
	-- Subtags are not currently supported.
	["redirects"] = {
		["aae"] = "sq",
		["aiq"] = "fa",
		["aln"] = "sq",
		["als"] = "sq",
		["azb"] = "az",
		["azj"] = "az",
		["bgn"] = "bal",
		["bs"] = "sh",
		["bxr"] = "bua",
		["cel-x-proto"] = "cel-pro",
		["ciw"] = "oj",
		["cnr"] = "sh",
		["fil"] = "tl",
		["fuf"] = "ff",
		["gem"] = "gem-pro", -- Not correct, but is commonly used.
		["gem-x-proto"] = "gem-pro",
		["hak"] = "zh",
		["hbo"] = "he",
		["hr"] = "sh",
		["ine"] = "ine-pro", -- Not correct, but might be commonly used.
		["ine-x-proto"] = "ine-pro",
		["nan"] = "zh",
		["prs"] = "fa",
		["rn"] = "rw",
		["sli"] = "gmw-ecg",
		["sr"] = "sh",
		["src"] = "sc",
		["sro"] = "sc",
		["tw"] = "ak",
		["wae"] = "gsw",
		["wep"] = "nds-de",
		["yue"] = "zh",
		["xno"] = "fro",
	},
}

return data