-- Modul:Language/scripts/data generation
-- Export table returned at the end of the module.
local p = {}
-- Options controlling how the parsed script ranges are post-processed.
-- "verbatim" takes precedence: while it is true no merging happens at all and
-- "exactlyAdjacent" has no effect.
local verbatim = true -- Don't merge any ranges.
local exactlyAdjacent = false -- Only merge exactly adjacent ranges of the same script.
-- If both are false, neighboring ranges of the same script are merged even if
-- there are codepoints between them.
-- If either verbatim or exactlyAdjacent is true, there will be more than
-- 1000 keys in the "ranges" table, so numeric keys are padded to four
-- characters instead of three when they are printed.
local numberKeyFormat = (verbatim or exactlyAdjacent) and "%4d" or "%3d"
local sortedPairs = require("Module:table").sortedPairs
local isArray = require("Module:table").isArray
-- Return the value of whichever entry next() yields first for table t,
-- or nil when t is empty. The key is discarded.
local function nextVal(t)
	local value = select(2, next(t))
	return value
end
-- For printing codepoints.
-- Format a codepoint as an uppercase hexadecimal literal, zero-padded to at
-- least five digits (for example 255 -> "0x000FF").
local function toHex(number)
	return string.format("0x%05X", number)
end
-- Parse a string of hexadecimal digits into a number.
-- Yields nil when the string is not valid base-16.
local function fromHex(str)
	local hexBase = 16
	return tonumber(str, hexBase)
end
-- Wrap content in a <syntaxhighlight lang="lua"> extension tag using the
-- current frame and return whatever the frame produces for that tag.
local function highlight(content)
	local tag = {
		name = "syntaxhighlight",
		content = content,
		args = { lang = "lua" },
	}
	return mw.getCurrentFrame():extensionTag(tag)
end
-- Arrays with at most this many elements are printed on a single line.
local maxSizeForPrintedArray = 3

-- Serialize a value as Lua source text.
--   val:    the value to print.
--   indent: indentation of the enclosing table (nil at the top level); one
--           tab is appended for the entries of a multi-line table.
-- Strings are wrapped in double quotes without escaping, numbers are printed
-- through toHex, and any other non-table value goes through tostring.
-- Short arrays are printed inline; every other table is printed with one
-- "[key] = value," entry per line, in the order given by sortedPairs.
local function dump(val, indent)
	local kind = type(val)
	if kind == "string" then
		return '"' .. val .. '"'
	elseif kind == "number" then
		return toHex(val)
	elseif kind ~= "table" then
		return tostring(val)
	end

	-- Short array: print inline. Elements are not checked for nested tables.
	if not val[maxSizeForPrintedArray + 1] and isArray(val) then
		local items = {}
		for _, item in ipairs(val) do
			items[#items + 1] = dump(item)
		end
		return "{ " .. table.concat(items, ", ") .. "}"
	end

	local childIndent = (indent or "") .. "\t"

	-- Keys are normally printed like values (numbers in hex). When the table
	-- holds tables (the "ranges" array, or the table that contains it),
	-- numeric keys are printed as padded decimals and other keys are quoted.
	local formatKey = dump
	if type(nextVal(val)) == "table" then
		formatKey = function(key)
			if type(key) == "number" then
				return (numberKeyFormat):format(key)
			end
			return '"' .. key .. '"'
		end
	end

	local lines = { "{" }
	for key, value in sortedPairs(val) do
		lines[#lines + 1] = childIndent .. "[" .. formatKey(key) .. "] = " .. dump(value, childIndent) .. ","
	end
	-- The closing brace sits one level shallower than the entries.
	lines[#lines + 1] = childIndent:sub(2) .. "}"
	return table.concat(lines, "\n")
end
-- Collapse neighboring ranges that carry the same script code.
--   ranges:       array of { first, last, scriptCode }, sorted by first.
--   adjacentOnly: when true, two ranges are merged only if the second starts
--                 exactly one codepoint after the first ends; when false they
--                 are merged even if there are codepoints between them.
-- Returns a new array; the input tables are not modified.
local function mergeRanges(ranges, adjacentOnly)
	local merged = {}
	local last
	for _, range in ipairs(ranges) do
		if last and last[3] == range[3] and (not adjacentOnly or last[2] == range[1] - 1) then
			-- For debugging:
			-- mw.log(toHex(last[1]) .. "–" .. toHex(last[2]) .. " + " .. toHex(range[1]) .. "–" .. toHex(range[2]) .. ": " .. tostring(range[3]))
			last[2] = range[2]
		else
			last = { range[1], range[2], range[3] }
			merged[#merged + 1] = last
		end
	end
	return merged
end

-- Build the script data table from the text stored on the documentation page
-- and return it as syntax-highlighted Lua source.
--
-- The documentation page must contain two HTML comments. The first holds the
-- codepoint-to-script-name lines ("XXXX..YYYY ; Name" or "XXXX ; Name"); the
-- second holds the script code lines ("sc ; Code ; Name").
-- NOTE(review): presumably these are the contents of Unicode's Scripts.txt
-- and PropertyValueAliases.txt; confirm against the documentation page.
--
--   frame: the frame object supplied by the caller; not used.
-- Returns the output of highlight() for the dumped table, which has the
-- fields "ranges" (array of { first, last, scriptCode }) and "individual"
-- (map from codepoint to scriptCode).
-- Raises an error if the documentation page cannot be read or does not
-- contain both comments.
function p.parseUnicodeScripts(frame)
	local docTitle = "Module:language/scripts/data generation/doc"
	local title = mw.title.new(docTitle)
	local content = title and title:getContent()
	if not content then
		error("Could not read the content of [[" .. docTitle .. "]]")
	end

	local _, firstCommentEnd, scriptData = content:find("<!%-%-(.-)%-%->")
	local scriptCodeData = firstCommentEnd
		and content:match("<!%-%-(.-)%-%->", firstCommentEnd + 1)
	if not scriptCodeData then
		error("Expected two HTML comments (script ranges, then script codes) in [[" .. docTitle .. "]]")
	end

	-- Script names may contain underscores (e.g. "Old_Italic"). "%w" does not
	-- match "_", so the name must be captured with "[%w_]+"; otherwise every
	-- name sharing a first word would collapse onto the same key.
	local scriptNameToCode = {}
	for code, name in scriptCodeData:gmatch("\nsc +; +(%a+) +; +([%w_]+)") do
		scriptNameToCode[name] = code
	end

	local ranges, individual = {}, {}
	for line in scriptData:gmatch("\n%x[^\n]+") do
		local lower, higher, scriptName = line:match("(%x+)%.%.(%x+) +; +([%w_]+)")
		if lower then
			-- A name missing from scriptNameToCode leaves the third slot nil.
			ranges[#ranges + 1] = { fromHex(lower), fromHex(higher), scriptNameToCode[scriptName] }
		else
			local codepoint, singleScriptName = line:match("(%x+) +; +([%w_]+)")
			if codepoint then
				individual[fromHex(codepoint)] = scriptNameToCode[singleScriptName]
			end
		end
	end

	table.sort(ranges, function(range1, range2)
		return range1[1] < range2[1]
	end)

	if not verbatim then
		-- Merge neighboring ranges if they belong to the same script.
		-- Unless exactlyAdjacent is set, do this even if there are codepoints
		-- between them. Those codepoints probably belong to Zzzz (uncoded
		-- script), which probably does not matter for Wikipedia's purposes.
		ranges = mergeRanges(ranges, exactlyAdjacent)
	end

	return highlight(dump({ ranges = ranges, individual = individual }))
end
return p