Modul:Language/scripts/data generation

-- Export table of this module; the public functions are attached to it and it
-- is returned at the end of the file.
local p = {}


-- Options controlling how the parsed ranges are post-processed.
-- verbatim: keep every range exactly as parsed; nothing is merged.
local verbatim			= true
-- exactlyAdjacent: merge only exactly adjacent ranges of the same script.
local exactlyAdjacent	= false
-- When both are false, neighboring ranges of the same script are merged even
-- if there are codepoints between them.

-- Format used for the numeric keys of the "ranges" table. If either option is
-- true, there will be more than 1000 keys in that table, so the keys are
-- padded to a width of four instead of three.
local numberKeyFormat
if verbatim or exactlyAdjacent then
	numberKeyFormat = "%4d"
else
	numberKeyFormat = "%3d"
end


local sortedPairs = require("Module:table").sortedPairs
local isArray = require("Module:table").isArray

-- Return the value of the first entry that next() yields for table t, or nil
-- when t is empty. Which entry comes first is whatever next() decides.
local function nextVal(t)
	-- The outer parentheses truncate the result to exactly one value, so an
	-- empty table gives a single nil rather than no value at all.
	return (select(2, next(t)))
end

-- Format a codepoint for printing: "0x" followed by the number in uppercase
-- hexadecimal, zero-padded to at least five digits.
local function toHex(number)
	return string.format("0x%05X", number)
end

-- Parse a string of hexadecimal digits into a number.
-- Returns nil when the string cannot be read as base 16.
local function fromHex(str)
	local codepoint = tonumber(str, 16)
	return codepoint
end

-- Wrap content in a <syntaxhighlight lang="lua"> extension tag, expanded
-- through the current frame, and return whatever extensionTag returns.
local function highlight(content)
	local frame = mw.getCurrentFrame()
	local tag = {
		name = "syntaxhighlight",
		content = content,
		args = { lang = "lua" },
	}
	return frame:extensionTag(tag)
end

-- Arrays with at most this many elements are printed on a single line.
local maxSizeForPrintedArray = 3

--- Serialize a value as Lua source text.
--
-- Strings become quoted string literals, numbers are printed as hexadecimal
-- codepoints via toHex, short arrays are printed on one line, and every other
-- table is printed with one "[key] = value," entry per line in the order given
-- by sortedPairs. Any other type is passed through tostring.
--
-- @param val     the value to serialize
-- @param indent  indentation of the enclosing table, or nil at the top level
-- @return        the Lua source text as a string
local function dump(val, indent)
	local ty = type(val)
	indent = indent and indent .. "\t" or "\t"

	if ty == "string" then
		-- %q escapes quotes, backslashes and newlines, so the output stays
		-- valid Lua whatever the string contains. For plain strings the result
		-- is the same as wrapping the text in double quotes.
		return ("%q"):format(val)
	elseif ty == "number" then
		return toHex(val)
	elseif ty ~= "table" then
		return tostring(val)
	end

	-- Short arrays go on a single line. Their elements are dumped without an
	-- indent because they are not expected to be multi-line tables.
	if not val[maxSizeForPrintedArray + 1] and isArray(val) then
		local items = {}
		for i, v in ipairs(val) do
			items[i] = dump(v)
		end
		return "{ " .. table.concat(items, ", ") .. "}"
	end

	-- Keys are normally dumped like values (numbers in hex). When the values
	-- are themselves tables (as in the "ranges" array), numeric keys are
	-- instead printed as padded decimal indices and other keys as quoted
	-- strings.
	local formatKey = dump
	if type(nextVal(val)) == "table" then
		formatKey = function(key)
			if type(key) == "number" then
				return numberKeyFormat:format(key)
			end
			return ("%q"):format(key)
		end
	end

	local lines = { "{" }
	for k, v in sortedPairs(val) do
		lines[#lines + 1] = indent .. "[" .. formatKey(k) .. "] = " .. dump(v, indent) .. ","
	end
	-- The closing brace sits one level shallower than the entries.
	lines[#lines + 1] = indent:sub(2) .. "}"
	return table.concat(lines, "\n")
end

-- Merge each range into the one before it when both carry the same script
-- code. With exactlyAdjacent set, only ranges that touch (previous end + 1 ==
-- next start) are merged; otherwise ranges are merged even if there are
-- codepoints between them. Those codepoints probably belong to Zzzz (uncoded
-- script), which probably does not matter for the wiki's purposes.
-- Expects `ranges` sorted by starting codepoint; returns a new array and
-- leaves the input untouched.
local function mergeRanges(ranges)
	local merged, count = {}, 0
	for _, range in ipairs(ranges) do
		local last = merged[count]
		if last
			and last[3] == range[3]
			and (not exactlyAdjacent or last[2] == range[1] - 1)
		then
			-- Extend the previous range instead of starting a new one.
			last[2] = range[2]
		else
			count = count + 1
			merged[count] = { range[1], range[2], range[3] }
		end
	end
	return merged
end

--- Parse the Unicode script data stored on the documentation subpage and
-- return it as syntax-highlighted Lua source.
--
-- The doc page must contain two HTML comments. The first holds lines of the
-- form "XXXX..YYYY ; Script_Name" or "XXXX ; Script_Name"; the second holds
-- lines of the form "sc ; Code ; Script_Name" mapping script names to codes.
-- (Presumably these are copied from Unicode's Scripts.txt and
-- PropertyValueAliases.txt -- confirm against the doc page.)
--
-- @param frame  the invoking frame (unused)
-- @return       the output of highlight() for a table with two fields:
--               "ranges", an array of { first, last, scriptCode } sorted by
--               first codepoint, and "individual", a map from single
--               codepoints to script codes
-- Raises an error if the doc page cannot be read or lacks the two comments.
function p.parseUnicodeScripts(frame)
	local docTitle = "Module:language/scripts/data generation/doc"
	local title = mw.title.new(docTitle)
	local content = title and title:getContent()
	if not content then
		error("Could not read the content of " .. docTitle)
	end

	local _, firstCommentEnd, scriptData = content:find("<!%-%-(.-)%-%->")
	local scriptCodeData = firstCommentEnd
		and content:match("<!%-%-(.-)%-%->", firstCommentEnd + 1)
	if not scriptCodeData then
		error("Expected two HTML comments (script ranges, then script codes) in " .. docTitle)
	end

	-- Script names are matched with [%w_]+ rather than %w+: multi-word names
	-- are joined with underscores, and stopping at the first underscore would
	-- make names that share a first word collide on the same key.
	local scriptNameToCode = {}
	for code, name in scriptCodeData:gmatch("\nsc +; +(%a+) +; +([%w_]+)") do
		scriptNameToCode[name] = code
	end

	local ranges, individual = {}, {}
	local rangeCount = 0
	-- Only lines that start with a hex digit carry data.
	for line in scriptData:gmatch("\n%x[^\n]+") do
		local lower, higher, rangeScript = line:match("(%x+)%.%.(%x+) +; +([%w_]+)")
		if lower then
			rangeCount = rangeCount + 1
			ranges[rangeCount] = { fromHex(lower), fromHex(higher), scriptNameToCode[rangeScript] }
		else
			local codepoint, scriptName = line:match("(%x+) +; +([%w_]+)")
			if codepoint then
				individual[fromHex(codepoint)] = scriptNameToCode[scriptName]
			end
		end
	end

	table.sort(ranges, function(a, b)
		return a[1] < b[1]
	end)

	if not verbatim then
		ranges = mergeRanges(ranges)
	end

	return highlight(dump({ ranges = ranges, individual = individual }))
end

-- Hand the export table to whoever loads this module.
return p