-- Module:Unicode data

-- Export table: everything assigned to "p" is returned as the module's
-- public interface.
local p = {}

-- Local alias for math.floor, used by the search and lookup functions below.
local floor = math.floor

-- Raise an error whose message is built with string.format.
--
-- errorf(level, fmt, ...): "level" is a stack level as understood by error();
--   one is added to it so that errorf itself is skipped.
-- errorf(fmt, ...): when the first argument is not a number it is taken to be
--   the format string, and the error is attributed to errorf's caller.
--
-- This function never returns normally.
local function errorf(level, ...)
	if type(level) == "number" then
		return error(string.format(...), level + 1)
	else -- level is actually the format string.
		return error(string.format(level, ...), 2)
	end
end

-- Binary search for the range that contains "codepoint".
--
-- "ranges" is an array of arrays of the form { low, high, ... }. For the
-- search to be correct the array must be sorted by "low" and the ranges must
-- not overlap. The number of ranges is taken from ranges.length if present;
-- otherwise it is computed with the length function of [[Module:TableTools]].
--
-- Returns the matching range and its index. If no range matches, returns nil
-- and the index of the last range that was examined (nil if "ranges" is
-- empty); memo_lookup uses that index to work out the gap the code point
-- fell into.
local function binary_range_search(codepoint, ranges)
	local low, mid, high
	low, high = 1, ranges.length or require "Module:TableTools".length(ranges)
	while low <= high do
		mid = floor((low + high) / 2)
		local range = ranges[mid]
		if codepoint < range[1] then
			high = mid - 1
		elseif codepoint <= range[2] then
			return range, mid
		else
			low = mid + 1
		end
	end
	return nil, mid
end

-- Export the search function so that other modules can reuse it.
p.binary_range_search = binary_range_search

--[[
local function linear_range_search(codepoint, ranges)
	for i, range in ipairs(ranges) do
		if range[1] <= codepoint and codepoint <= range[2] then
			return range
		end
	end
end
--]]

-- Load a module by indexing "loader" with the name of the module minus the
-- "Module:Unicode data/" part. For instance, loader.blocks returns
-- [[Module:Unicode data/blocks]]. If a module cannot be loaded, false will be
-- returned.
--
-- The result (including false) is stored in the table under the same key, so
-- the __index metamethod runs at most once per subpage name.
local loader = setmetatable({}, {
	__index = function (self, key)
		-- pcall keeps a missing or broken data module from raising an error.
		local success, data = pcall(mw.loadData, "Module:Unicode data/" .. key)
		if not success then
			data = false
		end
		self[key] = data
		return data
	end
})

-- Ranges of code points whose names (or code point labels) are generated
-- rather than stored in the names data modules. Each entry is
-- { low, high, generator }, where the generator is either a format string
-- that receives the code point, or a function that is called with it.
--
-- This table is searched with binary_range_search, so the entries MUST stay
-- sorted by code point and must not overlap.
--
-- For the algorithm used to generate Hangul Syllable names,
-- see "Hangul Syllable Name Generation" in section 3.12 of the
-- Unicode Specification:
-- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf
local name_hooks = {
	{     0x00,     0x1F, "<control-%04X>" }, -- C0 control characters
	{     0x7F,     0x9F, "<control-%04X>" }, -- DEL and C1 control characters
	{   0x3400,   0x4DBF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A
	{   0x4E00,   0x9FFF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph
	{   0xAC00,   0xD7A3, function (codepoint) -- Hangul Syllables
		local Hangul_data = loader.Hangul
		local syllable_index = codepoint - 0xAC00

		return ("HANGUL SYLLABLE %s%s%s"):format(
			Hangul_data.leads[floor(syllable_index / Hangul_data.final_count)],
			Hangul_data.vowels[floor((syllable_index % Hangul_data.final_count)
				/ Hangul_data.trail_count)],
			Hangul_data.trails[syllable_index % Hangul_data.trail_count]
		)
	end },
	-- High Surrogates, High Private Use Surrogates, Low Surrogates
	{   0xD800,   0xDFFF, "<surrogate-%04X>" },
	{   0xE000,   0xF8FF, "<private-use-%04X>" }, -- Private Use
	-- CJK Compatibility Ideographs
	{   0xF900,   0xFA6D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{   0xFA70,   0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{  0x17000,  0x187F7, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph
	{  0x18800,  0x18AFF, function (codepoint)
		return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF)
	end },
	{  0x18D00,  0x18D08, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph Supplement
	{  0x1B170,  0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu
	{  0x20000,  0x2A6DF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B
	{  0x2A700,  0x2B739, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C
	{  0x2B740,  0x2B81D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension D
	{  0x2B820,  0x2CEA1, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E
	{  0x2CEB0,  0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F
	-- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane)
	{  0x2F800,  0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{  0x30000,  0x3134A, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension G
	{  0x31350,  0x323AF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension H
	-- Kept after the plane 3 ranges: placing it earlier breaks the sort order
	-- and makes the binary search miss U+E0100-U+E01EF.
	{  0xE0100,  0xE01EF, function (codepoint) -- Variation Selectors Supplement
		return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17)
	end },
	{  0xF0000,  0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use
	{ 0x100000, 0x10FFFD, "<private-use-%04X>" }  -- Plane 16 Private Use
}
-- Stored so that binary_range_search does not have to compute the length.
name_hooks.length = #name_hooks

-- The name hook most recently matched by p.lookup_name, if any.
local name_range_cache

-- Produce a name from a name hook's generator.
-- "data" is either a format string, which is formatted with the code point,
-- or a function, which is called with the code point.
local function generate_name(data, codepoint)
	if type(data) == "string" then
		return data:format(codepoint)
	else
		return data(codepoint)
	end
end

--[[
-- Checks that the code point is a number and in range.
-- Does not check whether code point is an integer.
-- Not used
local function check_codepoint(funcName, argIdx, val)
	require 'libraryUtil'.checkType(funcName, argIdx, val, 'number')
	if codepoint < 0 or 0x10FFFF < codepoint then
		errorf("Codepoint %04X out of range", codepoint)
	end
end
--]]

-- Return the name of a code point, or a code point label in angle brackets
-- (<control-XXXX>, <noncharacter-XXXX>, <reserved-XXXX>, ...) if it has none.
-- "codepoint" is a number; it is not validated here.
--
-- Lookup order: noncharacters, the most recently used name hook, the
-- name_hooks table, and finally the names data module for the code point's
-- block of 0x1000 code points.
-- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8
function p.lookup_name(codepoint)
	-- U+FDD0-U+FDEF and all code points ending in FFFE or FFFF are Unassigned
	-- (Cn) and specifically noncharacters:
	-- https://www.unicode.org/faq/private_use.html#nonchar4
	if 0xFDD0 <= codepoint and (codepoint <= 0xFDEF
			or floor(codepoint % 0x10000) >= 0xFFFE) then
		return ("<noncharacter-%04X>"):format(codepoint)
	end

	if name_range_cache -- Check if previously used "name hook" applies to this code point.
			and codepoint >= name_range_cache[1]
			and codepoint <= name_range_cache[2] then
		return generate_name(name_range_cache[3], codepoint)
	end

	local range = binary_range_search(codepoint, name_hooks)
	if range then
		name_range_cache = range
		return generate_name(range[3], codepoint)
	end

	-- Names are split into one data module per 0x1000 code points. The
	-- quotient is floored explicitly so that %X always receives an integer.
	local data = loader[('names/%03X'):format(floor(codepoint / 0x1000))]

	if data and data[codepoint] then
		return data[codepoint]

	-- Unassigned (Cn) consists of noncharacters and reserved characters.
	-- The character has been established not to be a noncharacter,
	-- and if it were assigned, its name would already have been retrieved,
	-- so it must be reserved.
	else
		return ("<reserved-%04X>"):format(codepoint)
	end
end

--[[
-- No image data modules on iWiki yet.
function p.lookup_image(codepoint)
	local data = loader[('images/%03X'):format(codepoint / 0x1000)]

	if data then
		return data[codepoint]
	end
end
--]]

-- Names of the Unicode planes that have one, keyed by plane number
-- (code point divided by 0x10000, rounded down). Planes that are absent here
-- are labelled "Plane N" by p.lookup_plane.
local planes = {
	[ 0] = "Basic Multilingual Plane";
	[ 1] = "Supplementary Multilingual Plane";
	[ 2] = "Supplementary Ideographic Plane";
	[ 3] = "Tertiary Ideographic Plane";
	[14] = "Supplementary Special-purpose Plane";
	[15] = "Supplementary Private Use Area-A";
	[16] = "Supplementary Private Use Area-B";
}

-- Load [[Module:Unicode data/blocks]] if needed and assign it to this variable.
local blocks

-- Stateless iterator function used by p.enum_blocks.
-- Given the list of blocks and the previous index, returns the next index
-- followed by the first three fields of that block entry, or nothing once the
-- list is exhausted.
local function block_iter(blocks, i)
	i = i + 1
	local data = blocks[i]
	if data then
		-- Unpack doesn't work on tables loaded with mw.loadData.
		return i, data[1], data[2], data[3]
	end
end

-- An ipairs-type iterator generator for the list of blocks.
-- Usage: for i, low, high, name in p.enum_blocks() do ... end
-- (the three values after the index are fields 1-3 of each entry of
-- [[Module:Unicode data/blocks]]).
function p.enum_blocks()
	local blocks = loader.blocks
	return block_iter, blocks, 0
end

-- Return the name of the plane that contains "codepoint".
-- Planes without an entry in the "planes" table are reported as "Plane N".
function p.lookup_plane(codepoint)
	local i = floor(codepoint / 0x10000)
	return planes[i] or ("Plane %u"):format(i)
end

-- Return the third field (the block name) of the entry in
-- [[Module:Unicode data/blocks]] whose range contains "codepoint", or the
-- string "No Block" if no entry matches.
function p.lookup_block(codepoint)
	local blocks = loader.blocks
	local range = binary_range_search(codepoint, blocks)
	if range then
		return range[3]
	else
		return "No Block"
	end
end

-- Return the entry of [[Module:Unicode data/blocks]] whose third field equals
-- "name" (an exact string comparison), or nothing if there is no such entry.
function p.get_block_info(name)
	for i, block in ipairs(loader.blocks) do
		if block[3] == name then
			return block
		end
	end
end

-- Check whether "pagename" consists only of characters accepted in a page
-- name. Returns false if the string contains one of the characters listed
-- below or any character that p.is_printable rejects. Otherwise returns true
-- only if at least one character is not classified as "space-separator", so
-- an empty or all-space string yields false.
function p.is_valid_pagename(pagename)
	local has_nonws = false

	for cp in mw.ustring.gcodepoint(pagename) do
		if (cp == 0x0023) -- #
		or (cp == 0x005B) -- [
		or (cp == 0x005D) -- ]
		or (cp == 0x007B) -- {
		or (cp == 0x007C) -- |
		or (cp == 0x007D) -- }
		or (cp == 0x180E) -- MONGOLIAN VOWEL SEPARATOR
		or ((cp >= 0x2000) and (cp <= 0x200A)) -- spaces in General Punctuation block
		or (cp == 0xFFFD) -- REPLACEMENT CHARACTER
		then
			return false
		end

		-- "result" is the classification string from the control data.
		local printable, result = p.is_printable(cp)
		if not printable then
			return false
		end

		if result ~= "space-separator" then
			has_nonws = true
		end
	end

	return has_nonws
end

-- Return the items of the array "what" starting at index "from" (default 1)
-- as multiple values. Items are read one by one instead of passing "what"
-- to unpack, so that this also works for tables that unpack cannot handle
-- (see the note about mw.loadData in block_iter).
local function manual_unpack(what, from)
	-- Apply the default before "from" is used in arithmetic.
	from = from or 1

	-- Common case: at most one item to return, no temporary table needed.
	if what[from + 1] == nil then
		return what[from]
	end

	local result = {}
	for i, item in ipairs(what) do
		if i >= from then
			table.insert(result, item)
		end
	end
	return unpack(result)
end

-- Ordering function for table.sort: orders ranges by their low code point.
local function compare_ranges(range1, range2)
	return range1[1] < range2[1]
end

-- Creates a function to look up data in a module that contains "singles" (a
-- code point-to-data map) and "ranges" (an array containing arrays that contain
-- the low and high code points of a range and the data associated with that
-- range).
-- "data_module_subpage" is the key passed to "loader"; the module is loaded
-- on the first call of the returned function, not when memo_lookup is called.
-- "match_func" is passed the code point and either the data or the "dots", and
-- generates the final result of the function.
-- The varargs ("dots") describes the default data to be returned if there wasn't
-- a match.
-- In case the function is used more than once, "cache" saves ranges that have
-- already been found to match, or a range whose data is the default if there
-- was no match.
local function memo_lookup(data_module_subpage, match_func, ...)
	local dots = { ... }
	-- Kept sorted by low code point for binary_range_search. "length" is
	-- maintained by hand so that the search never has to compute it.
	local cache = { length = 0 }
	local singles, ranges

	-- Add a range to the cache, keeping the cache sorted.
	local function remember(range)
		table.insert(cache, range)
		table.sort(cache, compare_ranges)
		cache.length = #cache
	end

	return function (codepoint)
		if not singles then
			local data_module = loader[data_module_subpage]
			singles, ranges = data_module.singles, data_module.ranges
		end

		-- Singles are checked first, so a cached gap that happens to contain
		-- a single never hides it.
		if singles[codepoint] then
			return match_func(codepoint, singles[codepoint])
		end

		local range = binary_range_search(codepoint, cache)
		if range then
			return match_func(codepoint, manual_unpack(range, 3))
		end

		local index
		range, index = binary_range_search(codepoint, ranges)
		if range then
			remember(range)
			return match_func(codepoint, manual_unpack(range, 3))
		end

		-- No match: "index" is the last range examined, which is directly
		-- below or directly above the gap that contains the code point.
		-- Cache the whole gap with the default data.
		if ranges[index] then
			local dots_range
			if codepoint > ranges[index][2] then
				dots_range = {
					ranges[index][2] + 1,
					ranges[index + 1] and ranges[index + 1][1] - 1 or 0x10FFFF,
					unpack(dots)
				}
			else -- codepoint < range[index][1]
				dots_range = {
					ranges[index - 1] and ranges[index - 1][2] + 1 or 0,
					ranges[index][1] - 1,
					unpack(dots)
				}
			end
			remember(dots_range)
		end

		-- Pass the defaults, so that a first miss and a later cache hit on the
		-- same gap give match_func the same arguments.
		return match_func(codepoint, unpack(dots))
	end
end

-- Get a code point's combining class value in [[Module:Unicode data/combining]],
-- and return whether this value is not zero. Zero is assigned as the default
-- if the combining class value is not found in this data module.
-- That is, return true if character is combining, or false if it is not.
-- See https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values for
-- more information.
p.is_combining = memo_lookup(
	"combining",
	function (codepoint, combining_class)
		-- "or false" turns a missing class (nil) into a proper boolean.
		return combining_class and combining_class ~= 0 or false
	end,
	0)

-- Return "str" with a dotted circle (◌) inserted before every character for
-- which p.is_combining returns true. Other characters are left unchanged.
function p.add_dotted_circle(str)
	-- The outer parentheses drop gsub's second result (the match count).
	return (mw.ustring.gsub(str, ".",
		function(char)
			if p.is_combining(mw.ustring.codepoint(char)) then
				return '◌' .. char
			end
			-- Returning nothing makes gsub keep the original character.
		end))
end

-- Look up a code point's classification string in
-- [[Module:Unicode data/control]]. Code points that are not listed there are
-- classified as "assigned". The other values come from the data module; the
-- ones this module tests for are "unassigned" and "space-separator".
local lookup_control = memo_lookup(
	"control",
	function (codepoint, ccc)
		return ccc or "assigned"
	end,
	"assigned")
p.lookup_control = lookup_control

-- Return true unless the control data classifies the code point as
-- "unassigned".
function p.is_assigned(codepoint)
	return lookup_control(codepoint) ~= "unassigned"
end

-- Return two values: whether the code point is classified as "assigned" or
-- "space-separator" by the control data, and the classification string itself.
function p.is_printable(codepoint)
	local result = lookup_control(codepoint)
	return (result == "assigned") or (result == "space-separator"), result
end

-- Return two values: whether the code point is classified as
-- "space-separator" by the control data, and the classification string itself.
function p.is_whitespace(codepoint)
	local result = lookup_control(codepoint)
	return (result == "space-separator"), result
end

-- Look up a code point's entry in [[Module:Unicode data/category]].
-- "Cn" is registered as the default for code points that are not listed.
p.lookup_category = memo_lookup(
	"category",
	function (codepoint, category)
		return category
	end,
	"Cn")

-- Look up a code point's script code in [[Module:Unicode data/scripts]].
-- Code points that are not listed there get the code "Zzzz".
local lookup_script = memo_lookup(
	"scripts",
	function (codepoint, script_code)
		return script_code or 'Zzzz'
	end,
	"Zzzz")
p.lookup_script = lookup_script

-- Determine the script of a string.
-- HTML character references in "str" are decoded first. Characters whose
-- script code is "Zyyy", "Zinh" or "Zzzz" are ignored. If exactly one other
-- script code occurs in the string, that code is returned. If none occurs, or
-- if two or more occur, nothing (nil) is returned.
-- Raises an error if "str" is not a string.
function p.get_best_script(str)
	-- Check type of argument, because mw.text.decode coerces numbers to strings!
	require "libraryUtil".checkType("get_best_script", 1, str, "string")

	-- Convert HTML character references (including named character references,
	-- or character entities) to characters.
	str = mw.text.decode(str, true)

	-- Set of the script codes found in the string.
	local scripts = {}
	for codepoint in mw.ustring.gcodepoint(str) do
		local script = lookup_script(codepoint)

		-- Ignore "Inherited", "Undetermined", or "Uncoded" scripts.
		if not (script == "Zyyy" or script == "Zinh" or script == "Zzzz") then
			scripts[script] = true
		end
	end

	-- If scripts does not contain two or more keys,
	-- return first and only key (script code) in table.
	-- The parentheses drop the value that next() returns along with the key.
	if not next(scripts, next(scripts)) then
		return (next(scripts))
	end -- else return majority script, or else "Zzzz"?
end

-- Check that a string contains at least one Latin-script character and no
-- characters of any script other than "Latn", "Zyyy", "Zinh" and "Zzzz".
-- HTML character references in "str" are decoded first.
-- Raises an error if "str" is not a string.
function p.is_Latin(str)
	require "libraryUtil".checkType("is_Latin", 1, str, "string")
	str = mw.text.decode(str, true)

	-- Search for the leading bytes that introduce the UTF-8 encoding of the
	-- code points U+0340-U+10FFFF. If they are not found and there is at least
	-- one Latin-script character, the string counts as Latin, because the rest
	-- of the characters can only be Zyyy, Zinh, and Zzzz.
	-- The only scripts found below U+0370 (the first code point of the Greek
	-- and Coptic block) are Latn, Zyyy, Zinh, and Zzzz.
	-- See the codepage in the [[UTF-8]] article.
	if not str:find "[\205-\244]" then
		for codepoint in mw.ustring.gcodepoint(str) do
			if lookup_script(codepoint) == "Latn" then
				return true
			end
		end
		-- No Latin-script character at all. The general check below could
		-- only return false for this string, so skip the second pass.
		return false
	end

	local Latn = false

	for codepoint in mw.ustring.gcodepoint(str) do
		local script = lookup_script(codepoint)

		if script == "Latn" then
			Latn = true
		elseif not (script == "Zyyy" or script == "Zinh"
				or script == "Zzzz") then
			return false
		end
	end

	return Latn
end

-- Checks that a string contains only characters belonging to right-to-left
-- scripts, or characters of ignorable scripts.
-- Returns true only if at least one character belongs to a script listed in
-- the "rtl" table of [[Module:Unicode data/scripts]]; a string made up solely
-- of ignorable characters yields false.
-- HTML character references in "str" are decoded first.
-- Raises an error if "str" is not a string.
function p.is_rtl(str)
	require "libraryUtil".checkType("is_rtl", 1, str, "string")
	str = mw.text.decode(str, true)

	-- Search for the leading bytes that introduce the UTF-8 encoding of the
	-- code points U+0580-U+10FFFF. If they are not found, the string can only
	-- have characters from a left-to-right script, because the first code point
	-- in a right-to-left script is U+0591, in the Hebrew block.
	if not str:find "[\214-\244]" then
		return false
	end

	local result = false
	local rtl = loader.scripts.rtl
	for codepoint in mw.ustring.gcodepoint(str) do
		local script = lookup_script(codepoint)

		if rtl[script] then
			result = true
		elseif not (script == "Zyyy" or script == "Zinh"
				or script == "Zzzz") then
			return false
		end
	end

	return result
end

-- Read parameter "arg" from the argument table "args" and parse it as a
-- hexadecimal code point. Returns the code point as a number.
-- Raises an error if the parameter is missing, is not a hexadecimal number,
-- or lies outside the range 0-0x10FFFF.
local function get_codepoint(args, arg)
	local codepoint_string = args[arg]
		or errorf(2, "Parameter %s is required", tostring(arg))
	local codepoint = tonumber(codepoint_string, 16)
		or errorf(2, "Parameter %s is not a code point in hexadecimal base",
			tostring(arg))
	if not (0 <= codepoint and codepoint <= 0x10FFFF) then
		errorf(2, "code point in parameter %s out of range", tostring(arg))
	end
	return codepoint
end

-- Read parameter "arg" from the argument table "args", trim it, prepend
-- "prefix", and return the exported function of that name.
-- Raises an error if the parameter is missing or if "p" has no member with
-- the resulting name.
local function get_func(args, arg, prefix)
	local suffix = args[arg]
		or errorf(2, "Parameter %s is required", tostring(arg))
	suffix = mw.text.trim(suffix)
	local func_name = prefix .. suffix
	local func = p[func_name]
		or errorf(2, "There is no function '%s'", func_name)
	return func
end

-- This function allows any of the "lookup" functions to be invoked. The first
-- parameter is the word after "lookup_"; the second parameter is the code point
-- in hexadecimal base.
function p.lookup(frame)
	local func = get_func(frame.args, 1, "lookup_")
	local codepoint = get_codepoint(frame.args, 2)
	local result = func(codepoint)
	if func == p.lookup_name then
		-- Prevent code point labels such as <control-0000> from being
		-- interpreted as HTML tags.
		-- (Assigning to a single variable discards gsub's match count.)
		result = result:gsub("<", "&lt;")
	end
	return result
end

-- This function allows any of the "is" functions to be invoked. The first
-- parameter is the word after "is_"; the second parameter is either a string
-- or a code point in hexadecimal base, depending on the function.
-- Only the first result of the invoked function is returned.
function p.is(frame)
	local func = get_func(frame.args, 1, "is_")

	-- is_Latin, is_valid_pagename and is_rtl take strings.
	if func == p.is_Latin or func == p.is_valid_pagename or func == p.is_rtl then
		return (func(frame.args[2]))
	else -- The rest take code points.
		local codepoint = get_codepoint(frame.args, 2)
		return (func(codepoint)) -- Adjust to one result.
	end
end

return p

-- End of Module:Unicode data