Diff: Module:Unicode data
Comparing revision #1 (2022-09-17 03:28:33) with revision #2 (2023-02-02 02:41:39).
| Old | New |
|---|---|
local p = {} |
local p = {} |
local floor = math.floor |
local floor = math.floor |
-- Raise an error whose message is built with string.format.
--   errorf(level, fmt, ...)  -- "level" is a number: it is increased by one
--                            -- before being handed to error().
--   errorf(fmt, ...)         -- no level given: the error is raised at level 2.
local function errorf(level, ...)
	if type(level) ~= "number" then
		-- The first argument is not a level, so it is the format string.
		return error(string.format(level, ...), 2)
	end
	return error(string.format(...), level + 1)
end
-- Binary search for the entry of "ranges" that contains "codepoint".
-- Each entry is an array whose first two items are the low and high bounds
-- (both inclusive). The number of entries is taken from ranges.length when
-- present, otherwise from the length function of [[Module:TableTools]].
-- Being a binary search, it only finds entries reliably when "ranges" is in
-- ascending order.
-- Returns the matching entry and its index, or nil and the last index probed
-- (nil as well when no entry was probed at all).
local function binary_range_search(codepoint, ranges)
	local first = 1
	local last = ranges.length or require "Module:TableTools".length(ranges)
	local middle
	while first <= last do
		middle = floor((first + last) / 2)
		local candidate = ranges[middle]
		if codepoint < candidate[1] then
			-- Target lies below this entry: keep the lower half.
			last = middle - 1
		elseif codepoint > candidate[2] then
			-- Target lies above this entry: keep the upper half.
			first = middle + 1
		else
			return candidate, middle
		end
	end
	return nil, middle
end
p.binary_range_search = binary_range_search |
p.binary_range_search = binary_range_search |
--[[ |
--[[ |
local function linear_range_search(codepoint, ranges) |
local function linear_range_search(codepoint, ranges) |
for i, range in ipairs(ranges) do |
for i, range in ipairs(ranges) do |
if range[1] <= codepoint and codepoint <= range[2] then |
if range[1] <= codepoint and codepoint <= range[2] then |
return range |
return range |
end |
end |
end |
end |
end |
end |
--]] |
--]] |
-- Lazy loader for the data subpages of this module. Indexing "loader" with a
-- subpage name (the part after "Module:Unicode data/") loads that page with
-- mw.loadData; for instance, loader.blocks yields
-- [[Module:Unicode data/blocks]]. The outcome is stored in the table itself,
-- so each subpage is requested at most once. When loading fails, false is
-- stored and returned instead.
local loader = setmetatable({}, {
	__index = function (store, subpage)
		local loaded = false
		local ok, result = pcall(mw.loadData, "Module:Unicode data/" .. subpage)
		if ok then
			loaded = result
		end
		store[subpage] = loaded
		return loaded
	end
})
-- Ranges of code points whose names (or code point labels) are generated
-- rather than stored. Each entry is { low, high, generator }, where the
-- generator is either a format string applied to the code point or a function
-- taking the code point.
--
-- IMPORTANT: this table is searched with binary_range_search, so the entries
-- must stay in ascending, non-overlapping order. (The Variation Selectors
-- Supplement entry used to sit before CJK Extensions G and H, which made the
-- search miss U+E0100..U+E01EF.)
--
-- For the algorithm used to generate Hangul Syllable names,
-- see "Hangul Syllable Name Generation" in section 3.12 of the
-- Unicode Specification:
-- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf
local name_hooks = {
	{     0x00,     0x1F, "<control-%04X>" }, -- C0 control characters
	{     0x7F,     0x9F, "<control-%04X>" }, -- DEL and C1 control characters
	{   0x3400,   0x4DBF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A
	{   0x4E00,   0x9FFF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph
	{   0xAC00,   0xD7A3, function (codepoint) -- Hangul Syllables
		local Hangul_data = loader.Hangul
		local syllable_index = codepoint - 0xAC00

		return ("HANGUL SYLLABLE %s%s%s"):format(
			Hangul_data.leads[floor(syllable_index / Hangul_data.final_count)],
			Hangul_data.vowels[floor((syllable_index % Hangul_data.final_count)
				/ Hangul_data.trail_count)],
			Hangul_data.trails[syllable_index % Hangul_data.trail_count]
		)
	end },
	-- High Surrogates, High Private Use Surrogates, Low Surrogates
	{   0xD800,   0xDFFF, "<surrogate-%04X>" },
	{   0xE000,   0xF8FF, "<private-use-%04X>" }, -- Private Use
	-- CJK Compatibility Ideographs
	{   0xF900,   0xFA6D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{   0xFA70,   0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{  0x17000,  0x187F7, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph
	{  0x18800,  0x18AFF, function (codepoint)
		return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF)
	end },
	{  0x18D00,  0x18D08, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph Supplement
	{  0x1B170,  0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu
	{  0x20000,  0x2A6DF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B
	{  0x2A700,  0x2B739, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C
	{  0x2B740,  0x2B81D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension D
	{  0x2B820,  0x2CEA1, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E
	{  0x2CEB0,  0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F
	-- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane)
	{  0x2F800,  0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
	{  0x30000,  0x3134A, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension G
	{  0x31350,  0x323AF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension H
	{  0xE0100,  0xE01EF, function (codepoint) -- Variation Selectors Supplement
		return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17)
	end },
	{  0xF0000,  0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use
	{ 0x100000, 0x10FFFD, "<private-use-%04X>" }  -- Plane 16 Private Use
}
-- Entry count, read by binary_range_search through the "length" field.
name_hooks.length = #name_hooks
local name_range_cache |
local name_range_cache |
-- Build a name from the generator part of a name hook: a string is used as a
-- format string applied to the code point; anything else is called with the
-- code point as its only argument.
local function generate_name(data, codepoint)
	if type(data) ~= "string" then
		return data(codepoint)
	end
	return data:format(codepoint)
end
--[[ |
--[[ |
-- Checks that the code point is a number and in range. |
-- Checks that the code point is a number and in range. |
-- Does not check whether code point is an integer. |
-- Does not check whether code point is an integer. |
-- Not used |
-- Not used |
local function check_codepoint(funcName, argIdx, val) |
local function check_codepoint(funcName, argIdx, val) |
require 'libraryUtil'.checkType(funcName, argIdx, val, 'number') |
require 'libraryUtil'.checkType(funcName, argIdx, val, 'number') |
if codepoint < 0 or 0x10FFFF < codepoint then |
if codepoint < 0 or 0x10FFFF < codepoint then |
errorf("Codepoint %04X out of range", codepoint) |
errorf("Codepoint %04X out of range", codepoint) |
end |
end |
end |
end |
--]] |
--]] |
-- Return the name or code point label of a code point.
-- See https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8.
-- Lookup order: noncharacter label, most recently used name hook, the
-- name_hooks table, the "names/XXX" data subpage, and finally the
-- "<reserved-XXXX>" label.
function p.lookup_name(codepoint)
	-- U+FDD0-U+FDEF and all code points ending in FFFE or FFFF are Unassigned
	-- (Cn) and specifically noncharacters:
	-- https://www.unicode.org/faq/private_use.html#nonchar4
	local is_noncharacter = 0xFDD0 <= codepoint
		and (codepoint <= 0xFDEF or floor(codepoint % 0x10000) >= 0xFFFE)
	if is_noncharacter then
		return ("<noncharacter-%04X>"):format(codepoint)
	end

	-- Try the name hook that matched last time before searching again.
	local previous = name_range_cache
	if previous and codepoint >= previous[1] and codepoint <= previous[2] then
		return generate_name(previous[3], codepoint)
	end

	local hook = binary_range_search(codepoint, name_hooks)
	if hook then
		name_range_cache = hook
		return generate_name(hook[3], codepoint)
	end

	-- The subpage key is the code point divided by 0x1000, formatted as hex.
	-- NOTE(review): the quotient is not floored here; presumably %X truncates
	-- it under the Lua version in use - confirm.
	local names = loader[('names/%03X'):format(codepoint / 0x1000)]
	local name = names and names[codepoint]
	if name then
		return name
	end

	-- Unassigned (Cn) consists of noncharacters and reserved characters.
	-- The character has been established not to be a noncharacter, and if it
	-- were assigned, its name would already have been retrieved, so it must
	-- be reserved.
	return ("<reserved-%04X>"):format(codepoint)
end
--[[ |
--[[ |
-- No image data modules on iWiki yet. |
-- No image data modules on iWiki yet. |
function p.lookup_image(codepoint) |
function p.lookup_image(codepoint) |
local data = loader[('images/%03X'):format(codepoint / 0x1000)] |
local data = loader[('images/%03X'):format(codepoint / 0x1000)] |
if data then |
if data then |
return data[codepoint] |
return data[codepoint] |
end |
end |
end |
end |
--]] |
--]] |
-- Names of the planes that have one, keyed by plane number (the code point
-- divided by 0x10000, rounded down). Planes not listed here have no entry.
local planes = {
	[0] = "Basic Multilingual Plane",
	[1] = "Supplementary Multilingual Plane",
	[2] = "Supplementary Ideographic Plane",
	[3] = "Tertiary Ideographic Plane",
	[14] = "Supplementary Special-purpose Plane",
	[15] = "Supplementary Private Use Area-A",
	[16] = "Supplementary Private Use Area-B",
}
-- Load [[Module:Unicode data/blocks]] if needed and assign it to this variable. |
-- Load [[Module:Unicode data/blocks]] if needed and assign it to this variable. |
local blocks |
local blocks |
-- Stateless iterator step over the list of blocks: given the list and the
-- previous index, returns the next index followed by the first three items
-- of that entry, or nothing once the list is exhausted.
local function block_iter(blocks, i)
	local next_index = i + 1
	local entry = blocks[next_index]
	if not entry then
		return
	end
	-- Items are read one by one because unpack doesn't work on tables loaded
	-- with mw.loadData.
	return next_index, entry[1], entry[2], entry[3]
end
-- Generic-for support for the list of blocks, in the manner of ipairs:
--   for i, low, high, name in p.enum_blocks() do ... end
-- Returns the iterator function, the blocks data and the starting index 0.
function p.enum_blocks()
	return block_iter, loader.blocks, 0
end
-- Return the name of the plane containing the code point. Planes without an
-- entry in "planes" are reported as "Plane N".
function p.lookup_plane(codepoint)
	local plane_number = floor(codepoint / 0x10000)
	local plane_name = planes[plane_number]
	if plane_name then
		return plane_name
	end
	return ("Plane %u"):format(plane_number)
end
-- Return the third item (the block name) of the entry of
-- [[Module:Unicode data/blocks]] whose range contains the code point, or
-- "No Block" when no entry contains it.
function p.lookup_block(codepoint)
	local entry = binary_range_search(codepoint, loader.blocks)
	if not entry then
		return "No Block"
	end
	return entry[3]
end
-- Find the entry of [[Module:Unicode data/blocks]] whose third item equals
-- "name" and return that entry. Returns nothing when there is no such entry.
function p.get_block_info(name)
	local all_blocks = loader.blocks
	for _, entry in ipairs(all_blocks) do
		if name == entry[3] then
			return entry
		end
	end
end
-- Check whether a string could serve as a page name. The result is false if
-- any code point is in the rejected set below, lies in U+2000..U+200A, or is
-- not printable according to p.is_printable. Otherwise the result is true
-- only if at least one code point is something other than a
-- "space-separator".
function p.is_valid_pagename(pagename)
	local rejected = {
		[0x0023] = true, -- #
		[0x005B] = true, -- [
		[0x005D] = true, -- ]
		[0x007B] = true, -- {
		[0x007C] = true, -- |
		[0x007D] = true, -- }
		[0x180E] = true, -- MONGOLIAN VOWEL SEPARATOR
		[0xFFFD] = true, -- REPLACEMENT CHARACTER
	}
	local seen_non_whitespace = false

	for cp in mw.ustring.gcodepoint(pagename) do
		-- U+2000..U+200A are the spaces in the General Punctuation block.
		if rejected[cp] or (cp >= 0x2000 and cp <= 0x200A) then
			return false
		end

		local printable, kind = p.is_printable(cp)
		if not printable then
			return false
		end

		if kind ~= "space-separator" then
			seen_non_whitespace = true
		end
	end

	return seen_non_whitespace
end
-- Return the items of the array "what" from index "from" (default 1) up to
-- the item before the first nil, as multiple values. Items are copied into a
-- fresh table first because unpack doesn't work on tables loaded with
-- mw.loadData.
--
-- The default for "from" is applied before "from" is first used; previously
-- it was applied after the what[from + 1] test, so omitting "from" raised an
-- "attempt to perform arithmetic on a nil value" error.
local function manual_unpack(what, from)
	from = from or 1

	-- Fast path: at most one item to return, no table needed.
	if what[from + 1] == nil then
		return what[from]
	end

	local result = {}
	local index = from
	while what[index] ~= nil do
		result[#result + 1] = what[index]
		index = index + 1
	end
	return unpack(result)
end
-- Ordering function for table.sort: a range sorts before another when its
-- first item (the low bound) is smaller.
local function compare_ranges(range1, range2)
	local low1, low2 = range1[1], range2[1]
	return low1 < low2
end
-- Creates a function to look up data in a module that contains "singles" (a
-- code point-to-data map) and "ranges" (an array containing arrays that contain
-- the low and high code points of a range and the data associated with that
-- range).
-- "data_module_subpage" is the name handed to "loader"; the module is loaded
-- on the first call of the returned function, not when memo_lookup is called.
-- "match_func" is passed the code point and either the data or the "dots", and
-- generates the final result of the function.
-- The varargs ("dots") describes the default data to be returned if there wasn't
-- a match.
-- In case the function is used more than once, "cache" saves ranges that have
-- already been found to match, or a range whose data is the default if there
-- was no match.
--
-- Two defects of the earlier version are fixed here so that the code does
-- what the description above says: the default range is now actually
-- inserted into the cache, and the default data is now actually passed to
-- "match_func" when nothing matches.
local function memo_lookup(data_module_subpage, match_func, ...)
	local dots = { ... }
	local cache = {}
	local singles, ranges

	return function (codepoint)
		if not singles then
			local data_module = loader[data_module_subpage]
			if not data_module then
				-- "loader" yields false when the module cannot be loaded;
				-- fail with a readable message instead of indexing false.
				error(("Could not load [[Module:Unicode data/%s]]")
					:format(tostring(data_module_subpage)), 2)
			end
			singles, ranges = data_module.singles, data_module.ranges
		end

		if singles[codepoint] then
			return match_func(codepoint, singles[codepoint])
		end

		-- Ranges found on earlier calls (including default ranges).
		local cached = binary_range_search(codepoint, cache)
		if cached then
			return match_func(codepoint, manual_unpack(cached, 3))
		end

		local range, index = binary_range_search(codepoint, ranges)
		if range then
			table.insert(cache, range)
			table.sort(cache, compare_ranges)
			return match_func(codepoint, manual_unpack(range, 3))
		end

		-- No match. "index" is the last entry probed by the search, which is
		-- adjacent to the gap containing the code point; remember that gap
		-- as a range carrying the default data.
		local nearest = index and ranges[index]
		if nearest then
			local dots_range
			if codepoint > nearest[2] then
				-- Gap between this entry and the next one (or the end of
				-- the code space).
				dots_range = {
					nearest[2] + 1,
					ranges[index + 1] and ranges[index + 1][1] - 1 or 0x10FFFF,
					unpack(dots)
				}
			else -- codepoint < nearest[1]
				-- Gap between the previous entry (or the start of the code
				-- space) and this one.
				dots_range = {
					ranges[index - 1] and ranges[index - 1][2] + 1 or 0,
					nearest[1] - 1,
					unpack(dots)
				}
			end
			table.insert(cache, dots_range)
			table.sort(cache, compare_ranges)
		end

		return match_func(codepoint, unpack(dots))
	end
end
-- Tell whether a code point is a combining character: look up its combining
-- class value in [[Module:Unicode data/combining]] and report whether that
-- value is something other than zero. Zero is passed to memo_lookup as the
-- default, and a missing value also yields false.
-- See https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values for
-- more information.
p.is_combining = memo_lookup(
	"combining",
	function (codepoint, combining_class)
		if not combining_class then
			return false
		end
		return combining_class ~= 0
	end,
	0)
-- Return a copy of "str" in which every character that p.is_combining
-- reports as combining is preceded by a dotted circle (◌). Other characters
-- are left as they are.
function p.add_dotted_circle(str)
	local function mark_combining(char)
		if p.is_combining(mw.ustring.codepoint(char)) then
			return '◌' .. char
		end
		-- Returning nothing keeps the original character.
	end

	-- Keep only the first result of gsub (the string, not the match count).
	local marked = mw.ustring.gsub(str, ".", mark_combining)
	return marked
end
-- Look up the value stored for a code point in the "control" data subpage.
-- "assigned" is both the default handed to memo_lookup and the fallback used
-- when no value is found.
local lookup_control = memo_lookup(
	"control",
	function (codepoint, control_type)
		if control_type then
			return control_type
		end
		return "assigned"
	end,
	"assigned")
p.lookup_control = lookup_control
-- True unless lookup_control reports the code point as "unassigned".
function p.is_assigned(codepoint)
	local kind = lookup_control(codepoint)
	return not (kind == "unassigned")
end
-- Returns two values: whether lookup_control classifies the code point as
-- "assigned" or "space-separator", and the classification itself.
function p.is_printable(codepoint)
	local kind = lookup_control(codepoint)
	local printable = kind == "assigned" or kind == "space-separator"
	return printable, kind
end
-- Returns two values: whether lookup_control classifies the code point as
-- "space-separator", and the classification itself.
function p.is_whitespace(codepoint)
	local kind = lookup_control(codepoint)
	local is_space = kind == "space-separator"
	return is_space, kind
end
-- Look up the value stored for a code point in the "category" data subpage.
-- "Cn" is the default handed to memo_lookup; it is also returned explicitly
-- when no value is found, in the same way lookup_script falls back to
-- "Zzzz" and lookup_control to "assigned". Previously a code point without
-- data produced nil instead of the declared default.
p.lookup_category = memo_lookup(
	"category",
	function (codepoint, category)
		return category or "Cn"
	end,
	"Cn")
-- Look up the script code stored for a code point in the "scripts" data
-- subpage. "Zzzz" is both the default handed to memo_lookup and the fallback
-- used when no script code is found.
local lookup_script = memo_lookup(
	"scripts",
	function (codepoint, script_code)
		if script_code then
			return script_code
		end
		return 'Zzzz'
	end,
	"Zzzz")
p.lookup_script = lookup_script
-- Determine the single script used by a string, ignoring the script codes
-- "Zyyy", "Zinh" and "Zzzz". When exactly one other script occurs, the
-- result of next() on the set of scripts found is returned (its first value
-- is the script code). When none occurs, nil is returned; when two or more
-- occur, nothing is returned.
function p.get_best_script(str)
	-- Check type of argument, because mw.text.decode coerces numbers to strings!
	require "libraryUtil".checkType("get_best_script", 1, str, "string")

	-- Convert HTML character references (including named character references,
	-- or character entities) to characters.
	str = mw.text.decode(str, true)

	local ignored = { Zyyy = true, Zinh = true, Zzzz = true }
	local found = {}
	for codepoint in mw.ustring.gcodepoint(str) do
		local code = lookup_script(codepoint)
		if not ignored[code] then
			found[code] = true
		end
	end

	-- A second key after the first one means two or more scripts were found.
	local first_key = next(found)
	if next(found, first_key) then
		return -- else return majority script, or else "Zzzz"?
	end
	return next(found)
end
-- Checks that a string contains at least one Latin-script character and
-- otherwise only characters whose script code is "Zyyy", "Zinh" or "Zzzz".
-- HTML character references are decoded before the check.
function p.is_Latin(str)
	-- Report this function's own name on a type error (it used to report
	-- "get_best_script").
	require "libraryUtil".checkType("is_Latin", 1, str, "string")
	str = mw.text.decode(str, true)

	-- Search for the leading bytes that introduce the UTF-8 encoding of the
	-- code points U+0340-U+10FFFF. If they are not found and there is at least
	-- one Latin-script character, the string counts as Latin, because the rest
	-- of the characters can only be Zyyy, Zinh, and Zzzz.
	-- The only scripts found below U+0370 (the first code point of the Greek
	-- and Coptic block) are Latn, Zyyy, Zinh, and Zzzz.
	-- See the codepage in the [[UTF-8]] article.
	local only_low_codepoints = not str:find "[\205-\244]"

	local Latn = false
	for codepoint in mw.ustring.gcodepoint(str) do
		local script = lookup_script(codepoint)
		if script == "Latn" then
			if only_low_codepoints then
				-- Nothing later in the string can disqualify it.
				return true
			end
			Latn = true
		elseif not (script == "Zyyy" or script == "Zinh"
				or script == "Zzzz") then
			return false
		end
	end

	return Latn
end
-- Checks that a string contains only characters belonging to right-to-left
-- scripts, or characters of ignorable scripts ("Zyyy", "Zinh", "Zzzz"), and
-- that at least one character belongs to a right-to-left script.
-- HTML character references are decoded before the check. The set of
-- right-to-left script codes is read from the "rtl" field of the "scripts"
-- data subpage.
function p.is_rtl(str)
	-- Report this function's own name on a type error (it used to report
	-- "get_best_script").
	require "libraryUtil".checkType("is_rtl", 1, str, "string")
	str = mw.text.decode(str, true)

	-- Search for the leading bytes that introduce the UTF-8 encoding of the
	-- code points U+0580-U+10FFFF. If they are not found, the string can only
	-- have characters from a left-to-right script, because the first code point
	-- in a right-to-left script is U+0591, in the Hebrew block.
	if not str:find "[\214-\244]" then
		return false
	end

	local result = false
	local rtl = loader.scripts.rtl
	for codepoint in mw.ustring.gcodepoint(str) do
		local script = lookup_script(codepoint)

		if rtl[script] then
			result = true
		elseif not (script == "Zyyy" or script == "Zinh"
				or script == "Zzzz") then
			return false
		end
	end

	return result
end
-- Read parameter "arg" of "args" as a hexadecimal code point and return it as
-- a number. Raises an error (through errorf with level 2) when the parameter
-- is missing, cannot be parsed as a base-16 number, or lies outside
-- 0..0x10FFFF.
local function get_codepoint(args, arg)
	local raw = args[arg]
	if not raw then
		errorf(2, "Parameter %s is required", tostring(arg))
	end

	local codepoint = tonumber(raw, 16)
	if not codepoint then
		errorf(2, "Parameter %s is not a code point in hexadecimal base",
			tostring(arg))
	end

	if codepoint < 0 or codepoint > 0x10FFFF then
		errorf(2, "code point in parameter %s out of range", tostring(arg))
	end
	return codepoint
end
-- Resolve a function of this module from a parameter: the value of parameter
-- "arg" is trimmed, appended to "prefix", and looked up in "p". Raises an
-- error (through errorf with level 2) when the parameter is missing or when
-- "p" has no entry under the resulting name.
local function get_func(args, arg, prefix)
	local suffix = args[arg]
	if not suffix then
		errorf(2, "Parameter %s is required", tostring(arg))
	end

	local func_name = prefix .. mw.text.trim(suffix)
	local func = p[func_name]
	if not func then
		errorf(2, "There is no function '%s'", func_name)
	end
	return func
end
-- This function allows any of the "lookup" functions to be invoked. The first
-- parameter is the word after "lookup_"; the second parameter is the code point
-- in hexadecimal base.
function p.lookup(frame)
	local func = get_func(frame.args, 1, "lookup_")
	local codepoint = get_codepoint(frame.args, 2)
	local result = func(codepoint)
	if func == p.lookup_name then
		-- Prevent code point labels such as <control-0000> from being
		-- interpreted as HTML tags by replacing "<" with its character
		-- entity. (The replacement used to be a literal "<", which left the
		-- string unchanged.) Assigning to a single variable keeps only the
		-- string and drops the substitution count that gsub also returns.
		result = result:gsub("<", "&lt;")
	end
	return result
end
-- This function allows any of the "is" functions to be invoked. The first
-- parameter is the word after "is_"; the second parameter is passed as is to
-- is_Latin, is_valid_pagename and is_rtl, and parsed as a code point in
-- hexadecimal base for every other function. Only the first result of the
-- invoked function is returned.
function p.is(frame)
	local args = frame.args
	local func = get_func(args, 1, "is_")

	-- is_Latin, is_valid_pagename and is_rtl take strings.
	local takes_string = func == p.is_Latin
		or func == p.is_valid_pagename
		or func == p.is_rtl

	local argument
	if takes_string then
		argument = args[2]
	else -- The rest take code points.
		argument = get_codepoint(args, 2)
	end

	local verdict = func(argument) -- Adjust to one result.
	return verdict
end
return p |
return p |