-- Module:Unicode data

-- Table that collects this module's public functions (export.lookup_name, etc.).
local export = {}

-- Local alias for math.floor, used by the plane and block lookups.
local floor = math.floor

-- http://www.unicode.org/Public/UNIDATA/Jamo.txt
-- For the algorithm used here, see Hangul Syllable Name Generation
-- in section 3.12 of the Unicode Specification.
-- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf
-- Short names of the leading consonants, indexed from 0.
local hangul_leads = {
	[0] = "G",
	"GG", "N", "D", "DD", "R", "M", "B", "BB", "S",
	"SS", "", "J", "JJ", "C", "K", "T", "P", "H",
}

-- Short names of the vowels, indexed from 0.
local hangul_vowels = {
	[0] = "A",
	"AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE",
	"OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I",
}

-- Short names of the trailing consonants, indexed from 0.
-- Index 0 is the empty string (no trailing consonant name).
local hangul_trails = {
	[0] = "",
	"G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG",
	"LM", "LB", "LS", "LT", "LP", "LH", "M", "B", "BS",
	"S", "SS", "NG", "J", "C", "K", "T", "P", "H",
}

-- Ranges whose names or labels are generated rather than stored in the
-- names data modules. Each entry is { first, last, generator }, where the
-- generator is either a format string applied to the codepoint or a function
-- taking the codepoint. Entries must stay sorted by first codepoint and must
-- not overlap: lookup_name stops at the first entry that starts after the
-- codepoint it is looking for.
-- Labels in angle brackets are the code point labels from section 4.8 of the
-- Unicode Standard.
local name_hooks = {
	{     0x00,     0x1F, "<control-%04X>" }, -- C0 control characters
	{     0x7F,     0x9F, "<control-%04X>" }, -- DEL and C1 control characters
	{   0x3400,   0x4DB5, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A
	{   0x4E00,   0x9FEF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph --change v10
	{   0xAC00,   0xD7A3, function (codepoint)
		local m_hangul = require('Module:ko-hangul')

		-- lead index, vowel index, trail index
		local li, vi, ti = m_hangul.syllableIndex2JamoIndices(
			codepoint - 0xAC00
		)

		return ("HANGUL SYLLABLE %s%s%s"):format(
			hangul_leads[li],
			hangul_vowels[vi],
			hangul_trails[ti]
		)
	end },
	{   0xD800,   0xDB7F, "<surrogate-%04X>" }, -- Non Private Use High Surrogate
	{   0xDB80,   0xDBFF, "<surrogate-%04X>" }, -- Private Use High Surrogate
	{   0xDC00,   0xDFFF, "<surrogate-%04X>" }, -- Low Surrogate
	{   0xE000,   0xF8FF, "<private-use-%04X>" }, -- Private Use
	{  0x17000,  0x187F1, "TANGUT IDEOGRAPH-%05X" }, -- Tangut
	{  0x1B170,  0x1B2FB, "NUSHU CHARACTER-%05X" }, -- Nushu --add v10
	{  0x20000,  0x2A6D6, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension B
	{  0x2A700,  0x2B734, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension C
	-- Extension D starts at U+2B740 (not U+2A740, which lies inside
	-- Extension C and would give names to the gap U+2B735..U+2B73F).
	{  0x2B740,  0x2B81D, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension D
	{  0x2B820,  0x2CEA1, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension E
	{  0x2CEB0,  0x2EBE0, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension F --add v10
	{  0x2F800,  0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%05X" }, -- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane)
	{  0xF0000,  0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use
	{ 0x100000, 0x10FFFD, "<private-use-%04X>" }, -- Plane 16 Private Use
}

-- The name_hooks entry matched by the most recent lookup_name call, if any;
-- lookup_name checks it first before scanning name_hooks again.
local name_range_cache

-- Produces a name for a codepoint from a name_hooks generator: a string is
-- used as a format pattern for the codepoint, anything else is called with it.
local function generate_name(data, codepoint)
	if type(data) ~= "string" then
		return data(codepoint)
	end
	return data:format(codepoint)
end

-- Returns the name of a codepoint, or its code point label in angle brackets
-- (e.g. <noncharacter-FFFF>, <reserved-0378>) when it has no name.
-- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8
function export.lookup_name(codepoint)
	-- U+FDD0-U+FDEF and all codepoints ending in FFFE or FFFF are noncharacters:
	-- https://www.unicode.org/faq/private_use.html#nonchar4
	if 0xFDD0 <= codepoint and (codepoint <= 0xFDEF
			or math.floor(codepoint % 0x10000) >= 0xFFFE) then
		return ("<noncharacter-%04X>"):format(codepoint)
	end

	-- Fast path: reuse the name_hooks range matched by the previous call.
	if name_range_cache
			and codepoint >= name_range_cache[1]
			and codepoint <= name_range_cache[2] then
		return generate_name(name_range_cache[3], codepoint)
	end

	-- name_hooks is sorted by range start, so the scan can stop at the first
	-- range that begins after the codepoint.
	for _, item in ipairs(name_hooks) do
		if codepoint < item[1] then
			break
		elseif codepoint <= item[2] then
			name_range_cache = item
			return generate_name(item[3], codepoint)
		end
	end

	-- Names are stored in one data module per 0x1000 codepoints. The quotient
	-- is floored so that %X always receives an integer.
	local success, data = pcall(mw.loadData,
		('Module:Unicode data/names/%03X'):format(math.floor(codepoint / 0x1000)))

	if success and data[codepoint] then
		return data[codepoint]

	-- Unassigned (Cn) includes noncharacters and reserved characters.
	-- Codepoint has already been determined not to be a noncharacter,
	-- so if it is unassigned (Cn), it is reserved.
	elseif not export.is_assigned(codepoint) then
		return ("<reserved-%04X>"):format(codepoint)

	else
		-- This point should not be reached.
		require("Module:debug").track("Unicode data/no name or label")
		return ("<no-name-%04X>"):format(codepoint)
	end
end

-- Returns the entry for a codepoint from the images data module covering it,
-- or nothing if that data module cannot be loaded.
function export.lookup_image(codepoint)
	local page = ('Module:Unicode data/images/%03X'):format(codepoint / 0x1000)
	local ok, images = pcall(mw.loadData, page)
	if not ok then
		return
	end
	return images[codepoint]
end

-- Template entry point: looks up the name of the codepoint given as the first
-- argument (of the #invoke call, or else of the parent template) and returns
-- it with "<" escaped as "&lt;".
-- Raises an error if the argument is not a number.
function export.template_lookup_name(frame)
	local arg = frame.args[1] or frame:getParent().args[1]
	local codepoint = tonumber(arg)
	if not codepoint then
		error("Invalid codepoint: " .. tostring(arg))
	end

	local name = export.lookup_name(codepoint)

	-- The parentheses drop gsub's second return value (the substitution
	-- count), which would otherwise be appended to the output.
	return (name:gsub("<", "&lt;"))
end

-- Names of the Unicode planes, keyed by plane number
-- (floor(codepoint / 0x10000)). Planes without an entry are reported by
-- lookup_plane as "Plane N".
local planes = {
	[ 0] = "Basic Multilingual Plane";
	[ 1] = "Supplementary Multilingual Plane";
	[ 2] = "Supplementary Ideographic Plane";
	[14] = "Supplementary Special-purpose Plane"; -- U+E0000..U+EFFFF
	[15] = "Supplementary Private Use Area-A";    -- U+F0000..U+FFFFF
	[16] = "Supplementary Private Use Area-B";    -- U+100000..U+10FFFF
}

-- http://www.unicode.org/Public/UNIDATA/Blocks.txt
-- This should be kept synchronized with Module:category tree/scriptcatboiler/blocks.
-- Each entry is { block name, first codepoint, last codepoint }. Entries are
-- sorted by first codepoint, which the binary search in lookup_block relies on.
local blocks = {
	{ "Basic Latin", 0x000000, 0x00007F },
	{ "Latin-1 Supplement", 0x000080, 0x0000FF },
	{ "Latin Extended-A", 0x000100, 0x00017F },
	{ "Latin Extended-B", 0x000180, 0x00024F },
	{ "IPA Extensions", 0x000250, 0x0002AF },
	{ "Spacing Modifier Letters", 0x0002B0, 0x0002FF },
	{ "Combining Diacritical Marks", 0x000300, 0x00036F },
	{ "Greek and Coptic", 0x000370, 0x0003FF },
	{ "Cyrillic", 0x000400, 0x0004FF },
	{ "Cyrillic Supplement", 0x000500, 0x00052F },
	{ "Armenian", 0x000530, 0x00058F },
	{ "Hebrew", 0x000590, 0x0005FF },
	{ "Arabic", 0x000600, 0x0006FF },
	{ "Syriac", 0x000700, 0x00074F },
	{ "Arabic Supplement", 0x000750, 0x00077F },
	{ "Thaana", 0x000780, 0x0007BF },
	{ "NKo", 0x0007C0, 0x0007FF },
	{ "Samaritan", 0x000800, 0x00083F },
	{ "Mandaic", 0x000840, 0x00085F },
	{ "Syriac Supplement", 0x000860, 0x00086F },
	{ "Arabic Extended-A", 0x0008A0, 0x0008FF },
	{ "Devanagari", 0x000900, 0x00097F },
	{ "Bengali", 0x000980, 0x0009FF },
	{ "Gurmukhi", 0x000A00, 0x000A7F },
	{ "Gujarati", 0x000A80, 0x000AFF },
	{ "Oriya", 0x000B00, 0x000B7F },
	{ "Tamil", 0x000B80, 0x000BFF },
	{ "Telugu", 0x000C00, 0x000C7F },
	{ "Kannada", 0x000C80, 0x000CFF },
	{ "Malayalam", 0x000D00, 0x000D7F },
	{ "Sinhala", 0x000D80, 0x000DFF },
	{ "Thai", 0x000E00, 0x000E7F },
	{ "Lao", 0x000E80, 0x000EFF },
	{ "Tibetan", 0x000F00, 0x000FFF },
	{ "Myanmar", 0x001000, 0x00109F },
	{ "Georgian", 0x0010A0, 0x0010FF },
	{ "Hangul Jamo", 0x001100, 0x0011FF },
	{ "Ethiopic", 0x001200, 0x00137F },
	{ "Ethiopic Supplement", 0x001380, 0x00139F },
	{ "Cherokee", 0x0013A0, 0x0013FF },
	{ "Unified Canadian Aboriginal Syllabics", 0x001400, 0x00167F },
	{ "Ogham", 0x001680, 0x00169F },
	{ "Runic", 0x0016A0, 0x0016FF },
	{ "Tagalog", 0x001700, 0x00171F },
	{ "Hanunoo", 0x001720, 0x00173F },
	{ "Buhid", 0x001740, 0x00175F },
	{ "Tagbanwa", 0x001760, 0x00177F },
	{ "Khmer", 0x001780, 0x0017FF },
	{ "Mongolian", 0x001800, 0x0018AF },
	{ "Unified Canadian Aboriginal Syllabics Extended", 0x0018B0, 0x0018FF },
	{ "Limbu", 0x001900, 0x00194F },
	{ "Tai Le", 0x001950, 0x00197F },
	{ "New Tai Lue", 0x001980, 0x0019DF },
	{ "Khmer Symbols", 0x0019E0, 0x0019FF },
	{ "Buginese", 0x001A00, 0x001A1F },
	{ "Tai Tham", 0x001A20, 0x001AAF },
	{ "Combining Diacritical Marks Extended", 0x001AB0, 0x001AFF },
	{ "Balinese", 0x001B00, 0x001B7F },
	{ "Sundanese", 0x001B80, 0x001BBF },
	{ "Batak", 0x001BC0, 0x001BFF },
	{ "Lepcha", 0x001C00, 0x001C4F },
	{ "Ol Chiki", 0x001C50, 0x001C7F },
	{ "Cyrillic Extended-C", 0x001C80, 0x001C8F },
	{ "Georgian Extended", 0x001C90, 0x001CBF },
	{ "Sundanese Supplement", 0x001CC0, 0x001CCF },
	{ "Vedic Extensions", 0x001CD0, 0x001CFF },
	{ "Phonetic Extensions", 0x001D00, 0x001D7F },
	{ "Phonetic Extensions Supplement", 0x001D80, 0x001DBF },
	{ "Combining Diacritical Marks Supplement", 0x001DC0, 0x001DFF },
	{ "Latin Extended Additional", 0x001E00, 0x001EFF },
	{ "Greek Extended", 0x001F00, 0x001FFF },
	{ "General Punctuation", 0x002000, 0x00206F },
	{ "Superscripts and Subscripts", 0x002070, 0x00209F },
	{ "Currency Symbols", 0x0020A0, 0x0020CF },
	{ "Combining Diacritical Marks for Symbols", 0x0020D0, 0x0020FF },
	{ "Letterlike Symbols", 0x002100, 0x00214F },
	{ "Number Forms", 0x002150, 0x00218F },
	{ "Arrows", 0x002190, 0x0021FF },
	{ "Mathematical Operators", 0x002200, 0x0022FF },
	{ "Miscellaneous Technical", 0x002300, 0x0023FF },
	{ "Control Pictures", 0x002400, 0x00243F },
	{ "Optical Character Recognition", 0x002440, 0x00245F },
	{ "Enclosed Alphanumerics", 0x002460, 0x0024FF },
	{ "Box Drawing", 0x002500, 0x00257F },
	{ "Block Elements", 0x002580, 0x00259F },
	{ "Geometric Shapes", 0x0025A0, 0x0025FF },
	{ "Miscellaneous Symbols", 0x002600, 0x0026FF },
	{ "Dingbats", 0x002700, 0x0027BF },
	{ "Miscellaneous Mathematical Symbols-A", 0x0027C0, 0x0027EF },
	{ "Supplemental Arrows-A", 0x0027F0, 0x0027FF },
	{ "Braille Patterns", 0x002800, 0x0028FF },
	{ "Supplemental Arrows-B", 0x002900, 0x00297F },
	{ "Miscellaneous Mathematical Symbols-B", 0x002980, 0x0029FF },
	{ "Supplemental Mathematical Operators", 0x002A00, 0x002AFF },
	{ "Miscellaneous Symbols and Arrows", 0x002B00, 0x002BFF },
	{ "Glagolitic", 0x002C00, 0x002C5F },
	{ "Latin Extended-C", 0x002C60, 0x002C7F },
	{ "Coptic", 0x002C80, 0x002CFF },
	{ "Georgian Supplement", 0x002D00, 0x002D2F },
	{ "Tifinagh", 0x002D30, 0x002D7F },
	{ "Ethiopic Extended", 0x002D80, 0x002DDF },
	{ "Cyrillic Extended-A", 0x002DE0, 0x002DFF },
	{ "Supplemental Punctuation", 0x002E00, 0x002E7F },
	{ "CJK Radicals Supplement", 0x002E80, 0x002EFF },
	{ "Kangxi Radicals", 0x002F00, 0x002FDF },
	{ "Ideographic Description Characters", 0x002FF0, 0x002FFF },
	{ "CJK Symbols and Punctuation", 0x003000, 0x00303F },
	{ "Hiragana", 0x003040, 0x00309F },
	{ "Katakana", 0x0030A0, 0x0030FF },
	{ "Bopomofo", 0x003100, 0x00312F },
	{ "Hangul Compatibility Jamo", 0x003130, 0x00318F },
	{ "Kanbun", 0x003190, 0x00319F },
	{ "Bopomofo Extended", 0x0031A0, 0x0031BF },
	{ "CJK Strokes", 0x0031C0, 0x0031EF },
	{ "Katakana Phonetic Extensions", 0x0031F0, 0x0031FF },
	{ "Enclosed CJK Letters and Months", 0x003200, 0x0032FF },
	{ "CJK Compatibility", 0x003300, 0x0033FF },
	{ "CJK Unified Ideographs Extension A", 0x003400, 0x004DBF },
	{ "Yijing Hexagram Symbols", 0x004DC0, 0x004DFF },
	{ "CJK Unified Ideographs", 0x004E00, 0x009FFF },
	{ "Yi Syllables", 0x00A000, 0x00A48F },
	{ "Yi Radicals", 0x00A490, 0x00A4CF },
	{ "Lisu", 0x00A4D0, 0x00A4FF },
	{ "Vai", 0x00A500, 0x00A63F },
	{ "Cyrillic Extended-B", 0x00A640, 0x00A69F },
	{ "Bamum", 0x00A6A0, 0x00A6FF },
	{ "Modifier Tone Letters", 0x00A700, 0x00A71F },
	{ "Latin Extended-D", 0x00A720, 0x00A7FF },
	{ "Syloti Nagri", 0x00A800, 0x00A82F },
	{ "Common Indic Number Forms", 0x00A830, 0x00A83F },
	{ "Phags-pa", 0x00A840, 0x00A87F },
	{ "Saurashtra", 0x00A880, 0x00A8DF },
	{ "Devanagari Extended", 0x00A8E0, 0x00A8FF },
	{ "Kayah Li", 0x00A900, 0x00A92F },
	{ "Rejang", 0x00A930, 0x00A95F },
	{ "Hangul Jamo Extended-A", 0x00A960, 0x00A97F },
	{ "Javanese", 0x00A980, 0x00A9DF },
	{ "Myanmar Extended-B", 0x00A9E0, 0x00A9FF },
	{ "Cham", 0x00AA00, 0x00AA5F },
	{ "Myanmar Extended-A", 0x00AA60, 0x00AA7F },
	{ "Tai Viet", 0x00AA80, 0x00AADF },
	{ "Meetei Mayek Extensions", 0x00AAE0, 0x00AAFF },
	{ "Ethiopic Extended-A", 0x00AB00, 0x00AB2F },
	{ "Latin Extended-E", 0x00AB30, 0x00AB6F },
	{ "Cherokee Supplement", 0x00AB70, 0x00ABBF },
	{ "Meetei Mayek", 0x00ABC0, 0x00ABFF },
	{ "Hangul Syllables", 0x00AC00, 0x00D7AF },
	{ "Hangul Jamo Extended-B", 0x00D7B0, 0x00D7FF },
	{ "High Surrogates", 0x00D800, 0x00DB7F },
	{ "High Private Use Surrogates", 0x00DB80, 0x00DBFF },
	{ "Low Surrogates", 0x00DC00, 0x00DFFF },
	{ "Private Use Area", 0x00E000, 0x00F8FF },
	{ "CJK Compatibility Ideographs", 0x00F900, 0x00FAFF },
	{ "Alphabetic Presentation Forms", 0x00FB00, 0x00FB4F },
	{ "Arabic Presentation Forms-A", 0x00FB50, 0x00FDFF },
	{ "Variation Selectors", 0x00FE00, 0x00FE0F },
	{ "Vertical Forms", 0x00FE10, 0x00FE1F },
	{ "Combining Half Marks", 0x00FE20, 0x00FE2F },
	{ "CJK Compatibility Forms", 0x00FE30, 0x00FE4F },
	{ "Small Form Variants", 0x00FE50, 0x00FE6F },
	{ "Arabic Presentation Forms-B", 0x00FE70, 0x00FEFF },
	{ "Halfwidth and Fullwidth Forms", 0x00FF00, 0x00FFEF },
	{ "Specials", 0x00FFF0, 0x00FFFF },
	{ "Linear B Syllabary", 0x010000, 0x01007F },
	{ "Linear B Ideograms", 0x010080, 0x0100FF },
	{ "Aegean Numbers", 0x010100, 0x01013F },
	{ "Ancient Greek Numbers", 0x010140, 0x01018F },
	{ "Ancient Symbols", 0x010190, 0x0101CF },
	{ "Phaistos Disc", 0x0101D0, 0x0101FF },
	{ "Lycian", 0x010280, 0x01029F },
	{ "Carian", 0x0102A0, 0x0102DF },
	{ "Coptic Epact Numbers", 0x0102E0, 0x0102FF },
	{ "Old Italic", 0x010300, 0x01032F },
	{ "Gothic", 0x010330, 0x01034F },
	{ "Old Permic", 0x010350, 0x01037F },
	{ "Ugaritic", 0x010380, 0x01039F },
	{ "Old Persian", 0x0103A0, 0x0103DF },
	{ "Deseret", 0x010400, 0x01044F },
	{ "Shavian", 0x010450, 0x01047F },
	{ "Osmanya", 0x010480, 0x0104AF },
	{ "Osage", 0x0104B0, 0x0104FF },
	{ "Elbasan", 0x010500, 0x01052F },
	{ "Caucasian Albanian", 0x010530, 0x01056F },
	{ "Linear A", 0x010600, 0x01077F },
	{ "Cypriot Syllabary", 0x010800, 0x01083F },
	{ "Imperial Aramaic", 0x010840, 0x01085F },
	{ "Palmyrene", 0x010860, 0x01087F },
	{ "Nabataean", 0x010880, 0x0108AF },
	{ "Hatran", 0x0108E0, 0x0108FF },
	{ "Phoenician", 0x010900, 0x01091F },
	{ "Lydian", 0x010920, 0x01093F },
	{ "Meroitic Hieroglyphs", 0x010980, 0x01099F },
	{ "Meroitic Cursive", 0x0109A0, 0x0109FF },
	{ "Kharoshthi", 0x010A00, 0x010A5F },
	{ "Old South Arabian", 0x010A60, 0x010A7F },
	{ "Old North Arabian", 0x010A80, 0x010A9F },
	{ "Manichaean", 0x010AC0, 0x010AFF },
	{ "Avestan", 0x010B00, 0x010B3F },
	{ "Inscriptional Parthian", 0x010B40, 0x010B5F },
	{ "Inscriptional Pahlavi", 0x010B60, 0x010B7F },
	{ "Psalter Pahlavi", 0x010B80, 0x010BAF },
	{ "Old Turkic", 0x010C00, 0x010C4F },
	{ "Old Hungarian", 0x010C80, 0x010CFF },
	{ "Hanifi Rohingya", 0x010D00, 0x010D3F },
	{ "Rumi Numeral Symbols", 0x010E60, 0x010E7F },
	{ "Old Sogdian", 0x010F00, 0x010F2F },
	{ "Sogdian", 0x010F30, 0x010F6F },
	{ "Brahmi", 0x011000, 0x01107F },
	{ "Kaithi", 0x011080, 0x0110CF },
	{ "Sora Sompeng", 0x0110D0, 0x0110FF },
	{ "Chakma", 0x011100, 0x01114F },
	{ "Mahajani", 0x011150, 0x01117F },
	{ "Sharada", 0x011180, 0x0111DF },
	{ "Sinhala Archaic Numbers", 0x0111E0, 0x0111FF },
	{ "Khojki", 0x011200, 0x01124F },
	{ "Multani", 0x011280, 0x0112AF },
	{ "Khudawadi", 0x0112B0, 0x0112FF },
	{ "Grantha", 0x011300, 0x01137F },
	{ "Newa", 0x011400, 0x01147F },
	{ "Tirhuta", 0x011480, 0x0114DF },
	{ "Siddham", 0x011580, 0x0115FF },
	{ "Modi", 0x011600, 0x01165F },
	{ "Mongolian Supplement", 0x011660, 0x01167F },
	{ "Takri", 0x011680, 0x0116CF },
	{ "Ahom", 0x011700, 0x01173F },
	{ "Dogra", 0x011800, 0x01184F },
	{ "Warang Citi", 0x0118A0, 0x0118FF },
	{ "Zanabazar Square", 0x011A00, 0x011A4F },
	{ "Soyombo", 0x011A50, 0x011AAF },
	{ "Pau Cin Hau", 0x011AC0, 0x011AFF },
	{ "Bhaiksuki", 0x011C00, 0x011C6F },
	{ "Marchen", 0x011C70, 0x011CBF },
	{ "Masaram Gondi", 0x011D00, 0x011D5F },
	{ "Gunjala Gondi", 0x011D60, 0x011DAF },
	{ "Makasar", 0x011EE0, 0x011EFF },
	{ "Cuneiform", 0x012000, 0x0123FF },
	{ "Cuneiform Numbers and Punctuation", 0x012400, 0x01247F },
	{ "Early Dynastic Cuneiform", 0x012480, 0x01254F },
	{ "Egyptian Hieroglyphs", 0x013000, 0x01342F },
	{ "Anatolian Hieroglyphs", 0x014400, 0x01467F },
	{ "Bamum Supplement", 0x016800, 0x016A3F },
	{ "Mro", 0x016A40, 0x016A6F },
	{ "Bassa Vah", 0x016AD0, 0x016AFF },
	{ "Pahawh Hmong", 0x016B00, 0x016B8F },
	{ "Medefaidrin", 0x016E40, 0x016E9F },
	{ "Miao", 0x016F00, 0x016F9F },
	{ "Ideographic Symbols and Punctuation", 0x016FE0, 0x016FFF },
	{ "Tangut", 0x017000, 0x0187FF },
	{ "Tangut Components", 0x018800, 0x018AFF },
	{ "Kana Supplement", 0x01B000, 0x01B0FF },
	{ "Kana Extended-A", 0x01B100, 0x01B12F },
	{ "Nushu", 0x01B170, 0x01B2FF },
	{ "Duployan", 0x01BC00, 0x01BC9F },
	{ "Shorthand Format Controls", 0x01BCA0, 0x01BCAF },
	{ "Byzantine Musical Symbols", 0x01D000, 0x01D0FF },
	{ "Musical Symbols", 0x01D100, 0x01D1FF },
	{ "Ancient Greek Musical Notation", 0x01D200, 0x01D24F },
	{ "Mayan Numerals", 0x01D2E0, 0x01D2FF },
	{ "Tai Xuan Jing Symbols", 0x01D300, 0x01D35F },
	{ "Counting Rod Numerals", 0x01D360, 0x01D37F },
	{ "Mathematical Alphanumeric Symbols", 0x01D400, 0x01D7FF },
	{ "Sutton SignWriting", 0x01D800, 0x01DAAF },
	{ "Glagolitic Supplement", 0x01E000, 0x01E02F },
	{ "Mende Kikakui", 0x01E800, 0x01E8DF },
	{ "Adlam", 0x01E900, 0x01E95F },
	{ "Indic Siyaq Numbers", 0x01EC70, 0x01ECBF },
	{ "Arabic Mathematical Alphabetic Symbols", 0x01EE00, 0x01EEFF },
	{ "Mahjong Tiles", 0x01F000, 0x01F02F },
	{ "Domino Tiles", 0x01F030, 0x01F09F },
	{ "Playing Cards", 0x01F0A0, 0x01F0FF },
	{ "Enclosed Alphanumeric Supplement", 0x01F100, 0x01F1FF },
	{ "Enclosed Ideographic Supplement", 0x01F200, 0x01F2FF },
	{ "Miscellaneous Symbols and Pictographs", 0x01F300, 0x01F5FF },
	{ "Emoticons", 0x01F600, 0x01F64F },
	{ "Ornamental Dingbats", 0x01F650, 0x01F67F },
	{ "Transport and Map Symbols", 0x01F680, 0x01F6FF },
	{ "Alchemical Symbols", 0x01F700, 0x01F77F },
	{ "Geometric Shapes Extended", 0x01F780, 0x01F7FF },
	{ "Supplemental Arrows-C", 0x01F800, 0x01F8FF },
	{ "Supplemental Symbols and Pictographs", 0x01F900, 0x01F9FF },
	{ "Chess Symbols", 0x01FA00, 0x01FA6F },
	{ "CJK Unified Ideographs Extension B", 0x020000, 0x02A6DF },
	{ "CJK Unified Ideographs Extension C", 0x02A700, 0x02B73F },
	{ "CJK Unified Ideographs Extension D", 0x02B740, 0x02B81F },
	{ "CJK Unified Ideographs Extension E", 0x02B820, 0x02CEAF },
	{ "CJK Unified Ideographs Extension F", 0x02CEB0, 0x02EBEF },
	{ "CJK Compatibility Ideographs Supplement", 0x02F800, 0x02FA1F },
	{ "Tags", 0x0E0000, 0x0E007F },
	{ "Variation Selectors Supplement", 0x0E0100, 0x0E01EF },
	{ "Supplementary Private Use Area-A", 0x0F0000, 0x0FFFFF },
	{ "Supplementary Private Use Area-B", 0x100000, 0x10FFFF },
}

-- Number of entries, stored once so lookups do not have to recompute it.
blocks.length = #blocks

-- Iterator factory over the blocks table, for use in a generic for loop:
--   for i, name, first, last in export.enum_blocks() do ... end
-- Each step yields the entry's index followed by the entry's fields.
function export.enum_blocks()
	local function next_block(block_list, i)
		i = i + 1
		local data = block_list[i]
		if not data then
			return nil
		end
		return i, unpack(data)
	end

	return next_block, blocks, 0
end

-- Returns the name of the plane containing a codepoint, falling back to
-- "Plane N" for planes that have no entry in the planes table.
function export.lookup_plane(codepoint)
	local plane_number = floor(codepoint / 0x10000)
	local plane_name = planes[plane_number]
	if plane_name then
		return plane_name
	end
	return ("Plane %u"):format(plane_number)
end

-- Binary search, to avoid iterating over entire table in order to look up the
-- higher codepoints.
-- Returns the name of the block containing the codepoint; raises an error if
-- the codepoint lies in no block.
function export.lookup_block(codepoint)
	local low, high = 1, blocks.length or #blocks

	while low <= high do
		local middle = floor((low + high) / 2)
		local entry = blocks[middle]

		if codepoint > entry[3] then
			-- Past the end of this block: continue in the upper half.
			low = middle + 1
		elseif codepoint >= entry[2] then
			return entry[1]
		else
			-- Before the start of this block: continue in the lower half.
			high = middle - 1
		end
	end

	error(string.format("No block found for codepoint U+%04X.", codepoint))
end

-- Returns the first and last codepoint of the block with the given name, or
-- nothing if no block has that name. If several entries shared a name, the
-- one latest in the table would be used.
function export.get_block_range(name)
	for index = #blocks, 1, -1 do
		local entry = blocks[index]
		if entry[1] == name then
			return entry[2], entry[3]
		end
	end
end

-- Returns true if every codepoint of pagename is allowed and printable and at
-- least one of them is not a space separator; returns false otherwise.
function export.is_valid_pagename(pagename)
	-- Individual codepoints that are rejected outright.
	local forbidden = {
		[0x0023] = true, -- #
		[0x005B] = true, -- [
		[0x005D] = true, -- ]
		[0x007B] = true, -- {
		[0x007C] = true, -- |
		[0x007D] = true, -- }
		[0x180E] = true, -- MONGOLIAN VOWEL SEPARATOR
		[0xFFFD] = true, -- REPLACEMENT CHARACTER
	}
	local seen_non_space = false

	for cp in mw.ustring.gcodepoint(pagename) do
		-- 0x2000-0x200A: spaces in General Punctuation block
		if forbidden[cp] or (cp >= 0x2000 and cp <= 0x200A) then
			return false
		end

		local printable, category = export.is_printable(cp)
		if not printable then
			return false
		end

		if category ~= "space-separator" then
			seen_non_space = true
		end
	end

	return seen_non_space
end

-- Returns the array items of `what` from index `from` (default 1) onward as
-- multiple values. The items are first copied into a fresh table via ipairs
-- (presumably because unpack cannot be applied directly to tables loaded with
-- mw.loadData -- TODO confirm).
local function manual_unpack(what, from)
	local first = from or 1
	local copied = {}
	for index, value in ipairs(what) do
		if index >= first then
			copied[#copied + 1] = value
		end
	end
	return unpack(copied)
end

-- Builds a memoizing lookup function over data consisting of a table of
-- single codepoints and a list of { first, last, value... } ranges sorted by
-- first codepoint.
--   loader:     called once, on first use; must return singles, ranges.
--   match_func: called as match_func(codepoint, value...) to build the result.
--   ...:        default value(s) handed to match_func for a codepoint that
--               falls in a gap before or between ranges.
-- Returns a function taking a codepoint and returning match_func's result.
local function memo_lookup(loader, match_func, ...)
	local dots = { ... }
	local cache = {}
	local singles, ranges

	return function (codepoint)
		-- Load the data lazily, on the first lookup only.
		if not singles then
			singles, ranges = loader()
		end

		if singles[codepoint] then
			return match_func(codepoint, singles[codepoint])
		end

		-- Ranges and gaps that earlier lookups have already resolved.
		for _, range in pairs(cache) do
			if (range[1] <= codepoint) and (codepoint <= range[2]) then
				return match_func(codepoint, unpack(range, 3))
			end
		end

		-- ipairs rather than pairs: the early exit below is only correct if
		-- the ranges are visited in ascending order, and non-array keys in
		-- the ranges table must be skipped.
		local lastlast = -1 -- end of the previous range seen
		for _, range in ipairs(ranges) do
			if codepoint < range[1] then
				-- In the gap between the previous range and this one:
				-- remember the whole gap with the default value(s).
				table.insert(cache, { lastlast + 1, range[1] - 1, unpack(dots) })
				return match_func(codepoint, unpack(dots))
			elseif codepoint <= range[2] then
				table.insert(cache, { manual_unpack(range) })
				return match_func(codepoint, manual_unpack(range, 3))
			else
				lastlast = range[2]
			end
		end

		-- Beyond the last range: match_func receives no value at all.
		return match_func(codepoint)
	end
end

-- Get a codepoint's combining class value in Module:Unicode data/combining,
-- and return whether this value is not zero. Zero is assigned as the default
-- if the combining class value is not found in this data module.
-- That is, return true if character is combining, or false if it is not.
-- See http://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values for
-- more information.
export.is_combining = memo_lookup(
	-- Loader: runs on the first lookup only.
	function ()
		local m_comb = mw.loadData('Module:Unicode data/combining')
		return m_comb.single, m_comb.ranges
	end,
	function (codepoint, combining_class)
		return combining_class and combining_class ~= 0
			or false
	end,
	0
)

-- Returns str with a dotted circle (◌) inserted before every character for
-- which export.is_combining is true.
function export.add_dotted_circle(str)
	local function prefix_if_combining(char)
		if export.is_combining(mw.ustring.codepoint(char)) then
			return '◌' .. char
		end
	end

	-- Keep only the resulting string, not the substitution count.
	local result = mw.ustring.gsub(str, ".", prefix_if_combining)
	return result
end

-- Looks up a codepoint's value in Module:Unicode data/control, yielding
-- "assigned" for codepoints that have no value there.
local lookup_control = memo_lookup(
	-- Loader: runs on the first lookup only.
	function ()
		local m_cc = mw.loadData('Module:Unicode data/control')
		return m_cc.single, m_cc.ranges
	end,
	function (codepoint, ccc)
		return ccc or "assigned"
	end,
	"assigned"
)

-- Returns true unless lookup_control reports the codepoint as "unassigned".
function export.is_assigned(codepoint)
	local category = lookup_control(codepoint)
	return category ~= "unassigned"
end

-- Returns two values: whether the codepoint's control value is "assigned" or
-- "space-separator", and the control value itself.
function export.is_printable(codepoint)
	local category = lookup_control(codepoint)
	local printable = category == "assigned" or category == "space-separator"
	return printable, category
end

-- Returns two values: whether the codepoint's control value is
-- "space-separator", and the control value itself.
function export.is_whitespace(codepoint)
	local category = lookup_control(codepoint)
	local is_space = category == "space-separator"
	return is_space, category
end

-- to be used in language-neutral context only (e.g. character lists)

-- Map from script code to a character-class pattern "[...]" built from that
-- script's characters; filled in by get_script the first time it is needed.
local script_pats

-- Scripts that consist entirely of characters from another script.
-- get_script never builds a pattern for these codes.
local script_blacklist = {}
for _, code in ipairs({
	"Latf",
	"Hans",
	"Hant",
	"Kore",
	"Jpan",
	"fa-Arab",
	"kk-Arab",
	"ks-Arab",
	"ku-Arab",
	"mzn-Arab",
	"ota-Arab",
	"pa-Arab",
	"ps-Arab",
	"sd-Arab",
	"tt-Arab",
	"ug-Arab",
	"ur-Arab",
	"nv-Latn",
	"pjt-Latn",
	"Zyyy",
}) do
	script_blacklist[code] = true
end

--	Problem scripts: Grek and polytonic, Cyrl and Cyrs, Latn and Latinx.	In each key-value pair, the value should take precedence over the key.

-- Maps a script code to the script code that takes precedence over it when a
-- text matches both.
local overridden_by = {
	Cyrs = "Cyrl",
	polytonic = "Grek",
	Latinx = "Latn",
}

-- Map from character-class pattern to script code for patterns that have
-- already matched in get_script; these are tried before the full pattern list.
local script_cache = {}

-- Determines a script code for a codepoint (number) or a string by matching
-- it against the character patterns taken from Module:scripts/data.
--
-- Patterns that matched on an earlier call are tried first. Otherwise every
-- non-blacklisted script pattern is tried, and the first match (in pairs
-- order) is cached and returned. If the matching script appears as a key in
-- overridden_by and the overriding script's pattern also matches, the
-- overriding script is cached and returned instead.
--
-- Returns "None" when no pattern matches. Raises an error if the argument is
-- neither a number nor a string.
function export.get_script(codepoint)
	local arg_type = type(codepoint)
	local text
	if arg_type == "string" then
		text = codepoint
	elseif arg_type == "number" then
		text = mw.ustring.char(codepoint)
	else
		error("Argument to get_script should be a number (codepoint) or string.")
	end

	-- Fast path: patterns that have matched before. Cached scripts that can
	-- be overridden are skipped so the override check below still runs.
	for cached_pattern, cached_script in pairs(script_cache) do
		if mw.ustring.match(text, cached_pattern) and not overridden_by[cached_script] then
			return cached_script
		end
	end

	-- Build the pattern table on first use.
	if not script_pats then
		local m_scripts = mw.loadData("Module:scripts/data")
		script_pats = {}
		for code, data in pairs(m_scripts) do
			if data.characters and not script_blacklist[code] then
				script_pats[code] = "[" .. data.characters .. "]"
			end
		end
	end

	for code, pattern in pairs(script_pats) do
		if mw.ustring.match(text, pattern) then
			local winner = overridden_by[code]
			local winner_pattern = winner and script_pats[winner]
			if winner_pattern and mw.ustring.match(text, winner_pattern) then
				script_cache[winner_pattern] = winner
				return winner
			end
			script_cache[pattern] = code
			return code
		end
	end

	return "None"
end

-- Comparator for table.sort: orders ranges by their first element
-- (the start of the range), ascending.
local function sortRange(range1, range2)
	local start1, start2 = range1[1], range2[1]
	return start1 < start2
end

--	Binary search: more efficient for the longer lists of codepoint ranges than
--	for the shorter ones.
--
--	ranges: array of { first, last, ... } entries sorted by first element and
--	        assumed not to overlap; may be nil. Its size is read from
--	        ranges.length when present.
--	value:  the number to look for.
--	Returns the entry whose [first, last] interval contains value, or nil if
--	there is none (including when ranges is nil or empty).
local function binary_search(ranges, value)
	if not ranges then
		return nil
	end

	-- Can't use # because table is loaded by mw.loadData.
	local low = 1
	local high = ranges.length or require("Module:table").size(ranges)

	-- An empty list (high == 0) never enters the loop.
	while low <= high do
		local mid = floor((low + high) / 2)
		local range = ranges[mid]

		if value < range[1] then
			-- Value lies before this range: continue in the lower half.
			high = mid - 1
		elseif value <= range[2] then
			-- Value lies inside this range. Assumes there are no duplicates.
			return range
		else
			-- Value lies after this range: continue in the upper half.
			low = mid + 1
		end
	end

	return nil
end

-- Linear scan over an array of { first, last, value } entries that is sorted
-- by first element. Returns the value of the entry containing number.
-- Stops with nil as soon as an entry starts beyond number, and returns
-- nothing if the list is exhausted without a match.
local function look_up_in_order(number, ranges)
	for _, entry in ipairs(ranges) do
		if number < entry[1] then
			-- Every later entry starts even further along.
			return nil
		end
		if number <= entry[2] then
			return entry[3]
		end
	end
end

-- Save previously used codepoint ranges in case another character is in the
-- same range. Kept sorted by range start so look_up_in_order can scan it.
local ranges_cache = {}

--[=[
	Takes a codepoint or a character and finds the script code (if any) that is
	appropriate for it based on the codepoint, using the data module
	Module:Unicode data/scripts. The data module was generated from the
	patterns in Module:scripts/data using Module:User:Erutuon/script recognition.

	Converts the character to a codepoint. Returns a script code if the
	codepoint is in the list of individual characters, or if it is in one of
	the defined ranges in the 4096-character block that it belongs to, else
	returns "None".

	Raises an error if the argument is a string that is not exactly one
	character long, or if it is neither a string nor a number.
]=]
function export.char_to_script(char)
	local lookup = mw.loadData("Module:Unicode data/scripts")
	local t = type(char)
	local codepoint
	if t == "string" then
		-- mw.ustring.codepoint returns only the first codepoint unless an end
		-- index is given, so request two in order to detect a second
		-- character. An empty string yields no codepoint at all.
		local etc
		codepoint, etc = mw.ustring.codepoint(char, 1, 2)
		if etc or not codepoint then
			error("Argument to char_to_script should be a single character.")
		end
	elseif t == "number" then
		codepoint = char
	else
		error("Argument to char_to_script should be a string or a number, but its type is " .. t .. ".")
	end

	local individual_match = lookup.individual[codepoint]
	if individual_match then
		return individual_match
	end

	-- Try the ranges that previous calls have already found.
	local script = look_up_in_order(codepoint, ranges_cache)
	if script then
		return script
	end

	-- Index of the 4096-codepoint block containing this codepoint.
	local index = floor(codepoint / 0x1000)

	-- Runs of blocks that map to a single script.
	script = look_up_in_order(index, lookup.blocks)
	if script then
		return script
	end

	-- Ranges defined inside this particular block.
	local range = binary_search(lookup[index], codepoint)
	if range then
		table.insert(ranges_cache, range)
		table.sort(ranges_cache, sortRange)
		return range[3]
	end

	return "None"
end

-- Returns the script code that export.char_to_script assigns to the largest
-- number of characters in text. The text is split with a byte pattern that
-- matches one lead byte followed by any continuation bytes (\128-\191).
-- When several scripts share the highest count, the one encountered first in
-- pairs order is kept. Returns nil if the pattern matches nothing in text.
function export.find_best_script(text)
	local tally = {}
	for character in text:gmatch("[%z\1-\127\194-\244][\128-\191]*") do
		local code = export.char_to_script(character)
		tally[code] = 1 + (tally[code] or 0)
	end

	local best_script
	local greatest_count = 0
	for code, occurrences in pairs(tally) do
		if occurrences > greatest_count then
			best_script, greatest_count = code, occurrences
		end
	end

	return best_script
end

-- Page titles used for codepoints that get_entry_title does not return as
-- the bare character, keyed by codepoint.
local unsupported_title = {
	[0x0020] = "Unsupported titles/Space",
	[0x0023] = "Unsupported titles/Number sign",
	[0x002E] = "Unsupported titles/Full stop",
	[0x003A] = "Unsupported titles/Colon",
	[0x003C] = "Unsupported titles/Less than",
	[0x003E] = "Unsupported titles/Greater than",
	[0x005B] = "Unsupported titles/Left square bracket",
	[0x005D] = "Unsupported titles/Right square bracket",
	[0x005F] = "Unsupported titles/Low line",
	[0x007B] = "Unsupported titles/Left curly bracket",
	[0x007C] = "Unsupported titles/Vertical line",
	[0x007D] = "Unsupported titles/Right curly bracket",
	[0x1680] = "Unsupported titles/Ogham space",
	[0xFFFD] = "Unsupported titles/Replacement character",
}

-- Returns the page title to use for a codepoint: the entry in
-- unsupported_title if there is one; otherwise the character itself when
-- lookup_control classifies the codepoint as "assigned"; otherwise nil.
function export.get_entry_title(codepoint)
	local special_title = unsupported_title[codepoint]
	if special_title then
		return special_title
	end

	if lookup_control(codepoint) == "assigned" then
		return mw.ustring.char(codepoint)
	end

	return nil
end

return export