Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 2 additions & 95 deletions nototools/unicode_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
from nototools import tool_utils # parse_int_ranges

# Update this when we update the base version data we use
UNICODE_VERSION = 14.0
UNICODE_VERSION = 17.0

_data_is_loaded = False
_property_value_aliases_data = {}
Expand Down Expand Up @@ -935,53 +935,6 @@ def _read_emoji_test_data(data_string):
fe82b ; fully-qualified # ? unknown flag PUA codepoint
"""

# These are skin tone sequences that Unicode decided not to define. Android
# shipped with them, so we're stuck with them forever regardless of what
# Unicode says.
#
# This data is in the format of emoji-sequences.txt and emoji-zwj-sequences.txt.
# Each record below has three semicolon-separated fields followed by a trailing
# '#' comment:
#   <space-separated hex code points> ; <sequence type> ; <name> # <version>
# NOTE(review): the trailing "# 9.0" is presumably the Unicode/emoji version
# ("age") of the sequence -- confirm against _read_emoji_data.
_LEGACY_ANDROID_SEQUENCES = """
1F93C 1F3FB ; Emoji_Modifier_Sequence ; people wrestling: light skin tone # 9.0
1F93C 1F3FC ; Emoji_Modifier_Sequence ; people wrestling: medium-light skin tone # 9.0
1F93C 1F3FD ; Emoji_Modifier_Sequence ; people wrestling: medium skin tone # 9.0
1F93C 1F3FE ; Emoji_Modifier_Sequence ; people wrestling: medium-dark skin tone # 9.0
1F93C 1F3FF ; Emoji_Modifier_Sequence ; people wrestling: dark skin tone # 9.0
1F93C 1F3FB 200D 2642 FE0F ; Emoji_ZWJ_Sequence ; men wrestling: light skin tone # 9.0
1F93C 1F3FC 200D 2642 FE0F ; Emoji_ZWJ_Sequence ; men wrestling: medium-light skin tone # 9.0
1F93C 1F3FD 200D 2642 FE0F ; Emoji_ZWJ_Sequence ; men wrestling: medium skin tone # 9.0
1F93C 1F3FE 200D 2642 FE0F ; Emoji_ZWJ_Sequence ; men wrestling: medium-dark skin tone # 9.0
1F93C 1F3FF 200D 2642 FE0F ; Emoji_ZWJ_Sequence ; men wrestling: dark skin tone # 9.0
1F93C 1F3FB 200D 2640 FE0F ; Emoji_ZWJ_Sequence ; women wrestling: light skin tone # 9.0
1F93C 1F3FC 200D 2640 FE0F ; Emoji_ZWJ_Sequence ; women wrestling: medium-light skin tone # 9.0
1F93C 1F3FD 200D 2640 FE0F ; Emoji_ZWJ_Sequence ; women wrestling: medium skin tone # 9.0
1F93C 1F3FE 200D 2640 FE0F ; Emoji_ZWJ_Sequence ; women wrestling: medium-dark skin tone # 9.0
1F93C 1F3FF 200D 2640 FE0F ; Emoji_ZWJ_Sequence ; women wrestling: dark skin tone # 9.0
"""

# Defines how to insert the new sequences into the standard order data. Would
# have been nice to merge it into the above legacy data but that would have
# required a format change.
#
# The text is organized as three groups. Each group opens with a line that
# starts with '-' and carries a sequence plus a '#' name comment; the lines
# after it list the skin-tone variants of that sequence.
# NOTE(review): the '-' line is presumably the existing "key" sequence after
# which the following sequences are inserted -- confirm against
# _get_order_patch, which parses this text.
_LEGACY_ANDROID_ORDER = """
-1F93C # people wrestling
1F93C 1F3FB
1F93C 1F3FC
1F93C 1F3FD
1F93C 1F3FE
1F93C 1F3FF
-1F93C 200D 2642 FE0F # men wrestling
1F93C 1F3FB 200D 2642 FE0F
1F93C 1F3FC 200D 2642 FE0F
1F93C 1F3FD 200D 2642 FE0F
1F93C 1F3FE 200D 2642 FE0F
1F93C 1F3FF 200D 2642 FE0F
-1F93C 200D 2640 FE0F # women wrestling
1F93C 1F3FB 200D 2640 FE0F
1F93C 1F3FC 200D 2640 FE0F
1F93C 1F3FD 200D 2640 FE0F
1F93C 1F3FE 200D 2640 FE0F
1F93C 1F3FF 200D 2640 FE0F
"""


def _get_order_patch(order_text, seq_to_name):
"""Create a mapping from a key sequence to a list of sequence, name tuples.
Expand Down Expand Up @@ -1018,47 +971,6 @@ def get_sequence(seqtext):
return patch_map


def _get_android_order_patch():
    """Build the order patch described by the legacy Android data."""

    # The parsed data maps each sequence to (name, age, type); the order
    # patch only needs the name, so strip the rest.
    legacy = _read_emoji_data(_LEGACY_ANDROID_SEQUENCES.splitlines())
    names_by_seq = {}
    for seq, info in legacy.items():
        names_by_seq[seq] = info[0]
    return _get_order_patch(_LEGACY_ANDROID_ORDER, names_by_seq)


def _apply_order_patch(patch, group_list):
"""patch is a map from a key sequence to list of sequence, name pairs, and
group_list is an ordered list of sequence, group, subgroup, name tuples.
Iterate through the group list appending each item to a new list, and
after appending an item matching a key sequence, also append all of its
associated sequences in order using the same group and subgroup.
Return the new list. If there are any unused patches, raise an exception."""

result = []
patched = set()
for t in group_list:
result.append(t)
if t[0] in patch:
patched.add(t[0])
_, group, subgroup, _ = t
for seq, name in patch[t[0]]:
result.append((seq, group, subgroup, name))

unused = set(patch.keys()) - patched
if unused:
raise Exception(
"%d unused patch%s\n %s: "
% (
len(unused),
"" if len(unused) == 1 else "es",
"\n ".join(seq_to_string(seq) for seq in sorted(unused)),
)
)

return result


def _load_emoji_group_data():
global _emoji_group_data
if _emoji_group_data:
Expand All @@ -1070,10 +982,6 @@ def _load_emoji_group_data():
text = f.read()
group_list = _read_emoji_test_data(text)

# patch with android items
patch = _get_android_order_patch()
group_list = _apply_order_patch(patch, group_list)

group_list.extend(_read_emoji_test_data(_SUPPLEMENTAL_EMOJI_GROUP_DATA))
for i, (seq, group, subgroup, name) in enumerate(group_list):
if seq in _emoji_group_data:
Expand All @@ -1084,7 +992,7 @@ def _load_emoji_group_data():
print(" new value would be %s" % str((i, group, subgroup, name)))
_emoji_group_data[seq] = (i, group, subgroup, name)

assert len(group_list) == len(_emoji_group_data)
assert len(group_list) == len(_emoji_group_data), f"{len(group_list)} != {len(_emoji_group_data)}"


def get_emoji_group_data(seq):
Expand Down Expand Up @@ -1168,7 +1076,6 @@ def add_data(data):

for datafile in ["emoji-zwj-sequences.txt", "emoji-sequences.txt"]:
add_data(_read_emoji_data_file(datafile))
add_data(_read_emoji_data(_LEGACY_ANDROID_SEQUENCES.splitlines()))

_load_unicode_data_txt() # ensure character_names_data is populated
_load_emoji_data() # ensure presentation_default_text is populated
Expand Down
3 changes: 2 additions & 1 deletion tests/unicode_data_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ def test_defined(self):
"""Tests the is_defined() method."""
self.assertTrue(unicode_data.is_defined(0x20BD))
self.assertFalse(unicode_data.is_defined(0xFDD0))
self.assertFalse(unicode_data.is_defined(0x088F))
# This assertion breaks whenever the chosen codepoint gets assigned in a new Unicode version
self.assertFalse(unicode_data.is_defined(0x0892))
# CJK ranges
self.assertTrue(unicode_data.is_defined(0x3400))
self.assertTrue(unicode_data.is_defined(0x4DB5))
Expand Down
8 changes: 4 additions & 4 deletions third_party/ucd/BidiMirroring.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# BidiMirroring-16.0.0.txt
# Date: 2024-01-30
# © 2024 Unicode®, Inc.
# BidiMirroring-17.0.0.txt
# Date: 2025-08-01
# © 2025 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use and license, see https://www.unicode.org/terms_of_use.html
#
Expand All @@ -16,7 +16,7 @@
# value, for which there is another Unicode character that typically has a glyph
# that is the mirror image of the original character's glyph.
#
# The repertoire covered by the file is Unicode 16.0.0.
# The repertoire covered by the file is Unicode 17.0.0.
#
# The file contains a list of lines with mappings from one code point
# to another one for character-based mirroring.
Expand Down
14 changes: 11 additions & 3 deletions third_party/ucd/Blocks.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Blocks-16.0.0.txt
# Date: 2024-02-02
# © 2024 Unicode®, Inc.
# Blocks-17.0.0.txt
# Date: 2025-08-01
# © 2025 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use and license, see https://www.unicode.org/terms_of_use.html
#
Expand Down Expand Up @@ -228,6 +228,7 @@ FFF0..FFFF; Specials
108E0..108FF; Hatran
10900..1091F; Phoenician
10920..1093F; Lydian
10940..1095F; Sidetic
10980..1099F; Meroitic Hieroglyphs
109A0..109FF; Meroitic Cursive
10A00..10A5F; Kharoshthi
Expand Down Expand Up @@ -279,11 +280,13 @@ FFF0..FFFF; Specials
11AB0..11ABF; Unified Canadian Aboriginal Syllabics Extended-A
11AC0..11AFF; Pau Cin Hau
11B00..11B5F; Devanagari Extended-A
11B60..11B7F; Sharada Supplement
11BC0..11BFF; Sunuwar
11C00..11C6F; Bhaiksuki
11C70..11CBF; Marchen
11D00..11D5F; Masaram Gondi
11D60..11DAF; Gunjala Gondi
11DB0..11DEF; Tolong Siki
11EE0..11EFF; Makasar
11F00..11F5F; Kawi
11FB0..11FBF; Lisu Supplement
Expand All @@ -304,12 +307,14 @@ FFF0..FFFF; Specials
16B00..16B8F; Pahawh Hmong
16D40..16D7F; Kirat Rai
16E40..16E9F; Medefaidrin
16EA0..16EDF; Beria Erfe
16F00..16F9F; Miao
16FE0..16FFF; Ideographic Symbols and Punctuation
17000..187FF; Tangut
18800..18AFF; Tangut Components
18B00..18CFF; Khitan Small Script
18D00..18D7F; Tangut Supplement
18D80..18DFF; Tangut Components Supplement
1AFF0..1AFFF; Kana Extended-B
1B000..1B0FF; Kana Supplement
1B100..1B12F; Kana Extended-A
Expand All @@ -318,6 +323,7 @@ FFF0..FFFF; Specials
1BC00..1BC9F; Duployan
1BCA0..1BCAF; Shorthand Format Controls
1CC00..1CEBF; Symbols for Legacy Computing Supplement
1CEC0..1CEFF; Miscellaneous Symbols Supplement
1CF00..1CFCF; Znamenny Musical Notation
1D000..1D0FF; Byzantine Musical Symbols
1D100..1D1FF; Musical Symbols
Expand All @@ -336,6 +342,7 @@ FFF0..FFFF; Specials
1E2C0..1E2FF; Wancho
1E4D0..1E4FF; Nag Mundari
1E5D0..1E5FF; Ol Onal
1E6C0..1E6FF; Tai Yo
1E7E0..1E7FF; Ethiopic Extended-B
1E800..1E8DF; Mende Kikakui
1E900..1E95F; Adlam
Expand Down Expand Up @@ -367,6 +374,7 @@ FFF0..FFFF; Specials
2F800..2FA1F; CJK Compatibility Ideographs Supplement
30000..3134F; CJK Unified Ideographs Extension G
31350..323AF; CJK Unified Ideographs Extension H
323B0..3347F; CJK Unified Ideographs Extension J
E0000..E007F; Tags
E0100..E01EF; Variation Selectors Supplement
F0000..FFFFF; Supplementary Private Use Area-A
Expand Down
65 changes: 61 additions & 4 deletions third_party/ucd/DerivedAge.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# DerivedAge-16.0.0.txt
# Date: 2024-04-30, 21:48:12 GMT
# © 2024 Unicode®, Inc.
# DerivedAge-17.0.0.txt
# Date: 2025-07-30, 23:54:38 GMT
# © 2025 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use and license, see https://www.unicode.org/terms_of_use.html
#
Expand All @@ -15,7 +15,8 @@
# - The term 'assigned' means that a previously reserved code point was assigned
# to be a character (graphic, format, control, or private-use);
# a noncharacter code point; or a surrogate code point.
# For more information, see The Unicode Standard Section 2.4
# For more information, see the
# "General Structure" / "Code Points and Characters" section of the core specification.
#
# - Versions are only tracked from 1.1 onwards, since version 1.0
# predated changes required by the ISO 10646 merger.
Expand Down Expand Up @@ -2059,4 +2060,60 @@ A7DA..A7DC ; 16.0 # [3] LATIN CAPITAL LETTER LAMBDA..LATIN CAPITAL LETTER L

# Total code points: 5185

# ================================================

# Age=V17_0

# Newly assigned in Unicode 17.0.0 (September, 2025)

088F ; 17.0 # ARABIC LETTER NOON WITH RING ABOVE
0C5C ; 17.0 # TELUGU ARCHAIC SHRII
0CDC ; 17.0 # KANNADA ARCHAIC SHRII
1ACF..1ADD ; 17.0 # [15] COMBINING DOUBLE CARON..COMBINING DOT-AND-RING BELOW
1AE0..1AEB ; 17.0 # [12] COMBINING LEFT TACK ABOVE..COMBINING DOUBLE RIGHTWARDS ARROW ABOVE
20C1 ; 17.0 # SAUDI RIYAL SIGN
2B96 ; 17.0 # EQUALS SIGN WITH INFINITY ABOVE
A7CE..A7CF ; 17.0 # [2] LATIN CAPITAL LETTER PHARYNGEAL VOICED FRICATIVE..LATIN SMALL LETTER PHARYNGEAL VOICED FRICATIVE
A7D2 ; 17.0 # LATIN CAPITAL LETTER DOUBLE THORN
A7D4 ; 17.0 # LATIN CAPITAL LETTER DOUBLE WYNN
A7F1 ; 17.0 # MODIFIER LETTER CAPITAL S
FBC3..FBD2 ; 17.0 # [16] ARABIC LIGATURE JALLA WA-ALAA..ARABIC LIGATURE ALAYHI AR-RAHMAH
FD90..FD91 ; 17.0 # [2] ARABIC LIGATURE RAHMATU ALLAAHI ALAYH..ARABIC LIGATURE RAHMATU ALLAAHI ALAYHAA
FDC8..FDCE ; 17.0 # [7] ARABIC LIGATURE RAHIMAHU ALLAAH TAAALAA..ARABIC LIGATURE KARRAMA ALLAAHU WAJHAH
10940..10959 ; 17.0 # [26] SIDETIC LETTER N01..SIDETIC LETTER N26
10EC5..10EC7 ; 17.0 # [3] ARABIC SMALL YEH BARREE WITH TWO DOTS BELOW..ARABIC LETTER YEH WITH FOUR DOTS BELOW
10ED0..10ED8 ; 17.0 # [9] ARABIC BIBLICAL END OF VERSE..ARABIC LIGATURE NAWWARA ALLAAHU MARQADAH
10EFA..10EFB ; 17.0 # [2] ARABIC DOUBLE VERTICAL BAR BELOW..ARABIC SMALL LOW NOON
11B60..11B67 ; 17.0 # [8] SHARADA VOWEL SIGN OE..SHARADA VOWEL SIGN CANDRA O
11DB0..11DDB ; 17.0 # [44] TOLONG SIKI LETTER I..TOLONG SIKI UNGGA
11DE0..11DE9 ; 17.0 # [10] TOLONG SIKI DIGIT ZERO..TOLONG SIKI DIGIT NINE
16EA0..16EB8 ; 17.0 # [25] BERIA ERFE CAPITAL LETTER ARKAB..BERIA ERFE CAPITAL LETTER AY
16EBB..16ED3 ; 17.0 # [25] BERIA ERFE SMALL LETTER ARKAB..BERIA ERFE SMALL LETTER AY
16FF2..16FF6 ; 17.0 # [5] CHINESE SMALL SIMPLIFIED ER..YANGQIN SIGN SLOW TWO BEATS
187F8..187FF ; 17.0 # [8] TANGUT IDEOGRAPH-187F8..TANGUT IDEOGRAPH-187FF
18D09..18D1E ; 17.0 # [22] TANGUT IDEOGRAPH-18D09..TANGUT IDEOGRAPH-18D1E
18D80..18DF2 ; 17.0 # [115] TANGUT COMPONENT-769..TANGUT COMPONENT-883
1CCFA..1CCFC ; 17.0 # [3] SNAKE SYMBOL..NOSE SYMBOL
1CEBA..1CED0 ; 17.0 # [23] FRAGILE SYMBOL..LEUKOTHEA
1CEE0..1CEF0 ; 17.0 # [17] GEOMANTIC FIGURE POPULUS..MEDIUM SMALL WHITE CIRCLE WITH HORIZONTAL BAR
1E6C0..1E6DE ; 17.0 # [31] TAI YO LETTER LOW KO..TAI YO LETTER HIGH KVO
1E6E0..1E6F5 ; 17.0 # [22] TAI YO LETTER AA..TAI YO SIGN OM
1E6FE..1E6FF ; 17.0 # [2] TAI YO SYMBOL MUEANG..TAI YO XAM LAI
1F6D8 ; 17.0 # LANDSLIDE
1F777..1F77A ; 17.0 # [4] VESTA FORM TWO..PARTHENOPE FORM TWO
1F8D0..1F8D8 ; 17.0 # [9] LONG RIGHTWARDS ARROW OVER LONG LEFTWARDS ARROW..LONG LEFT RIGHT ARROW WITH DEPENDENT LOBE
1FA54..1FA57 ; 17.0 # [4] WHITE CHESS FERZ..BLACK CHESS ALFIL
1FA8A ; 17.0 # TROMBONE
1FA8E ; 17.0 # TREASURE CHEST
1FAC8 ; 17.0 # HAIRY CREATURE
1FACD ; 17.0 # ORCA
1FAEA ; 17.0 # DISTORTED FACE
1FAEF ; 17.0 # FIGHT CLOUD
1FBFA ; 17.0 # ALARM BELL SYMBOL
2B73A..2B73F ; 17.0 # [6] CJK UNIFIED IDEOGRAPH-2B73A..CJK UNIFIED IDEOGRAPH-2B73F
2CEA2..2CEAD ; 17.0 # [12] CJK UNIFIED IDEOGRAPH-2CEA2..CJK UNIFIED IDEOGRAPH-2CEAD
323B0..33479 ; 17.0 # [4298] CJK UNIFIED IDEOGRAPH-323B0..CJK UNIFIED IDEOGRAPH-33479

# Total code points: 4803

# EOF
Loading