Skip to content

Commit d8654db

Browse files
authored
Split generated file into multiple files, separating static code from generated code (#86)
1 parent ab7a6dc commit d8654db

12 files changed

Lines changed: 23175 additions & 23638 deletions

File tree

scripts/unicode.py

Lines changed: 361 additions & 765 deletions
Large diffs are not rendered by default.

src/gen/lookup.rs

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
// Copyright 2012-2025 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
12+
use crate::tables::*;
13+
use crate::width_info::WidthInfo;
14+
15+
/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c` by
16+
/// consulting a multi-level lookup table.
17+
///
18+
/// # Maintenance
19+
/// The tables themselves are autogenerated but this function is hardcoded. You should have
20+
/// nothing to worry about if you re-run `unicode.py` (for example, when updating Unicode.)
21+
/// However, if you change the *actual structure* of the lookup tables (perhaps by editing the
22+
/// `make_tables` function in `unicode.py`) you must ensure that this code reflects those changes.
23+
#[inline]
24+
pub(crate) fn lookup_width(c: char) -> (u8, WidthInfo) {
25+
let cp = c as usize;
26+
27+
let t1_offset = WIDTH_ROOT.0[cp >> 13];
28+
29+
// Each sub-table in WIDTH_MIDDLE is 7 bits, and each stored entry is a byte,
30+
// so each sub-table is 128 bytes in size.
31+
// (Sub-tables are selected using the computed offset from the previous table.)
32+
let t2_offset = WIDTH_MIDDLE.0[usize::from(t1_offset)][cp >> 7 & 0x3F];
33+
34+
// Each sub-table in WIDTH_LEAVES is 6 bits, but each stored entry is 2 bits.
35+
// This is accomplished by packing four stored entries into one byte.
36+
// So each sub-table is 2**(7-2) == 32 bytes in size.
37+
// Since this is the last table, each entry represents an encoded width.
38+
let packed_widths = WIDTH_LEAVES.0[usize::from(t2_offset)][cp >> 2 & 0x1F];
39+
40+
// Extract the packed width
41+
let width = packed_widths >> (2 * (cp & 0b11)) & 0b11;
42+
43+
if width < 3 {
44+
(width, WidthInfo::DEFAULT)
45+
} else {
46+
match c {
47+
'\u{A}' => (1, WidthInfo::LINE_FEED),
48+
'\u{5DC}' => (1, WidthInfo::HEBREW_LETTER_LAMED),
49+
'\u{622}'..='\u{882}' => (1, WidthInfo::JOINING_GROUP_ALEF),
50+
'\u{1780}'..='\u{17AF}' => (1, WidthInfo::KHMER_COENG_ELIGIBLE_LETTER),
51+
'\u{17D8}' => (3, WidthInfo::DEFAULT),
52+
'\u{1A10}' => (1, WidthInfo::BUGINESE_LETTER_YA),
53+
'\u{2D31}'..='\u{2D6F}' => (1, WidthInfo::TIFINAGH_CONSONANT),
54+
'\u{A4FC}'..='\u{A4FD}' => (1, WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU),
55+
'\u{FE01}' => (0, WidthInfo::VARIATION_SELECTOR_1_2_OR_3),
56+
'\u{FE0E}' => (0, WidthInfo::VARIATION_SELECTOR_15),
57+
'\u{FE0F}' => (0, WidthInfo::VARIATION_SELECTOR_16),
58+
'\u{10C03}' => (1, WidthInfo::OLD_TURKIC_LETTER_ORKHON_I),
59+
'\u{16D67}' => (1, WidthInfo::KIRAT_RAI_VOWEL_SIGN_E),
60+
'\u{16D68}' => (1, WidthInfo::KIRAT_RAI_VOWEL_SIGN_AI),
61+
'\u{1F1E6}'..='\u{1F1FF}' => (1, WidthInfo::REGIONAL_INDICATOR),
62+
'\u{1F3FB}'..='\u{1F3FF}' => (2, WidthInfo::EMOJI_MODIFIER),
63+
_ => (2, WidthInfo::EMOJI_PRESENTATION),
64+
}
65+
}
66+
}
67+
/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c` by
/// consulting a multi-level lookup table.
///
/// This is the variant compiled only when the `cjk` feature is enabled. It starts from
/// `WIDTH_ROOT_CJK` instead of `WIDTH_ROOT`, then walks the same `WIDTH_MIDDLE` and
/// `WIDTH_LEAVES` tables as `lookup_width`; its list of special-cased characters also
/// differs slightly.
///
/// The second element of the returned pair is `WidthInfo::DEFAULT` unless the leaf table
/// flags `c` as a special case, in which case the `match` at the end of this function
/// supplies both the width and the `WidthInfo` value.
///
/// # Maintenance
/// The tables themselves are autogenerated but this function is hardcoded. You should have
/// nothing to worry about if you re-run `unicode.py` (for example, when updating Unicode.)
/// However, if you change the *actual structure* of the lookup tables (perhaps by editing the
/// `make_tables` function in `unicode.py`) you must ensure that this code reflects those changes.
#[cfg(feature = "cjk")]
#[inline]
pub(crate) fn lookup_width_cjk(c: char) -> (u8, WidthInfo) {
    let cp = c as usize;

    // The root table is indexed by every bit of `cp` above the low 13.
    let t1_offset = WIDTH_ROOT_CJK.0[cp >> 13];

    // Each sub-table in WIDTH_MIDDLE is indexed by the next 6 bits of `cp`
    // (bits 7..=12), so at most 64 entries of a sub-table are addressed.
    // (Sub-tables are selected using the computed offset from the previous table.)
    let t2_offset = WIDTH_MIDDLE.0[usize::from(t1_offset)][(cp >> 7) & 0x3F];

    // Each sub-table in WIDTH_LEAVES covers the low 7 bits of `cp`, but each stored
    // entry is only 2 bits. This is accomplished by packing four stored entries into
    // one byte, so 2**(7-2) == 32 byte positions of a sub-table are addressed.
    // Since this is the last table, each entry represents an encoded width.
    let packed_widths = WIDTH_LEAVES.0[usize::from(t2_offset)][(cp >> 2) & 0x1F];

    // Extract the packed width: the low 2 bits of `cp` select one of the four
    // 2-bit fields inside the byte.
    let width = (packed_widths >> (2 * (cp & 0b11))) & 0b11;

    if width < 3 {
        (width, WidthInfo::DEFAULT)
    } else {
        // An encoded value of 3 marks a special case: the real width and the
        // accompanying `WidthInfo` come from the explicit list below.
        match c {
            '\u{A}' => (1, WidthInfo::LINE_FEED),
            '\u{338}' => (0, WidthInfo::COMBINING_LONG_SOLIDUS_OVERLAY),
            '\u{5DC}' => (1, WidthInfo::HEBREW_LETTER_LAMED),
            '\u{622}'..='\u{882}' => (1, WidthInfo::JOINING_GROUP_ALEF),
            '\u{1780}'..='\u{17AF}' => (1, WidthInfo::KHMER_COENG_ELIGIBLE_LETTER),
            '\u{17D8}' => (3, WidthInfo::DEFAULT),
            '\u{1A10}' => (1, WidthInfo::BUGINESE_LETTER_YA),
            '\u{2D31}'..='\u{2D6F}' => (1, WidthInfo::TIFINAGH_CONSONANT),
            '\u{A4FC}'..='\u{A4FD}' => (1, WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU),
            '\u{FE00}'..='\u{FE02}' => (0, WidthInfo::VARIATION_SELECTOR_1_2_OR_3),
            '\u{FE0F}' => (0, WidthInfo::VARIATION_SELECTOR_16),
            '\u{10C03}' => (1, WidthInfo::OLD_TURKIC_LETTER_ORKHON_I),
            '\u{16D67}' => (1, WidthInfo::KIRAT_RAI_VOWEL_SIGN_E),
            '\u{16D68}' => (1, WidthInfo::KIRAT_RAI_VOWEL_SIGN_AI),
            '\u{1F1E6}'..='\u{1F1FF}' => (1, WidthInfo::REGIONAL_INDICATOR),
            '\u{1F3FB}'..='\u{1F3FF}' => (2, WidthInfo::EMOJI_MODIFIER),
            // Every remaining flagged character is reported as width 2 with
            // emoji presentation.
            _ => (2, WidthInfo::EMOJI_PRESENTATION),
        }
    }
}

src/gen/props.rs

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
// Copyright 2012-2025 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
12+
use crate::tables::*;
13+
use core::cmp::Ordering;
14+
15+
/// Whether this character has Joining_Group=Lam.
///
/// Returns `true` exactly for the code points listed in the pattern below and
/// `false` for every other `char`.
pub fn is_joining_group_lam(c: char) -> bool {
    matches!(
        c,
        '\u{644}' | '\u{6B5}'..='\u{6B8}' | '\u{76A}' | '\u{8A6}' | '\u{8C7}'
    )
}
22+
23+
/// Whether this character is a default-ignorable combining mark
/// or ZWJ. These characters won't interrupt non-Arabic ligatures.
///
/// Returns `true` exactly for the code points and ranges listed in the pattern
/// below and `false` for every other `char`.
pub fn is_ligature_transparent(c: char) -> bool {
    matches!(
        c,
        '\u{34F}'
            | '\u{17B4}'..='\u{17B5}'
            | '\u{180B}'..='\u{180D}'
            | '\u{180F}'
            | '\u{200D}'
            | '\u{FE00}'..='\u{FE0F}'
            | '\u{E0100}'..='\u{E01EF}'
    )
}
31+
32+
/// Whether this character forms an [emoji presentation sequence]
33+
/// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence)
34+
/// when followed by `'\u{FEOF}'`.
35+
/// Emoji presentation sequences are considered to have width 2.
36+
#[inline]
37+
pub fn starts_emoji_presentation_seq(c: char) -> bool {
38+
let cp: u32 = c.into();
39+
// First level of lookup uses all but 10 LSB
40+
let top_bits = cp >> 10;
41+
let idx_of_leaf: usize = match top_bits {
42+
0x0 => 0,
43+
0x8 => 1,
44+
0x9 => 2,
45+
0xA => 3,
46+
0xC => 4,
47+
0x7C => 5,
48+
0x7D => 6,
49+
_ => return false,
50+
};
51+
// Extract the 3-9th (0-indexed) least significant bits of `cp`,
52+
// and use them to index into `leaf_row`.
53+
let idx_within_leaf = usize::try_from((cp >> 3) & 0x7F).unwrap();
54+
let leaf_byte = EMOJI_PRESENTATION_LEAVES.0[idx_of_leaf][idx_within_leaf];
55+
// Use the 3 LSB of `cp` to index into `leaf_byte`.
56+
((leaf_byte >> (cp & 7)) & 1) == 1
57+
}
58+
59+
/// Returns `true` if `c` has default emoji presentation, but forms a [text presentation sequence]
60+
/// (https://www.unicode.org/reports/tr51/#def_text_presentation_sequence)
61+
/// when followed by `'\u{FEOE}'`, and is not ideographic.
62+
/// Such sequences are considered to have width 1.
63+
#[inline]
64+
pub fn starts_non_ideographic_text_presentation_seq(c: char) -> bool {
65+
let cp: u32 = c.into();
66+
// First level of lookup uses all but 8 LSB
67+
let top_bits = cp >> 8;
68+
let leaf: &[(u8, u8)] = match top_bits {
69+
0x23 => &TEXT_PRESENTATION_LEAF_0,
70+
0x25 => &TEXT_PRESENTATION_LEAF_1,
71+
0x26 => &TEXT_PRESENTATION_LEAF_2,
72+
0x27 => &TEXT_PRESENTATION_LEAF_3,
73+
0x2B => &TEXT_PRESENTATION_LEAF_4,
74+
0x1F0 => &TEXT_PRESENTATION_LEAF_5,
75+
0x1F3 => &TEXT_PRESENTATION_LEAF_6,
76+
0x1F4 => &TEXT_PRESENTATION_LEAF_7,
77+
0x1F5 => &TEXT_PRESENTATION_LEAF_8,
78+
0x1F6 => &TEXT_PRESENTATION_LEAF_9,
79+
_ => return false,
80+
};
81+
82+
let bottom_bits = (cp & 0xFF) as u8;
83+
leaf.binary_search_by(|&(lo, hi)| {
84+
if bottom_bits < lo {
85+
Ordering::Greater
86+
} else if bottom_bits > hi {
87+
Ordering::Less
88+
} else {
89+
Ordering::Equal
90+
}
91+
})
92+
.is_ok()
93+
}
94+
95+
/// Returns `true` if `c` is an `Emoji_Modifier_Base`.
96+
#[inline]
97+
pub fn is_emoji_modifier_base(c: char) -> bool {
98+
let cp: u32 = c.into();
99+
// First level of lookup uses all but 8 LSB
100+
let top_bits = cp >> 8;
101+
let leaf: &[(u8, u8)] = match top_bits {
102+
0x26 => &EMOJI_MODIFIER_LEAF_0,
103+
0x27 => &EMOJI_MODIFIER_LEAF_1,
104+
0x1F3 => &EMOJI_MODIFIER_LEAF_2,
105+
0x1F4 => &EMOJI_MODIFIER_LEAF_3,
106+
0x1F5 => &EMOJI_MODIFIER_LEAF_4,
107+
0x1F6 => &EMOJI_MODIFIER_LEAF_5,
108+
0x1F9 => &EMOJI_MODIFIER_LEAF_6,
109+
0x1FA => &EMOJI_MODIFIER_LEAF_7,
110+
_ => return false,
111+
};
112+
113+
let bottom_bits = (cp & 0xFF) as u8;
114+
leaf.binary_search_by(|&(lo, hi)| {
115+
if bottom_bits < lo {
116+
Ordering::Greater
117+
} else if bottom_bits > hi {
118+
Ordering::Less
119+
} else {
120+
Ordering::Equal
121+
}
122+
})
123+
.is_ok()
124+
}

0 commit comments

Comments
 (0)