Skip to content

Commit

Permalink
More tests
Browse files Browse the repository at this point in the history
  • Loading branch information
overlookmotel committed Jan 24, 2025
1 parent c9495c4 commit 669607b
Showing 1 changed file with 106 additions and 2 deletions.
108 changes: 106 additions & 2 deletions crates/oxc_ast/src/utf8_to_utf16.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,14 +106,15 @@ mod test {
use oxc_allocator::Allocator;
use oxc_span::{GetSpan, SourceType, Span};

use super::Utf8ToUtf16;
use crate::{
ast::{Expression, Statement},
AstBuilder, Comment, CommentKind,
};

use super::Utf8ToUtf16;

#[test]
fn test() {
fn translate_ast() {
let allocator = Allocator::new();
let ast = AstBuilder::new(&allocator);

Expand Down Expand Up @@ -141,4 +142,107 @@ mod test {
assert_eq!(s.span, Span::new(1, 5));
assert_eq!(program.comments[0].span, Span::new(6, 11));
}

// Check `Utf8ToUtf16::convert_offset` against hand-computed offset pairs for strings
// mixing 1, 2, 3 and 4-byte UTF-8 characters.
#[test]
fn translate_offsets() {
    // Sanity-check the encoded lengths of the sample characters which the expected
    // offsets in the table below are derived from: `(char, UTF-8 len, UTF-16 len)`.
    for &(c, utf8_len, utf16_len) in &[('_', 1, 1), ('£', 2, 1), ('ऊ', 3, 1), ('🤨', 4, 2)] {
        assert_eq!(c.len_utf8(), utf8_len, "UTF-8 length of {:?}", c);
        assert_eq!(c.len_utf16(), utf16_len, "UTF-16 length of {:?}", c);
    }

    // Each case is `(source text, [(UTF-8 offset, expected UTF-16 offset), ...])`.
    // Offsets listed are the character boundaries of the text, including start and end.
    let cases: &[(&str, &[(u32, u32)])] = &[
        // 1-byte
        ("_", &[(0, 0), (1, 1)]),
        // 2-byte
        ("£", &[(0, 0), (2, 1)]),
        ("£_", &[(0, 0), (2, 1), (3, 2)]),
        ("_£", &[(0, 0), (1, 1), (3, 2)]),
        ("_£_", &[(0, 0), (1, 1), (3, 2), (4, 3)]),
        ("_££_", &[(0, 0), (1, 1), (3, 2), (5, 3), (6, 4)]),
        ("_£_£_", &[(0, 0), (1, 1), (3, 2), (4, 3), (6, 4), (7, 5)]),
        // 3-byte
        ("ऊ", &[(0, 0), (3, 1)]),
        ("ऊ_", &[(0, 0), (3, 1), (4, 2)]),
        ("_ऊ", &[(0, 0), (1, 1), (4, 2)]),
        ("_ऊ_", &[(0, 0), (1, 1), (4, 2), (5, 3)]),
        ("_ऊऊ_", &[(0, 0), (1, 1), (4, 2), (7, 3), (8, 4)]),
        ("_ऊ_ऊ_", &[(0, 0), (1, 1), (4, 2), (5, 3), (8, 4), (9, 5)]),
        // 4-byte
        ("🤨", &[(0, 0), (4, 2)]),
        ("🤨_", &[(0, 0), (4, 2), (5, 3)]),
        ("_🤨", &[(0, 0), (1, 1), (5, 3)]),
        ("_🤨_", &[(0, 0), (1, 1), (5, 3), (6, 4)]),
        ("_🤨🤨_", &[(0, 0), (1, 1), (5, 3), (9, 5), (10, 6)]),
        ("_🤨_🤨_", &[(0, 0), (1, 1), (5, 3), (6, 4), (10, 6), (11, 7)]),
    ];

    for &(text, translations) in cases {
        // A fresh converter per case, so a table built for one text cannot affect another.
        let mut converter = Utf8ToUtf16::new();
        converter.build_table(text);
        for &(utf8_offset, expected_utf16_offset) in translations {
            // Name the failing input: with this many cases, two bare numbers
            // are not enough to locate the broken one.
            assert_eq!(
                converter.convert_offset(utf8_offset),
                expected_utf16_offset,
                "text: {:?}, UTF-8 offset: {}",
                text,
                utf8_offset
            );
        }
    }
}

// Verify the assumptions `build_table` relies on about how many UTF-16 characters
// each length of UTF-8 character sequence produces.
#[test]
fn char_lengths() {
    // Check the encoded lengths of the char with code point `code_point`,
    // then check its UTF-8 encoding is exactly `expected_bytes`.
    fn check(code_point: u32, utf8_len: usize, utf16_len: usize, expected_bytes: &[u8]) {
        let c = char::from_u32(code_point).unwrap();
        assert_eq!(c.len_utf8(), utf8_len);
        assert_eq!(c.len_utf16(), utf16_len);

        let mut scratch = [0u8; 4];
        let encoded = c.encode_utf8(&mut scratch).as_bytes();
        assert!(expected_bytes == encoded);
    }

    // `(code point, UTF-8 len, UTF-16 len, UTF-8 bytes)` for the smallest and largest
    // character of each UTF-8 sequence length.
    let boundaries: &[(u32, usize, usize, &[u8])] = &[
        // All 1-byte UTF-8 character sequences = 1 x UTF-16 character.
        // First byte is 0x00 - 0x7F.
        (0x00, 1, 1, &[0x00]),
        (0x7F, 1, 1, &[0x7F]),
        // All 2-byte UTF-8 character sequences = 1 x UTF-16 character.
        // First byte is 0xC2 - 0xDF.
        (0x80, 2, 1, &[0xC2, 0x80]),
        (0x7FF, 2, 1, &[0xDF, 0xBF]),
        // All 3-byte UTF-8 character sequences = 1 x UTF-16 character.
        // First byte is 0xE0 - 0xEF.
        (0x800, 3, 1, &[0xE0, 0xA0, 0x80]),
        (0xFFFF, 3, 1, &[0xEF, 0xBF, 0xBF]),
        // All 4-byte UTF-8 character sequences = 2 x UTF-16 characters.
        // First byte is 0xF0 - 0xF4.
        (0x10000, 4, 2, &[0xF0, 0x90, 0x80, 0x80]),
        (char::MAX as u32, 4, 2, &[0xF4, 0x8F, 0xBF, 0xBF]),
    ];

    for &(code_point, utf8_len, utf16_len, expected_bytes) in boundaries {
        check(code_point, utf8_len, utf16_len, expected_bytes);
    }
}
}

0 comments on commit 669607b

Please sign in to comment.