diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 485cc670ebe2a..aa7fe09f875af 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -45,7 +45,7 @@ jobs: save-cache: ${{ github.ref_name == 'main' }} cache-key: warm - run: cargo ck - - run: cargo test + - run: cargo test --all-features - run: git diff --exit-code # Must commit everything test-windows: @@ -96,7 +96,7 @@ jobs: shell: bash run: | # cargo ck # no need to check because it's already checked in linux - cargo test --workspace + cargo test --all-features test-wasm32-wasip1-threads: name: Test wasm32-wasip1-threads diff --git a/crates/oxc_ast/src/lib.rs b/crates/oxc_ast/src/lib.rs index 5ee8a4ff466cf..e7bdbb3fdabc5 100644 --- a/crates/oxc_ast/src/lib.rs +++ b/crates/oxc_ast/src/lib.rs @@ -53,6 +53,8 @@ mod ast_impl; mod ast_kind_impl; pub mod precedence; mod trivia; +#[cfg(feature = "serialize")] +pub mod utf8_to_utf16; mod generated { #![allow(missing_docs)] diff --git a/crates/oxc_ast/src/utf8_to_utf16.rs b/crates/oxc_ast/src/utf8_to_utf16.rs new file mode 100644 index 0000000000000..368d9ece4968b --- /dev/null +++ b/crates/oxc_ast/src/utf8_to_utf16.rs @@ -0,0 +1,248 @@ +//! Convert UTF-8 span offsets to UTF-16. + +use oxc_span::Span; + +use crate::{ast::Program, visit::VisitMut}; + +/// Convert UTF-8 span offsets to UTF-16. +pub struct Utf8ToUtf16 { + translations: Vec<Translation>, +} + +#[derive(Clone, Copy)] +#[repr(align(8))] +struct Translation { + // UTF-8 byte offset + utf8_offset: u32, + // Number to subtract from UTF-8 byte offset to get UTF-16 char offset + // for offsets *after* `utf8_offset` + utf16_difference: u32, +} + +impl Utf8ToUtf16 { + /// Create new `Utf8ToUtf16` converter. + #[expect(clippy::new_without_default)] + pub fn new() -> Self { + let mut translations = Vec::with_capacity(16); + translations.push(Translation { utf8_offset: 0, utf16_difference: 0 }); + Self { translations } + } + + /// Convert all spans in the AST to UTF-16. 
+ pub fn convert(mut self, program: &mut Program<'_>) { + self.build_table(program.source_text); + // Skip if source is entirely ASCII + if self.translations.len() == 1 { + return; + } + self.visit_program(program); + for comment in &mut program.comments { + self.convert_span(&mut comment.span); + } + } + + #[allow(clippy::cast_possible_truncation)] + fn build_table(&mut self, source_text: &str) { + // Translation from UTF-8 byte offset to UTF-16 char offset: + // + // * 1-byte UTF-8 sequence + // = 1st byte 0xxxxxxx (0 - 0x7F) + // -> 1 x UTF-16 char + // UTF-16 len = UTF-8 len + // * 2-byte UTF-8 sequence + // = 1st byte 110xxxxx (0xC0 - 0xDF), remaining bytes 10xxxxxx (0x80 - 0xBF) + // -> 1 x UTF-16 + // UTF-16 len = UTF-8 len - 1 + // * 3-byte UTF-8 sequence + // = 1st byte 1110xxxx (0xE0 - 0xEF), remaining bytes 10xxxxxx (0x80 - 0xBF) + // -> 1 x UTF-16 + // UTF-16 len = UTF-8 len - 2 + // * 4-byte UTF-8 sequence + // = 1st byte 11110xxx (0xF0 - 0xF4), remaining bytes 10xxxxxx (0x80 - 0xBF) + // -> 2 x UTF-16 + // UTF-16 len = UTF-8 len - 2 + // + // So UTF-16 offset = UTF-8 offset - count of bytes `>= 0xC0` - count of bytes `>= 0xE0` + let mut utf16_difference = 0; + for (utf8_offset, &byte) in source_text.as_bytes().iter().enumerate() { + if byte >= 0xC0 { + let difference_for_this_byte = u32::from(byte >= 0xE0) + 1; + utf16_difference += difference_for_this_byte; + // Record `utf8_offset + 1` not `utf8_offset`, because it's only offsets *after* this + // Unicode character that need to be shifted + self.translations + .push(Translation { utf8_offset: utf8_offset as u32 + 1, utf16_difference }); + } + } + } + + fn convert_span(&self, span: &mut Span) { + span.start = self.convert_offset(span.start); + span.end = self.convert_offset(span.end); + } + + fn convert_offset(&self, utf8_offset: u32) -> u32 { + // Find the first entry in table *after* the UTF-8 offset. + // The difference we need to subtract is recorded in the entry prior to it. 
+ let index = + self.translations.partition_point(|translation| translation.utf8_offset <= utf8_offset); + // First entry in table is `0, 0`. `partition_point` finds the first entry where + // `utf8_offset < translation.utf8_offset` (or `translations.len()` if none exists). + // So guaranteed `index > 0`, and `index <= translations.len()`. + // Therefore `index - 1` cannot wrap around, and cannot be out of bounds. + let translation = self.translations[index - 1]; + utf8_offset - translation.utf16_difference + } +} + +impl VisitMut<'_> for Utf8ToUtf16 { + fn visit_span(&mut self, span: &mut Span) { + self.convert_span(span); + } +} + +#[cfg(test)] +mod test { + use oxc_allocator::Allocator; + use oxc_span::{GetSpan, SourceType, Span}; + + use crate::{ + ast::{Expression, Statement}, + AstBuilder, Comment, CommentKind, + }; + + use super::Utf8ToUtf16; + + #[test] + fn translate_ast() { + let allocator = Allocator::new(); + let ast = AstBuilder::new(&allocator); + + let mut program = ast.program( + Span::new(0, 15), + SourceType::default(), + ";'🤨' // 🤨", + ast.vec1(Comment::new(8, 15, CommentKind::Line)), + None, + ast.vec(), + ast.vec_from_array([ + ast.statement_empty(Span::new(0, 1)), + ast.statement_expression( + Span::new(1, 7), + ast.expression_string_literal(Span::new(1, 7), "🤨", None), + ), + ]), + ); + + Utf8ToUtf16::new().convert(&mut program); + assert_eq!(program.span, Span::new(0, 11)); + assert_eq!(program.body[1].span(), Span::new(1, 5)); + let Statement::ExpressionStatement(expr_stmt) = &program.body[1] else { unreachable!() }; + let Expression::StringLiteral(s) = &expr_stmt.expression else { unreachable!() }; + assert_eq!(s.span, Span::new(1, 5)); + assert_eq!(program.comments[0].span, Span::new(6, 11)); + } + + #[test] + fn translate_offsets() { + assert_eq!('_'.len_utf8(), 1); + assert_eq!('_'.len_utf16(), 1); + assert_eq!('£'.len_utf8(), 2); + assert_eq!('£'.len_utf16(), 1); + assert_eq!('ऊ'.len_utf8(), 3); + assert_eq!('ऊ'.len_utf16(), 1); + 
assert_eq!('🤨'.len_utf8(), 4); + assert_eq!('🤨'.len_utf16(), 2); + + let cases: &[(&str, &[(u32, u32)])] = &[ + // 1-byte + ("_", &[(0, 0), (1, 1)]), + // 2-byte + ("£", &[(0, 0), (2, 1)]), + ("£_", &[(0, 0), (2, 1), (3, 2)]), + ("_£", &[(0, 0), (1, 1), (3, 2)]), + ("_£_", &[(0, 0), (1, 1), (3, 2), (4, 3)]), + ("_££_", &[(0, 0), (1, 1), (3, 2), (5, 3), (6, 4)]), + ("_£_£_", &[(0, 0), (1, 1), (3, 2), (4, 3), (6, 4), (7, 5)]), + // 3-byte + ("ऊ", &[(0, 0), (3, 1)]), + ("ऊ_", &[(0, 0), (3, 1), (4, 2)]), + ("_ऊ", &[(0, 0), (1, 1), (4, 2)]), + ("_ऊ_", &[(0, 0), (1, 1), (4, 2), (5, 3)]), + ("_ऊऊ_", &[(0, 0), (1, 1), (4, 2), (7, 3), (8, 4)]), + ("_ऊ_ऊ_", &[(0, 0), (1, 1), (4, 2), (5, 3), (8, 4), (9, 5)]), + // 4-byte + ("🤨", &[(0, 0), (4, 2)]), + ("🤨_", &[(0, 0), (4, 2), (5, 3)]), + ("_🤨", &[(0, 0), (1, 1), (5, 3)]), + ("_🤨_", &[(0, 0), (1, 1), (5, 3), (6, 4)]), + ("_🤨🤨_", &[(0, 0), (1, 1), (5, 3), (9, 5), (10, 6)]), + ("_🤨_🤨_", &[(0, 0), (1, 1), (5, 3), (6, 4), (10, 6), (11, 7)]), + ]; + + for (text, translations) in cases { + let mut converter = Utf8ToUtf16::new(); + converter.build_table(text); + for &(utf8_offset, expected_utf16_offset) in *translations { + assert_eq!(converter.convert_offset(utf8_offset), expected_utf16_offset); + } + } + } + + // Check assumptions about how many UTF-16 chars result from different UTF-8 character sequences, + // which are relied on by `build_table` + #[test] + fn char_lengths() { + macro_rules! assert_utf8_bytes_eq { + ($c:expr, $bytes:expr) => {{ + let mut buffer = [0; 4]; + let bytes = $c.encode_utf8(&mut buffer).as_bytes(); + assert!($bytes == bytes); + }}; + } + + // All 1-byte UTF-8 character sequences = 1 x UTF-16 character. + // First byte is 0x00 - 0x7F. 
+ let min_1_byte_char = char::from_u32(0).unwrap(); + assert_eq!(min_1_byte_char.len_utf8(), 1); + assert_eq!(min_1_byte_char.len_utf16(), 1); + assert_utf8_bytes_eq!(min_1_byte_char, [0x00]); + let max_1_byte_char = char::from_u32(0x7F).unwrap(); + assert_eq!(max_1_byte_char.len_utf8(), 1); + assert_eq!(max_1_byte_char.len_utf16(), 1); + assert_utf8_bytes_eq!(max_1_byte_char, [0x7F]); + + // All 2-byte UTF-8 character sequences = 1 x UTF-16 character + // First byte is 0xC2 - 0xDF. + let min_2_byte_char = char::from_u32(0x80).unwrap(); + assert_eq!(min_2_byte_char.len_utf8(), 2); + assert_eq!(min_2_byte_char.len_utf16(), 1); + assert_utf8_bytes_eq!(min_2_byte_char, [0xC2, 0x80]); + let max_2_byte_char = char::from_u32(0x7FF).unwrap(); + assert_eq!(max_2_byte_char.len_utf8(), 2); + assert_eq!(max_2_byte_char.len_utf16(), 1); + assert_utf8_bytes_eq!(max_2_byte_char, [0xDF, 0xBF]); + + // All 3-byte UTF-8 character sequences = 1 x UTF-16 character + // First byte is 0xE0 - 0xEF. + let min_3_byte_char = char::from_u32(0x800).unwrap(); + assert_eq!(min_3_byte_char.len_utf8(), 3); + assert_eq!(min_3_byte_char.len_utf16(), 1); + assert_utf8_bytes_eq!(min_3_byte_char, [0xE0, 0xA0, 0x80]); + let max_3_byte_char = char::from_u32(0xFFFF).unwrap(); + assert_eq!(max_3_byte_char.len_utf8(), 3); + assert_eq!(max_3_byte_char.len_utf16(), 1); + assert_utf8_bytes_eq!(max_3_byte_char, [0xEF, 0xBF, 0xBF]); + + // All 4-byte UTF-8 character sequences = 2 x UTF-16 characters + // First byte is 0xF0 - 0xF4. 
+ let min_4_byte_char = char::from_u32(0x10000).unwrap(); + assert_eq!(min_4_byte_char.len_utf8(), 4); + assert_eq!(min_4_byte_char.len_utf16(), 2); + assert_utf8_bytes_eq!(min_4_byte_char, [0xF0, 0x90, 0x80, 0x80]); + let max_4_byte_char = char::MAX; + assert_eq!(max_4_byte_char.len_utf8(), 4); + assert_eq!(max_4_byte_char.len_utf16(), 2); + assert_utf8_bytes_eq!(max_4_byte_char, [0xF4, 0x8F, 0xBF, 0xBF]); + } +} diff --git a/crates/oxc_parser/examples/parser.rs b/crates/oxc_parser/examples/parser.rs index 70a01f6a5a402..c0ab2dd8e1bba 100644 --- a/crates/oxc_parser/examples/parser.rs +++ b/crates/oxc_parser/examples/parser.rs @@ -2,6 +2,7 @@ use std::{fs, path::Path}; use oxc_allocator::Allocator; +use oxc_ast::utf8_to_utf16::Utf8ToUtf16; use oxc_parser::{ParseOptions, Parser}; use oxc_span::SourceType; use pico_args::Arguments; @@ -9,12 +10,13 @@ use pico_args::Arguments; // Instruction: // create a `test.js`, // run `cargo run -p oxc_parser --example parser` -// or `cargo watch -x "run -p oxc_parser --example parser"` +// or `just watch "cargo run -p oxc_parser --example parser"` fn main() -> Result<(), String> { let mut args = Arguments::from_env(); let show_ast = args.contains("--ast"); + let show_estree = args.contains("--estree"); let show_comments = args.contains("--comments"); let name = args.free_from_str().unwrap_or_else(|_| "test.js".to_string()); @@ -26,20 +28,24 @@ fn main() -> Result<(), String> { let ret = Parser::new(&allocator, &source_text, source_type) .with_options(ParseOptions { parse_regular_expression: true, ..ParseOptions::default() }) .parse(); - - if show_ast { - println!("AST:"); - println!("{}", serde_json::to_string_pretty(&ret.program).unwrap()); - } + let mut program = ret.program; if show_comments { println!("Comments:"); - for comment in ret.program.comments { + for comment in &program.comments { let s = comment.content_span().source_text(&source_text); println!("{s}"); } } + if show_ast || show_estree { + println!("AST:"); + if 
show_estree { + Utf8ToUtf16::new().convert(&mut program); + } + println!("{}", serde_json::to_string_pretty(&program).unwrap()); + } + if ret.errors.is_empty() { println!("Parsed Successfully."); } else { diff --git a/tasks/benchmark/Cargo.toml b/tasks/benchmark/Cargo.toml index d9e330f0538e5..57adfaf57fcf6 100644 --- a/tasks/benchmark/Cargo.toml +++ b/tasks/benchmark/Cargo.toml @@ -65,7 +65,7 @@ bench = false # with only the crates it needs, to speed up the builds [dependencies] oxc_allocator = { workspace = true, optional = true } -oxc_ast = { workspace = true, optional = true } +oxc_ast = { workspace = true, optional = true, features = ["serialize"] } oxc_codegen = { workspace = true, optional = true } oxc_isolated_declarations = { workspace = true, optional = true } oxc_linter = { workspace = true, optional = true } @@ -106,7 +106,7 @@ codspeed_napi = ["criterion2/codspeed", "dep:serde", "dep:serde_json"] # Features for running each benchmark separately with minimum dependencies that benchmark needs. # e.g. 
`cargo build --release -p oxc_benchmark --bench parser --no-default-features --features parser` lexer = ["dep:oxc_allocator", "dep:oxc_ast", "dep:oxc_parser", "dep:oxc_span", "dep:oxc_tasks_common"] -parser = ["dep:oxc_allocator", "dep:oxc_parser", "dep:oxc_span", "dep:oxc_tasks_common"] +parser = ["dep:oxc_allocator", "dep:oxc_ast", "dep:oxc_parser", "dep:oxc_span", "dep:oxc_tasks_common"] transformer = [ "dep:oxc_allocator", "dep:oxc_parser", diff --git a/tasks/benchmark/benches/parser.rs b/tasks/benchmark/benches/parser.rs index 5ede362633e83..023c9658f2b2a 100644 --- a/tasks/benchmark/benches/parser.rs +++ b/tasks/benchmark/benches/parser.rs @@ -1,4 +1,5 @@ use oxc_allocator::Allocator; +use oxc_ast::utf8_to_utf16::Utf8ToUtf16; use oxc_benchmark::{criterion_group, criterion_main, BenchmarkId, Criterion}; use oxc_parser::{ParseOptions, Parser}; use oxc_span::SourceType; @@ -29,5 +30,33 @@ fn bench_parser(criterion: &mut Criterion) { group.finish(); } -criterion_group!(parser, bench_parser); +fn bench_estree(criterion: &mut Criterion) { + let mut group = criterion.benchmark_group("estree"); + for file in TestFiles::complicated().files().iter().take(1) { + let id = BenchmarkId::from_parameter(&file.file_name); + let source_text = file.source_text.as_str(); + let source_type = SourceType::from_path(&file.file_name).unwrap(); + let mut allocator = Allocator::default(); + group.bench_function(id, |b| { + b.iter_with_setup_wrapper(|runner| { + allocator.reset(); + let mut program = Parser::new(&allocator, source_text, source_type) + .with_options(ParseOptions { + parse_regular_expression: true, + ..ParseOptions::default() + }) + .parse() + .program; + runner.run(|| { + Utf8ToUtf16::new().convert(&mut program); + program.to_json(); + program + }); + }); + }); + } + group.finish(); +} + +criterion_group!(parser, bench_parser, bench_estree); criterion_main!(parser);