feat(ast): implement utf8 to utf16 span converter (#8687)
closes #8629
Boshen committed Jan 24, 2025
1 parent 10e5920 commit b7f13e6
Showing 6 changed files with 297 additions and 12 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
@@ -45,7 +45,7 @@ jobs:
save-cache: ${{ github.ref_name == 'main' }}
cache-key: warm
- run: cargo ck
- run: cargo test
- run: cargo test --all-features
- run: git diff --exit-code # Must commit everything

test-windows:
@@ -96,7 +96,7 @@ jobs:
shell: bash
run: |
# cargo ck # no need to check because it's already checked in linux
cargo test --workspace
cargo test --all-features
test-wasm32-wasip1-threads:
name: Test wasm32-wasip1-threads
2 changes: 2 additions & 0 deletions crates/oxc_ast/src/lib.rs
@@ -53,6 +53,8 @@ mod ast_impl;
mod ast_kind_impl;
pub mod precedence;
mod trivia;
#[cfg(feature = "serialize")]
pub mod utf8_to_utf16;

mod generated {
#![allow(missing_docs)]
248 changes: 248 additions & 0 deletions crates/oxc_ast/src/utf8_to_utf16.rs
@@ -0,0 +1,248 @@
//! Convert UTF-8 span offsets to UTF-16.
use oxc_span::Span;

use crate::{ast::Program, visit::VisitMut};

/// Convert UTF-8 span offsets to UTF-16.
pub struct Utf8ToUtf16 {
translations: Vec<Translation>,
}

#[derive(Clone, Copy)]
#[repr(align(8))]
struct Translation {
// UTF-8 byte offset
utf8_offset: u32,
// Number to subtract from UTF-8 byte offset to get UTF-16 char offset
// for offsets *after* `utf8_offset`
utf16_difference: u32,
}

impl Utf8ToUtf16 {
/// Create new `Utf8ToUtf16` converter.
#[expect(clippy::new_without_default)]
pub fn new() -> Self {
let mut translations = Vec::with_capacity(16);
translations.push(Translation { utf8_offset: 0, utf16_difference: 0 });
Self { translations }
}

/// Convert all spans in the AST to UTF-16.
pub fn convert(mut self, program: &mut Program<'_>) {
self.build_table(program.source_text);
// Skip if source is entirely ASCII
if self.translations.len() == 1 {
return;
}
self.visit_program(program);
for comment in &mut program.comments {
self.convert_span(&mut comment.span);
}
}

#[allow(clippy::cast_possible_truncation)]
fn build_table(&mut self, source_text: &str) {
// Translation from UTF-8 byte offset to UTF-16 char offset:
//
// * 1-byte UTF-8 sequence
// = 1st byte 0xxxxxxx (0 - 0x7F)
// -> 1 x UTF-16 char
// UTF-16 len = UTF-8 len
// * 2-byte UTF-8 sequence
// = 1st byte 110xxxxx (0xC0 - 0xDF), remaining bytes 10xxxxxx (0x80 - 0xBF)
// -> 1 x UTF-16
// UTF-16 len = UTF-8 len - 1
// * 3-byte UTF-8 sequence
// = 1st byte 1110xxxx (0xE0 - 0xEF), remaining bytes 10xxxxxx (0x80 - 0xBF)
// -> 1 x UTF-16
// UTF-16 len = UTF-8 len - 2
// * 4-byte UTF-8 sequence
// = 1st byte 11110xxx (0xF0 - 0xF4), remaining bytes 10xxxxxx (0x80 - 0xBF)
// -> 2 x UTF-16
// UTF-16 len = UTF-8 len - 2
//
// So UTF-16 offset = UTF-8 offset - count of bytes `>= 0xC0` - count of bytes `>= 0xE0`
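//
// Worked example (illustrative only): in the source `_£_`, `£` is encoded as the two
// bytes 0xC2 0xA3 at offsets 1-2. The first byte (0xC2 >= 0xC0) pushes
// `Translation { utf8_offset: 2, utf16_difference: 1 }`, so every valid span offset at or
// after the end of `£` (UTF-8 offset 3) is reduced by 1 when converted to UTF-16.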
let mut utf16_difference = 0;
for (utf8_offset, &byte) in source_text.as_bytes().iter().enumerate() {
if byte >= 0xC0 {
let difference_for_this_byte = u32::from(byte >= 0xE0) + 1;
utf16_difference += difference_for_this_byte;
// Record `utf8_offset + 1` not `utf8_offset`, because it's only offsets *after* this
// Unicode character that need to be shifted
self.translations
.push(Translation { utf8_offset: utf8_offset as u32 + 1, utf16_difference });
}
}
}

fn convert_span(&self, span: &mut Span) {
span.start = self.convert_offset(span.start);
span.end = self.convert_offset(span.end);
}

fn convert_offset(&self, utf8_offset: u32) -> u32 {
// Find the first entry in table *after* the UTF-8 offset.
// The difference we need to subtract is recorded in the entry prior to it.
let index =
self.translations.partition_point(|translation| translation.utf8_offset <= utf8_offset);
// First entry in table is `0, 0`. `partition_point` finds the first entry where
// `utf8_offset < translation.utf8_offset` (or `translations.len()` if none exists).
// So guaranteed `index > 0`, and `index <= translations.len()`.
// Therefore `index - 1` cannot wrap around, and cannot be out of bounds.
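// e.g. with the table `[(0, 0), (2, 1)]` built for `_£_` above and `utf8_offset = 3`,
// `partition_point` returns 2, so entry `(2, 1)` is used and the result is `3 - 1 = 2`.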
let translation = self.translations[index - 1];
utf8_offset - translation.utf16_difference
}
}

impl VisitMut<'_> for Utf8ToUtf16 {
fn visit_span(&mut self, span: &mut Span) {
self.convert_span(span);
}
}

#[cfg(test)]
mod test {
use oxc_allocator::Allocator;
use oxc_span::{GetSpan, SourceType, Span};

use crate::{
ast::{Expression, Statement},
AstBuilder, Comment, CommentKind,
};

use super::Utf8ToUtf16;

#[test]
fn translate_ast() {
let allocator = Allocator::new();
let ast = AstBuilder::new(&allocator);

let mut program = ast.program(
Span::new(0, 15),
SourceType::default(),
";'🤨' // 🤨",
ast.vec1(Comment::new(8, 15, CommentKind::Line)),
None,
ast.vec(),
ast.vec_from_array([
ast.statement_empty(Span::new(0, 1)),
ast.statement_expression(
Span::new(1, 7),
ast.expression_string_literal(Span::new(1, 7), "🤨", None),
),
]),
);

Utf8ToUtf16::new().convert(&mut program);
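// UTF-8: `;` + `'🤨'` + ` // ` + `🤨` = 1 + 6 + 4 + 4 = 15 bytes.
// UTF-16: each `🤨` is a surrogate pair (2 units), so the same text is 11 units long,
// the string literal becomes 1..5, and the comment becomes 6..11.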
assert_eq!(program.span, Span::new(0, 11));
assert_eq!(program.body[1].span(), Span::new(1, 5));
let Statement::ExpressionStatement(expr_stmt) = &program.body[1] else { unreachable!() };
let Expression::StringLiteral(s) = &expr_stmt.expression else { unreachable!() };
assert_eq!(s.span, Span::new(1, 5));
assert_eq!(program.comments[0].span, Span::new(6, 11));
}

#[test]
fn translate_offsets() {
assert_eq!('_'.len_utf8(), 1);
assert_eq!('_'.len_utf16(), 1);
assert_eq!('£'.len_utf8(), 2);
assert_eq!('£'.len_utf16(), 1);
assert_eq!('ऊ'.len_utf8(), 3);
assert_eq!('ऊ'.len_utf16(), 1);
assert_eq!('🤨'.len_utf8(), 4);
assert_eq!('🤨'.len_utf16(), 2);

let cases: &[(&str, &[(u32, u32)])] = &[
// 1-byte
("_", &[(0, 0), (1, 1)]),
// 2-byte
("£", &[(0, 0), (2, 1)]),
("£_", &[(0, 0), (2, 1), (3, 2)]),
("_£", &[(0, 0), (1, 1), (3, 2)]),
("_£_", &[(0, 0), (1, 1), (3, 2), (4, 3)]),
("_££_", &[(0, 0), (1, 1), (3, 2), (5, 3), (6, 4)]),
("_£_£_", &[(0, 0), (1, 1), (3, 2), (4, 3), (6, 4), (7, 5)]),
// 3-byte
("ऊ", &[(0, 0), (3, 1)]),
("ऊ_", &[(0, 0), (3, 1), (4, 2)]),
("_ऊ", &[(0, 0), (1, 1), (4, 2)]),
("_ऊ_", &[(0, 0), (1, 1), (4, 2), (5, 3)]),
("_ऊऊ_", &[(0, 0), (1, 1), (4, 2), (7, 3), (8, 4)]),
("_ऊ_ऊ_", &[(0, 0), (1, 1), (4, 2), (5, 3), (8, 4), (9, 5)]),
// 4-byte
("🤨", &[(0, 0), (4, 2)]),
("🤨_", &[(0, 0), (4, 2), (5, 3)]),
("_🤨", &[(0, 0), (1, 1), (5, 3)]),
("_🤨_", &[(0, 0), (1, 1), (5, 3), (6, 4)]),
("_🤨🤨_", &[(0, 0), (1, 1), (5, 3), (9, 5), (10, 6)]),
("_🤨_🤨_", &[(0, 0), (1, 1), (5, 3), (6, 4), (10, 6), (11, 7)]),
];

for (text, translations) in cases {
let mut converter = Utf8ToUtf16::new();
converter.build_table(text);
for &(utf8_offset, expected_utf16_offset) in *translations {
assert_eq!(converter.convert_offset(utf8_offset), expected_utf16_offset);
}
}
}

// Check assumptions about how many UTF-16 chars result from different UTF-8 character sequences,
// which are relied on by `build_table`
#[test]
fn char_lengths() {
macro_rules! assert_utf8_bytes_eq {
($c:expr, $bytes:expr) => {{
let mut buffer = [0; 4];
let bytes = $c.encode_utf8(&mut buffer).as_bytes();
assert!($bytes == bytes);
}};
}

// All 1-byte UTF-8 character sequences = 1 x UTF-16 character.
// First byte is 0x00 - 0x7F.
let min_1_byte_char = char::from_u32(0).unwrap();
assert_eq!(min_1_byte_char.len_utf8(), 1);
assert_eq!(min_1_byte_char.len_utf16(), 1);
assert_utf8_bytes_eq!(min_1_byte_char, [0x00]);
let max_1_byte_char = char::from_u32(0x7F).unwrap();
assert_eq!(max_1_byte_char.len_utf8(), 1);
assert_eq!(max_1_byte_char.len_utf16(), 1);
assert_utf8_bytes_eq!(max_1_byte_char, [0x7F]);

// All 2-byte UTF-8 character sequences = 1 x UTF-16 character
// First byte is 0xC2 - 0xDF.
let min_2_byte_char = char::from_u32(0x80).unwrap();
assert_eq!(min_2_byte_char.len_utf8(), 2);
assert_eq!(min_2_byte_char.len_utf16(), 1);
assert_utf8_bytes_eq!(min_2_byte_char, [0xC2, 0x80]);
let max_2_byte_char = char::from_u32(0x7FF).unwrap();
assert_eq!(max_2_byte_char.len_utf8(), 2);
assert_eq!(max_2_byte_char.len_utf16(), 1);
assert_utf8_bytes_eq!(max_2_byte_char, [0xDF, 0xBF]);

// All 3-byte UTF-8 character sequences = 1 x UTF-16 character
// First byte is 0xE0 - 0xEF.
let min_3_byte_char = char::from_u32(0x800).unwrap();
assert_eq!(min_3_byte_char.len_utf8(), 3);
assert_eq!(min_3_byte_char.len_utf16(), 1);
assert_utf8_bytes_eq!(min_3_byte_char, [0xE0, 0xA0, 0x80]);
let max_3_byte_char = char::from_u32(0xFFFF).unwrap();
assert_eq!(max_3_byte_char.len_utf8(), 3);
assert_eq!(max_3_byte_char.len_utf16(), 1);
assert_utf8_bytes_eq!(max_3_byte_char, [0xEF, 0xBF, 0xBF]);

// All 4-byte UTF-8 character sequences = 2 x UTF-16 characters
// First byte is 0xF0 - 0xF4.
let min_4_byte_char = char::from_u32(0x10000).unwrap();
assert_eq!(min_4_byte_char.len_utf8(), 4);
assert_eq!(min_4_byte_char.len_utf16(), 2);
assert_utf8_bytes_eq!(min_4_byte_char, [0xF0, 0x90, 0x80, 0x80]);
let max_4_byte_char = char::MAX;
assert_eq!(max_4_byte_char.len_utf8(), 4);
assert_eq!(max_4_byte_char.len_utf16(), 2);
assert_utf8_bytes_eq!(max_4_byte_char, [0xF4, 0x8F, 0xBF, 0xBF]);
}
}
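
A minimal usage sketch of the new converter (the function name `estree_json` is illustrative, and a `serde_json` dependency plus the `serialize` feature of `oxc_ast` are assumed; the calls mirror those in the example and benchmark below):

use oxc_allocator::Allocator;
use oxc_ast::utf8_to_utf16::Utf8ToUtf16;
use oxc_parser::Parser;
use oxc_span::SourceType;

fn estree_json(source_text: &str) -> String {
    let allocator = Allocator::default();
    // Parse as usual; the parser always emits UTF-8 byte offsets.
    let mut program = Parser::new(&allocator, source_text, SourceType::default()).parse().program;
    // Rewrite every span in the AST (and in the comments) to UTF-16 offsets.
    Utf8ToUtf16::new().convert(&mut program);
    // Serialize with offsets that JavaScript consumers can index strings with directly.
    serde_json::to_string_pretty(&program).unwrap()
}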
20 changes: 13 additions & 7 deletions crates/oxc_parser/examples/parser.rs
@@ -2,19 +2,21 @@
use std::{fs, path::Path};

use oxc_allocator::Allocator;
use oxc_ast::utf8_to_utf16::Utf8ToUtf16;
use oxc_parser::{ParseOptions, Parser};
use oxc_span::SourceType;
use pico_args::Arguments;

// Instruction:
// create a `test.js`,
// run `cargo run -p oxc_parser --example parser`
// or `cargo watch -x "run -p oxc_parser --example parser"`
// or `just watch "cargo run -p oxc_parser --example parser"`

fn main() -> Result<(), String> {
let mut args = Arguments::from_env();

let show_ast = args.contains("--ast");
let show_estree = args.contains("--estree");
let show_comments = args.contains("--comments");
let name = args.free_from_str().unwrap_or_else(|_| "test.js".to_string());

@@ -26,20 +28,24 @@ fn main() -> Result<(), String> {
let ret = Parser::new(&allocator, &source_text, source_type)
.with_options(ParseOptions { parse_regular_expression: true, ..ParseOptions::default() })
.parse();

if show_ast {
println!("AST:");
println!("{}", serde_json::to_string_pretty(&ret.program).unwrap());
}
let mut program = ret.program;

if show_comments {
println!("Comments:");
for comment in ret.program.comments {
for comment in &program.comments {
let s = comment.content_span().source_text(&source_text);
println!("{s}");
}
}

if show_ast || show_estree {
println!("AST:");
if show_estree {
Utf8ToUtf16::new().convert(&mut program);
}
println!("{}", serde_json::to_string_pretty(&program).unwrap());
}

if ret.errors.is_empty() {
println!("Parsed Successfully.");
} else {
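
With these changes (assuming a local `test.js`), `cargo run -p oxc_parser --example parser -- --estree` prints the serialized AST with spans converted to UTF-16, while `--ast` keeps the parser's original UTF-8 byte offsets.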
4 changes: 2 additions & 2 deletions tasks/benchmark/Cargo.toml
@@ -65,7 +65,7 @@ bench = false
# with only the crates it needs, to speed up the builds
[dependencies]
oxc_allocator = { workspace = true, optional = true }
oxc_ast = { workspace = true, optional = true }
oxc_ast = { workspace = true, optional = true, features = ["serialize"] }
oxc_codegen = { workspace = true, optional = true }
oxc_isolated_declarations = { workspace = true, optional = true }
oxc_linter = { workspace = true, optional = true }
@@ -106,7 +106,7 @@ codspeed_napi = ["criterion2/codspeed", "dep:serde", "dep:serde_json"]
# Features for running each benchmark separately with minimum dependencies that benchmark needs.
# e.g. `cargo build --release -p oxc_benchmark --bench parser --no-default-features --features parser`
lexer = ["dep:oxc_allocator", "dep:oxc_ast", "dep:oxc_parser", "dep:oxc_span", "dep:oxc_tasks_common"]
parser = ["dep:oxc_allocator", "dep:oxc_parser", "dep:oxc_span", "dep:oxc_tasks_common"]
parser = ["dep:oxc_allocator", "dep:oxc_ast", "dep:oxc_parser", "dep:oxc_span", "dep:oxc_tasks_common"]
transformer = [
"dep:oxc_allocator",
"dep:oxc_parser",
31 changes: 30 additions & 1 deletion tasks/benchmark/benches/parser.rs
@@ -1,4 +1,5 @@
use oxc_allocator::Allocator;
use oxc_ast::utf8_to_utf16::Utf8ToUtf16;
use oxc_benchmark::{criterion_group, criterion_main, BenchmarkId, Criterion};
use oxc_parser::{ParseOptions, Parser};
use oxc_span::SourceType;
@@ -29,5 +30,33 @@ fn bench_parser(criterion: &mut Criterion) {
group.finish();
}

criterion_group!(parser, bench_parser);
fn bench_estree(criterion: &mut Criterion) {
let mut group = criterion.benchmark_group("estree");
for file in TestFiles::complicated().files().iter().take(1) {
let id = BenchmarkId::from_parameter(&file.file_name);
let source_text = file.source_text.as_str();
let source_type = SourceType::from_path(&file.file_name).unwrap();
let mut allocator = Allocator::default();
group.bench_function(id, |b| {
b.iter_with_setup_wrapper(|runner| {
allocator.reset();
let mut program = Parser::new(&allocator, source_text, source_type)
.with_options(ParseOptions {
parse_regular_expression: true,
..ParseOptions::default()
})
.parse()
.program;
runner.run(|| {
Utf8ToUtf16::new().convert(&mut program);
program.to_json();
program
});
});
});
}
group.finish();
}

criterion_group!(parser, bench_parser, bench_estree);
criterion_main!(parser);
