From f3470163d925b371df4d9e8ecf77b3bb17d16b7a Mon Sep 17 00:00:00 2001 From: overlookmotel Date: Thu, 8 Feb 2024 06:56:26 +0000 Subject: [PATCH] refactor(parser): make `Source::set_position` safe (#2341) Make `Source::set_position` a safe function. This addresses a shortcoming of #2288. Instead of requiring caller of `Source::set_position` to guarantee that the `SourcePosition` is created from this `Source`, the preceding PRs enforce this guarantee at the type level. `Source::set_position` is going to be a central API for transitioning the lexer to processing the source as bytes, rather than `char`s (and the anticipated speed-ups that will produce). So making this method safe will remove the need for a *lot* of unsafe code blocks, and boilerplate comments promising "SAFETY: There's only one `Source`", when to the developer, this is blindingly obvious anyway. So, while splitting the parser into `Parser` and `ParserImpl` (#2339) is an annoying change to have to make, I believe the benefit of this PR justifies it. --- crates/oxc_parser/src/cursor.rs | 4 +--- crates/oxc_parser/src/lexer/mod.rs | 21 ++++----------------- crates/oxc_parser/src/lexer/source.rs | 20 ++++++++++++-------- 3 files changed, 17 insertions(+), 28 deletions(-) diff --git a/crates/oxc_parser/src/cursor.rs b/crates/oxc_parser/src/cursor.rs index 9ff6e55896033..44692dc9b9713 100644 --- a/crates/oxc_parser/src/cursor.rs +++ b/crates/oxc_parser/src/cursor.rs @@ -255,9 +255,7 @@ impl<'a> ParserImpl<'a> { let ParserCheckpoint { lexer, cur_token, prev_span_end, errors_pos: errors_lens } = checkpoint; - // SAFETY: Parser only ever creates a single `Lexer`, - // therefore all checkpoints must be created from it. - unsafe { self.lexer.rewind(lexer) }; + self.lexer.rewind(lexer); self.token = cur_token; self.prev_token_end = prev_span_end; self.errors.truncate(errors_lens); diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs index 120abb841c801..f7e3242b6a43a 100644 --- a/crates/oxc_parser/src/lexer/mod.rs +++ b/crates/oxc_parser/src/lexer/mod.rs @@ -153,14 +153,8 @@ impl<'a> Lexer<'a> { } /// Rewinds the lexer to the same state as when the passed in `checkpoint` was created. - /// - /// # SAFETY - /// `checkpoint` must have been created from this `Lexer`. - #[allow(clippy::missing_safety_doc)] // Clippy is wrong! - pub unsafe fn rewind(&mut self, checkpoint: LexerCheckpoint<'a>) { + pub fn rewind(&mut self, checkpoint: LexerCheckpoint<'a>) { self.errors.truncate(checkpoint.errors_pos); - // SAFETY: Caller guarantees `checkpoint` was created from this `Lexer`, - // and therefore `checkpoint.position` was created from `self.source`. self.source.set_position(checkpoint.position); self.token = checkpoint.token; self.lookahead.clear(); @@ -178,10 +172,7 @@ impl<'a> Lexer<'a> { let position = self.source.position(); if let Some(lookahead) = self.lookahead.back() { - // SAFETY: `self.lookahead` only contains lookaheads created by this `Lexer`. - // `self.source` never changes, so `lookahead.position` must have been created - // from `self.source`. - unsafe { self.source.set_position(lookahead.position) }; + self.source.set_position(lookahead.position); } for _i in self.lookahead.len()..n { @@ -197,8 +188,7 @@ impl<'a> Lexer<'a> { // read, so that's not possible. So no need to restore `self.token` here. // It's already in same state as it was at start of this function. - // SAFETY: `position` was created above from `self.source`. `self.source` never changes. - unsafe { self.source.set_position(position) }; + self.source.set_position(position); self.lookahead[n - 1].token } @@ -211,10 +201,7 @@ impl<'a> Lexer<'a> { /// Main entry point pub fn next_token(&mut self) -> Token { if let Some(lookahead) = self.lookahead.pop_front() { - // SAFETY: `self.lookahead` only contains lookaheads created by this `Lexer`. - // `self.source` never changes, so `lookahead.position` must have been created - // from `self.source`. - unsafe { self.source.set_position(lookahead.position) }; + self.source.set_position(lookahead.position); return lookahead.token; } let kind = self.read_next_token(); diff --git a/crates/oxc_parser/src/lexer/source.rs b/crates/oxc_parser/src/lexer/source.rs index 07f4397ca96e5..16df3cad4d190 100644 --- a/crates/oxc_parser/src/lexer/source.rs +++ b/crates/oxc_parser/src/lexer/source.rs @@ -140,14 +140,17 @@ impl<'a> Source<'a> { } /// Move current position. - /// - /// # SAFETY - /// `pos` must be created from this `Source`, not another `Source`. - /// If this is the case, the invariants of `Source` are guaranteed to be upheld. #[inline] - pub(super) unsafe fn set_position(&mut self, pos: SourcePosition) { - // `SourcePosition` always upholds the invariants of `Source`, - // as long as it's created from this `Source`. + pub(super) fn set_position(&mut self, pos: SourcePosition) { + // `SourcePosition` always upholds the invariants of `Source`, as long as it's created + // from this `Source`. `SourcePosition`s can only be created from a `Source`. + // `Source::new` takes a `UniquePromise`, which guarantees that it's the only `Source` + // in existence on this thread. `Source` is not `Sync` or `Send`, so no possibility another + // `Source` originated on another thread can "jump" onto this one. + // This is sufficient to guarantee that any `SourcePosition` that parser/lexer holds must be + // from this `Source`. + // This guarantee is what allows this function to be safe. + // SAFETY: `read_u8`'s contract is upheld by: // * The preceding checks that `pos.ptr` >= `self.start` and < `self.end`. // * `Source`'s invariants guarantee that `self.start` - `self.end` contains allocated memory. @@ -157,7 +160,8 @@ impl<'a> Source<'a> { debug_assert!( pos.ptr >= self.start && pos.ptr <= self.end - && (pos.ptr == self.end || !is_utf8_cont_byte(read_u8(pos.ptr))) + // SAFETY: See above + && (pos.ptr == self.end || !is_utf8_cont_byte(unsafe { read_u8(pos.ptr) })) ); self.ptr = pos.ptr; }