Skip to content

Commit

Permalink
refactor(parser): all pointer manipulation through SourcePosition (#…
Browse files Browse the repository at this point in the history
…2350)

A safer and faster interface for reading source text using pointers than `*ptr`.
  • Loading branch information
overlookmotel authored Feb 9, 2024
1 parent 651b0b1 commit 6f597b1
Showing 1 changed file with 97 additions and 31 deletions.
128 changes: 97 additions & 31 deletions crates/oxc_parser/src/lexer/source.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,24 @@ impl<'a> Source<'a> {
/// `SourcePosition` lives as long as the source text `&str` that `Source` was created from.
#[inline]
pub(super) fn position(&self) -> SourcePosition<'a> {
SourcePosition { ptr: self.ptr, _marker: PhantomData }
// SAFETY: Creating a `SourcePosition` from current position of `Source` is always valid,
// if caller has upheld safety conditions of other unsafe methods of this type.
let pos = unsafe { SourcePosition::new(self.ptr) };

// SAFETY: `pos.read()`'s contract is upheld by:
// * The preceding checks that `pos.ptr` >= `self.start` and < `self.end`.
// * `Source`'s invariants guarantee that `self.start` - `self.end` contains allocated memory.
// * `Source::new` takes an immutable ref `&str`, guaranteeing that the memory `pos.ptr`
// addresses cannot be aliased by a `&mut` ref as long as `Source` exists.
// * `SourcePosition` can only live as long as the `&str` underlying `Source`.
debug_assert!(
pos.ptr >= self.start
&& pos.ptr <= self.end
// SAFETY: See above
&& (pos.ptr == self.end || !is_utf8_cont_byte(unsafe { pos.read() }))
);

pos
}

/// Move current position.
Expand All @@ -151,7 +168,7 @@ impl<'a> Source<'a> {
// from this `Source`.
// This guarantee is what allows this function to be safe.

// SAFETY: `read_u8`'s contract is upheld by:
// SAFETY: `SourcePosition::read`'s contract is upheld by:
// * The preceding checks that `pos.ptr` >= `self.start` and < `self.end`.
// * `Source`'s invariants guarantee that `self.start` - `self.end` contains allocated memory.
// * `Source::new` takes an immutable ref `&str`, guaranteeing that the memory `pos.ptr`
Expand All @@ -161,7 +178,7 @@ impl<'a> Source<'a> {
pos.ptr >= self.start
&& pos.ptr <= self.end
// SAFETY: See above
&& (pos.ptr == self.end || !is_utf8_cont_byte(unsafe { read_u8(pos.ptr) }))
&& (pos.ptr == self.end || !is_utf8_cont_byte(unsafe { pos.read() }))
);
self.ptr = pos.ptr;
}
Expand All @@ -183,7 +200,7 @@ impl<'a> Source<'a> {
/// * Moving back `n` bytes would not place current position on a UTF-8 character boundary.
#[inline]
pub(super) fn back(&mut self, n: usize) {
// This assertion is essential to ensure safety of `read_u8()` call below.
// This assertion is essential to ensure safety of `pos.read()` call below.
// Without this check, calling `back(0)` on an empty `Source` would cause reading
// out of bounds.
// Compiler should remove this assertion when inlining this function,
Expand All @@ -196,19 +213,19 @@ impl<'a> Source<'a> {

// SAFETY: We have checked that `n` is less than distance between `start` and `ptr`,
// so `new_ptr` cannot be outside of allocation of original `&str`
let new_ptr = unsafe { self.ptr.sub(n) };
let new_pos = unsafe { self.position().sub(n) };

// Enforce invariant that `ptr` must be positioned on a UTF-8 character boundary.
// SAFETY: `new_ptr` is in bounds of original `&str`, and `n > 0` assertion ensures
// not at the end, so valid to read a byte.
// `Source`'s invariants guarantee that `self.start` - `self.end` contains allocated memory.
// `Source::new` takes an immutable ref `&str`, guaranteeing that the memory `new_ptr`
// addresses cannot be aliased by a `&mut` ref as long as `Source` exists.
let byte = unsafe { read_u8(new_ptr) };
let byte = unsafe { new_pos.read() };
assert!(!is_utf8_cont_byte(byte), "Offset is not on a UTF-8 character boundary");

// Move current position. The checks above satisfy `Source`'s invariants.
self.ptr = new_ptr;
self.ptr = new_pos.ptr;
}

/// Get next char of source, and advance position to after it.
Expand Down Expand Up @@ -376,43 +393,92 @@ impl<'a> Source<'a> {
// `Source::new` takes an immutable ref `&str`, guaranteeing that the memory `self.ptr`
// addresses cannot be aliased by a `&mut` ref as long as `Source` exists.
debug_assert!(self.ptr >= self.start && self.ptr < self.end);
read_u8(self.ptr)
self.position().read()
}
}

/// Wrapper around a pointer to a position in `Source`.
///
/// # SAFETY
/// `SourcePosition` must always be on a UTF-8 character boundary,
/// and within bounds of the `Source` that created it.
#[derive(Debug, Clone, Copy)]
pub struct SourcePosition<'a> {
ptr: *const u8,
_marker: PhantomData<&'a u8>,
}

impl<'a> SourcePosition<'a> {
/// Create a new `SourcePosition` from a pointer.
///
/// # SAFETY
/// * Pointer must obey all the same invariants as `Source::ptr`.
/// * It must be created from a `Source`.
/// * It must be in bounds of the source text `&str` the `Source` is created from,
/// or 1 byte after the end of the source text (i.e. positioned at EOF).
/// * It must be positioned on a UTF-8 character boundary (or EOF).
#[inline]
pub(super) unsafe fn new(ptr: *const u8) -> Self {
Self { ptr, _marker: PhantomData }
}

/// Create new `SourcePosition` which is `n` bytes after this one.
/// The provenance of the pointer `SourcePosition` contains is maintained.
///
/// # SAFETY
/// Caller must ensure that advancing `SourcePosition` by `n` bytes does not make it past the end
/// of `Source` this `SourcePosition` was created from.
/// NB: It is legal to use `add` to create a `SourcePosition` which is *on* the end of `Source`,
/// just not past it.
#[allow(dead_code)]
#[inline]
pub(super) unsafe fn add(self, n: usize) -> Self {
Self::new(self.ptr.add(n))
}

/// Create new `SourcePosition` which is `n` bytes before this one.
/// The provenance of the pointer `SourcePosition` contains is maintained.
///
/// # SAFETY
/// Caller must ensure that reversing `SourcePosition` by `n` bytes does not make it before the start
/// of `Source` this `SourcePosition` was created from.
#[inline]
pub(super) unsafe fn sub(self, n: usize) -> Self {
Self::new(self.ptr.sub(n))
}

/// Read byte from this `SourcePosition`.
///
/// # SAFETY
/// Caller must ensure `SourcePosition` is not at end of source text.
///
/// # Implementation details
///
/// Using `as_ref()` for reading is copied from `core::slice::iter::next`.
/// https://doc.rust-lang.org/src/core/slice/iter.rs.html#132
/// https://doc.rust-lang.org/src/core/slice/iter/macros.rs.html#156-168
///
/// Using `ptr.as_ref().unwrap_unchecked()` instead of `*ptr` or `ptr.read()` produces
/// a 7% speed-up on Lexer benchmarks.
/// Presumably this is because it tells the compiler it can rely on the memory being immutable,
/// because if a `&mut` reference existed, that would violate Rust's aliasing rules.
#[inline]
pub(super) unsafe fn read(self) -> u8 {
// SAFETY:
// Caller guarantees `self` is not at end of source text.
// `Source` is created from a valid `&str`, so points to allocated, initialized memory.
// `Source` conceptually holds the source text `&str`, which guarantees to mutable references
// to the same memory can exist, as that would violate Rust's aliasing rules.
// Pointer is "dereferenceable" by definition as a `u8` is 1 byte and cannot span multiple objects.
// Alignment is not relevant as `u8` is aligned on 1 (i.e. no alignment requirements).
debug_assert!(!self.ptr.is_null());
*self.ptr.as_ref().unwrap_unchecked()
}
}

/// Return if byte is a UTF-8 continuation byte.
#[inline]
const fn is_utf8_cont_byte(byte: u8) -> bool {
// 0x80 - 0xBF are continuation bytes i.e. not 1st byte of a UTF-8 character sequence
byte >= 0x80 && byte < 0xC0
}

/// Read `u8` from `*const u8` pointer.
///
/// Using `as_ref()` for reading is copied from `core::slice::iter::next`.
/// https://doc.rust-lang.org/src/core/slice/iter.rs.html#132
/// https://doc.rust-lang.org/src/core/slice/iter/macros.rs.html#156-168
///
/// This is about 7% faster than `*ptr` or `ptr.read()`, presumably because it tells the compiler
/// it can rely on the memory being immutable, because if a `&mut` reference existed, that would
/// violate Rust's aliasing rules.
///
/// # SAFETY
/// Caller must ensure pointer is non-null, and points to allocated, initialized memory.
/// Pointer must point to within an object for which no `&mut` references are currently held.
#[inline]
unsafe fn read_u8(ptr: *const u8) -> u8 {
// SAFETY: Caller guarantees pointer is non-null, and points to allocated, initialized memory.
// Caller guarantees no mutable references to same memory exist, thus upholding Rust's aliasing rules.
// Pointer is "dereferenceable" by definition as a `u8` is 1 byte and cannot span multiple objects.
// Alignment is not relevant as `u8` is aligned on 1 (i.e. no alignment requirements).
debug_assert!(!ptr.is_null());
*ptr.as_ref().unwrap_unchecked()
}

0 comments on commit 6f597b1

Please sign in to comment.