Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 66 additions & 5 deletions compiler/rustc_lexer/src/cursor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ pub struct Cursor<'a> {
pub(crate) const EOF_CHAR: char = '\0';

impl<'a> Cursor<'a> {
#[inline]
pub fn new(input: &'a str, frontmatter_allowed: FrontmatterAllowed) -> Cursor<'a> {
Cursor {
len_remaining: input.len(),
Expand All @@ -31,6 +32,7 @@ impl<'a> Cursor<'a> {
}
}

#[inline]
pub fn as_str(&self) -> &'a str {
self.chars.as_str()
}
Expand All @@ -53,12 +55,14 @@ impl<'a> Cursor<'a> {
/// Peeks at the upcoming character without advancing the cursor.
/// Yields `EOF_CHAR` when nothing is available; note that `EOF_CHAR`
/// alone does not prove the input is exhausted — confirm with the
/// `is_eof` method.
#[inline]
pub fn first(&self) -> char {
    // Cloning `Chars` is cheap, and `.next()` optimizes better than `.nth(0)`.
    match self.chars.clone().next() {
        Some(c) => c,
        None => EOF_CHAR,
    }
}

/// Peeks the second symbol from the input stream without consuming it.
#[inline]
pub(crate) fn second(&self) -> char {
// `.next()` optimizes better than `.nth(1)`
let mut iter = self.chars.clone();
Expand All @@ -67,6 +71,7 @@ impl<'a> Cursor<'a> {
}

/// Peeks the third symbol from the input stream without consuming it.
#[inline]
pub fn third(&self) -> char {
// `.next()` optimizes better than `.nth(2)`
let mut iter = self.chars.clone();
Expand All @@ -76,21 +81,25 @@ impl<'a> Cursor<'a> {
}

/// Reports whether the whole input has been consumed.
#[inline]
pub(crate) fn is_eof(&self) -> bool {
    self.as_str().is_empty()
}

/// Returns amount of already consumed symbols.
#[inline]
pub(crate) fn pos_within_token(&self) -> u32 {
    // Consumed = initial remaining length minus what is still unread.
    let unread = self.chars.as_str().len();
    (self.len_remaining - unread) as u32
}

/// Resets the number of bytes consumed to 0.
#[inline]
pub(crate) fn reset_pos_within_token(&mut self) {
    // From now on, everything already eaten counts as outside the token.
    self.len_remaining = self.as_str().len();
}

/// Moves to the next character.
#[inline]
pub(crate) fn bump(&mut self) -> Option<char> {
let c = self.chars.next()?;

Expand All @@ -102,24 +111,76 @@ impl<'a> Cursor<'a> {
Some(c)
}

/// Advances past the next character when it equals `expected`;
/// returns whether the cursor moved.
#[inline]
pub(crate) fn bump_if(&mut self, expected: char) -> bool {
    let mut lookahead = self.chars.clone();
    match lookahead.next() {
        Some(c) if c == expected => {
            // Commit the lookahead iterator to actually consume the char.
            self.chars = lookahead;
            true
        }
        _ => false,
    }
}

/// Bumps the cursor if the next character is either of the two expected characters.
#[inline]
pub(crate) fn bump_if2(&mut self, expected1: char, expected2: char) -> bool {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would call this bump_if_either. bump_if2 makes me think that expected1 must be followed by expected2.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure I can rename it.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it would be logical to rename eat_past2 to eat_past_either too, and use byte1 and byte2 to match already existing eat_until style, what do you suggest?

let mut chars = self.chars.clone();
if let Some(c) = chars.next()
&& (c == expected1 || c == expected2)
{
self.chars = chars;
return true;
}
false
}

/// Moves to a substring by a number of bytes.
#[inline]
pub(crate) fn bump_bytes(&mut self, n: usize) {
self.chars = self.as_str()[n..].chars();
self.chars = self.as_str().get(n..).unwrap_or("").chars();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the thinking behind this change?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it removes the panic handling code generation and branching which in my experiments it is always faster even when panic doesn't happen. if it can be proven by llvm that it will never panic unwrap_or will be optimized away like the panic handling.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it seems that I am wrong and it does not apply everywhere as even in my benchmark suite it reduces performance in cursor_eat_until/eat_until_newline, I'm going to remove this.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems that only works if function is #[inline].

}

/// Eats symbols while predicate returns true or until the end of file is reached.
#[inline]
pub(crate) fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
    // A specialized version (e.g. for line comments) was tried, but LLVM
    // inlines all of this down to fast byte-wise iteration anyway.
    loop {
        // Keep the original evaluation order: the predicate sees EOF_CHAR
        // at end of input before the EOF check stops the loop.
        if !predicate(self.first()) || self.is_eof() {
            break;
        }
        self.bump();
    }
}
/// Eats characters until the given byte is found.
/// Returns true if the byte was found, false if end of file was reached.
#[inline]
pub(crate) fn eat_until(&mut self, byte: u8) -> bool {
    if let Some(index) = memchr::memchr(byte, self.as_str().as_bytes()) {
        // Stop ON the found byte; it is not consumed.
        self.bump_bytes(index);
        true
    } else {
        // Nothing left to scan: park the cursor at EOF.
        self.chars = "".chars();
        false
    }
}

pub(crate) fn eat_until(&mut self, byte: u8) {
self.chars = match memchr::memchr(byte, self.as_str().as_bytes()) {
Some(index) => self.as_str()[index..].chars(),
None => "".chars(),
/// Eats characters until any of the given bytes is found, then consumes past it.
/// Returns the found byte if any, or None if end of file was reached.
#[inline]
pub(crate) fn eat_past2(&mut self, byte1: u8, byte2: u8) -> Option<u8> {
let bytes = self.as_str().as_bytes();
match memchr::memchr2(byte1, byte2, bytes) {
Some(index) => {
let found = bytes[index];
self.bump_bytes(index + 1);
Some(found)
}
None => {
self.chars = "".chars();
None
}
}
}
}
90 changes: 52 additions & 38 deletions compiler/rustc_lexer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -563,11 +563,30 @@ impl Cursor<'_> {
self.eat_while(|ch| ch != '\n' && is_horizontal_whitespace(ch));
let invalid_infostring = self.first() != '\n';

let mut found = false;
let nl_fence_pattern = format!("\n{:-<1$}", "", length_opening as usize);
if let Some(closing) = self.as_str().find(&nl_fence_pattern) {
/// Finds a line that starts with at least `dash_count` consecutive `-`
/// characters and returns the byte offset just past those dashes, or
/// `None` when no such line exists. Equivalent to searching for the
/// pattern `"\n" + "-" * dash_count` without allocating the pattern.
#[inline]
fn find_closing_fence(s: &str, dash_count: usize) -> Option<usize> {
    let bytes = s.as_bytes();
    let mut search_from = 0;
    // `str::find` on a char pattern compiles down to a fast byte scan;
    // `search_from` always sits on a char boundary (just past a '\n').
    while let Some(rel) = s[search_from..].find('\n') {
        let line_start = search_from + rel + 1;
        if line_start + dash_count <= bytes.len()
            && bytes[line_start..line_start + dash_count].iter().all(|&b| b == b'-')
        {
            return Some(line_start + dash_count);
        }
        // Not a fence: resume scanning right after this newline.
        search_from = line_start;
    }
    None
}

if let Some(closing) = find_closing_fence(self.as_str(), length_opening as usize) {
// candidate found
self.bump_bytes(closing + nl_fence_pattern.len());
self.bump_bytes(closing);
// in case like
// ---cargo
// --- blahblah
Expand All @@ -576,10 +595,7 @@ impl Cursor<'_> {
// ----
// combine those stuff into this frontmatter token such that it gets detected later.
self.eat_until(b'\n');
found = true;
}

if !found {
} else {
// recovery strategy: a closing statement might have preceding whitespace/newline
// but not have enough dashes to properly close. In this case, we eat until there,
// and report a mismatch in the parser.
Expand Down Expand Up @@ -656,23 +672,25 @@ impl Cursor<'_> {
};

let mut depth = 1usize;
while let Some(c) = self.bump() {
while let Some(c) = self.eat_past2(b'/', b'*') {
match c {
'/' if self.first() == '*' => {
self.bump();
depth += 1;
b'/' => {
if self.bump_if('*') {
depth += 1;
}
}
'*' if self.first() == '/' => {
self.bump();
depth -= 1;
if depth == 0 {
// This block comment is closed, so for a construction like "/* */ */"
// there will be a successfully parsed block comment "/* */"
// and " */" will be processed separately.
break;
b'*' => {
if self.bump_if('/') {
depth -= 1;
if depth == 0 {
// This block comment is closed, so for a construction like "/* */ */"
// there will be a successfully parsed block comment "/* */"
// and " */" will be processed separately.
break;
}
}
}
_ => (),
_ => unreachable!(),
}
}

Expand Down Expand Up @@ -935,19 +953,21 @@ impl Cursor<'_> {
/// if string is terminated.
fn double_quoted_string(&mut self) -> bool {
debug_assert!(self.prev() == '"');
while let Some(c) = self.bump() {
while let Some(c) = self.eat_past2(b'"', b'\\') {
match c {
'"' => {
b'"' => {
return true;
}
'\\' if self.first() == '\\' || self.first() == '"' => {
// Bump again to skip escaped character.
self.bump();
b'\\' => {
let first = self.first();
if first == '\\' || first == '"' {
// Bump to skip escaped character.
self.bump();
}
}
_ => (),
_ => unreachable!(),
}
}
// End of file reached.
false
}

Expand All @@ -963,9 +983,8 @@ impl Cursor<'_> {
debug_assert!(self.prev() != '#');

let mut n_start_hashes: u32 = 0;
while self.first() == '#' {
while self.bump_if('#') {
n_start_hashes += 1;
self.bump();
}

if self.first() != '"' {
Expand Down Expand Up @@ -1025,9 +1044,8 @@ impl Cursor<'_> {

// Count opening '#' symbols.
let mut eaten = 0;
while self.first() == '#' {
while self.bump_if('#') {
eaten += 1;
self.bump();
}
let n_start_hashes = eaten;

Expand All @@ -1043,9 +1061,7 @@ impl Cursor<'_> {
// Skip the string contents and on each '#' character met, check if this is
// a raw string termination.
loop {
self.eat_until(b'"');

if self.is_eof() {
if !self.eat_until(b'"') {
return Err(RawStrError::NoTerminator {
expected: n_start_hashes,
found: max_hashes,
Expand Down Expand Up @@ -1117,9 +1133,7 @@ impl Cursor<'_> {
/// Eats the float exponent: an optional sign followed by decimal digits.
/// Returns true if at least one digit was consumed,
/// and returns false otherwise.
fn eat_float_exponent(&mut self) -> bool {
    debug_assert!(self.prev() == 'e' || self.prev() == 'E');
    // The sign is optional, so the result of the bump is irrelevant here.
    self.bump_if2('-', '+');
    self.eat_decimal_digits()
}

Expand Down
Loading