From 1ac102a15de8095a141fcdf672e0cad457b0caab Mon Sep 17 00:00:00 2001 From: Miguel Young de la Sota Date: Sun, 26 Jan 2025 15:34:19 -0800 Subject: [PATCH 1/2] ilex: Implement line end tokens --- ilex/src/rt/dfa.rs | 6 ++ ilex/src/rt/emit2.rs | 35 ++++++++++- ilex/src/rt/lexer.rs | 12 +++- ilex/src/rule.rs | 63 +++++++++++++++++++ ilex/src/spec.rs | 11 +++- ilex/src/token/stream.rs | 4 +- ilex/tests/greedy/greedy.tokens.yaml | 30 +++++++++ ilex/tests/greedy/main.rs | 6 ++ ilex/tests/greedy/newlines.tokens.yaml | 30 +++++++++ ilex/tests/greedy/newlines.txt | 5 ++ ilex/tests/ui/ambiguous/idents.stderr | 6 +- .../tests/ui/ambiguous/no_xid_after_cm.stderr | 2 +- .../tests/ui/ambiguous/no_xid_after_id.stderr | 7 ++- .../tests/ui/ambiguous/no_xid_after_nm.stderr | 6 +- .../tests/ui/ambiguous/no_xid_after_st.stderr | 2 +- ilex/tests/ui/digital/invalid.stderr | 10 +-- ilex/tests/ui/digital/missing.stderr | 2 +- ilex/tests/ui/digital/points.stderr | 8 +-- ilex/tests/ui/digital/separators.stderr | 16 ++--- ilex/tests/ui/eof/bracket.stderr | 2 +- ilex/tests/ui/eof/bracket_multiline.stderr | 2 +- ilex/tests/ui/eof/comment.stderr | 2 +- ilex/tests/ui/eof/comment_multiline.stderr | 2 +- ilex/tests/ui/eof/mixed_brackets.stderr | 6 +- .../ui/eof/mixed_brackets_multiline.stderr | 6 +- ilex/tests/ui/eof/quoted.stderr | 2 +- ilex/tests/ui/eof/quoted_multiline.stderr | 2 +- ilex/tests/ui/too_small/ident.stderr | 2 +- 28 files changed, 239 insertions(+), 48 deletions(-) create mode 100644 ilex/tests/greedy/newlines.tokens.yaml create mode 100644 ilex/tests/greedy/newlines.txt diff --git a/ilex/src/rt/dfa.rs b/ilex/src/rt/dfa.rs index f6dfd06..9a3fc5d 100644 --- a/ilex/src/rt/dfa.rs +++ b/ilex/src/rt/dfa.rs @@ -165,6 +165,12 @@ fn compile_rule(rule: &Any) -> Rule { let (pat, close) = match rule { Any::Keyword(rule) => (lit(&rule.value), None), + Any::LineEnd(rule) if rule.cancel.is_empty() => (lit(&"\n".into()), None), + + Any::LineEnd(rule) => { + 
(Hir::alternation(vec![lit(&rule.cancel), lit(&"\n".into())]), None) + } + Any::Comment(rule) => { // We can just throw the bracket in, regardless of whether it's a line // comment. Because of how the outer lexer loop works, we will run the DFA diff --git a/ilex/src/rt/emit2.rs b/ilex/src/rt/emit2.rs index 8c2c21a..074fa84 100644 --- a/ilex/src/rt/emit2.rs +++ b/ilex/src/rt/emit2.rs @@ -258,9 +258,34 @@ pub fn emit(lexer: &mut Lexer) { // Now we have repeat the process from the 'verify, but now we know what kind // of token we're going to create. - match lexer.spec().rule(best.lexeme) { + let rule = lexer.spec().rule(best.lexeme); + if !matches!(rule, Any::Comment(..)) { + // Diagnose a \ that is not followed by only spaces and comments. + if let Some(cancel) = lexer.line_end_cancel.take() { + let cancel = cancel.get(lexer.file()); + lexer + .report() + .builtins(lexer.spec()) + .unexpected(cancel.text(), best.lexeme, cancel) + .note(f!( + "expected `{}` to be followed by a new line", + cancel.text() + )); + } + } + + match rule { Any::Keyword(..) => lexer.add_token(best.lexeme, range.len(), None), + Any::LineEnd(..) if text == "\n" => { + lexer.add_token(best.lexeme, range.len(), None) + } + Any::LineEnd(..) => { + // The cancel is always inserted as whitespace. + lexer.add_token(rt::WHITESPACE, range.len(), None); + lexer.line_end_cancel = Some(range.span2()) + } + Any::Bracket(..) => { // Construct the closer. lexer.push_closer( @@ -306,6 +331,12 @@ pub fn emit(lexer: &mut Lexer) { .unclosed(span, &close, Lexeme::eof(), lexer.eof()); } + // Crop off an ending \n so that it can get turned into whitespace or + // a line end token, as appropriate. 
+ if close == "\n" && depth == 0 { + cursor -= 1; + } + lexer.add_token(best.lexeme, cursor - lexer.cursor(), None); } @@ -755,7 +786,7 @@ pub fn emit(lexer: &mut Lexer) { } let rest = lexer.text(lexer.cursor()..); - let prev = rest.chars().next_back(); + let prev = lexer.text(..lexer.cursor()).chars().next_back(); if prev.is_some_and(is_xid) { let xids = rest.find(|c| !is_xid(c)).unwrap_or(rest.len()); if xids > 0 { diff --git a/ilex/src/rt/lexer.rs b/ilex/src/rt/lexer.rs index 165427c..e191fd0 100644 --- a/ilex/src/rt/lexer.rs +++ b/ilex/src/rt/lexer.rs @@ -9,6 +9,7 @@ use regex_automata::hybrid::dfa::Cache; use crate::f; use crate::file::File; use crate::file::Span; +use crate::file::Span2; use crate::report::Builtins; use crate::report::Report; use crate::rt; @@ -31,6 +32,7 @@ pub struct Lexer<'a, 'ctx> { cursor: usize, closers: Vec, comments: Vec, + pub line_end_cancel: Option, cache: Cache, } @@ -60,6 +62,7 @@ impl<'a, 'ctx> Lexer<'a, 'ctx> { cursor: 0, closers: Vec::new(), comments: Vec::new(), + line_end_cancel: None, cache: Cache::new(&spec.dfa().engine), } @@ -290,10 +293,17 @@ impl<'a, 'ctx> Lexer<'a, 'ctx> { } pub fn skip_whitespace(&mut self) -> bool { + let have_line_end = self.spec().builder.line_end.is_some(); let len = self .text(self.cursor()..) 
.chars() - .take_while(|c| c.is_whitespace()) + .take_while(|&c| { + if c == '\n' && have_line_end { + return self.line_end_cancel.take().is_some(); + } + + c.is_whitespace() + }) .map(char::len_utf8) .sum(); diff --git a/ilex/src/rule.rs b/ilex/src/rule.rs index cadd086..60f81df 100644 --- a/ilex/src/rule.rs +++ b/ilex/src/rule.rs @@ -32,6 +32,7 @@ pub use crate::token::Sign; #[allow(missing_docs)] pub enum Any { Keyword(Keyword), + LineEnd(LineEnd), Bracket(Bracket), Ident(Ident), Quoted(Quoted), @@ -44,6 +45,7 @@ impl Any { pub(crate) fn debug_name(&self) -> &'static str { match self { Any::Keyword(_) => "Keyword", + Any::LineEnd(_) => "LineEnd", Any::Bracket(_) => "Bracket", Any::Ident(_) => "Ident", Any::Digital(_) => "Digital", @@ -141,6 +143,67 @@ impl TryFrom for Keyword { } } +/// A line ending. +/// +/// Line ends are like [`Keyword`]s with the value `"\n"`, but which have two +/// extra features: +/// +/// 1. They can specify a "cancel" string for escaping a newline. This is +/// valuable for situations where a line end is syntactically meaningful, but +/// users need to break a line without it affecting lexing. For example, \ +/// takes this role in C, since C uses a line-end token for `#define`s. +/// +/// The cancel string, followed by whitespace and then a newline, will cause +/// that newline to become whitespace, rather than a token. +/// +/// 2. They play nice with line comments. A line comment's ending newline will +/// be turned into a `LineEnd`, unless the comment was prefixed with the +/// cancel string. +#[derive(Default, Debug)] +pub struct LineEnd { + pub(crate) cancel: Yarn, +} + +impl LineEnd { + /// Constructs a new line end rule with no cancel. + pub fn new() -> Self { + Self::default() + } + + /// Constructs a new line end rule with the given cancel prefix. 
+ pub fn cancellable(cancel: impl Into) -> Self { + Self { cancel: cancel.into() } + } +} + +impl Rule for LineEnd { + type Token<'lex> = token::Keyword<'lex>; + + fn try_from_ref(value: &Any) -> Result<&Self, WrongKind> { + match value { + Any::LineEnd(rule) => Ok(rule), + _ => Err(WrongKind { want: "LineEnd", got: value.debug_name() }), + } + } +} + +impl From for Any { + fn from(value: LineEnd) -> Self { + Any::LineEnd(value) + } +} + +impl TryFrom for LineEnd { + type Error = WrongKind; + + fn try_from(value: Any) -> Result { + match value { + Any::LineEnd(rule) => Ok(rule), + _ => Err(WrongKind { want: "LineEnd", got: value.debug_name() }), + } + } +} + /// A paired bracket, such as `(..)`. /// /// Brackets are pairs of delimiters with tokens between them. They are used as diff --git a/ilex/src/spec.rs b/ilex/src/spec.rs index ea49840..f4768f7 100644 --- a/ilex/src/spec.rs +++ b/ilex/src/spec.rs @@ -15,6 +15,7 @@ use crate::report::Expected; use crate::rt; use crate::rule; use crate::rule::Comment; +use crate::rule::LineEnd; use crate::rule::Rule; /// An ID for a lexeme that a [`Spec`][crate::Spec] can capture. @@ -85,7 +86,7 @@ impl fmt::Debug for Lexeme { /// This is a compiled, immutable object that describes how to lex a particular /// language. 
The [`Spec::builder()`] function returns a builder for pub struct Spec { - builder: SpecBuilder, + pub(crate) builder: SpecBuilder, dfa: rt::Dfa, } @@ -139,6 +140,7 @@ impl Spec { pub struct SpecBuilder { pub(crate) rules: Vec, pub(crate) names: Vec, + pub(crate) line_end: Option>, } impl SpecBuilder { @@ -198,7 +200,11 @@ impl SpecBuilder { self.names.push(name.into()); self.rules.push(rule.into()); - Lexeme::new(self.rules.len() as i32 - 1) + let lex = Lexeme::new(self.rules.len() as i32 - 1); + if let rule::Any::LineEnd(_) = self.rules.last().unwrap() { + self.line_end = Some(lex.cast()); + } + lex } #[doc(hidden)] @@ -258,6 +264,7 @@ impl Lexeme { match spec.rule(self) { rule::Any::Keyword(rule) => yarn!("`{}`", rule.value), + rule::Any::LineEnd(_) => "line ending".into(), rule::Any::Bracket(d) | rule::Any::Comment(Comment { bracket: d, .. }) => match &d.kind { rule::BracketKind::Paired(open, close) => { diff --git a/ilex/src/token/stream.rs b/ilex/src/token/stream.rs index e25c7ac..f1ce5da 100644 --- a/ilex/src/token/stream.rs +++ b/ilex/src/token/stream.rs @@ -103,7 +103,9 @@ impl<'ctx> Stream<'ctx> { Some(match self.spec().rule(tok.lexeme) { rule::Any::Comment(..) => return None, - rule::Any::Keyword(..) => token::Keyword { stream: self, id }.into(), + rule::Any::Keyword(..) | rule::Any::LineEnd(..) => { + token::Keyword { stream: self, id }.into() + } rule::Any::Ident(..) => token::Ident { stream: self, id }.into(), rule::Any::Bracket(..) 
=> { diff --git a/ilex/tests/greedy/greedy.tokens.yaml b/ilex/tests/greedy/greedy.tokens.yaml index 3bfeb32..d01bc2d 100644 --- a/ilex/tests/greedy/greedy.tokens.yaml +++ b/ilex/tests/greedy/greedy.tokens.yaml @@ -1,10 +1,22 @@ - keyword: lexeme: 3 span: {span: [0, 6], text: "poison"} +- keyword: + lexeme: 5 + span: + span: [6, 7] + text: | + - ident: lexeme: 4 span: {span: [7, 16], text: "poisonous"} name: {span: [7, 16], text: "poisonous"} +- keyword: + lexeme: 5 + span: + span: [16, 17] + text: | + - quoted: lexeme: 0 span: @@ -14,6 +26,12 @@ - {span: [17, 27], text: "poisonous["} - {span: [30, 32], text: "]>"} contents: [{text: {span: [27, 30], text: "xyz"}}] +- keyword: + lexeme: 5 + span: + span: [32, 33] + text: | + - quoted: lexeme: 0 span: @@ -23,6 +41,12 @@ - {span: [33, 47], text: "poisonous#%#%["} - {span: [50, 56], text: "]#%#%>"} contents: [{text: {span: [47, 50], text: "xyz"}}] +- keyword: + lexeme: 5 + span: + span: [56, 57] + text: | + - ident: lexeme: 4 span: {span: [57, 66], text: "poisonous"} @@ -38,6 +62,12 @@ lexeme: 4 span: {span: [68, 71], text: "xyz"} name: {span: [68, 71], text: "xyz"} +- keyword: + lexeme: 5 + span: + span: [72, 73] + text: | + - quoted: lexeme: 1 span: diff --git a/ilex/tests/greedy/main.rs b/ilex/tests/greedy/main.rs index c80989c..c3989a2 100644 --- a/ilex/tests/greedy/main.rs +++ b/ilex/tests/greedy/main.rs @@ -29,6 +29,12 @@ fn greedy(test: &gilded::Test) { #[rule(Ident::new())] ident: Lexeme, + + #[rule(LineEnd::cancellable("\\"))] + nl: Lexeme, + + #[rule(Comment::line("//"))] + comment: Lexeme, } let ctx = Context::new(); diff --git a/ilex/tests/greedy/newlines.tokens.yaml b/ilex/tests/greedy/newlines.tokens.yaml new file mode 100644 index 0000000..1103d3e --- /dev/null +++ b/ilex/tests/greedy/newlines.tokens.yaml @@ -0,0 +1,30 @@ +- keyword: + lexeme: 3 + span: {span: [0, 6], text: "poison"} +- keyword: + lexeme: 5 + span: + span: [6, 7] + text: | + +- keyword: + lexeme: 3 + span: {span: [7, 13], text: "poison"} 
+- keyword: + lexeme: 3 + span: {span: [16, 22], text: "poison"} +- keyword: + lexeme: 3 + span: {span: [36, 42], text: "poison"} +- keyword: + lexeme: 5 + span: + span: [53, 54] + text: | + +- keyword: + lexeme: 3 + span: {span: [54, 60], text: "poison"} +- eof: + lexeme: 2147483647 + span: {span: [60, 60], text: ""} diff --git a/ilex/tests/greedy/newlines.txt b/ilex/tests/greedy/newlines.txt new file mode 100644 index 0000000..295f4cf --- /dev/null +++ b/ilex/tests/greedy/newlines.txt @@ -0,0 +1,5 @@ +poison +poison \ +poison \ // comment +poison // comment +poison \ No newline at end of file diff --git a/ilex/tests/ui/ambiguous/idents.stderr b/ilex/tests/ui/ambiguous/idents.stderr index bb169ff..bf9d1c1 100644 --- a/ilex/tests/ui/ambiguous/idents.stderr +++ b/ilex/tests/ui/ambiguous/idents.stderr @@ -5,7 +5,7 @@ error: unexpected `b` in `/`-suffixed number | ^ | --- help: because this value is decimal (base 10), digits should be within '0'..='9' | - = note: reported at: ilex/src/rt/emit2.rs:562:34 + = note: reported at: ilex/src/rt/emit2.rs:593:34 error: unexpected `a` in `/`-suffixed number --> ambiguous/idents.txt:1:7 @@ -14,7 +14,7 @@ error: unexpected `a` in `/`-suffixed number | ^ | --- help: because this value is decimal (base 10), digits should be within '0'..='9' | - = note: reported at: ilex/src/rt/emit2.rs:562:34 + = note: reported at: ilex/src/rt/emit2.rs:593:34 error: unexpected `r` in `/`-suffixed number --> ambiguous/idents.txt:1:8 @@ -23,6 +23,6 @@ error: unexpected `r` in `/`-suffixed number | ^ | --- help: because this value is decimal (base 10), digits should be within '0'..='9' | - = note: reported at: ilex/src/rt/emit2.rs:562:34 + = note: reported at: ilex/src/rt/emit2.rs:593:34 error: aborting due to 3 errors diff --git a/ilex/tests/ui/ambiguous/no_xid_after_cm.stderr b/ilex/tests/ui/ambiguous/no_xid_after_cm.stderr index b9181e7..5392290 100644 --- a/ilex/tests/ui/ambiguous/no_xid_after_cm.stderr +++ 
b/ilex/tests/ui/ambiguous/no_xid_after_cm.stderr @@ -5,6 +5,6 @@ error: extraneous characters after `--null ... null` | ^^^^ | -- help: maybe you meant to include a space here | - = note: reported at: ilex/src/rt/emit2.rs:779:10 + = note: reported at: ilex/src/rt/emit2.rs:810:10 error: aborting due to previous error diff --git a/ilex/tests/ui/ambiguous/no_xid_after_id.stderr b/ilex/tests/ui/ambiguous/no_xid_after_id.stderr index 7a15292..a821733 100644 --- a/ilex/tests/ui/ambiguous/no_xid_after_id.stderr +++ b/ilex/tests/ui/ambiguous/no_xid_after_id.stderr @@ -1,9 +1,10 @@ -error: unexpected closing `ua` +error: extraneous characters after `/`-prefixed, `%q`-suffixed identifier --> ambiguous/no_xid_after_id.txt:1:22 | 1 | /foo%q /null%q /foo%qua - | ^^ expected to be opened by `--ua` + | ^^ + | -- help: maybe you meant to include a space here | - = note: reported at: ilex/src/rt/emit2.rs:254:22 + = note: reported at: ilex/src/rt/emit2.rs:810:10 error: aborting due to previous error diff --git a/ilex/tests/ui/ambiguous/no_xid_after_nm.stderr b/ilex/tests/ui/ambiguous/no_xid_after_nm.stderr index 8456863..d668917 100644 --- a/ilex/tests/ui/ambiguous/no_xid_after_nm.stderr +++ b/ilex/tests/ui/ambiguous/no_xid_after_nm.stderr @@ -5,7 +5,7 @@ error: unexpected `q` in `%`-prefixed number | ^ | ------- help: because this value is decimal (base 10), digits should be within '0'..='9' | - = note: reported at: ilex/src/rt/emit2.rs:562:34 + = note: reported at: ilex/src/rt/emit2.rs:593:34 error: unexpected `u` in `%`-prefixed number --> ambiguous/no_xid_after_nm.txt:1:11 @@ -14,7 +14,7 @@ error: unexpected `u` in `%`-prefixed number | ^ | ------- help: because this value is decimal (base 10), digits should be within '0'..='9' | - = note: reported at: ilex/src/rt/emit2.rs:562:34 + = note: reported at: ilex/src/rt/emit2.rs:593:34 error: unexpected `a` in `%`-prefixed number --> ambiguous/no_xid_after_nm.txt:1:12 @@ -23,6 +23,6 @@ error: unexpected `a` in `%`-prefixed number | ^ 
| ------- help: because this value is decimal (base 10), digits should be within '0'..='9' | - = note: reported at: ilex/src/rt/emit2.rs:562:34 + = note: reported at: ilex/src/rt/emit2.rs:593:34 error: aborting due to 3 errors diff --git a/ilex/tests/ui/ambiguous/no_xid_after_st.stderr b/ilex/tests/ui/ambiguous/no_xid_after_st.stderr index d136573..c3deb13 100644 --- a/ilex/tests/ui/ambiguous/no_xid_after_st.stderr +++ b/ilex/tests/ui/ambiguous/no_xid_after_st.stderr @@ -5,6 +5,6 @@ error: extraneous characters after `%'...'q` | ^^ | -- help: maybe you meant to include a space here | - = note: reported at: ilex/src/rt/emit2.rs:779:10 + = note: reported at: ilex/src/rt/emit2.rs:810:10 error: aborting due to previous error diff --git a/ilex/tests/ui/digital/invalid.stderr b/ilex/tests/ui/digital/invalid.stderr index 50b3962..f1dcc5e 100644 --- a/ilex/tests/ui/digital/invalid.stderr +++ b/ilex/tests/ui/digital/invalid.stderr @@ -5,7 +5,7 @@ error: unexpected `8` in `0o`-prefixed number | ^ | --- help: because this value is octal (base 8), digits should be within '0'..='7' | - = note: reported at: ilex/src/rt/emit2.rs:562:34 + = note: reported at: ilex/src/rt/emit2.rs:593:34 error: unexpected `8` in `0o`-prefixed number --> digital/invalid.txt:3:4 @@ -14,7 +14,7 @@ error: unexpected `8` in `0o`-prefixed number | ^ | ---- help: because this value is octal (base 8), digits should be within '0'..='7' | - = note: reported at: ilex/src/rt/emit2.rs:562:34 + = note: reported at: ilex/src/rt/emit2.rs:593:34 error: unexpected `a` in number --> digital/invalid.txt:4:5 @@ -23,7 +23,7 @@ error: unexpected `a` in number | ^ | --------- help: because this value is decimal (base 10), digits should be within '0'..='9' | - = note: reported at: ilex/src/rt/emit2.rs:562:34 + = note: reported at: ilex/src/rt/emit2.rs:593:34 error: unexpected `a` in number --> digital/invalid.txt:4:6 @@ -32,7 +32,7 @@ error: unexpected `a` in number | ^ | --------- help: because this value is decimal (base 
10), digits should be within '0'..='9' | - = note: reported at: ilex/src/rt/emit2.rs:562:34 + = note: reported at: ilex/src/rt/emit2.rs:593:34 error: unexpected `g` in number --> digital/invalid.txt:4:9 @@ -41,6 +41,6 @@ error: unexpected `g` in number | ^ | --------- help: because this value is decimal (base 10), digits should be within '0'..='9' | - = note: reported at: ilex/src/rt/emit2.rs:562:34 + = note: reported at: ilex/src/rt/emit2.rs:593:34 error: aborting due to 5 errors diff --git a/ilex/tests/ui/digital/missing.stderr b/ilex/tests/ui/digital/missing.stderr index ae4fa47..d6c50fe 100644 --- a/ilex/tests/ui/digital/missing.stderr +++ b/ilex/tests/ui/digital/missing.stderr @@ -5,6 +5,6 @@ error: expected digits after `0x`, but found ` ` | ^ expected digits after `0x` | ^^ because of this prefix | - = note: reported at: ilex/src/rt/emit2.rs:540:18 + = note: reported at: ilex/src/rt/emit2.rs:571:18 error: aborting due to previous error diff --git a/ilex/tests/ui/digital/points.stderr b/ilex/tests/ui/digital/points.stderr index 02acd80..71ef565 100644 --- a/ilex/tests/ui/digital/points.stderr +++ b/ilex/tests/ui/digital/points.stderr @@ -4,7 +4,7 @@ error: expected at least 2 `/`s 2 | 1/2/3/4e4/5 | ^ | - = note: reported at: ilex/src/rt/emit2.rs:523:16 + = note: reported at: ilex/src/rt/emit2.rs:554:16 error: unrecognized character --> digital/points.txt:2:6 @@ -20,7 +20,7 @@ error: expected at least 2 `/`s 3 | 1/2e4/5 | ^^^ | - = note: reported at: ilex/src/rt/emit2.rs:523:16 + = note: reported at: ilex/src/rt/emit2.rs:554:16 error: expected at least 2 `/`s --> digital/points.txt:4:11 @@ -28,7 +28,7 @@ error: expected at least 2 `/`s 4 | 1/2/3e4/5/6 | ^ | - = note: reported at: ilex/src/rt/emit2.rs:523:16 + = note: reported at: ilex/src/rt/emit2.rs:554:16 error: unrecognized character --> digital/points.txt:4:10 @@ -44,6 +44,6 @@ error: expected at least 1 `/` 5 | 1/2/3e4 | ^^ | - = note: reported at: ilex/src/rt/emit2.rs:523:16 + = note: reported at: 
ilex/src/rt/emit2.rs:554:16 error: aborting due to 6 errors diff --git a/ilex/tests/ui/digital/separators.stderr b/ilex/tests/ui/digital/separators.stderr index ae70934..ed6f45e 100644 --- a/ilex/tests/ui/digital/separators.stderr +++ b/ilex/tests/ui/digital/separators.stderr @@ -4,7 +4,7 @@ error: unexpected digit separator in `no_prefix@`-prefixed number 2 | no_prefix@_123_._456_e_789_._012_ | ^ | - = note: reported at: ilex/src/rt/emit2.rs:387:36 + = note: reported at: ilex/src/rt/emit2.rs:418:36 error: unexpected digit separator in `no_suffix@`-prefixed number --> digital/separators.txt:3:33 @@ -12,7 +12,7 @@ error: unexpected digit separator in `no_suffix@`-prefixed number 3 | no_suffix@_123_._456_e_789_._012_ | ^ | - = note: reported at: ilex/src/rt/emit2.rs:474:28 + = note: reported at: ilex/src/rt/emit2.rs:505:28 error: unexpected digit separator in `no_point@`-prefixed number --> digital/separators.txt:4:15 @@ -20,7 +20,7 @@ error: unexpected digit separator in `no_point@`-prefixed number 4 | no_point@_123_._456_e_789_._012_ | ^ | - = note: reported at: ilex/src/rt/emit2.rs:404:32 + = note: reported at: ilex/src/rt/emit2.rs:435:32 error: unexpected digit separator in `no_point@`-prefixed number --> digital/separators.txt:4:16 @@ -28,7 +28,7 @@ error: unexpected digit separator in `no_point@`-prefixed number 4 | no_point@_123_._456_e_789_._012_ | ^ | - = note: reported at: ilex/src/rt/emit2.rs:387:36 + = note: reported at: ilex/src/rt/emit2.rs:418:36 error: unexpected digit separator in `no_point@`-prefixed number --> digital/separators.txt:4:27 @@ -36,7 +36,7 @@ error: unexpected digit separator in `no_point@`-prefixed number 4 | no_point@_123_._456_e_789_._012_ | ^ | - = note: reported at: ilex/src/rt/emit2.rs:404:32 + = note: reported at: ilex/src/rt/emit2.rs:435:32 error: unexpected digit separator in `no_point@`-prefixed number --> digital/separators.txt:4:28 @@ -44,7 +44,7 @@ error: unexpected digit separator in `no_point@`-prefixed number 4 | 
no_point@_123_._456_e_789_._012_ | ^ | - = note: reported at: ilex/src/rt/emit2.rs:387:36 + = note: reported at: ilex/src/rt/emit2.rs:418:36 error: unexpected digit separator in `no_exp@`-prefixed number --> digital/separators.txt:5:19 @@ -52,7 +52,7 @@ error: unexpected digit separator in `no_exp@`-prefixed number 5 | no_exp@_123_._456_e_789_._012_ | ^ | - = note: reported at: ilex/src/rt/emit2.rs:424:34 + = note: reported at: ilex/src/rt/emit2.rs:455:34 error: unexpected digit separator in `no_exp@`-prefixed number --> digital/separators.txt:5:20 @@ -60,6 +60,6 @@ error: unexpected digit separator in `no_exp@`-prefixed number 5 | no_exp@_123_._456_e_789_._012_ | ^ | - = note: reported at: ilex/src/rt/emit2.rs:387:36 + = note: reported at: ilex/src/rt/emit2.rs:418:36 error: aborting due to 8 errors diff --git a/ilex/tests/ui/eof/bracket.stderr b/ilex/tests/ui/eof/bracket.stderr index ea31961..92f253f 100644 --- a/ilex/tests/ui/eof/bracket.stderr +++ b/ilex/tests/ui/eof/bracket.stderr @@ -5,6 +5,6 @@ error: expected closing `]`, but found | ^ expected `]` here | - help: previously opened here | - = note: reported at: ilex/src/rt/lexer.rs:311:10 + = note: reported at: ilex/src/rt/lexer.rs:323:10 error: aborting due to previous error diff --git a/ilex/tests/ui/eof/bracket_multiline.stderr b/ilex/tests/ui/eof/bracket_multiline.stderr index 9d0148b..e152675 100644 --- a/ilex/tests/ui/eof/bracket_multiline.stderr +++ b/ilex/tests/ui/eof/bracket_multiline.stderr @@ -5,6 +5,6 @@ error: expected closing `]`, but found | ^ expected `]` here | - help: previously opened here | - = note: reported at: ilex/src/rt/lexer.rs:311:10 + = note: reported at: ilex/src/rt/lexer.rs:323:10 error: aborting due to previous error diff --git a/ilex/tests/ui/eof/comment.stderr b/ilex/tests/ui/eof/comment.stderr index eebb64f..0249af3 100644 --- a/ilex/tests/ui/eof/comment.stderr +++ b/ilex/tests/ui/eof/comment.stderr @@ -5,6 +5,6 @@ error: expected closing `*/`, but found | ^ expected `*/` 
here | -- help: previously opened here | - = note: reported at: ilex/src/rt/emit2.rs:306:14 + = note: reported at: ilex/src/rt/emit2.rs:331:14 error: aborting due to previous error diff --git a/ilex/tests/ui/eof/comment_multiline.stderr b/ilex/tests/ui/eof/comment_multiline.stderr index 01a1c69..7abee12 100644 --- a/ilex/tests/ui/eof/comment_multiline.stderr +++ b/ilex/tests/ui/eof/comment_multiline.stderr @@ -6,6 +6,6 @@ error: expected closing `*/`, but found 4 | /* not ok */ | ^ expected `*/` here | - = note: reported at: ilex/src/rt/emit2.rs:306:14 + = note: reported at: ilex/src/rt/emit2.rs:331:14 error: aborting due to previous error diff --git a/ilex/tests/ui/eof/mixed_brackets.stderr b/ilex/tests/ui/eof/mixed_brackets.stderr index 243c71d..3617c5a 100644 --- a/ilex/tests/ui/eof/mixed_brackets.stderr +++ b/ilex/tests/ui/eof/mixed_brackets.stderr @@ -13,7 +13,7 @@ error: expected closing `)`, but found `]` | ^ expected `)` here | - help: previously opened here | - = note: reported at: ilex/src/rt/lexer.rs:202:23 + = note: reported at: ilex/src/rt/lexer.rs:207:23 error: expected closing `)`, but found `]` --> eof/mixed_brackets.txt:1:15 @@ -22,7 +22,7 @@ error: expected closing `)`, but found `]` | ^ expected `)` here | - help: previously opened here | - = note: reported at: ilex/src/rt/lexer.rs:202:23 + = note: reported at: ilex/src/rt/lexer.rs:207:23 error: expected closing `)`, but found --> eof/mixed_brackets.txt:1:17 @@ -31,6 +31,6 @@ error: expected closing `)`, but found | ^ expected `)` here | - help: previously opened here | - = note: reported at: ilex/src/rt/lexer.rs:311:10 + = note: reported at: ilex/src/rt/lexer.rs:323:10 error: aborting due to 4 errors diff --git a/ilex/tests/ui/eof/mixed_brackets_multiline.stderr b/ilex/tests/ui/eof/mixed_brackets_multiline.stderr index 7cc82d5..3403a10 100644 --- a/ilex/tests/ui/eof/mixed_brackets_multiline.stderr +++ b/ilex/tests/ui/eof/mixed_brackets_multiline.stderr @@ -6,7 +6,7 @@ error: expected closing 
`)`, but found `]` 6 | ] | ^ expected `)` here | - = note: reported at: ilex/src/rt/lexer.rs:202:23 + = note: reported at: ilex/src/rt/lexer.rs:207:23 error: unexpected closing `)` --> eof/mixed_brackets_multiline.txt:9:3 @@ -24,7 +24,7 @@ error: expected closing `)`, but found `]` 11 | ] | ^ expected `)` here | - = note: reported at: ilex/src/rt/lexer.rs:202:23 + = note: reported at: ilex/src/rt/lexer.rs:207:23 error: expected closing `)`, but found --> eof/mixed_brackets_multiline.txt:11:2 @@ -34,6 +34,6 @@ error: expected closing `)`, but found 11 | ] | ^ expected `)` here | - = note: reported at: ilex/src/rt/lexer.rs:311:10 + = note: reported at: ilex/src/rt/lexer.rs:323:10 error: aborting due to 4 errors diff --git a/ilex/tests/ui/eof/quoted.stderr b/ilex/tests/ui/eof/quoted.stderr index b095d02..5c654a3 100644 --- a/ilex/tests/ui/eof/quoted.stderr +++ b/ilex/tests/ui/eof/quoted.stderr @@ -5,6 +5,6 @@ error: expected closing `'`, but found | ^ expected `'` here | - help: previously opened here | - = note: reported at: ilex/src/rt/emit2.rs:691:14 + = note: reported at: ilex/src/rt/emit2.rs:722:14 error: aborting due to previous error diff --git a/ilex/tests/ui/eof/quoted_multiline.stderr b/ilex/tests/ui/eof/quoted_multiline.stderr index ff96acd..7f696a0 100644 --- a/ilex/tests/ui/eof/quoted_multiline.stderr +++ b/ilex/tests/ui/eof/quoted_multiline.stderr @@ -5,6 +5,6 @@ error: expected closing `'`, but found | ^ expected `'` here | - help: previously opened here | - = note: reported at: ilex/src/rt/emit2.rs:691:14 + = note: reported at: ilex/src/rt/emit2.rs:722:14 error: aborting due to previous error diff --git a/ilex/tests/ui/too_small/ident.stderr b/ilex/tests/ui/too_small/ident.stderr index 449d5de..c2962a9 100644 --- a/ilex/tests/ui/too_small/ident.stderr +++ b/ilex/tests/ui/too_small/ident.stderr @@ -4,6 +4,6 @@ error: expected at least 3 characters in identifier, but found only 2 1 | %foo $bar % $oo | ^^^ expected at least 3 here | - = note: reported at: 
ilex/src/rt/emit2.rs:315:28 + = note: reported at: ilex/src/rt/emit2.rs:346:28 error: aborting due to previous error From e684be642efe9380d0a734f18449acddda86401d Mon Sep 17 00:00:00 2001 From: Miguel Young de la Sota Date: Sun, 26 Jan 2025 15:54:31 -0800 Subject: [PATCH 2/2] ilex: Add token silencing to Stream --- ilex/Cargo.toml | 1 + ilex/src/rt/lexer.rs | 2 + ilex/src/token/stream.rs | 94 ++++++++++++++++++++++++++++------------ 3 files changed, 70 insertions(+), 27 deletions(-) diff --git a/ilex/Cargo.toml b/ilex/Cargo.toml index 78ac86d..3d06f77 100644 --- a/ilex/Cargo.toml +++ b/ilex/Cargo.toml @@ -26,3 +26,4 @@ regex-syntax = "0.8.2" regex-automata = "0.4.3" # Bless Andrew for his patience. rustc_apfloat = "0.2.0" # By eddyb's recommendation. unicode-xid = "0.2.4" +bitvec = "1.0.1" diff --git a/ilex/src/rt/lexer.rs b/ilex/src/rt/lexer.rs index e191fd0..69c40ba 100644 --- a/ilex/src/rt/lexer.rs +++ b/ilex/src/rt/lexer.rs @@ -3,6 +3,7 @@ use std::num::NonZeroU32; use std::ops::Index; use std::ops::RangeBounds; +use bitvec::vec::BitVec; use byteyarn::Yarn; use regex_automata::hybrid::dfa::Cache; @@ -57,6 +58,7 @@ impl<'a, 'ctx> Lexer<'a, 'ctx> { toks: Vec::new(), meta_idx: Vec::new(), meta: Vec::new(), + silent: BitVec::new(), }, cursor: 0, diff --git a/ilex/src/token/stream.rs b/ilex/src/token/stream.rs index f1ce5da..369bee3 100644 --- a/ilex/src/token/stream.rs +++ b/ilex/src/token/stream.rs @@ -4,6 +4,8 @@ use std::mem; use std::num::NonZeroU32; use std::slice; +use bitvec::vec::BitVec; + use crate::file::Context; use crate::file::File; use crate::file::Span; @@ -15,6 +17,8 @@ use crate::spec::Lexeme; use crate::spec::Spec; use crate::token; +use super::Token; + /// A tree-like stream of tokens. /// /// This is type returned by by [`File::lex()`] when lexing succeeds. @@ -26,6 +30,8 @@ pub struct Stream<'ctx> { pub(crate) toks: Vec, pub(crate) meta_idx: Vec, pub(crate) meta: Vec, + + pub(crate) silent: BitVec, // Set of lexemes that have been silenced. 
} impl<'ctx> Stream<'ctx> { @@ -65,6 +71,28 @@ impl<'ctx> Stream<'ctx> { self.token_at_hint(id, meta_hint).unwrap() } + /// Returns whether the given lexeme has been silenced. + pub fn is_silenced(&self, lexeme: Lexeme) -> bool { + self.silent.get(lexeme.index()).is_some_and(|p| *p) + } + + /// Silences the given lexeme in this stream. + /// + /// This means that all tokens with this lexeme will be skipped when yielded + /// from [`Cursor::next()`]. Use [`Cursor::noisy()`] to yield all tokens, + /// including silenced ones. + /// + /// This is useful for tokens that can appear anywhere in the stream, but + /// which should be ignored unless they are being explicitly searched for. + /// This is useful, for example, for [`rule::LineEnd`] tokens. + pub fn silence(&mut self, lexeme: Lexeme) { + let idx = lexeme.index(); + if self.silent.len() <= idx { + self.silent.resize(idx + 1, false); + } + self.silent.set(idx, true); + } + /// Returns the last token pushed to this stream. pub(crate) fn last_token(&self) -> token::Any { let mut cursor = self.cursor(); @@ -296,6 +324,22 @@ impl<'lex> Cursor<'lex> { self.cursor >= self.end } + /// Returns an iterator that yields all of the values in this cursor, + /// including silenced ones. + pub fn noisy(&mut self) -> impl Iterator> + '_ { + iter::from_fn(move || loop { + if self.is_empty() { + return None; + } + + let next = self.stream.token_at_hint(self.id(), self.meta_cursor); + self.step_forward(); + if next.is_some() { + return next; + } + }) + } + /// Returns the next token under the cursor without consuming it. 
pub fn peek_any(&self) -> Option> { let mut copy = *self; @@ -514,18 +558,8 @@ impl fmt::Debug for Cursor<'_> { impl<'lex> Iterator for Cursor<'lex> { type Item = token::Any<'lex>; fn next(&mut self) -> Option { - loop { - if self.is_empty() { - return None; - } - - let next = self.stream.token_at_hint(self.id(), self.meta_cursor); - self.step_forward(); - - if next.is_some() { - return next; - } - } + let stream = self.stream; + self.noisy().find(|next| !stream.is_silenced(next.lexeme())) } } @@ -623,24 +657,30 @@ pub mod switch { where X: Impl<'lex, T>, { - let Some(next) = cursor.next() else { - report.builtins(cursor.spec()).expected( - self.0.lexemes(0), - Lexeme::eof(), - cursor.end(), - ); + loop { + let Some(next) = cursor.noisy().next() else { + report.builtins(cursor.spec()).expected( + self.0.lexemes(0), + Lexeme::eof(), + cursor.end(), + ); - return None; - }; + return None; + }; - if let Some(found) = self.0.apply(next, cursor) { - return Some(found); - } + if let Some(found) = self.0.apply(next, cursor) { + return Some(found); + } - report - .builtins(cursor.spec()) - .expected(self.0.lexemes(0), next, next); - None + if cursor.stream.is_silenced(next.lexeme()) { + continue; + } + + report + .builtins(cursor.spec()) + .expected(self.0.lexemes(0), next, next); + return None; + } } /// Takes the next token from `cursor` and matches it against this switch.