diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 34574af..b25d1d5 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -7,7 +7,7 @@ on:
 
 env:
   CARGO_TERM_COLOR: always
-  NIGHTLY: 'nightly-2023-09-30'
+  NIGHTLY: 'nightly-2025-01-01'
 
 jobs:
   check_lints:
diff --git a/byteyarn/src/boxed.rs b/byteyarn/src/boxed.rs
index 82e3ae0..dcd2e1c 100644
--- a/byteyarn/src/boxed.rs
+++ b/byteyarn/src/boxed.rs
@@ -587,7 +587,7 @@ impl<'a> YarnBox<'a, [u8]> {
   }
 }
 
-impl<'a, T> YarnBox<'a, [T]>
+impl<T> YarnBox<'_, [T]>
 where
   [T]: crate::Buf,
 {
diff --git a/byteyarn/src/reffed.rs b/byteyarn/src/reffed.rs
index 86b7b71..87cc84c 100644
--- a/byteyarn/src/reffed.rs
+++ b/byteyarn/src/reffed.rs
@@ -293,7 +293,7 @@ impl<'a> YarnRef<'a, [u8]> {
   }
 }
 
-impl<'a> YarnRef<'a, str> {
+impl YarnRef<'_, str> {
   /// Converts this yarn into a string slice.
   pub fn as_str(&self) -> &str {
     self.as_slice()
diff --git a/ilex/src/file/context.rs b/ilex/src/file/context.rs
index b17e416..5d774e7 100644
--- a/ilex/src/file/context.rs
+++ b/ilex/src/file/context.rs
@@ -1,27 +1,23 @@
-use std::collections::HashMap;
 use std::fs;
 use std::sync::Arc;
 use std::sync::RwLock;
-use std::sync::RwLockReadGuard;
 
 use camino::Utf8Path;
-use camino::Utf8PathBuf;
 
 use crate::f;
 use crate::file::File;
-use crate::file::SpanId;
 use crate::file::CTX_FOR_SPAN_DEBUG;
 use crate::report;
 use crate::report::Fatal;
 use crate::report::Report;
 
-use super::Span;
+#[cfg(doc)]
+use crate::Span;
 
 /// A source context, which owns source code files.
 ///
 /// A `Context` contains the full text of all the loaded source files, which
-/// [`SpanId`]s ultimately refer to. Most [`SpanId`] operations need their
-/// corresponding `Context` available.
+/// [`Span`]s ultimately refer to.
 #[derive(Default)]
 pub struct Context {
   state: Arc<RwLock<State>>,
 }
 
@@ -29,12 +25,12 @@ pub struct Context {
 #[derive(Default)]
 pub struct State {
+  // Each file is laid out as the length of the text, followed by the text
+  // data, followed by the path.
+  //
   // TODO(mcyoung): Be smarter about this and use some kind of concurrent
   // vector? We don't need to have all this stuff behind a lock, I think.
-  files: Vec<(Utf8PathBuf, String)>,
-
-  ranges: Vec<Span>,
-  comments: HashMap<(u32, u32), Vec<SpanId>>,
+  files: Vec<(usize, String)>,
 }
 
 unsafe impl Send for Context {}
@@ -84,19 +80,21 @@ impl Context {
   }
 
   /// Adds a new file to this source context.
-  pub fn new_file(
+  pub fn new_file<'a>(
     &self,
-    name: impl Into<Utf8PathBuf>,
+    path: impl Into<&'a Utf8Path>,
     text: impl Into<String>,
   ) -> File {
     let mut text = text.into();
     text.push(' '); // This space only exists to be somewhere for an EOF span
                     // to point to in diagnostics; user code will never see
                     // it.
+    let len = text.len();
+    text.push_str(path.into().as_str());
 
     let idx = {
       let mut state = self.state.write().unwrap();
-      state.files.push((name.into(), text));
+      state.files.push((len, text));
       state.files.len() - 1
     };
 
@@ -105,85 +103,44 @@ impl Context {
 
-  /// Adds a new file to this source context by opening `name` and reading it
+  /// Adds a new file to this source context by opening `path` and reading it
   /// from the file system.
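// A minimal sketch (hypothetical, not part of the patch) of the layout that
// `new_file` above creates: each `State.files` entry is `(len, buf)`, where
// `buf` holds the file text (plus the trailing EOF space) with the path
// appended after it, so both halves come back out by slicing at `len`.
fn split_entry(len: usize, buf: &str) -> (&str, &str) {
  (&buf[..len], &buf[len..]) // (text incl. EOF space, path)
}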
-  pub fn open_file(
+  pub fn open_file<'a>(
     &self,
-    name: impl Into<Utf8PathBuf>,
+    path: impl Into<&'a Utf8Path>,
     report: &Report,
   ) -> Result<File, Fatal> {
-    let name = name.into();
+    let path = path.into();
 
-    let bytes = match fs::read(&name) {
+    let bytes = match fs::read(path) {
       Ok(bytes) => bytes,
       Err(e) => {
-        report.error(f!("could not open input file `{name}`: {e}"));
+        report.error(f!("could not open input file `{path}`: {e}"));
         return report.fatal();
       }
     };
 
     let Ok(utf8) = String::from_utf8(bytes) else {
-      report.error(f!("input file `{name}` was not valid UTF-8"));
+      report.error(f!("input file `{path}` was not valid UTF-8"));
       return report.fatal();
     };
 
-    Ok(self.new_file(name, utf8))
+    Ok(self.new_file(path, utf8))
   }
 
   /// Gets the `idx`th file in this source context.
   pub fn file(&self, idx: usize) -> Option<File> {
     let state = self.state.read().unwrap();
-    let (path, text) = state.files.get(idx)?;
-    let (path, text) = unsafe {
+    let (len, text) = state.files.get(idx)?;
+    let text = unsafe {
       // SAFETY: The pointer to the file's text is immutable and pointer-stable,
       // so we can safely extend its lifetime here.
-      (&*(path.as_path() as *const Utf8Path), &*(text.as_str() as *const str))
+      &*(text.as_str() as *const str)
     };
 
-    Some(File { path, text, ctx: self, idx })
+    Some(File { len: *len, text, ctx: self, idx })
   }
 
   /// Gets the number of files currently tracked by this source context.
   pub fn file_count(&self) -> usize {
     self.state.read().unwrap().files.len()
   }
-
-  /// Gets the byte range for the given span, if it isn't the synthetic span.
-  pub(crate) fn lookup_range(&self, span: SpanId) -> Span {
-    let state = self.state.read().unwrap();
-    state.ranges[span.0 as usize]
-  }
-
-  pub(crate) fn lookup_comments(
-    &self,
-    file: File,
-    offset: usize,
-  ) -> (RwLockReadGuard<State>, *const [SpanId]) {
-    let state = self.state.read().unwrap();
-    let ptr = state
-      .comments
-      .get(&(file.idx as u32, offset as u32))
-      .map(|x| x.as_slice())
-      .unwrap_or_default() as *const [SpanId];
-    (state, ptr)
-  }
-
-  pub(crate) fn add_comment(&self, file: File, offset: usize, comment: SpanId) {
-    self
-      .state
-      .write()
-      .unwrap()
-      .comments
-      .entry((file.idx as u32, offset as u32))
-      .or_default()
-      .push(comment)
-  }
-
-  /// Creates a new synthetic span with the given contents.
-  pub(crate) fn new_span(&self, range: Span) -> SpanId {
-    let mut state = self.state.write().unwrap();
-    assert!(state.ranges.len() <= (u32::MAX as usize), "ran out of spans");
-
-    let span = SpanId(state.ranges.len() as u32);
-    state.ranges.push(range);
-    span
-  }
 }
diff --git a/ilex/src/file/mod.rs b/ilex/src/file/mod.rs
index cbed544..10924e4 100644
--- a/ilex/src/file/mod.rs
+++ b/ilex/src/file/mod.rs
@@ -3,13 +3,10 @@
 use std::cell::RefCell;
 use std::fmt;
 use std::fmt::Write;
-use std::iter;
 use std::ops::Bound;
 use std::ops::Index;
 use std::ops::RangeBounds;
 use std::ptr;
-use std::slice;
-use std::sync::RwLockReadGuard;
 
 use camino::Utf8Path;
 
@@ -26,7 +23,7 @@ pub use context::Context;
 /// An input source file.
 #[derive(Copy, Clone)]
 pub struct File<'ctx> {
-  path: &'ctx Utf8Path,
+  len: usize,
   text: &'ctx str,
   ctx: &'ctx Context,
   idx: usize,
 }
 
@@ -35,7 +32,7 @@ impl<'ctx> File<'ctx> {
   /// Returns the name of this file, as a path.
   pub fn path(self) -> &'ctx Utf8Path {
-    self.path
+    self.text[self.len..].into()
   }
 
   /// Returns the textual contents of this file. This function takes a range,
@@ -51,7 +48,7 @@ impl<'ctx> File<'ctx> {
     //
     // XXX: Apparently rustc forgets about other impls if we use
     // text[..x] here??
-    let text = &self.text.get(..self.text.len() - 1).unwrap();
+    let text = &self.text.get(..self.len - 1).unwrap();
     &text[range]
   }
 
@@ -62,7 +59,7 @@ impl<'ctx> File<'ctx> {
   }
 
   pub(crate) fn text_with_extra_space(self) -> &'ctx str {
-    self.text
+    &self.text[..self.len]
   }
 
   /// Returns the [`Context`] that owns this file.
@@ -76,7 +73,7 @@ impl<'ctx> File<'ctx> {
   ///
   /// Panics if `start > end`, or if `end` is greater than the length of the
   /// file.
-  pub fn span(self, range: impl RangeBounds<usize>) -> Span {
+  pub fn span(self, range: impl RangeBounds<usize>) -> Span<'ctx> {
     Span::new(self, range)
   }
 
@@ -106,37 +103,36 @@ impl PartialEq for File<'_> {
 /// so anything that implements [`Spanned`] is suitable for placing spanned data
 /// in diagnostics.
 #[derive(Copy, Clone)]
-pub struct Span {
-  file: u32,
+pub struct Span<'ctx> {
+  file: File<'ctx>,
   start: u32,
   end: u32,
 }
 
-/// An interned [`Span`].
-///
-/// Most tokens' spans will never be inspected after lexing, so it's better to
-/// make them small for memory saving reasons. This abstraction allows the
-/// library to optimize internal handling of spans over time.
-///
-/// This type is just a numeric ID; in order to do anything with it, you'll
-/// need to call one of the functions in [`Spanned`].
-#[derive(Copy, Clone)]
-pub struct SpanId(u32);
+// A compressed version of a span that only remembers the start/end.
+#[derive(Clone, Copy, Default, PartialEq, Eq)]
+pub struct Span2(u32, u32);
 
-impl fmt::Debug for SpanId {
-  fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-    CTX_FOR_SPAN_DEBUG.with(|ctx| {
-      let ctx = ctx.borrow();
-      let Some(ctx) = &*ctx else {
-        return f.write_str("");
-      };
+impl Span2 {
+  pub fn get(self, file: File) -> Span {
+    file.span(self.0 as usize..self.1 as usize)
+  }
+}
 
-      fmt::Debug::fmt(&Spanned::span(&self, ctx), f)
-    })
+// A compressed version of a span that remembers the start, end, and file.
+#[derive(Clone, Copy)]
+pub struct Span3(u32, u32, u32);
+
+impl Span3 {
+  pub fn get(self, ctx: &Context) -> Span {
+    ctx
+      .file(self.0 as usize)
+      .unwrap()
+      .span(self.1 as usize..self.2 as usize)
   }
 }
 
-impl Span {
+impl<'ctx> Span<'ctx> {
   /// Constructs a span from a file and a byte range within it.
   ///
   /// # Panics
   ///
   /// Panics if `start > end`, or if `end` is greater than the length of the
   /// file.
   #[track_caller]
   pub(crate) fn new<Idx: TryInto<u32> + fmt::Debug>(
-    file: File,
+    file: File<'ctx>,
     range: impl RangeBounds<Idx>,
-  ) -> Span {
+  ) -> Self {
     let start = match range.start_bound() {
       Bound::Included(&x) => cast(x),
       Bound::Excluded(&x) => cast(x).saturating_add(1),
@@ -167,7 +163,15 @@ impl Span {
       file.text.len(),
     );
 
-    Span { file: file.idx() as u32, start, end }
+    Span { file, start, end }
+  }
+
+  pub(crate) fn span2(self) -> Span2 {
+    Span2(self.start, self.end)
+  }
+
+  pub(crate) fn span3(self) -> Span3 {
+    Span3(self.file.idx() as u32, self.start, self.end)
   }
 
   /// Gets the file for this span.
@@ -176,8 +180,8 @@ impl Span {
   ///
   /// May panic if this span is not owned by `ctx` (or it may produce an
   /// unexpected result).
-  pub fn file(self, ctx: &Context) -> File {
-    ctx.file(self.file as usize).unwrap()
+  pub fn file(self) -> File<'ctx> {
+    self.file
   }
 
   /// Returns the start (inclusive) byte offset of this span.
@@ -200,19 +204,6 @@ impl Span {
     (self.end - self.start) as usize
   }
 
-  /// Gets the comment associated with this span, if any.
-  ///
-  /// # Panics
-  ///
-  /// May panic if this span is not owned by `ctx` (or it may produce an
-  /// unexpected result).
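// A hedged usage sketch of the compressed spans defined above (hypothetical
// values, not in the patch): `Span2` drops the file, so the caller must
// supply it again, while `Span3` keeps a file index and only needs the
// `Context` to be resolved.
//
//   let span2 = span.span2();
//   assert_eq!(span2.get(file).text(), span.text());
//
//   let span3 = span.span3();
//   assert_eq!(span3.get(&ctx).text(), span.text());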
-  pub fn comments(self, ctx: &Context) -> Comments {
-    Comments {
-      slice: ctx.lookup_comments(self.file(ctx), self.start()),
-      ctx,
-    }
-  }
-
   /// Returns a subspan of this range.
   ///
   /// # Panics
   ///
@@ -221,7 +212,7 @@ impl Span {
   pub fn subspan<Idx: TryInto<u32> + fmt::Debug>(
     self,
     range: impl RangeBounds<Idx>,
-  ) -> Span {
+  ) -> Self {
     let start = match range.start_bound() {
       Bound::Included(&x) => cast(x),
       Bound::Excluded(&x) => cast(x).saturating_add(1),
@@ -251,9 +242,8 @@ impl Span {
   /// Splits this range in two at `at`.
   ///
   /// # Panics
   ///
   /// Panics if `at` is larger than the length of this range.
-  pub fn split_at(self, at: usize) -> (Span, Span) {
+  pub fn split_at(self, at: usize) -> (Self, Self) {
     (self.subspan(..at), self.subspan(at..))
   }
 
@@ -263,7 +253,7 @@ impl Span {
   /// # Panics
   ///
   /// Panics if `range` is smaller than `pre + suf`.
-  pub fn split_around(self, pre: usize, suf: usize) -> [Span; 3] {
+  pub fn split_around(self, pre: usize, suf: usize) -> [Self; 3] {
     let (pre, range) = self.split_at(pre);
     let (range, suf) = range.split_at(range.len() - suf);
     [pre, range, suf]
   }
 
@@ -275,8 +265,8 @@ impl Span {
   ///
   /// May panic if this range is not owned by `ctx` (or it may produce an
   /// unexpected result).
-  pub fn text(self, ctx: &Context) -> &str {
-    self.file(ctx).text(self.start as usize..self.end as usize)
+  pub fn text(self) -> &'ctx str {
+    self.file().text(self.start as usize..self.end as usize)
   }
 
   /// Joins together a collection of ranges.
   ///
   /// # Panics
   ///
   /// May panic if not all spans are for the same file, or if the iterator
   /// is empty.
-  pub fn union(ranges: impl IntoIterator<Item = Span>) -> Span {
+  pub fn union(ranges: impl IntoIterator<Item = Self>) -> Self {
     let mut best = None;
 
     for range in ranges {
@@ -302,146 +292,97 @@ impl Span {
 
     best.expect("attempted to join zero spans")
   }
-
-  /// Bakes this range into a span.
-  pub(crate) fn intern(self, ctx: &Context) -> SpanId {
-    ctx.new_span(self)
-  }
-
-  /// Bakes this range into a span.
-  pub(crate) fn intern_nonempty(self, ctx: &Context) -> Option<SpanId> {
-    if self.is_empty() {
-      return None;
-    }
-    Some(self.intern(ctx))
-  }
-
-  /// Sets the comment associated with a given span. The comment must itself
-  /// be specified as a span.
-  pub(crate) fn append_comment_span(self, ctx: &Context, comment: SpanId) {
-    ctx.add_comment(self.file(ctx), self.start(), comment)
-  }
 }
 
 /// A syntax element which contains a span.
 ///
 /// You should implement this type for any type which naturally has a single
 /// span that describes it.
-pub trait Spanned {
+pub trait Spanned<'ctx> {
   /// Returns the span in this syntax element.
-  fn span(&self, ctx: &Context) -> Span;
+  fn span(&self) -> Span<'ctx>;
 
-  /// Forwards to [`SpanId::file()`].
-  fn file<'ctx>(&self, ctx: &'ctx Context) -> File<'ctx> {
-    self.span(ctx).file(ctx)
+  /// Forwards to [`Span::file()`].
+  fn file(&self) -> File<'ctx> {
+    self.span().file()
   }
 
   /// Forwards to [`Span::start()`].
-  fn start(&self, ctx: &Context) -> usize {
-    self.span(ctx).start()
+  fn start(&self) -> usize {
+    self.span().start()
   }
 
   /// Forwards to [`Span::end()`].
-  fn end(&self, ctx: &Context) -> usize {
-    self.span(ctx).end()
+  fn end(&self) -> usize {
+    self.span().end()
   }
 
   /// Forwards to [`Span::is_empty()`].
-  fn is_empty(&self, ctx: &Context) -> bool {
-    self.span(ctx).is_empty()
+  fn is_empty(&self) -> bool {
+    self.span().is_empty()
  }
 
   /// Forwards to [`Span::len()`].
-  fn len(&self, ctx: &Context) -> usize {
-    self.span(ctx).len()
+  fn len(&self) -> usize {
+    self.span().len()
   }
 
-  /// Forwards to [`SpanId::text()`].
-  fn text<'ctx>(&self, ctx: &'ctx Context) -> &'ctx str {
-    self.span(ctx).text(ctx)
-  }
-
-  /// Forwards to [`SpanId::comments()`].
-  fn comments<'ctx>(&self, ctx: &'ctx Context) -> Comments<'ctx> {
-    self.span(ctx).comments(ctx)
-  }
-}
-
-impl Spanned for SpanId {
-  fn span(&self, ctx: &Context) -> Span {
-    ctx.lookup_range(*self)
+  /// Forwards to [`Span::text()`].
+  fn text(&self) -> &'ctx str {
+    self.span().text()
   }
 }
 
 // Spans are spanned by their own spans.
-impl Spanned for Span {
-  fn span(&self, _ctx: &Context) -> Span {
+impl<'ctx> Spanned<'ctx> for Span<'ctx> {
+  fn span(&self) -> Span<'ctx> {
     *self
   }
 }
 
-impl<S: Spanned> Spanned for &S {
-  fn span(&self, ctx: &Context) -> Span {
-    S::span(self, ctx)
+impl<'ctx, S: Spanned<'ctx>> Spanned<'ctx> for &S {
+  fn span(&self) -> Span<'ctx> {
+    S::span(self)
   }
 }
 
-impl Spanned for Never {
-  fn span(&self, _ctx: &Context) -> Span {
+impl<'ctx> Spanned<'ctx> for Never {
+  fn span(&self) -> Span<'ctx> {
     self.from_nothing_anything()
   }
 }
 
 thread_local! {
-  static CTX_FOR_SPAN_DEBUG: RefCell<Option<Context>> = RefCell::new(None);
+  static CTX_FOR_SPAN_DEBUG: RefCell<Option<Context>> = const { RefCell::new(None) };
 }
 
-impl fmt::Debug for Span {
+impl fmt::Debug for File<'_> {
   fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-    CTX_FOR_SPAN_DEBUG.with(|ctx| {
-      if let Some(ctx) = &*ctx.borrow() {
-        let text = self.text(ctx);
-        write!(f, "`")?;
-        for c in text.chars() {
-          if ('\x20'..'\x7e').contains(&c) {
-            f.write_char(c)?;
-          } else if c < '\x20' {
-            write!(f, "{}", c.escape_debug())?
-          } else {
-            write!(f, "<U+{:04X}>", c as u32)?;
-          }
-        }
-        write!(f, "` @ {}", self.file(ctx).path())?;
-      } else {
-        write!(f, "<#{}>", self.file)?;
-      }
-
-      write!(f, "[{}..{}]", Span::start(*self), Span::end(*self))
-    })
+    write!(f, "File({})", self.path())
   }
 }
 
-/// An iterator over the comment spans attached to a [`SpanId`].
-pub struct Comments<'ctx> {
-  slice: (RwLockReadGuard<'ctx, context::State>, *const [SpanId]),
-  ctx: &'ctx Context,
-}
+impl fmt::Debug for Span<'_> {
+  fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+    write!(f, "`")?;
+    for c in self.text().chars() {
+      if ('\x20'..'\x7e').contains(&c) {
+        f.write_char(c)?;
+      } else if c < '\x20' {
+        write!(f, "{}", c.escape_debug())?
+      } else {
+        write!(f, "<U+{:04X}>", c as u32)?;
+      }
+    }
+    write!(f, "` @ {}", self.file.path())?;
 
-impl<'ctx> Comments<'ctx> {
-  /// Adapts this iterator to return just the text contents of each [`SpanId`].
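// A minimal sketch of implementing the reworked trait for a user-defined
// type (hypothetical `MyToken`, not in the patch); note that no `&Context`
// parameter is threaded through anymore:
//
//   struct MyToken<'ctx> {
//     span: Span<'ctx>,
//   }
//
//   impl<'ctx> Spanned<'ctx> for MyToken<'ctx> {
//     fn span(&self) -> Span<'ctx> {
//       self.span
//     }
//   }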
- pub fn as_strings(&self) -> impl Iterator { - unsafe { &*self.slice.1 } - .iter() - .map(|span| span.text(self.ctx)) + write!(f, "[{}..{}]", Span::start(*self), Span::end(*self)) } } -impl<'a> IntoIterator for &'a Comments<'_> { - type Item = SpanId; - type IntoIter = iter::Copied>; - - fn into_iter(self) -> Self::IntoIter { - unsafe { &*self.slice.1 }.iter().copied() +impl fmt::Display for Span<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(self.text()) } } diff --git a/ilex/src/fp.rs b/ilex/src/fp.rs index e29125b..b138078 100644 --- a/ilex/src/fp.rs +++ b/ilex/src/fp.rs @@ -17,7 +17,6 @@ use rustc_apfloat::Round; use rustc_apfloat::Status; use rustc_apfloat::StatusAnd; -use crate::file::Context; use crate::file::Spanned; use crate::report::Report; use crate::token::Digital; @@ -464,7 +463,6 @@ impl Digital<'_> { #[track_caller] pub(crate) fn parse_fp( self, - ctx: &Context, report: &Report, exact: bool, ) -> Result { @@ -521,7 +519,7 @@ impl Digital<'_> { let mut int_digits = 0i64; let mut frac_digits = 0i64; for (span, digits) in [(int, &mut int_digits), (frac, &mut frac_digits)] { - let Some(mut text) = span.map(|s| s.text(ctx)) else { + let Some(mut text) = span.map(|s| s.text()) else { continue; }; while let Some(c) = text.chars().next() { @@ -554,7 +552,7 @@ impl Digital<'_> { } if let Some(exp) = exp { - let mut text = exp.digit_blocks().next().unwrap().text(ctx); + let mut text = exp.digit_blocks().next().unwrap().text(); while let Some(c) = text.chars().next() { if let Some(suf) = text.strip_prefix(rule.separator.as_str()) { text = suf; @@ -595,11 +593,11 @@ impl Digital<'_> { value } } else { - fn has_ordinary_sign(ctx: &Context, tok: &Digital) -> bool { + fn has_ordinary_sign(tok: &Digital) -> bool { tok.sign().is_none() || tok.sign().is_some_and(|s| { matches!( - (tok.sign_span().unwrap().text(ctx), s), + (tok.sign_span().unwrap().text(), s), ("+", Sign::Pos) | ("-", Sign::Neg) ) }) @@ -609,21 +607,20 @@ impl Digital<'_> { // underlying string. This is such a common format that we special case // it. if rule.point == "." - && has_ordinary_sign(ctx, &self) + && has_ordinary_sign(&self) && (exp.is_none() || exp.is_some_and(|exp| { exp.radix() == 10 && (exp.has_prefix("e") || exp.has_prefix("E")) - && has_ordinary_sign(ctx, &exp) + && has_ordinary_sign(&exp) })) && (rule.separator.is_empty() - || !self.text(ctx).contains(rule.separator.as_str())) + || !self.text().contains(rule.separator.as_str())) { - let text = self.text(ctx); + let text = self.text(); Fp::__parse( - &text[self.prefix().map(|s| s.text(ctx).len()).unwrap_or(0) - ..text.len() - - self.suffix().map(|s| s.text(ctx).len()).unwrap_or(0)], + &text[self.prefix().map(|s| s.text().len()).unwrap_or(0) + ..text.len() - self.suffix().map(|s| s.text().len()).unwrap_or(0)], ) } else { // Since the fast paths have failed us, we need to construct a suitable @@ -632,7 +629,7 @@ impl Digital<'_> { let buf = (|| { use std::fmt::Write; - let mut buf = String::with_capacity(self.text(ctx).len()); + let mut buf = String::with_capacity(self.text().len()); if self.is_negative() { buf.push('-'); } @@ -640,12 +637,12 @@ impl Digital<'_> { let _ = write!( buf, "{}", - u64::from_radix(int.unwrap().text(ctx), 10, &rule.separator)? + u64::from_radix(int.unwrap().text(), 10, &rule.separator)? 
); if let Some(frac) = frac { let sep = rule.separator.as_str(); - let mut frac = frac.text(ctx); + let mut frac = frac.text(); let mut lz = 0; loop { let start_len = frac.len(); @@ -688,7 +685,7 @@ impl Digital<'_> { _ => '+', }, u64::from_radix( - exp.digit_blocks().next().unwrap().text(ctx), + exp.digit_blocks().next().unwrap().text(), exp.radix(), &rule.separator )?, diff --git a/ilex/src/ice.rs b/ilex/src/ice.rs index c3d0a18..6b83e3b 100644 --- a/ilex/src/ice.rs +++ b/ilex/src/ice.rs @@ -8,7 +8,7 @@ use std::backtrace::BacktraceStatus; use std::io; use std::panic; use std::panic::AssertUnwindSafe; -use std::panic::PanicInfo; +use std::panic::PanicHookInfo; use std::panic::UnwindSafe; use std::sync::Mutex; use std::thread; @@ -148,7 +148,7 @@ impl Ice { /// /// The results are "best effort". The Rust backtrace API is incomplete, so we /// make do with some... cleverness around parsing the backtrace itself. - pub fn generate(panic: &PanicInfo, options: Options) -> Self { + pub fn generate(panic: &PanicHookInfo, options: Options) -> Self { let msg = panic.payload(); let msg = Option::or( msg.downcast_ref::<&str>().copied().map(str::to_string), diff --git a/ilex/src/lib.rs b/ilex/src/lib.rs index e781288..06f2088 100644 --- a/ilex/src/lib.rs +++ b/ilex/src/lib.rs @@ -271,7 +271,7 @@ pub use { crate::{ file::Context, file::File, - file::{Span, SpanId, Spanned}, + file::{Span, Spanned}, report::{Fatal, Report}, rule::Rule, spec::{Lexeme, Spec, SpecBuilder}, diff --git a/ilex/src/report/builtin.rs b/ilex/src/report/builtin.rs index 5008b72..31c21d2 100644 --- a/ilex/src/report/builtin.rs +++ b/ilex/src/report/builtin.rs @@ -29,12 +29,11 @@ pub struct Builtins<'a> { impl Builtins<'_> { /// Generates an "unexpected" diagnostic. #[track_caller] - pub fn unexpected<'a, 'b>( + pub fn unexpected<'a, 'b, 's>( &self, - found: impl Into>, unexpected_in: impl Into>, - at: impl Spanned, + at: impl Spanned<'s>, ) -> Diagnostic { let found = found.into(); @@ -52,9 +51,12 @@ impl Builtins<'_> { } #[track_caller] - pub(crate) fn unexpected_token(&self, at: impl Spanned) -> Diagnostic { - let at = at.span(&self.report.ctx); - let found = at.text(&self.report.ctx); + pub(crate) fn unexpected_token<'s>( + &self, + at: impl Spanned<'s>, + ) -> Diagnostic { + let at = at.span(); + let found = at.text(); let diagnostic = self .report @@ -66,14 +68,13 @@ impl Builtins<'_> { } #[track_caller] - pub(crate) fn extra_chars<'a>( + pub(crate) fn extra_chars<'a, 's>( &self, - unexpected_in: impl Into>, - at: impl Spanned, + at: impl Spanned<'s>, ) -> Diagnostic { - let at = at.span(&self.report.ctx); - let found = at.text(&self.report.ctx); + let at = at.span(); + let found = at.text(); let diagnostic = self .report @@ -84,7 +85,7 @@ impl Builtins<'_> { )) .at(at) .remark( - at.file(&self.report.ctx) + at.file() .span(at.start().saturating_sub(1)..at.start().saturating_add(1)), "maybe you meant to include a space here", ) @@ -96,12 +97,12 @@ impl Builtins<'_> { /// Generates an "expected one of these tokens but got something else" /// diagnostic. #[track_caller] - pub fn expected<'a, 'b, E: Into>>( + pub fn expected<'a, 'b, 's, E: Into>>( &self, expected: impl IntoIterator, found: impl Into>, - at: impl Spanned, + at: impl Spanned<'s>, ) -> Diagnostic { let expected = expected.into_iter().map(Into::into).collect::>(); let alts = disjunction_to_string(self.spec, &expected); @@ -122,12 +123,11 @@ impl Builtins<'_> { /// Generates an "unopened delimiter" diagnostic, for when a delimiter is /// not opened before expected. 
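// A hedged call-site sketch for the reworked builtins below: with
// `impl Spanned<'s>` parameters, callers hand over spans (or any spanned
// value) directly instead of a `SpanId` plus `&Context`. The `semi` lexeme
// and the argument values are hypothetical, not from the patch:
//
//   report.builtins(&spec).expected(
//     [Expected::Lexeme(semi.any())],
//     Expected::Literal(",".into()),
//     token, // any `impl Spanned<'s>`
//   );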
#[track_caller] - pub(crate) fn unopened<'a>( + pub(crate) fn unopened<'a, 's>( &self, - expected: &str, found: impl Into>, - at: impl Spanned, + at: impl Spanned<'s>, ) -> Diagnostic { let found = found.into(); @@ -143,13 +143,13 @@ impl Builtins<'_> { /// Generates an "unclosed delimiter" diagnostic, for when a delimiter is /// not closed before expected. #[track_caller] - pub(crate) fn unclosed<'a>( + pub(crate) fn unclosed<'a, 's1, 's2>( &self, - open: impl Spanned, + open: impl Spanned<'s1>, expected: &str, found: impl Into>, - at: impl Spanned, + at: impl Spanned<'s2>, ) -> Diagnostic { let found = found.into(); @@ -169,11 +169,10 @@ impl Builtins<'_> { /// Generates an "unclosed delimiter" diagnostic, for when a delimiter is /// not closed before expected. #[track_caller] - pub(crate) fn non_ascii_in_ident<'a>( + pub(crate) fn non_ascii_in_ident<'a, 's>( &self, - expected: impl Into>, - at: impl Spanned, + at: impl Spanned<'s>, ) -> Diagnostic { self .report @@ -186,11 +185,11 @@ impl Builtins<'_> { } #[track_caller] - pub(crate) fn ident_too_small( + pub(crate) fn ident_too_small<'s>( &self, min_len: usize, actual: usize, - at: impl Spanned, + at: impl Spanned<'s>, ) -> Diagnostic { let diagnostic = self .report @@ -211,27 +210,25 @@ impl Builtins<'_> { /// Generates an "invalid escape sequence" diagnostic. #[track_caller] - pub fn invalid_escape( + pub fn invalid_escape<'s>( &self, - at: impl Spanned, + at: impl Spanned<'s>, why: impl fmt::Display, ) -> Diagnostic { - let at = at.span(&self.report.ctx); - let seq = at.text(&self.report.ctx); + let at = at.span(); self .report - .error(f!("found an invalid escape sequence: `{seq}`")) + .error(f!("found an invalid escape sequence: `{at}`")) .saying(at, why) .reported_at(Location::caller()) } /// Generates a "numeric literal overflowed" diagnostic. #[track_caller] - pub fn literal_out_of_range<'a, N: fmt::Display>( + pub fn literal_out_of_range<'a, 's, N: fmt::Display>( &self, - what: impl Into>, - at: impl Spanned, + at: impl Spanned<'s>, span: &impl RangeBounds, min: &dyn fmt::Display, max: &dyn fmt::Display, diff --git a/ilex/src/report/diagnostic.rs b/ilex/src/report/diagnostic.rs index 33b1010..41f9071 100644 --- a/ilex/src/report/diagnostic.rs +++ b/ilex/src/report/diagnostic.rs @@ -2,7 +2,7 @@ use std::fmt; use std::mem; use std::panic; -use crate::file::Span; +use crate::file; use crate::file::Spanned; use crate::report::Report; @@ -13,7 +13,7 @@ use crate::report::Report; /// almost always temporaries, e.g. /// /// ``` -/// # fn x(report: &ilex::Report, span: ilex::SpanId) { +/// # fn x(report: &ilex::Report, span: ilex::Span) { /// report.error("my error message") /// .saying(span, "this is bad code"); /// # } @@ -35,7 +35,7 @@ pub use annotate_snippets::AnnotationType as Kind; pub struct Info { pub kind: Kind, pub message: String, - pub snippets: Vec>, + pub snippets: Vec>, pub notes: Vec<(String, Kind)>, pub reported_at: Option<&'static panic::Location<'static>>, } @@ -70,24 +70,32 @@ impl Diagnostic { } /// Adds a new relevant snippet at the given location. - pub fn at(self, span: impl Spanned) -> Self { + pub fn at<'s>(self, span: impl Spanned<'s>) -> Self { self.saying(span, "") } /// Adds a new diagnostic location, with the given message attached to it. 
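// Note on the `{at}` interpolation in `invalid_escape` above: it works
// because this patch adds `impl fmt::Display for Span` in file/mod.rs, which
// writes the span's text, so (hypothetically) `format!("{}", at)` yields the
// same string as `at.text()`.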
- pub fn saying(self, span: impl Spanned, message: impl fmt::Display) -> Self { + pub fn saying<'s>( + self, + span: impl Spanned<'s>, + message: impl fmt::Display, + ) -> Self { self.snippet(span, message, None) } /// Like `saying`, but the underline is as for a "note" rather than the /// overall diagnostic. - pub fn remark(self, span: impl Spanned, message: impl fmt::Display) -> Self { + pub fn remark<'s>( + self, + span: impl Spanned<'s>, + message: impl fmt::Display, + ) -> Self { self.snippet(span, message, Some(Kind::Help)) } - fn snippet( + fn snippet<'s>( mut self, - span: impl Spanned, + span: impl Spanned<'s>, message: impl fmt::Display, kind: Option, ) -> Self { @@ -96,7 +104,7 @@ impl Diagnostic { } self.info.snippets.last_mut().unwrap().push(( - span.span(&self.report.ctx), + span.span().span3(), message.to_string(), kind.unwrap_or(self.info.kind), )); diff --git a/ilex/src/report/mod.rs b/ilex/src/report/mod.rs index a840cad..2c19d0e 100644 --- a/ilex/src/report/mod.rs +++ b/ilex/src/report/mod.rs @@ -2,7 +2,7 @@ //! //! This module contains types for generating an *error report*: a collection of //! diagnostics that describe why an operation failed in detail. Diagnostics -//! are basically fancy compiler errors: they use [`SpanId`]s to present faulty +//! are basically fancy compiler errors: they use [`Span`]s to present faulty //! input in context. //! //! The [`Report`] type is a reference-counted list of diagnostics, which is @@ -17,8 +17,6 @@ use std::process; use std::sync::Arc; use crate::file::Context; -#[cfg(doc)] -use crate::file::SpanId; use crate::spec::Spec; mod builtin; @@ -30,10 +28,13 @@ pub use builtin::Expected; pub use diagnostic::Diagnostic; use diagnostic::Kind; +#[cfg(doc)] +use crate::Span; + /// A collection of errors can may built up over the course of an operation. /// /// To construct a report, see [`Context::new_report()`]. The context that -/// constructs a report is the only one whose [`SpanId`]s should be passed into +/// constructs a report is the only one whose [`Span`]s should be passed into /// it; doing otherwise will result in unspecified output (or probably a panic). pub struct Report { ctx: Context, @@ -70,7 +71,7 @@ impl Report { /// Returns a wrapper for accessing commonly-used, built-in message types. /// /// See [`Builtins`]. - pub fn builtins<'a>(&'a self, spec: &'a Spec) -> Builtins { + pub fn builtins<'a>(&'a self, spec: &'a Spec) -> Builtins<'a> { Builtins { report: self, spec } } diff --git a/ilex/src/report/render.rs b/ilex/src/report/render.rs index c942af9..e7e843a 100644 --- a/ilex/src/report/render.rs +++ b/ilex/src/report/render.rs @@ -134,8 +134,9 @@ pub fn render_fmt( let mut cur_file = None; let mut cur_slice = None::; let mut has_eof = false; - for (range, text, kind) in snips { - let file = range.file(&report.ctx); + for (span, text, kind) in snips { + let span = span.get(&report.ctx); + let file = span.file(); if cur_file != Some(file) { cur_file = Some(file); if let Some(mut slice) = cur_slice.take() { @@ -155,8 +156,8 @@ pub fn render_fmt( } let slice = cur_slice.as_mut().unwrap(); - let mut start = range.start(); - let mut end = range.end(); + let mut start = span.start(); + let mut end = span.end(); // Ensure that all ranges have length at least one, and try to get them // to point just after non-whitespace. 
diff --git a/ilex/src/rt/dfa.rs b/ilex/src/rt/dfa.rs index 8553c18..f6dfd06 100644 --- a/ilex/src/rt/dfa.rs +++ b/ilex/src/rt/dfa.rs @@ -69,7 +69,7 @@ impl Dfa { /// length, plus potential lexical interpretations of that range. pub fn search(&self, lexer: &mut Lexer) -> Option { let dfa = &self.engine; - let haystack = lexer.rest(); + let haystack = lexer.text(lexer.cursor()..); let mut state = dfa .start_state(lexer.cache(), &start::Config::new().anchored(Anchored::Yes)) @@ -95,15 +95,13 @@ impl Dfa { } } - let Some((last_match, state)) = last_match else { - return None; - }; + let (last_match, state) = last_match?; let candidates = (0..dfa.match_len(lexer.cache(), state)) .map(|i| { let id = dfa.match_pattern(lexer.cache(), state, i); if id.as_usize() < self.non_close_rules { Lexeme2 { - lexeme: Lexeme::new(id.as_u32()), + lexeme: Lexeme::new(id.as_i32()), is_close: false, } } else { @@ -127,7 +125,7 @@ pub fn compile(rules: &[Any]) -> Dfa { let mut closers = Vec::new(); for (lexeme, rule) in rules.iter().enumerate() { - let lexeme = Lexeme::new(lexeme as u32); + let lexeme = Lexeme::new(lexeme as i32); let rule = compile_rule(rule); patterns.push(rule.pat); if let Some(close) = rule.close { diff --git a/ilex/src/rt/emit2.rs b/ilex/src/rt/emit2.rs index 02b2b6a..8c2c21a 100644 --- a/ilex/src/rt/emit2.rs +++ b/ilex/src/rt/emit2.rs @@ -6,9 +6,8 @@ use byteyarn::Yarn; use byteyarn::YarnBox; use crate::f; -use crate::file::Context; use crate::file::Span; -use crate::file::Spanned; +use crate::file::Span2; use crate::plural; use crate::report::Expected; use crate::rt; @@ -22,16 +21,11 @@ use crate::rule::Comment; use crate::rule::Quoted; use crate::spec::Lexeme; use crate::spec::Spec; -use crate::token; -use crate::token::Content; -use crate::token::Cursor; use super::dfa::Lexeme2; use super::unicode::is_xid; pub fn emit(lexer: &mut Lexer) { - let ctx = lexer.file().context(); - // Start by searching for the longest matches using the DFA. let dfa = lexer.spec().dfa(); let Some(mut match_) = dfa.search(lexer) else { @@ -39,11 +33,10 @@ pub fn emit(lexer: &mut Lexer) { }; let start = lexer.cursor(); - lexer.advance(match_.len); - let range = lexer.span(start..lexer.cursor()); - let span = range.intern(ctx); - let text = range.text(ctx); - lexer.advance(match_.extra); + let end = start + match_.len; + let span = lexer.span(start..end); + let text = span.text(); + let end = end + match_.extra; // Now we have to decide which of `candidates` is the best one, i.e., // the one with no errors. The following things are explicitly *not* @@ -70,7 +63,7 @@ pub fn emit(lexer: &mut Lexer) { // choices; that is independent of which token we decide to create. let mut best = None; 'verify: for &c in &match_.candidates { - let [.., range, _] = find_affixes_partial(range, lexer.spec(), c, ctx); + let [.., range, _] = find_affixes_partial(span, lexer.spec(), c); // NOTE: We only need to find the first lexeme that is valid. If it's not // valid, we will diagnose that in the next stage. 
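// A hedged sketch of the contract this loop assumes (the `Match` name and
// exact shape here are illustrative; `Lexeme2` is per dfa.rs): `Dfa::search`
// reports the longest anchored match, any extra bytes consumed past it
// (diagnosed separately), and one candidate per pattern matching at that
// length:
//
//   pub struct Lexeme2 { pub lexeme: Lexeme<rule::Any>, pub is_close: bool }
//   pub struct Match { pub len: usize, pub extra: usize, pub candidates: Vec<Lexeme2> }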
@@ -86,13 +79,13 @@ pub fn emit(lexer: &mut Lexer) { range.split_around(close.0.len(), close.1.len()) }; - let [_, name, _] = find_affixes(range, &ident_rule.affixes, ctx); - if name.text(ctx).chars().count() < ident_rule.min_len { + let [_, name, _] = find_affixes(range, &ident_rule.affixes); + if name.text().chars().count() < ident_rule.min_len { continue 'verify; } if ident_rule.ascii_only { - for c in name.text(ctx).chars() { + for c in name.text().chars() { if !c.is_ascii() && !ident_rule.extra_continues.contains(c) && !ident_rule.extra_starts.contains(c) @@ -198,12 +191,9 @@ pub fn emit(lexer: &mut Lexer) { } let best = best.unwrap_or(match_.candidates[0]); - let [sign, pre, range, suf] = - find_affixes_partial(range, lexer.spec(), best, ctx); - let text = range.text(ctx); - - let prefix = pre.intern_nonempty(ctx); - let suffix = suf.intern_nonempty(ctx); + let [sign_span, prefix, range, suffix] = + find_affixes_partial(span, lexer.spec(), best); + let text = range.text(); let mirrored = match lexer.spec().rule(best.lexeme) { Any::Bracket(bracket) @@ -216,16 +206,16 @@ pub fn emit(lexer: &mut Lexer) { if !best.is_close { (open, close) } else { (close, open) }; let [_, mid, _] = range.split_around(remove.0.len(), remove.1.len()); - Some(yarn!("{}{}{}", replace.0, mid.text(ctx), replace.1)) + Some(yarn!("{}{}{}", replace.0, mid.text(), replace.1)) } BracketKind::CxxLike { ident_rule, open, close, .. } => { let (remove, replace) = if !best.is_close { (open, close) } else { (close, open) }; let [_, mid, _] = range.split_around(remove.0.len(), remove.1.len()); - let [_, name, _] = find_affixes(mid, &ident_rule.affixes, ctx); + let [_, name, _] = find_affixes(mid, &ident_rule.affixes); - let text = name.text(ctx); + let text = name.text(); let count = text.chars().count(); if count < ident_rule.min_len { lexer @@ -243,13 +233,13 @@ pub fn emit(lexer: &mut Lexer) { } } - Some(yarn!("{}{}{}", replace.0, mid.text(ctx), replace.1)) + Some(yarn!("{}{}{}", replace.0, mid.text(), replace.1)) } }, _ => None, }; - let mut generated_token = true; + let mut emitted = true; if best.is_close { let Some(opener) = &mirrored else { bug!("found is_close Lexeme2 corresponding to rule without brackets") @@ -262,19 +252,14 @@ pub fn emit(lexer: &mut Lexer) { }; lexer.builtins().unopened(opener, found, span); - generated_token = false; + lexer.add_token(rt::UNEXPECTED, end - start, None); + emitted = false; } else { // Now we have repeat the process from the 'verify, but now we know what kind // of token we're going to create. match lexer.spec().rule(best.lexeme) { - Any::Keyword(..) => lexer.add_token(rt::Token { - kind: rt::Kind::Keyword, - span, - lexeme: best.lexeme, - prefix, - suffix, - }), + Any::Keyword(..) => lexer.add_token(best.lexeme, range.len(), None), Any::Bracket(..) => { // Construct the closer. @@ -282,35 +267,35 @@ pub fn emit(lexer: &mut Lexer) { best.lexeme.cast(), mirrored.clone().unwrap().immortalize(), ); - lexer.add_token(rt::Token { - kind: rt::Kind::Open { offset_to_close: !0 }, - span, - lexeme: best.lexeme, - prefix, - suffix, - }); + lexer.add_token( + best.lexeme, + range.len(), + Some(rt::Kind::Offset { cursor: 0, meta: 0 }), + ); } + #[allow(clippy::almost_swapped)] Any::Comment(rule) => { // Comments aren't real tokens. - generated_token = false; + emitted = false; + let mut cursor = end; // The span we created only contains the open bracket for the comment. // We still need to lex the comment to the end. 
let mut depth = 1; let close = mirrored.clone().unwrap().immortalize(); - while let Some(c) = lexer.rest().chars().next() { - if rule.can_nest && lexer.rest().starts_with(text) { + while let Some(c) = lexer.text(cursor..).chars().next() { + if rule.can_nest && lexer.text(cursor..).starts_with(text) { depth += 1; - lexer.advance(text.len()); - } else if lexer.rest().starts_with(close.as_str()) { + cursor += text.len(); + } else if lexer.text(cursor..).starts_with(close.as_str()) { depth -= 1; - lexer.advance(close.len()); + cursor += close.len(); if depth == 0 { break; } } else { - lexer.advance(c.len_utf8()); + cursor += c.len_utf8(); } } @@ -321,8 +306,7 @@ pub fn emit(lexer: &mut Lexer) { .unclosed(span, &close, Lexeme::eof(), lexer.eof()); } - let span = lexer.intern(start..lexer.cursor()); - lexer.add_comment(span); + lexer.add_token(best.lexeme, cursor - lexer.cursor(), None); } Any::Ident(rule) => { @@ -342,33 +326,42 @@ pub fn emit(lexer: &mut Lexer) { } } - lexer.add_token(rt::Token { - kind: rt::Kind::Ident(range.intern(ctx)), - span, - lexeme: best.lexeme, - prefix, - suffix, - }); + lexer.add_token(rt::PREFIX, prefix.len(), None); + lexer.add_token(best.lexeme, range.len(), None); + lexer.add_token(rt::SUFFIX, suffix.len(), None); } Any::Digital(rule) => { - let sign_text = sign.text(ctx); - let sign = sign.intern_nonempty(ctx).map(|span| { - for (text, value) in &rule.mant.signs { - if text == sign_text { - return (*value, span); - } - } - bug!("could not find appropriate sign for Digital rule") + lexer.add_token(rt::PREFIX, prefix.len(), None); + lexer.add_token( + best.lexeme, + sign_span.len() + range.len(), + Some(rt::Kind::Digital(rt::Digital::default())), + ); + lexer.add_token(rt::SUFFIX, suffix.len(), None); + + let sign_text = sign_span.text(); + let sign = (!sign_text.is_empty()).then(|| { + let Some((_, value)) = + rule.mant.signs.iter().find(|(text, _)| text == sign_text) + else { + bug!("could not find appropriate sign for Digital rule"); + }; + + (*value, sign_span.span2()) }); let mut chunks = vec![DigitBlocks { - prefix, + prefix: Span2::default(), sign, blocks: Vec::new(), which_exp: !0, }]; + if !prefix.is_empty() { + chunks[0].prefix = prefix.span2(); + } + let mut offset = 0; let mut text = text; @@ -417,7 +410,7 @@ pub fn emit(lexer: &mut Lexer) { chunk .blocks - .push(range.subspan(block_start..offset).intern(ctx)); + .push(range.subspan(block_start..offset).span2()); text = rest; offset += rule.point.len(); block_start = offset; @@ -437,12 +430,10 @@ pub fn emit(lexer: &mut Lexer) { chunk .blocks - .push(range.subspan(block_start..offset).intern(ctx)); + .push(range.subspan(block_start..offset).span2()); - let prefix = - range.subspan(offset..offset + pre.len()).intern(ctx); + let prefix = range.subspan(offset..offset + pre.len()); text = rest; - offset += pre.len(); let sign = exp @@ -451,20 +442,23 @@ pub fn emit(lexer: &mut Lexer) { .filter(|(y, _)| rest.starts_with(y.as_str())) .max_by_key(|(y, _)| y.len()) .map(|(y, s)| { - let sign = - range.subspan(offset..offset + y.len()).intern(ctx); + let sign = range.subspan(offset..offset + y.len()); text = &text[y.len()..]; offset += y.len(); - (*s, sign) + (*s, sign.span2()) }); chunks.push(DigitBlocks { - prefix: Some(prefix), + prefix: Span2::default(), sign, blocks: Vec::new(), which_exp: i, }); + if !prefix.is_empty() { + chunks.last_mut().unwrap().prefix = prefix.span2(); + } + digits = exp; block_start = offset; last_was_sep = false; @@ -488,24 +482,28 @@ pub fn emit(lexer: &mut Lexer) { .last_mut() 
.unwrap() .blocks - .push(range.subspan(block_start..).intern(ctx)); - + .push(range.subspan(block_start..).span2()); let mant = chunks.remove(0); - let tok = rt::Token { - kind: rt::Kind::Digital { digits: mant, exponents: chunks }, - span, - lexeme: best.lexeme, - prefix, - suffix, + + let Some(rt::Kind::Digital(meta)) = lexer + .stream_mut() + .last_meta_mut() + .and_then(|m| m.kind.as_mut()) + else { + bug!("missing rt::Digital in digital token"); + }; + meta.digits = mant; + meta.exponents = chunks; + + let Some(rt::Kind::Digital(meta)) = + lexer.stream().last_meta().and_then(|m| m.kind.as_ref()) + else { + bug!("missing rt::Digital in digital token"); }; - let token = Cursor::fake_token(lexer.file(), lexer.spec(), &tok); // This happens later so we have access to the full spans of // the digit blocks. - let rt::Kind::Digital { digits, exponents } = &tok.kind else { - unreachable!() - }; - for chunk in iter::once(digits).chain(exponents) { + for chunk in iter::once(&meta.digits).chain(&meta.exponents) { let digits = rule .exps .get(chunk.which_exp) @@ -514,10 +512,9 @@ pub fn emit(lexer: &mut Lexer) { let chunk_span = Span::union( chunk - .prefix + .prefix(lexer.file()) .into_iter() - .chain(chunk.blocks.iter().copied()) - .map(|s| s.span(ctx)), + .chain(chunk.blocks(lexer.file())), ); if (chunk.blocks.len() as u32) < digits.min_chunks { @@ -532,19 +529,16 @@ pub fn emit(lexer: &mut Lexer) { .at(chunk_span); } - for block in &chunk.blocks { - let range = block.span(ctx); - let mut text = block.text(ctx); + for block in chunk.blocks(lexer.file()) { + let mut text = block.text(); - if range.is_empty() { - let prefix = chunk.prefix.unwrap(); + // FIXME: The is_some() here should not be necessary. + if range.is_empty() && chunk.prefix(lexer.file()).is_some() { + let prefix = chunk.prefix(lexer.file()).unwrap(); lexer .builtins() .expected( - [Expected::Name(yarn!( - "digits after `{}`", - prefix.text(ctx), - ))], + [Expected::Name(yarn!("digits after `{}`", prefix.text(),))], match lexer.text(range.end()..).chars().next() { Some(c) => Expected::Literal(Yarn::from(c)), None => Expected::Lexeme(Lexeme::eof().any()), @@ -567,7 +561,7 @@ pub fn emit(lexer: &mut Lexer) { if !c.is_digit(digits.radix as u32) { lexer.builtins().unexpected( Expected::Literal(c.into()), - token, + lexer.stream().last_token(), lexer.span(cursor..cursor + c.len_utf8()), ) .remark( @@ -581,70 +575,69 @@ pub fn emit(lexer: &mut Lexer) { } } } - - lexer.add_token(tok); } Any::Quoted(rule) => { let close = mirrored.clone().unwrap().immortalize(); - let mut chunk_start = lexer.cursor(); - let mut content = Vec::new(); + let mut chunk_start = end; + let mut cursor = end; + let mut marks = vec![chunk_start as u32]; let uq_end = loop { - if lexer.rest().starts_with(close.as_str()) { - let end = lexer.cursor(); - lexer.advance(close.len()); + if lexer.text(cursor..).starts_with(close.as_str()) { + let end = cursor; + cursor += close.len(); if end > chunk_start { - content.push(Content::Lit(lexer.intern(chunk_start..end))); + marks.push(end as u32); } break Some(end); } - let (esc, rule) = match rule.escapes.longest_prefix(lexer.rest()) { + let rest = lexer.text(cursor..); + let (esc, rule) = match rule.escapes.longest_prefix(rest) { Some(e) => e, - None => match lexer.rest().chars().next() { + None => match rest.chars().next() { Some(c) => { - lexer.advance(c.len_utf8()); + cursor += c.len_utf8(); continue; } None => break None, }, }; - if lexer.cursor() > chunk_start { - content - 
          .push(Content::Lit(lexer.intern(chunk_start..lexer.cursor())));
-      }
+        // Push unconditionally: this ensures that chunks of text are always
+        // between escapes, even if the literal chunks are empty.
+        marks.push(cursor as u32);
 
-        let esc_start = lexer.cursor();
-        lexer.advance(esc.len());
-        let esc = lexer.intern(esc_start..lexer.cursor());
-        let value = match rule {
+        let esc_start = cursor;
+        cursor += esc.len();
+        let esc_end = cursor;
+        let mark = match rule {
           rule::Escape::Invalid => {
             lexer.builtins().invalid_escape(
-              lexer.span(esc_start..lexer.cursor()),
+              lexer.span(esc_start..cursor),
               "invalid escape sequence",
             );
-            None
+            [cursor; 3]
           }
 
-          rule::Escape::Basic => None,
+          rule::Escape::Basic => [cursor; 3],
 
           rule::Escape::Fixed(chars) => {
-            let arg_start = lexer.cursor();
+            let arg_start = cursor;
             let mut count = 0;
             for _ in 0..*chars {
               // TRICKY: We have just skipped over \x. If we were to take *any*
               // characters, we would lex `"\x" ` as being `\x` with arg `" `.
               // So, we want to check for a closer on *every* loop iteration, and
               // break out if we *see* it: we should not consume it.
-              if lexer.rest().starts_with(close.as_str()) {
+              if lexer.text(cursor..).starts_with(close.as_str()) {
                 break;
               }
 
-              match lexer.rest().chars().next() {
-                Some(c) => lexer.advance(c.len_utf8()),
+              match lexer.text(cursor..).chars().next() {
+                Some(c) => cursor += c.len_utf8(),
                 None => break,
               }
               count += 1;
             }
 
@@ -652,7 +645,7 @@
             if count != *chars {
               lexer.builtins().invalid_escape(
-                lexer.span(esc_start..lexer.cursor()),
+                lexer.span(esc_start..cursor),
                 f!(
                   "expected exactly {chars} character{} here",
                   plural(*chars)
                 ),
               );
             }
 
-            Some(lexer.intern(arg_start..lexer.cursor()))
+            [arg_start, cursor, cursor]
           }
 
@@ -660,80 +653,78 @@
           rule::Escape::Bracketed(open, close) => 'delim: {
-            if !lexer.rest().starts_with(open.as_str()) {
+            if !lexer.text(cursor..).starts_with(open.as_str()) {
               lexer.builtins().invalid_escape(
-                lexer.span(esc_start..lexer.cursor()),
+                lexer.span(esc_start..cursor),
                 f!("expected a `{open}`"),
               );
-              break 'delim None;
+              break 'delim [cursor; 3];
             } else {
-              lexer.advance(open.len());
+              cursor += open.len()
             }
 
-            let arg_start = lexer.cursor();
-            let Some(len) = lexer.rest().find(close.as_str()) else {
+            let arg_start = cursor;
+            let Some(len) = lexer.text(cursor..).find(close.as_str()) else {
               lexer.builtins().invalid_escape(
-                lexer.span(esc_start..lexer.cursor()),
+                lexer.span(esc_start..cursor),
                 f!("expected a `{close}`"),
               );
-              break 'delim None;
+              break 'delim [arg_start, cursor, cursor];
             };
 
-            lexer.advance(len + close.len());
-            Some(lexer.intern(arg_start..lexer.cursor() - close.len()))
+            cursor += len + close.len();
+            [arg_start, arg_start + len, cursor]
           }
         };
 
-        content.push(Content::Esc(esc, value));
-        chunk_start = lexer.cursor();
+        marks.push(esc_end as u32);
+        marks.extend(mark.iter().map(|&x| x as u32));
+        chunk_start = cursor;
       };
 
-      let uq_end = uq_end.unwrap_or_else(|| {
+      if uq_end.is_none() {
         lexer
           .builtins()
          .unclosed(span, &close, Lexeme::eof(), lexer.eof());
-        lexer.cursor()
-      });
+      }
 
       // We have to parse the suffix ourselves explicitly!
let suf = rule .affixes .suffixes() .iter() - .filter(|y| lexer.rest().starts_with(y.as_str())) + .filter(|y| lexer.text(cursor..).starts_with(y.as_str())) .map(|y| y.len()) .max() .unwrap_or_else(|| { + let found = match lexer.text(cursor..).chars().next() { + Some(n) => Expected::Literal(n.into()), + None => Lexeme::eof().into(), + }; + lexer.builtins().expected( rule .affixes .suffixes() .iter() .map(|y| Expected::Literal(y.aliased())), - Expected::Literal("fixme".into()), - lexer.span(lexer.cursor()..lexer.cursor()), + found, + lexer.span(cursor..cursor), ); 0 }); - let suf_start = lexer.cursor(); - lexer.advance(suf); - let suffix = lexer.span(suf_start..lexer.cursor()).intern_nonempty(ctx); - - lexer.add_token(rt::Token { - kind: rt::Kind::Quoted { - content, - open: range.intern(ctx), - close: lexer.intern(uq_end..suf_start), - }, - span: lexer.intern(span.span(ctx).start()..lexer.cursor()), - lexeme: best.lexeme, - prefix, - suffix, - }); + + lexer.add_token(rt::PREFIX, prefix.len(), None); + lexer.add_token( + best.lexeme, + cursor - lexer.cursor(), + Some(rt::Kind::Quoted(rt::Quoted { marks })), + ); + lexer.add_token(rt::SUFFIX, suf, None); } } } @@ -745,12 +736,8 @@ pub fn emit(lexer: &mut Lexer) { // and diagnose that. if match_.extra > 0 { - let expected = if generated_token { - Expected::Token(token::Cursor::fake_token( - lexer.file(), - lexer.spec(), - lexer.last_token(), - )) + let expected = if emitted { + Expected::Token(lexer.stream().last_token()) } else if let Some(mirrored) = &mirrored { if best.is_close { Expected::Literal(yarn!("{mirrored} ... {text}")) @@ -767,22 +754,16 @@ pub fn emit(lexer: &mut Lexer) { .extra_chars(expected, lexer.span(start..start + match_.extra)); } - let prev = lexer.rest().chars().next_back(); + let rest = lexer.text(lexer.cursor()..); + let prev = rest.chars().next_back(); if prev.is_some_and(is_xid) { - let xids = lexer - .rest() - .find(|c| !is_xid(c)) - .unwrap_or(lexer.rest().len()); + let xids = rest.find(|c| !is_xid(c)).unwrap_or(rest.len()); if xids > 0 { let start = lexer.cursor(); - lexer.advance(xids); - - let expected = if generated_token { - Expected::Token(token::Cursor::fake_token( - lexer.file(), - lexer.spec(), - lexer.last_token(), - )) + lexer.add_token(rt::UNEXPECTED, xids, None); + + let expected = if emitted { + Expected::Token(lexer.stream().last_token()) } else if let Some(mirrored) = &mirrored { if best.is_close { Expected::Literal(yarn!("{mirrored} ... {text}")) @@ -795,23 +776,22 @@ pub fn emit(lexer: &mut Lexer) { lexer .builtins() - .extra_chars(expected, lexer.span(start..lexer.cursor())); + .extra_chars(expected, lexer.span(start..start + xids)); } } } /// Extracts the affixes from `text`. 
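// A worked example of the affix splitting implemented below (hypothetical
// rule, not in the patch): for an identifier rule whose affixes declare the
// prefix `%` and the suffix `!`, a span covering `%foo!` is cut into
// [`%`, `foo`, `!`]: the longest matching prefix is split off first, then
// the longest matching suffix of what remains.
//
//   let [pre, core, suf] = find_affixes(span, &rule.affixes);
//   assert_eq!((pre.text(), core.text(), suf.text()), ("%", "foo", "!"));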
-fn find_affixes_partial( - range: Span, +fn find_affixes_partial<'a>( + range: Span<'a>, spec: &Spec, best: Lexeme2, - ctx: &Context, -) -> [Span; 4] { - let text = range.text(ctx); - let ep = range.file(ctx).span(0..0); +) -> [Span<'a>; 4] { + let text = range.text(); + let ep = range.file().span(0..0); match spec.rule(best.lexeme) { Any::Ident(rule) => { - let [pre, range, suf] = find_affixes(range, &rule.affixes, ctx); + let [pre, range, suf] = find_affixes(range, &rule.affixes); [ep, pre, range, suf] } Any::Digital(rule) => { @@ -825,15 +805,15 @@ fn find_affixes_partial( .unwrap_or(0); let (sign, range) = range.split_at(sign); - let [pre, range, suf] = find_affixes(range, &rule.affixes, ctx); + let [pre, range, suf] = find_affixes(range, &rule.affixes); [sign, pre, range, suf] } Any::Quoted(rule) if !best.is_close => { - let (pre, range) = find_prefix(range, &rule.affixes, ctx); + let (pre, range) = find_prefix(range, &rule.affixes); [ep, pre, range, ep] } Any::Quoted(rule) => { - let (range, suf) = find_suffix(range, &rule.affixes, ctx); + let (range, suf) = find_suffix(range, &rule.affixes); [ep, ep, range, suf] } _ => [ep, ep, range, ep], @@ -841,14 +821,14 @@ fn find_affixes_partial( } /// Extracts the affixes from `text`. -fn find_affixes(range: Span, affixes: &Affixes, ctx: &Context) -> [Span; 3] { - let (prefix, range) = find_prefix(range, affixes, ctx); - let (range, suffix) = find_suffix(range, affixes, ctx); +fn find_affixes<'a>(range: Span<'a>, affixes: &Affixes) -> [Span<'a>; 3] { + let (prefix, range) = find_prefix(range, affixes); + let (range, suffix) = find_suffix(range, affixes); [prefix, range, suffix] } -fn find_prefix(range: Span, affixes: &Affixes, ctx: &Context) -> (Span, Span) { - let text = range.text(ctx); +fn find_prefix<'a>(range: Span<'a>, affixes: &Affixes) -> (Span<'a>, Span<'a>) { + let text = range.text(); let prefix = affixes .prefixes() .iter() @@ -859,8 +839,8 @@ fn find_prefix(range: Span, affixes: &Affixes, ctx: &Context) -> (Span, Span) { range.split_at(prefix) } -fn find_suffix(range: Span, affixes: &Affixes, ctx: &Context) -> (Span, Span) { - let text = range.text(ctx); +fn find_suffix<'a>(range: Span<'a>, affixes: &Affixes) -> (Span<'a>, Span<'a>) { + let text = range.text(); let suffix = affixes .suffixes() .iter() diff --git a/ilex/src/rt/lexer.rs b/ilex/src/rt/lexer.rs index 2e1025a..165427c 100644 --- a/ilex/src/rt/lexer.rs +++ b/ilex/src/rt/lexer.rs @@ -1,4 +1,5 @@ use std::mem; +use std::num::NonZeroU32; use std::ops::Index; use std::ops::RangeBounds; @@ -6,19 +7,18 @@ use byteyarn::Yarn; use regex_automata::hybrid::dfa::Cache; use crate::f; -use crate::file::Context; use crate::file::File; use crate::file::Span; -use crate::file::SpanId; -use crate::file::Spanned; use crate::report::Builtins; use crate::report::Report; use crate::rt; use crate::rule; +use crate::rule::Any; use crate::rule::Bracket; use crate::spec::Lexeme; use crate::spec::Spec; use crate::token; +use crate::token::Stream; use super::unicode::is_xid; @@ -26,15 +26,12 @@ use super::unicode::is_xid; /// operation. pub struct Lexer<'a, 'ctx> { report: &'a Report, - spec: &'ctx Spec, - file: File<'ctx>, + stream: Stream<'ctx>, cursor: usize, - tokens: Vec, closers: Vec, - comments: Vec, + comments: Vec, - eof: SpanId, cache: Cache, } @@ -42,6 +39,7 @@ pub struct Lexer<'a, 'ctx> { pub struct Closer { lexeme: Lexeme, open_idx: usize, + meta_idx: usize, original_open_idx: usize, // For diagnostics. 
close: Yarn, } @@ -50,27 +48,21 @@ impl<'a, 'ctx> Lexer<'a, 'ctx> { /// Creates a new lexer. pub fn new(file: File<'ctx>, report: &'a Report, spec: &'ctx Spec) -> Self { Lexer { - eof: file.span(file.len()..file.len()).intern(file.context()), - cache: Cache::new(&spec.dfa().engine), - - file, report, - spec, + stream: Stream { + file, + spec, + toks: Vec::new(), + meta_idx: Vec::new(), + meta: Vec::new(), + }, cursor: 0, - tokens: Vec::new(), closers: Vec::new(), comments: Vec::new(), - } - } - - pub fn advance(&mut self, by: usize) { - assert!( - self.cursor.saturating_add(by) <= self.text(..).len(), - "ilex: advanced cursor beyond the end of text; this is a bug" - ); - self.cursor += by; + cache: Cache::new(&spec.dfa().engine), + } } /// Returns the report for diagnostics. @@ -78,9 +70,18 @@ impl<'a, 'ctx> Lexer<'a, 'ctx> { self.report } + /// Returns the stream this lexer is building. + pub fn stream(&self) -> &Stream<'ctx> { + &self.stream + } + + pub(crate) fn stream_mut(&mut self) -> &mut Stream<'ctx> { + &mut self.stream + } + /// Returns the spec we're lexing against. pub fn spec(&self) -> &'ctx Spec { - self.spec + self.stream.spec() } /// Returns the diagnostics builtins. @@ -90,7 +91,7 @@ impl<'a, 'ctx> Lexer<'a, 'ctx> { /// Returns the spec we're lexing against. pub fn file(&self) -> File<'ctx> { - self.file + self.stream.file() } /// Returns a slice of the current file being lexed. @@ -98,7 +99,7 @@ impl<'a, 'ctx> Lexer<'a, 'ctx> { where str: Index, { - self.file.text(range) + self.file().text(range) } /// Returns the current cursor position. @@ -106,55 +107,46 @@ impl<'a, 'ctx> Lexer<'a, 'ctx> { self.cursor } - /// Returns everything after the current cursor position. - pub fn rest(&self) -> &'ctx str { - self.text(self.cursor..) - } - /// Returns the EOF span. - pub fn eof(&self) -> SpanId { - self.eof + pub fn eof(&self) -> Span<'ctx> { + self.file().span(self.file().len()..self.file().len()) } /// Creates a new range in the current file. - pub fn span(&self, range: impl RangeBounds) -> Span { - self.file.span(range) - } - - /// Creates a new range in the current file and bakes it. - pub fn intern(&self, range: impl RangeBounds) -> SpanId { - self.file.span(range).intern(self.ctx()) + pub fn span(&self, range: impl RangeBounds) -> Span<'ctx> { + self.file().span(range) } - /// Creates a new span in the current file with the given range. - pub fn ctx(&self) -> &'ctx Context { - self.file().context() + // Returns the span of the token at the given index. + pub fn lookup_span(&self, idx: usize) -> Span<'ctx> { + let end = self.stream.toks[idx].end as usize; + let start = self.stream.toks[..idx] + .last() + .map(|p| p.end as usize) + .unwrap_or(0); + self.file().span(start..end) } pub fn cache(&mut self) -> &mut Cache { &mut self.cache } - pub fn last_token(&self) -> &rt::Token { - self.tokens.last().unwrap() - } - /// Pushes a closer. pub fn push_closer(&mut self, lexeme: Lexeme, close: Yarn) { self.closers.push(Closer { lexeme, close, - open_idx: self.tokens.len(), - original_open_idx: self.tokens.len(), + open_idx: self.stream.toks.len(), + meta_idx: self.stream.meta_idx.len(), + original_open_idx: self.stream.toks.len(), }); } /// Pops a closer, if it is time for it. 
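// A worked example of the packed encoding that `lookup_span` above decodes:
// `stream.toks` records only each token's end offset, so a token's start is
// its predecessor's end (or 0 for the first token). For "let x" lexed as
// [`let`, ` `, `x`], the stored ends are [3, 4, 5], and `lookup_span(2)`
// yields the span 4..5.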
pub fn pop_closer(&mut self) { - let idx = self - .closers - .iter() - .rposition(|close| self.rest().starts_with(close.close.as_str())); + let idx = self.closers.iter().rposition(|close| { + self.text(self.cursor()..).starts_with(close.close.as_str()) + }); let Some(idx) = idx else { return }; let len = self.closers.len(); @@ -167,43 +159,43 @@ impl<'a, 'ctx> Lexer<'a, 'ctx> { } let start = self.cursor(); - self.advance(close.close.len()); + let mut end = start + close.close.len(); - let close_idx = self.tokens.len(); - let offset_to_open = (close_idx - close.open_idx) as u32; + let close_idx = self.stream.toks.len(); + let meta_idx = self.stream.meta.len(); + let offset = (close_idx - close.open_idx) as i32; + let meta_offset = (meta_idx - close.meta_idx) as i32; - match &mut self.tokens[close.open_idx].kind { - rt::Kind::Open { offset_to_close, .. } => { - *offset_to_close = offset_to_open - } - _ => { - panic!("ilex: lexer.closers.last().open_idx did not point to an rt::Kind::Open; this is a bug") - } - } - let open_sp = self.tokens[close.open_idx].span; + let Some(rt::Kind::Offset { cursor, meta }) = + &mut self.stream.meta[close.meta_idx].kind + else { + bug!("ilex: lexer.closers.last().open_idx did not point to an rt::Kind::Open") + }; + *cursor += offset; + *meta += meta_offset; + + let open_sp = self.lookup_span(close.open_idx); - let prev = self.rest().chars().next_back(); + let rest = self.text(end..); + let prev = rest.chars().next_back(); if prev.is_some_and(is_xid) { - let xids = self - .rest() - .find(|c| !is_xid(c)) - .unwrap_or(self.rest().len()); + let xids = rest.find(|c| !is_xid(c)).unwrap_or(rest.len()); if xids > 0 { - let start = self.cursor(); - self.advance(xids); + let start = end; + end += xids; - let span = self.span(start..self.cursor()); + let span = self.span(start..end); self.builtins().extra_chars( self.spec().rule_name_or( close.lexeme.any(), - f!("{} ... {}", open_sp.text(self.file.context()), close.close), + f!("{} ... {}", open_sp, close.close), ), span, ); } } - let span = self.span(start..self.cursor).intern(self.ctx()); + let span = self.span(start..end); if idx != self.closers.len() { // This is a so-called "mixed delimiter", and an error we need to // diagnose. @@ -215,76 +207,110 @@ impl<'a, 'ctx> Lexer<'a, 'ctx> { ); } - let full_span = - self.intern(open_sp.span(self.ctx()).start()..self.cursor()); - self.add_token(rt::Token { - kind: rt::Kind::Close { full_span, offset_to_open }, - span, - lexeme: close.lexeme.any(), - prefix: None, - suffix: None, - }); + self.add_token( + close.lexeme.any(), + end - start, + Some(rt::Kind::Offset { cursor: -offset, meta: -meta_offset }), + ); } - /// Adds a new token, draining all of the saved-up comments. - pub fn add_token(&mut self, tok: rt::Token) { - let span = tok.span.span(self.ctx()); - for comment in self.comments.drain(..) { - span.append_comment_span(self.file.context(), comment); + /// Adds a new token. + pub fn add_token( + &mut self, + lexeme: Lexeme, + len: usize, + kind: Option, + ) { + if lexeme.is_aux() { + if len == 0 { + return; + } + + if let Some(prev) = self.stream.toks.last_mut() { + if prev.lexeme == lexeme { + prev.end += len as u32; + self.cursor += len; + return; + } + } } - self.tokens.push(tok); - } + let new_len = self.cursor.saturating_add(len); + let total_len = self.text(..).len(); - /// Adds a new token, draining all of the saved-up comments. 
- pub fn add_comment(&mut self, span: SpanId) { - self.comments.push(span); - } + debug_assert!( + new_len <= total_len, + "ilex: advanced cursor beyond the end of text ({new_len} > {total_len}); this is a bug" + ); - /// Adds new unexpected tokens, starting from `start`. This may generate - /// multiple tokens, since it does not include whitespace in them. - pub fn add_unexpected(&mut self, mut start: usize, end: usize) { - let mut idx = start; - // Can't use a for loop, since that takes ownership of the iterator - // and that makes the self. calls below a problem. - while let Some(c) = self.text(idx..end).chars().next() { - if c.is_whitespace() { - if idx > start { - let span = self.span(start..idx); - self.builtins().unexpected_token(span); + if cfg!(debug_assertions) && !lexeme.is_eof() && !lexeme.is_aux() { + match self.spec().rule(lexeme) { + Any::Bracket(_) if !matches!(kind, Some(rt::Kind::Offset { .. })) => { + bug!("missing rt::Metadata::Offset on bracket rule") } - start = idx + c.len_utf8(); + Any::Digital(_) if !matches!(kind, Some(rt::Kind::Digital(_))) => { + bug!("missing rt::Metadata::Digital on digital rule") + } + Any::Quoted(_) if !matches!(kind, Some(rt::Kind::Quoted(_))) => { + bug!("missing rt::Metadata::Quoted on quoted rule") + } + _ => {} } + } - idx += c.len_utf8(); + let start = self.cursor(); + self + .stream + .toks + .push(rt::Token { lexeme, end: (start + len) as u32 }); + + let mut meta = rt::Metadata { kind, comments: Vec::new() }; + + if lexeme.can_have_comments(self.spec()) { + meta.comments = mem::take(&mut self.comments); } - if idx > start { - let span = self.span(start..idx); - self.builtins().unexpected_token(span); + if meta.kind.is_some() || !meta.comments.is_empty() { + self.stream.meta_idx.push(token::Id( + NonZeroU32::new(self.stream.toks.len() as u32).unwrap(), + )); + self.stream.meta.push(meta); } + + if !lexeme.is_eof() + && !lexeme.is_aux() + && matches!(self.spec().rule(lexeme), rule::Any::Comment(_)) + { + self.comments.push(token::Id( + NonZeroU32::new(self.stream.toks.len() as u32).unwrap(), + )); + } + + self.cursor += len; + } + + pub fn skip_whitespace(&mut self) -> bool { + let len = self + .text(self.cursor()..) + .chars() + .take_while(|c| c.is_whitespace()) + .map(char::len_utf8) + .sum(); + + self.add_token(rt::WHITESPACE, len, None); + len > 0 } pub fn finish(mut self) -> token::Stream<'ctx> { - self.add_token(rt::Token { - kind: rt::Kind::Eof, - span: self.eof, - lexeme: Lexeme::eof().cast(), - prefix: None, - suffix: None, - }); + self.add_token(Lexeme::eof().any(), 0, None); for close in mem::take(&mut self.closers) { - let open = self.tokens[close.original_open_idx].span; + let open = self.lookup_span(close.original_open_idx); self .builtins() .unclosed(open, &close.close, Lexeme::eof(), self.eof()); } - token::Stream { - file: self.file, - spec: self.spec, - toks: self.tokens, - } + self.stream } } diff --git a/ilex/src/rt/mod.rs b/ilex/src/rt/mod.rs index 631bd05..80afcb6 100644 --- a/ilex/src/rt/mod.rs +++ b/ilex/src/rt/mod.rs @@ -1,7 +1,10 @@ //! The lexer runtime. 
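//!
//! A lexed stream is stored compactly: `toks` holds one `(lexeme, end
//! offset)` pair per token; tokens that carry out-of-band data have their
//! IDs in the sorted `meta_idx`, with the data itself in the parallel
//! `meta`. A rough sketch of how `[x]` might be laid out (offsets are
//! illustrative):
//!
//! ```text
//! toks:     [ "["(end: 1), "x"(end: 2), "]"(end: 3), eof ]
//! meta_idx: [ Id(1), Id(3) ]  // the open and close brackets
//! meta:     [ Offset { cursor:  2, meta:  1 },
//!             Offset { cursor: -2, meta: -1 } ]
//! ```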
+use std::cell::Cell; + use crate::file::File; -use crate::file::SpanId; +use crate::file::Span; +use crate::file::Span2; use crate::report::Fatal; use crate::report::Report; use crate::rule; @@ -9,7 +12,6 @@ use crate::rule::Sign; use crate::spec::Lexeme; use crate::spec::Spec; use crate::token; -use crate::token::Content; mod emit2; pub mod lexer; @@ -26,41 +28,39 @@ pub fn lex<'ctx>( ) -> Result, Fatal> { let mut lexer = lexer::Lexer::new(file, report, spec); - let mut unexpected_start = None; - while let Some(next) = lexer.rest().chars().next() { - if !next.is_whitespace() { - let start = lexer.cursor(); - - lexer.pop_closer(); - if lexer.cursor() != start { - if let Some(ustart) = unexpected_start.take() { - lexer.add_unexpected(ustart, start); - } - - continue; - } - - emit2::emit(&mut lexer); - if lexer.cursor() != start { - if let Some(ustart) = unexpected_start.take() { - lexer.add_unexpected(ustart, start); - } - - continue; - } - - // We failed to make progress. Skip this character and start an - // "unexpected" token. - if unexpected_start.is_none() { - unexpected_start = Some(lexer.cursor()); - } + let unexpected = Cell::new(None); + let diagnose_unexpected = |end: usize| { + let Some(start) = unexpected.take() else { return }; + report + .builtins(spec) + .unexpected_token(file.span(start..end)); + }; + + loop { + let start = lexer.cursor(); + if lexer.skip_whitespace() { + diagnose_unexpected(start); } - lexer.advance(next.len_utf8()); - } + let start = lexer.cursor(); + let Some(next) = lexer.text(lexer.cursor()..).chars().next() else { break }; + + lexer.pop_closer(); + if lexer.cursor() > start { + diagnose_unexpected(start); + continue; + } + + emit2::emit(&mut lexer); + if lexer.cursor() > start { + diagnose_unexpected(start); + continue; + } - if let Some(start) = unexpected_start { - lexer.add_unexpected(start, lexer.cursor()); + lexer.add_token(UNEXPECTED, next.len_utf8(), None); + if unexpected.get().is_none() { + unexpected.set(Some(start)) + } } report.fatal_or(lexer.finish()) @@ -69,41 +69,71 @@ pub fn lex<'ctx>( /// The internal representation of a token inside of a token stream. #[derive(Clone)] pub struct Token { - pub kind: Kind, - pub span: SpanId, pub lexeme: Lexeme, - pub prefix: Option, - pub suffix: Option, + pub end: u32, +} +#[derive(Clone, Default)] +pub struct Metadata { + pub kind: Option, + pub comments: Vec, } -/// A pared-down token kind. #[derive(Clone)] pub enum Kind { - Eof, - Keyword, - Ident(SpanId), - Quoted { - content: Vec, - open: SpanId, - close: SpanId, - }, - Digital { - digits: DigitBlocks, - exponents: Vec, - }, - Open { - offset_to_close: u32, - }, - Close { - offset_to_open: u32, - full_span: SpanId, - }, + Quoted(Quoted), + Digital(Digital), + Offset { cursor: i32, meta: i32 }, } #[derive(Clone)] +pub struct Quoted { + // Offsets for the components of the string. First mark is the end of the + // open quote; following are alternating marks for textual and escape content. + // Adjacent escapes are separated by empty text content. + // + // Each text component consists of one mark, its end. Each escape consists of + // four marks, which refer to the end of the escape sequence prefix, the start of extra data, its end, and the + // end of the whole escape. This means that when we encounter \xNN, the + // positions of the marks are \x||NN||. When we encounter \u{NN}, the positions + // are \u|{|NN|}|. For \n, the positions are \n||||. 
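+  //
+  // A worked example (a sketch; offsets assume the token starts at offset 0
+  // and uses one-byte quotes): for the input `"a\x41"`, the marks would be
+  //
+  // ```text
+  // marks: [1, 2, 4, 4, 6, 6]
+  //         |  |  |  |  |  '- end of the whole escape
+  //         |  |  |  |  '---- end of the extra data (`41`)
+  //         |  |  |  '------- start of the extra data
+  //         |  |  '---------- end of the escape prefix (`\x`)
+  //         |  '------------- end of the text content (`a`)
+  //         '---------------- end of the open quote
+  // ```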
+ pub marks: Vec, +} + +#[derive(Clone, Default)] +pub struct Digital { + pub digits: DigitBlocks, + pub exponents: Vec, +} + +#[derive(Clone, Default)] pub struct DigitBlocks { - pub prefix: Option, - pub sign: Option<(Sign, SpanId)>, - pub blocks: Vec, + pub prefix: Span2, + pub sign: Option<(Sign, Span2)>, + pub blocks: Vec, pub which_exp: usize, } + +impl DigitBlocks { + pub fn prefix<'ctx>(&self, file: File<'ctx>) -> Option> { + if self.prefix == Span2::default() { + return None; + } + Some(self.prefix.get(file)) + } + + pub fn sign<'ctx>(&self, file: File<'ctx>) -> Option> { + self.sign.map(|(_, s)| s.get(file)) + } + + pub fn blocks<'a, 'ctx: 'a>( + &'a self, + file: File<'ctx>, + ) -> impl Iterator> + 'a { + self.blocks.iter().map(move |s| s.get(file)) + } +} + +pub const WHITESPACE: Lexeme = Lexeme::new(-1); +pub const UNEXPECTED: Lexeme = Lexeme::new(-2); +pub const PREFIX: Lexeme = Lexeme::new(-3); +pub const SUFFIX: Lexeme = Lexeme::new(-4); diff --git a/ilex/src/rule.rs b/ilex/src/rule.rs index 1c12304..cadd086 100644 --- a/ilex/src/rule.rs +++ b/ilex/src/rule.rs @@ -1060,7 +1060,7 @@ impl TryFrom for Digital { /// /// Comments do not generate tokens, unlike most rules. Instead, they are /// attached to the span of a token, and can be inspected through -/// [`Span::comments()`][crate::Span::comments]. +/// [`Token::comments()`][crate::Token::comments]. #[derive(Debug)] pub struct Comment { pub(crate) bracket: Bracket, diff --git a/ilex/src/spec.rs b/ilex/src/spec.rs index 78ffdc8..9139843 100644 --- a/ilex/src/spec.rs +++ b/ilex/src/spec.rs @@ -23,14 +23,14 @@ use crate::rule::Rule; /// be used to distinguish what rule a [`Token`][crate::token::Token] came from. #[repr(transparent)] pub struct Lexeme { - id: u32, + id: i32, _ph: PhantomData, } impl Lexeme { /// Returns the unique lexeme for the end-of-file marker. pub fn eof() -> Self { - Self::new(!0) + Self::new(i32::MAX) } } @@ -40,6 +40,24 @@ impl Lexeme { Lexeme::new(self.id) } + /// Returns whether this is the EOF lexeme. + pub fn is_eof(self) -> bool { + self == Lexeme::eof() + } + + /// Returns whether this is an auxiliary token that users should never + /// actually see. + pub(crate) fn is_aux(self) -> bool { + self.id < 0 + } + + /// Returns whether this lexeme can have comments attached to it. + pub(crate) fn can_have_comments(self, spec: &Spec) -> bool { + !self.is_aux() + && (self.is_eof() + || !matches!(spec.rule(self.any()), rule::Any::Comment(_))) + } + /// Converts this lexeme into an index. pub(crate) fn index(self) -> usize { self.id as usize @@ -51,14 +69,9 @@ impl Lexeme { } /// Creates a new lexeme. - pub(crate) fn new(id: u32) -> Self { + pub(crate) const fn new(id: i32) -> Self { Self { id, _ph: PhantomData } } - - /// Creates a new lexeme. 
- pub fn z() -> Self { - Self { id: 0, _ph: PhantomData } - } } impl fmt::Debug for Lexeme { @@ -176,13 +189,16 @@ impl SpecBuilder { name: impl Into, rule: R, ) -> Lexeme { - if self.rules.len() == (u32::MAX as usize) - 2 { - panic!("ilex: ran out of lexeme ids") + if self.rules.len() == (i32::MAX as usize) { + panic!( + "ilex: grammars with more than {} lexemes are unsupported", + i32::MAX + ) } self.names.push(name.into()); self.rules.push(rule.into()); - Lexeme::new(self.rules.len() as u32 - 1) + Lexeme::new(self.rules.len() as i32 - 1) } #[doc(hidden)] @@ -206,8 +222,8 @@ impl Clone for Lexeme { impl Copy for Lexeme {} -impl PartialEq> for Lexeme { - fn eq(&self, other: &Lexeme) -> bool { +impl PartialEq> for Lexeme { + fn eq(&self, other: &Lexeme) -> bool { self.id == other.id } } @@ -225,7 +241,7 @@ impl Ord for Lexeme { impl Hash for Lexeme { fn hash(&self, state: &mut H) { - u32::hash(&self.id, state) + i32::hash(&self.id, state) } } diff --git a/ilex/src/testing/mod.rs b/ilex/src/testing/mod.rs index 1fda03e..8528d9b 100644 --- a/ilex/src/testing/mod.rs +++ b/ilex/src/testing/mod.rs @@ -13,7 +13,6 @@ use std::fs; use std::ops::Range; use std::path::Path; -use crate::file::Context; use crate::file::Span; use crate::file::Spanned; use crate::report::Report; @@ -185,10 +184,9 @@ impl Matcher { #[track_caller] pub fn assert_matches<'lex>( &self, - ctx: &Context, that: impl IntoIterator>, ) { - self.matches(ctx, that).unwrap() + self.matches(that).unwrap() } /// Sets an expectation for the overall span of the most recently added @@ -226,7 +224,6 @@ impl Matcher { /// If matching fails, returns an error describing why. pub fn matches<'lex>( &self, - ctx: &Context, that: impl IntoIterator>, ) -> Result<(), impl fmt::Debug> { struct DebugBy(String); @@ -236,13 +233,13 @@ impl Matcher { } } - let mut state = recognize::MatchState::new(ctx); + let mut state = recognize::MatchState::new(); recognize::zip_eq( "token streams", &mut state, &self.stream, that, - |state, ours, theirs| ours.recognizes(state, theirs, ctx), + |state, ours, theirs| ours.recognizes(state, theirs), ); state.finish().map_err(DebugBy) } @@ -332,13 +329,10 @@ impl Text { } /// Returns whether this span recognizes a particular span. 
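  /// For example, a `Text` that expects the text `null` and no byte range
  /// recognizes any span whose text is `null`; if a range is also set, the
  /// span's `start()..end()` must match it exactly.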
- fn recognizes(&self, span: Span, ctx: &Context) -> bool { - !self - .text - .as_ref() - .is_some_and(|text| text != span.text(ctx)) + fn recognizes(&self, span: Span) -> bool { + self.text.as_ref().is_none_or(|text| text == span.text()) && !self.range.as_ref().is_some_and(|range| { - let r = span.span(ctx); + let r = span.span(); range != &(r.start()..r.end()) }) } diff --git a/ilex/src/testing/recognize.rs b/ilex/src/testing/recognize.rs index b1a2626..dd7d495 100644 --- a/ilex/src/testing/recognize.rs +++ b/ilex/src/testing/recognize.rs @@ -8,7 +8,6 @@ use std::fmt::DebugStruct; use std::fmt::Display; use crate::f; -use crate::file::Context; use crate::file::Spanned; use crate::rule; use crate::spec::Lexeme; @@ -16,6 +15,7 @@ use crate::testing::Text; use crate::token; use crate::token::Any; use crate::token::Sign; +use crate::token::Token; pub struct Matcher { pub which: Option>, @@ -57,23 +57,12 @@ pub struct DigitalMatcher { } impl Matcher { - pub fn recognizes( - &self, - state: &mut MatchState, - tok: token::Any, - ctx: &Context, - ) { - state.match_spans("token span", &self.span, Spanned::span(&tok, ctx)); - - zip_eq( - "comments", - state, - &self.comments, - &tok.comments(ctx), - |state, t, s| { - state.match_spans("comment", t, s); - }, - ); + pub fn recognizes(&self, state: &mut MatchState, tok: token::Any) { + state.match_spans("token span", &self.span, Spanned::span(&tok)); + + zip_eq("comments", state, &self.comments, tok.comments(), |state, t, s| { + state.match_spans("comment", t, s); + }); match (&self.kind, tok) { (Kind::Eof, Any::Eof(..)) | (Kind::Keyword, Any::Keyword(..)) => {} @@ -83,7 +72,7 @@ impl Matcher { state.match_options("suffix", suffix.as_ref(), tok.suffix()); } (Kind::Quoted { delims, content, prefix, suffix }, Any::Quoted(tok)) => { - let (open, close) = tok.delimiters(); + let [open, close] = tok.delimiters(); state.match_spans("open quote", &delims.0, open); state.match_spans("close quote", &delims.1, close); state.match_options("prefix", prefix.as_ref(), tok.prefix()); @@ -162,7 +151,7 @@ impl Matcher { state, tokens, tok.contents(), - |state, ours, theirs| ours.recognizes(state, theirs, ctx), + |state, ours, theirs| ours.recognizes(state, theirs), ); } _ => state.error("mismatched token types"), @@ -240,17 +229,15 @@ impl fmt::Debug for Matcher { } } -pub struct MatchState<'a> { - ctx: &'a Context, +pub struct MatchState { errors: String, stack: Vec, error_count: usize, } -impl<'a> MatchState<'a> { - pub fn new(ctx: &'a Context) -> Self { +impl MatchState { + pub fn new() -> Self { Self { - ctx, errors: String::new(), stack: Vec::new(), error_count: 0, @@ -272,28 +259,30 @@ impl<'a> MatchState<'a> { let _ = writeln!(self.errors, ": {msg}"); } - fn match_spans(&mut self, what: &str, text: &Text, span: impl Spanned) { - let span = span.span(self.ctx); - if !text.recognizes(span, self.ctx) { + fn match_spans<'s>( + &mut self, + what: &str, + text: &Text, + span: impl Spanned<'s>, + ) { + let span = span.span(); + if !text.recognizes(span) { self.error(f!("wrong {what}; want {:?}, got {:?}", text, span)); } } - fn match_options( + fn match_options<'s>( &mut self, what: &str, text: Option<&Text>, - span: Option, + span: Option>, ) { - let span = span.map(|s| s.span(self.ctx)); + let span = span.map(|s| s.span()); if text.is_none() && span.is_none() { return; } - if !text - .zip(span) - .is_some_and(|(t, s)| t.recognizes(s, self.ctx)) - { + if !text.zip(span).is_some_and(|(t, s)| t.recognizes(s)) { self.error(f!("wrong {what}; want {:?}, got {:?}", text, 
span)); } } diff --git a/ilex/src/token/mod.rs index 2682a84..92c2742 100644 --- a/ilex/src/token/mod.rs +++ b/ilex/src/token/mod.rs @@ -11,7 +11,8 @@ //! value. They all implement [`Token`]. use std::fmt; -use std::marker::PhantomData; +use std::iter; +use std::num::NonZeroU32; use std::ops::RangeBounds; use std::panic::Location; @@ -22,13 +23,11 @@ use num_traits::Bounded; use crate::f; use crate::file::Context; use crate::file::Span; -use crate::file::SpanId; use crate::file::Spanned; use crate::fp; use crate::report::Report; use crate::rt; use crate::rt::DigitBlocks; -use crate::rt::Kind; use crate::rule; use crate::spec::Lexeme; use crate::spec::Spec; @@ -39,25 +38,72 @@ mod stream; pub use stream::switch::switch; pub use stream::switch::Switch; +pub use stream::Comments; pub use stream::Cursor; pub use stream::Stream; +/// A token ID. +/// +/// An [`Id`] is a lightweight handle to some token, which can be converted +/// back into that token using the corresponding [`Stream`]. +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug)] +pub struct Id(pub(crate) NonZeroU32); + +impl Id { + fn idx(self) -> usize { + self.0.get() as usize - 1 + } + + fn prev(self) -> Option<Self> { + NonZeroU32::new(self.0.get() - 1).map(Self) + } + + fn next(self) -> Option<Self> { + self.0.checked_add(1).map(Self) + } +} + /// A token type. All types in [`ilex::token`][crate::token] implement this /// trait. pub trait Token<'lex>: - Copy + Spanned + fmt::Debug + TryFrom<Any<'lex>> + Into<Any<'lex>> + Copy + Spanned<'lex> + fmt::Debug + TryFrom<Any<'lex>> + Into<Any<'lex>> { /// The rule this token was parsed from. type Rule: rule::Rule; + /// The ID of this token. + fn id(self) -> Id; + + /// The token stream that owns this token. + fn stream(self) -> &'lex Stream<'lex>; + /// The context that owns this token. - fn context(self) -> &'lex Context; + fn context(self) -> &'lex Context { + self.stream().context() + } /// The spec that lexed this token. - fn spec(self) -> &'lex Spec; + fn spec(self) -> &'lex Spec { + self.stream().spec() + } /// Returns this token's [`Lexeme`]. - fn lexeme(self) -> Lexeme<Self::Rule>; + fn lexeme(self) -> Lexeme<Self::Rule> { + self.stream().lookup_token(self.id()).lexeme.cast() + } + + /// Returns an iterator over the comments attached to this token. + fn comments(self) -> Comments<'lex> { + let stream = self.stream(); + Comments { + stream, + comments: stream + .lookup_meta(self.id()) + .map(|m| m.comments.as_slice()) + .unwrap_or(&[]) + .iter(), + } + } /// The rule inside of [`Token::spec()`] that this token refers to.
/// @@ -97,36 +143,25 @@ pub enum Any<'lex> { impl<'lex> Token<'lex> for Any<'lex> { type Rule = rule::Any; - fn lexeme(self) -> Lexeme { - match self { - Self::Eof(tok) => tok.lexeme().any(), - Self::Bracket(tok) => tok.lexeme().any(), - Self::Keyword(tok) => tok.lexeme().any(), - Self::Ident(tok) => tok.lexeme().any(), - Self::Digital(tok) => tok.lexeme().any(), - Self::Quoted(tok) => tok.lexeme().any(), - } - } - - fn context(self) -> &'lex Context { + fn id(self) -> Id { match self { - Self::Eof(tok) => tok.context(), - Self::Bracket(tok) => tok.context(), - Self::Keyword(tok) => tok.context(), - Self::Ident(tok) => tok.context(), - Self::Digital(tok) => tok.context(), - Self::Quoted(tok) => tok.context(), + Self::Eof(tok) => tok.id(), + Self::Bracket(tok) => tok.id(), + Self::Keyword(tok) => tok.id(), + Self::Ident(tok) => tok.id(), + Self::Digital(tok) => tok.id(), + Self::Quoted(tok) => tok.id(), } } - fn spec(self) -> &'lex Spec { + fn stream(self) -> &'lex Stream<'lex> { match self { - Self::Eof(tok) => tok.spec, - Self::Bracket(tok) => tok.spec, - Self::Keyword(tok) => tok.spec, - Self::Ident(tok) => tok.spec, - Self::Digital(tok) => tok.spec, - Self::Quoted(tok) => tok.spec, + Self::Eof(tok) => tok.stream(), + Self::Bracket(tok) => tok.stream(), + Self::Keyword(tok) => tok.stream(), + Self::Ident(tok) => tok.stream(), + Self::Digital(tok) => tok.stream(), + Self::Quoted(tok) => tok.stream(), } } @@ -215,15 +250,15 @@ impl fmt::Debug for Any<'_> { } } -impl Spanned for Any<'_> { - fn span(&self, ctx: &Context) -> Span { +impl<'lex> Spanned<'lex> for Any<'lex> { + fn span(&self) -> Span<'lex> { match self { - Self::Eof(tok) => tok.span(ctx), - Self::Keyword(tok) => tok.span(ctx), - Self::Bracket(tok) => tok.span(ctx), - Self::Ident(tok) => tok.span(ctx), - Self::Quoted(tok) => tok.span(ctx), - Self::Digital(tok) => tok.span(ctx), + Self::Eof(tok) => tok.span(), + Self::Keyword(tok) => tok.span(), + Self::Bracket(tok) => tok.span(), + Self::Ident(tok) => tok.span(), + Self::Quoted(tok) => tok.span(), + Self::Digital(tok) => tok.span(), } } } @@ -236,20 +271,27 @@ impl Spanned for Any<'_> { /// comments within. #[derive(Copy, Clone)] pub struct Eof<'lex> { - span: SpanId, - ctx: &'lex Context, - spec: &'lex Spec, + stream: &'lex Stream<'lex>, + id: Id, } impl<'lex> Token<'lex> for Eof<'lex> { type Rule = rule::Eof; + fn id(self) -> Id { + self.id + } + + fn stream(self) -> &'lex Stream<'lex> { + self.stream + } + fn context(self) -> &'lex Context { - self.ctx + self.stream.context() } fn spec(self) -> &'lex Spec { - self.spec + self.stream.spec() } fn lexeme(self) -> Lexeme { @@ -277,13 +319,13 @@ impl<'lex> TryFrom> for Eof<'lex> { impl fmt::Debug for Eof<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "Eof({:?})", self.span) + write!(f, "Eof({:?})", self.span()) } } -impl Spanned for Eof<'_> { - fn span(&self, ctx: &Context) -> Span { - self.span.span(ctx) +impl<'lex> Spanned<'lex> for Eof<'lex> { + fn span(&self) -> Span<'lex> { + self.stream.lookup_span_no_affix(self.id) } } @@ -294,26 +336,19 @@ impl Spanned for Eof<'_> { /// fixed string. 
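/// For example, in a JSON-like spec, `true`, `false`, and `null` would each
/// lex as a `Keyword`.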
#[derive(Copy, Clone)] pub struct Keyword<'lex> { - lexeme: Lexeme<rule::Keyword>, - ctx: &'lex Context, - spec: &'lex Spec, - span: SpanId, - _ph: PhantomData<&'lex rt::Token>, + stream: &'lex Stream<'lex>, + id: Id, } impl<'lex> Token<'lex> for Keyword<'lex> { type Rule = rule::Keyword; - fn context(self) -> &'lex Context { - self.ctx - } - - fn spec(self) -> &'lex Spec { - self.spec + fn id(self) -> Id { + self.id } - fn lexeme(self) -> Lexeme<rule::Keyword> { - self.lexeme + fn stream(self) -> &'lex Stream<'lex> { + self.stream } #[doc(hidden)] @@ -337,13 +372,13 @@ impl<'lex> TryFrom<Any<'lex>> for Keyword<'lex> { impl fmt::Debug for Keyword<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "Keyword({:?})", self.span) + write!(f, "Keyword({:?})", self.span()) } } -impl Spanned for Keyword<'_> { - fn span(&self, ctx: &Context) -> Span { - self.span.span(ctx) +impl<'lex> Spanned<'lex> for Keyword<'lex> { + fn span(&self) -> Span<'lex> { + self.stream.lookup_span_no_affix(self.id) } } @@ -354,29 +389,25 @@ impl Spanned for Keyword<'_> { /// *trees*, like Rust does. #[derive(Copy, Clone)] pub struct Bracket<'lex> { - span: SpanId, - open: SpanId, - close: SpanId, - lexeme: Lexeme<rule::Bracket>, - ctx: &'lex Context, - spec: &'lex Spec, + open: Id, + close: Id, contents: Cursor<'lex>, } impl<'lex> Bracket<'lex> { /// Returns this token's open delimiter. - pub fn open(self) -> SpanId { - self.open + pub fn open(self) -> Span<'lex> { + self.contents.stream().lookup_span_no_affix(self.open) } /// Returns this token's close delimiter. - pub fn close(self) -> SpanId { - self.close + pub fn close(self) -> Span<'lex> { + self.contents.stream().lookup_span_no_affix(self.close) } /// Returns this token's quote delimiters. - pub fn delimiters(self) -> [SpanId; 2] { - [self.open, self.close] + pub fn delimiters(self) -> [Span<'lex>; 2] { + [self.open(), self.close()] } /// Returns a cursor over this bracket's internal tokens (not including the @@ -391,16 +422,12 @@ impl<'lex> Token<'lex> for Bracket<'lex> { type Rule = rule::Bracket; - fn context(self) -> &'lex Context { - self.ctx - } - - fn spec(self) -> &'lex Spec { - self.spec + fn id(self) -> Id { + self.open } - fn lexeme(self) -> Lexeme<rule::Bracket> { - self.lexeme + fn stream(self) -> &'lex Stream<'lex> { + self.contents().stream() } #[doc(hidden)] @@ -433,69 +460,62 @@ impl<'lex> IntoIterator for Bracket<'lex> { impl fmt::Debug for Bracket<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { f.debug_struct("Bracket") - .field("delimiters", &f!("({:?}, {:?})", self.open, self.close)) + .field("delimiters", &f!("({:?}, {:?})", self.open(), self.close())) .field("contents", &self.contents) .finish() } } -impl Spanned for Bracket<'_> { - fn span(&self, ctx: &Context) -> Span { - self.span.span(ctx) +impl<'lex> Spanned<'lex> for Bracket<'lex> { + fn span(&self) -> Span<'lex> { + let [a, b] = self.delimiters(); + self.contents.stream().file().span(a.start()..b.end()) } } /// An identifier, i.e., a self-delimiting word like `foo` or `黒猫`. #[derive(Copy, Clone)] pub struct Ident<'lex> { - tok: &'lex rt::Token, - ctx: &'lex Context, - spec: &'lex Spec, + stream: &'lex Stream<'lex>, + id: Id, } impl<'lex> Ident<'lex> { /// Returns this token's name span. - pub fn name(self) -> SpanId { - match &self.tok.kind { - &Kind::Ident(name) => name, - _ => panic!("non-lexer::Kind::Ident inside of Ident"), - } + pub fn name(self) -> Span<'lex> { + self.stream.lookup_span_no_affix(self.id) } /// Returns this token's prefix.
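  /// For example, under an identifier rule configured with a `%` prefix (as
  /// an LLVM-style spec might do), `prefix()` on `%foo` returns the span of
  /// the `%`.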
- pub fn prefix(self) -> Option<SpanId> { - self.tok.prefix + pub fn prefix(self) -> Option<Span<'lex>> { + self.stream.lookup_prefix(self.id) } /// Checks whether this identifier has a particular prefix. pub fn has_prefix(&self, expected: &str) -> bool { - self.prefix().is_some_and(|s| s.text(self.ctx) == expected) + self.prefix().is_some_and(|s| s.text() == expected) } /// Returns this token's suffix. - pub fn suffix(&self) -> Option<SpanId> { - self.tok.suffix + pub fn suffix(&self) -> Option<Span<'lex>> { + self.stream.lookup_suffix(self.id) } /// Checks whether this identifier has a particular suffix. pub fn has_suffix(&self, expected: &str) -> bool { - self.suffix().is_some_and(|s| s.text(self.ctx) == expected) + self.suffix().is_some_and(|s| s.text() == expected) } } impl<'lex> Token<'lex> for Ident<'lex> { type Rule = rule::Ident; - fn context(self) -> &'lex Context { - self.ctx + fn id(self) -> Id { + self.id } - fn spec(self) -> &'lex Spec { - self.spec - } - - fn lexeme(self) -> Lexeme<rule::Ident> { - self.tok.lexeme.cast() + fn stream(self) -> &'lex Stream<'lex> { + self.stream } #[doc(hidden)] @@ -520,11 +540,11 @@ impl<'lex> TryFrom<Any<'lex>> for Ident<'lex> { impl fmt::Debug for Ident<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { if self.prefix().is_none() && self.suffix().is_none() { - return write!(f, "Ident({:?})", self.tok.span); + return write!(f, "Ident({:?})", self.name()); } let mut f = f.debug_struct("Ident"); - f.field("span", &self.tok.span).field("name", &self.name()); + f.field("span", &self.span()).field("name", &self.name()); if let Some(prefix) = self.prefix() { f.field("prefix", &prefix); @@ -538,9 +558,9 @@ } } -impl Spanned for Ident<'_> { - fn span(&self, ctx: &Context) -> Span { - self.tok.span.span(ctx) +impl<'lex> Spanned<'lex> for Ident<'lex> { + fn span(&self) -> Span<'lex> { + self.stream.lookup_span_with_affixes(self.id) } } @@ -561,10 +581,10 @@ /// others). #[derive(Copy, Clone)] pub struct Digital<'lex> { - tok: &'lex rt::Token, + stream: &'lex Stream<'lex>, + id: Id, + meta: &'lex rt::Digital, idx: usize, - ctx: &'lex Context, - spec: &'lex Spec, } impl<'lex> Digital<'lex> { @@ -589,13 +609,13 @@ } /// Returns the span corresponding to [`Digital::sign()`]. - pub fn sign_span(self) -> Option<SpanId> { - self.rt_blocks().sign.map(|(_, sp)| sp) + pub fn sign_span(self) -> Option<Span<'lex>> { + self.rt_blocks().sign(self.file()) } /// Returns the point-separated digit chunks of this digital literal. - pub fn digit_blocks(self) -> impl Iterator<Item = SpanId> + 'lex { - self.digit_slice().iter().copied() + pub fn digit_blocks(self) -> impl Iterator<Item = Span<'lex>> + 'lex { + self.rt_blocks().blocks(self.file()) } /// Returns the exponents of this digital literal, if it has any. /// /// Calling `exponents()` on any of the returned tokens will yield all /// exponents that follow. pub fn exponents(self) -> impl Iterator<Item = Digital<'lex>> { - (self.idx..self.exponent_slice().len()).map(move |idx| Self { - tok: self.tok, - ctx: self.ctx, + (self.idx..self.meta.exponents.len()).map(move |idx| Self { + stream: self.stream, + id: self.id, + meta: self.meta, idx: idx + 1, - spec: self.spec, }) } /// Returns this token's prefix. - pub fn prefix(self) -> Option<SpanId> { + pub fn prefix(self) -> Option<Span<'lex>> { if self.idx > 0 { - return self.rt_blocks().prefix; + return self.rt_blocks().prefix(self.file()); } - self.tok.prefix + self.stream.lookup_prefix(self.id) } /// Checks whether this token has a particular prefix.
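  /// For example, with a digital rule that declares a `0x` prefix,
  /// `has_prefix("0x")` holds for the literal `0xdead`.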
pub fn has_prefix(&self, expected: &str) -> bool { - self.prefix().is_some_and(|s| s.text(self.ctx) == expected) + self.prefix().is_some_and(|s| s.text() == expected) } /// Returns this token's suffix. - pub fn suffix(&self) -> Option<SpanId> { + pub fn suffix(&self) -> Option<Span<'lex>> { if self.idx > 0 { // Exponent tokens never have a suffix. return None; } - self.tok.suffix + self.stream.lookup_suffix(self.id) } /// Checks whether this token has a particular suffix. pub fn has_suffix(&self, expected: &str) -> bool { - self.suffix().is_some_and(|s| s.text(self.ctx) == expected) + self.suffix().is_some_and(|s| s.text() == expected) } /// Parses this token as an integer. @@ -653,7 +673,7 @@ N: Bounded + PartialOrd + FromRadix + fmt::Display, { for extra in self.digit_blocks().skip(1) { - report.builtins(self.spec).unexpected( + report.builtins(self.spec()).unexpected( "extra digits", self.lexeme(), extra, @@ -662,7 +682,7 @@ for extra in self.exponents() { report - .builtins(self.spec) + .builtins(self.spec()) .unexpected("exponent", self.lexeme(), extra); } @@ -686,7 +706,7 @@ self .digit_blocks() .map(|span| { - let text = span.text(self.ctx); + let text = span.text(); let buf; let text = if !rule.separator.is_empty() && text.contains(&*rule.separator) { @@ -738,7 +758,7 @@ range: impl RangeBounds<Fp>, report: &Report, ) -> Result<Fp, Fatal> { - let fp: Fp = self.parse_fp(self.ctx, report, false)?; + let fp: Fp = self.parse_fp(report, false)?; if !fp.__is_finite() || !range.contains(&fp) { report.builtins(self.spec()).literal_out_of_range( @@ -764,7 +784,7 @@ range: impl RangeBounds<Fp>, report: &Report, ) -> Result<Fp, Fatal> { - let fp: Fp = self.parse_fp(self.ctx, report, true)?; + let fp: Fp = self.parse_fp(report, true)?; if !fp.__is_finite() || !range.contains(&fp) { report.builtins(self.spec()).literal_out_of_range( @@ -788,23 +808,11 @@ } } - fn digit_slice(self) -> &'lex [SpanId] { - &self.rt_blocks().blocks - } - - fn exponent_slice(self) -> &'lex [DigitBlocks] { - match &self.tok.kind { - Kind::Digital { exponents, .. } => exponents, - _ => panic!("non-lexer::Kind::Digital inside of Digital"), - } - } - fn rt_blocks(&self) -> &'lex DigitBlocks { - match &self.tok.kind { - Kind::Digital { digits, .. } if self.idx == 0 => digits, - Kind::Digital { exponents, .. } => &exponents[self.idx - 1], - _ => panic!("non-lexer::Kind::Digital inside of Digital"), + if self.idx == 0 { + return &self.meta.digits; } + &self.meta.exponents[self.idx - 1] } } @@ -894,16 +902,12 @@ impl_radix! { impl<'lex> Token<'lex> for Digital<'lex> { type Rule = rule::Digital; - fn context(self) -> &'lex Context { - self.ctx - } - - fn spec(self) -> &'lex Spec { - self.spec + fn id(self) -> Id { + self.id } - fn lexeme(self) -> Lexeme<rule::Digital> { - self.tok.lexeme.cast() + fn stream(self) -> &'lex Stream<'lex> { + self.stream } #[doc(hidden)] @@ -928,9 +932,10 @@ impl<'lex> TryFrom<Any<'lex>> for Digital<'lex> { impl fmt::Debug for Digital<'_> { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { let mut f = f.debug_struct("Digital"); - f.field("span", &self.tok.span) + f.field("span", &self.span()) .field("radix", &self.radix()) - .field("digits", &self.digit_slice()); + // TODO: Get rid of this collect.
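+      // (The iterator returned by digit_blocks() does not implement Debug
+      // itself, so its spans are collected into a Vec for printing.)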
+ .field("digits", &self.digit_blocks().collect::>()); if let Some(sign) = self.sign() { f.field("sign", &sign); @@ -952,70 +957,120 @@ impl fmt::Debug for Digital<'_> { } } -impl Spanned for Digital<'_> { - fn span(&self, ctx: &Context) -> Span { - self.tok.span.span(ctx) +impl<'lex> Spanned<'lex> for Digital<'lex> { + fn span(&self) -> Span<'lex> { + self.stream.lookup_span_with_affixes(self.id) } } /// A quoted literal. #[derive(Copy, Clone)] pub struct Quoted<'lex> { - tok: &'lex rt::Token, - ctx: &'lex Context, - spec: &'lex Spec, + stream: &'lex Stream<'lex>, + id: Id, + meta: &'lex rt::Quoted, } impl<'lex> Quoted<'lex> { /// Returns this token's open delimiter. - pub fn open(self) -> SpanId { - self.delimiters().0 + pub fn open(self) -> Span<'lex> { + self.delimiters()[0] } /// Returns this token's close delimiter. - pub fn close(self) -> SpanId { - self.delimiters().0 + pub fn close(self) -> Span<'lex> { + self.delimiters()[1] } /// Returns this token's quote delimiters. - pub fn delimiters(self) -> (SpanId, SpanId) { - match &self.tok.kind { - &Kind::Quoted { open, close, .. } => (open, close), - _ => panic!("non-lexer::Kind::Quoted inside of Quoted"), - } + pub fn delimiters(self) -> [Span<'lex>; 2] { + let span = self.stream.lookup_span_no_affix(self.id); + [ + self + .stream + .file() + .span(span.start()..*self.meta.marks.first().unwrap() as usize), + self + .stream + .file() + .span(*self.meta.marks.last().unwrap() as usize..span.end()), + ] } /// Returns the raw content of this token. /// /// There are two kinds of content: either a literal span of Unicode scalars - /// (represented as a [`SpanId`] pointing to those characters) or a single + /// (represented as a [`Span`] pointing to those characters) or a single /// escape, potentially with some side data. /// /// It is up to the user of the library to decode these two content types into /// strings. [`Quoted::to_utf8()`] helps with the common case of doing this for /// UTF-8 strings. - pub fn raw_content(self) -> impl Iterator + 'lex { - self.content_slice().iter().copied() + pub fn raw_content(self) -> impl Iterator>> + 'lex { + let file = self.stream.file(); + let mut next = self.meta.marks[0]; + let mut is_escape = false; + let mut marks = &self.meta.marks[1..]; + + iter::from_fn(move || loop { + return match is_escape { + false => { + let start = next; + let &[end, ref rest @ ..] = marks else { + return None; + }; + + next = end; + marks = rest; + is_escape = true; + + if start == end { + continue; + } + + let span = file.span(start as usize..end as usize); + Some(Content::Lit(span)) + } + true => { + let start = next; + let &[esc_end, data_start, data_end, end, ref rest @ ..] = marks + else { + return None; + }; + + next = end; + marks = rest; + is_escape = false; + + let span = file.span(start as usize..esc_end as usize); + let data = (data_start != data_end) + .then(|| file.span(data_start as usize..data_end as usize)); + Some(Content::Esc(span, data)) + } + }; + }) } - /// Returns the unique single [`Content`] of this token, if it is unique. - pub fn unique_content(self) -> Option { - match self.content_slice() { - [unique] => Some(*unique), - _ => None, + /// Returns the unique single literal content of this token, if it is unique. 
+ pub fn literal(self) -> Option<Span<'lex>> { + if self.meta.marks.len() > 2 { + return None; } + let start = *self.meta.marks.first().unwrap(); + let end = *self.meta.marks.last().unwrap(); + Some(self.stream.file().span(start as usize..end as usize)) } /// Constructs a UTF-8 string in the "obvious way", using this token and a /// mapping function for escapes. pub fn to_utf8( self, - mut decode_esc: impl FnMut(SpanId, Option<SpanId>, &mut String), + mut decode_esc: impl FnMut(Span, Option<Span>, &mut String), ) -> String { let total = self .raw_content() .map(|c| match c { - Content::Lit(sp) => sp.text(self.ctx).len(), + Content::Lit(sp) => sp.text().len(), Content::Esc(..) => 1, }) .sum(); @@ -1023,38 +1078,31 @@ let mut buf = String::with_capacity(total); for chunk in self.raw_content() { match chunk { - Content::Lit(sp) => buf.push_str(sp.text(self.ctx)), + Content::Lit(sp) => buf.push_str(sp.text()), Content::Esc(sp, data) => decode_esc(sp, data, &mut buf), } } buf } - fn content_slice(self) -> &'lex [Content] { - match &self.tok.kind { - Kind::Quoted { content, .. } => content, - _ => panic!("non-lexer::Kind::Quoted inside of Quoted"), - } - } - /// Returns this token's prefix. - pub fn prefix(self) -> Option<SpanId> { - self.tok.prefix + pub fn prefix(self) -> Option<Span<'lex>> { + self.stream.lookup_prefix(self.id) } /// Checks whether this token has a particular prefix. - pub fn has_prefix(self, expected: &str) -> bool { - self.prefix().is_some_and(|s| s.text(self.ctx) == expected) + pub fn has_prefix(&self, expected: &str) -> bool { + self.prefix().is_some_and(|s| s.text() == expected) } /// Returns this token's suffix. - pub fn suffix(self) -> Option<SpanId> { - self.tok.suffix + pub fn suffix(&self) -> Option<Span<'lex>> { + self.stream.lookup_suffix(self.id) } /// Checks whether this token has a particular suffix. - pub fn has_suffix(self, expected: &str) -> bool { - self.suffix().is_some_and(|s| s.text(self.ctx) == expected) + pub fn has_suffix(&self, expected: &str) -> bool { + self.suffix().is_some_and(|s| s.text() == expected) } } @@ -1063,31 +1111,28 @@ /// The "span type" is configurable; this type is used by multiple parts of /// the library. #[derive(Copy, Clone, Debug)] -pub enum Content { +pub enum Content<Span> { /// A literal chunk, i.e. UTF-8 text directly from the source file. - Lit(SpanId), + Lit(Span), /// An escape sequence, which may have associated data (e.g. the `NN` from a /// `\xNN`). - Esc(SpanId, Option<SpanId>), + Esc(Span, Option<Span>), } -impl Content { +impl<Span> Content<Span> { /// Literal contents. - pub fn lit(chunk: impl Into<SpanId>) -> Self { + pub fn lit(chunk: impl Into<Span>) -> Self { Self::Lit(chunk.into()) } /// Escaped contents. - pub fn esc(chunk: impl Into<SpanId>) -> Self { + pub fn esc(chunk: impl Into<Span>) -> Self { Self::Esc(chunk.into(), None) } /// Escaped contents with associated data.
- pub fn esc_with_data( - chunk: impl Into, - data: impl Into, - ) -> Self { + pub fn esc_with_data(chunk: impl Into, data: impl Into) -> Self { Self::Esc(chunk.into(), Some(data.into())) } } @@ -1095,16 +1140,12 @@ impl Content { impl<'lex> Token<'lex> for Quoted<'lex> { type Rule = rule::Quoted; - fn context(self) -> &'lex Context { - self.ctx + fn id(self) -> Id { + self.id } - fn spec(self) -> &'lex Spec { - self.spec - } - - fn lexeme(self) -> Lexeme { - self.tok.lexeme.cast() + fn stream(self) -> &'lex Stream<'lex> { + self.stream } #[doc(hidden)] @@ -1129,9 +1170,10 @@ impl<'lex> TryFrom> for Quoted<'lex> { impl fmt::Debug for Quoted<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { let mut f = f.debug_struct("Quoted"); - f.field("span", &self.tok.span) + f.field("span", &self.span()) .field("delimiters", &self.delimiters()) - .field("content", &self.content_slice()); + // TODO: get rid of this collect(). + .field("content", &self.raw_content().collect::>()); if let Some(prefix) = self.prefix() { f.field("prefix", &prefix); @@ -1145,24 +1187,20 @@ impl fmt::Debug for Quoted<'_> { } } -impl Spanned for Quoted<'_> { - fn span(&self, ctx: &Context) -> Span { - self.tok.span.span(ctx) +impl<'lex> Spanned<'lex> for Quoted<'lex> { + fn span(&self) -> Span<'lex> { + self.stream.lookup_span_with_affixes(self.id) } } impl<'lex> Token<'lex> for Never { type Rule = Never; - fn context(self) -> &'lex Context { - self.from_nothing_anything() - } - - fn spec(self) -> &'lex Spec { + fn id(self) -> Id { self.from_nothing_anything() } - fn lexeme(self) -> Lexeme { + fn stream(self) -> &'lex Stream<'lex> { self.from_nothing_anything() } @@ -1193,19 +1231,18 @@ impl<'lex> Any<'lex> { return name.to_box(); } - let ctx = self.context(); let (pre, suf, kind) = match self { Any::Eof(_) => return yarn!(""), - Any::Keyword(tok) => return yarn!("`{}`", tok.text(ctx)), + Any::Keyword(tok) => return yarn!("`{}`", tok.text()), Any::Bracket(d) => { - return yarn!("`{} ... {}`", d.open().text(ctx), d.close().text(ctx)); + return yarn!("`{} ... 
{}`", d.open().text(), d.close().text()); } Any::Quoted(tok) => { - let pre = tok.prefix().map(|s| s.text(ctx)).unwrap_or(""); - let suf = tok.suffix().map(|s| s.text(ctx)).unwrap_or(""); - let open = tok.open().text(ctx); - let close = tok.close().text(ctx); + let pre = tok.prefix().map(|s| s.text()).unwrap_or(""); + let suf = tok.suffix().map(|s| s.text()).unwrap_or(""); + let open = tok.open().text(); + let close = tok.close().text(); return yarn!("`{pre}{open}...{close}{suf}`"); } @@ -1213,7 +1250,7 @@ impl<'lex> Any<'lex> { Any::Digital(tok) => (tok.prefix(), tok.suffix(), "number"), }; - match (pre.map(|s| s.text(ctx)), suf.map(|s| s.text(ctx))) { + match (pre.map(|s| s.text()), suf.map(|s| s.text())) { (Some(pre), Some(suf)) => { yarn!("`{pre}`-prefixed, `{suf}`-suffixed {kind}") } diff --git a/ilex/src/token/stream.rs b/ilex/src/token/stream.rs index 9794280..e25c7ac 100644 --- a/ilex/src/token/stream.rs +++ b/ilex/src/token/stream.rs @@ -1,15 +1,15 @@ -use std::array; use std::fmt; use std::iter; -use std::marker::PhantomData; use std::mem; +use std::num::NonZeroU32; +use std::slice; use crate::file::Context; use crate::file::File; -use crate::file::SpanId; +use crate::file::Span; use crate::report::Report; use crate::rt; -use crate::rt::Kind; +use crate::rule; use crate::rule::Rule; use crate::spec::Lexeme; use crate::spec::Spec; @@ -22,17 +22,21 @@ use crate::token; pub struct Stream<'ctx> { pub(crate) file: File<'ctx>, pub(crate) spec: &'ctx Spec, + pub(crate) toks: Vec, + pub(crate) meta_idx: Vec, + pub(crate) meta: Vec, } impl<'ctx> Stream<'ctx> { /// Returns a cursor over this stream. pub fn cursor(&self) -> Cursor { Cursor { - file: self.file, - spec: self.spec, - toks: &self.toks, + stream: self, + start: 0, + end: self.toks.len(), cursor: 0, + meta_cursor: 0, } } @@ -50,6 +54,171 @@ impl<'ctx> Stream<'ctx> { pub fn spec(&self) -> &'ctx Spec { self.spec } + + /// Returns the token with the given ID. + /// + /// # Panics + /// + /// Panics if this stream does not have a token with the given ID. + pub fn token_at(&self, id: token::Id) -> token::Any { + let meta_hint = self.meta_idx.binary_search(&id).unwrap_or(0); + self.token_at_hint(id, meta_hint).unwrap() + } + + /// Returns the last token pushed to this stream. + pub(crate) fn last_token(&self) -> token::Any { + let mut cursor = self.cursor(); + cursor.cursor = cursor.end; + cursor.meta_cursor = self.meta_idx.len(); + loop { + cursor.step_backward(); + let tok = self.lookup_token(cursor.id()); + if tok.lexeme.is_aux() { + continue; + } + + return self.token_at_hint(cursor.id(), cursor.meta_cursor).unwrap(); + } + } + + pub(crate) fn token_at_hint( + &self, + id: token::Id, + meta_hint: usize, + ) -> Option { + let tok = &self.toks[id.idx()]; + let meta = self + .lookup_meta_hint(id, meta_hint) + .and_then(|m| m.kind.as_ref()); + + if [rt::PREFIX, rt::SUFFIX, rt::WHITESPACE, rt::UNEXPECTED] + .contains(&tok.lexeme) + { + return None; + } + + if tok.lexeme == Lexeme::eof().any() { + return Some(token::Eof { stream: self, id }.into()); + } + + Some(match self.spec().rule(tok.lexeme) { + rule::Any::Comment(..) => return None, + rule::Any::Keyword(..) => token::Keyword { stream: self, id }.into(), + rule::Any::Ident(..) => token::Ident { stream: self, id }.into(), + + rule::Any::Bracket(..) => { + let Some(&rt::Kind::Offset { cursor, .. 
}) = meta else { + bug!("missing rt::Metadata::Offset on bracket token") + }; + let open = id; + let close = token::Id( + NonZeroU32::new(id.0.get().wrapping_add_signed(cursor)).unwrap(), + ); + + token::Bracket { + open, + close, + contents: Cursor { + stream: self, + start: open.idx() + 1, + end: close.idx(), + cursor: open.idx() + 1, + meta_cursor: meta_hint + 1, + }, + } + .into() + } + + crate::rule::Any::Quoted(..) => { + let Some(rt::Kind::Quoted(meta)) = meta else { + bug!("missing rt::Metadata::Quoted on quoted token") + }; + + token::Quoted { stream: self, id, meta }.into() + } + + crate::rule::Any::Digital(..) => { + let Some(rt::Kind::Digital(meta)) = meta else { + bug!("missing rt::Metadata::Digital on digital token") + }; + + token::Digital { stream: self, id, meta, idx: 0 }.into() + } + }) + } + + pub(crate) fn lookup_meta(&self, id: token::Id) -> Option<&rt::Metadata> { + let idx = self.meta_idx.binary_search(&id).ok()?; + Some(&self.meta[idx]) + } + + pub(crate) fn lookup_meta_hint( + &self, + id: token::Id, + hint: usize, + ) -> Option<&rt::Metadata> { + if self.meta_idx.get(hint) != Some(&id) { + return None; + } + + Some(&self.meta[hint]) + } + + pub(crate) fn lookup_token(&self, id: token::Id) -> &rt::Token { + &self.toks[id.idx()] + } + + pub(crate) fn lookup_span_no_affix(&self, id: token::Id) -> Span { + let start = self + .toks + .get(id.idx().wrapping_sub(1)) + .map(|t| t.end as usize) + .unwrap_or(0); + let end = self.lookup_token(id).end as usize; + self.file().span(start..end) + } + + pub(crate) fn lookup_prefix(&self, id: token::Id) -> Option { + let prev = id.prev()?; + if self.lookup_token(prev).lexeme != rt::PREFIX { + return None; + } + Some(self.lookup_span_no_affix(prev)) + } + + pub(crate) fn lookup_suffix(&self, id: token::Id) -> Option { + let next = id.next()?; + if next.idx() == self.toks.len() + || self.lookup_token(next).lexeme != rt::SUFFIX + { + return None; + } + Some(self.lookup_span_no_affix(next)) + } + + pub(crate) fn lookup_span_with_affixes(&self, id: token::Id) -> Span { + let span = self.lookup_span_no_affix(id); + + let mut start = span.start(); + if let Some(prefix) = self.lookup_prefix(id) { + start = prefix.start() + } + + let mut end = span.end(); + if let Some(suffix) = self.lookup_suffix(id) { + end = suffix.end(); + } + + self.file.span(start..end) + } + + pub(crate) fn last_meta(&self) -> Option<&rt::Metadata> { + self.meta.last() + } + + pub(crate) fn last_meta_mut(&mut self) -> Option<&mut rt::Metadata> { + self.meta.last_mut() + } } impl<'lex> IntoIterator for &'lex Stream<'_> { @@ -73,35 +242,56 @@ impl fmt::Debug for Stream<'_> { /// also be queried for more specific token kinds. #[derive(Copy, Clone)] pub struct Cursor<'lex> { - file: File<'lex>, - spec: &'lex Spec, - toks: &'lex [rt::Token], + stream: &'lex Stream<'lex>, + + // These are the range within `stream.toks` that we're allowed to yield. + start: usize, + end: usize, + + // This is the position of the cursor in `stream.toks`. cursor: usize, + + // This points to a value in `stream.meta_idx` whose `idx()` is greater than + // or equal to that of cursor; when `stream.toks[cursor]` is a token with + // metadata, this points to its metadata. When advancing, if + // + // ``` + // stream.meta_idx[meta_cursor].idx() == cursor + // ``` + // + // then we advance meta_cursor too. 
When backing up, we back up meta_cursor + // if + // + // ``` + // stream.meta_idx[meta_cursor - 1].idx() == cursor - 1 + // ``` + meta_cursor: usize, } impl<'lex> Cursor<'lex> { - fn end(&self) -> SpanId { - self.toks.last().unwrap().span + /// Returns the stream this cursor runs over. + pub fn stream(&self) -> &'lex Stream<'lex> { + self.stream } /// Returns the source code context this stream is associated with. pub fn context(&self) -> &'lex Context { - self.file.context() + self.stream.context() } /// Returns the file this stream was lexed from. pub fn file(&self) -> File<'lex> { - self.file + self.stream.file() } /// Returns the lexer spec this stream was lexed with. pub fn spec(&self) -> &'lex Spec { - self.spec + self.stream.spec() } /// Returns whether this cursor has yielded all of its tokens. pub fn is_empty(&self) -> bool { - self.cursor >= self.toks.len() + self.cursor >= self.end } /// Returns the next token under the cursor without consuming it. @@ -117,12 +307,7 @@ impl<'lex> Cursor<'lex> { /// Panics if this causes the internal cursor to underflow. pub fn back_up(&mut self, count: usize) { for _ in 0..count { - assert!(self.cursor > 0, "cursor underflowed"); - self.cursor -= 1; - - if let Kind::Close { offset_to_open, .. } = &self.toks[self.cursor].kind { - self.cursor -= *offset_to_open as usize; - } + assert!(self.step_backward(), "underflow attempting to back up cursor") } } @@ -131,7 +316,7 @@ impl<'lex> Cursor<'lex> { pub fn expect_finished(&self, report: &Report) { if let Some(next) = self.peek_any() { report - .builtins(self.spec) + .builtins(self.spec()) .expected([Lexeme::eof()], next, self.end()); } } @@ -189,7 +374,7 @@ impl<'lex> Cursor<'lex> { &'a mut self, delim: Lexeme, mut cb: impl FnMut(&mut Self) -> Option + 'a, - ) -> impl Iterator>)> + '_ { + ) -> impl Iterator>)> + 'a { let mut sep = switch::switch().case(delim, |x, _| x); let mut done = false; let mut prev = self.cursor; @@ -218,26 +403,91 @@ impl<'lex> Cursor<'lex> { }) } - pub(crate) fn fake_token( - file: File<'lex>, - spec: &'lex Spec, - tok: &'lex rt::Token, - ) -> token::Any<'lex> { - Self { - file, - spec, - toks: array::from_ref(tok), - cursor: 0, + // pub(crate) fn fake_token( + // file: File<'lex>, + // spec: &'lex Spec, + // tok: &'lex rt::Token, + // ) -> token::Any<'lex> { + // Self { + // file, + // spec, + // toks: array::from_ref(tok), + // cursor: 0, + // } + // .next() + // .unwrap() + // } + + fn id(&self) -> token::Id { + token::Id(NonZeroU32::new(self.cursor as u32 + 1).unwrap()) + } + + fn step_forward(&mut self) -> bool { + if self.cursor >= self.end { + return false; + } + + // Step past an open token. This will result in the cursor pointing to + // one-past the end token. + if let Some(&rt::Kind::Offset { cursor, meta }) = self.kind() { + self.cursor = self.cursor.wrapping_add_signed(cursor as isize); + self.meta_cursor = self.meta_cursor.wrapping_add_signed(meta as isize); + } + + if let Some(id) = self.stream.meta_idx.get(self.meta_cursor) { + if id.idx() == self.cursor { + self.meta_cursor += 1; + } + } + + self.cursor += 1; + true + } + + fn step_backward(&mut self) -> bool { + if self.cursor <= self.start { + return false; + } + + if let Some(id) = self.stream.meta_idx.get(self.meta_cursor.wrapping_sub(1)) + { + if id.idx() == self.cursor.wrapping_sub(1) { + self.meta_cursor -= 1; + } } - .next() - .unwrap() + + self.cursor -= 1; + + // Step back from a close token. This will result in the cursor pointing to + // the open token. 
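+    // For example, with `( a )` stored as [open, a, close], backing up from
+    // one-past-the-close first lands on the close token, and the offset
+    // below then jumps the cursor to the matching open.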
+ if let Some(&rt::Kind::Offset { cursor, meta }) = self.kind() { + self.cursor = self.cursor.wrapping_add_signed(cursor as isize); + self.meta_cursor = self.meta_cursor.wrapping_add_signed(meta as isize); + } + + true + } + + fn kind(&self) -> Option<&'lex rt::Kind> { + self + .stream + .lookup_meta_hint(self.id(), self.meta_cursor) + .and_then(|m| m.kind.as_ref()) + } + + fn end(&self) -> Span { + let end = self + .stream() + .lookup_token(token::Id(NonZeroU32::new(self.end as u32 + 1).unwrap())) + .end as usize; + self.file().span(end..end) } } impl fmt::Debug for Cursor<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let mut copy = *self; - copy.cursor = 0; + copy.cursor = copy.start; let mut list = f.debug_list(); for (i, tok) in copy.enumerate() { @@ -262,104 +512,40 @@ impl fmt::Debug for Cursor<'_> { impl<'lex> Iterator for Cursor<'lex> { type Item = token::Any<'lex>; fn next(&mut self) -> Option { - let tok = self.toks.get(self.cursor)?; - let next = match &tok.kind { - Kind::Eof => { - self.cursor += 1; - token::Any::Eof(token::Eof { - span: tok.span, - ctx: self.context(), - spec: self.spec, - }) - } - - Kind::Keyword => { - self.cursor += 1; - token::Any::Keyword(token::Keyword { - lexeme: tok.lexeme.cast(), - ctx: self.context(), - spec: self.spec, - span: tok.span, - _ph: PhantomData, - }) + loop { + if self.is_empty() { + return None; } - Kind::Open { offset_to_close } => { - if *offset_to_close == !0 { - // This was called from deep inside the lexer to generate a token - // name for a diagnostic, so we're just gonna give it a... - // stringifyable token. - - return Some(token::Any::Bracket(token::Bracket { - span: tok.span, - open: tok.span, - close: tok.span, - lexeme: tok.lexeme.cast(), - ctx: self.context(), - spec: self.spec, - contents: *self, - })); - } - - let open_idx = self.cursor; - let close_idx = open_idx + (*offset_to_close as usize); - self.cursor = close_idx + 1; - - let close = &self.toks[close_idx]; - let &Kind::Close { full_span, .. } = &close.kind else { - bug!("Kind::Open did not point to an Kind::Close"); - }; + let next = self.stream.token_at_hint(self.id(), self.meta_cursor); + self.step_forward(); - token::Any::Bracket(token::Bracket { - span: full_span, - open: tok.span, - close: close.span, - lexeme: tok.lexeme.cast(), - ctx: self.context(), - spec: self.spec, - contents: Cursor { - file: self.file, - spec: self.spec, - toks: &self.toks[open_idx + 1..close_idx], - cursor: 0, - }, - }) - } - - Kind::Close { .. } => { - bug!("stray closing delimiter {:?} in token stream", tok.span) + if next.is_some() { + return next; } + } + } +} - Kind::Ident { .. } => { - self.cursor += 1; - token::Any::Ident(token::Ident { - tok, - ctx: self.context(), - spec: self.spec, - }) - } +/// An iterator over the comment spans attached to a token. +pub struct Comments<'lex> { + pub(super) stream: &'lex Stream<'lex>, + pub(super) comments: slice::Iter<'lex, token::Id>, +} - Kind::Quoted { .. } => { - self.cursor += 1; - token::Any::Quoted(token::Quoted { - tok, - ctx: self.context(), - spec: self.spec, - }) - } +impl<'lex> Comments<'lex> { + /// Adapts this iterator to return just the text contents of each [`Span`]. + pub fn as_strings(self) -> impl Iterator + 'lex { + self.map(Span::text) + } +} - Kind::Digital { .. 
} => { - self.cursor += 1; - token::Any::Digital(token::Digital { - tok, - ctx: self.context(), - idx: 0, - spec: self.spec, - }) - } - }; +impl<'lex> Iterator for Comments<'lex> { + type Item = Span<'lex>; - Some(next) + fn next(&mut self) -> Option { + let id = *self.comments.next()?; + Some(self.stream.lookup_span_no_affix(id)) } } @@ -436,7 +622,7 @@ pub mod switch { X: Impl<'lex, T>, { let Some(next) = cursor.next() else { - report.builtins(cursor.spec).expected( + report.builtins(cursor.spec()).expected( self.0.lexemes(0), Lexeme::eof(), cursor.end(), @@ -450,7 +636,7 @@ pub mod switch { } report - .builtins(cursor.spec) + .builtins(cursor.spec()) .expected(self.0.lexemes(0), next, next); None } diff --git a/ilex/tests/greedy.rs b/ilex/tests/greedy.rs index 547d527..306b2f0 100644 --- a/ilex/tests/greedy.rs +++ b/ilex/tests/greedy.rs @@ -48,5 +48,5 @@ fn greedy() { .then2(array, ("[", "]"), Matcher::new().then1(ident, "xyz")) .then2(cpp_like, ("R\"cc(", ")cc\""), ["some c++)\" "]) .eof() - .assert_matches(&ctx, &tokens); + .assert_matches(&tokens); } diff --git a/ilex/tests/json.rs b/ilex/tests/json.rs index 652d6b0..567ea89 100644 --- a/ilex/tests/json.rs +++ b/ilex/tests/json.rs @@ -11,7 +11,6 @@ use ilex::token; use ilex::token::Content as C; use ilex::token::Cursor; use ilex::Lexeme; -use ilex::Spanned; #[ilex::spec] struct JsonSpec { @@ -168,7 +167,7 @@ fn check_tokens() { ), ) .eof() - .assert_matches(&ctx, &tokens); + .assert_matches(&tokens); } #[derive(Clone, Debug, PartialEq)] @@ -237,20 +236,15 @@ fn parse(data: &str) -> Result { .new_file("", data) .lex(json.spec(), &report) .map_err(|e| Error(e.to_string()))?; - let value = parse0(&ctx, &report, json, &mut stream.cursor()); + let value = parse0(&report, json, &mut stream.cursor()); report.fatal_or(value).map_err(|e| Error(e.to_string())) } -fn parse0( - ctx: &ilex::Context, - report: &Report, - json: &JsonSpec, - cursor: &mut Cursor, -) -> Json { +fn parse0(report: &Report, json: &JsonSpec, cursor: &mut Cursor) -> Json { let quote2str = |str: token::Quoted| -> String { str.to_utf8(|key, data, buf| { - let char = match key.text(ctx) { + let char = match key.text() { "\\\"" => '\"', r"\\" => '\\', r"\/" => '/', @@ -263,10 +257,10 @@ fn parse0( r"\u" => { let data = data.unwrap(); let code = - u16::from_str_radix(data.text(ctx), 16).unwrap_or_else(|_| { + u16::from_str_radix(data.text(), 16).unwrap_or_else(|_| { report.builtins(json.spec()).expected( [Expected::Name("hex-encoded u16".into())], - data.text(ctx), + data.text(), data, ); 0 @@ -294,7 +288,7 @@ fn parse0( let mut trailing = None; let vec = array .contents() - .delimited(json.comma, |c| Some(parse0(ctx, report, json, c))) + .delimited(json.comma, |c| Some(parse0(report, json, c))) .map(|(e, c)| { trailing = c; e @@ -319,7 +313,7 @@ fn parse0( .map(|q| quote2str(q)) .unwrap_or("😢".into()); c.take(json.colon, report); - let value = parse0(ctx, report, json, c); + let value = parse0(report, json, c); Some((key, value)) }) .map(|(e, c)| { diff --git a/ilex/tests/llvm.rs b/ilex/tests/llvm.rs index 46556be..34e2585 100644 --- a/ilex/tests/llvm.rs +++ b/ilex/tests/llvm.rs @@ -280,5 +280,5 @@ fn llvm() { .then1(llvm.void, "void"), ) .eof() - .assert_matches(&ctx, &tokens) + .assert_matches(&tokens) } diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 1c8cfba..6cc20ea 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,3 +1,3 @@ [toolchain] -channel = "1.75.0" +channel = "1.83.0" profile = "default"
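// A quick sketch of the API after this change (hypothetical driver code;
// assumes a spec `json` and a `report` are in scope): spans now borrow from
// the file, so no `Context` needs to be threaded through to read their text.
//
//   let ctx = ilex::Context::default();
//   let file = ctx.new_file("demo.json", "[true]");
//   let stream = file.lex(json.spec(), &report)?;
//   for tok in stream.cursor() {
//     println!("{:?} -> {:?}", tok, tok.span().text());
//   }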