From 5a6cc855e6083e4291015f9b257492bc0730f10f Mon Sep 17 00:00:00 2001 From: KtorZ Date: Fri, 30 Jun 2023 18:26:09 +0200 Subject: [PATCH] Use byte count for token span in the lexer. miette doesn't play well with spans built from char indices: it interprets span offsets as byte offsets into the source, so every multi-byte UTF-8 character shifts the spans that follow it. So we have to count the number of bytes in strings / chars, so that spans align accordingly. --- crates/aiken-lang/src/parser.rs | 15 ++-- crates/aiken-lang/src/parser/extra.rs | 14 ++-- crates/aiken-lang/src/tests/parser.rs | 100 ++++++++++++++++++++++++++ 3 files changed, 115 insertions(+), 14 deletions(-) diff --git a/crates/aiken-lang/src/parser.rs b/crates/aiken-lang/src/parser.rs index 581cf435..408511f9 100644 --- a/crates/aiken-lang/src/parser.rs +++ b/crates/aiken-lang/src/parser.rs @@ -20,13 +20,18 @@ pub fn module( src: &str, kind: ast::ModuleKind, ) -> Result<(ast::UntypedModule, ModuleExtra), Vec> { - let len = src.chars().count(); + let len = src.as_bytes().len(); - let span = |i| Span::new((), i..i + 1); + let span = |i, n| Span::new((), i..i + n); let tokens = lexer::lexer().parse(chumsky::Stream::from_iter( - span(len), - src.chars().enumerate().map(|(i, c)| (c, span(i))), + span(len, 1), + src.chars().scan(0, |i, c| { + let start = *i; + let offset = c.len_utf8(); + *i = start + offset; + Some((c, span(start, offset))) + }), ))?; let mut extra = ModuleExtra::new(); @@ -74,7 +79,7 @@ pub fn module( }); let definitions = - module_parser().parse(chumsky::Stream::from_iter(span(tokens.len()), tokens))?; + module_parser().parse(chumsky::Stream::from_iter(span(tokens.len(), 1), tokens))?; let module = ast::UntypedModule { kind, diff --git a/crates/aiken-lang/src/parser/extra.rs b/crates/aiken-lang/src/parser/extra.rs index 65f5b2c3..27638197 100644 --- a/crates/aiken-lang/src/parser/extra.rs +++ b/crates/aiken-lang/src/parser/extra.rs @@ -23,16 +23,12 @@ pub struct Comment<'a> { impl<'a> From<(&Span, &'a str)> for Comment<'a> { fn from(src: (&Span, &'a str)) -> Comment<'a> { - fn char_indice(s: &str, i: 
usize) -> usize { - s.char_indices().nth(i).unwrap_or((i, ' ')).0 - } - - let start = char_indice(src.1, src.0.start); - let end = char_indice(src.1, src.0.end); - + let start = src.0.start; + let end = src.0.end; Comment { - start: src.0.start, - content: src.1.get(start..end).expect("From span to comment"), + start, + content: std::str::from_utf8(src.1.as_bytes()[start..end].as_ref()) + .expect("From span to comment"), } } } diff --git a/crates/aiken-lang/src/tests/parser.rs b/crates/aiken-lang/src/tests/parser.rs index 3e4a0cab..902bec42 100644 --- a/crates/aiken-lang/src/tests/parser.rs +++ b/crates/aiken-lang/src/tests/parser.rs @@ -4796,3 +4796,103 @@ fn first_class_binop() { })], ); } + +#[test] +fn parse_unicode_offset_1() { + use expr::UntypedExpr::*; + + let code = indoc! {r#" + fn foo() { + let x = "★" + x + } + "#}; + + assert_definitions( + code, + vec![ast::Definition::Fn(Function { + arguments: vec![], + body: Sequence { + location: Span::new((), 13..30), + expressions: vec![ + Assignment { + location: Span::new((), 13..26), + value: Box::new(ByteArray { + location: Span::new((), 21..26), + bytes: vec![226, 152, 133], + preferred_format: ast::ByteArrayFormatPreference::Utf8String, + }), + pattern: ast::Pattern::Var { + location: Span::new((), 17..18), + name: "x".to_string(), + }, + kind: ast::AssignmentKind::Let, + annotation: None, + }, + Var { + location: Span::new((), 29..30), + name: "x".to_string(), + }, + ], + }, + doc: None, + location: Span::new((), 0..8), + name: "foo".to_string(), + public: false, + return_annotation: None, + return_type: (), + end_position: 31, + can_error: true, + })], + ) +} + +#[test] +fn parse_unicode_offset_2() { + use expr::UntypedExpr::*; + + let code = indoc! 
{r#" + fn foo() { + let x = "*" + x + } + "#}; + + assert_definitions( + code, + vec![ast::Definition::Fn(Function { + arguments: vec![], + body: Sequence { + location: Span::new((), 13..28), + expressions: vec![ + Assignment { + location: Span::new((), 13..24), + value: Box::new(ByteArray { + location: Span::new((), 21..24), + bytes: vec![42], + preferred_format: ast::ByteArrayFormatPreference::Utf8String, + }), + pattern: ast::Pattern::Var { + location: Span::new((), 17..18), + name: "x".to_string(), + }, + kind: ast::AssignmentKind::Let, + annotation: None, + }, + Var { + location: Span::new((), 27..28), + name: "x".to_string(), + }, + ], + }, + doc: None, + location: Span::new((), 0..8), + name: "foo".to_string(), + public: false, + return_annotation: None, + return_type: (), + end_position: 29, + can_error: true, + })], + ) +}