diff --git a/crates/aiken-lang/src/parser.rs b/crates/aiken-lang/src/parser.rs index 581cf435..408511f9 100644 --- a/crates/aiken-lang/src/parser.rs +++ b/crates/aiken-lang/src/parser.rs @@ -20,13 +20,18 @@ pub fn module( src: &str, kind: ast::ModuleKind, ) -> Result<(ast::UntypedModule, ModuleExtra), Vec> { - let len = src.chars().count(); + let len = src.as_bytes().len(); - let span = |i| Span::new((), i..i + 1); + let span = |i, n| Span::new((), i..i + n); let tokens = lexer::lexer().parse(chumsky::Stream::from_iter( - span(len), - src.chars().enumerate().map(|(i, c)| (c, span(i))), + span(len, 1), + src.chars().scan(0, |i, c| { + let start = *i; + let offset = c.len_utf8(); + *i = start + offset; + Some((c, span(start, offset))) + }), ))?; let mut extra = ModuleExtra::new(); @@ -74,7 +79,7 @@ pub fn module( }); let definitions = - module_parser().parse(chumsky::Stream::from_iter(span(tokens.len()), tokens))?; + module_parser().parse(chumsky::Stream::from_iter(span(tokens.len(), 1), tokens))?; let module = ast::UntypedModule { kind, diff --git a/crates/aiken-lang/src/parser/extra.rs b/crates/aiken-lang/src/parser/extra.rs index 65f5b2c3..27638197 100644 --- a/crates/aiken-lang/src/parser/extra.rs +++ b/crates/aiken-lang/src/parser/extra.rs @@ -23,16 +23,12 @@ pub struct Comment<'a> { impl<'a> From<(&Span, &'a str)> for Comment<'a> { fn from(src: (&Span, &'a str)) -> Comment<'a> { - fn char_indice(s: &str, i: usize) -> usize { - s.char_indices().nth(i).unwrap_or((i, ' ')).0 - } - - let start = char_indice(src.1, src.0.start); - let end = char_indice(src.1, src.0.end); - + let start = src.0.start; + let end = src.0.end; Comment { - start: src.0.start, - content: src.1.get(start..end).expect("From span to comment"), + start, + content: std::str::from_utf8(src.1.as_bytes()[start..end].as_ref()) + .expect("From span to comment"), } } } diff --git a/crates/aiken-lang/src/tests/parser.rs b/crates/aiken-lang/src/tests/parser.rs index 3e4a0cab..902bec42 100644 --- a/crates/aiken-lang/src/tests/parser.rs +++ b/crates/aiken-lang/src/tests/parser.rs @@ -4796,3 +4796,103 @@ fn first_class_binop() { })], ); } + +#[test] +fn parse_unicode_offset_1() { + use expr::UntypedExpr::*; + + let code = indoc! {r#" + fn foo() { + let x = "★" + x + } + "#}; + + assert_definitions( + code, + vec![ast::Definition::Fn(Function { + arguments: vec![], + body: Sequence { + location: Span::new((), 13..30), + expressions: vec![ + Assignment { + location: Span::new((), 13..26), + value: Box::new(ByteArray { + location: Span::new((), 21..26), + bytes: vec![226, 152, 133], + preferred_format: ast::ByteArrayFormatPreference::Utf8String, + }), + pattern: ast::Pattern::Var { + location: Span::new((), 17..18), + name: "x".to_string(), + }, + kind: ast::AssignmentKind::Let, + annotation: None, + }, + Var { + location: Span::new((), 29..30), + name: "x".to_string(), + }, + ], + }, + doc: None, + location: Span::new((), 0..8), + name: "foo".to_string(), + public: false, + return_annotation: None, + return_type: (), + end_position: 31, + can_error: true, + })], + ) +} + +#[test] +fn parse_unicode_offset_2() { + use expr::UntypedExpr::*; + + let code = indoc! {r#" + fn foo() { + let x = "*" + x + } + "#}; + + assert_definitions( + code, + vec![ast::Definition::Fn(Function { + arguments: vec![], + body: Sequence { + location: Span::new((), 13..28), + expressions: vec![ + Assignment { + location: Span::new((), 13..24), + value: Box::new(ByteArray { + location: Span::new((), 21..24), + bytes: vec![42], + preferred_format: ast::ByteArrayFormatPreference::Utf8String, + }), + pattern: ast::Pattern::Var { + location: Span::new((), 17..18), + name: "x".to_string(), + }, + kind: ast::AssignmentKind::Let, + annotation: None, + }, + Var { + location: Span::new((), 27..28), + name: "x".to_string(), + }, + ], + }, + doc: None, + location: Span::new((), 0..8), + name: "foo".to_string(), + public: false, + return_annotation: None, + return_type: (), + end_position: 29, + can_error: true, + })], + ) +}