Use byte count for token spans in the lexer.

miette doesn't play well with spans built from character indices, so we
count bytes in strings and chars instead, so that spans line up with
byte offsets into the source.
Authored by KtorZ on 2023-06-30 18:26:09 +02:00; committed by Lucas
parent 67c072a1a9
commit 5a6cc855e6
3 changed files with 115 additions and 14 deletions
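
For context on the commit message: a non-ASCII character is one char but several bytes in UTF-8, so spans counted in char indices drift away from the byte offsets that miette uses to index the source. A minimal standalone illustration (not part of this diff):

fn main() {
    let src = r#"let x = "★""#;
    // One scalar value, but three bytes in UTF-8.
    assert_eq!("★".chars().count(), 1);
    assert_eq!("★".len(), 3);
    // Counting chars, the closing quote is at index 10;
    // counting bytes, it is at offset 12.
    assert_eq!(src.chars().count(), 11);
    assert_eq!(src.len(), 13);
}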

View File

@@ -20,13 +20,18 @@ pub fn module(
     src: &str,
     kind: ast::ModuleKind,
 ) -> Result<(ast::UntypedModule, ModuleExtra), Vec<ParseError>> {
-    let len = src.chars().count();
+    let len = src.as_bytes().len();
-    let span = |i| Span::new((), i..i + 1);
+    let span = |i, n| Span::new((), i..i + n);
     let tokens = lexer::lexer().parse(chumsky::Stream::from_iter(
-        span(len),
-        src.chars().enumerate().map(|(i, c)| (c, span(i))),
+        span(len, 1),
+        src.chars().scan(0, |i, c| {
+            let start = *i;
+            let offset = c.len_utf8();
+            *i = start + offset;
+            Some((c, span(start, offset)))
+        }),
     ))?;
     let mut extra = ModuleExtra::new();
@@ -74,7 +79,7 @@ pub fn module(
     });
     let definitions =
-        module_parser().parse(chumsky::Stream::from_iter(span(tokens.len()), tokens))?;
+        module_parser().parse(chumsky::Stream::from_iter(span(tokens.len(), 1), tokens))?;
     let module = ast::UntypedModule {
         kind,
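
The scan above replaces enumerate so that each token's span starts at a running byte offset and is as wide as the character's UTF-8 encoding. A standalone sketch of that iterator (plain ranges instead of the crate's Span type):

fn main() {
    let src = "a★b";
    let spans: Vec<(char, std::ops::Range<usize>)> = src
        .chars()
        .scan(0, |i, c| {
            let start = *i;
            let offset = c.len_utf8();
            *i = start + offset;
            Some((c, start..start + offset))
        })
        .collect();
    // Byte-aligned spans: 'a' covers 0..1, '★' covers 1..4, 'b' covers 4..5.
    assert_eq!(spans, vec![('a', 0..1), ('★', 1..4), ('b', 4..5)]);
}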

View File

@@ -23,16 +23,12 @@ pub struct Comment<'a> {
 impl<'a> From<(&Span, &'a str)> for Comment<'a> {
     fn from(src: (&Span, &'a str)) -> Comment<'a> {
-        fn char_indice(s: &str, i: usize) -> usize {
-            s.char_indices().nth(i).unwrap_or((i, ' ')).0
-        }
-        let start = char_indice(src.1, src.0.start);
-        let end = char_indice(src.1, src.0.end);
+        let start = src.0.start;
+        let end = src.0.end;
         Comment {
-            start: src.0.start,
-            content: src.1.get(start..end).expect("From span to comment"),
+            start,
+            content: std::str::from_utf8(src.1.as_bytes()[start..end].as_ref())
+                .expect("From span to comment"),
         }
     }
 }
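
Because start and end are now byte offsets, the conversion above can slice the source directly instead of translating char indices first. A minimal sketch of that idea with a plain &str (not the crate's Comment type):

fn main() {
    let src = "-- ★ note";
    // Byte span covering the ★: three bytes starting at offset 3.
    let (start, end) = (3, 6);
    assert_eq!(&src[start..end], "★");
    // Same thing through from_utf8, mirroring the new code path above.
    assert_eq!(
        std::str::from_utf8(&src.as_bytes()[start..end]).expect("From span to comment"),
        "★"
    );
}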

View File

@@ -4796,3 +4796,103 @@ fn first_class_binop() {
         })],
     );
 }
+#[test]
+fn parse_unicode_offset_1() {
+    use expr::UntypedExpr::*;
+    let code = indoc! {r#"
+        fn foo() {
+          let x = "★"
+          x
+        }
+    "#};
+    assert_definitions(
+        code,
+        vec![ast::Definition::Fn(Function {
+            arguments: vec![],
+            body: Sequence {
+                location: Span::new((), 13..30),
+                expressions: vec![
+                    Assignment {
+                        location: Span::new((), 13..26),
+                        value: Box::new(ByteArray {
+                            location: Span::new((), 21..26),
+                            bytes: vec![226, 152, 133],
+                            preferred_format: ast::ByteArrayFormatPreference::Utf8String,
+                        }),
+                        pattern: ast::Pattern::Var {
+                            location: Span::new((), 17..18),
+                            name: "x".to_string(),
+                        },
+                        kind: ast::AssignmentKind::Let,
+                        annotation: None,
+                    },
+                    Var {
+                        location: Span::new((), 29..30),
+                        name: "x".to_string(),
+                    },
+                ],
+            },
+            doc: None,
+            location: Span::new((), 0..8),
+            name: "foo".to_string(),
+            public: false,
+            return_annotation: None,
+            return_type: (),
+            end_position: 31,
+            can_error: true,
+        })],
+    )
+}
+#[test]
+fn parse_unicode_offset_2() {
+    use expr::UntypedExpr::*;
+    let code = indoc! {r#"
+        fn foo() {
+          let x = "*"
+          x
+        }
+    "#};
+    assert_definitions(
+        code,
+        vec![ast::Definition::Fn(Function {
+            arguments: vec![],
+            body: Sequence {
+                location: Span::new((), 13..28),
+                expressions: vec![
+                    Assignment {
+                        location: Span::new((), 13..24),
+                        value: Box::new(ByteArray {
+                            location: Span::new((), 21..24),
+                            bytes: vec![42],
+                            preferred_format: ast::ByteArrayFormatPreference::Utf8String,
+                        }),
+                        pattern: ast::Pattern::Var {
+                            location: Span::new((), 17..18),
+                            name: "x".to_string(),
+                        },
+                        kind: ast::AssignmentKind::Let,
+                        annotation: None,
+                    },
+                    Var {
+                        location: Span::new((), 27..28),
+                        name: "x".to_string(),
+                    },
+                ],
+            },
+            doc: None,
+            location: Span::new((), 0..8),
+            name: "foo".to_string(),
+            public: false,
+            return_annotation: None,
+            return_type: (),
+            end_position: 29,
+            can_error: true,
+        })],
+    )
+}
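
The two tests differ only in the assigned string literal, and the expected spans follow from byte counts: "★" plus its quotes is five bytes, so the ByteArray spans 21..26 and the function's end_position lands at 31, while "*" plus quotes is three bytes, giving 21..24 and end_position 29. A quick check of that arithmetic (not part of the test suite):

fn main() {
    // Width of each string literal, quotes included.
    assert_eq!("\"★\"".len(), 5); // span 21..26 in parse_unicode_offset_1
    assert_eq!("\"*\"".len(), 3); // span 21..24 in parse_unicode_offset_2
}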