From 78d34f7f76ff1feca2b8324b609cea17124fc852 Mon Sep 17 00:00:00 2001 From: KtorZ Date: Thu, 6 Jul 2023 19:16:33 +0200 Subject: [PATCH] Fix parsing of negative int patterns and constants This was trickier than expected as the expression parser, and in particular the bin-op parser, will interpret negative patterns as a continuation of a binary operation and eventually choke on the next right-arrow symbol. This is due to how we completely erase newlines once we're done with the lexer. The newline separating 'when' clauses is actually semantically important. In principle, we could parse an expression only up to the next newline. Ideally, we would keep that newline in the list of tokens, but it's difficult to figure out which newline to keep between two right arrows since a clause guard can be written over multiple lines. However, since we know that this is only truly a problem for negative integers, we can use the same trick as for tuples and define a new 'NewLineMinus' token. That token CANNOT be part of a binop expression. That means it's impossible to write a binary operation with a minus over multiple lines, or more specifically, with the '-' symbol at the start of a new line. This sounds like a fair limitation. What we get in exchange is less ambiguity when parsing patterns that follow expressions in 'when' clauses. Another, more cumbersome option could be to preserve the first newline encountered after a 'right-arrow' symbol and before any parenthesis or curly brace is found (which would otherwise signal the beginning of a new block). That requires traversing, at least partially, the list of tokens twice. This feels unnecessary for now, at least until we face a similar issue with another binary operator. 
--- .../src/parser/definition/constant.rs | 24 ++++---- crates/aiken-lang/src/parser/expr/list.rs | 6 +- crates/aiken-lang/src/parser/expr/mod.rs | 2 +- crates/aiken-lang/src/parser/expr/when/mod.rs | 6 +- crates/aiken-lang/src/parser/lexer.rs | 7 +++ crates/aiken-lang/src/parser/literal/int.rs | 21 +++++++ crates/aiken-lang/src/parser/literal/mod.rs | 2 + crates/aiken-lang/src/parser/pattern/int.rs | 40 ++++++++++--- .../snapshots/pattern_negative_int.snap | 30 ++++++++++ .../pattern_negative_int_not_first_case.snap | 57 +++++++++++++++++++ crates/aiken-lang/src/parser/token.rs | 2 + 11 files changed, 171 insertions(+), 26 deletions(-) create mode 100644 crates/aiken-lang/src/parser/literal/int.rs create mode 100644 crates/aiken-lang/src/parser/pattern/snapshots/pattern_negative_int.snap create mode 100644 crates/aiken-lang/src/parser/pattern/snapshots/pattern_negative_int_not_first_case.snap diff --git a/crates/aiken-lang/src/parser/definition/constant.rs b/crates/aiken-lang/src/parser/definition/constant.rs index 834449d9..97292038 100644 --- a/crates/aiken-lang/src/parser/definition/constant.rs +++ b/crates/aiken-lang/src/parser/definition/constant.rs @@ -2,7 +2,7 @@ use chumsky::prelude::*; use crate::{ ast, - parser::{annotation, error::ParseError, literal::bytearray, token::Token, utils}, + parser::{annotation, error::ParseError, literal, token::Token, utils}, }; pub fn parser() -> impl Parser { @@ -39,20 +39,20 @@ pub fn value() -> impl Parser { }); let constant_int_parser = - select! 
{Token::Int {value, base} => (value, base)}.map_with_span(|(value, base), span| { - ast::Constant::Int { - location: span, - value, - base, - } + literal::int().map_with_span(|(value, base), location| ast::Constant::Int { + location, + value, + base, }); let constant_bytearray_parser = - bytearray(|bytes, preferred_format, span| ast::Constant::ByteArray { - location: span, - bytes, - preferred_format, - }); + literal::bytearray( + |bytes, preferred_format, location| ast::Constant::ByteArray { + location, + bytes, + preferred_format, + }, + ); choice(( constant_string_parser, diff --git a/crates/aiken-lang/src/parser/expr/list.rs b/crates/aiken-lang/src/parser/expr/list.rs index d11af859..c4da3687 100644 --- a/crates/aiken-lang/src/parser/expr/list.rs +++ b/crates/aiken-lang/src/parser/expr/list.rs @@ -6,14 +6,14 @@ use crate::{ }; pub fn parser( - r: Recursive<'_, Token, UntypedExpr, ParseError>, + expression: Recursive<'_, Token, UntypedExpr, ParseError>, ) -> impl Parser + '_ { just(Token::LeftSquare) - .ignore_then(r.clone().separated_by(just(Token::Comma))) + .ignore_then(expression.clone().separated_by(just(Token::Comma))) .then(choice(( just(Token::Comma).ignore_then( just(Token::DotDot) - .ignore_then(r.clone()) + .ignore_then(expression) .map(Box::new) .or_not(), ), diff --git a/crates/aiken-lang/src/parser/expr/mod.rs b/crates/aiken-lang/src/parser/expr/mod.rs index 1f5cd277..b0027e69 100644 --- a/crates/aiken-lang/src/parser/expr/mod.rs +++ b/crates/aiken-lang/src/parser/expr/mod.rs @@ -56,7 +56,7 @@ pub fn pure_expression<'a>( // Negate let op = choice(( just(Token::Bang).to(ast::UnOp::Not), - just(Token::Minus) + choice((just(Token::Minus), just(Token::NewLineMinus))) // NOTE: Prevent conflict with usage for '-' as a standalone binary op. // This will make '-' parse when used as standalone binop in a function call. 
// For example: diff --git a/crates/aiken-lang/src/parser/expr/when/mod.rs b/crates/aiken-lang/src/parser/expr/when/mod.rs index 53b47627..efdbcd00 100644 --- a/crates/aiken-lang/src/parser/expr/when/mod.rs +++ b/crates/aiken-lang/src/parser/expr/when/mod.rs @@ -12,15 +12,15 @@ use crate::{ }; pub fn parser( - r: Recursive<'_, Token, UntypedExpr, ParseError>, + expression: Recursive<'_, Token, UntypedExpr, ParseError>, ) -> impl Parser + '_ { just(Token::When) // TODO: If subject is empty we should return ParseErrorType::ExpectedExpr, - .ignore_then(r.clone().map(Box::new)) + .ignore_then(expression.clone().map(Box::new)) .then_ignore(just(Token::Is)) .then_ignore(just(Token::LeftBrace)) // TODO: If clauses are empty we should return ParseErrorType::NoCaseClause - .then(clause(r).repeated()) + .then(clause(expression).repeated()) .then_ignore(just(Token::RightBrace)) .map_with_span(|(subject, clauses), span| UntypedExpr::When { location: span, diff --git a/crates/aiken-lang/src/parser/lexer.rs b/crates/aiken-lang/src/parser/lexer.rs index f81e8207..d196ce37 100644 --- a/crates/aiken-lang/src/parser/lexer.rs +++ b/crates/aiken-lang/src/parser/lexer.rs @@ -59,6 +59,13 @@ pub fn run(src: &str) -> Result> { Some((Token::LeftParen, *span)) } } + Token::Minus => { + if previous_is_newline { + Some((Token::NewLineMinus, *span)) + } else { + Some((Token::Minus, *span)) + } + } Token::Pipe => { if previous_is_newline { Some((Token::NewLinePipe, *span)) diff --git a/crates/aiken-lang/src/parser/literal/int.rs b/crates/aiken-lang/src/parser/literal/int.rs new file mode 100644 index 00000000..021a08dd --- /dev/null +++ b/crates/aiken-lang/src/parser/literal/int.rs @@ -0,0 +1,21 @@ +use chumsky::prelude::*; + +use crate::parser::{ + error::ParseError, + token::{Base, Token}, +}; + +pub fn parser() -> impl Parser { + choice((just(Token::NewLineMinus), just(Token::Minus))) + .ignored() + .or_not() + .map(|v| v.is_some()) + .then(select! 
{ Token::Int {value, base} => (value, base)}) + .map(|(is_negative, (value, base))| { + if is_negative { + (format!("-{value}"), base) + } else { + (value, base) + } + }) +} diff --git a/crates/aiken-lang/src/parser/literal/mod.rs b/crates/aiken-lang/src/parser/literal/mod.rs index cc5dd8c6..97643a5c 100644 --- a/crates/aiken-lang/src/parser/literal/mod.rs +++ b/crates/aiken-lang/src/parser/literal/mod.rs @@ -1,7 +1,9 @@ mod bytearray; +mod int; mod string; mod uint; pub use bytearray::{array_of_bytes, hex_string, parser as bytearray, utf8_string}; +pub use int::parser as int; pub use string::parser as string; pub use uint::parser as uint; diff --git a/crates/aiken-lang/src/parser/pattern/int.rs b/crates/aiken-lang/src/parser/pattern/int.rs index 4fc8e2e4..ee5180c9 100644 --- a/crates/aiken-lang/src/parser/pattern/int.rs +++ b/crates/aiken-lang/src/parser/pattern/int.rs @@ -2,15 +2,41 @@ use chumsky::prelude::*; use crate::{ ast::UntypedPattern, - parser::{error::ParseError, token::Token}, + parser::{error::ParseError, literal, token::Token}, }; pub fn parser() -> impl Parser { - select! 
{Token::Int {value, base} => (value, base)}.map_with_span(|(value, base), location| { - UntypedPattern::Int { - location, - value, - base, - } + literal::int().map_with_span(|(value, base), location| UntypedPattern::Int { + location, + value, + base, }) } + +#[cfg(test)] +mod tests { + use crate::assert_expr; + + #[test] + fn pattern_negative_int() { + assert_expr!( + r#" + when foo is { + -1 -> True + } + "# + ); + } + + #[test] + fn pattern_negative_int_not_first_case() { + assert_expr!( + r#" + when bar is { + 42 -> -14 + -42 -> 14 + } + "# + ); + } +} diff --git a/crates/aiken-lang/src/parser/pattern/snapshots/pattern_negative_int.snap b/crates/aiken-lang/src/parser/pattern/snapshots/pattern_negative_int.snap new file mode 100644 index 00000000..82879f07 --- /dev/null +++ b/crates/aiken-lang/src/parser/pattern/snapshots/pattern_negative_int.snap @@ -0,0 +1,30 @@ +--- +source: crates/aiken-lang/src/parser/pattern/int.rs +description: "Code:\n\nwhen foo is {\n -1 -> True\n}\n" +--- +When { + location: 0..28, + subject: Var { + location: 5..8, + name: "foo", + }, + clauses: [ + UntypedClause { + location: 16..26, + patterns: [ + Int { + location: 16..18, + value: "-1", + base: Decimal { + numeric_underscore: false, + }, + }, + ], + guard: None, + then: Var { + location: 22..26, + name: "True", + }, + }, + ], +} diff --git a/crates/aiken-lang/src/parser/pattern/snapshots/pattern_negative_int_not_first_case.snap b/crates/aiken-lang/src/parser/pattern/snapshots/pattern_negative_int_not_first_case.snap new file mode 100644 index 00000000..52dd3ee7 --- /dev/null +++ b/crates/aiken-lang/src/parser/pattern/snapshots/pattern_negative_int_not_first_case.snap @@ -0,0 +1,57 @@ +--- +source: crates/aiken-lang/src/parser/pattern/int.rs +description: "Code:\n\nwhen bar is {\n 42 -> -14\n -42 -> 14\n}\n" +--- +When { + location: 0..39, + subject: Var { + location: 5..8, + name: "bar", + }, + clauses: [ + UntypedClause { + location: 16..25, + patterns: [ + Int { + location: 
16..18, + value: "42", + base: Decimal { + numeric_underscore: false, + }, + }, + ], + guard: None, + then: UnOp { + op: Negate, + location: 22..25, + value: UInt { + location: 23..25, + value: "14", + base: Decimal { + numeric_underscore: false, + }, + }, + }, + }, + UntypedClause { + location: 28..37, + patterns: [ + Int { + location: 28..31, + value: "-42", + base: Decimal { + numeric_underscore: false, + }, + }, + ], + guard: None, + then: UInt { + location: 35..37, + value: "14", + base: Decimal { + numeric_underscore: false, + }, + }, + }, + ], +} diff --git a/crates/aiken-lang/src/parser/token.rs b/crates/aiken-lang/src/parser/token.rs index a64fc82e..ddd92116 100644 --- a/crates/aiken-lang/src/parser/token.rs +++ b/crates/aiken-lang/src/parser/token.rs @@ -27,6 +27,7 @@ pub enum Token { // Int Operators Plus, Minus, + NewLineMinus, Star, Slash, Less, @@ -115,6 +116,7 @@ impl fmt::Display for Token { Token::RightBrace => "}", Token::Plus => "+", Token::Minus => "-", + Token::NewLineMinus => "↳-", Token::Star => "*", Token::Slash => "/", Token::Less => "<",