From 78770d14b7fb965fbe9630107fdfeaac41d268d1 Mon Sep 17 00:00:00 2001 From: KtorZ Date: Sat, 18 Feb 2023 11:36:45 +0100 Subject: [PATCH] Emit warning when detecting a hex string interpreted as UTF-8 bytes. This will probably save people minutes/hours of puzzled debugging. This is only a warning because there may be cases where one does actually want to specify a hex-encoded bytearray. In which case, they can get rid of the warning by using the plain bytearray syntax (i.e. as an array of bytes). --- crates/aiken-lang/src/tests/check.rs | 14 ++++++++ crates/aiken-lang/src/tipo/error.rs | 25 ++++++++++++++ crates/aiken-lang/src/tipo/expr.rs | 50 +++++++++++++++++++--------- 3 files changed, 74 insertions(+), 15 deletions(-) diff --git a/crates/aiken-lang/src/tests/check.rs b/crates/aiken-lang/src/tests/check.rs index 0872798b..40227927 100644 --- a/crates/aiken-lang/src/tests/check.rs +++ b/crates/aiken-lang/src/tests/check.rs @@ -305,3 +305,17 @@ fn trace_if_false_ko() { Err((_, Error::CouldNotUnify { .. })) )) } + +#[test] +fn utf8_hex_literal_warning() { + let source_code = r#" + pub const policy_id = "f43a62fdc3965df486de8a0d32fe800963589c41b38946602a0dc535" + "#; + + let (warnings, _) = check(parse(source_code)).unwrap(); + + assert!(matches!( + warnings[0], + Warning::Utf8ByteArrayIsValidHexString { .. } + )) +} diff --git a/crates/aiken-lang/src/tipo/error.rs b/crates/aiken-lang/src/tipo/error.rs index 878aee77..87b3c6c7 100644 --- a/crates/aiken-lang/src/tipo/error.rs +++ b/crates/aiken-lang/src/tipo/error.rs @@ -1244,6 +1244,31 @@ pub enum Warning { #[label("unused")] location: Span, }, + + #[error( + "I noticed a suspicious {type_ByteArray} UTF-8 literal which resembles a hash digest.", + type_ByteArray = "ByteArray".bold().bright_blue() + )] + #[diagnostic(help("{}", formatdoc! { + r#"When you specify a {type_ByteArray} literal using plain double-quotes, it's interpreted as an array of UTF-8 bytes. 
For example, the literal {literal_foo} is interpreted as the byte sequence {foo_bytes}. + + However here, you have specified a literal that resembles a hash digest encoded as an hexadecimal string. This is a common case, but you probably want to capture the raw bytes represented by this sequence, and not the hexadecimal sequence. Fear not! Aiken provides a convenient syntax for that: just prefix the literal with {symbol_hash}. This will decode the hexadecimal string for you and capture the non-encoded bytes as a {type_ByteArray}. + + ╰─▶ {symbol_hash}{value} + "#, + type_ByteArray = "ByteArray".bold().bright_blue(), + literal_foo = "\"foo\"".purple(), + foo_bytes = "#[102, 111, 111]".purple(), + value = format!("\"{value}\"").purple(), + symbol_hash = "#".purple(), + }))] + #[diagnostic(code("syntax::bytearray_literal_is_hex_string"))] + #[diagnostic(url("https://aiken-lang.org/language-tour/primitive-types#bytearray"))] + Utf8ByteArrayIsValidHexString { + #[label("missing '#' to decode hex string")] + location: Span, + value: String, + }, } #[derive(Debug, Clone, Copy, PartialEq, Eq)] diff --git a/crates/aiken-lang/src/tipo/expr.rs b/crates/aiken-lang/src/tipo/expr.rs index 0c388f5c..695ff653 100644 --- a/crates/aiken-lang/src/tipo/expr.rs +++ b/crates/aiken-lang/src/tipo/expr.rs @@ -4,11 +4,11 @@ use vec1::Vec1; use crate::{ ast::{ - Annotation, Arg, ArgName, AssignmentKind, BinOp, CallArg, Clause, ClauseGuard, Constant, - IfBranch, RecordUpdateSpread, Span, TraceKind, Tracing, TypedArg, TypedCallArg, - TypedClause, TypedClauseGuard, TypedIfBranch, TypedMultiPattern, TypedRecordUpdateArg, - UnOp, UntypedArg, UntypedClause, UntypedClauseGuard, UntypedIfBranch, UntypedMultiPattern, - UntypedPattern, UntypedRecordUpdateArg, + Annotation, Arg, ArgName, AssignmentKind, BinOp, ByteArrayFormatPreference, CallArg, + Clause, ClauseGuard, Constant, IfBranch, RecordUpdateSpread, Span, TraceKind, Tracing, + TypedArg, TypedCallArg, TypedClause, TypedClauseGuard, TypedIfBranch, 
TypedMultiPattern, + TypedRecordUpdateArg, UnOp, UntypedArg, UntypedClause, UntypedClauseGuard, UntypedIfBranch, + UntypedMultiPattern, UntypedPattern, UntypedRecordUpdateArg, }, builtins::{bool, byte_array, function, int, list, string, tuple}, expr::{TypedExpr, UntypedExpr}, @@ -351,8 +351,10 @@ impl<'a, 'b> ExprTyper<'a, 'b> { } => self.infer_tuple_index(*tuple, index, location), UntypedExpr::ByteArray { - location, bytes, .. - } => Ok(self.infer_byte_array(bytes, location)), + bytes, + preferred_format, + location, + } => self.infer_bytearray(bytes, preferred_format, location), UntypedExpr::RecordUpdate { location, @@ -373,12 +375,27 @@ impl<'a, 'b> ExprTyper<'a, 'b> { } } - fn infer_byte_array(&mut self, bytes: Vec<u8>, location: Span) -> TypedExpr { - TypedExpr::ByteArray { + fn infer_bytearray( + &mut self, + bytes: Vec<u8>, + preferred_format: ByteArrayFormatPreference, + location: Span, + ) -> Result<TypedExpr, Error> { + if let ByteArrayFormatPreference::Utf8String = preferred_format { + let is_hex_string = bytes.len() >= 56 && hex::decode(&bytes).is_ok(); + if is_hex_string { + let value = String::from_utf8(bytes.clone()).expect("hex digits are ASCII"); + self.environment + .warnings + .push(Warning::Utf8ByteArrayIsValidHexString { location, value }); + } + } + + Ok(TypedExpr::ByteArray { location, bytes, tipo: byte_array(), - } + }) } fn infer_trace_if_false( @@ -1357,11 +1374,14 @@ impl<'a, 'b> ExprTyper<'a, 'b> { location, bytes, preferred_format, - } => Ok(Constant::ByteArray { - location, - bytes, - preferred_format, - }), + } => { + let _ = self.infer_bytearray(bytes.clone(), preferred_format, location)?; + Ok(Constant::ByteArray { + location, + bytes, + preferred_format, + }) + } }?; // Check type annotation is accurate.