Emit a warning when detecting a hex string interpreted as UTF-8 bytes.

This will probably save people minutes/hours of puzzled debugging. This is only a warning because there may be cases where one does actually want to specify a hex-encoded bytearray. In that case, they can get rid of the warning by using the plain bytearray syntax (i.e. as an array of bytes).
2023-02-18 11:36:45 +01:00 · 2023-02-18 11:36:45 +01:00 · 78770d14b7
parent d72e13c7c8
commit 78770d14b7
3 changed files with 74 additions and 15 deletions
--- a/crates/aiken-lang/src/tests/check.rs
+++ b/crates/aiken-lang/src/tests/check.rs
@ -305,3 +305,17 @@ fn trace_if_false_ko() {
        Err((_, Error::CouldNotUnify { .. }))
    ))
 }
 #[test]
 fn utf8_hex_literal_warning() {
    // A 56-character, valid-hex literal written with plain double quotes
    // (i.e. as a UTF-8 byte array, without the leading `#`).
    let program = r#"
        pub const policy_id = "f43a62fdc3965df486de8a0d32fe800963589c41b38946602a0dc535"
    "#;

    // Type-checking must succeed; only the collected warnings matter here.
    let (warnings, _) = check(parse(program)).unwrap();

    // The first reported warning has to be the hex-string hint.
    let first_is_hex_hint = matches!(
        warnings[0],
        Warning::Utf8ByteArrayIsValidHexString { .. }
    );
    assert!(first_is_hex_hint)
 }
--- a/crates/aiken-lang/src/tipo/error.rs
+++ b/crates/aiken-lang/src/tipo/error.rs
@ -1244,6 +1244,31 @@ pub enum Warning {
        #[label("unused")]
        location: Span,
    },
    // Raised when a double-quoted (UTF-8) ByteArray literal is itself a valid
    // hexadecimal string: the author most likely meant the `#"..."` syntax.
    //
    // Fields:
    //   - `location`: span of the offending literal, labelled in the report.
    //   - `value`: the literal's text, echoed back in the help message as the
    //     suggested `#"<value>"` replacement.
    #[error(
        "I noticed a suspicious {type_ByteArray} UTF-8 literal which resembles a hash digest.",
        type_ByteArray = "ByteArray".bold().bright_blue()
    )]
    #[diagnostic(help("{}", formatdoc! {
        r#"When you specify a {type_ByteArray} literal using plain double-quotes, it's interpreted as an array of UTF-8 bytes. For example, the literal {literal_foo} is interpreted as the byte sequence {foo_bytes}.
           However here, you have specified a literal that resembles a hash digest encoded as an hexadecimal string. This is a common case, but you probably want to capture the raw bytes represented by this sequence, and not the hexadecimal sequence. Fear not! Aiken provides a convenient syntax for that: just prefix the literal with {symbol_hash}. This will decode the hexadecimal string for you and capture the non-encoded bytes as a {type_ByteArray}.
           ╰─▶ {symbol_hash}{value}
        "#,
        type_ByteArray = "ByteArray".bold().bright_blue(),
        literal_foo = "\"foo\"".purple(),
        foo_bytes = "#[102, 111, 111]".purple(),
        // The argument must be built with `format!`: a bare string literal
        // passed as a format *argument* is not interpolated, so the previous
        // `"\"{value}\""` rendered the placeholder text verbatim instead of
        // the user's literal.
        value = format!("\"{}\"", value).purple(),
        symbol_hash = "#".purple(),
    }))]
    #[diagnostic(code("syntax::bytearray_literal_is_hex_string"))]
    #[diagnostic(url("https://aiken-lang.org/language-tour/primitive-types#bytearray"))]
    Utf8ByteArrayIsValidHexString {
        #[label("missing '#' to decode hex string")]
        location: Span,
        value: String,
    },
 }
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
--- a/crates/aiken-lang/src/tipo/expr.rs
+++ b/crates/aiken-lang/src/tipo/expr.rs
@ -4,11 +4,11 @@ use vec1::Vec1;
 use crate::{
    ast::{
-        Annotation, Arg, ArgName, AssignmentKind, BinOp, CallArg, Clause, ClauseGuard, Constant,
+        Annotation, Arg, ArgName, AssignmentKind, BinOp, ByteArrayFormatPreference, CallArg,
-        IfBranch, RecordUpdateSpread, Span, TraceKind, Tracing, TypedArg, TypedCallArg,
+        Clause, ClauseGuard, Constant, IfBranch, RecordUpdateSpread, Span, TraceKind, Tracing,
-        TypedClause, TypedClauseGuard, TypedIfBranch, TypedMultiPattern, TypedRecordUpdateArg,
+        TypedArg, TypedCallArg, TypedClause, TypedClauseGuard, TypedIfBranch, TypedMultiPattern,
-        UnOp, UntypedArg, UntypedClause, UntypedClauseGuard, UntypedIfBranch, UntypedMultiPattern,
+        TypedRecordUpdateArg, UnOp, UntypedArg, UntypedClause, UntypedClauseGuard, UntypedIfBranch,
-        UntypedPattern, UntypedRecordUpdateArg,
+        UntypedMultiPattern, UntypedPattern, UntypedRecordUpdateArg,
    },
    builtins::{bool, byte_array, function, int, list, string, tuple},
    expr::{TypedExpr, UntypedExpr},
@ -351,8 +351,10 @@ impl<'a, 'b> ExprTyper<'a, 'b> {
            } => self.infer_tuple_index(*tuple, index, location),
            UntypedExpr::ByteArray {
-                location, bytes, ..
+                bytes,
-            } => Ok(self.infer_byte_array(bytes, location)),
+                preferred_format,
                location,
            } => self.infer_bytearray(bytes, preferred_format, location),
            UntypedExpr::RecordUpdate {
                location,
@ -373,12 +375,27 @@ impl<'a, 'b> ExprTyper<'a, 'b> {
        }
    }
-    fn infer_byte_array(&mut self, bytes: Vec<u8>, location: Span) -> TypedExpr {
     // Types a byte-array literal as `TypedExpr::ByteArray` with type
     // `byte_array()`. As a side effect, when the literal was written as a
     // UTF-8 string and its text is a valid hex string of at least 56
     // characters, a `Warning::Utf8ByteArrayIsValidHexString` is pushed onto
     // `self.environment.warnings`. The visible body only ever returns `Ok`.
+    fn infer_bytearray(
-        TypedExpr::ByteArray {
+        &mut self,
        bytes: Vec<u8>,
        preferred_format: ByteArrayFormatPreference,
        location: Span,
    ) -> Result<TypedExpr, Error> {
        // Only literals the user wrote in double-quoted (UTF-8) form are inspected.
        if let ByteArrayFormatPreference::Utf8String = preferred_format {
            // NOTE(review): `unwrap` panics if `bytes` is not valid UTF-8;
            // presumably guaranteed by the parser for `Utf8String` literals — confirm.
            let value = String::from_utf8(bytes.clone()).unwrap();
            // NOTE(review): the decode runs for every UTF-8 literal, before the
            // cheaper length test below; consider checking the length first.
            let is_hex_string = hex::decode(&value).is_ok();
            // 56 hex characters encode 28 bytes — presumably the size of the
            // hash digests this warning targets (e.g. policy ids); TODO confirm.
            if bytes.len() >= 56 && is_hex_string {
                self.environment
                    .warnings
                    .push(Warning::Utf8ByteArrayIsValidHexString { location, value });
            }
        }
        // The literal is typed the same way whether or not a warning was raised.
        Ok(TypedExpr::ByteArray {
            location,
            bytes,
            tipo: byte_array(),
-        }
+        })
    }
    fn infer_trace_if_false(
@ -1357,11 +1374,14 @@ impl<'a, 'b> ExprTyper<'a, 'b> {
                location,
                bytes,
                preferred_format,
-            } => Ok(Constant::ByteArray {
+            } => {
                let _ = self.infer_bytearray(bytes.clone(), preferred_format, location)?;
                Ok(Constant::ByteArray {
                    location,
                    bytes,
                    preferred_format,
-            }),
+                })
            }
        }?;
        // Check type annotation is accurate.