From 78770d14b7fb965fbe9630107fdfeaac41d268d1 Mon Sep 17 00:00:00 2001
From: KtorZ <matthias.benkort@gmail.com>
Date: Sat, 18 Feb 2023 11:36:45 +0100
Subject: [PATCH] Emit warning when detecting a hex string interpreted as
 UTF-8 bytes.

  This will probably save people minutes/hours of puzzled debugging. This is only a warning because there may be cases where one actually does want to specify a hex-encoded bytearray, in which case they can get rid of the warning by using the plain bytearray syntax (i.e. as an array of bytes).
---
 crates/aiken-lang/src/tests/check.rs | 14 ++++++++
 crates/aiken-lang/src/tipo/error.rs  | 25 ++++++++++++++
 crates/aiken-lang/src/tipo/expr.rs   | 50 +++++++++++++++++++---------
 3 files changed, 74 insertions(+), 15 deletions(-)

diff --git a/crates/aiken-lang/src/tests/check.rs b/crates/aiken-lang/src/tests/check.rs
index 0872798b..40227927 100644
--- a/crates/aiken-lang/src/tests/check.rs
+++ b/crates/aiken-lang/src/tests/check.rs
@@ -305,3 +305,17 @@ fn trace_if_false_ko() {
         Err((_, Error::CouldNotUnify { .. }))
     ))
 }
+
+#[test]
+fn utf8_hex_literal_warning() {
+    // 56 hex characters in plain double quotes: read as UTF-8 bytes, which should warn.
+    let source_code = r#"
+        pub const policy_id = "f43a62fdc3965df486de8a0d32fe800963589c41b38946602a0dc535"
+    "#;
+    let (warnings, _) = check(parse(source_code)).unwrap();
+
+    assert!(matches!(
+        warnings.first(),
+        Some(Warning::Utf8ByteArrayIsValidHexString { .. })
+    ))
+}
diff --git a/crates/aiken-lang/src/tipo/error.rs b/crates/aiken-lang/src/tipo/error.rs
index 878aee77..87b3c6c7 100644
--- a/crates/aiken-lang/src/tipo/error.rs
+++ b/crates/aiken-lang/src/tipo/error.rs
@@ -1244,6 +1244,31 @@ pub enum Warning {
         #[label("unused")]
         location: Span,
     },
+
+    #[error(
+        "I noticed a suspicious {type_ByteArray} UTF-8 literal which resembles a hash digest.",
+        type_ByteArray = "ByteArray".bold().bright_blue()
+    )]
+    #[diagnostic(help("{}", formatdoc! {
+        r#"When you specify a {type_ByteArray} literal using plain double-quotes, it's interpreted as an array of UTF-8 bytes. For example, the literal {literal_foo} is interpreted as the byte sequence {foo_bytes}.
+
+           However here, you have specified a literal that resembles a hash digest encoded as an hexadecimal string. This is a common case, but you probably want to capture the raw bytes represented by this sequence, and not the hexadecimal sequence. Fear not! Aiken provides a convenient syntax for that: just prefix the literal with {symbol_hash}. This will decode the hexadecimal string for you and capture the non-encoded bytes as a {type_ByteArray}.
+
+           ╰─▶ {symbol_hash}{value}
+        "#,
+        type_ByteArray = "ByteArray".bold().bright_blue(),
+        literal_foo = "\"foo\"".purple(),
+        foo_bytes = "#[102, 111, 111]".purple(),
+        value = format!("\"{value}\"").purple(), // interpolate the offending literal, quoted
+        symbol_hash = "#".purple(),
+    }))]
+    #[diagnostic(code("syntax::bytearray_literal_is_hex_string"))]
+    #[diagnostic(url("https://aiken-lang.org/language-tour/primitive-types#bytearray"))]
+    Utf8ByteArrayIsValidHexString {
+        #[label("missing '#' to decode hex string")]
+        location: Span,
+        value: String,
+    },
 }
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
diff --git a/crates/aiken-lang/src/tipo/expr.rs b/crates/aiken-lang/src/tipo/expr.rs
index 0c388f5c..695ff653 100644
--- a/crates/aiken-lang/src/tipo/expr.rs
+++ b/crates/aiken-lang/src/tipo/expr.rs
@@ -4,11 +4,11 @@ use vec1::Vec1;
 
 use crate::{
     ast::{
-        Annotation, Arg, ArgName, AssignmentKind, BinOp, CallArg, Clause, ClauseGuard, Constant,
-        IfBranch, RecordUpdateSpread, Span, TraceKind, Tracing, TypedArg, TypedCallArg,
-        TypedClause, TypedClauseGuard, TypedIfBranch, TypedMultiPattern, TypedRecordUpdateArg,
-        UnOp, UntypedArg, UntypedClause, UntypedClauseGuard, UntypedIfBranch, UntypedMultiPattern,
-        UntypedPattern, UntypedRecordUpdateArg,
+        Annotation, Arg, ArgName, AssignmentKind, BinOp, ByteArrayFormatPreference, CallArg,
+        Clause, ClauseGuard, Constant, IfBranch, RecordUpdateSpread, Span, TraceKind, Tracing,
+        TypedArg, TypedCallArg, TypedClause, TypedClauseGuard, TypedIfBranch, TypedMultiPattern,
+        TypedRecordUpdateArg, UnOp, UntypedArg, UntypedClause, UntypedClauseGuard, UntypedIfBranch,
+        UntypedMultiPattern, UntypedPattern, UntypedRecordUpdateArg,
     },
     builtins::{bool, byte_array, function, int, list, string, tuple},
     expr::{TypedExpr, UntypedExpr},
@@ -351,8 +351,10 @@ impl<'a, 'b> ExprTyper<'a, 'b> {
             } => self.infer_tuple_index(*tuple, index, location),
 
             UntypedExpr::ByteArray {
-                location, bytes, ..
-            } => Ok(self.infer_byte_array(bytes, location)),
+                bytes,
+                preferred_format,
+                location,
+            } => self.infer_bytearray(bytes, preferred_format, location),
 
             UntypedExpr::RecordUpdate {
                 location,
@@ -373,12 +375,27 @@ impl<'a, 'b> ExprTyper<'a, 'b> {
         }
     }
 
-    fn infer_byte_array(&mut self, bytes: Vec<u8>, location: Span) -> TypedExpr {
-        TypedExpr::ByteArray {
+    /// Types a byte-array literal, warning when a UTF-8 literal is >= 56 chars of valid hex.
+    fn infer_bytearray(
+        &mut self,
+        bytes: Vec<u8>,
+        preferred_format: ByteArrayFormatPreference,
+        location: Span,
+    ) -> Result<TypedExpr, Error> {
+        if let ByteArrayFormatPreference::Utf8String = preferred_format {
+            let is_hex = bytes.len() % 2 == 0 && bytes.iter().all(u8::is_ascii_hexdigit);
+            if bytes.len() >= 56 && is_hex {
+                let value = String::from_utf8_lossy(&bytes).into_owned(); // all ASCII: lossless
+                self.environment
+                    .warnings
+                    .push(Warning::Utf8ByteArrayIsValidHexString { location, value });
+            }
+        }
+        Ok(TypedExpr::ByteArray {
             location,
             bytes,
             tipo: byte_array(),
-        }
+        })
     }
 
     fn infer_trace_if_false(
@@ -1357,11 +1374,14 @@ impl<'a, 'b> ExprTyper<'a, 'b> {
                 location,
                 bytes,
                 preferred_format,
-            } => Ok(Constant::ByteArray {
-                location,
-                bytes,
-                preferred_format,
-            }),
+            } => {
+                let _ = self.infer_bytearray(bytes.clone(), preferred_format, location)?;
+                Ok(Constant::ByteArray {
+                    location,
+                    bytes,
+                    preferred_format,
+                })
+            }
         }?;
 
         // Check type annotation is accurate.