Merge pull request #1093 from aiken-lang/benchmarks-wrapup

Wrapping-up benchmarks
Matthias Benkort 2025-02-09 17:04:39 +01:00 committed by GitHub
commit 94246bdb2b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
16 changed files with 836 additions and 606 deletions

View File

@ -6,7 +6,6 @@ on:
jobs:
nix-build:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v3
@ -20,5 +19,14 @@ jobs:
uses: DeterminateSystems/magic-nix-cache-action@v1
- name: Build Aiken
run: nix build
shell: bash
run: |
set +e
nix build
exitcode="$?"
if [[ "$exitcode" != "0" ]] ; then
echo "::warning::Nix build failed with exit code $exitcode"
exit 0
else
exit "$exitcode"
fi

View File

@ -2,9 +2,21 @@
## v1.1.11 - UNRELEASED
### Added
- **aiken**: New `aiken bench` command to run benchmarks. @Riley-Kilgore, @KtorZ
The command is very similar to `aiken check`: it collects and runs the benchmarks found across the codebase. By default, the output is a set of pretty terminal plots for each dimension (mem & cpu) of each benchmark. The complete dataset of points can be obtained in a structured (JSON) format by redirecting the output to a file.
- **aiken-lang**: New `bench` keyword and capabilities to the test framework. @Riley-Kilgore, @KtorZ
A `bench` is a new type of test that takes a single `Sampler<a> = fn(Int) -> Fuzzer<a>` as parameter, similar to how property-based tests receive a `Fuzzer<a>`. A `Sampler` is, in fact, a _scaled Fuzzer_ which receives a monotonically increasing size as parameter. This allows fine-grained control over generated values. Unlike tests, benchmarks can return _anything_, since their output is ignored; see the sketch below.
Read more about benchmarks in the [user manual](https://aiken-lang.org/language-tour/bench).
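For illustration, a minimal sketch of a benchmark driven by a sampler. The `aiken/fuzz` helpers (`int`, `list_between`) and the `quicksort` function under test are assumptions made for the example, not part of this change:

```aiken
use aiken/fuzz

// A Sampler<List<Int>> is just a function from a size to a Fuzzer.
// The size, monotonically increasing, bounds the length of the generated list.
fn sample_list(size: Int) -> Fuzzer<List<Int>> {
  fuzz.list_between(fuzz.int(), 0, size)
}

// Benchmarks use `via` like property tests, but receive a Sampler instead of a Fuzzer.
// The returned value is ignored, so the benchmark may return anything.
bench quicksort_scaling(xs via sample_list) {
  quicksort(xs)
}
```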
### Changed
- **aiken**: support for `bench` keyword to define benchmarks. @Riley-Kilgore
- **aiken-lang**: The compiler now raises a warning when attempting to destructure a record constructor without using named fields. See [#1084](https://github.com/aiken-lang/aiken/issues/1084). @KtorZ
- **aiken-lang**: Fix blueprint schema definitions related to pairs (no longer occasionally omit Pairs definitions, and generate them as data List). See [#1086](https://github.com/aiken-lang/aiken/issues/1086) and [#970](https://github.com/aiken-lang/aiken/issues/970). @KtorZ
@ -16,34 +28,6 @@
- **aiken-lang**: `write_bits` can now be used from aiken/builtins. @Microproofs
### Changed
- **aiken-project**: The `aiken.toml` file no longer supports `v1` and `v2` for the plutus version field. @rvcas
- **aiken-project**: `Error::TomlLoading` now looks much better - [see](https://github.com/aiken-lang/aiken/issues/1032#issuecomment-2562122101). @rvcas
- **aiken-lang**: 10-20% optimization improvements via case-constr, rearranging function definitions (while maintaining dependency ordering),
and allowing inlining in if_then_else_error cases which preserve the same error semantics for a program. @Microproofs
### Fixed
- **aiken**: panic error when using `aiken uplc decode` on cbor encoded flat bytes. @rvcas
- **aiken-lang**: comment formatting in pipelines leading to confusion. @rvcas
- **aiken-lang**: preserve holes discard name in function captures (see [#1080](https://github.com/aiken-lang/aiken/issues/1080)). @KtorZ
- **uplc**: Added deserialization match for the new builtin indices.
## v1.1.11 - UNRELEASED
### Added
- **aiken**: support for `bench` keyword to define benchmarks. @Riley-Kilgore
## v1.1.10 - 2025-01-21
### Added
- **aiken-project**: `export` output now supports the function's `return_type`. @rvcas
- **aiken-lang**: `write_bits` can now be used from aiken/builtins. @Microproofs
### Changed
- **aiken-project**: The `aiken.toml` file no longer supports `v1` and `v2` for the plutus version field. @rvcas

Cargo.lock (generated, vendored): 439 changes

File diff suppressed because it is too large

View File

@ -28,6 +28,12 @@ use uplc::{
};
use vec1::{vec1, Vec1};
#[derive(Debug, Clone, Copy)]
pub enum RunnableKind {
Test,
Bench,
}
/// ----- Test -----------------------------------------------------------------
///
/// Aiken supports two kinds of tests: unit and property. A unit test is simply a
@ -117,15 +123,15 @@ impl Test {
})
}
pub fn from_test_definition(
pub fn from_function_definition(
generator: &mut CodeGenerator<'_>,
test: TypedTest,
module_name: String,
input_path: PathBuf,
is_benchmark: bool,
kind: RunnableKind,
) -> Test {
if test.arguments.is_empty() {
if is_benchmark {
if matches!(kind, RunnableKind::Bench) {
unreachable!("benchmark must have at least one argument");
} else {
Self::unit_test(generator, test, module_name, input_path)
@ -153,8 +159,8 @@ impl Test {
// apply onto it later.
let generator_program = generator.clone().generate_raw(&via, &[], &module_name);
if is_benchmark {
Test::Benchmark(Benchmark {
match kind {
RunnableKind::Bench => Test::Benchmark(Benchmark {
input_path,
module: module_name,
name: test.name,
@ -165,9 +171,8 @@ impl Test {
type_info,
stripped_type_info,
},
})
} else {
Self::property_test(
}),
RunnableKind::Test => Self::property_test(
input_path,
module_name,
test.name,
@ -178,27 +183,26 @@ impl Test {
stripped_type_info,
type_info,
},
)
),
}
}
}
pub fn from_benchmark_definition(
generator: &mut CodeGenerator<'_>,
test: TypedTest,
module_name: String,
input_path: PathBuf,
) -> Test {
Self::from_test_definition(generator, test, module_name, input_path, true)
}
pub fn from_function_definition(
generator: &mut CodeGenerator<'_>,
test: TypedTest,
module_name: String,
input_path: PathBuf,
) -> Test {
Self::from_test_definition(generator, test, module_name, input_path, false)
pub fn run(
self,
seed: u32,
max_success: usize,
plutus_version: &PlutusVersion,
) -> TestResult<(Constant, Rc<Type>), PlutusData> {
match self {
Test::UnitTest(unit_test) => TestResult::UnitTestResult(unit_test.run(plutus_version)),
Test::PropertyTest(property_test) => {
TestResult::PropertyTestResult(property_test.run(seed, max_success, plutus_version))
}
Test::Benchmark(benchmark) => {
TestResult::BenchmarkResult(benchmark.run(seed, max_success, plutus_version))
}
}
}
}
@ -217,7 +221,7 @@ pub struct UnitTest {
unsafe impl Send for UnitTest {}
impl UnitTest {
pub fn run<T>(self, plutus_version: &PlutusVersion) -> TestResult<(Constant, Rc<Type>), T> {
pub fn run(self, plutus_version: &PlutusVersion) -> UnitTestResult<(Constant, Rc<Type>)> {
let mut eval_result = Program::<NamedDeBruijn>::try_from(self.program.clone())
.unwrap()
.eval_version(ExBudget::max(), &plutus_version.into());
@ -233,13 +237,13 @@ impl UnitTest {
}
traces.extend(eval_result.logs());
TestResult::UnitTestResult(UnitTestResult {
UnitTestResult {
success,
test: self.to_owned(),
spent_budget: eval_result.cost(),
traces,
assertion: self.assertion,
})
}
}
}
@ -270,7 +274,7 @@ pub struct Fuzzer<T> {
}
#[derive(Debug, Clone, thiserror::Error, miette::Diagnostic)]
#[error("Fuzzer exited unexpectedly: {uplc_error}")]
#[error("Fuzzer exited unexpectedly: {uplc_error}.")]
pub struct FuzzerError {
traces: Vec<String>,
uplc_error: uplc::machine::Error,
@ -317,12 +321,12 @@ impl PropertyTest {
/// Run a property test from a given seed. The property is run at most DEFAULT_MAX_SUCCESS times. It
/// may stop earlier on failure, in which case a 'counterexample' is returned.
pub fn run<U>(
pub fn run(
self,
seed: u32,
n: usize,
plutus_version: &PlutusVersion,
) -> TestResult<U, PlutusData> {
) -> PropertyTestResult<PlutusData> {
let mut labels = BTreeMap::new();
let mut remaining = n;
@ -352,13 +356,13 @@ impl PropertyTest {
),
};
TestResult::PropertyTestResult(PropertyTestResult {
PropertyTestResult {
test: self,
counterexample,
iterations,
labels,
traces,
})
}
}
pub fn run_n_times<'a>(
@ -372,9 +376,7 @@ impl PropertyTest {
let mut counterexample = None;
while *remaining > 0 && counterexample.is_none() {
let (next_prng, cex) = self.run_once(prng, labels, plutus_version)?;
prng = next_prng;
counterexample = cex;
(prng, counterexample) = self.run_once(prng, labels, plutus_version)?;
*remaining -= 1;
}
@ -492,6 +494,29 @@ pub struct Sampler<T> {
pub stripped_type_info: Rc<Type>,
}
#[derive(Debug, Clone, thiserror::Error, miette::Diagnostic)]
pub enum BenchmarkError {
#[error("Sampler exited unexpectedly: {uplc_error}.")]
SamplerError {
traces: Vec<String>,
uplc_error: uplc::machine::Error,
},
#[error("Bench exited unexpectedly: {uplc_error}.")]
BenchError {
traces: Vec<String>,
uplc_error: uplc::machine::Error,
},
}
impl BenchmarkError {
pub fn traces(&self) -> &[String] {
match self {
BenchmarkError::SamplerError { traces, .. }
| BenchmarkError::BenchError { traces, .. } => traces.as_slice(),
}
}
}
#[derive(Debug, Clone)]
pub struct Benchmark {
pub input_path: PathBuf,
@ -505,50 +530,61 @@ pub struct Benchmark {
unsafe impl Send for Benchmark {}
impl Benchmark {
pub fn benchmark(
pub const DEFAULT_MAX_SIZE: usize = 30;
pub fn run(
self,
seed: u32,
max_iterations: usize,
max_size: usize,
plutus_version: &PlutusVersion,
) -> Vec<BenchmarkResult> {
let mut results = Vec::with_capacity(max_iterations);
let mut iteration = 0;
) -> BenchmarkResult {
let mut measures = Vec::with_capacity(max_size);
let mut prng = Prng::from_seed(seed);
let mut error = None;
let mut size = 0;
while max_iterations > iteration {
while error.is_none() && max_size >= size {
let fuzzer = self
.sampler
.program
.apply_data(Data::integer(num_bigint::BigInt::from(iteration as i64)));
.apply_term(&Term::Constant(Constant::Integer(size.into()).into()));
match prng.sample(&fuzzer) {
Ok(None) => {
panic!("A seeded PRNG returned 'None' which indicates a sampler is ill-formed and implemented wrongly; please contact library's authors.");
}
Ok(Some((new_prng, value))) => {
prng = new_prng;
let mut eval_result = self.eval(&value, plutus_version);
results.push(BenchmarkResult {
test: self.clone(),
cost: eval_result.cost(),
success: true,
traces: eval_result.logs().to_vec(),
});
let mut result = self.eval(&value, plutus_version);
match result.result() {
Ok(_) => measures.push((size, result.cost())),
Err(uplc_error) => {
error = Some(BenchmarkError::BenchError {
traces: result
.logs()
.into_iter()
.filter(|s| PropertyTest::extract_label(s).is_none())
.collect(),
uplc_error,
});
}
}
}
Ok(None) => {
break;
}
Err(e) => {
results.push(BenchmarkResult {
test: self.clone(),
cost: ExBudget::default(),
success: false,
traces: vec![format!("Fuzzer error: {}", e)],
});
break;
Err(FuzzerError { traces, uplc_error }) => {
error = Some(BenchmarkError::SamplerError { traces, uplc_error });
}
}
iteration += 1;
size += 1;
}
results
BenchmarkResult {
bench: self,
measures,
error,
}
}
pub fn eval(&self, value: &PlutusData, plutus_version: &PlutusVersion) -> EvalResult {
@ -650,7 +686,6 @@ impl Prng {
pub fn sample(
&self,
fuzzer: &Program<Name>,
// iteration: usize,
) -> Result<Option<(Prng, PlutusData)>, FuzzerError> {
let program = Program::<NamedDeBruijn>::try_from(fuzzer.apply_data(self.uplc())).unwrap();
let mut result = program.eval(ExBudget::max());
@ -1069,7 +1104,7 @@ where
pub enum TestResult<U, T> {
UnitTestResult(UnitTestResult<U>),
PropertyTestResult(PropertyTestResult<T>),
Benchmark(BenchmarkResult),
BenchmarkResult(BenchmarkResult),
}
unsafe impl<U, T> Send for TestResult<U, T> {}
@ -1084,7 +1119,7 @@ impl TestResult<(Constant, Rc<Type>), PlutusData> {
TestResult::PropertyTestResult(test) => {
TestResult::PropertyTestResult(test.reify(data_types))
}
TestResult::Benchmark(result) => TestResult::Benchmark(result),
TestResult::BenchmarkResult(result) => TestResult::BenchmarkResult(result),
}
}
}
@ -1107,7 +1142,7 @@ impl<U, T> TestResult<U, T> {
}
OnTestFailure::SucceedImmediately => counterexample.is_some(),
},
TestResult::Benchmark(BenchmarkResult { success, .. }) => *success,
TestResult::BenchmarkResult(BenchmarkResult { error, .. }) => error.is_none(),
}
}
@ -1117,7 +1152,7 @@ impl<U, T> TestResult<U, T> {
TestResult::PropertyTestResult(PropertyTestResult { ref test, .. }) => {
test.module.as_str()
}
TestResult::Benchmark(BenchmarkResult { ref test, .. }) => test.module.as_str(),
TestResult::BenchmarkResult(BenchmarkResult { ref bench, .. }) => bench.module.as_str(),
}
}
@ -1127,7 +1162,7 @@ impl<U, T> TestResult<U, T> {
TestResult::PropertyTestResult(PropertyTestResult { ref test, .. }) => {
test.name.as_str()
}
TestResult::Benchmark(BenchmarkResult { ref test, .. }) => test.name.as_str(),
TestResult::BenchmarkResult(BenchmarkResult { ref bench, .. }) => bench.name.as_str(),
}
}
@ -1135,7 +1170,9 @@ impl<U, T> TestResult<U, T> {
match self {
TestResult::UnitTestResult(UnitTestResult { traces, .. })
| TestResult::PropertyTestResult(PropertyTestResult { traces, .. }) => traces,
TestResult::Benchmark(BenchmarkResult { traces, .. }) => traces,
TestResult::BenchmarkResult(BenchmarkResult { error, .. }) => {
error.as_ref().map(|e| e.traces()).unwrap_or_default()
}
}
}
}
@ -1473,10 +1510,9 @@ impl Assertion<UntypedExpr> {
#[derive(Debug, Clone)]
pub struct BenchmarkResult {
pub test: Benchmark,
pub cost: ExBudget,
pub success: bool,
pub traces: Vec<String>,
pub bench: Benchmark,
pub measures: Vec<(usize, ExBudget)>,
pub error: Option<BenchmarkError>,
}
unsafe impl Send for BenchmarkResult {}

View File

@ -317,6 +317,14 @@ You can use '{discard}' and numbers to distinguish between similar names.
location: Span,
},
#[error("I notice a benchmark definition without any argument.\n")]
#[diagnostic(url("https://aiken-lang.org/language-tour/bench"))]
#[diagnostic(code("arity::bench"))]
IncorrectBenchmarkArity {
#[label("must have exactly one argument")]
location: Span,
},
#[error(
"I saw {} field{} in a context where there should be {}.\n",
given.if_supports_color(Stdout, |s| s.purple()),
@ -1158,6 +1166,7 @@ impl ExtraData for Error {
| Error::UnknownPurpose { .. }
| Error::UnknownValidatorHandler { .. }
| Error::UnexpectedValidatorFallback { .. }
| Error::IncorrectBenchmarkArity { .. }
| Error::MustInferFirst { .. } => None,
Error::UnknownType { name, .. }

View File

@ -12,7 +12,8 @@ use crate::{
TypedDefinition, TypedModule, TypedValidator, UntypedArg, UntypedDefinition, UntypedModule,
UntypedPattern, UntypedValidator, Use, Validator,
},
expr::{TypedExpr, UntypedAssignmentKind},
expr::{TypedExpr, UntypedAssignmentKind, UntypedExpr},
parser::token::Token,
tipo::{expr::infer_function, Span, Type, TypeVar},
IdGenerator,
};
@ -347,67 +348,8 @@ fn infer_definition(
});
}
let typed_via = ExprTyper::new(environment, tracing).infer(arg.via.clone())?;
let hydrator: &mut Hydrator = hydrators.get_mut(&f.name).unwrap();
let provided_inner_type = arg
.arg
.annotation
.as_ref()
.map(|ann| hydrator.type_from_annotation(ann, environment))
.transpose()?;
let (inferred_annotation, inferred_inner_type) = infer_fuzzer(
environment,
provided_inner_type.clone(),
&typed_via.tipo(),
&arg.via.location(),
)?;
// Ensure that the annotation, if any, matches the type inferred from the
// Fuzzer.
if let Some(provided_inner_type) = provided_inner_type {
if !arg
.arg
.annotation
.as_ref()
.unwrap()
.is_logically_equal(&inferred_annotation)
{
return Err(Error::CouldNotUnify {
location: arg.arg.location,
expected: inferred_inner_type.clone(),
given: provided_inner_type.clone(),
situation: Some(UnifyErrorSituation::FuzzerAnnotationMismatch),
rigid_type_names: hydrator.rigid_names(),
});
}
}
// Replace the pre-registered type for the test function, to allow inferring
// the function body with the right type arguments.
let scope = environment
.scope
.get_mut(&f.name)
.expect("Could not find preregistered type for test");
if let Type::Fn {
ref ret,
ref alias,
args: _,
} = scope.tipo.as_ref()
{
scope.tipo = Rc::new(Type::Fn {
ret: ret.clone(),
args: vec![inferred_inner_type.clone()],
alias: alias.clone(),
})
}
Ok((
Some((typed_via, inferred_inner_type)),
Some(inferred_annotation),
))
extract_via_information(&f, arg, hydrators, environment, tracing, infer_fuzzer)
.map(|(typed_via, annotation)| (Some(typed_via), Some(annotation)))
}
None => Ok((None, None)),
}?;
@ -466,130 +408,50 @@ fn infer_definition(
}
Definition::Benchmark(f) => {
let err_incorrect_arity = || {
Err(Error::IncorrectBenchmarkArity {
location: f
.location
.map(|start, end| (start + Token::Benchmark.to_string().len() + 1, end)),
})
};
let (typed_via, annotation) = match f.arguments.first() {
None => return err_incorrect_arity(),
Some(arg) => {
if f.arguments.len() > 1 {
return Err(Error::IncorrectTestArity {
count: f.arguments.len(),
location: f
.arguments
.get(1)
.expect("arguments.len() > 1")
.arg
.location,
});
return err_incorrect_arity();
}
let typed_via = ExprTyper::new(environment, tracing).infer(arg.via.clone())?;
let hydrator: &mut Hydrator = hydrators.get_mut(&f.name).unwrap();
let provided_inner_type = arg
.arg
.annotation
.as_ref()
.map(|ann| hydrator.type_from_annotation(ann, environment))
.transpose()?;
let (inferred_annotation, inferred_inner_type) = infer_sampler(
environment,
provided_inner_type.clone(),
&typed_via.tipo(),
&arg.via.location(),
)?;
// Ensure that the annotation, if any, matches the type inferred from the
// Sampler.
if let Some(provided_inner_type) = provided_inner_type {
if !arg
.arg
.annotation
.as_ref()
.unwrap()
.is_logically_equal(&inferred_annotation)
{
return Err(Error::CouldNotUnify {
location: arg.arg.location,
expected: inferred_inner_type.clone(),
given: provided_inner_type.clone(),
situation: Some(UnifyErrorSituation::SamplerAnnotationMismatch),
rigid_type_names: hydrator.rigid_names(),
});
}
}
// Replace the pre-registered type for the benchmark function, to allow inferring
// the function body with the right type arguments.
let scope = environment
.scope
.get_mut(&f.name)
.expect("Could not find preregistered type for benchmark");
if let Type::Fn {
ref ret,
ref alias,
args: _,
} = scope.tipo.as_ref()
{
scope.tipo = Rc::new(Type::Fn {
ret: ret.clone(),
args: vec![inferred_inner_type.clone()],
alias: alias.clone(),
})
}
Ok((
Some((typed_via, inferred_inner_type)),
Some(inferred_annotation),
))
extract_via_information(&f, arg, hydrators, environment, tracing, infer_sampler)
}
None => Ok((None, None)),
}?;
let typed_f = infer_function(&f.into(), module_name, hydrators, environment, tracing)?;
let is_bool = environment.unify(
typed_f.return_type.clone(),
Type::bool(),
typed_f.location,
false,
);
let arguments = {
let arg = typed_f
.arguments
.first()
.expect("has exactly one argument")
.to_owned();
let is_void = environment.unify(
typed_f.return_type.clone(),
Type::void(),
typed_f.location,
false,
);
if is_bool.or(is_void).is_err() {
return Err(Error::IllegalTestType {
location: typed_f.location,
});
}
vec![ArgVia {
arg: TypedArg {
tipo: typed_via.1,
annotation: Some(annotation),
..arg
},
via: typed_via.0,
}]
};
Ok(Definition::Benchmark(Function {
doc: typed_f.doc,
location: typed_f.location,
name: typed_f.name,
public: typed_f.public,
arguments: match typed_via {
Some((via, tipo)) => {
let arg = typed_f
.arguments
.first()
.expect("has exactly one argument")
.to_owned();
vec![ArgVia {
arg: TypedArg {
tipo,
annotation,
..arg
},
via,
}]
}
None => vec![],
},
arguments,
return_annotation: typed_f.return_annotation,
return_type: typed_f.return_type,
body: typed_f.body,
@ -823,6 +685,83 @@ fn infer_definition(
}
}
#[allow(clippy::result_large_err)]
fn extract_via_information<F>(
f: &Function<(), UntypedExpr, ArgVia<UntypedArg, UntypedExpr>>,
arg: &ArgVia<UntypedArg, UntypedExpr>,
hydrators: &mut HashMap<String, Hydrator>,
environment: &mut Environment<'_>,
tracing: Tracing,
infer_via: F,
) -> Result<((TypedExpr, Rc<Type>), Annotation), Error>
where
F: FnOnce(
&mut Environment<'_>,
Option<Rc<Type>>,
&Rc<Type>,
&Span,
) -> Result<(Annotation, Rc<Type>), Error>,
{
let typed_via = ExprTyper::new(environment, tracing).infer(arg.via.clone())?;
let hydrator: &mut Hydrator = hydrators.get_mut(&f.name).unwrap();
let provided_inner_type = arg
.arg
.annotation
.as_ref()
.map(|ann| hydrator.type_from_annotation(ann, environment))
.transpose()?;
let (inferred_annotation, inferred_inner_type) = infer_via(
environment,
provided_inner_type.clone(),
&typed_via.tipo(),
&arg.via.location(),
)?;
// Ensure that the annotation, if any, matches the type inferred from the
// Fuzzer.
if let Some(provided_inner_type) = provided_inner_type {
if !arg
.arg
.annotation
.as_ref()
.unwrap()
.is_logically_equal(&inferred_annotation)
{
return Err(Error::CouldNotUnify {
location: arg.arg.location,
expected: inferred_inner_type.clone(),
given: provided_inner_type.clone(),
situation: Some(UnifyErrorSituation::FuzzerAnnotationMismatch),
rigid_type_names: hydrator.rigid_names(),
});
}
}
// Replace the pre-registered type for the test function, to allow inferring
// the function body with the right type arguments.
let scope = environment
.scope
.get_mut(&f.name)
.expect("Could not find preregistered type for test");
if let Type::Fn {
ref ret,
ref alias,
args: _,
} = scope.tipo.as_ref()
{
scope.tipo = Rc::new(Type::Fn {
ret: ret.clone(),
args: vec![inferred_inner_type.clone()],
alias: alias.clone(),
})
}
Ok(((typed_via, inferred_inner_type), inferred_annotation))
}
#[allow(clippy::result_large_err)]
fn infer_fuzzer(
environment: &mut Environment<'_>,

View File

@ -685,6 +685,7 @@ mod tests {
}),
"Identity<fn(Bool) -> Bool>",
);
assert_string!(Type::sampler(Type::int()), "Sampler<Int>");
}
#[test]

View File

@ -11,7 +11,7 @@ authors = [
"Kasey White <kwhitemsg@gmail.com>",
"KtorZ <matthias.benkort@gmail.com>",
]
rust-version = "1.70.0"
rust-version = "1.80.0"
build = "build.rs"
[dependencies]
@ -42,10 +42,12 @@ pulldown-cmark = { version = "0.12.0", default-features = false, features = [
rayon = "1.7.0"
regex = "1.7.1"
reqwest = { version = "0.11.14", features = ["blocking", "json"] }
rgb = "0.8.50"
semver = { version = "1.0.23", features = ["serde"] }
serde = { version = "1.0.152", features = ["derive"] }
serde_json = { version = "1.0.94", features = ["preserve_order"] }
strip-ansi-escapes = "0.1.1"
textplots = { git = "https://github.com/aiken-lang/textplots-rs.git" }
thiserror = "1.0.39"
tokio = { version = "1.26.0", features = ["full"] }
toml = "0.7.2"

View File

@ -3,7 +3,7 @@ use aiken_lang::{
ast::{self, Span},
error::ExtraData,
parser::error::ParseError,
test_framework::{PropertyTestResult, TestResult, UnitTestResult},
test_framework::{BenchmarkResult, PropertyTestResult, TestResult, UnitTestResult},
tipo,
};
use miette::{
@ -193,7 +193,11 @@ impl Error {
test.input_path.to_path_buf(),
test.program.to_pretty(),
),
TestResult::Benchmark(_) => ("bench".to_string(), PathBuf::new(), String::new()), // todo
TestResult::BenchmarkResult(BenchmarkResult { bench, .. }) => (
bench.name.to_string(),
bench.input_path.to_path_buf(),
bench.program.to_pretty(),
),
};
Error::TestFailure {

View File

@ -40,7 +40,7 @@ use aiken_lang::{
format::{Formatter, MAX_COLUMNS},
gen_uplc::CodeGenerator,
line_numbers::LineNumbers,
test_framework::{Test, TestResult},
test_framework::{RunnableKind, Test, TestResult},
tipo::{Type, TypeInfo},
utils, IdGenerator,
};
@ -299,20 +299,21 @@ where
pub fn benchmark(
&mut self,
match_tests: Option<Vec<String>>,
match_benchmarks: Option<Vec<String>>,
exact_match: bool,
seed: u32,
times_to_run: usize,
max_size: usize,
tracing: Tracing,
env: Option<String>,
) -> Result<(), Vec<Error>> {
let options = Options {
tracing: Tracing::silent(),
tracing,
env,
code_gen_mode: CodeGenMode::Benchmark {
match_tests,
match_benchmarks,
exact_match,
seed,
times_to_run,
max_size,
},
blueprint_path: self.blueprint_path(None),
};
@ -432,7 +433,7 @@ where
self.event_listener.handle_event(Event::RunningTests);
}
let tests = self.run_tests(tests, seed, property_max_success);
let tests = self.run_runnables(tests, seed, property_max_success);
self.checks_count = if tests.is_empty() {
None
@ -466,33 +467,39 @@ where
}
}
CodeGenMode::Benchmark {
match_tests,
match_benchmarks,
exact_match,
seed,
times_to_run,
max_size,
} => {
let tests =
self.collect_benchmarks(false, match_tests, exact_match, options.tracing)?;
let verbose = false;
if !tests.is_empty() {
let benchmarks = self.collect_benchmarks(
verbose,
match_benchmarks,
exact_match,
options.tracing,
)?;
if !benchmarks.is_empty() {
self.event_listener.handle_event(Event::RunningBenchmarks);
}
let tests = self.run_benchmarks(tests, seed, times_to_run);
let benchmarks = self.run_runnables(benchmarks, seed, max_size);
let errors: Vec<Error> = tests
let errors: Vec<Error> = benchmarks
.iter()
.filter_map(|e| {
if e.is_success() {
None
} else {
Some(Error::from_test_result(e, false))
Some(Error::from_test_result(e, verbose))
}
})
.collect();
self.event_listener
.handle_event(Event::FinishedBenchmarks { seed, tests });
.handle_event(Event::FinishedBenchmarks { seed, benchmarks });
if !errors.is_empty() {
Err(errors)
@ -954,7 +961,7 @@ where
fn collect_test_items(
&mut self,
kind: &str, // "test" or "bench"
kind: RunnableKind,
verbose: bool,
match_tests: Option<Vec<String>>,
exact_match: bool,
@ -993,8 +1000,8 @@ where
for def in checked_module.ast.definitions() {
let func = match (kind, def) {
("test", Definition::Test(func)) => Some(func),
("bench", Definition::Benchmark(func)) => Some(func),
(RunnableKind::Test, Definition::Test(func)) => Some(func),
(RunnableKind::Bench, Definition::Benchmark(func)) => Some(func),
_ => None,
};
@ -1048,21 +1055,13 @@ where
})
}
tests.push(match kind {
"test" => Test::from_function_definition(
&mut generator,
test.to_owned(),
module_name,
input_path,
),
"bench" => Test::from_benchmark_definition(
&mut generator,
test.to_owned(),
module_name,
input_path,
),
_ => unreachable!("Invalid test kind"),
});
tests.push(Test::from_function_definition(
&mut generator,
test.to_owned(),
module_name,
input_path,
kind,
));
}
Ok(tests)
@ -1075,7 +1074,13 @@ where
exact_match: bool,
tracing: Tracing,
) -> Result<Vec<Test>, Error> {
self.collect_test_items("test", verbose, match_tests, exact_match, tracing)
self.collect_test_items(
RunnableKind::Test,
verbose,
match_tests,
exact_match,
tracing,
)
}
fn collect_benchmarks(
@ -1085,14 +1090,20 @@ where
exact_match: bool,
tracing: Tracing,
) -> Result<Vec<Test>, Error> {
self.collect_test_items("bench", verbose, match_tests, exact_match, tracing)
self.collect_test_items(
RunnableKind::Bench,
verbose,
match_tests,
exact_match,
tracing,
)
}
fn run_tests(
fn run_runnables(
&self,
tests: Vec<Test>,
seed: u32,
property_max_success: usize,
max_success: usize,
) -> Vec<TestResult<UntypedExpr, UntypedExpr>> {
use rayon::prelude::*;
@ -1102,42 +1113,7 @@ where
tests
.into_par_iter()
.map(|test| match test {
Test::UnitTest(unit_test) => unit_test.run(plutus_version),
Test::PropertyTest(property_test) => {
property_test.run(seed, property_max_success, plutus_version)
}
Test::Benchmark(_) => unreachable!("Benchmarks cannot be run in PBT."),
})
.collect::<Vec<TestResult<(Constant, Rc<Type>), PlutusData>>>()
.into_iter()
.map(|test| test.reify(&data_types))
.collect()
}
fn run_benchmarks(
&self,
tests: Vec<Test>,
seed: u32,
property_max_success: usize,
) -> Vec<TestResult<UntypedExpr, UntypedExpr>> {
use rayon::prelude::*;
let data_types = utils::indexmap::as_ref_values(&self.data_types);
let plutus_version = &self.config.plutus;
tests
.into_par_iter()
.flat_map(|test| match test {
Test::UnitTest(_) | Test::PropertyTest(_) => {
unreachable!("Tests cannot be ran during benchmarking.")
}
Test::Benchmark(benchmark) => benchmark
.benchmark(seed, property_max_success, plutus_version)
.into_iter()
.map(TestResult::Benchmark)
.collect::<Vec<_>>(),
})
.map(|test| test.run(seed, max_success, plutus_version))
.collect::<Vec<TestResult<(Constant, Rc<Type>), PlutusData>>>()
.into_iter()
.map(|test| test.reify(&data_types))

View File

@ -30,10 +30,10 @@ pub enum CodeGenMode {
},
Build(bool),
Benchmark {
match_tests: Option<Vec<String>>,
match_benchmarks: Option<Vec<String>>,
exact_match: bool,
seed: u32,
times_to_run: usize,
max_size: usize,
},
NoOp,
}

View File

@ -1,6 +1,6 @@
use aiken_lang::{
expr::UntypedExpr,
test_framework::{PropertyTestResult, TestResult, UnitTestResult},
test_framework::{BenchmarkResult, PropertyTestResult, TestResult, UnitTestResult},
};
pub use json::{json_schema, Json};
use std::{
@ -10,6 +10,7 @@ use std::{
path::PathBuf,
};
pub use terminal::Terminal;
use uplc::machine::cost_model::ExBudget;
mod json;
mod terminal;
@ -50,7 +51,7 @@ pub enum Event {
},
FinishedBenchmarks {
seed: u32,
tests: Vec<TestResult<UntypedExpr, UntypedExpr>>,
benchmarks: Vec<TestResult<UntypedExpr, UntypedExpr>>,
},
WaitingForBuildDirLock,
ResolvingPackages {
@ -117,6 +118,18 @@ pub(crate) fn group_by_module(
}
pub(crate) fn find_max_execution_units<T>(xs: &[TestResult<T, T>]) -> (usize, usize, usize) {
fn max_execution_units(max_mem: i64, max_cpu: i64, cost: &ExBudget) -> (i64, i64) {
if cost.mem >= max_mem && cost.cpu >= max_cpu {
(cost.mem, cost.cpu)
} else if cost.mem > max_mem {
(cost.mem, max_cpu)
} else if cost.cpu > max_cpu {
(max_mem, cost.cpu)
} else {
(max_mem, max_cpu)
}
}
let (max_mem, max_cpu, max_iter) =
xs.iter()
.fold((0, 0, 0), |(max_mem, max_cpu, max_iter), test| match test {
@ -124,18 +137,15 @@ pub(crate) fn find_max_execution_units<T>(xs: &[TestResult<T, T>]) -> (usize, us
(max_mem, max_cpu, std::cmp::max(max_iter, *iterations))
}
TestResult::UnitTestResult(UnitTestResult { spent_budget, .. }) => {
if spent_budget.mem >= max_mem && spent_budget.cpu >= max_cpu {
(spent_budget.mem, spent_budget.cpu, max_iter)
} else if spent_budget.mem > max_mem {
(spent_budget.mem, max_cpu, max_iter)
} else if spent_budget.cpu > max_cpu {
(max_mem, spent_budget.cpu, max_iter)
} else {
(max_mem, max_cpu, max_iter)
}
let (max_mem, max_cpu) = max_execution_units(max_mem, max_cpu, spent_budget);
(max_mem, max_cpu, max_iter)
}
TestResult::Benchmark(..) => {
unreachable!("property returned benchmark result ?!")
TestResult::BenchmarkResult(BenchmarkResult { measures, .. }) => {
let (mut max_mem, mut max_cpu) = (max_mem, max_cpu);
for (_, measure) in measures {
(max_mem, max_cpu) = max_execution_units(max_mem, max_cpu, measure);
}
(max_mem, max_cpu, max_iter)
}
});

View File

@ -39,16 +39,22 @@ impl EventListener for Json {
});
println!("{}", serde_json::to_string_pretty(&json_output).unwrap());
}
Event::FinishedBenchmarks { tests, seed } => {
let benchmark_results: Vec<_> = tests
Event::FinishedBenchmarks { benchmarks, seed } => {
let benchmark_results: Vec<_> = benchmarks
.into_iter()
.filter_map(|test| {
if let TestResult::Benchmark(result) = test {
if let TestResult::BenchmarkResult(result) = test {
Some(serde_json::json!({
"name": result.test.name,
"module": result.test.module,
"memory": result.cost.mem,
"cpu": result.cost.cpu
"name": result.bench.name,
"module": result.bench.module,
"measures": result.measures
.into_iter()
.map(|measure| serde_json::json!({
"size": measure.0,
"memory": measure.1.mem,
"cpu": measure.1.cpu
}))
.collect::<Vec<_>>()
}))
} else {
None
@ -74,7 +80,7 @@ fn fmt_test_json(result: &TestResult<UntypedExpr, UntypedExpr>) -> serde_json::V
TestResult::PropertyTestResult(PropertyTestResult { ref test, .. }) => {
&test.on_test_failure
}
TestResult::Benchmark(_) => unreachable!("benchmark returned in JSON output"),
TestResult::BenchmarkResult(_) => unreachable!("benchmark returned in JSON output"),
};
let mut test = json!({
@ -120,7 +126,7 @@ fn fmt_test_json(result: &TestResult<UntypedExpr, UntypedExpr>) -> serde_json::V
Err(err) => json!({"error": err.to_string()}),
};
}
TestResult::Benchmark(_) => unreachable!("benchmark returned in JSON output"),
TestResult::BenchmarkResult(_) => unreachable!("benchmark returned in JSON output"),
}
if !result.traces().is_empty() {

View File

@ -4,11 +4,21 @@ use aiken_lang::{
ast::OnTestFailure,
expr::UntypedExpr,
format::Formatter,
test_framework::{AssertionStyleOptions, PropertyTestResult, TestResult, UnitTestResult},
test_framework::{
AssertionStyleOptions, BenchmarkResult, PropertyTestResult, TestResult, UnitTestResult,
},
};
use owo_colors::{OwoColorize, Stream::Stderr};
use rgb::RGB8;
use std::sync::LazyLock;
use uplc::machine::cost_model::ExBudget;
static BENCH_PLOT_COLOR: LazyLock<RGB8> = LazyLock::new(|| RGB8 {
r: 250,
g: 211,
b: 144,
});
#[derive(Debug, Default, Clone, Copy)]
pub struct Terminal;
@ -224,14 +234,47 @@ impl EventListener for Terminal {
"...".if_supports_color(Stderr, |s| s.bold())
);
}
Event::FinishedBenchmarks { tests, .. } => {
for test in tests {
if let TestResult::Benchmark(result) = test {
println!("{} {} ", result.test.name.bold(), "BENCH".blue(),);
println!(" Memory: {} bytes", result.cost.mem);
println!(" CPU: {} units", result.cost.cpu);
Event::FinishedBenchmarks { seed, benchmarks } => {
let (max_mem, max_cpu, max_iter) = find_max_execution_units(&benchmarks);
for (module, results) in &group_by_module(&benchmarks) {
let title = module
.if_supports_color(Stderr, |s| s.bold())
.if_supports_color(Stderr, |s| s.blue())
.to_string();
let benchmarks = results
.iter()
.map(|r| fmt_test(r, max_mem, max_cpu, max_iter, true))
.collect::<Vec<String>>()
.join("\n")
.chars()
.skip(1) // Remove extra first newline
.collect::<String>();
let seed_info = format!(
"with {opt}={seed}",
opt = "--seed".if_supports_color(Stderr, |s| s.bold()),
seed = format!("{seed}").if_supports_color(Stderr, |s| s.bold())
);
if !benchmarks.is_empty() {
println!();
}
println!(
"{}\n",
pretty::indent(
&pretty::open_box(&title, &benchmarks, &seed_info, |border| border
.if_supports_color(Stderr, |s| s.bright_black())
.to_string()),
4
)
);
}
if !benchmarks.is_empty() {
println!();
}
}
}
@ -246,7 +289,23 @@ fn fmt_test(
styled: bool,
) -> String {
// Status
let mut test = if result.is_success() {
let mut test = if matches!(result, TestResult::BenchmarkResult { .. }) {
format!(
"\n{label}{title}\n",
label = if result.is_success() {
String::new()
} else {
pretty::style_if(styled, "FAIL ".to_string(), |s| {
s.if_supports_color(Stderr, |s| s.bold())
.if_supports_color(Stderr, |s| s.red())
.to_string()
})
},
title = pretty::style_if(styled, result.title().to_string(), |s| s
.if_supports_color(Stderr, |s| s.bright_blue())
.to_string())
)
} else if result.is_success() {
pretty::style_if(styled, "PASS".to_string(), |s| {
s.if_supports_color(Stderr, |s| s.bold())
.if_supports_color(Stderr, |s| s.green())
@ -292,29 +351,76 @@ fn fmt_test(
if *iterations > 1 { "s" } else { "" }
);
}
TestResult::Benchmark(benchmark) => {
let mem_pad = pretty::pad_left(benchmark.cost.mem.to_string(), max_mem, " ");
let cpu_pad = pretty::pad_left(benchmark.cost.cpu.to_string(), max_cpu, " ");
TestResult::BenchmarkResult(BenchmarkResult { error: Some(e), .. }) => {
test = format!(
"{test} [mem: {mem_unit}, cpu: {cpu_unit}]",
mem_unit = pretty::style_if(styled, mem_pad, |s| s
.if_supports_color(Stderr, |s| s.cyan())
.to_string()),
cpu_unit = pretty::style_if(styled, cpu_pad, |s| s
.if_supports_color(Stderr, |s| s.cyan())
.to_string()),
"{test}{}",
e.to_string().if_supports_color(Stderr, |s| s.red())
);
}
TestResult::BenchmarkResult(BenchmarkResult {
measures,
error: None,
..
}) => {
let max_size = measures
.iter()
.map(|(size, _)| *size)
.max()
.unwrap_or_default();
let mem_chart = format!(
"{title}\n{chart}",
title = "memory units"
.if_supports_color(Stderr, |s| s.yellow())
.if_supports_color(Stderr, |s| s.bold()),
chart = plot(
&BENCH_PLOT_COLOR,
measures
.iter()
.map(|(size, budget)| (*size as f32, budget.mem as f32))
.collect::<Vec<_>>(),
max_size
)
);
let cpu_chart = format!(
"{title}\n{chart}",
title = "cpu units"
.if_supports_color(Stderr, |s| s.yellow())
.if_supports_color(Stderr, |s| s.bold()),
chart = plot(
&BENCH_PLOT_COLOR,
measures
.iter()
.map(|(size, budget)| (*size as f32, budget.cpu as f32))
.collect::<Vec<_>>(),
max_size
)
);
let charts = mem_chart
.lines()
.zip(cpu_chart.lines())
.map(|(l, r)| format!(" {}{r}", pretty::pad_right(l.to_string(), 55, " ")))
.collect::<Vec<_>>()
.join("\n");
test = format!("{test}{charts}",);
}
}
// Title
test = format!(
"{test} {title}",
title = pretty::style_if(styled, result.title().to_string(), |s| s
.if_supports_color(Stderr, |s| s.bright_blue())
.to_string())
);
test = match result {
TestResult::BenchmarkResult(..) => test,
TestResult::UnitTestResult(..) | TestResult::PropertyTestResult(..) => {
format!(
"{test} {title}",
title = pretty::style_if(styled, result.title().to_string(), |s| s
.if_supports_color(Stderr, |s| s.bright_blue())
.to_string())
)
}
};
// Annotations
match result {
@ -470,3 +576,14 @@ fn fmt_test_summary<T>(tests: &[&TestResult<T, T>], styled: bool) -> String {
.to_string()),
)
}
fn plot(color: &RGB8, points: Vec<(f32, f32)>, max_size: usize) -> String {
use textplots::{Chart, ColorPlot, Shape};
let mut chart = Chart::new(80, 50, 1.0, max_size as f32);
let plot = Shape::Lines(&points);
let chart = chart.linecolorplot(&plot, *color);
chart.borders();
chart.axis();
chart.figures();
chart.to_string()
}

View File

@ -101,6 +101,7 @@ mod test {
test.to_owned(),
module_name.to_string(),
PathBuf::new(),
RunnableKind::Test,
),
data_types,
)
@ -245,13 +246,12 @@ mod test {
}
"#});
assert!(prop
.run::<()>(
42,
PropertyTest::DEFAULT_MAX_SUCCESS,
&PlutusVersion::default()
)
.is_success());
assert!(TestResult::PropertyTestResult::<(), _>(prop.run(
42,
PropertyTest::DEFAULT_MAX_SUCCESS,
&PlutusVersion::default()
))
.is_success());
}
#[test]
@ -273,24 +273,20 @@ mod test {
}
"#});
match prop.run::<()>(
let result = prop.run(
42,
PropertyTest::DEFAULT_MAX_SUCCESS,
&PlutusVersion::default(),
) {
TestResult::UnitTestResult(..) => unreachable!("property returned unit-test result ?!"),
TestResult::PropertyTestResult(result) => {
assert!(
result
.labels
.iter()
.eq(vec![(&"head".to_string(), &53), (&"tail".to_string(), &47)]),
"labels: {:#?}",
result.labels
)
}
TestResult::Benchmark(..) => unreachable!("property returned benchmark result ?!"),
}
);
assert!(
result
.labels
.iter()
.eq(vec![(&"head".to_string(), &53), (&"tail".to_string(), &47)]),
"labels: {:#?}",
result.labels
);
}
#[test]

View File

@ -1,4 +1,8 @@
use aiken_lang::test_framework::PropertyTest;
use super::build::{trace_filter_parser, trace_level_parser};
use aiken_lang::{
ast::{TraceLevel, Tracing},
test_framework::Benchmark,
};
use aiken_project::watch::with_project;
use rand::prelude::*;
use std::{
@ -13,37 +17,69 @@ pub struct Args {
/// Path to project
directory: Option<PathBuf>,
/// An initial seed to initialize the pseudo-random generator for property-tests.
/// An initial seed to initialize the pseudo-random generator for benchmarks.
#[clap(long)]
seed: Option<u32>,
/// How many times we will run each benchmark in the relevant project.
#[clap(long, default_value_t = PropertyTest::DEFAULT_MAX_SUCCESS)]
times_to_run: usize,
/// The maximum size to benchmark with. Note that this does not necessarily equal the number
/// of measurements actually performed, but controls the maximum size given to a Sampler.
#[clap(long, default_value_t = Benchmark::DEFAULT_MAX_SIZE)]
max_size: usize,
/// Only run tests if they match any of these strings.
/// Only run benchmarks if they match any of these strings.
///
/// You can match a module with `-m aiken/list` or `-m list`.
/// You can match a test with `-m "aiken/list.{map}"` or `-m "aiken/option.{flatten_1}"`
/// You can match a benchmark with `-m "aiken/list.{map}"` or `-m "aiken/option.{flatten_1}"`
#[clap(short, long)]
match_tests: Option<Vec<String>>,
match_benchmarks: Option<Vec<String>>,
/// This is meant to be used with `--match-tests`.
/// It forces test names to match exactly
/// This is meant to be used with `--match-benchmarks`.
/// It forces benchmark names to match exactly
#[clap(short, long)]
exact_match: bool,
/// Environment to use for benchmarking
env: Option<String>,
/// Filter traces to be included in the generated program(s).
///
/// - user-defined:
/// only consider traces that you've explicitly defined
/// either through the 'trace' keyword or via the trace-if-false
/// ('?') operator.
///
/// - compiler-generated:
/// only include internal traces generated by the
/// Aiken compiler, for example in usage of 'expect'.
///
/// - all:
/// include both user-defined and compiler-generated traces.
///
/// [default: all]
#[clap(short = 'f', long, value_parser=trace_filter_parser(), default_missing_value="all", verbatim_doc_comment, alias="filter_traces")]
trace_filter: Option<fn(TraceLevel) -> Tracing>,
/// Choose the verbosity level of traces:
///
/// - silent: disable traces altogether
/// - compact: only culprit line numbers are shown on failures
/// - verbose: enable full verbose traces as provided by the user or the compiler
///
/// [optional]
#[clap(short, long, value_parser=trace_level_parser(), default_value_t=TraceLevel::Silent, verbatim_doc_comment)]
trace_level: TraceLevel,
}
pub fn exec(
Args {
directory,
match_tests,
match_benchmarks,
exact_match,
seed,
times_to_run,
max_size,
env,
trace_filter,
trace_level,
}: Args,
) -> miette::Result<()> {
let mut rng = rand::thread_rng();
@ -55,12 +91,15 @@ pub fn exec(
false,
!io::stdout().is_terminal(),
|p| {
// We don't want to check here, we want to benchmark
p.benchmark(
match_tests.clone(),
match_benchmarks.clone(),
exact_match,
seed,
times_to_run,
max_size,
match trace_filter {
Some(trace_filter) => trace_filter(trace_level),
None => Tracing::All(trace_level),
},
env.clone(),
)
},