Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 116 additions & 0 deletions crates/precompile/bench/blake2.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
use criterion::{black_box, BenchmarkGroup};
use primitives::hex;
use revm_precompile::blake2;

pub fn add_benches(group: &mut BenchmarkGroup<'_, criterion::measurement::WallTime>) {
// Test vectors from the blake2 test
let inputs = [
hex!("0000000248c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b61626300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000001"), // 2 rounds
hex!("0000000448c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b616162636465666768696a6b6c6d6e6f700000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000001"), // 4 rounds
hex!("0000004048c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b61626300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000001"), // 64 rounds
hex!("0000000a48c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b61626300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000001"), // 10 rounds (Blake2s standard)
hex!("0000000c48c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b61626300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000001"), // 12 rounds (Blake2b standard)
hex!("0000020048c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b61626300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000001"), // 512 rounds
hex!("0000040048c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b61626300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000001"), // 1024 rounds
hex!("000186a048c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b61626300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000001"), // 100000 rounds (100K)
hex!("00030d4048c9bdf267e6096a3ba7ca8485ae67bb2bf894fe72f36e3cf1361d5f3af54fa5d182e6ad7f520e511f6c3e2b8c68059b6bbd41fbabd9831f79217e1319cde05b61626300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000300000000000000000000000000000001"), // 200000 rounds (200K)
];

// Benchmark with 2 rounds
group.bench_function("blake2/2_rounds", |b| {
let input = &inputs[0]; // 2 rounds
b.iter(|| {
black_box(blake2::run(black_box(input), u64::MAX).unwrap());
});
});

// Benchmark with 4 rounds
group.bench_function("blake2/4_rounds", |b| {
let input = &inputs[1]; // 4 rounds
b.iter(|| {
black_box(blake2::run(black_box(input), u64::MAX).unwrap());
});
});

// Benchmark with 64 rounds
group.bench_function("blake2/64_rounds", |b| {
let input = &inputs[2]; // 64 rounds
b.iter(|| {
black_box(blake2::run(black_box(input), u64::MAX).unwrap());
});
});

// Benchmark with 10 rounds (Blake2s standard)
group.bench_function("blake2/10_rounds", |b| {
let input = &inputs[3]; // 10 rounds
b.iter(|| {
black_box(blake2::run(black_box(input), u64::MAX).unwrap());
});
});

// Benchmark with 12 rounds (Blake2b standard)
group.bench_function("blake2/12_rounds", |b| {
let input = &inputs[4]; // 12 rounds
b.iter(|| {
black_box(blake2::run(black_box(input), u64::MAX).unwrap());
});
});

// Benchmark with 512 rounds
group.bench_function("blake2/512_rounds", |b| {
let input = &inputs[5]; // 512 rounds
b.iter(|| {
black_box(blake2::run(black_box(input), u64::MAX).unwrap());
});
});

// Benchmark with 1024 rounds
group.bench_function("blake2/1024_rounds", |b| {
let input = &inputs[6]; // 1024 rounds
b.iter(|| {
black_box(blake2::run(black_box(input), u64::MAX).unwrap());
});
});

// Benchmark with 100K rounds
group.bench_function("blake2/100K_rounds", |b| {
let input = &inputs[7]; // 100000 rounds
b.iter(|| {
black_box(blake2::run(black_box(input), u64::MAX).unwrap());
});
});

// Benchmark with 200K rounds
group.bench_function("blake2/200K_rounds", |b| {
let input = &inputs[8]; // 200000 rounds
b.iter(|| {
black_box(blake2::run(black_box(input), u64::MAX).unwrap());
});
});

// Benchmark just the compression function with different round counts
group.bench_function("blake2/compress_12_rounds", |b| {
let h = [
0x6a09e667f3bcc908u64,
0xbb67ae8584caa73bu64,
0x3c6ef372fe94f82bu64,
0xa54ff53a5f1d36f1u64,
0x510e527fade682d1u64,
0x9b05688c2b3e6c1fu64,
0x1f83d9abfb41bd6bu64,
0x5be0cd19137e2179u64,
];
let m = [0u64; 16];
let t = [0u64, 0u64];
b.iter(|| {
let mut h_copy = h;
blake2::algo::compress(
black_box(12),
&mut h_copy,
black_box(m),
black_box(t),
black_box(false),
);
});
});
}
4 changes: 4 additions & 0 deletions crates/precompile/bench/main.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#![allow(missing_docs)]
//! Benchmarks for the crypto precompiles

pub mod blake2;
pub mod ecrecover;
pub mod eip1962;
pub mod eip2537;
Expand Down Expand Up @@ -31,6 +32,9 @@ pub fn benchmark_crypto_precompiles(c: &mut Criterion) {

// Run KZG point evaluation benchmarks
eip4844::add_benches(&mut group);

// Run Blake2 benchmarks
blake2::add_benches(&mut group);
}

criterion_group! {
Expand Down
110 changes: 49 additions & 61 deletions crates/precompile/src/blake2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,44 +16,43 @@ pub fn run(input: &[u8], gas_limit: u64) -> PrecompileResult {
return Err(PrecompileError::Blake2WrongLength);
}

// Rounds 4 bytes
// Parse number of rounds (4 bytes)
let rounds = u32::from_be_bytes(input[..4].try_into().unwrap()) as usize;
let gas_used = rounds as u64 * F_ROUND;
if gas_used > gas_limit {
return Err(PrecompileError::OutOfGas);
}

// Parse final block flag
let f = match input[212] {
1 => true,
0 => false,
1 => true,
_ => return Err(PrecompileError::Blake2WrongFinalIndicatorFlag),
};

// Parse state vector h (8 × u64)
let mut h = [0u64; 8];
//let mut m = [0u64; 16];

let t;
// Optimized parsing using ptr::read_unaligned for potentially better performance

let m;
unsafe {
let ptr = input.as_ptr();

// Read h values
for (i, item) in h.iter_mut().enumerate() {
*item = u64::from_le_bytes(core::ptr::read_unaligned(
ptr.add(4 + i * 8) as *const [u8; 8]
));
}

m = input[68..68 + 16 * size_of::<u64>()].try_into().unwrap();

t = [
u64::from_le_bytes(core::ptr::read_unaligned(ptr.add(196) as *const [u8; 8])),
u64::from_le_bytes(core::ptr::read_unaligned(ptr.add(204) as *const [u8; 8])),
];
}
algo::compress(rounds, &mut h, m, t, f);
input[4..68]
.chunks_exact(8)
.enumerate()
.for_each(|(i, chunk)| {
h[i] = u64::from_le_bytes(chunk.try_into().unwrap());
});

// Parse message block m (16 × u64)
let mut m = [0u64; 16];
input[68..196]
.chunks_exact(8)
.enumerate()
.for_each(|(i, chunk)| {
m[i] = u64::from_le_bytes(chunk.try_into().unwrap());
});

// Parse offset counters
let t_0 = u64::from_le_bytes(input[196..204].try_into().unwrap());
let t_1 = u64::from_le_bytes(input[204..212].try_into().unwrap());

algo::compress(rounds, &mut h, m, [t_0, t_1], f);

let mut out = [0u8; 64];
for (i, h) in (0..64).step_by(8).zip(h.iter()) {
Expand Down Expand Up @@ -94,22 +93,26 @@ pub mod algo {
#[inline(always)]
#[allow(clippy::many_single_char_names)]
/// G function: <https://tools.ietf.org/html/rfc7693#section-3.1>
pub fn g(v: &mut [u64], a: usize, b: usize, c: usize, d: usize, x: u64, y: u64) {
v[a] = v[a].wrapping_add(v[b]);
v[a] = v[a].wrapping_add(x);
v[d] ^= v[a];
v[d] = v[d].rotate_right(32);
v[c] = v[c].wrapping_add(v[d]);
v[b] ^= v[c];
v[b] = v[b].rotate_right(24);

v[a] = v[a].wrapping_add(v[b]);
v[a] = v[a].wrapping_add(y);
v[d] ^= v[a];
v[d] = v[d].rotate_right(16);
v[c] = v[c].wrapping_add(v[d]);
v[b] ^= v[c];
v[b] = v[b].rotate_right(63);
fn g(v: &mut [u64; 16], a: usize, b: usize, c: usize, d: usize, x: u64, y: u64) {
let mut va = v[a];
let mut vb = v[b];
let mut vc = v[c];
let mut vd = v[d];

va = va.wrapping_add(vb).wrapping_add(x);
vd = (vd ^ va).rotate_right(32);
vc = vc.wrapping_add(vd);
vb = (vb ^ vc).rotate_right(24);

va = va.wrapping_add(vb).wrapping_add(y);
vd = (vd ^ va).rotate_right(16);
vc = vc.wrapping_add(vd);
vb = (vb ^ vc).rotate_right(63);

v[a] = va;
v[b] = vb;
v[c] = vc;
v[d] = vd;
}

/// Compression function F takes as an argument the state vector "h",
Expand All @@ -119,15 +122,7 @@ pub mod algo {
/// returns a new state vector. The number of rounds, "r", is 12 for
/// BLAKE2b and 10 for BLAKE2s. Rounds are numbered from 0 to r - 1.
#[allow(clippy::many_single_char_names)]
pub fn compress(
rounds: usize,
h: &mut [u64; 8],
m_slice: &[u8; 16 * size_of::<u64>()],
t: [u64; 2],
f: bool,
) {
assert!(m_slice.len() == 16 * size_of::<u64>());

pub fn compress(rounds: usize, h: &mut [u64; 8], m: [u64; 16], t: [u64; 2], f: bool) {
#[cfg(all(target_feature = "avx2", feature = "std"))]
{
// only if it is compiled with avx2 flag and it is std, we can use avx2.
Expand All @@ -136,7 +131,7 @@ pub mod algo {
unsafe {
super::avx2::compress_block(
rounds,
m_slice,
&m,
h,
((t[1] as u128) << 64) | (t[0] as u128),
if f { !0 } else { 0 },
Expand All @@ -149,14 +144,6 @@ pub mod algo {

// if avx2 is not available, use the fallback portable implementation

// Read m values
let mut m = [0u64; 16];
for (i, item) in m.iter_mut().enumerate() {
*item = u64::from_le_bytes(unsafe {
core::ptr::read_unaligned(m_slice.as_ptr().add(i * 8) as *const [u8; 8])
});
}

let mut v = [0u64; 16];
v[..h.len()].copy_from_slice(h); // First half from state.
v[h.len()..].copy_from_slice(&IV); // Second half from IV.
Expand Down Expand Up @@ -224,7 +211,7 @@ mod avx2 {
#[inline(always)]
pub(crate) unsafe fn compress_block(
mut rounds: usize,
block: &[u8; BLOCKBYTES],
block: &[Word; 16],
words: &mut [Word; 8],
count: Count,
last_block: Word,
Expand All @@ -238,6 +225,7 @@ mod avx2 {
let flags = set4(count_low(count), count_high(count), last_block, last_node);
let mut d = xor(loadu(iv_high), flags);

let block: &[u8; BLOCKBYTES] = std::mem::transmute(block);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is probably fine as endiness is the same for all targets that have avx

let msg_chunks = array_refs!(block, 16, 16, 16, 16, 16, 16, 16, 16);
let m0 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.0));
let m1 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.1));
Expand Down
Loading