Merged
4 changes: 2 additions & 2 deletions crates/precompile/bench/blake2.rs
@@ -100,14 +100,14 @@ pub fn add_benches(group: &mut BenchmarkGroup<'_, criterion::measurement::WallTi
0x1f83d9abfb41bd6bu64,
0x5be0cd19137e2179u64,
];
let m = [0u8; 128];
let m = [0u64; 16];
let t = [0u64, 0u64];
b.iter(|| {
let mut h_copy = h;
blake2::algo::compress(
black_box(12),
&mut h_copy,
black_box(&m),
black_box(m),
black_box(t),
black_box(false),
);
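The benchmark now builds the message block as sixteen u64 words and passes it by value, matching the new compress signature. For a caller that still holds the raw 128-byte block, a minimal conversion sketch, assuming the little-endian word layout the precompile input uses, could look like this (the helper name block_to_words is hypothetical, not part of the crate):

// Illustrative sketch: build the [u64; 16] message block expected by the
// new compress signature from a raw 128-byte block, reading each word as
// little-endian.
fn block_to_words(block: &[u8; 128]) -> [u64; 16] {
    let mut m = [0u64; 16];
    for (i, chunk) in block.chunks_exact(8).enumerate() {
        m[i] = u64::from_le_bytes(chunk.try_into().unwrap());
    }
    m
}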
110 changes: 49 additions & 61 deletions crates/precompile/src/blake2.rs
@@ -16,44 +16,43 @@ pub fn run(input: &[u8], gas_limit: u64) -> PrecompileResult {
return Err(PrecompileError::Blake2WrongLength);
}

// Rounds 4 bytes
// Parse number of rounds (4 bytes)
let rounds = u32::from_be_bytes(input[..4].try_into().unwrap()) as usize;
let gas_used = rounds as u64 * F_ROUND;
if gas_used > gas_limit {
return Err(PrecompileError::OutOfGas);
}

// Parse final block flag
let f = match input[212] {
1 => true,
0 => false,
1 => true,
_ => return Err(PrecompileError::Blake2WrongFinalIndicatorFlag),
};

// Parse state vector h (8 × u64)
let mut h = [0u64; 8];
//let mut m = [0u64; 16];

let t;
// Optimized parsing using ptr::read_unaligned for potentially better performance

let m;
unsafe {
let ptr = input.as_ptr();

// Read h values
for (i, item) in h.iter_mut().enumerate() {
*item = u64::from_le_bytes(core::ptr::read_unaligned(
ptr.add(4 + i * 8) as *const [u8; 8]
));
}

m = input[68..68 + 16 * size_of::<u64>()].try_into().unwrap();

t = [
u64::from_le_bytes(core::ptr::read_unaligned(ptr.add(196) as *const [u8; 8])),
u64::from_le_bytes(core::ptr::read_unaligned(ptr.add(204) as *const [u8; 8])),
];
}
algo::compress(rounds, &mut h, m, t, f);
input[4..68]
.chunks_exact(8)
.enumerate()
.for_each(|(i, chunk)| {
h[i] = u64::from_le_bytes(chunk.try_into().unwrap());
});

// Parse message block m (16 × u64)
let mut m = [0u64; 16];
input[68..196]
.chunks_exact(8)
.enumerate()
.for_each(|(i, chunk)| {
m[i] = u64::from_le_bytes(chunk.try_into().unwrap());
});

// Parse offset counters
let t_0 = u64::from_le_bytes(input[196..204].try_into().unwrap());
let t_1 = u64::from_le_bytes(input[204..212].try_into().unwrap());

algo::compress(rounds, &mut h, m, [t_0, t_1], f);

let mut out = [0u8; 64];
for (i, h) in (0..64).step_by(8).zip(h.iter()) {
@@ -94,22 +93,26 @@ pub mod algo {
#[inline(always)]
#[allow(clippy::many_single_char_names)]
/// G function: <https://tools.ietf.org/html/rfc7693#section-3.1>
pub fn g(v: &mut [u64], a: usize, b: usize, c: usize, d: usize, x: u64, y: u64) {
v[a] = v[a].wrapping_add(v[b]);
v[a] = v[a].wrapping_add(x);
v[d] ^= v[a];
v[d] = v[d].rotate_right(32);
v[c] = v[c].wrapping_add(v[d]);
v[b] ^= v[c];
v[b] = v[b].rotate_right(24);

v[a] = v[a].wrapping_add(v[b]);
v[a] = v[a].wrapping_add(y);
v[d] ^= v[a];
v[d] = v[d].rotate_right(16);
v[c] = v[c].wrapping_add(v[d]);
v[b] ^= v[c];
v[b] = v[b].rotate_right(63);
fn g(v: &mut [u64; 16], a: usize, b: usize, c: usize, d: usize, x: u64, y: u64) {
let mut va = v[a];
let mut vb = v[b];
let mut vc = v[c];
let mut vd = v[d];

va = va.wrapping_add(vb).wrapping_add(x);
vd = (vd ^ va).rotate_right(32);
vc = vc.wrapping_add(vd);
vb = (vb ^ vc).rotate_right(24);

va = va.wrapping_add(vb).wrapping_add(y);
vd = (vd ^ va).rotate_right(16);
vc = vc.wrapping_add(vd);
vb = (vb ^ vc).rotate_right(63);

v[a] = va;
v[b] = vb;
v[c] = vc;
v[d] = vd;
}

/// Compression function F takes as an argument the state vector "h",
@@ -119,15 +122,7 @@ pub mod algo {
/// returns a new state vector. The number of rounds, "r", is 12 for
/// BLAKE2b and 10 for BLAKE2s. Rounds are numbered from 0 to r - 1.
#[allow(clippy::many_single_char_names)]
pub fn compress(
rounds: usize,
h: &mut [u64; 8],
m_slice: &[u8; 16 * size_of::<u64>()],
t: [u64; 2],
f: bool,
) {
assert!(m_slice.len() == 16 * size_of::<u64>());

pub fn compress(rounds: usize, h: &mut [u64; 8], m: [u64; 16], t: [u64; 2], f: bool) {
#[cfg(all(target_feature = "avx2", feature = "std"))]
{
// Only if it is compiled with the avx2 flag and std is enabled can we use AVX2.
@@ -136,7 +131,7 @@
unsafe {
super::avx2::compress_block(
rounds,
m_slice,
&m,
h,
((t[1] as u128) << 64) | (t[0] as u128),
if f { !0 } else { 0 },
@@ -149,14 +144,6 @@

// if avx2 is not available, use the fallback portable implementation

// Read m values
let mut m = [0u64; 16];
for (i, item) in m.iter_mut().enumerate() {
*item = u64::from_le_bytes(unsafe {
core::ptr::read_unaligned(m_slice.as_ptr().add(i * 8) as *const [u8; 8])
});
}

let mut v = [0u64; 16];
v[..h.len()].copy_from_slice(h); // First half from state.
v[h.len()..].copy_from_slice(&IV); // Second half from IV.
@@ -224,7 +211,7 @@ mod avx2 {
#[inline(always)]
pub(crate) unsafe fn compress_block(
mut rounds: usize,
block: &[u8; BLOCKBYTES],
block: &[Word; 16],
words: &mut [Word; 8],
count: Count,
last_block: Word,
@@ -238,6 +225,7 @@ mod avx2 {
let flags = set4(count_low(count), count_high(count), last_block, last_node);
let mut d = xor(loadu(iv_high), flags);

let block: &[u8; BLOCKBYTES] = std::mem::transmute(block);
This is probably fine, as endianness is the same for all targets that have AVX.
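For reference, a byte-order-independent alternative (purely illustrative; not what this PR does, and the helper name is hypothetical) would re-encode the words explicitly before the AVX2 loads instead of transmuting the array reference:

// Illustrative sketch: serialize the sixteen message words back into the
// 128-byte block the AVX2 kernel loads, using an explicit little-endian
// layout rather than relying on the in-memory representation of [u64; 16].
fn words_to_block(m: &[u64; 16]) -> [u8; 128] {
    let mut block = [0u8; 128];
    for (i, word) in m.iter().enumerate() {
        block[i * 8..(i + 1) * 8].copy_from_slice(&word.to_le_bytes());
    }
    block
}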

let msg_chunks = array_refs!(block, 16, 16, 16, 16, 16, 16, 16, 16);
let m0 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.0));
let m1 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.1));