diff --git a/esp-hal-common/.gitignore b/esp-hal-common/.gitignore
new file mode 100644
index 00000000000..8f61cef6fae
--- /dev/null
+++ b/esp-hal-common/.gitignore
@@ -0,0 +1 @@
+rust-toolchain.toml
diff --git a/esp-hal-common/Cargo.toml b/esp-hal-common/Cargo.toml
index 1b350fcf4ea..4c275c22c7d 100644
--- a/esp-hal-common/Cargo.toml
+++ b/esp-hal-common/Cargo.toml
@@ -50,11 +50,11 @@ ufmt-write = { version = "0.1.0", optional = true }
 # Each supported device MUST have its PAC included below along with a
 # corresponding feature. We rename the PAC packages because we cannot
 # have dependencies and features with the same names.
-esp32   = { version = "0.15.0", features = ["critical-section"], optional = true }
+esp32   = { version = "0.16.0", features = ["critical-section"], optional = true }
 esp32c2 = { version = "0.5.1",  features = ["critical-section"], optional = true }
 esp32c3 = { version = "0.8.1",  features = ["critical-section"], optional = true }
 esp32s2 = { version = "0.6.0",  features = ["critical-section"], optional = true }
-esp32s3 = { version = "0.8.0",  features = ["critical-section"], optional = true }
+esp32s3 = { version = "0.9.0",  features = ["critical-section"], optional = true }
 
 [features]
 esp32   = ["esp32/rt"  , "procmacros/xtensa", "xtensa-lx-rt/esp32",   "xtensa-lx/esp32",   "critical-section/restore-state-u32", "lock_api"]
diff --git a/esp-hal-common/src/lib.rs b/esp-hal-common/src/lib.rs
index fedb8742437..aedc9493879 100644
--- a/esp-hal-common/src/lib.rs
+++ b/esp-hal-common/src/lib.rs
@@ -48,6 +48,7 @@ pub use self::{
     timer::Timer,
 };
 
+pub mod sha;
 pub mod analog;
 pub mod clock;
 pub mod delay;
diff --git a/esp-hal-common/src/sha.rs b/esp-hal-common/src/sha.rs
new file mode 100644
index 00000000000..e56c20c2630
--- /dev/null
+++ b/esp-hal-common/src/sha.rs
@@ -0,0 +1,516 @@
+use core::convert::Infallible;
+
+use crate::pac::SHA;
+
+// All the hash algorithms introduced in FIPS PUB 180-4 Spec.
+// – SHA-1
+// – SHA-224
+// – SHA-256
+// – SHA-384
+// – SHA-512
+// – SHA-512/224
+// – SHA-512/256
+// – SHA-512/t (not implemented yet)
+// Two working modes
+// – Typical SHA
+// – DMA-SHA (not implemented yet)
+
+// Size in bytes of one `u32` register word; all register writes in this module
+// are done in units of this size.
+const ALIGN_SIZE: usize = core::mem::size_of::<u32>();
+
+// Converts four message bytes into the `u32` word that is written to a
+// register. ESP32 does reversed order, so the bytes are read as big-endian.
+#[cfg(esp32)]
+const U32_FROM_BYTES: fn([u8; 4]) -> u32 = u32::from_be_bytes;
+
+// Every other chip takes the bytes in native (in-memory) order.
+#[cfg(not(esp32))]
+const U32_FROM_BYTES: fn([u8; 4]) -> u32 = u32::from_ne_bytes; 
+
+// The alignment helper lets you write to registers that only accept `u32`
+// writes while feeding it regular `u8`s (bytes). It keeps a write buffer of 4
+// `u8` (could in theory be 3, but that is less convenient). If the incoming
+// data is not convertible to `u32` (i.e. not a multiple of 4 in length) it
+// stores the remainder in the buffer until the next call.
+//
+// It assumes incoming `dst` pointers are aligned to the desired layout (in
+// future `ptr.is_aligned` can be used). It also assumes that writes are done
+// in FIFO order.
+#[derive(Debug)]
+struct AlignmentHelper {
+    // Bytes waiting for enough data to form a complete `u32`
+    buf: [u8; ALIGN_SIZE],
+    // Number of valid bytes at the start of `buf`
+    buf_fill: usize,
+}
+
+impl AlignmentHelper {
+    // Creates a helper whose byte buffer is zeroed and empty.
+    pub fn default() -> AlignmentHelper {
+        Self {
+            buf_fill: 0,
+            buf: [0; ALIGN_SIZE],
+        }
+    }
+
+    // Writes any buffered bytes to `dst` as a single `u32`, zero-filling the
+    // unused tail of the buffer first, and empties the buffer. Returns the
+    // number of buffered *bytes* that were flushed (0 means no write happened).
+    pub unsafe fn flush_to(&mut self, dst: *mut u32) -> usize {
+        let pending = core::mem::replace(&mut self.buf_fill, 0);
+        if pending == 0 {
+            return 0;
+        }
+
+        // Bytes past the fill level are stale leftovers from earlier writes
+        self.buf[pending..].fill(0);
+        dst.write_volatile(U32_FROM_BYTES(self.buf));
+
+        pending
+    }
+
+    // Fills `count` consecutive `u32` registers with the byte `val`, similar to
+    // `volatile_set_memory`. Data that was previously ingested and is still
+    // sitting in the buffer is written first as one word (padded with `val`);
+    // the `count` fill words then follow that word.
+    //
+    // Every store is a full-width volatile `u32` write. `ptr::write_bytes` is
+    // not a volatile operation, so the compiler is free to narrow, merge or
+    // drop its stores, and partial (byte-sized) writes do not work on the
+    // registers this helper is used with.
+    //
+    // Safety: `dst` must be valid for `count` aligned `u32` writes, plus one
+    // more when buffered bytes are pending.
+    #[allow(unused)]
+    pub unsafe fn volatile_write_bytes(&mut self, dst: *mut u32, val: u8, count: usize) {
+        let mut cursor = 0;
+        if self.buf_fill != 0 {
+            self.buf[self.buf_fill..].fill(val);
+
+            dst.write_volatile(U32_FROM_BYTES(self.buf));
+            cursor = 1;
+
+            self.buf_fill = 0;
+        }
+
+        // All four bytes are identical, so byte order does not matter here
+        let word = u32::from_ne_bytes([val; ALIGN_SIZE]);
+        for i in 0..count {
+            dst.add(cursor + i).write_volatile(word);
+        }
+    }
+
+    // This function is similar to `volatile_copy_nonoverlapping_memory`, however it
+    // buffers up to a u32 in order to always write to registers in an aligned
+    // way. It stops writing at the end of the register block (`dst_bound` bytes,
+    // relative to `dst`).
+    //
+    // Returns the part of `src` that was neither written nor buffered, and
+    // whether the write ran exactly up to `dst_bound` (`true`) or stopped early
+    // for lack of data (`false`). A trailing remainder shorter than a word is
+    // kept in the internal buffer and counts as consumed.
+    //
+    // `dst` must point at the word the buffered bytes (if any) belong to. When
+    // bytes are buffered, `dst_bound` may be short by up to the number of
+    // buffered bytes; only whole words are ever written, so such a bound is
+    // treated as reached once the buffered word has been written.
+    //
+    // Panics if `dst_bound` is zero.
+    //
+    // Safety: `dst` must be valid for aligned `u32` writes covering
+    // `dst_bound` bytes rounded up to a whole word.
+    pub unsafe fn aligned_volatile_copy<'a>(
+        &mut self,
+        dst: *mut u32,
+        src: &'a [u8],
+        dst_bound: usize,
+    ) -> (&'a [u8], bool) {
+        assert!(dst_bound > 0);
+
+        let mut nsrc = src;
+        let mut cursor = 0;
+        if self.buf_fill != 0 {
+            // First top up the partially filled buffer with the new data
+            let max_fill = ALIGN_SIZE - self.buf_fill;
+            let (nbuf, rest) = src.split_at(core::cmp::min(src.len(), max_fill));
+            nsrc = rest;
+
+            self.buf[self.buf_fill..self.buf_fill + nbuf.len()].copy_from_slice(nbuf);
+            self.buf_fill += nbuf.len();
+
+            if self.buf_fill < ALIGN_SIZE {
+                // Used up all of `src` before a full word was collected
+                return (&[], false);
+            }
+
+            dst.write_volatile(U32_FROM_BYTES(self.buf));
+            cursor += 1;
+
+            self.buf_fill = 0;
+        }
+
+        if dst_bound <= cursor * ALIGN_SIZE {
+            return (nsrc, true);
+        }
+
+        // Only whole words can be written, so round both limits down to a
+        // multiple of the word size; anything else would be dropped silently by
+        // `chunks_exact` below
+        let room = ((dst_bound - cursor * ALIGN_SIZE) / ALIGN_SIZE) * ALIGN_SIZE;
+        let (to_write, remaining) = nsrc.split_at(core::cmp::min(
+            room,
+            (nsrc.len() / ALIGN_SIZE) * ALIGN_SIZE, // TODO: unstable div_floor for clarity?
+        ));
+
+        // Raw v_c_n_m also works but only when src.len() >= 4 * ALIGN_SIZE, otherwise
+        // it be broken
+        // NOTE(review): this path applies `.to_be()` while the buffered path above
+        // writes `U32_FROM_BYTES(..)` unchanged - confirm on hardware which byte
+        // order is intended.
+        for (i, v) in to_write.chunks_exact(ALIGN_SIZE).enumerate() {
+            // Continue after the word written from the buffer, if there was one
+            dst.add(cursor + i)
+                .write_volatile(U32_FROM_BYTES(v.try_into().unwrap()).to_be());
+        }
+
+        // If it's data we can't store we don't need to try and align it, just wait
+        // for the next write. Generally this applies when (src/4*4) != src
+        let was_bounded = cursor * ALIGN_SIZE + to_write.len() == dst_bound;
+        if remaining.len() > 0 && remaining.len() < ALIGN_SIZE {
+            self.buf[..remaining.len()].copy_from_slice(remaining);
+            self.buf_fill = remaining.len();
+
+            return (&[], was_bounded);
+        }
+
+        (remaining, was_bounded)
+    }
+}
+
+// Driver for the SHA peripheral: owns the peripheral together with the
+// bookkeeping needed to feed it a message in arbitrary-sized pieces.
+#[derive(Debug)]
+pub struct Sha {
+    // The owned peripheral; handed back by `free()`
+    sha: SHA,
+    // Algorithm selected in `new()`
+    mode: ShaMode,
+    // Collects bytes until a whole `u32` can be written to the input registers
+    alignment_helper: AlignmentHelper,
+    // Number of message bytes accepted so far (including bytes still buffered)
+    cursor: usize,
+    // Selects a "start" instead of a "continue" operation for the next block
+    first_run: bool,
+    // Set by `finish()`, cleared again by `update()`
+    finished: bool,
+}
+
+// Hash algorithms the driver can be configured for. Which variants exist
+// depends on the chip the crate is built for (see the `cfg` attributes).
+#[derive(Debug, Clone, Copy)]
+pub enum ShaMode {
+    SHA1,
+    // Not offered when building for ESP32
+    #[cfg(not(esp32))]
+    SHA224,
+    SHA256,
+    #[cfg(any(esp32s2, esp32s3, esp32))]
+    SHA384,
+    #[cfg(any(esp32s2, esp32s3, esp32))]
+    SHA512,
+    #[cfg(any(esp32s2, esp32s3))]
+    SHA512_224,
+    #[cfg(any(esp32s2, esp32s3))]
+    SHA512_256,
+    // SHA512_(u16) // Max 511
+}
+
+// TODO: Maybe make Sha Generic (Sha<Mode>) in order to allow for better
+// compiler optimizations? (Requires complex const generics which isn't stable
+// yet)
+
+// Numeric encoding of `mode` as written to the `mode` field of the SHA mode
+// register. Only compiled for chips other than ESP32.
+#[cfg(not(esp32))]
+fn mode_as_bits(mode: ShaMode) -> u8 {
+    match mode {
+        ShaMode::SHA1 => 0,
+        ShaMode::SHA224 => 1,
+        ShaMode::SHA256 => 2,
+        #[cfg(any(esp32s2, esp32s3))]
+        ShaMode::SHA384 => 3,
+        #[cfg(any(esp32s2, esp32s3))]
+        ShaMode::SHA512 => 4,
+        #[cfg(any(esp32s2, esp32s3))]
+        ShaMode::SHA512_224 => 5,
+        #[cfg(any(esp32s2, esp32s3))]
+        ShaMode::SHA512_256 => 6,
+        // _ => 0 // TODO: SHA512/t
+    }
+}
+
+// TODO: Allow/Implement SHA512_(u16)
+
+// A few notes on this implementation with regard to 'memcpy':
+// - It seems that ptr::write_bytes already acts as volatile, while ptr::copy_*
+//   does not (in this case)
+// - The registers are *not* cleared after processing, so padding needs to be
+//   written out
+// - This component uses core::intrinsics::volatile_* which is unstable, but is
+//   the only way to efficiently copy memory with volatile
+// - For these particular registers (and probably others), a full u32 needs to
+//   be written; partial register writes (i.e. in u8 mode) do not work
+//   - This means that we need to buffer incoming bytes, up to 4 u8's, in order
+//     to create a full u32
+
+// This implementation might fail after u32::MAX/8 bytes, to increase please see
+// ::finish() length/self.cursor usage
+impl Sha {
+    // Takes ownership of the SHA peripheral and prepares a driver for `mode`.
+    // On chips other than ESP32 the algorithm is programmed into the mode
+    // register right away.
+    pub fn new(sha: SHA, mode: ShaMode) -> Self {
+        // Setup SHA Mode
+        #[cfg(not(esp32))]
+        {
+            let mode_bits = mode_as_bits(mode);
+            sha.mode.write(|w| unsafe { w.mode().bits(mode_bits) });
+        }
+
+        let alignment_helper = AlignmentHelper::default();
+
+        Self {
+            sha,
+            mode,
+            alignment_helper,
+            cursor: 0,
+            first_run: true,
+            finished: false,
+        }
+    }
+
+    // Returns the `first_run` flag: `true` when the next block handed to the
+    // hardware will be submitted with a "start" rather than a "continue"
+    // operation.
+    pub fn first_run(&self) -> bool {
+        self.first_run
+    }
+
+    // Returns `true` once `finish()` has completed and no `update()` has been
+    // accepted since.
+    pub fn finished(&self) -> bool {
+        self.finished
+    }
+
+    // Tells the hardware to process the block currently held in the input
+    // registers: a "start" operation for the first block, a "continue"
+    // operation for every later one.
+    #[cfg(not(esp32))]
+    fn process_buffer(&mut self) {
+        // FIXME: SHA_START_REG & SHA_CONTINUE_REG are wrongly marked as RO (they are
+        // WO)
+        let trigger_reg = if self.first_run {
+            self.first_run = false;
+            // SHA_START_REG
+            self.sha.start.as_ptr()
+        } else {
+            // SHA_CONTINUE_REG
+            self.sha.continue_.as_ptr()
+        };
+
+        unsafe {
+            trigger_reg.write_volatile(1u32);
+        }
+    }
+
+    // ESP32 variant: every algorithm has its own start/continue registers, so
+    // the register to trigger depends on both the mode and whether this is the
+    // first block.
+    #[cfg(esp32)]
+    fn process_buffer(&mut self) {
+        let is_first = core::mem::replace(&mut self.first_run, false);
+        match (is_first, self.mode) {
+            (true, ShaMode::SHA1) => self.sha.sha1_start.write(|w| unsafe { w.bits(1) }),
+            (true, ShaMode::SHA256) => self.sha.sha256_start.write(|w| unsafe { w.bits(1) }),
+            (true, ShaMode::SHA384) => self.sha.sha384_start.write(|w| unsafe { w.bits(1) }),
+            (true, ShaMode::SHA512) => self.sha.sha512_start.write(|w| unsafe { w.bits(1) }),
+            (false, ShaMode::SHA1) => self.sha.sha1_continue.write(|w| unsafe { w.bits(1) }),
+            (false, ShaMode::SHA256) => {
+                self.sha.sha256_continue.write(|w| unsafe { w.bits(1) })
+            }
+            (false, ShaMode::SHA384) => {
+                self.sha.sha384_continue.write(|w| unsafe { w.bits(1) })
+            }
+            (false, ShaMode::SHA512) => {
+                self.sha.sha512_continue.write(|w| unsafe { w.bits(1) })
+            }
+        }
+    }
+
+    // Size in bytes of one input block for the configured mode: 64 for SHA-1,
+    // SHA-224 and SHA-256, 128 for every other mode.
+    fn chunk_length(&self) -> usize {
+        match self.mode {
+            #[cfg(not(esp32))]
+            ShaMode::SHA224 => 64,
+            ShaMode::SHA1 | ShaMode::SHA256 => 64,
+            _ => 128,
+        }
+    }
+
+    // ESP32 variant: reads the busy flag belonging to the configured
+    // algorithm.
+    #[cfg(esp32)]
+    fn is_busy(&self) -> bool {
+        let sha = &self.sha;
+        match self.mode {
+            ShaMode::SHA1 => sha.sha1_busy.read().sha1_busy().bit_is_set(),
+            ShaMode::SHA256 => sha.sha256_busy.read().sha256_busy().bit_is_set(),
+            ShaMode::SHA384 => sha.sha384_busy.read().sha384_busy().bit_is_set(),
+            ShaMode::SHA512 => sha.sha512_busy.read().sha512_busy().bit_is_set(),
+        }
+    }
+
+    // Returns `true` while the busy register reads non-zero.
+    #[cfg(not(esp32))]
+    fn is_busy(&self) -> bool {
+        self.sha.busy.read().bits() != 0
+    }
+
+    // Length in bytes of the digest produced by the configured mode.
+    pub fn digest_length(&self) -> usize {
+        match self.mode {
+            ShaMode::SHA1 => 20,
+            #[cfg(not(esp32))]
+            ShaMode::SHA224 => 28,
+            ShaMode::SHA256 => 32,
+            #[cfg(any(esp32, esp32s2, esp32s3))]
+            ShaMode::SHA384 => 48,
+            #[cfg(any(esp32, esp32s2, esp32s3))]
+            ShaMode::SHA512 => 64,
+            #[cfg(any(esp32s2, esp32s3))]
+            ShaMode::SHA512_224 => 28,
+            #[cfg(any(esp32s2, esp32s3))]
+            ShaMode::SHA512_256 => 32,
+        }
+    }
+
+    // Pointer to the first word of the message input registers (`m_mem`).
+    #[cfg(not(esp32))]
+    fn input_ptr(&self) -> *mut u32 {
+        self.sha.m_mem[0].as_ptr() as *mut u32
+    }
+
+    // ESP32 variant: pointer to the first word of the `text` registers, which
+    // take the message input.
+    #[cfg(esp32)]
+    fn input_ptr(&self) -> *mut u32 {
+        self.sha.text[0].as_ptr() as *mut u32
+    }
+
+    // Pointer to the first word of the digest registers (`h_mem`).
+    #[cfg(not(esp32))]
+    fn output_ptr(&self) -> *const u32 {
+        self.sha.h_mem[0].as_ptr() as *const u32
+    }
+
+    // ESP32 variant: the digest is read from the same `text` registers that
+    // take the input.
+    #[cfg(esp32)]
+    fn output_ptr(&self) -> *const u32 {
+        self.sha.text[0].as_ptr() as *const u32
+    }
+
+    // Writes out any bytes still held by the alignment helper (zero-padded to a
+    // full word) so that `cursor` ends up word aligned. Starts processing if
+    // that completes a block. Returns `WouldBlock` while the hardware is busy.
+    fn flush_data(&mut self) -> nb::Result<(), Infallible> {
+        if self.is_busy() {
+            return Err(nb::Error::WouldBlock);
+        }
+
+        let chunk_len = self.chunk_length();
+        let word_offset = (self.cursor % chunk_len) / ALIGN_SIZE;
+
+        let flushed = unsafe {
+            let target = self.input_ptr().add(word_offset);
+            self.alignment_helper.flush_to(target)
+        };
+        if flushed == 0 {
+            return Ok(());
+        }
+
+        // The buffered bytes are already part of `cursor`; only the padding that
+        // completed the word still has to be accounted for
+        self.cursor = self.cursor.wrapping_add(ALIGN_SIZE - flushed);
+        if self.cursor % chunk_len == 0 {
+            self.process_buffer();
+        }
+
+        Ok(())
+    }
+
+    // Copies as much of `incoming` as fits into the current block of the input
+    // registers, going through the alignment helper so that only whole `u32`s
+    // are written (due to issues with cpy_mem<u8>). Starts processing when the
+    // block is full and returns the part of `incoming` that was not consumed.
+    fn write_data<'a>(&mut self, incoming: &'a [u8]) -> nb::Result<&'a [u8], Infallible> {
+        let chunk_len = self.chunk_length();
+        let offset = self.cursor % chunk_len;
+
+        let (leftover, chunk_full) = unsafe {
+            let dst = self.input_ptr().add(offset / ALIGN_SIZE);
+            self.alignment_helper
+                .aligned_volatile_copy(dst, incoming, chunk_len - offset)
+        };
+
+        let consumed = incoming.len() - leftover.len();
+        self.cursor = self.cursor.wrapping_add(consumed);
+        if chunk_full {
+            self.process_buffer();
+        }
+
+        Ok(leftover)
+    }
+
+    // Feeds `buffer` into the hash. Returns the part of `buffer` that was not
+    // consumed by this call (pass it in again), or `WouldBlock` while the
+    // hardware is still busy with the previous block.
+    pub fn update<'a>(&mut self, buffer: &'a [u8]) -> nb::Result<&'a [u8], Infallible> {
+        if self.is_busy() {
+            Err(nb::Error::WouldBlock)
+        } else {
+            self.finished = false;
+            self.write_data(buffer)
+        }
+    }
+
+    // Finishes the calculation (if not already done) and copies the result to
+    // `output`. After `finish()` is called, `update()`s will contribute to a new
+    // hash which can be calculated again with `finish()`. Calling `finish()`
+    // again without an `update()` in between only copies the digest out again.
+    //
+    // Typically `output` is expected to be the size of `digest_length()`, but a
+    // smaller buffer can be given to get a "short hash"; a larger buffer is only
+    // filled up to `digest_length()` bytes.
+    //
+    // Returns `WouldBlock` if the hardware is busy on entry. Once padding has
+    // started the function spin-waits for the hardware instead.
+    pub fn finish(&mut self, output: &mut [u8]) -> nb::Result<(), Infallible> {
+        // The main purpose of this function is to dynamically generate padding for the
+        // input. Padding: Append "1" bit, Pad zeros until 512/1024 filled
+        // then set the message length in the LSB (overwriting the padding)
+        // If not enough free space for length+1, add length at end of a new zero'd
+        // block
+
+        if self.is_busy() {
+            return Err(nb::Error::WouldBlock);
+        }
+
+        let chunk_len = self.chunk_length();
+
+        if !self.finished {
+            // Store message length for padding
+            let length = self.cursor * 8;
+            nb::block!(self.update(&[0x80]))?; // Append "1" bit
+            nb::block!(self.flush_data())?; // Flush partial data, ensures aligned cursor
+            debug_assert!(self.cursor % 4 == 0);
+
+            // Either call above may have completed a block and started processing
+            // it; the input registers must not be rewritten until that is done
+            while self.is_busy() {}
+
+            let mod_cursor = self.cursor % chunk_len;
+            if chunk_len - mod_cursor < chunk_len / 8 {
+                // Zero out remaining data if buffer is almost full (>=448/896), and process
+                // buffer
+                let pad_len = chunk_len - mod_cursor;
+                unsafe {
+                    let m_cursor_ptr = self.input_ptr().add(mod_cursor / ALIGN_SIZE);
+                    self.alignment_helper.volatile_write_bytes(
+                        m_cursor_ptr,
+                        0,
+                        pad_len / ALIGN_SIZE,
+                    );
+                }
+                self.process_buffer();
+                self.cursor = self.cursor.wrapping_add(pad_len);
+
+                // Spin-wait for finish
+                while self.is_busy() {}
+            }
+
+            let mod_cursor = self.cursor % chunk_len; // Should be zero if branched above
+            unsafe {
+                let m_cursor_ptr = self.input_ptr();
+                // Pad zeros
+                let pad_ptr = m_cursor_ptr.add(mod_cursor / ALIGN_SIZE);
+                let pad_len = (chunk_len - mod_cursor) - ALIGN_SIZE;
+
+                self.alignment_helper
+                    .volatile_write_bytes(pad_ptr, 0, pad_len / ALIGN_SIZE);
+
+                // Write length (BE) to end
+                // NOTE: aligned_volatile_copy does not work here
+                // The decompiler suggest volatile_copy_memory/write_volatile is optimized to a
+                // simple *v = *pv; While the aligned_volatile_copy makes an
+                // actual call to memcpy, why this makes a difference when
+                // memcpy does works in other places, I don't know
+                let end_ptr = m_cursor_ptr.add((chunk_len / ALIGN_SIZE) - 1);
+                #[cfg(not(esp32))]
+                end_ptr.write_volatile(length.to_be() as u32);
+                #[cfg(esp32)]
+                end_ptr.write_volatile(length.to_le() as u32);
+            }
+
+            self.process_buffer();
+            // Spin-wait for final buffer to be processed
+            while self.is_busy() {}
+
+            // ESP32 requires additional load to retrieve output
+            #[cfg(esp32)]
+            {
+                match self.mode {
+                    ShaMode::SHA1 => unsafe { self.sha.sha1_load.write(|w| w.bits(1)) },
+                    ShaMode::SHA256 => unsafe { self.sha.sha256_load.write(|w| w.bits(1)) },
+                    ShaMode::SHA384 => unsafe { self.sha.sha384_load.write(|w| w.bits(1)) },
+                    ShaMode::SHA512 => unsafe { self.sha.sha512_load.write(|w| w.bits(1)) },
+                }
+
+                // Spin wait for result, 8-20 clock cycles according to manual
+                while self.is_busy() {}
+            }
+
+            self.finished = true;
+
+            // The message is complete: the next `update()` has to begin a fresh
+            // hash (start operation, length counted from zero) instead of
+            // continuing this one
+            self.first_run = true;
+            self.cursor = 0;
+        }
+
+        // Copy the digest out byte-wise: `output` is a byte slice without any
+        // alignment guarantee, so it must not be written through a `*mut u32`.
+        // This also fills a trailing partial word of a short output buffer.
+        let digest_ptr = self.output_ptr();
+        let copy_len = core::cmp::min(self.digest_length(), output.len());
+        for (i, dst) in output[..copy_len].chunks_mut(ALIGN_SIZE).enumerate() {
+            // SAFETY: `i` stays below `digest_length() / ALIGN_SIZE`, i.e. within
+            // the digest registers; the read is volatile because it is MMIO
+            let word = unsafe { digest_ptr.add(i).read_volatile() };
+            // ESP32 does reversed order
+            #[cfg(esp32)]
+            let word = word.to_be();
+            dst.copy_from_slice(&word.to_ne_bytes()[..dst.len()]);
+        }
+
+        Ok(())
+    }
+
+    // Consumes the driver and hands the SHA peripheral back to the caller.
+    pub fn free(self) -> SHA {
+        self.sha
+    }
+}
diff --git a/esp32-hal/Cargo.toml b/esp32-hal/Cargo.toml
index 39ce77c8ccb..be12d26932e 100644
--- a/esp32-hal/Cargo.toml
+++ b/esp32-hal/Cargo.toml
@@ -40,6 +40,7 @@ embassy-executor  = { package = "embassy-executor", git = "https://github.com/em
 embedded-graphics = "0.7.1"
 esp-backtrace     = { version = "0.3.0", features = ["esp32", "panic-handler", "exception-handler", "print-uart"] }
 esp-println       = { version = "0.3.1", features = ["esp32"] }
+sha2              = { version = "0.10.6", default-features = false}
 smart-leds        = "0.3.0"
 ssd1306           = "0.7.1"
 static_cell       = "1.0.0"
diff --git a/esp32-hal/examples/sha.rs b/esp32-hal/examples/sha.rs
new file mode 100644
index 00000000000..79a1abc9503
--- /dev/null
+++ b/esp32-hal/examples/sha.rs
@@ -0,0 +1,80 @@
+//! Demonstrates the use of the SHA peripheral and compares the speed of hardware-accelerated and pure software hashing.
+//! 
+
+#![no_std]
+#![no_main]
+
+use esp32_hal::{
+    clock::ClockControl,
+    pac::Peripherals,
+    prelude::*,
+    timer::TimerGroup,
+    Rtc,
+    sha::{Sha, ShaMode},
+};
+use nb::block;
+use esp_backtrace as _;
+use esp_println::println;
+use xtensa_lx_rt::entry;
+use sha2::{Sha512, Digest};
+
+// Hashes a fixed message with the SHA peripheral (SHA-512) and with the
+// software `sha2` implementation, printing both digests and the cycle count
+// each approach took.
+#[entry]
+fn main() -> ! {
+    let peripherals = Peripherals::take().unwrap();
+    let system = peripherals.DPORT.split();
+    let clocks = ClockControl::boot_defaults(system.clock_control).freeze();
+
+    let timer_group0 = TimerGroup::new(peripherals.TIMG0, &clocks);
+    let mut wdt = timer_group0.wdt;
+    let mut rtc = Rtc::new(peripherals.RTC_CNTL);
+
+    // Disable MWDT and RWDT (Watchdog) flash boot protection
+    wdt.disable();
+    rtc.rwdt.disable();
+
+    let source_data = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa".as_bytes();
+    // A `&[u8]` is `Copy`, so a second cursor over the data needs no clone
+    let mut remaining = source_data;
+    let mut hasher = Sha::new(peripherals.SHA, ShaMode::SHA512);
+
+    // Short hashes can be created by decreasing the output buffer to the desired length
+    let mut output = [0u8; 64];
+
+    let pre_calc = xtensa_lx::timer::get_cycle_count();
+    // The hardware implementation takes a subslice of the input, and returns the unprocessed parts
+    // The unprocessed parts can be input in the next iteration, you can always add more data until
+    // finish() is called. After finish() is called update()'s will contribute to a new hash which
+    // can be extracted again with finish().
+
+    while !remaining.is_empty() {
+        // Can add println to view progress, however println takes a few orders of magnitude longer than
+        // the Sha function itself so not useful for comparing processing time
+        // println!("Remaining len: {}", remaining.len());
+
+        // All the HW Sha functions are infallible so unwrap is fine to use if you use block!
+        remaining = block!(hasher.update(remaining)).unwrap();
+    }
+
+    // Finish can be called as many times as desired to get multiple copies of the output.
+    block!(hasher.finish(output.as_mut_slice())).unwrap();
+
+    let post_calc = xtensa_lx::timer::get_cycle_count();
+    // The cycle counter wraps around; a wrapping subtraction still yields the
+    // elapsed count and cannot trip the overflow check in debug builds
+    let hw_time = post_calc.wrapping_sub(pre_calc);
+    println!("Took {} cycles", hw_time);
+    println!("SHA512 Hash output {:02x?}", output);
+    let _usha = hasher.free();
+
+    let pre_calc = xtensa_lx::timer::get_cycle_count();
+    let mut hasher = Sha512::new();
+    hasher.update(source_data);
+    let soft_result = hasher.finalize();
+    let post_calc = xtensa_lx::timer::get_cycle_count();
+    let soft_time = post_calc.wrapping_sub(pre_calc);
+    println!("Took {} cycles", soft_time);
+    println!("SHA512 Hash output {:02x?}", soft_result);
+
+    println!("HW SHA is {}x faster", soft_time/hw_time);
+
+    loop {}
+}
diff --git a/esp32-hal/src/lib.rs b/esp32-hal/src/lib.rs
index bfba115d858..70be3d5496a 100644
--- a/esp32-hal/src/lib.rs
+++ b/esp32-hal/src/lib.rs
@@ -29,6 +29,7 @@ pub use esp_hal_common::{
     Rtc,
     Rwdt,
     Serial,
+    sha
 };
 
 pub use self::gpio::IO;
diff --git a/esp32c2-hal/Cargo.toml b/esp32c2-hal/Cargo.toml
index fb586217cf7..8bac9eb0b95 100644
--- a/esp32c2-hal/Cargo.toml
+++ b/esp32c2-hal/Cargo.toml
@@ -41,6 +41,7 @@ embassy-executor  = { package = "embassy-executor", git = "https://github.com/em
 embedded-graphics = "0.7.1"
 esp-backtrace     = { version = "0.3.0", features = ["esp32c2", "panic-handler", "exception-handler", "print-uart"] }
 esp-println       = { version = "0.3.1", features = ["esp32c2"] }
+sha2              = { version = "0.10.6", default-features = false}
 ssd1306           = "0.7.1"
 static_cell       = "1.0.0"
 
diff --git a/esp32c2-hal/examples/sha.rs b/esp32c2-hal/examples/sha.rs
new file mode 100644
index 00000000000..07234e64cd5
--- /dev/null
+++ b/esp32c2-hal/examples/sha.rs
@@ -0,0 +1,79 @@
+//! Demonstrates the use of the SHA peripheral and compares the speed of hardware-accelerated and pure software hashing.
+//! 
+
+#![no_std]
+#![no_main]
+
+use esp32c2_hal::{
+    clock::ClockControl,
+    pac::Peripherals,
+    prelude::*,
+    timer::TimerGroup,
+    Rtc,
+    sha::{Sha, ShaMode},
+};
+use nb::block;
+use esp_backtrace as _;
+use esp_println::println;
+use riscv_rt::entry;
+use sha2::{Sha256, Digest};
+
+// Hashes a fixed message with the SHA peripheral (SHA-256) and with the
+// software `sha2` implementation, printing both digests for comparison.
+//
+// Unlike the Xtensa examples this one does not measure cycle counts: that
+// timing code relies on `xtensa_lx::timer`, which this example does not use.
+#[entry]
+fn main() -> ! {
+    let peripherals = Peripherals::take().unwrap();
+    let system = peripherals.SYSTEM.split();
+    let clocks = ClockControl::boot_defaults(system.clock_control).freeze();
+
+    let timer_group0 = TimerGroup::new(peripherals.TIMG0, &clocks);
+    let mut wdt = timer_group0.wdt;
+    let mut rtc = Rtc::new(peripherals.RTC_CNTL);
+
+    // Disable MWDT and RWDT (Watchdog) flash boot protection
+    wdt.disable();
+    rtc.rwdt.disable();
+
+    let source_data = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa".as_bytes();
+    // A `&[u8]` is `Copy`, so a second cursor over the data needs no clone
+    let mut remaining = source_data;
+    let mut hasher = Sha::new(peripherals.SHA, ShaMode::SHA256);
+
+    // Short hashes can be created by decreasing the output buffer to the desired length
+    let mut output = [0u8; 32];
+
+    // The hardware implementation takes a subslice of the input, and returns the unprocessed parts
+    // The unprocessed parts can be input in the next iteration, you can always add more data until
+    // finish() is called. After finish() is called update()'s will contribute to a new hash which
+    // can be extracted again with finish().
+
+    while !remaining.is_empty() {
+        // Can add println to view progress, however println takes a few orders of magnitude longer than
+        // the Sha function itself so not useful for comparing processing time
+        // println!("Remaining len: {}", remaining.len());
+
+        // All the HW Sha functions are infallible so unwrap is fine to use if you use block!
+        remaining = block!(hasher.update(remaining)).unwrap();
+    }
+
+    // Finish can be called as many times as desired to get multiple copies of the output.
+    block!(hasher.finish(output.as_mut_slice())).unwrap();
+    println!("SHA256 Hash output {:02x?}", output);
+    let _usha = hasher.free();
+
+    let mut hasher = Sha256::new();
+    hasher.update(source_data);
+    let soft_result = hasher.finalize();
+    println!("SHA256 Hash output {:02x?}", soft_result);
+
+    loop {}
+}
diff --git a/esp32c2-hal/src/lib.rs b/esp32c2-hal/src/lib.rs
index 3c4aca22530..38321d7fd39 100644
--- a/esp32c2-hal/src/lib.rs
+++ b/esp32c2-hal/src/lib.rs
@@ -28,6 +28,7 @@ pub use esp_hal_common::{
     Rtc,
     Rwdt,
     Serial,
+    sha,
 };
 
 pub use self::gpio::IO;
diff --git a/esp32c3-hal/Cargo.toml b/esp32c3-hal/Cargo.toml
index adb30430577..6119c37811b 100644
--- a/esp32c3-hal/Cargo.toml
+++ b/esp32c3-hal/Cargo.toml
@@ -42,6 +42,7 @@ embassy-executor  = { package = "embassy-executor", git = "https://github.com/em
 embedded-graphics = "0.7.1"
 esp-backtrace     = { version = "0.3.0", features = ["esp32c3", "panic-handler", "exception-handler", "print-uart"] }
 esp-println       = { version = "0.3.1", features = ["esp32c3"] }
+sha2              = { version = "0.10.6", default-features = false}
 smart-leds        = "0.3.0"
 ssd1306           = "0.7.1"
 static_cell       = "1.0.0"
diff --git a/esp32c3-hal/examples/sha.rs b/esp32c3-hal/examples/sha.rs
new file mode 100644
index 00000000000..27eca891b85
--- /dev/null
+++ b/esp32c3-hal/examples/sha.rs
@@ -0,0 +1,79 @@
+//! Demonstrates the use of the SHA peripheral and compares the speed of hardware-accelerated and pure software hashing.
+//! 
+
+#![no_std]
+#![no_main]
+
+use esp32c3_hal::{
+    clock::ClockControl,
+    pac::Peripherals,
+    prelude::*,
+    timer::TimerGroup,
+    Rtc,
+    sha::{Sha, ShaMode},
+};
+use nb::block;
+use esp_backtrace as _;
+use esp_println::println;
+use riscv_rt::entry;
+use sha2::{Sha256, Digest};
+
+// Hashes a fixed message with the SHA peripheral (SHA-256) and with the
+// software `sha2` implementation, printing both digests for comparison.
+//
+// Unlike the Xtensa examples this one does not measure cycle counts: that
+// timing code relies on `xtensa_lx::timer`, which this example does not use.
+#[entry]
+fn main() -> ! {
+    let peripherals = Peripherals::take().unwrap();
+    let system = peripherals.SYSTEM.split();
+    let clocks = ClockControl::boot_defaults(system.clock_control).freeze();
+
+    let timer_group0 = TimerGroup::new(peripherals.TIMG0, &clocks);
+    let mut wdt = timer_group0.wdt;
+    let mut rtc = Rtc::new(peripherals.RTC_CNTL);
+
+    // Disable MWDT and RWDT (Watchdog) flash boot protection
+    wdt.disable();
+    rtc.rwdt.disable();
+
+    let source_data = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa".as_bytes();
+    // A `&[u8]` is `Copy`, so a second cursor over the data needs no clone
+    let mut remaining = source_data;
+    let mut hasher = Sha::new(peripherals.SHA, ShaMode::SHA256);
+
+    // Short hashes can be created by decreasing the output buffer to the desired length
+    let mut output = [0u8; 32];
+
+    // The hardware implementation takes a subslice of the input, and returns the unprocessed parts
+    // The unprocessed parts can be input in the next iteration, you can always add more data until
+    // finish() is called. After finish() is called update()'s will contribute to a new hash which
+    // can be extracted again with finish().
+
+    while !remaining.is_empty() {
+        // Can add println to view progress, however println takes a few orders of magnitude longer than
+        // the Sha function itself so not useful for comparing processing time
+        // println!("Remaining len: {}", remaining.len());
+
+        // All the HW Sha functions are infallible so unwrap is fine to use if you use block!
+        remaining = block!(hasher.update(remaining)).unwrap();
+    }
+
+    // Finish can be called as many times as desired to get multiple copies of the output.
+    block!(hasher.finish(output.as_mut_slice())).unwrap();
+    println!("SHA256 Hash output {:02x?}", output);
+    let _usha = hasher.free();
+
+    let mut hasher = Sha256::new();
+    hasher.update(source_data);
+    let soft_result = hasher.finalize();
+    println!("SHA256 Hash output {:02x?}", soft_result);
+
+    loop {}
+}
diff --git a/esp32c3-hal/src/lib.rs b/esp32c3-hal/src/lib.rs
index 7d9921d3b57..cf7f7fbc169 100644
--- a/esp32c3-hal/src/lib.rs
+++ b/esp32c3-hal/src/lib.rs
@@ -34,6 +34,7 @@ pub use esp_hal_common::{
     Rwdt,
     Serial,
     UsbSerialJtag,
+    sha
 };
 
 #[cfg(feature = "embassy")]
diff --git a/esp32s2-hal/Cargo.toml b/esp32s2-hal/Cargo.toml
index 2abc2ac4d89..f9287125187 100644
--- a/esp32s2-hal/Cargo.toml
+++ b/esp32s2-hal/Cargo.toml
@@ -41,6 +41,7 @@ embassy-executor  = { package = "embassy-executor", git = "https://github.com/em
 embedded-graphics = "0.7.1"
 esp-backtrace     = { version = "0.3.0", features = ["esp32s2", "panic-handler", "print-uart"] }
 esp-println       = { version = "0.3.1", features = ["esp32s2"] }
+sha2              = { version = "0.10.6", default-features = false}
 smart-leds        = "0.3.0"
 ssd1306           = "0.7.1"
 usb-device        = { version = "0.2.9" }
diff --git a/esp32s2-hal/examples/sha.rs b/esp32s2-hal/examples/sha.rs
new file mode 100644
index 00000000000..1e16b0f4842
--- /dev/null
+++ b/esp32s2-hal/examples/sha.rs
@@ -0,0 +1,79 @@
+//! Demonstrates the use of the SHA peripheral and compares the speed of hardware-accelerated and pure software hashing.
+//! 
+
+#![no_std]
+#![no_main]
+
+use esp32s2_hal::{
+    clock::ClockControl,
+    pac::Peripherals,
+    prelude::*,
+    timer::TimerGroup,
+    Rtc,
+    sha::{Sha, ShaMode},
+};
+use nb::block;
+use esp_backtrace as _;
+use esp_println::println;
+use xtensa_lx_rt::entry;
+use sha2::{Sha512, Digest};
+
+#[entry]
+fn main() -> ! {
+    // Hash one input with the SHA peripheral and with the software `sha2` crate, printing cycle counts and digests.
+    let peripherals = Peripherals::take().unwrap();
+    let system = peripherals.SYSTEM.split();
+    let clocks = ClockControl::boot_defaults(system.clock_control).freeze();
+
+    let timer_group0 = TimerGroup::new(peripherals.TIMG0, &clocks);
+    let mut wdt = timer_group0.wdt;
+    let mut rtc = Rtc::new(peripherals.RTC_CNTL);
+
+    // Disable MWDT and RWDT (Watchdog) flash boot protection
+    wdt.disable();
+    rtc.rwdt.disable();
+
+    // `remaining` tracks the part of `source_data` that has not been fed to the peripheral yet.
+    let source_data = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa".as_bytes();
+    let mut remaining = source_data;
+    let mut hasher = Sha::new(peripherals.SHA, ShaMode::SHA512);
+
+    // Short hashes can be created by decreasing the output buffer to the desired length
+    let mut output = [0u8; 64];
+
+    let pre_calc = xtensa_lx::timer::get_cycle_count();
+    // The hardware implementation takes a subslice of the input, and returns the unprocessed parts
+    // The unprocessed parts can be input in the next iteration, you can always add more data until
+    // finish() is called. After finish() is called update()'s will contribute to a new hash which
+    // can be extracted again with finish().
+    while !remaining.is_empty() {
+        // Can add println to view progress, however println takes a few orders of magnitude longer than
+        // the Sha function itself so not useful for comparing processing time
+        // println!("Remaining len: {}", remaining.len());
+
+        // All the HW Sha functions are infallible so unwrap is fine to use if you use block!
+        remaining = block!(hasher.update(remaining)).unwrap();
+    }
+
+    // Finish can be called as many times as desired to get multiple copies of the output.
+    block!(hasher.finish(output.as_mut_slice())).unwrap();
+    let post_calc = xtensa_lx::timer::get_cycle_count();
+    // wrapping_sub: the cycle counter may wrap between the two reads; plain `-` would panic in debug builds.
+    let hw_time = post_calc.wrapping_sub(pre_calc);
+    println!("Took {} cycles", hw_time);
+    println!("SHA512 Hash output {:02x?}", output);
+    let _usha = hasher.free();
+
+    let pre_calc = xtensa_lx::timer::get_cycle_count();
+    let mut hasher = Sha512::new();
+    hasher.update(source_data);
+    let soft_result = hasher.finalize();
+    let post_calc = xtensa_lx::timer::get_cycle_count();
+    let soft_time = post_calc.wrapping_sub(pre_calc);
+    println!("Took {} cycles", soft_time);
+    println!("SHA512 Hash output {:02x?}", soft_result);
+
+    println!("HW SHA is {}x faster", soft_time / hw_time);
+
+    loop {}
+}
diff --git a/esp32s2-hal/src/lib.rs b/esp32s2-hal/src/lib.rs
index b109e6d5077..e806c15fd1a 100644
--- a/esp32s2-hal/src/lib.rs
+++ b/esp32s2-hal/src/lib.rs
@@ -30,6 +30,7 @@ pub use esp_hal_common::{
     Rtc,
     Rwdt,
     Serial,
+    sha
 };
 
 #[cfg(feature = "embassy")]
diff --git a/esp32s3-hal/Cargo.toml b/esp32s3-hal/Cargo.toml
index 63bcc433726..c5439e216bf 100644
--- a/esp32s3-hal/Cargo.toml
+++ b/esp32s3-hal/Cargo.toml
@@ -42,6 +42,7 @@ embassy-executor  = { package = "embassy-executor", git = "https://github.com/em
 embedded-graphics = "0.7.1"
 esp-backtrace     = { version = "0.3.0", features = ["esp32s3", "panic-handler", "exception-handler", "print-uart"] }
 esp-println       = { version = "0.3.1", features = ["esp32s3"] }
+sha2              = { version = "0.10.6", default-features = false}
 smart-leds        = "0.3.0"
 ssd1306           = "0.7.1"
 usb-device        = { version = "0.2.9" }
diff --git a/esp32s3-hal/examples/sha.rs b/esp32s3-hal/examples/sha.rs
new file mode 100644
index 00000000000..b47b11f515b
--- /dev/null
+++ b/esp32s3-hal/examples/sha.rs
@@ -0,0 +1,79 @@
+//! Demonstrates the use of the SHA peripheral and compares the speed of hardware-accelerated and pure software hashing.
+//! 
+
+#![no_std]
+#![no_main]
+
+use esp32s3_hal::{
+    clock::ClockControl,
+    pac::Peripherals,
+    prelude::*,
+    timer::TimerGroup,
+    Rtc,
+    sha::{Sha, ShaMode},
+};
+use nb::block;
+use esp_backtrace as _;
+use esp_println::println;
+use xtensa_lx_rt::entry;
+use sha2::{Sha512, Digest};
+
+#[entry]
+fn main() -> ! {
+    // Hash one input with the SHA peripheral and with the software `sha2` crate, printing cycle counts and digests.
+    let peripherals = Peripherals::take().unwrap();
+    let system = peripherals.SYSTEM.split();
+    let clocks = ClockControl::boot_defaults(system.clock_control).freeze();
+
+    let timer_group0 = TimerGroup::new(peripherals.TIMG0, &clocks);
+    let mut wdt = timer_group0.wdt;
+    let mut rtc = Rtc::new(peripherals.RTC_CNTL);
+
+    // Disable MWDT and RWDT (Watchdog) flash boot protection
+    wdt.disable();
+    rtc.rwdt.disable();
+
+    // `remaining` tracks the part of `source_data` that has not been fed to the peripheral yet.
+    let source_data = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa".as_bytes();
+    let mut remaining = source_data;
+    let mut hasher = Sha::new(peripherals.SHA, ShaMode::SHA512);
+
+    // Short hashes can be created by decreasing the output buffer to the desired length
+    let mut output = [0u8; 64];
+
+    let pre_calc = xtensa_lx::timer::get_cycle_count();
+    // The hardware implementation takes a subslice of the input, and returns the unprocessed parts
+    // The unprocessed parts can be input in the next iteration, you can always add more data until
+    // finish() is called. After finish() is called update()'s will contribute to a new hash which
+    // can be extracted again with finish().
+    while !remaining.is_empty() {
+        // Can add println to view progress, however println takes a few orders of magnitude longer than
+        // the Sha function itself so not useful for comparing processing time
+        // println!("Remaining len: {}", remaining.len());
+
+        // All the HW Sha functions are infallible so unwrap is fine to use if you use block!
+        remaining = block!(hasher.update(remaining)).unwrap();
+    }
+
+    // Finish can be called as many times as desired to get multiple copies of the output.
+    block!(hasher.finish(output.as_mut_slice())).unwrap();
+    let post_calc = xtensa_lx::timer::get_cycle_count();
+    // wrapping_sub: the cycle counter may wrap between the two reads; plain `-` would panic in debug builds.
+    let hw_time = post_calc.wrapping_sub(pre_calc);
+    println!("Took {} cycles", hw_time);
+    println!("SHA512 Hash output {:02x?}", output);
+    let _usha = hasher.free();
+
+    let pre_calc = xtensa_lx::timer::get_cycle_count();
+    let mut hasher = Sha512::new();
+    hasher.update(source_data);
+    let soft_result = hasher.finalize();
+    let post_calc = xtensa_lx::timer::get_cycle_count();
+    let soft_time = post_calc.wrapping_sub(pre_calc);
+    println!("Took {} cycles", soft_time);
+    println!("SHA512 Hash output {:02x?}", soft_result);
+
+    println!("HW SHA is {}x faster", soft_time / hw_time);
+
+    loop {}
+}
diff --git a/esp32s3-hal/src/lib.rs b/esp32s3-hal/src/lib.rs
index 94389bf4304..fd161de7632 100644
--- a/esp32s3-hal/src/lib.rs
+++ b/esp32s3-hal/src/lib.rs
@@ -33,6 +33,7 @@ pub use esp_hal_common::{
     Rwdt,
     Serial,
     UsbSerialJtag,
+    sha
 };
 
 #[cfg(feature = "embassy")]