diff --git a/CHANGELOG.md b/CHANGELOG.md
index a1bae5fba5e..9d8d746eb74 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,6 +19,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Changed
 
 - Bumped MSRV to 1.67 (#798)
+- Optimised multi-core critical section implementation (#797)
 
 ### Fixed
diff --git a/esp-hal-common/Cargo.toml b/esp-hal-common/Cargo.toml
index cce0468cb88..fe44cb716de 100644
--- a/esp-hal-common/Cargo.toml
+++ b/esp-hal-common/Cargo.toml
@@ -26,7 +26,6 @@ embedded-io = "0.5.0"
 esp-synopsys-usb-otg = { version = "0.3.2", optional = true, features = ["fs", "esp32sx"] }
 fugit = "0.3.7"
 log = { version = "0.4.20", optional = true }
-lock_api = { version = "0.4.10", optional = true }
 nb = "1.1.0"
 paste = "1.0.14"
 procmacros = { version = "0.6.1", package = "esp-hal-procmacros", path = "../esp-hal-procmacros" }
@@ -70,13 +69,13 @@ basic-toml = "0.1.4"
 serde = { version = "1.0.188", features = ["derive"] }
 
 [features]
-esp32 = ["esp32/rt", "xtensa", "xtensa-lx/esp32", "xtensa-lx-rt/esp32", "lock_api", "procmacros/esp32"]
+esp32 = ["esp32/rt", "xtensa", "xtensa-lx/esp32", "xtensa-lx-rt/esp32", "procmacros/esp32"]
 esp32c2 = ["esp32c2/rt", "riscv", "procmacros/esp32c2"]
 esp32c3 = ["esp32c3/rt", "riscv", "procmacros/esp32c3"]
 esp32c6 = ["esp32c6/rt", "riscv", "procmacros/esp32c6"]
 esp32h2 = ["esp32h2/rt", "riscv", "procmacros/esp32h2"]
 esp32s2 = ["esp32s2/rt", "xtensa", "xtensa-lx/esp32s2", "xtensa-lx-rt/esp32s2", "esp-synopsys-usb-otg", "usb-device", "procmacros/esp32s2"]
-esp32s3 = ["esp32s3/rt", "xtensa", "xtensa-lx/esp32s3", "xtensa-lx-rt/esp32s3", "lock_api", "esp-synopsys-usb-otg", "usb-device", "procmacros/esp32s3"]
+esp32s3 = ["esp32s3/rt", "xtensa", "xtensa-lx/esp32s3", "xtensa-lx-rt/esp32s3", "esp-synopsys-usb-otg", "usb-device", "procmacros/esp32s3"]
 
 # Crystal frequency selection (ESP32 and ESP32-C2 only!)
 esp32_26mhz = []
diff --git a/esp-hal-common/src/lib.rs b/esp-hal-common/src/lib.rs
index 702e2c50753..8c3f97d3ae2 100644
--- a/esp-hal-common/src/lib.rs
+++ b/esp-hal-common/src/lib.rs
@@ -181,19 +181,23 @@ pub enum Cpu {
     AppCpu,
 }
 
+#[cfg(all(xtensa, multi_core))]
+fn get_raw_core() -> u32 {
+    xtensa_lx::get_processor_id() & 0x2000
+}
+
 /// Which core the application is currently executing on
+#[cfg(all(xtensa, multi_core))]
 pub fn get_core() -> Cpu {
-    #[cfg(all(xtensa, multi_core))]
-    match ((xtensa_lx::get_processor_id() >> 13) & 1) != 0 {
-        false => Cpu::ProCpu,
-        true => Cpu::AppCpu,
+    match get_raw_core() {
+        0 => Cpu::ProCpu,
+        _ => Cpu::AppCpu,
     }
+}
 
-    // #[cfg(all(riscv, multi_core))]
-    // TODO get hart_id
-
-    // single core always has ProCpu only
-    #[cfg(single_core)]
+/// Which core the application is currently executing on
+#[cfg(not(all(xtensa, multi_core)))]
+pub fn get_core() -> Cpu {
     Cpu::ProCpu
 }
 
@@ -204,30 +208,50 @@ mod critical_section_impl {
 
     #[cfg(xtensa)]
     mod xtensa {
+        // PS has 15 useful bits. Bits 12..16 and 19..32 are unused, so we can use bit
+        // #31 as our reentry flag.
+        #[cfg(multi_core)]
+        const REENTRY_FLAG: u32 = 1 << 31;
+
         unsafe impl critical_section::Impl for super::CriticalSection {
             unsafe fn acquire() -> critical_section::RawRestoreState {
-                let tkn: critical_section::RawRestoreState;
+                let mut tkn: critical_section::RawRestoreState;
                 core::arch::asm!("rsil {0}, 5", out(reg) tkn);
 
                 #[cfg(multi_core)]
                 {
-                    let guard = super::multicore::MULTICORE_LOCK.lock();
-                    core::mem::forget(guard); // forget it so drop doesn't run
+                    use super::multicore::{LockKind, MULTICORE_LOCK};
+
+                    match MULTICORE_LOCK.lock() {
+                        LockKind::Lock => {
+                            // We can assume the reserved bit is 0, otherwise
+                            // rsil - wsr pairings would be undefined behavior.
+                        }
+                        LockKind::Reentry => tkn |= REENTRY_FLAG,
+                    }
                 }
 
                 tkn
             }
 
             unsafe fn release(token: critical_section::RawRestoreState) {
-                if token != 0 {
-                    #[cfg(multi_core)]
-                    {
-                        debug_assert!(super::multicore::MULTICORE_LOCK.is_owned_by_current_thread());
-                        // safety: we logically own the mutex from acquire()
-                        super::multicore::MULTICORE_LOCK.force_unlock();
+                #[cfg(multi_core)]
+                {
+                    use super::multicore::MULTICORE_LOCK;
+
+                    debug_assert!(MULTICORE_LOCK.is_owned_by_current_thread());
+
+                    if token & REENTRY_FLAG != 0 {
+                        return;
                     }
-                    core::arch::asm!(
-                        "wsr.ps {0}",
-                        "rsync", in(reg) token)
+
+                    MULTICORE_LOCK.unlock();
                 }
+
+                const RESERVED_MASK: u32 = 0b1111_1111_1111_1000_1111_0000_0000_0000;
+                debug_assert!(token & RESERVED_MASK == 0);
+
+                core::arch::asm!(
+                    "wsr.ps {0}",
+                    "rsync", in(reg) token)
             }
         }
     }
@@ -236,28 +260,45 @@ mod critical_section_impl {
     mod riscv {
         use esp_riscv_rt::riscv;
 
+        // The restore state is a u8 that is cast from a bool, so it has a value of
+        // 0x00 or 0x01 before we add the reentry flag to it.
+        #[cfg(multi_core)]
+        const REENTRY_FLAG: u8 = 1 << 7;
+
         unsafe impl critical_section::Impl for super::CriticalSection {
             unsafe fn acquire() -> critical_section::RawRestoreState {
                 let mut mstatus = 0u32;
                 core::arch::asm!("csrrci {0}, mstatus, 8", inout(reg) mstatus);
-                let interrupts_active = (mstatus & 0b1000) != 0;
+
+                let mut tkn = ((mstatus & 0b1000) != 0) as critical_section::RawRestoreState;
+
                 #[cfg(multi_core)]
                 {
-                    let guard = multicore::MULTICORE_LOCK.lock();
-                    core::mem::forget(guard); // forget it so drop doesn't run
+                    use super::multicore::{LockKind, MULTICORE_LOCK};
+
+                    match MULTICORE_LOCK.lock() {
+                        LockKind::Lock => {}
+                        LockKind::Reentry => tkn |= REENTRY_FLAG,
+                    }
                 }
 
-                interrupts_active as _
+                tkn
             }
 
             unsafe fn release(token: critical_section::RawRestoreState) {
-                if token != 0 {
-                    #[cfg(multi_core)]
-                    {
-                        debug_assert!(multicore::MULTICORE_LOCK.is_owned_by_current_thread());
-                        // safety: we logically own the mutex from acquire()
-                        multicore::MULTICORE_LOCK.force_unlock();
+                #[cfg(multi_core)]
+                {
+                    use super::multicore::MULTICORE_LOCK;
+
+                    debug_assert!(MULTICORE_LOCK.is_owned_by_current_thread());
+
+                    if token & REENTRY_FLAG != 0 {
+                        return;
                     }
+
+                    MULTICORE_LOCK.unlock();
+                }
+
+                if token != 0 {
                     riscv::interrupt::enable();
                 }
             }
@@ -266,50 +307,70 @@ mod critical_section_impl {
 
     #[cfg(multi_core)]
     mod multicore {
-        use core::sync::atomic::{AtomicBool, Ordering};
+        use core::sync::atomic::{AtomicUsize, Ordering};
 
-        use lock_api::{GetThreadId, GuardSend, RawMutex};
+        // We're using a value that we know get_raw_core() will never return. This
+        // avoids an unnecessary increment of the core ID.
+        //
+        // TODO: the first multi-core RISC-V target will show if this value is OK
+        // globally or only for Xtensa.
+        #[cfg(xtensa)]
+        const UNUSED_THREAD_ID_VALUE: usize = 0x0001;
 
-        use crate::get_core;
+        fn thread_id() -> usize {
+            crate::get_raw_core() as usize
+        }
 
-        /// Reentrant Mutex
-        ///
-        /// Currently implemented using an atomic spin lock.
-        /// In the future we can optimize this raw mutex to use some hardware
-        /// features.
-        pub(crate) static MULTICORE_LOCK: lock_api::ReentrantMutex<RawSpinlock, RawThreadId, ()> =
-            lock_api::ReentrantMutex::const_new(RawSpinlock::INIT, RawThreadId::INIT, ());
+        pub(super) static MULTICORE_LOCK: ReentrantMutex = ReentrantMutex::new();
 
-        pub(crate) struct RawThreadId;
+        pub(super) enum LockKind {
+            Lock = 0,
+            Reentry,
+        }
 
-        unsafe impl lock_api::GetThreadId for RawThreadId {
-            const INIT: Self = RawThreadId;
+        pub(super) struct ReentrantMutex {
+            owner: AtomicUsize,
+        }
 
-            fn nonzero_thread_id(&self) -> core::num::NonZeroUsize {
-                core::num::NonZeroUsize::new((get_core() as usize) + 1).unwrap()
+        impl ReentrantMutex {
+            const fn new() -> Self {
+                Self {
+                    owner: AtomicUsize::new(UNUSED_THREAD_ID_VALUE),
+                }
             }
-        }
 
-        pub(crate) struct RawSpinlock(AtomicBool);
+            pub fn is_owned_by_current_thread(&self) -> bool {
+                self.owner.load(Ordering::Relaxed) == thread_id()
+            }
+
+            pub(super) fn lock(&self) -> LockKind {
+                let current_thread_id = thread_id();
 
-        unsafe impl lock_api::RawMutex for RawSpinlock {
-            const INIT: RawSpinlock = RawSpinlock(AtomicBool::new(false));
+                if self.try_lock(current_thread_id) {
+                    return LockKind::Lock;
+                }
+
+                let current_owner = self.owner.load(Ordering::Relaxed);
+                if current_owner == current_thread_id {
+                    return LockKind::Reentry;
+                }
 
-            // A spinlock guard can be sent to another thread and unlocked there
-            type GuardMarker = GuardSend;
+                while !self.try_lock(current_thread_id) {}
 
-            fn lock(&self) {
-                while !self.try_lock() {}
+                LockKind::Lock
             }
 
-            fn try_lock(&self) -> bool {
-                self.0
-                    .compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed)
+            fn try_lock(&self, new_owner: usize) -> bool {
+                self.owner
+                    .compare_exchange(
+                        UNUSED_THREAD_ID_VALUE,
+                        new_owner,
+                        Ordering::Acquire,
+                        Ordering::Relaxed,
+                    )
                     .is_ok()
             }
 
-            unsafe fn unlock(&self) {
-                self.0.store(false, Ordering::Release);
+            pub(super) fn unlock(&self) {
+                self.owner.store(UNUSED_THREAD_ID_VALUE, Ordering::Release);
             }
         }
     }
 }
diff --git a/esp32s3-hal/ld/db-esp32s3.x b/esp32s3-hal/ld/db-esp32s3.x
index 0d41b3a415a..37bf0675382 100644
--- a/esp32s3-hal/ld/db-esp32s3.x
+++ b/esp32s3-hal/ld/db-esp32s3.x
@@ -97,7 +97,7 @@ SECTIONS {
   }
   _text_size = _etext - _stext;
 
-  .rodata ORIGIN(RODATA) + 0x408 + _text_size : AT(_text_size + SIZEOF(.header) + SIZEOF(.pre_header))
+  .rodata ORIGIN(RODATA) + 0x408 + _text_size :
   {
     _rodata_start = ABSOLUTE(.);
    . = ALIGN (4);
@@ -106,8 +106,7 @@ SECTIONS {
    *(.rodata .rodata.*)
    _rodata_end = ABSOLUTE(.);
   }
 
-  .rwtext ORIGIN(RWTEXT) + 0x408 + _text_size + SIZEOF(.rodata) :
-  AT(_text_size + SIZEOF(.header) + SIZEOF(.pre_header) + SIZEOF(.rodata))
+  .rwtext ORIGIN(RWTEXT) + 0x408 + _text_size + SIZEOF(.rodata) :
   {
    _irwtext = ORIGIN(RODATA) + 0x408 + _text_size + SIZEOF(.rodata);
    _srwtext = .;
@@ -156,7 +155,6 @@ SECTIONS {
  }
 
  .data ORIGIN(RWDATA) :
-  AT(_text_size + SIZEOF(.header) + SIZEOF(.pre_header) + SIZEOF(.rodata) + SIZEOF(.rwtext))
  {
    _data_start = ABSOLUTE(.);
    . = ALIGN (4);
@@ -164,10 +162,9 @@ SECTIONS {
    *(.data .data.*)
    . = ALIGN (4);
    _data_end = ABSOLUTE(.);
  }
 
- /* LMA of .data */
- _sidata = ORIGIN(RODATA) + _text_size + SIZEOF(.header) + SIZEOF(.pre_header) + SIZEOF(.rodata) + SIZEOF(.rwtext);
+ _sidata = _erwtext - ORIGIN(RWTEXT) + ORIGIN(RODATA);
 
  .bss (NOLOAD) : ALIGN(4)
  {
@@ -193,7 +190,6 @@
  } > RWDATA
 
  .rtc_fast.text ORIGIN(rtc_fast_iram_seg) :
-  AT(_text_size + SIZEOF(.header) + SIZEOF(.pre_header) + SIZEOF(.rodata) + SIZEOF(.rwtext) )
  {
    . = ALIGN(4);
    _rtc_fast_text_start = ABSOLUTE(.);
@@ -204,7 +200,6 @@
  _irtc_fast_text = ORIGIN(RODATA) + _text_size + SIZEOF(.header) + SIZEOF(.pre_header) + SIZEOF(.rodata) + SIZEOF(.rwtext);
 
  .rtc_fast.data ORIGIN(rtc_fast_dram_seg) + SIZEOF(.rtc_fast.text) :
-  AT(_text_size + SIZEOF(.header) + SIZEOF(.pre_header) + SIZEOF(.rodata) + SIZEOF(.rwtext) + SIZEOF(.rtc_fast.text) )
  {
    . = ALIGN(4);
    _rtc_fast_data_start = ABSOLUTE(.);
@@ -215,8 +210,6 @@
  _irtc_fast_data = ORIGIN(RODATA) + _text_size + SIZEOF(.header) + SIZEOF(.pre_header) + SIZEOF(.rodata) + SIZEOF(.rwtext) + SIZEOF(.rtc_fast.text);
 
  .rtc_fast.bss ORIGIN(rtc_fast_dram_seg) + SIZEOF(.rtc_fast.text) + SIZEOF(.rtc_fast.data) (NOLOAD) :
-  AT(_text_size + SIZEOF(.header) + SIZEOF(.pre_header) + SIZEOF(.rodata) +
-     SIZEOF(.rwtext) + SIZEOF(.rtc_fast.text) + SIZEOF(.rtc_fast.data))
  {
    . = ALIGN(4);
    _rtc_fast_bss_start = ABSOLUTE(.);
@@ -233,8 +226,6 @@
  }
 
  .rtc_slow.text ORIGIN(rtc_slow_seg) :
-  AT(_text_size + SIZEOF(.header) + SIZEOF(.pre_header) + SIZEOF(.rodata) + SIZEOF(.rwtext) +
-     SIZEOF(.rtc_fast.text) + SIZEOF(.rtc_fast.data) + SIZEOF(.rtc_fast.bss))
  {
    . = ALIGN(4);
    _rtc_slow_text_start = ABSOLUTE(.);
@@ -246,8 +237,6 @@
     SIZEOF(.rtc_fast.text) + SIZEOF(.rtc_fast.data) + SIZEOF(.rtc_fast.bss);
 
  .rtc_slow.data ORIGIN(rtc_slow_seg) + SIZEOF(.rtc_slow.text) :
-  AT(_text_size + SIZEOF(.header) + SIZEOF(.pre_header) + SIZEOF(.rodata) + SIZEOF(.rwtext) +
-     SIZEOF(.rtc_fast.text) + SIZEOF(.rtc_fast.data) + SIZEOF(.rtc_fast.bss) + SIZEOF(.rtc_slow.text))
  {
    . = ALIGN(4);
    _rtc_slow_data_start = ABSOLUTE(.);
@@ -259,8 +248,6 @@
     SIZEOF(.rtc_fast.text) + SIZEOF(.rtc_fast.data) + SIZEOF(.rtc_fast.bss) + SIZEOF(.rtc_slow.text);
 
  .rtc_slow.bss ORIGIN(rtc_slow_seg) + SIZEOF(.rtc_slow.text) + SIZEOF(.rtc_slow.data) (NOLOAD) :
-  AT(_text_size + SIZEOF(.header) + SIZEOF(.pre_header) + SIZEOF(.rodata) + SIZEOF(.rwtext) +
-     SIZEOF(.rtc_fast.text) + SIZEOF(.rtc_fast.data) + SIZEOF(.rtc_fast.bss) + SIZEOF(.rtc_slow.text) + SIZEOF(.rtc_slow.data))
  {
    . = ALIGN(4);
    _rtc_slow_bss_start = ABSOLUTE(.);
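
The notes below are reviewer sketches, not part of the patch. First, the locking protocol behind the new `ReentrantMutex`: a single compare-exchange on the owner word both takes the lock and records who holds it, so a nested `acquire` on the same core is detected by comparing owner IDs instead of deadlocking. A minimal host-side model of that protocol (std-based and illustrative only; `UNUSED` and the integer core IDs stand in for `UNUSED_THREAD_ID_VALUE` and `get_raw_core()`):

```rust
use std::sync::atomic::{AtomicUsize, Ordering};

// Sentinel meaning "unlocked"; it must be a value no core ID can take,
// mirroring UNUSED_THREAD_ID_VALUE in the patch.
const UNUSED: usize = usize::MAX;

enum LockKind {
    Lock,
    Reentry,
}

struct ReentrantSpinlock {
    owner: AtomicUsize,
}

impl ReentrantSpinlock {
    const fn new() -> Self {
        Self { owner: AtomicUsize::new(UNUSED) }
    }

    fn try_lock(&self, me: usize) -> bool {
        // Acquire on success pairs with the Release store in unlock().
        self.owner
            .compare_exchange(UNUSED, me, Ordering::Acquire, Ordering::Relaxed)
            .is_ok()
    }

    fn lock(&self, me: usize) -> LockKind {
        // Fast path: the lock was free.
        if self.try_lock(me) {
            return LockKind::Lock;
        }
        // Taken: if we are already the owner, this is a nested acquire.
        if self.owner.load(Ordering::Relaxed) == me {
            return LockKind::Reentry;
        }
        // Otherwise spin until the other core releases the lock.
        while !self.try_lock(me) {}
        LockKind::Lock
    }

    fn unlock(&self) {
        self.owner.store(UNUSED, Ordering::Release);
    }
}

fn main() {
    let lock = ReentrantSpinlock::new();
    assert!(matches!(lock.lock(0), LockKind::Lock));
    // The same "core" acquiring again is reported, not deadlocked on.
    assert!(matches!(lock.lock(0), LockKind::Reentry));
    // Only the outermost release actually frees the lock.
    lock.unlock();
    assert!(matches!(lock.lock(1), LockKind::Lock));
    lock.unlock();
}
```

Compared to the old `lock_api` spinlock plus a separate `GetThreadId` implementation, this folds ownership tracking into the lock word itself, which is what lets the `lock_api` dependency be dropped.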
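Second, how the reentry flag rides inside the `critical_section` token. On Xtensa the token is the saved PS word and bit 31 is one of its unused bits; on RISC-V the token starts as a bool (0x00/0x01), leaving bit 7 free. A checkable sketch using the diff's constants (the sample `saved_ps` value is hypothetical):

```rust
// Xtensa: bits 12..16 and 19..32 of PS are unused, so bit #31 can carry the
// reentry flag and an outermost token always has the reserved bits clear.
const REENTRY_FLAG: u32 = 1 << 31;
const RESERVED_MASK: u32 = 0b1111_1111_1111_1000_1111_0000_0000_0000;

// RISC-V: the token is built from a bool, so bit 7 is free for the flag.
const REENTRY_FLAG_RV: u8 = 1 << 7;

fn main() {
    // Hypothetical PS value captured by `rsil` (only non-reserved bits set).
    let saved_ps: u32 = 0x0004_0025;
    assert_eq!(saved_ps & RESERVED_MASK, 0);

    // A nested acquire tags the token in a reserved bit...
    let token = saved_ps | REENTRY_FLAG;
    // ...so release sees the flag and returns early instead of writing PS...
    assert_ne!(token & REENTRY_FLAG, 0);
    // ...while an outermost token passes the RESERVED_MASK debug assertion.
    assert_eq!(saved_ps & !RESERVED_MASK, saved_ps);

    // Same idea on RISC-V: interrupt state and reentry flag share one u8.
    let rv_token = (true as u8) | REENTRY_FLAG_RV;
    assert_eq!(rv_token & !REENTRY_FLAG_RV, 1);
}
```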
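Finally, the linker-script side: the removed AT() expressions and the new `_sidata = _erwtext - ORIGIN(RWTEXT) + ORIGIN(RODATA)` describe the same load address, assuming `0x408` equals `SIZEOF(.header) + SIZEOF(.pre_header)` as the script's VMA expressions imply. A sketch with made-up origins and sizes:

```rust
// Made-up layout constants; HEADERS stands in for
// SIZEOF(.header) + SIZEOF(.pre_header), per the script's VMA expressions.
const ORIGIN_RODATA: u64 = 0x3C00_0000; // hypothetical
const ORIGIN_RWTEXT: u64 = 0x4038_0000; // hypothetical
const HEADERS: u64 = 0x408;

fn main() {
    // Hypothetical section sizes.
    let (text, rodata, rwtext) = (0x1_2000u64, 0x3000u64, 0x800u64);

    // VMA end of .rwtext as the script lays it out:
    // ORIGIN(RWTEXT) + 0x408 + _text_size + SIZEOF(.rodata) + SIZEOF(.rwtext)
    let erwtext = ORIGIN_RWTEXT + HEADERS + text + rodata + rwtext;

    // Removed, hand-maintained LMA sum for .data.
    let old_sidata = ORIGIN_RODATA + text + HEADERS + rodata + rwtext;
    // New expression: derive the LMA from the VMA instead of re-summing sizes.
    let new_sidata = erwtext - ORIGIN_RWTEXT + ORIGIN_RODATA;

    assert_eq!(old_sidata, new_sidata);
}
```

Deriving the LMA from the section's own end symbol means the expression no longer has to be kept in sync by hand every time a section is added or resized.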