diff --git a/builder/musl.go b/builder/musl.go index 3c79c7c43a..54ec2c83ed 100644 --- a/builder/musl.go +++ b/builder/musl.go @@ -113,6 +113,7 @@ var libMusl = Library{ librarySources: func(target string) ([]string, error) { arch := compileopts.MuslArchitecture(target) globs := []string{ + "conf/*.c", "env/*.c", "errno/*.c", "exit/*.c", diff --git a/builder/sizes.go b/builder/sizes.go index 485a652d97..9aa74ffa82 100644 --- a/builder/sizes.go +++ b/builder/sizes.go @@ -490,9 +490,9 @@ func loadProgramSize(path string, packagePathMap map[string]string) (*programSiz continue } if section.Type == elf.SHT_NOBITS { - if section.Name == ".stack" { + if section.Name == ".stack" || section.Name == ".stack1" { // TinyGo emits stack sections on microcontroller using the - // ".stack" name. + // ".stack" (or ".stack1") name. // This is a bit ugly, but I don't think there is a way to // mark the stack section in a linker script. sections = append(sections, memorySection{ diff --git a/compileopts/config.go b/compileopts/config.go index ee5c34537c..1c88519cd6 100644 --- a/compileopts/config.go +++ b/compileopts/config.go @@ -99,6 +99,11 @@ func (c *Config) BuildTags() []string { "math_big_pure_go", // to get math/big to work "gc." + c.GC(), "scheduler." + c.Scheduler(), // used inside the runtime package "serial." + c.Serial()}...) // used inside the machine package + switch c.Scheduler() { + case "threads", "cores": + default: + tags = append(tags, "tinygo.unicore") + } for i := 1; i <= c.GoMinorVersion; i++ { tags = append(tags, fmt.Sprintf("go1.%d", i)) } diff --git a/compileopts/options.go b/compileopts/options.go index 30e0e4dbed..4366dd5c11 100644 --- a/compileopts/options.go +++ b/compileopts/options.go @@ -10,7 +10,7 @@ import ( var ( validBuildModeOptions = []string{"default", "c-shared"} validGCOptions = []string{"none", "leaking", "conservative", "custom", "precise"} - validSchedulerOptions = []string{"none", "tasks", "asyncify"} + validSchedulerOptions = []string{"none", "tasks", "asyncify", "threads", "cores"} validSerialOptions = []string{"none", "uart", "usb", "rtt"} validPrintSizeOptions = []string{"none", "short", "full", "html"} validPanicStrategyOptions = []string{"print", "trap"} diff --git a/compileopts/options_test.go b/compileopts/options_test.go index ee63c4c46d..d52ef5690e 100644 --- a/compileopts/options_test.go +++ b/compileopts/options_test.go @@ -10,7 +10,7 @@ import ( func TestVerifyOptions(t *testing.T) { expectedGCError := errors.New(`invalid gc option 'incorrect': valid values are none, leaking, conservative, custom, precise`) - expectedSchedulerError := errors.New(`invalid scheduler option 'incorrect': valid values are none, tasks, asyncify`) + expectedSchedulerError := errors.New(`invalid scheduler option 'incorrect': valid values are none, tasks, asyncify, threads`) expectedPrintSizeError := errors.New(`invalid size option 'incorrect': valid values are none, short, full, html`) expectedPanicStrategyError := errors.New(`invalid panic option 'incorrect': valid values are print, trap`) diff --git a/compileopts/target.go b/compileopts/target.go index 7893e58290..961e69a5d8 100644 --- a/compileopts/target.go +++ b/compileopts/target.go @@ -248,7 +248,6 @@ func defaultTarget(options *Options) (*TargetSpec, error) { GOARCH: options.GOARCH, BuildTags: []string{options.GOOS, options.GOARCH}, GC: "precise", - Scheduler: "tasks", Linker: "cc", DefaultStackSize: 1024 * 64, // 64kB GDB: []string{"gdb"}, @@ -381,6 +380,7 @@ func defaultTarget(options *Options) (*TargetSpec, error) 
{ platformVersion = "11.0.0" // first macosx platform with arm64 support } llvmvendor = "apple" + spec.Scheduler = "tasks" spec.Linker = "ld.lld" spec.Libc = "darwin-libSystem" // Use macosx* instead of darwin, otherwise darwin/arm64 will refer to @@ -398,6 +398,7 @@ func defaultTarget(options *Options) (*TargetSpec, error) { "src/runtime/runtime_unix.c", "src/runtime/signal.c") case "linux": + spec.Scheduler = "threads" spec.Linker = "ld.lld" spec.RTLib = "compiler-rt" spec.Libc = "musl" @@ -418,9 +419,11 @@ func defaultTarget(options *Options) (*TargetSpec, error) { } spec.ExtraFiles = append(spec.ExtraFiles, "src/internal/futex/futex_linux.c", + "src/internal/task/task_threads.c", "src/runtime/runtime_unix.c", "src/runtime/signal.c") case "windows": + spec.Scheduler = "tasks" spec.Linker = "ld.lld" spec.Libc = "mingw-w64" // Note: using a medium code model, low image base and no ASLR @@ -498,6 +501,7 @@ func defaultTarget(options *Options) (*TargetSpec, error) { } spec.ExtraFiles = append(spec.ExtraFiles, "src/runtime/asm_"+asmGoarch+suffix+".S") spec.ExtraFiles = append(spec.ExtraFiles, "src/internal/task/task_stack_"+asmGoarch+suffix+".S") + spec.ExtraFiles = append(spec.ExtraFiles, "src/internal/task/task_cores.c") } // Configure the emulator. diff --git a/compiler/symbol.go b/compiler/symbol.go index ff7ef05508..8ad2184d39 100644 --- a/compiler/symbol.go +++ b/compiler/symbol.go @@ -208,6 +208,8 @@ func (c *compilerContext) getFunction(fn *ssa.Function) (llvm.Type, llvm.Value) // > circumstances, and should not be exposed to source languages. llvmutil.AppendToGlobal(c.mod, "llvm.compiler.used", llvmFn) } + case "tinygo_exitTask", "tinygo_schedulerUnlock": + llvmutil.AppendToGlobal(c.mod, "llvm.used", llvmFn) } // External/exported functions may not retain pointer values. diff --git a/src/internal/task/atomic-cooperative.go b/src/internal/task/atomic-cooperative.go index 60eb917a8e..e05ea7de0d 100644 --- a/src/internal/task/atomic-cooperative.go +++ b/src/internal/task/atomic-cooperative.go @@ -1,3 +1,5 @@ +//go:build tinygo.unicore + package task // Atomics implementation for cooperative systems. The atomic types here aren't diff --git a/src/internal/task/atomic-preemptive.go b/src/internal/task/atomic-preemptive.go new file mode 100644 index 0000000000..b395ef48a3 --- /dev/null +++ b/src/internal/task/atomic-preemptive.go @@ -0,0 +1,14 @@ +//go:build !tinygo.unicore + +package task + +// Atomics implementation for non-cooperative systems (multithreaded, etc). +// These atomic types use real atomic instructions. + +import "sync/atomic" + +type ( + Uintptr = atomic.Uintptr + Uint32 = atomic.Uint32 + Uint64 = atomic.Uint64 +) diff --git a/src/internal/task/futex-cooperative.go b/src/internal/task/futex-cooperative.go index 8351f88774..ae9efb5a73 100644 --- a/src/internal/task/futex-cooperative.go +++ b/src/internal/task/futex-cooperative.go @@ -1,3 +1,5 @@ +//go:build tinygo.unicore + package task // A futex is a way for userspace to wait with the pointer as the key, and for diff --git a/src/internal/task/futex-cores.go b/src/internal/task/futex-cores.go new file mode 100644 index 0000000000..0ffccba2e4 --- /dev/null +++ b/src/internal/task/futex-cores.go @@ -0,0 +1,64 @@ +//go:build scheduler.cores + +package task + +import "runtime/interrupt" + +// A futex is a way for userspace to wait with the pointer as the key, and for +// another thread to wake one or all waiting threads keyed on the same pointer. 
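A rough illustration (not part of this patch) of how the new tinygo.unicore build tag is meant to be consumed: a package provides one file per flavour, so single-core cooperative builds keep the cheap non-atomic code paths while threads/cores builds get real atomics. The file names and the Counter type below are hypothetical.

//go:build tinygo.unicore

// counter_unicore.go (hypothetical): one core, cooperative scheduling, no preemption.
package task

type Counter struct{ v uint32 }

func (c *Counter) Add(n uint32) uint32 { c.v += n; return c.v }

// ---- and its hypothetical preemptive counterpart ----

//go:build !tinygo.unicore

// counter_preemptive.go (hypothetical): threads or cores may run in parallel.
package task

import "sync/atomic"

type Counter struct{ v atomic.Uint32 }

func (c *Counter) Add(n uint32) uint32 { return c.v.Add(n) }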
+// +// A futex does not change the underlying value, it only reads it before to prevent +// lost wake-ups. +type Futex struct { + Uint32 + + waiters Stack +} + +// Atomically check for cmp to still be equal to the futex value and if so, go +// to sleep. Return true if we were definitely awoken by a call to Wake or +// WakeAll, and false if we can't be sure of that. +func (f *Futex) Wait(cmp uint32) (awoken bool) { + mask := futexLock() + + if f.Uint32.Load() != cmp { + futexUnlock(mask) + return false + } + + // Push the current goroutine onto the waiter stack. + f.waiters.Push(Current()) + + futexUnlock(mask) + + // Pause until this task is awoken by Wake/WakeAll. + Pause() + + // We were awoken by a call to Wake or WakeAll. There is no chance for + // spurious wakeups. + return true +} + +// Wake a single waiter. +func (f *Futex) Wake() { + mask := futexLock() + if t := f.waiters.Pop(); t != nil { + scheduleTask(t) + } + futexUnlock(mask) +} + +// Wake all waiters. +func (f *Futex) WakeAll() { + mask := futexLock() + for t := f.waiters.Pop(); t != nil; t = f.waiters.Pop() { + scheduleTask(t) + } + futexUnlock(mask) +} + +//go:linkname futexLock runtime.futexLock +func futexLock() interrupt.State + +//go:linkname futexUnlock runtime.futexUnlock +func futexUnlock(interrupt.State) diff --git a/src/internal/task/futex-threads.go b/src/internal/task/futex-threads.go new file mode 100644 index 0000000000..7f9e89580c --- /dev/null +++ b/src/internal/task/futex-threads.go @@ -0,0 +1,7 @@ +//go:build scheduler.threads + +package task + +import "internal/futex" + +type Futex = futex.Futex diff --git a/src/internal/task/linux.go b/src/internal/task/linux.go new file mode 100644 index 0000000000..7d28f708c4 --- /dev/null +++ b/src/internal/task/linux.go @@ -0,0 +1,9 @@ +//go:build linux && !baremetal + +package task + +import "unsafe" + +// Musl uses a pointer (or unsigned long for C++) so unsafe.Pointer should be +// fine. +type threadID unsafe.Pointer diff --git a/src/internal/task/mutex-cooperative.go b/src/internal/task/mutex-cooperative.go index e40966bed4..90274df2bb 100644 --- a/src/internal/task/mutex-cooperative.go +++ b/src/internal/task/mutex-cooperative.go @@ -1,3 +1,5 @@ +//go:build tinygo.unicore + package task type Mutex struct { diff --git a/src/internal/task/mutex-preemptive.go b/src/internal/task/mutex-preemptive.go new file mode 100644 index 0000000000..ec83a6135d --- /dev/null +++ b/src/internal/task/mutex-preemptive.go @@ -0,0 +1,71 @@ +//go:build !tinygo.unicore + +package task + +// Futex-based mutex. +// This is largely based on the paper "Futexes are Tricky" by Ulrich Drepper. +// It describes a few ways to implement mutexes using a futex, and how some +// seemingly-obvious implementations don't exactly work as intended. +// Unfortunately, Go atomic operations work slightly differently so we can't +// copy the algorithm verbatim. +// +// The implementation works like this. The futex can have 3 different values, +// depending on the state: +// +// - 0: the futex is currently unlocked. +// - 1: the futex is locked, but is uncontended. There is one special case: if +// a contended futex is unlocked, it is set to 0. It is possible for another +// thread to lock the futex before the next waiter is woken. But because a +// waiter will be woken (if there is one), it will always change to 2 +// regardless. So this is not a problem. +// - 2: the futex is locked, and is contended. At least one thread is trying +// to obtain the lock (and is in the contended loop, see below). 
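// Illustrative trace of these three states, with goroutines A and B contending
// for the same Mutex (a sketch derived from the Lock/Unlock code below):
//
//	A: CompareAndSwap(0, 1) succeeds       -> state 1, A holds the lock
//	B: CompareAndSwap(0, 1) fails
//	B: Swap(2) returns 1 (non-zero)        -> state 2, B sleeps in Wait(2)
//	A: Unlock: Swap(0) returns 2           -> state 0, A calls Wake()
//	B: wakes, Swap(2) returns 0            -> state 2, B now holds the lock
//	B: Unlock: Swap(0) returns 2, Wake()   -> state 0 (no waiters, so the wake is a no-op)
//
// Note that B holds the lock with the futex at 2 even though nobody is waiting;
// that only means its Unlock performs one extra (harmless) Wake.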
+// +// For the paper, see: +// https://dept-info.labri.fr/~denis/Enseignement/2008-IR/Articles/01-futex.pdf) + +type Mutex struct { + futex Futex +} + +func (m *Mutex) Lock() { + // Fast path: try to take an uncontended lock. + if m.futex.CompareAndSwap(0, 1) { + // We obtained the mutex. + return + } + + // The futex is contended, so we enter the contended loop. + // If we manage to change the futex from 0 to 2, we managed to take the + // lock. Else, we have to wait until a call to Unlock unlocks this mutex. + // (Unlock will wake one waiter when it finds the futex is set to 2 when + // unlocking). + for m.futex.Swap(2) != 0 { + // Wait until we get resumed in Unlock. + m.futex.Wait(2) + } +} + +func (m *Mutex) Unlock() { + if old := m.futex.Swap(0); old == 0 { + // Mutex wasn't locked before. + panic("sync: unlock of unlocked Mutex") + } else if old == 2 { + // Mutex was a contended lock, so we need to wake the next waiter. + m.futex.Wake() + } +} + +// TryLock tries to lock m and reports whether it succeeded. +// +// Note that while correct uses of TryLock do exist, they are rare, +// and use of TryLock is often a sign of a deeper problem +// in a particular use of mutexes. +func (m *Mutex) TryLock() bool { + // Fast path: try to take an uncontended lock. + if m.futex.CompareAndSwap(0, 1) { + // We obtained the mutex. + return true + } + return false +} diff --git a/src/internal/task/pmutex-cooperative.go b/src/internal/task/pmutex-cooperative.go index ae2aa4bad8..b61e92d829 100644 --- a/src/internal/task/pmutex-cooperative.go +++ b/src/internal/task/pmutex-cooperative.go @@ -1,3 +1,5 @@ +//go:build tinygo.unicore + package task // PMutex is a real mutex on systems that can be either preemptive or threaded, diff --git a/src/internal/task/pmutex-preemptive.go b/src/internal/task/pmutex-preemptive.go new file mode 100644 index 0000000000..92263ed256 --- /dev/null +++ b/src/internal/task/pmutex-preemptive.go @@ -0,0 +1,11 @@ +//go:build !tinygo.unicore + +package task + +// PMutex is a real mutex on systems that can be either preemptive or threaded, +// and a dummy lock on other (purely cooperative) systems. +// +// It is mainly useful for short operations that need a lock when threading may +// be involved, but which do not need a lock with a purely cooperative +// scheduler. +type PMutex = Mutex diff --git a/src/internal/task/semaphore.go b/src/internal/task/semaphore.go new file mode 100644 index 0000000000..914f09bc5e --- /dev/null +++ b/src/internal/task/semaphore.go @@ -0,0 +1,32 @@ +package task + +// Barebones semaphore implementation. +// The main limitation is that if there are multiple waiters, a single Post() +// call won't do anything. Only when Post() has been called to awaken all +// waiters will the waiters proceed. +// This limitation is not a problem when there will only be a single waiter. +type Semaphore struct { + futex Futex +} + +// Post (unlock) the semaphore, incrementing the value in the semaphore. +func (s *Semaphore) Post() { + newValue := s.futex.Add(1) + if newValue == 0 { + s.futex.WakeAll() + } +} + +// Wait (lock) the semaphore, decrementing the value in the semaphore. +func (s *Semaphore) Wait() { + delta := int32(-1) + value := s.futex.Add(uint32(delta)) + for { + if int32(value) >= 0 { + // Semaphore unlocked! 
+ return + } + s.futex.Wait(value) + value = s.futex.Load() + } +} diff --git a/src/internal/task/task.go b/src/internal/task/task.go index 546f5ba117..3f2a36cda9 100644 --- a/src/internal/task/task.go +++ b/src/internal/task/task.go @@ -21,11 +21,19 @@ type Task struct { // state is the underlying running state of the task. state state + RunState uint8 + // DeferFrame stores a pointer to the (stack allocated) defer frame of the // goroutine that is used for the recover builtin. DeferFrame unsafe.Pointer } +const ( + RunStatePaused = iota + RunStateRunning + RunStateResuming +) + // DataUint32 returns the Data field as a uint32. The value is only valid after // setting it through SetDataUint32 or by storing to it using DataAtomicUint32. func (t *Task) DataUint32() uint32 { @@ -53,3 +61,11 @@ func runtime_alloc(size uintptr, layout unsafe.Pointer) unsafe.Pointer //go:linkname scheduleTask runtime.scheduleTask func scheduleTask(*Task) + +//go:linkname runtimePanic runtime.runtimePanic +func runtimePanic(str string) + +// Stack canary, to detect a stack overflow. The number is a random number +// generated by random.org. The bit fiddling dance is necessary because +// otherwise Go wouldn't allow the cast to a smaller integer size. +const stackCanary = uintptr(uint64(0x670c1333b83bf575) & uint64(^uintptr(0))) diff --git a/src/internal/task/task_asyncify.go b/src/internal/task/task_asyncify.go index 637a6b2237..bc4197995d 100644 --- a/src/internal/task/task_asyncify.go +++ b/src/internal/task/task_asyncify.go @@ -6,14 +6,6 @@ import ( "unsafe" ) -// Stack canary, to detect a stack overflow. The number is a random number -// generated by random.org. The bit fiddling dance is necessary because -// otherwise Go wouldn't allow the cast to a smaller integer size. -const stackCanary = uintptr(uint64(0x670c1333b83bf575) & uint64(^uintptr(0))) - -//go:linkname runtimePanic runtime.runtimePanic -func runtimePanic(str string) - // state is a structure which holds a reference to the state of the task. // When the task is suspended, the stack pointers are saved here. type state struct { diff --git a/src/internal/task/task_none.go b/src/internal/task/task_none.go index 280f1c4a81..8b667eec38 100644 --- a/src/internal/task/task_none.go +++ b/src/internal/task/task_none.go @@ -7,9 +7,6 @@ import "unsafe" // There is only one goroutine so the task struct can be a global. var mainTask Task -//go:linkname runtimePanic runtime.runtimePanic -func runtimePanic(str string) - func Pause() { runtimePanic("scheduler is disabled") } diff --git a/src/internal/task/task_stack.go b/src/internal/task/task_stack.go index 88a0970685..cbba8e3807 100644 --- a/src/internal/task/task_stack.go +++ b/src/internal/task/task_stack.go @@ -1,4 +1,4 @@ -//go:build scheduler.tasks +//go:build scheduler.tasks || scheduler.cores package task @@ -7,14 +7,6 @@ import ( "unsafe" ) -//go:linkname runtimePanic runtime.runtimePanic -func runtimePanic(str string) - -// Stack canary, to detect a stack overflow. The number is a random number -// generated by random.org. The bit fiddling dance is necessary because -// otherwise Go wouldn't allow the cast to a smaller integer size. -const stackCanary = uintptr(uint64(0x670c1333b83bf575) & uint64(^uintptr(0))) - // state is a structure which holds a reference to the state of the task. // When the task is suspended, the registers are stored onto the stack and the stack pointer is stored into sp. 
type state struct { @@ -33,40 +25,54 @@ type state struct { } // currentTask is the current running task, or nil if currently in the scheduler. -var currentTask *Task +//var currentTask *Task // Current returns the current active task. -func Current() *Task { - return currentTask +//func Current() *Task { +// return currentTask +//} + +//go:linkname Current runtime.currentTask +func Current() *Task + +//go:linkname schedulerLock runtime.schedulerLock +func schedulerLock() + +func Pause() { + schedulerLock() + PauseLocked() } // Pause suspends the current task and returns to the scheduler. // This function may only be called when running on a goroutine stack, not when running on the system stack or in an interrupt. -func Pause() { +func PauseLocked() { // Check whether the canary (the lowest address of the stack) is still // valid. If it is not, a stack overflow has occurred. - if *currentTask.state.canaryPtr != stackCanary { + if *Current().state.canaryPtr != stackCanary { runtimePanic("goroutine stack overflow") } if interrupt.In() { runtimePanic("blocked inside interrupt") } - currentTask.state.pause() + current := Current() + current.RunState = RunStatePaused + current.state.pause() } -//export tinygo_pause -func pause() { +//export tinygo_task_exit +func taskExit() { + println("-- exiting task") Pause() } // Resume the task until it pauses or completes. // This may only be called from the scheduler. func (t *Task) Resume() { - currentTask = t - t.gcData.swap() + //currentTask = t + //t.gcData.swap() t.state.resume() - t.gcData.swap() - currentTask = nil + //t.gcData.swap() + //currentTask = nil } // initialize the state and prepare to call the specified function with the specified argument bundle. diff --git a/src/internal/task/task_stack_cortexm.S b/src/internal/task/task_stack_cortexm.S index dfe713552d..8e6520106e 100644 --- a/src/internal/task/task_stack_cortexm.S +++ b/src/internal/task/task_stack_cortexm.S @@ -28,7 +28,7 @@ tinygo_startTask: blx r4 // After return, exit this goroutine. This is a tail call. - bl tinygo_pause + bl tinygo_task_exit .cfi_endproc .size tinygo_startTask, .-tinygo_startTask diff --git a/src/internal/task/task_stack_cortexm.go b/src/internal/task/task_stack_cortexm.go index 226a088c88..653dc06e17 100644 --- a/src/internal/task/task_stack_cortexm.go +++ b/src/internal/task/task_stack_cortexm.go @@ -1,4 +1,4 @@ -//go:build scheduler.tasks && cortexm +//go:build (scheduler.tasks || scheduler.cores) && cortexm package task diff --git a/src/internal/task/task_threads.c b/src/internal/task/task_threads.c new file mode 100644 index 0000000000..6ada95fa84 --- /dev/null +++ b/src/internal/task/task_threads.c @@ -0,0 +1,113 @@ +//go:build none + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include + +// BDWGC also uses SIGRTMIN+6 on Linux, which seems like a reasonable choice. +#ifdef __linux__ +#define taskPauseSignal (SIGRTMIN + 6) +#endif + +// Pointer to the current task.Task structure. +// Ideally the entire task.Task structure would be a thread-local variable but +// this also works. +static __thread void *current_task; + +struct state_pass { + void *(*start)(void*); + void *args; + void *task; + uintptr_t *stackTop; + sem_t startlock; +}; + +// Handle the GC pause in Go. +void tinygo_task_gc_pause(int sig); + +// Initialize the main thread. +void tinygo_task_init(void *mainTask, pthread_t *thread, int *numCPU, void *context) { + // Make sure the current task pointer is set correctly for the main + // goroutine as well. 
+ current_task = mainTask; + + // Store the thread ID of the main thread. + *thread = pthread_self(); + + // Register the "GC pause" signal for the entire process. + // Using pthread_kill, we can still send the signal to a specific thread. + struct sigaction act = { 0 }; + act.sa_flags = SA_SIGINFO; + act.sa_handler = &tinygo_task_gc_pause; + sigaction(taskPauseSignal, &act, NULL); + + // Obtain the number of CPUs available on program start (for NumCPU). + int num = sysconf(_SC_NPROCESSORS_ONLN); + if (num <= 0) { + // Fallback in case there is an error. + num = 1; + } + *numCPU = num; +} + +void tinygo_task_exited(void*); + +// Helper to start a goroutine while also storing the 'task' structure. +static void* start_wrapper(void *arg) { + struct state_pass *state = arg; + void *(*start)(void*) = state->start; + void *args = state->args; + current_task = state->task; + + // Save the current stack pointer in the goroutine state, for the GC. + int stackAddr; + *(state->stackTop) = (uintptr_t)(&stackAddr); + + // Notify the caller that the thread has successfully started and + // initialized. + sem_post(&state->startlock); + + // Run the goroutine function. + start(args); + + // Notify the Go side this thread will exit. + tinygo_task_exited(current_task); + + return NULL; +}; + +// Start a new goroutine in an OS thread. +int tinygo_task_start(uintptr_t fn, void *args, void *task, pthread_t *thread, uintptr_t *stackTop, void *context) { + // Sanity check. Should get optimized away. + if (sizeof(pthread_t) != sizeof(void*)) { + __builtin_trap(); + } + + struct state_pass state = { + .start = (void*)fn, + .args = args, + .task = task, + .stackTop = stackTop, + }; + sem_init(&state.startlock, 0, 0); + int result = pthread_create(thread, NULL, &start_wrapper, &state); + + // Wait until the thread has been created and read all state_pass variables. + sem_wait(&state.startlock); + + return result; +} + +// Return the current task (for task.Current()). +void* tinygo_task_current(void) { + return current_task; +} + +// Send a signal to cause the task to pause for the GC mark phase. +void tinygo_task_send_gc_signal(pthread_t thread) { + pthread_kill(thread, taskPauseSignal); +} diff --git a/src/internal/task/task_threads.go b/src/internal/task/task_threads.go new file mode 100644 index 0000000000..327b2df46e --- /dev/null +++ b/src/internal/task/task_threads.go @@ -0,0 +1,268 @@ +//go:build scheduler.threads + +package task + +import ( + "sync/atomic" + "unsafe" +) + +// If true, print verbose debug logs. +const verbose = false + +// Scheduler-specific state. +type state struct { + // Goroutine ID. The number here is not really significant and after a while + // it could wrap around. But it is useful for debugging. + id uintptr + + // Thread ID, pthread_t or similar (typically implemented as a pointer). + thread threadID + + // Highest address of the stack. It is stored when the goroutine starts, and + // is needed to be able to scan the stack. + stackTop uintptr + + // Next task in the activeTasks queue. + QueueNext *Task + + // Semaphore to pause/resume the thread atomically. + pauseSem Semaphore + + // Semaphore used for stack scanning. + // We can't reuse pauseSem here since the thread might have been paused for + // other reasons (for example, because it was waiting on a channel). + gcSem Semaphore +} + +// Goroutine counter, starting at 0 for the main goroutine. 
+var goroutineID uintptr + +var numCPU int32 + +var mainTask Task + +// Queue of tasks (see QueueNext) that currently exist in the program. +var activeTasks = &mainTask +var activeTaskLock PMutex + +func OnSystemStack() bool { + runtimePanic("todo: task.OnSystemStack") + return false +} + +// Initialize the main goroutine state. Must be called by the runtime on +// startup, before starting any other goroutines. +func Init(sp uintptr) { + mainTask.state.stackTop = sp + tinygo_task_init(&mainTask, &mainTask.state.thread, &numCPU) +} + +// Return the task struct for the current thread. +func Current() *Task { + t := (*Task)(tinygo_task_current()) + if t == nil { + runtimePanic("unknown current task") + } + return t +} + +// Pause pauses the current task, until it is resumed by another task. +// It is possible that another task has called Resume() on the task before it +// hits Pause(), in which case the task won't be paused but continues +// immediately. +func Pause() { + // Wait until resumed + t := Current() + if verbose { + println("*** pause: ", t.state.id) + } + t.state.pauseSem.Wait() +} + +// Resume the given task. +// It is legal to resume a task before it gets paused, it means that the next +// call to Pause() won't pause but will continue immediately. This happens in +// practice sometimes in channel operations, where the Resume() might get called +// between the channel unlock and the call to Pause(). +func (t *Task) Resume() { + if verbose { + println("*** resume: ", t.state.id) + } + // Increment the semaphore counter. + // If the task is currently paused in Wait(), it will resume. + // If the task is not yet paused, the next call to Wait() will continue + // immediately. + t.state.pauseSem.Post() +} + +// Start a new OS thread. +func start(fn uintptr, args unsafe.Pointer, stackSize uintptr) { + t := &Task{} + t.state.id = atomic.AddUintptr(&goroutineID, 1) + if verbose { + println("*** start: ", t.state.id, "from", Current().state.id) + } + + // Start the new thread, and add it to the list of threads. + // Do this with a lock so that only started threads are part of the queue + // and the stop-the-world GC won't see threads that haven't started yet or + // are not fully started yet. + activeTaskLock.Lock() + errCode := tinygo_task_start(fn, args, t, &t.state.thread, &t.state.stackTop) + if errCode != 0 { + runtimePanic("could not start thread") + } + t.state.QueueNext = activeTasks + activeTasks = t + activeTaskLock.Unlock() +} + +//export tinygo_task_exited +func taskExited(t *Task) { + if verbose { + println("*** exit:", t.state.id) + } + + // Remove from the queue. + // TODO: this can be made more efficient by using a doubly linked list. + activeTaskLock.Lock() + found := false + for q := &activeTasks; *q != nil; q = &(*q).state.QueueNext { + if *q == t { + *q = t.state.QueueNext + found = true + break + } + } + activeTaskLock.Unlock() + + // Sanity check. + if !found { + runtimePanic("taskExited failed") + } +} + +// Futex to wait on until all tasks have finished scanning the stack. +// This is basically a sync.WaitGroup. +var scanDoneFutex Futex + +// GC scan phase. Because we need to stop the world while scanning, this kinda +// needs to be done in the tasks package. +func GCScan() { + current := Current() + + // Don't allow new goroutines to be started while pausing/resuming threads + // in the stop-the-world phase. + activeTaskLock.Lock() + + // Pause all other threads. 
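A small sketch (editorial, not part of the patch) of the Resume-before-Pause property described above: because pauseSem is a counting semaphore, a Post that happens before the matching Wait simply makes that Wait return immediately. A buffered channel stands in for the semaphore here; all names are made up for the example.

package main

func main() {
	pauseSem := make(chan struct{}, 1) // stand-in for the task's pauseSem

	resume := func() { // like t.Resume(): post (increment) the semaphore
		select {
		case pauseSem <- struct{}{}:
		default: // already posted; a real counting semaphore would keep counting
		}
	}
	pause := func() { <-pauseSem } // like task.Pause(): wait (decrement or block)

	resume() // the Resume() arrives before the Pause()...
	pause()  // ...so this returns immediately instead of blocking forever
	println("pause returned without blocking")
}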
+ numOtherThreads := uint32(0) + for t := activeTasks; t != nil; t = t.state.QueueNext { + if t != current { + numOtherThreads++ + tinygo_task_send_gc_signal(t.state.thread) + } + } + + // Store the number of threads to wait for in the futex. + // This is the equivalent of doing an initial wg.Add(numOtherThreads). + scanDoneFutex.Store(numOtherThreads) + + // Scan the current stack, and all current registers. + scanCurrentStack() + + // Wake each paused thread for the first time so it will scan the stack. + for t := activeTasks; t != nil; t = t.state.QueueNext { + if t != current { + t.state.gcSem.Post() + } + } + + // Wait until all threads have finished scanning their stack. + // This is the equivalent of wg.Wait() + for { + val := scanDoneFutex.Load() + if val == 0 { + break + } + scanDoneFutex.Wait(val) + } + + // Scan all globals (implemented in the runtime). + gcScanGlobals() + + // Wake each paused thread for the second time, so they will resume normal + // operation. + for t := activeTasks; t != nil; t = t.state.QueueNext { + if t != current { + t.state.gcSem.Post() + } + } + + // Allow goroutines to start and exit again. + activeTaskLock.Unlock() +} + +// Scan globals, implemented in the runtime package. +func gcScanGlobals() + +var stackScanLock PMutex + +//export tinygo_task_gc_pause +func tingyo_task_gc_pause() { + // Wait until we get the signal to start scanning the stack. + Current().state.gcSem.Wait() + + // Scan the thread stack. + // Only scan a single thread stack at a time, because the GC marking phase + // doesn't support parallelism. + // TODO: it may be possible to call markRoots directly (without saving + // registers) since we are in a signal handler that already saved a bunch of + // registers. This is an optimization left for a future time. + stackScanLock.Lock() + scanCurrentStack() + stackScanLock.Unlock() + + // Equivalent of wg.Done(): subtract one from the futex and if the result is + // 0 (meaning we were the last in the waitgroup), wake the waiting thread. + n := uint32(1) + if scanDoneFutex.Add(-n) == 0 { + scanDoneFutex.Wake() + } + + // Wait until we get the signal we can resume normally (after the mark phase + // has finished). + Current().state.gcSem.Wait() +} + +//go:export tinygo_scanCurrentStack +func scanCurrentStack() + +// Return the highest address of the current stack. +func StackTop() uintptr { + return Current().state.stackTop +} + +// Using //go:linkname instead of //export so that we don't tell the compiler +// that the 't' parameter won't escape (because it will). +// +//go:linkname tinygo_task_init tinygo_task_init +func tinygo_task_init(t *Task, thread *threadID, numCPU *int32) + +// Here same as for tinygo_task_init. +// +//go:linkname tinygo_task_start tinygo_task_start +func tinygo_task_start(fn uintptr, args unsafe.Pointer, t *Task, thread *threadID, stackTop *uintptr) int32 + +// Pause the thread by sending it a signal. 
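The scanDoneFutex protocol above is essentially a hand-rolled sync.WaitGroup (the runtime cannot use package sync here because sync itself is built on these primitives). A rough stand-alone equivalent of the same stop-the-world rendezvous, with hypothetical stand-ins for the runtime hooks:

package main

import "sync"

// scanOwnStack is a hypothetical stand-in for what each paused thread does in its
// signal handler (tinygo_task_gc_pause -> scanCurrentStack).
func scanOwnStack(id int) { println("scanned stack of thread", id) }

func main() {
	const numOtherThreads = 3

	var done sync.WaitGroup
	done.Add(numOtherThreads) // scanDoneFutex.Store(numOtherThreads)

	for i := 0; i < numOtherThreads; i++ {
		go func(id int) {
			scanOwnStack(id) // each paused thread scans its own stack...
			done.Done()      // ...then does the futex Add(-1) / Wake when it hits zero
		}(i)
	}

	done.Wait() // the Load/Wait loop in GCScan
	println("all stacks scanned; continue with the mark phase")
}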
+// +//export tinygo_task_send_gc_signal +func tinygo_task_send_gc_signal(threadID) + +//export tinygo_task_current +func tinygo_task_current() unsafe.Pointer + +func NumCPU() int { + return int(numCPU) +} diff --git a/src/runtime/atomics_critical.go b/src/runtime/atomics_critical.go index 2d98881a10..edc6a82a3d 100644 --- a/src/runtime/atomics_critical.go +++ b/src/runtime/atomics_critical.go @@ -6,7 +6,6 @@ package runtime import ( - "runtime/interrupt" _ "unsafe" ) @@ -23,27 +22,27 @@ import ( func __atomic_load_2(ptr *uint16, ordering uintptr) uint16 { // The LLVM docs for this say that there is a val argument after the pointer. // That is a typo, and the GCC docs omit it. - mask := interrupt.Disable() + mask := atomicLock() val := *ptr - interrupt.Restore(mask) + atomicUnlock(mask) return val } //export __atomic_store_2 func __atomic_store_2(ptr *uint16, val uint16, ordering uintptr) { - mask := interrupt.Disable() + mask := atomicLock() *ptr = val - interrupt.Restore(mask) + atomicUnlock(mask) } //go:inline func doAtomicCAS16(ptr *uint16, expected, desired uint16) uint16 { - mask := interrupt.Disable() + mask := atomicLock() old := *ptr if old == expected { *ptr = desired } - interrupt.Restore(mask) + atomicUnlock(mask) return old } @@ -61,10 +60,10 @@ func __atomic_compare_exchange_2(ptr, expected *uint16, desired uint16, successO //go:inline func doAtomicSwap16(ptr *uint16, new uint16) uint16 { - mask := interrupt.Disable() + mask := atomicLock() old := *ptr *ptr = new - interrupt.Restore(mask) + atomicUnlock(mask) return old } @@ -80,11 +79,11 @@ func __atomic_exchange_2(ptr *uint16, new uint16, ordering uintptr) uint16 { //go:inline func doAtomicAdd16(ptr *uint16, value uint16) (old, new uint16) { - mask := interrupt.Disable() + mask := atomicLock() old = *ptr new = old + value *ptr = new - interrupt.Restore(mask) + atomicUnlock(mask) return old, new } @@ -112,27 +111,27 @@ func __atomic_add_fetch_2(ptr *uint16, value uint16, ordering uintptr) uint16 { func __atomic_load_4(ptr *uint32, ordering uintptr) uint32 { // The LLVM docs for this say that there is a val argument after the pointer. // That is a typo, and the GCC docs omit it. 
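// Editorial note on the change below: on a single core, masking interrupts is
// enough to make these read-modify-write helpers atomic. With the new cores
// scheduler a second core can be running at the same time, so atomicLock and
// atomicUnlock are used instead: unicore builds still just disable and restore
// interrupts (baremetal-unicore.go), while multicore targets such as the RP2040
// additionally take a hardware spinlock (SPINLOCK0 in runtime_rp2040.go).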
- mask := interrupt.Disable() + mask := atomicLock() val := *ptr - interrupt.Restore(mask) + atomicUnlock(mask) return val } //export __atomic_store_4 func __atomic_store_4(ptr *uint32, val uint32, ordering uintptr) { - mask := interrupt.Disable() + mask := atomicLock() *ptr = val - interrupt.Restore(mask) + atomicUnlock(mask) } //go:inline func doAtomicCAS32(ptr *uint32, expected, desired uint32) uint32 { - mask := interrupt.Disable() + mask := atomicLock() old := *ptr if old == expected { *ptr = desired } - interrupt.Restore(mask) + atomicUnlock(mask) return old } @@ -150,10 +149,10 @@ func __atomic_compare_exchange_4(ptr, expected *uint32, desired uint32, successO //go:inline func doAtomicSwap32(ptr *uint32, new uint32) uint32 { - mask := interrupt.Disable() + mask := atomicLock() old := *ptr *ptr = new - interrupt.Restore(mask) + atomicUnlock(mask) return old } @@ -169,11 +168,11 @@ func __atomic_exchange_4(ptr *uint32, new uint32, ordering uintptr) uint32 { //go:inline func doAtomicAdd32(ptr *uint32, value uint32) (old, new uint32) { - mask := interrupt.Disable() + mask := atomicLock() old = *ptr new = old + value *ptr = new - interrupt.Restore(mask) + atomicUnlock(mask) return old, new } @@ -201,27 +200,27 @@ func __atomic_add_fetch_4(ptr *uint32, value uint32, ordering uintptr) uint32 { func __atomic_load_8(ptr *uint64, ordering uintptr) uint64 { // The LLVM docs for this say that there is a val argument after the pointer. // That is a typo, and the GCC docs omit it. - mask := interrupt.Disable() + mask := atomicLock() val := *ptr - interrupt.Restore(mask) + atomicUnlock(mask) return val } //export __atomic_store_8 func __atomic_store_8(ptr *uint64, val uint64, ordering uintptr) { - mask := interrupt.Disable() + mask := atomicLock() *ptr = val - interrupt.Restore(mask) + atomicUnlock(mask) } //go:inline func doAtomicCAS64(ptr *uint64, expected, desired uint64) uint64 { - mask := interrupt.Disable() + mask := atomicLock() old := *ptr if old == expected { *ptr = desired } - interrupt.Restore(mask) + atomicUnlock(mask) return old } @@ -239,10 +238,10 @@ func __atomic_compare_exchange_8(ptr, expected *uint64, desired uint64, successO //go:inline func doAtomicSwap64(ptr *uint64, new uint64) uint64 { - mask := interrupt.Disable() + mask := atomicLock() old := *ptr *ptr = new - interrupt.Restore(mask) + atomicUnlock(mask) return old } @@ -258,11 +257,11 @@ func __atomic_exchange_8(ptr *uint64, new uint64, ordering uintptr) uint64 { //go:inline func doAtomicAdd64(ptr *uint64, value uint64) (old, new uint64) { - mask := interrupt.Disable() + mask := atomicLock() old = *ptr new = old + value *ptr = new - interrupt.Restore(mask) + atomicUnlock(mask) return old, new } diff --git a/src/runtime/baremetal-multicore.go b/src/runtime/baremetal-multicore.go new file mode 100644 index 0000000000..8395d8f735 --- /dev/null +++ b/src/runtime/baremetal-multicore.go @@ -0,0 +1,13 @@ +//go:build baremetal && !tinygo.unicore + +package runtime + +import "runtime/interrupt" + +func atomicLock() interrupt.State { + return atomicLockImpl() +} + +func atomicUnlock(mask interrupt.State) { + atomicUnlockImpl(mask) +} diff --git a/src/runtime/baremetal-unicore.go b/src/runtime/baremetal-unicore.go new file mode 100644 index 0000000000..17b8f10212 --- /dev/null +++ b/src/runtime/baremetal-unicore.go @@ -0,0 +1,13 @@ +//go:build baremetal && tinygo.unicore + +package runtime + +import "runtime/interrupt" + +func atomicLock() interrupt.State { + return interrupt.Disable() +} + +func atomicUnlock(mask interrupt.State) 
{ + interrupt.Restore(mask) +} diff --git a/src/runtime/debug.go b/src/runtime/debug.go index 139e18bcd2..230515908f 100644 --- a/src/runtime/debug.go +++ b/src/runtime/debug.go @@ -1,14 +1,5 @@ package runtime -// NumCPU returns the number of logical CPUs usable by the current process. -// -// The set of available CPUs is checked by querying the operating system -// at process startup. Changes to operating system CPU allocation after -// process startup are not reflected. -func NumCPU() int { - return 1 -} - // Stub for NumCgoCall, does not return the real value func NumCgoCall() int { return 0 diff --git a/src/runtime/gc_blocks.go b/src/runtime/gc_blocks.go index d58bfd92a2..bce87ef271 100644 --- a/src/runtime/gc_blocks.go +++ b/src/runtime/gc_blocks.go @@ -57,6 +57,7 @@ var ( gcMallocs uint64 // total number of allocations gcFrees uint64 // total number of objects freed gcFreedBlocks uint64 // total number of freed blocks + gcLock task.PMutex // lock to avoid race conditions on multicore systems ) // zeroSizedAlloc is just a sentinel that gets returned when allocating 0 bytes. @@ -317,6 +318,10 @@ func alloc(size uintptr, layout unsafe.Pointer) unsafe.Pointer { runtimePanicAt(returnAddress(0), "heap alloc in interrupt") } + // Make sure there are no concurrent allocations. The heap is not currently + // designed for concurrent alloc/GC. + gcLock.Lock() + gcTotalAlloc += uint64(size) gcMallocs++ @@ -399,6 +404,9 @@ func alloc(size uintptr, layout unsafe.Pointer) unsafe.Pointer { i.setState(blockStateTail) } + // We've claimed this allocation, now we can unlock the heap. + gcLock.Unlock() + // Return a pointer to this allocation. pointer := thisAlloc.pointer() if preciseHeap { @@ -444,7 +452,9 @@ func free(ptr unsafe.Pointer) { // GC performs a garbage collection cycle. func GC() { + gcLock.Lock() runGC() + gcLock.Unlock() } // runGC performs a garbage collection cycle. It is the internal implementation @@ -456,8 +466,7 @@ func runGC() (freeBytes uintptr) { } // Mark phase: mark all reachable objects, recursively. - markStack() - findGlobals(markRoots) + gcMarkReachable() if baremetal && hasScheduler { // Channel operations in interrupts may move task pointers around while we are marking. @@ -714,6 +723,7 @@ func dumpHeap() { // The returned memory statistics are up to date as of the // call to ReadMemStats. This would not do GC implicitly for you. 
func ReadMemStats(m *MemStats) { + gcLock.Lock() m.HeapIdle = 0 m.HeapInuse = 0 for block := gcBlock(0); block < endBlock; block++ { @@ -733,6 +743,7 @@ func ReadMemStats(m *MemStats) { m.Sys = uint64(heapEnd - heapStart) m.HeapAlloc = (gcTotalBlocks - gcFreedBlocks) * uint64(bytesPerBlock) m.Alloc = m.HeapAlloc + gcLock.Unlock() } func SetFinalizer(obj interface{}, finalizer interface{}) { diff --git a/src/runtime/gc_stack_portable.go b/src/runtime/gc_stack_portable.go index d35e16e30c..750a34ec2c 100644 --- a/src/runtime/gc_stack_portable.go +++ b/src/runtime/gc_stack_portable.go @@ -8,6 +8,11 @@ import ( "unsafe" ) +func gcMarkReachable() { + markStack() + findGlobals(markRoots) +} + //go:extern runtime.stackChainStart var stackChainStart *stackChainObject diff --git a/src/runtime/gc_stack_raw.go b/src/runtime/gc_stack_raw.go index 5ee18622db..f4483e58c5 100644 --- a/src/runtime/gc_stack_raw.go +++ b/src/runtime/gc_stack_raw.go @@ -1,9 +1,14 @@ -//go:build (gc.conservative || gc.precise) && !tinygo.wasm +//go:build (gc.conservative || gc.precise) && !tinygo.wasm && !scheduler.threads && !scheduler.cores package runtime import "internal/task" +func gcMarkReachable() { + markStack() + findGlobals(markRoots) +} + // markStack marks all root pointers found on the stack. // // This implementation is conservative and relies on the stack top (provided by diff --git a/src/runtime/gc_stack_threads.go b/src/runtime/gc_stack_threads.go new file mode 100644 index 0000000000..9c77fa0c7b --- /dev/null +++ b/src/runtime/gc_stack_threads.go @@ -0,0 +1,25 @@ +//go:build scheduler.threads + +package runtime + +import "internal/task" + +func gcMarkReachable() { + task.GCScan() +} + +// Scan globals inside the stop-the-world phase. Called from the STW +// implementation in the internal/task package. +// +//go:linkname gcScanGlobals internal/task.gcScanGlobals +func gcScanGlobals() { + findGlobals(markRoots) +} + +// Function called from assembly with all registers pushed, to actually scan the +// stack. +// +//go:export tinygo_scanstack +func scanstack(sp uintptr) { + markRoots(sp, task.StackTop()) +} diff --git a/src/runtime/print.go b/src/runtime/print.go index a4de460253..ef5c8a7dcc 100644 --- a/src/runtime/print.go +++ b/src/runtime/print.go @@ -13,11 +13,15 @@ type stringer interface { // This is a no-op lock on systems that do not have parallelism. 
var printLock task.PMutex +//var printLocked interrupt.State + func printlock() { + //printLocked = serialLock() printLock.Lock() } func printunlock() { + //serialUnlock(printLocked) printLock.Unlock() } diff --git a/src/runtime/runtime_nrf.go b/src/runtime/runtime_nrf.go index 729c6bb20f..30a6cda9cb 100644 --- a/src/runtime/runtime_nrf.go +++ b/src/runtime/runtime_nrf.go @@ -15,6 +15,8 @@ type timeUnit int64 //go:linkname systemInit SystemInit func systemInit() +const numCPU = 1 + //export Reset_Handler func main() { if nrf.FPUPresent { @@ -145,3 +147,31 @@ func rtc_sleep(ticks uint32) { waitForEvents() } } + +func atomicLockImpl() interrupt.State { + mask := interrupt.Disable() + return mask +} + +func atomicUnlockImpl(mask interrupt.State) { + interrupt.Restore(mask) +} + +func futexLock() interrupt.State { + mask := interrupt.Disable() + return mask +} + +func futexUnlock(mask interrupt.State) { + interrupt.Restore(mask) +} + +func schedulerLock() { +} + +func schedulerUnlock() { +} + +func currentCPU() uint32 { + return 0 +} diff --git a/src/runtime/runtime_rp2040.go b/src/runtime/runtime_rp2040.go index 1d36a771e5..3c9e8935a0 100644 --- a/src/runtime/runtime_rp2040.go +++ b/src/runtime/runtime_rp2040.go @@ -4,8 +4,13 @@ package runtime import ( "device/arm" + "device/rp" + "internal/task" "machine" "machine/usb/cdc" + "reflect" + "runtime/interrupt" + "unsafe" ) // machineTicks is provided by package machine. @@ -16,6 +21,8 @@ func machineLightSleep(uint64) type timeUnit int64 +const numCPU = 2 + // ticks returns the number of ticks (microseconds) elapsed since power up. func ticks() timeUnit { t := machineTicks() @@ -50,14 +57,18 @@ func waitForEvents() { } func putchar(c byte) { + //mask := serialLock() machine.Serial.WriteByte(c) + //serialUnlock(mask) } func getchar() byte { + mask := serialLock() for machine.Serial.Buffered() == 0 { Gosched() } v, _ := machine.Serial.ReadByte() + serialUnlock(mask) return v } @@ -71,9 +82,11 @@ func machineInit() func init() { machineInit() + mask := serialLock() cdc.EnableUSBCDC() machine.USBDev.Configure(machine.UARTConfig{}) machine.InitSerial() + serialUnlock(mask) } //export Reset_Handler @@ -82,3 +95,174 @@ func main() { run() exit(0) } + +func multicore_fifo_rvalid() bool { + return rp.SIO.FIFO_ST.Get()&rp.SIO_FIFO_ST_VLD != 0 +} + +func multicore_fifo_wready() bool { + return rp.SIO.FIFO_ST.Get()&rp.SIO_FIFO_ST_RDY != 0 +} + +func multicore_fifo_drain() { + for multicore_fifo_rvalid() { + rp.SIO.FIFO_RD.Get() + } +} + +func multicore_fifo_push_blocking(data uint32) { + for !multicore_fifo_wready() { + } + rp.SIO.FIFO_WR.Set(data) + arm.Asm("sev") +} + +func multicore_fifo_pop_blocking() uint32 { + for !multicore_fifo_rvalid() { + arm.Asm("wfe") + } + + return rp.SIO.FIFO_RD.Get() +} + +//go:extern __isr_vector +var __isr_vector [0]uint32 + +//go:extern _stack1_top +var _stack1_top [0]uint32 + +var core1StartSequence = [...]uint32{ + 0, 0, 1, + uint32(uintptr(unsafe.Pointer(&__isr_vector))), + uint32(uintptr(unsafe.Pointer(&_stack1_top))), + uint32(uintptr(reflect.ValueOf(runCore1).Pointer())), +} + +func startSecondaryCores() { + // Start the second core of the RP2040. + // See section 2.8.2 in the datasheet. 
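	// The loop below follows the standard RP2040 core 1 launch handshake: each
	// word of core1StartSequence is pushed to core 1 (still waiting in the boot
	// ROM) through the inter-core FIFO, and core 1 echoes it back; on any mismatch
	// the whole sequence restarts from the beginning. Before sending a zero,
	// core 0 drains its own receive FIFO and issues an SEV so core 1 wakes from
	// WFE. The last three words hand over the vector table, the core 1 stack top
	// (_stack1_top) and the entry point (runCore1).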
+ seq := 0 + for { + cmd := core1StartSequence[seq] + if cmd == 0 { + multicore_fifo_drain() + arm.Asm("sev") + } + multicore_fifo_push_blocking(cmd) + response := multicore_fifo_pop_blocking() + if cmd != response { + seq = 0 + continue + } + seq = seq + 1 + if seq >= len(core1StartSequence) { + break + } + } +} + +var core1Task task.Task + +func runCore1() { + //until := ticks() + nanosecondsToTicks(1900e6) + //for ticks() < until { + //} + println("starting core 1") + + runSecondary(1, &core1Task) + + // Just blink a LED to show that this core is running. + // TODO: use a real scheduler. + //led := machine.GP16 + //led.Configure(machine.PinConfig{Mode: machine.PinOutput}) + //const cycles = 7000_000 + //for { + // for i := 0; i < cycles; i++ { + // led.Low() + // } + + // for i := 0; i < cycles; i++ { + // led.High() + // } + //} +} + +func currentCPU() uint32 { + return rp.SIO.CPUID.Get() +} + +const ( + spinlockAtomic = iota + spinlockFutex + spinlockScheduler +) + +func atomicLockImpl() interrupt.State { + mask := interrupt.Disable() + for rp.SIO.SPINLOCK0.Get() == 0 { + } + return mask +} + +func atomicUnlockImpl(mask interrupt.State) { + rp.SIO.SPINLOCK0.Set(0) + interrupt.Restore(mask) +} + +func futexLock() interrupt.State { + // Disable interrupts. + // This is necessary since we might do some futex operations (like Wake) + // inside an interrupt and we don't want to deadlock with a non-interrupt + // goroutine that has taken the spinlock at the same time. + mask := interrupt.Disable() + + // Acquire the spinlock. + for rp.SIO.SPINLOCK1.Get() == 0 { + // Spin, until the lock is released. + } + + return mask +} + +func futexUnlock(mask interrupt.State) { + // Release the spinlock. + rp.SIO.SPINLOCK1.Set(0) + + // Restore interrupts. + interrupt.Restore(mask) +} + +var schedulerLockMasks [numCPU]interrupt.State +var schedulerLocked = false + +// WARNING: doesn't check for deadlocks! +func schedulerLock() { + //schedulerLockMasks[currentCPU()] = interrupt.Disable() + for rp.SIO.SPINLOCK2.Get() == 0 { + } + schedulerLocked = true +} + +func schedulerUnlock() { + if !schedulerLocked { + println("!!! not locked at unlock") + for { + } + } + schedulerLocked = false + rp.SIO.SPINLOCK2.Set(0) + //interrupt.Restore(schedulerLockMasks[currentCPU()]) +} + +func serialLock() interrupt.State { + //mask := interrupt.Disable() + for rp.SIO.SPINLOCK3.Get() == 0 { + } + //return mask + return 0 +} + +func serialUnlock(mask interrupt.State) { + rp.SIO.SPINLOCK3.Set(0) + //interrupt.Restore(mask) +} diff --git a/src/runtime/runtime_unix.go b/src/runtime/runtime_unix.go index 08e3e74269..fd71d7c488 100644 --- a/src/runtime/runtime_unix.go +++ b/src/runtime/runtime_unix.go @@ -11,6 +11,12 @@ import ( "unsafe" ) +const numCPU = 1 + +func currentCPU() int { + return 0 +} + //export write func libc_write(fd int32, buf unsafe.Pointer, count uint) int @@ -73,6 +79,7 @@ type timespec struct { tv_nsec int64 // unsigned 64-bit integer on all time64 platforms } +// Highest address of the stack of the main thread. var stackTop uintptr // Entry point for Go. Initialize all packages and call main.main(). diff --git a/src/runtime/scheduler.go b/src/runtime/scheduler.go index 727c7f5f2c..7461c966ed 100644 --- a/src/runtime/scheduler.go +++ b/src/runtime/scheduler.go @@ -6,6 +6,8 @@ const schedulerDebug = false var mainExited bool +var timerQueue *timerNode + // Simple logging, for debugging. 
func scheduleLog(msg string) { if schedulerDebug { @@ -27,6 +29,34 @@ func scheduleLogChan(msg string, ch *channel, t *task.Task) { } } +func timerQueueAdd(tn *timerNode) { + q := &timerQueue + for ; *q != nil; q = &(*q).next { + if tn.whenTicks() < (*q).whenTicks() { + // this will finish earlier than the next - insert here + break + } + } + tn.next = *q + *q = tn +} + +func timerQueueRemove(t *timer) bool { + removedTimer := false + for q := &timerQueue; *q != nil; q = &(*q).next { + if (*q).timer == t { + scheduleLog("removed timer") + *q = (*q).next + removedTimer = true + break + } + } + if !removedTimer { + scheduleLog("did not remove timer") + } + return removedTimer +} + // Goexit terminates the currently running goroutine. No other goroutines are affected. func Goexit() { panicOrGoexit(nil, panicGoexit) diff --git a/src/runtime/scheduler_cooperative.go b/src/runtime/scheduler_cooperative.go index 91ba86409f..bffda72284 100644 --- a/src/runtime/scheduler_cooperative.go +++ b/src/runtime/scheduler_cooperative.go @@ -32,7 +32,6 @@ var ( runqueue task.Queue sleepQueue *task.Task sleepQueueBaseTime timeUnit - timerQueue *timerNode ) // deadlock is called when a goroutine cannot proceed any more, but is in theory @@ -58,6 +57,11 @@ func Gosched() { task.Pause() } +// NumCPU returns the number of logical CPUs usable by the current process. +func NumCPU() int { + return 1 +} + // Add this task to the sleep queue, assuming its state is set to sleeping. func addSleepTask(t *task.Task, duration timeUnit) { if schedulerDebug { @@ -100,36 +104,15 @@ func addSleepTask(t *task.Task, duration timeUnit) { // sleepQueue. func addTimer(tim *timerNode) { mask := interrupt.Disable() - - // Add to timer queue. - q := &timerQueue - for ; *q != nil; q = &(*q).next { - if tim.whenTicks() < (*q).whenTicks() { - // this will finish earlier than the next - insert here - break - } - } - tim.next = *q - *q = tim + timerQueueAdd(tim) interrupt.Restore(mask) } // removeTimer is the implementation of time.stopTimer. It removes a timer from // the timer queue, returning true if the timer is present in the timer queue. func removeTimer(tim *timer) bool { - removedTimer := false mask := interrupt.Disable() - for t := &timerQueue; *t != nil; t = &(*t).next { - if (*t).timer == tim { - scheduleLog("removed timer") - *t = (*t).next - removedTimer = true - break - } - } - if !removedTimer { - scheduleLog("did not remove timer") - } + removedTimer := timerQueueRemove(tim) interrupt.Restore(mask) return removedTimer } diff --git a/src/runtime/scheduler_cores.go b/src/runtime/scheduler_cores.go new file mode 100644 index 0000000000..4a664acf80 --- /dev/null +++ b/src/runtime/scheduler_cores.go @@ -0,0 +1,262 @@ +//go:build scheduler.cores + +package runtime + +import ( + "device/arm" + "internal/task" +) + +const hasScheduler = true + +const hasParallelism = true + +const coresVerbose = false + +var ( + mainTask task.Task + cpuTasks [numCPU]*task.Task + sleepQueue *task.Task + runQueue *task.Task +) + +func deadlock() { + // Call yield without requesting a wakeup. + task.Pause() + trap() +} + +func scheduleTask(t *task.Task) { + schedulerLock() + switch t.RunState { + case task.RunStatePaused: + // Paused, state is saved on the stack. + + if coresVerbose { + println("## schedule: add to runQueue") + } + addToRunQueue(t) + arm.Asm("sev") + case task.RunStateRunning: + // Not yet paused (probably going to pause very soon), so let the + // Pause() function know it can resume immediately. 
+ t.RunState = task.RunStateResuming + if coresVerbose { + println("## schedule: mark as resuming") + } + default: + println("Unknown run state??") + for { + } + } + schedulerUnlock() +} + +// Add task to runQueue. +// Scheduler lock must be held when calling this function. +func addToRunQueue(t *task.Task) { + t.Next = runQueue + runQueue = t +} + +func addSleepTask(t *task.Task, wakeup timeUnit) { + // Save the timestamp when the task should be woken up. + t.Data = uint64(wakeup) + + // Find the position where we should insert this task in the queue. + q := &sleepQueue + for { + if *q == nil { + // Found the end of the time queue. Insert it here, at the end. + break + } + if timeUnit((*q).Data) > timeUnit(t.Data) { + // Found a task in the queue that has a timeout before the + // to-be-sleeping task. Insert our task right before. + break + } + q = &(*q).Next + } + + // Insert the task into the queue (this could be at the end, if *q is nil). + t.Next = *q + *q = t +} + +func Gosched() { + // TODO +} + +// NumCPU returns the number of logical CPUs usable by the current process. +func NumCPU() int { + // Return the hardcoded number of physical CPU cores. + return numCPU +} + +func addTimer(tn *timerNode) { + runtimePanic("todo: timers") +} + +func removeTimer(t *timer) bool { + runtimePanic("todo: timers") + return false +} + +func schedulerRunQueue() *task.Queue { + println("todo: schedulerRunQueue") + for { + } + return nil +} + +// Pause the current task for a given time. +// +//go:linkname sleep time.Sleep +func sleep(duration int64) { + if duration <= 0 { + return + } + + wakeup := ticks() + nanosecondsToTicks(duration) + + schedulerLock() + addSleepTask(task.Current(), wakeup) + task.PauseLocked() +} + +func run() { + initHeap() + cpuTasks[0] = &mainTask + initAll() // TODO: move into main goroutine! + + until := ticks() + nanosecondsToTicks(200e6) + for ticks() < until { + } + println("\n\n=====") + + go func() { + //initAll() + startSecondaryCores() + callMain() + mainExited = true + }() + schedulerLock() + scheduler() +} + +func runSecondary(core uint32, t *task.Task) { + println("-- runSecondary") + cpuTasks[core] = t + println("-- locking for 2nd core") + schedulerLock() + println("-- locked!") + scheduler() +} + +var schedulerIsRunning = false + +func scheduler() { + if coresVerbose { + println("** scheduler on core:", currentCPU()) + } + for { + //until := ticks() + nanosecondsToTicks(100e6) + //for ticks() < until { + //} + + // Check for ready-to-run tasks. + if runnable := runQueue; runnable != nil { + if coresVerbose { + println("** scheduler", currentCPU(), "run runnable") + } + // Pop off the run queue. + runQueue = runnable.Next + runnable.Next = nil + + // Resume it now. + setCurrentTask(runnable) + schedulerUnlock() + runnable.Resume() + if coresVerbose { + println("** scheduler", currentCPU(), " returned (from runqueue resume)") + } + setCurrentTask(nil) + + continue + } + + // If another core is using the clock, let it handle the sleep queue. + if schedulerIsRunning { + if coresVerbose { + println("** scheduler", currentCPU(), "wait for other core") + } + schedulerUnlock() + waitForEvents() + schedulerLock() + continue + } + + if sleepingTask := sleepQueue; sleepingTask != nil { + now := ticks() + if now >= timeUnit(sleepingTask.Data) { + if coresVerbose { + println("** scheduler", currentCPU(), "run sleeping") + } + // This task is done sleeping. + // Resume it now. 
+ sleepQueue = sleepQueue.Next + sleepingTask.Next = nil + + setCurrentTask(sleepingTask) + schedulerUnlock() + sleepingTask.Resume() + if coresVerbose { + println("** scheduler", currentCPU(), " returned (from sleepQueue resume)") + } + setCurrentTask(nil) + continue + } + + delay := timeUnit(sleepingTask.Data) - now + if coresVerbose { + println("** scheduler", currentCPU(), "sleep", ticksToNanoseconds(delay)/1e6) + } + + // Sleep for a bit until the next task is ready to run. + schedulerIsRunning = true + schedulerUnlock() + sleepTicks(delay) + schedulerLock() + schedulerIsRunning = false + continue + } + + if coresVerbose { + println("** scheduler", currentCPU(), "wait for events") + } + schedulerUnlock() + waitForEvents() + schedulerLock() + } +} + +func currentTask() *task.Task { + return cpuTasks[currentCPU()] +} + +func setCurrentTask(task *task.Task) { + cpuTasks[currentCPU()] = task +} + +func runtimeTicks() uint64 { + return uint64(ticks()) +} + +func runtimeSleepTicks(delay uint64) { + sleepTicks(timeUnit(delay)) +} + +//export tinygo_schedulerUnlock +func tinygo_schedulerUnlock() { + schedulerUnlock() +} diff --git a/src/runtime/scheduler_none.go b/src/runtime/scheduler_none.go index a5acfd4309..7e2ddeb668 100644 --- a/src/runtime/scheduler_none.go +++ b/src/runtime/scheduler_none.go @@ -40,6 +40,11 @@ func Gosched() { // There are no other goroutines, so there's nothing to schedule. } +// NumCPU returns the number of logical CPUs usable by the current process. +func NumCPU() int { + return 1 +} + func addTimer(tim *timerNode) { runtimePanic("timers not supported without a scheduler") } diff --git a/src/runtime/scheduler_threads.go b/src/runtime/scheduler_threads.go new file mode 100644 index 0000000000..292499354e --- /dev/null +++ b/src/runtime/scheduler_threads.go @@ -0,0 +1,129 @@ +//go:build scheduler.threads + +package runtime + +import "internal/task" + +const hasScheduler = false // not using the cooperative scheduler + +// We use threads, so yes there is parallelism. +const hasParallelism = true + +var ( + timerQueueLock task.PMutex + timerQueueStarted bool + timerFutex task.Futex +) + +// Because we just use OS threads, we don't need to do anything special here. We +// can just initialize everything and run main.main on the main thread. +func run() { + initHeap() + task.Init(stackTop) + initAll() + callMain() +} + +// Pause the current task for a given time. +// +//go:linkname sleep time.Sleep +func sleep(duration int64) { + if duration <= 0 { + return + } + + sleepTicks(nanosecondsToTicks(duration)) +} + +func deadlock() { + // TODO: exit the thread via pthread_exit. + task.Pause() +} + +func scheduleTask(t *task.Task) { + t.Resume() +} + +func Gosched() { + // Each goroutine runs in a thread, so there's not much we can do here. + // There is sched_yield but it's only really intended for realtime + // operation, so is probably best not to use. +} + +// NumCPU returns the number of logical CPUs usable by the current process. +func NumCPU() int { + return task.NumCPU() +} + +// Separate goroutine (thread) that runs timer callbacks when they expire. +func timerRunner() { + for { + timerQueueLock.Lock() + + if timerQueue == nil { + // No timer in the queue, so wait until one becomes available. + val := timerFutex.Load() + timerQueueLock.Unlock() + timerFutex.Wait(val) + continue + } + + now := ticks() + if now < timerQueue.whenTicks() { + // There is a timer in the queue, but we need to wait until it + // expires. 
+ // Using a futex, so that the wait is exited early when adding a new + // (sooner-to-expire) timer. + val := timerFutex.Load() + timerQueueLock.Unlock() + timeout := ticksToNanoseconds(timerQueue.whenTicks() - now) + timerFutex.WaitUntil(val, uint64(timeout)) + continue + } + + // Pop timer from queue. + tn := timerQueue + timerQueue = tn.next + tn.next = nil + + timerQueueLock.Unlock() + + // Run the callback stored in this timer node. + delay := ticksToNanoseconds(now - tn.whenTicks()) + tn.callback(tn, delay) + } +} + +func addTimer(tim *timerNode) { + timerQueueLock.Lock() + + if !timerQueueStarted { + timerQueueStarted = true + go timerRunner() + } + + timerQueueAdd(tim) + + timerFutex.Add(1) + timerFutex.Wake() + + timerQueueLock.Unlock() +} + +func removeTimer(tim *timer) bool { + timerQueueLock.Lock() + removed := timerQueueRemove(tim) + timerQueueLock.Unlock() + return removed +} + +func schedulerRunQueue() *task.Queue { + // This function is not actually used, it is only called when hasScheduler + // is true. So we can just return nil here. + return nil +} + +func runqueueForGC() *task.Queue { + // There is only a runqueue when using the cooperative scheduler. + return nil +} diff --git a/src/sync/mutex.go b/src/sync/mutex.go index 08c674d7ea..890af78606 100644 --- a/src/sync/mutex.go +++ b/src/sync/mutex.go @@ -6,131 +6,96 @@ import ( type Mutex = task.Mutex -type RWMutex struct { - // waitingWriters are all of the tasks waiting for write locks. - waitingWriters task.Stack - - // waitingReaders are all of the tasks waiting for a read lock. - waitingReaders task.Stack +//go:linkname runtimePanic runtime.runtimePanic +func runtimePanic(msg string) - // state is the current state of the RWMutex. - // Iff the mutex is completely unlocked, it contains rwMutexStateUnlocked (aka 0). - // Iff the mutex is write-locked, it contains rwMutexStateWLocked. - // While the mutex is read-locked, it contains the current number of readers. - state uint32 +type RWMutex struct { + // Reader count, with the number of readers that currently have read-locked + // this mutex. + // The value can be in two states: one where 0 means no readers and another + // where -rwMutexMaxReaders means no readers. A base of 0 is normal + // uncontended operation, a base of -rwMutexMaxReaders means a writer has + // the lock or is trying to get the lock. In the second case, readers should + // wait until the reader count becomes non-negative again to give the writer + // a chance to obtain the lock. + readers task.Futex + + // Writer futex, normally 0. If there is a writer waiting until all readers + // have unlocked, this value is 1. It will be changed to a 2 (and get a + // wake) when the last reader unlocks. + writer task.Futex + + // Writer lock. Held between Lock() and Unlock(). + writerLock Mutex } -const ( - rwMutexStateUnlocked = uint32(0) - rwMutexStateWLocked = ^uint32(0) - rwMutexMaxReaders = rwMutexStateWLocked - 1 -) +const rwMutexMaxReaders = 1 << 30 func (rw *RWMutex) Lock() { - if rw.state == 0 { - // The mutex is completely unlocked. - // Lock without waiting. - rw.state = rwMutexStateWLocked + // Exclusive lock for writers. + rw.writerLock.Lock() + + // Flag that we need to be awakened after the last read-lock unlocks. + rw.writer.Store(1) + + // Signal to readers that they can't lock this mutex anymore. + n := uint32(rwMutexMaxReaders) + waiting := rw.readers.Add(-n) + if int32(waiting) == -rwMutexMaxReaders { + // All readers were already unlocked, so we don't need to wait for them. 
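+ // Clear the writer flag again: no reader is left that could wake us.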
+ rw.writer.Store(0) return } - // Wait for the lock to be released. - rw.waitingWriters.Push(task.Current()) - task.Pause() + // There is at least one reader. + // Wait until all readers are unlocked. The last reader to unlock will set + // rw.writer to 2 and awaken us. + for rw.writer.Load() == 1 { + rw.writer.Wait(1) + } + rw.writer.Store(0) } func (rw *RWMutex) Unlock() { - switch rw.state { - case rwMutexStateWLocked: - // This is correct. - - case rwMutexStateUnlocked: - // The mutex is already unlocked. - panic("sync: unlock of unlocked RWMutex") - - default: - // The mutex is read-locked instead of write-locked. - panic("sync: write-unlock of read-locked RWMutex") + // Signal that new readers can lock this mutex. + waiting := rw.readers.Add(rwMutexMaxReaders) + if waiting != 0 { + // Awaken all waiting readers. + rw.readers.WakeAll() } - switch { - case rw.maybeUnblockReaders(): - // Switched over to read mode. - - case rw.maybeUnblockWriter(): - // Transferred to another writer. - - default: - // Nothing is waiting for the lock. - rw.state = rwMutexStateUnlocked - } + // Done with this lock (next writer can try to get a lock). + rw.writerLock.Unlock() } func (rw *RWMutex) RLock() { - if rw.state == rwMutexStateWLocked { - // Wait for the write lock to be released. - rw.waitingReaders.Push(task.Current()) - task.Pause() - return - } + // Add us as a reader. + newVal := rw.readers.Add(1) - if rw.state == rwMutexMaxReaders { - panic("sync: too many readers on RWMutex") + // Wait until the RWMutex is available for readers. + for int32(newVal) <= 0 { + rw.readers.Wait(newVal) + newVal = rw.readers.Load() } - - // Increase the reader count. - rw.state++ } func (rw *RWMutex) RUnlock() { - switch rw.state { - case rwMutexStateUnlocked: - // The mutex is already unlocked. - panic("sync: unlock of unlocked RWMutex") - - case rwMutexStateWLocked: - // The mutex is write-locked instead of read-locked. - panic("sync: read-unlock of write-locked RWMutex") - } - - rw.state-- + // Remove us as a reader. + one := uint32(1) + readers := int32(rw.readers.Add(-one)) - if rw.state == rwMutexStateUnlocked { - // This was the last reader. - // Try to unblock a writer. - rw.maybeUnblockWriter() + // Check whether RUnlock was called too often. + if readers == -1 || readers == (-rwMutexMaxReaders)-1 { + runtimePanic("sync: RUnlock of unlocked RWMutex") } -} -func (rw *RWMutex) maybeUnblockReaders() bool { - var n uint32 - for { - t := rw.waitingReaders.Pop() - if t == nil { - break + if readers == -rwMutexMaxReaders { + // This was the last read lock. Check whether we need to wake up a write + // lock. + if rw.writer.CompareAndSwap(1, 2) { + rw.writer.Wake() } - - n++ - scheduleTask(t) - } - if n == 0 { - return false } - - rw.state = n - return true -} - -func (rw *RWMutex) maybeUnblockWriter() bool { - t := rw.waitingWriters.Pop() - if t == nil { - return false - } - - rw.state = rwMutexStateWLocked - scheduleTask(t) - - return true } type Locker interface { diff --git a/targets/arm.ld b/targets/arm.ld index cdf5b1dd43..631ec9cab2 100644 --- a/targets/arm.ld +++ b/targets/arm.ld @@ -32,6 +32,15 @@ SECTIONS _stack_top = .; } >RAM + /* Stack for second core (core 1), if there is one. This memory area won't be + * reserved if there is no second core. */ + .stack1 (NOLOAD) : + { + . = ALIGN(4); + . += _stack_size; + _stack1_top = .; + } >RAM + /* Start address (in flash) of .data, used by startup code. 
*/ _sidata = LOADADDR(.data); diff --git a/testdata/channel.go b/testdata/channel.go index 9c0fee5b73..a247c808f2 100644 --- a/testdata/channel.go +++ b/testdata/channel.go @@ -12,6 +12,7 @@ var wg sync.WaitGroup type intchan chan int func main() { + time.Sleep(time.Second * 2) ch := make(chan int, 2) ch <- 1 println("len, cap of channel:", len(ch), cap(ch), ch == nil) diff --git a/testdata/goroutines.go b/testdata/goroutines.go index cf19cc3ca0..bed2ecb1a7 100644 --- a/testdata/goroutines.go +++ b/testdata/goroutines.go @@ -5,19 +5,27 @@ import ( "time" ) -func init() { - println("init") - go println("goroutine in init") - time.Sleep(1 * time.Millisecond) -} +//func init() { +// println("\n----") +// println("init") +// go println("goroutine in init") +// time.Sleep(1 * time.Millisecond) +//} func main() { + //for i := 0; i < 2; i++ { + // println("...") + // time.Sleep(time.Second) + //} + println("main 1") go sub() time.Sleep(1 * time.Millisecond) println("main 2") time.Sleep(2 * time.Millisecond) println("main 3") + //time.Sleep(2 * time.Millisecond) + //println("main 4") // Await a blocking call. println("wait:") @@ -101,6 +109,8 @@ func sub() { println("sub 1") time.Sleep(2 * time.Millisecond) println("sub 2") + //time.Sleep(2 * time.Millisecond) + //println("sub 3") } func wait() { diff --git a/tools/gen-critical-atomics/gen-critical-atomics.go b/tools/gen-critical-atomics/gen-critical-atomics.go index 75ea327076..f949a7933a 100644 --- a/tools/gen-critical-atomics/gen-critical-atomics.go +++ b/tools/gen-critical-atomics/gen-critical-atomics.go @@ -26,7 +26,6 @@ package runtime import ( _ "unsafe" - "runtime/interrupt" ) // Documentation: @@ -41,29 +40,29 @@ import ( func __atomic_load_{{.}}(ptr *uint{{$bits}}, ordering uintptr) uint{{$bits}} { // The LLVM docs for this say that there is a val argument after the pointer. // That is a typo, and the GCC docs omit it. - mask := interrupt.Disable() + mask := atomicLock() val := *ptr - interrupt.Restore(mask) + atomicUnlock(mask) return val } {{end}} {{- define "store"}}{{$bits := mul . 8 -}} //export __atomic_store_{{.}} func __atomic_store_{{.}}(ptr *uint{{$bits}}, val uint{{$bits}}, ordering uintptr) { - mask := interrupt.Disable() + mask := atomicLock() *ptr = val - interrupt.Restore(mask) + atomicUnlock(mask) } {{end}} {{- define "cas"}}{{$bits := mul . 8 -}} //go:inline func doAtomicCAS{{$bits}}(ptr *uint{{$bits}}, expected, desired uint{{$bits}}) uint{{$bits}} { - mask := interrupt.Disable() + mask := atomicLock() old := *ptr if old == expected { *ptr = desired } - interrupt.Restore(mask) + atomicUnlock(mask) return old } @@ -82,10 +81,10 @@ func __atomic_compare_exchange_{{.}}(ptr, expected *uint{{$bits}}, desired uint{ {{- define "swap"}}{{$bits := mul . 8 -}} //go:inline func doAtomicSwap{{$bits}}(ptr *uint{{$bits}}, new uint{{$bits}}) uint{{$bits}} { - mask := interrupt.Disable() + mask := atomicLock() old := *ptr *ptr = new - interrupt.Restore(mask) + atomicUnlock(mask) return old } @@ -111,11 +110,11 @@ func __atomic_exchange_{{.}}(ptr *uint{{$bits}}, new uint{{$bits}}, ordering uin //go:inline func {{$opfn}}(ptr *{{$type}}, value {{$type}}) (old, new {{$type}}) { - mask := interrupt.Disable() + mask := atomicLock() old = *ptr {{$opdef}} *ptr = new - interrupt.Restore(mask) + atomicUnlock(mask) return old, new }
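
A note on the atomics template above: the generated helpers now go through atomicLock/atomicUnlock instead of calling runtime/interrupt directly, so the same generated code can serve both single-core and multi-core targets. Those two helpers are not shown in this diff; the following is only a rough sketch of what a single-core fallback could look like (an assumption, not the actual implementation), where masking interrupts is sufficient because no second core can touch the value concurrently:

package runtime

import "device/arm"

// Hypothetical single-core implementation of the lock used by the generated
// atomics: disable interrupts around the read-modify-write sequence and return
// the previous interrupt state.
func atomicLock() uintptr {
	return arm.DisableInterrupts()
}

// atomicUnlock restores the interrupt state saved by atomicLock.
func atomicUnlock(mask uintptr) {
	arm.EnableInterrupts(mask)
}

On a dual-core chip such as the RP2040, these helpers would additionally have to take a lock shared between the cores (for example one of the SIO hardware spinlocks) so that the other core is excluded as well.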