From b2b654027b65e7c7ed0b49bce11cd1a38e4e151b Mon Sep 17 00:00:00 2001
From: Richard Davison <ridaviso@amazon.com>
Date: Thu, 5 Sep 2024 19:57:35 +0200
Subject: [PATCH] experiment with a single compressed bytecode bundle. Unpack
 on demand

---
 Cargo.lock                |  18 ++-
 Makefile                  |   7 +-
 build.mjs                 |   6 +-
 embed.mjs                 | 116 ++++++++++++++++
 llrt/src/main.c           | 159 +++++++++++++++++++---
 llrt_core/Cargo.toml      |   3 +-
 llrt_core/build.rs        |  34 +++--
 llrt_core/src/bytecode.rs |   1 +
 llrt_core/src/vm.rs       | 269 +++++++++++++++++++++++++++++++++++---
 pack                      | 105 ++++++++++-----
 10 files changed, 627 insertions(+), 91 deletions(-)
 create mode 100644 embed.mjs

diff --git a/Cargo.lock b/Cargo.lock
index d1c247951e..f9a324aa60 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1530,7 +1530,7 @@ dependencies = [
  "gix-object",
  "gix-traverse",
  "itoa",
- "memmap2",
+ "memmap2 0.5.10",
  "smallvec",
  "thiserror",
 ]
@@ -1610,7 +1610,7 @@ dependencies = [
  "gix-path",
  "gix-tempfile",
  "gix-traverse",
- "memmap2",
+ "memmap2 0.5.10",
  "parking_lot",
  "smallvec",
  "thiserror",
@@ -1696,7 +1696,7 @@ dependencies = [
  "gix-path",
  "gix-tempfile",
  "gix-validate",
- "memmap2",
+ "memmap2 0.5.10",
  "nom",
  "thiserror",
 ]
@@ -2588,9 +2588,9 @@ dependencies = [
  "libc",
  "llrt_modules",
  "llrt_utils",
+ "memmap2 0.9.4",
  "nanoid",
  "once_cell",
- "phf",
  "phf_codegen",
  "quick-xml",
  "rand",
@@ -2603,6 +2603,7 @@ dependencies = [
  "tokio",
  "tracing",
  "tracing-core",
+ "url",
  "uuid",
  "uuid-simd",
  "webpki",
@@ -2713,6 +2714,15 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "memmap2"
+version = "0.9.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fe751422e4a8caa417e13c3ea66452215d7d63e19e604f4980461212f3ae1322"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "minimal-lexical"
 version = "0.2.1"
diff --git a/Makefile b/Makefile
index 98c01e5745..b7f0abbf93 100644
--- a/Makefile
+++ b/Makefile
@@ -84,9 +84,10 @@ define lambda_release_template
 release-${1}${2}: | llrt-lambda-${1}${2} llrt-container-${1}${2}
 
 llrt-lambda-${1}${2}: export SDK_BUNDLE_MODE = ${3}
-llrt-lambda-${1}${2}: | clean-js js
-	cargo $$(BUILD_ARG) --target $$(TARGET_linux_$$(RELEASE_ARCH_NAME_${1})) --features lambda
-	./pack target/$$(TARGET_linux_$$(RELEASE_ARCH_NAME_${1}))/release/llrt target/$$(TARGET_linux_$$(RELEASE_ARCH_NAME_${1}))/release/bootstrap
+llrt-lambda-${1}${2}:
+	cargo $$(BUILD_ARG) --target $$(TARGET_linux_$$(RELEASE_ARCH_NAME_${1})) --features lambda,uncompressed -vvv
+	node embed.mjs target/$$(TARGET_linux_$$(RELEASE_ARCH_NAME_${1}))/release/llrt_bytecode target/$$(TARGET_linux_$$(RELEASE_ARCH_NAME_${1}))/release/llrt target/$$(TARGET_linux_$$(RELEASE_ARCH_NAME_${1}))/release/bytecode -r
+	./pack target/$$(TARGET_linux_$$(RELEASE_ARCH_NAME_${1}))/release/llrt target/$$(TARGET_linux_$$(RELEASE_ARCH_NAME_${1}))/release/bootstrap target/$$(TARGET_linux_$$(RELEASE_ARCH_NAME_${1}))/release/bytecode target/$$(TARGET_linux_$$(RELEASE_ARCH_NAME_${1}))/release/llrt_bytecode/compression.dict
 	@rm -rf llrt-lambda-${1}${2}.zip
 	zip -j llrt-lambda-${1}${2}.zip target/$$(TARGET_linux_$$(RELEASE_ARCH_NAME_${1}))/release/bootstrap
 
diff --git a/build.mjs b/build.mjs
index fb98f5ca04..783c51ab64 100644
--- a/build.mjs
+++ b/build.mjs
@@ -10,7 +10,7 @@ process.env.NODE_PATH = ".";
 const TMP_DIR = `.tmp-llrt-aws-sdk`;
 const SRC_DIR = path.join("llrt_core", "src", "modules", "js");
 const TESTS_DIR = "tests";
-const OUT_DIR = "bundle/js";
+const OUT_DIR = "bundle";
 const SHIMS = new Map();
 const SDK_BUNDLE_MODE = process.env.SDK_BUNDLE_MODE || "NONE"; // "FULL" or "STD" or "NONE"
 
@@ -662,7 +662,7 @@ async function loadShims() {
 
 async function buildLibrary() {
   const defaultLibEsBuildOption = {
-    chunkNames: "llrt-[name]-runtime-[hash]",
+    chunkNames: "lrtrt-[name]-[hash]",
     ...ES_BUILD_OPTIONS,
     splitting: false,
     keepNames: true,
@@ -741,7 +741,7 @@ async function buildSdks() {
         "fast-xml-parser": "llrt:xml",
         uuid: "llrt:uuid",
       },
-      chunkNames: "llrt-[name]-sdk-[hash]",
+      chunkNames: "lrt[hash]",
       metafile: true,
       ...ES_BUILD_OPTIONS,
     }),
diff --git a/embed.mjs b/embed.mjs
new file mode 100644
index 0000000000..350f06f20c
--- /dev/null
+++ b/embed.mjs
@@ -0,0 +1,116 @@
+import fs from "fs/promises";
+import path from "path";
+
+// Recursively lists every `.lrt` file below `bytecodeDir`, sorted by path.
+async function readFiles(bytecodeDir) {
+  const fileEntries = await fs.readdir(bytecodeDir, {
+    recursive: true,
+    withFileTypes: true,
+  });
+  const files = fileEntries
+    // Directories can also be named `*.lrt`; only keep non-directory entries.
+    .filter((entry) => !entry.isDirectory() && entry.name.endsWith(".lrt"))
+    // `parentPath` is missing on older Node releases, which expose the same
+    // directory through the deprecated `path` property instead.
+    .map((entry) => path.join(entry.parentPath ?? entry.path, entry.name));
+
+  files.sort((a, b) => a.localeCompare(b));
+  return files;
+}
+
+// Resolves to `[name, bytes]` pairs; name is bytecodeDir-relative, extension removed.
+async function readFileData(bytecodeDir, files) {
+  const loadEntry = async (file) => {
+    const bytes = await fs.readFile(file);
+    const relative = path.parse(path.relative(bytecodeDir, file));
+    return [relative.dir ? `${relative.dir}/${relative.name}` : relative.name, bytes];
+  };
+  return await Promise.all(files.map(loadEntry));
+}
+
+/**
+ * Writes `target` as: source bytes (unless `writeRaw`), bytecode, index, metadata.
+ * Index entry: u16 name byte length, name, u32 offset, u32 size (little endian).
+ * Metadata: u32 package count, u32 bytecode position, u32 index position, "lrt".
+ */
+async function buildFileIndex(source, target, fileData, writeRaw) {
+  const uint32Buffer = (value) => {
+    const buffer = Buffer.alloc(4);
+    buffer.writeUInt32LE(value);
+    return buffer;
+  };
+  const uint16Buffer = (value) => {
+    const buffer = Buffer.alloc(2);
+    buffer.writeUInt16LE(value);
+    return buffer;
+  };
+
+  const sourceData = await fs.readFile(source);
+  const packageIndexList = [];
+  let offset = 0;
+  const bytecodeData = [];
+
+  for (let [name, data] of fileData) {
+    if (name.startsWith("lrt") || name.startsWith("llrt")) {
+      name = `${name}.js`;
+    }
+    // Encode first: the length prefix must count UTF-8 bytes, which differs
+    // from `name.length` (UTF-16 code units) for non-ASCII module names.
+    const nameBuffer = Buffer.from(name);
+    const nameLengthBuffer = uint16Buffer(nameBuffer.length);
+    const bytecodeSizeBuffer = uint32Buffer(data.length);
+    const bytecodeOffsetBuffer = uint32Buffer(offset);
+    packageIndexList.push(
+      Buffer.concat([
+        nameLengthBuffer,
+        nameBuffer,
+        bytecodeOffsetBuffer,
+        bytecodeSizeBuffer,
+      ])
+    );
+    offset += data.length;
+    bytecodeData.push(data);
+  }
+
+  const allBytecodeData = Buffer.concat(bytecodeData);
+  const packageCount = fileData.length;
+  // In raw mode nothing precedes the bytecode, so positions start at zero.
+  const bytecodePosition = writeRaw ? 0 : sourceData.length;
+  const packageIndexPosition = bytecodePosition + allBytecodeData.length;
+
+  const metadataBuffer = Buffer.concat([
+    uint32Buffer(packageCount),
+    uint32Buffer(bytecodePosition),
+    uint32Buffer(packageIndexPosition),
+    Buffer.from("lrt"),
+  ]);
+  const packageIndexBuffer = Buffer.concat(packageIndexList);
+  const finalBuffer = Buffer.concat([
+    ...(writeRaw ? [] : [sourceData]),
+    allBytecodeData,
+    packageIndexBuffer,
+    metadataBuffer,
+  ]);
+
+  console.log("Embedded size:", allBytecodeData.length / 1024, "kB");
+  await fs.writeFile(target, finalBuffer);
+  if (!writeRaw) {
+    await fs.chmod(target, 0o755);
+  }
+}
+
+const [bytecodeDir, source, target, rawArg] = process.argv.slice(2);
+// Raw mode writes only the bundle; otherwise it follows a copy of `source`.
+const writeRaw = rawArg === "-r" || rawArg === "--raw";
+if (!bytecodeDir || !source || !target) {
+  console.error(
+    `No source or target specified, use:\n${path.basename(process.argv[0])} ${path.basename(process.argv[1])} {bytecode_directory} {input_target} {output_target} [-r|--raw]`
+  );
+  process.exit(1);
+}
+
+console.log("Reading files...");
+const files = await readFiles(bytecodeDir);
+console.log("Reading file data...");
+const filesContents = await readFileData(bytecodeDir, files);
+await buildFileIndex(source, target, filesContents, writeRaw);
diff --git a/llrt/src/main.c b/llrt/src/main.c
index 018ae39608..e6f4432457 100644
--- a/llrt/src/main.c
+++ b/llrt/src/main.c
@@ -12,13 +12,13 @@
 #include <sys/time.h>
 #include <sys/mman.h>
 #include <err.h>
-#include <errno.h>
 #include <pthread.h>
 #include <sys/stat.h>
 #include <zstd.h>
 #include <stdarg.h>
 #include <sys/mman.h>
 #include <sys/syscall.h>
+#include <fcntl.h>
 
 #ifdef __x86_64__
 #define MEMFD_CREATE_SYSCALL_ID 319
@@ -106,7 +106,7 @@ void logError(const char *format, ...)
   }
 }
 
-static uint32_t calculateSum(uint32_t *array, uint8_t size)
+static uint32_t sumArray(uint32_t *array, uint8_t size)
 {
   uint32_t sum = 0;
   for (uint8_t i = 0; i < size; i++)
@@ -128,12 +128,17 @@ typedef struct
   uint32_t srcSize;
   uint32_t dstSize;
   uint32_t id;
+  uint32_t extraSize;
   const void *inputBuffer;
   const void *outputBuffer;
+  const void *extraSrc;
+  void *extraDst; // written to by memcpy, so it must not point to const
 } DecompressThreadArgs;
 
 static void *decompressPartial(void *arg)
 {
+  double t0 = micro_seconds();
+
   DecompressThreadArgs *args = (DecompressThreadArgs *)arg;
   size_t srcSize = args->srcSize;
   size_t dstSize = args->dstSize;
@@ -145,18 +150,30 @@ static void *decompressPartial(void *arg)
     printf("%s!\n", ZSTD_getErrorName(dSize));
     return (void *)1;
   }
+
+  logInfo("Started thread %d\n", args->id);
+
+  if (args->id == 0)
+  {
+    memcpy(args->extraDst, args->extraSrc, args->extraSize);
+  }
+
+  double t1 = micro_seconds();
+
+  logInfo("Extraction thread %d: %10.4f ms\n", args->id, (t1 - t0) / 1000.0);
   return (void *)0;
 }
 
 extern char **environ;
 
 static void readData(
-    const char *data,
+    const uint8_t *data, // byte pointer: the body subscripts it, which void * does not allow
     uint8_t parts,
     uint32_t **inputSizes,
     uint32_t **outputSizes,
     uint8_t **compressedData,
-    uint32_t *uncompressedSize)
+    uint32_t *uncompressedSize,
+    uint32_t *extraDataOffset)
 {
   uint32_t metadataSize = sizeof(uint32_t) * parts;
 
@@ -166,20 +183,21 @@ static void readData(
   // Extract output sizes
   *outputSizes = (uint32_t *)&data[1 + metadataSize];
 
-  *uncompressedSize = calculateSum(*outputSizes, parts);
+  *uncompressedSize = sumArray(*outputSizes, parts);
+  uint32_t totalInputSize = sumArray(*inputSizes, parts);
 
   // Calculate the offset to the compressed data
   uint8_t dataOffset = 1 + (2 * metadataSize);
 
   *compressedData = (uint8_t *)&data[dataOffset];
+  *extraDataOffset = dataOffset + totalInputSize;
 }
 
-static void decompress(char **uncompressedData, uint32_t *uncompressedSize, int outputFd)
+static void decompress(void *payload, uint32_t payloadSize, void **uncompressedData, uint32_t *uncompressedSize, uint32_t *extraDataOffset, int outputFd)
 {
 
-#include "data.c"
+  uint8_t parts = *((uint8_t *)payload);
 
-  uint8_t parts = data[0];
   uint32_t *inputSizes;
   uint32_t *outputSizes;
   uint32_t inputOffset = 0;
@@ -198,19 +216,24 @@ static void decompress(char **uncompressedData, uint32_t *uncompressedSize, int
     logInfo("Decompressing\n");
   }
 
-  readData(data, parts, &inputSizes, &outputSizes, &compressedData, uncompressedSize);
+  readData(payload, parts, &inputSizes, &outputSizes, &compressedData, uncompressedSize, extraDataOffset);
+
+  uint32_t extraSize = payloadSize - *extraDataOffset - sizeof(int32_t);
 
-  if (ftruncate(outputFd, *uncompressedSize) == -1)
+  if (ftruncate(outputFd, *uncompressedSize + extraSize) == -1)
   {
-    err(1, "Failed to set file size");
+    err(1, "Failed to set output file size");
   }
 
-  uncompressed = mmap(NULL, *uncompressedSize, PROT_READ | PROT_WRITE, MAP_SHARED, outputFd, 0);
+  uncompressed = mmap(NULL, *uncompressedSize + extraSize, PROT_READ | PROT_WRITE, MAP_SHARED, outputFd, 0);
   if (uncompressed == MAP_FAILED || !uncompressed)
   {
     err(1, "Memory mapping failed: Unable to map %u bytes. Make sure you have enough memory available", *uncompressedSize);
   }
 
+  void *extraDst = uncompressed + *uncompressedSize;
+  void *extraSrc = payload + *extraDataOffset;
+
   DecompressThreadArgs args[parts];
   for (uint32_t i = 0; i < parts; i++)
   {
@@ -219,8 +242,15 @@ static void decompress(char **uncompressedData, uint32_t *uncompressedSize, int
     args[i].srcSize = inputSizes[i];
     args[i].dstSize = outputSizes[i];
     args[i].id = i;
+    args[i].extraSize = 0;
     inputOffset += inputSizes[i];
     outputOffset += outputSizes[i];
+    if (i == 0)
+    {
+      args[i].extraSrc = extraSrc;
+      args[i].extraDst = extraDst;
+      args[i].extraSize = extraSize;
+    }
     if (parts > 1)
     {
       pthread_create(&threads[i], NULL, decompressPartial, (void *)&args[i]);
@@ -246,40 +276,123 @@ static void decompress(char **uncompressedData, uint32_t *uncompressedSize, int
   *uncompressedData = uncompressed;
 }
 
+// Describes one mapped region to release: its start address and byte length.
+typedef struct
+{
+  void *addr;
+  size_t length;
+} UnmapThreadArgs;
+
+// pthread start routine: munmaps the region in arg; err() exits the process on failure.
+void *unmapThread(void *arg)
+{
+  UnmapThreadArgs *args = (UnmapThreadArgs *)arg;
+  if (munmap(args->addr, args->length) == -1)
+  {
+    err(1, "Failed to unmap memory");
+  }
+  return NULL;
+}
+
 int main(int argc, char *argv[])
 {
+  double t0 = micro_seconds();
   initLoggingFlag();
 
-  logInfo("Runtime starting\n");
+  logInfo("Extractor started\n");
 
   char *tmpAppname = strrchr(argv[0], '/');
   char *appname = tmpAppname ? ++tmpAppname : argv[0];
 
-  double t0 = micro_seconds();
-
   int outputFd = memfd_create_syscall(appname, 0);
   if (outputFd == -1)
   {
     err(1, "Could not create memfd");
   }
 
-  char *uncompressedData;
+  // Open the file
+  int selfFd = open(argv[0], O_RDONLY);
+  if (selfFd == -1)
+  {
+    err(1, "Could not open self exec");
+  }
+
+  // Get file size
+  struct stat selfStats;
+  if (fstat(selfFd, &selfStats) == -1)
+  {
+    close(selfFd);
+    err(1, "Could not get filesize");
+  }
+
+  void *selfBytes = mmap(NULL, selfStats.st_size, PROT_READ, MAP_PRIVATE, selfFd, 0);
+  if (selfBytes == MAP_FAILED || !selfBytes)
+  {
+    close(selfFd);
+    err(1, "Failed to memory map source");
+  }
+  close(selfFd);
+
+  // The final 4 bytes of the executable store the file offset of the payload.
+  uint32_t offset = *(uint32_t *)(selfBytes + (selfStats.st_size - sizeof(uint32_t)));
+  uint32_t payloadSize = selfStats.st_size - offset;
+  void *payload = selfBytes + offset;
+
+  logInfo("Payload size %u @ offset %u\n", payloadSize, offset);
+
+  void *uncompressedData;
   uint32_t uncompressedSize;
+  uint32_t extraDataOffset;
 
-  decompress(&uncompressedData, &uncompressedSize, outputFd);
+  decompress(payload, payloadSize, &uncompressedData, &uncompressedSize, &extraDataOffset, outputFd);
 
   double t1 = micro_seconds();
-  logInfo("Runtime starting\n");
   logInfo("Extraction time: %10.4f ms\n", (t1 - t0) / 1000.0);
 
-  if (munmap(uncompressedData, uncompressedSize) == -1)
+  uint32_t extraSize = payloadSize - extraDataOffset - sizeof(int32_t);
+
+  // These strings are handed to the runtime through environment variables.
+  char extraSizeStr[16];
+  snprintf(extraSizeStr, sizeof extraSizeStr, "%u", extraSize);
+  logInfo("Extra size: %u\n", extraSize);
+
+  char extraOffsetStr[16];
+  snprintf(extraOffsetStr, sizeof extraOffsetStr, "%u", uncompressedSize);
+
+  char outputFdStr[16];
+  snprintf(outputFdStr, sizeof outputFdStr, "%i", outputFd);
+
+  // Unmap the whole memfd mapping, including the extra data copied after the binary.
+  pthread_t uncompressedUnmapTread;
+  UnmapThreadArgs uncompressedUnmapThreadArgs = {.addr = uncompressedData, .length = uncompressedSize + extraSize};
+  pthread_t selfBytesUnmapThread;
+  UnmapThreadArgs selfBytesUnmapThreadArgs = {.addr = selfBytes, .length = selfStats.st_size};
+
+  if (pthread_create(&uncompressedUnmapTread, NULL, unmapThread, &uncompressedUnmapThreadArgs) != 0)
   {
-    err(1, "Failed to unmap memory");
+    return 1;
+  }
+
+  if (pthread_create(&selfBytesUnmapThread, NULL, unmapThread, &selfBytesUnmapThreadArgs) != 0)
+  {
+    return 1;
   }
 
+  // if (munmap(uncompressedData, uncompressedSize) == -1)
+  // {
+  //   err(1, "Failed to unmap memory");
+  // }
+
+  // if (munmap(selfBytes, selfStats.st_size) == -1)
+  // {
+  //   err(1, "Failed to unmap memory");
+  // }
+
   double t2 = micro_seconds();
   logInfo("Extraction + write time: %10.4f ms\n", (t2 - t0) / 1000.0);
 
+  logInfo("Runtime starting\n");
+
   char **new_argv = malloc((size_t)(argc + 1) * sizeof *new_argv);
   for (uint8_t i = 0; i < argc; ++i)
   {
@@ -327,6 +440,12 @@ int main(int argc, char *argv[])
   setenv("_START_TIME", startTimeStr, false);
   setenv("MIMALLOC_RESERVE_OS_MEMORY", mimallocReserveMemoryMb, false);
   setenv("MIMALLOC_LIMIT_OS_ALLOC", "1", false);
+  setenv("LLRT_MEM_FD", outputFdStr, false);
+  setenv("LLRT_BYTECODE_OFFSET", extraOffsetStr, false);
+  setenv("LLRT_BYTECODE_SIZE", extraSizeStr, false);
+
+  pthread_join(uncompressedUnmapTread, NULL);
+  pthread_join(selfBytesUnmapThread, NULL);
 
   logInfo("Starting app\n");
 
diff --git a/llrt_core/Cargo.toml b/llrt_core/Cargo.toml
index 7801f8eaf7..3ec7f93a00 100644
--- a/llrt_core/Cargo.toml
+++ b/llrt_core/Cargo.toml
@@ -25,7 +25,6 @@ chrono = { version = "0.4.38", default-features = false, features = ["std"] }
 quick-xml = "0.36.0"
 crc32c = { version = "0.6.8" }
 crc32fast = "1.4.2"
-phf = "0.11.2"
 rquickjs = { version = "0.6.2", features = [
     "full-async",
     "parallel",
@@ -78,6 +77,8 @@ flate2 = { version = "1.0.30", features = [
 ], default-features = false }
 brotlic = "0.8.2"
 rustls-pemfile = "2.1.2"
+url = "=2.5.1"
+memmap2 = "0.9.4"
 
 [build-dependencies]
 rquickjs = { version = "0.6.2", features = [
diff --git a/llrt_core/build.rs b/llrt_core/build.rs
index 54c0d5982f..65dcd5969a 100644
--- a/llrt_core/build.rs
+++ b/llrt_core/build.rs
@@ -16,7 +16,7 @@ use std::io::Write;
 use jwalk::WalkDir;
 use rquickjs::{CatchResultExt, CaughtError, Context, Module, Runtime};
 
-const BUNDLE_JS_DIR: &str = "../bundle/js";
+const BUNDLE_DIR: &str = "../bundle";
 
 include!("src/bytecode.rs");
 
@@ -38,7 +38,7 @@ include!("src/compiler_common.rs");
 async fn main() -> StdResult<(), Box<dyn Error>> {
     set_nightly_cfg();
 
-    rerun_if_changed!(BUNDLE_JS_DIR);
+    rerun_if_changed!(BUNDLE_DIR);
     rerun_if_changed!("Cargo.toml");
     rerun_if_changed!("patches");
 
@@ -52,8 +52,20 @@ async fn main() -> StdResult<(), Box<dyn Error>> {
     let ctx = Context::full(&rt)?;
 
     let out_dir = env::var("OUT_DIR").unwrap();
+    let out_dir: PathBuf = out_dir.into();
+    let mut components = out_dir.components();
+    components.next_back();
+    components.next_back();
+    components.next_back();
+    // components.next_back();
+    let out_dir = components.as_path().join("llrt_bytecode");
 
-    let sdk_bytecode_path = Path::new(&out_dir).join("bytecode_cache.rs");
+    info!("outdir={:?}", &out_dir);
+
+    let _ = fs::remove_dir_all(&out_dir);
+    let _ = fs::create_dir_all(&out_dir);
+
+    let sdk_bytecode_path = out_dir.join("bytecode_cache.rs");
     let mut sdk_bytecode_file = BufWriter::new(File::create(sdk_bytecode_path)?);
 
     let mut ph_map = phf_codegen::Map::<String>::new();
@@ -63,10 +75,10 @@ async fn main() -> StdResult<(), Box<dyn Error>> {
     fs::write("../VERSION", env!("CARGO_PKG_VERSION")).expect("Unable to write VERSION file");
 
     ctx.with(|ctx| {
-        for dir_ent in WalkDir::new(BUNDLE_JS_DIR).into_iter().flatten() {
+        for dir_ent in WalkDir::new(BUNDLE_DIR).into_iter().flatten() {
             let path = dir_ent.path();
 
-            let path = path.strip_prefix(BUNDLE_JS_DIR)?.to_owned();
+            let path = path.strip_prefix(BUNDLE_DIR)?.to_owned();
             let path_str = path.to_string_lossy().to_string();
 
             if path_str.starts_with("__tests__") || path.extension().unwrap_or_default() != "js" {
@@ -80,6 +92,10 @@ async fn main() -> StdResult<(), Box<dyn Error>> {
                 }
             }
 
+            if path == PathBuf::new().join("@llrt").join("std.js") {
+                continue;
+            }
+
             #[cfg(feature = "no-sdk")]
             {
                 if path_str.starts_with("@aws-sdk")
@@ -156,9 +172,9 @@ async fn main() -> StdResult<(), Box<dyn Error>> {
         .to_string_lossy()
         .to_string();
 
-    if cfg!(feature = "uncompressed") {
-        generate_compression_dictionary(&compression_dictionary_path, &lrt_filenames)?;
-    } else {
+    generate_compression_dictionary(&compression_dictionary_path, &lrt_filenames)?;
+
+    if cfg!(not(feature = "uncompressed")) {
         total_bytes = compress_bytecode(compression_dictionary_path, lrt_filenames)?;
 
         info!(
@@ -189,8 +205,6 @@ fn set_nightly_cfg() {
 }
 
 fn compress_bytecode(dictionary_path: String, source_files: Vec<String>) -> io::Result<usize> {
-    generate_compression_dictionary(&dictionary_path, &source_files)?;
-
     let mut total_size = 0;
     let tmp_dir = env::temp_dir();
 
diff --git a/llrt_core/src/bytecode.rs b/llrt_core/src/bytecode.rs
index d7232c0447..ce29bf490e 100644
--- a/llrt_core/src/bytecode.rs
+++ b/llrt_core/src/bytecode.rs
@@ -4,6 +4,7 @@
 pub const BYTECODE_VERSION: &str = "lrt01";
 pub const BYTECODE_COMPRESSED: u8 = b'c';
 pub const BYTECODE_UNCOMPRESSED: u8 = b'u';
+pub const BYTECODE_EMBEDDED_SIGNATURE: &[u8] = b"lrt";
 #[allow(dead_code)]
 pub const BYTECODE_EXT: &str = "lrt";
 pub const SIGNATURE_LENGTH: usize = BYTECODE_VERSION.len() + 1;
diff --git a/llrt_core/src/vm.rs b/llrt_core/src/vm.rs
index 901baba436..babb280f8c 100644
--- a/llrt_core/src/vm.rs
+++ b/llrt_core/src/vm.rs
@@ -6,15 +6,23 @@ use std::{
     env,
     ffi::CStr,
     fmt::Write,
-    io,
+    fs::{self, File},
+    io::{self, Read, Seek},
+    mem::size_of,
+    ops::Range,
+    os::unix::fs::MetadataExt,
     path::{Component, Path, PathBuf},
     process::exit,
+    rc::Rc,
     result::Result as StdResult,
-    sync::{Arc, Mutex},
+    sync::{Arc, Mutex, RwLock},
 };
 
 use llrt_modules::timers::{self, poll_timers};
-use llrt_utils::{bytes::ObjectBytes, error::ErrorExtensions, object::ObjectExt};
+use llrt_utils::{
+    bytes::ObjectBytes, encoding::bytes_to_hex_string, error::ErrorExtensions, object::ObjectExt,
+};
+use memmap2::{Advice, MmapOptions};
 use once_cell::sync::Lazy;
 use ring::rand::SecureRandom;
 use rquickjs::{
@@ -31,12 +39,218 @@ use tokio::time::Instant;
 use tracing::trace;
 use zstd::{bulk::Decompressor, dict::DecoderDictionary};
 
-include!(concat!(env!("OUT_DIR"), "/bytecode_cache.rs"));
+//include!(concat!(env!("OUT_DIR"), "/../../../../bytecode_cache.rs"));
+
+/// Wraps a duplicate of `raw_fd` in a [`File`].
+///
+/// The descriptor is duplicated with `dup` first, so dropping the returned
+/// `File` closes only the duplicate and leaves `raw_fd` itself open.
+///
+/// # Errors
+///
+/// Returns the OS error reported by `dup` when duplication fails.
+#[cfg(unix)]
+fn file_from_raw_fd(raw_fd: i32) -> std::io::Result<File> {
+    use std::os::fd::FromRawFd;
+    // SAFETY: `dup` has no memory-safety preconditions; failure is signalled by -1.
+    let dup_fd = unsafe { libc::dup(raw_fd) };
+    if dup_fd == -1 {
+        return Err(std::io::Error::last_os_error());
+    }
+    // SAFETY: `dup_fd` was just created above and is owned by nothing else.
+    Ok(unsafe { File::from_raw_fd(dup_fd) })
+}
+
+/// Decodes the first four bytes of `bytes` as a little-endian `u32`.
+/// Panics if `bytes` holds fewer than four bytes.
+fn u32_from_le_byte_slice_unchecked(bytes: &[u8]) -> u32 {
+    u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]])
+}
+
+/// Same as above for a little-endian `u16` (needs at least two bytes).
+fn u16_from_le_byte_slice_unchecked(bytes: &[u8]) -> u16 {
+    u16::from_le_bytes([bytes[0], bytes[1]])
+}
+
+#[derive(Default)]
+struct BytecodeCache {
+    data: Vec<u8>,
+    map: HashMap<Box<str>, Range<usize>>,
+}
+
+impl BytecodeCache {
+    pub fn new(data: Vec<u8>, start_position: usize, length: usize, package_count: usize) -> Self {
+        let mut offset = start_position;
+        let mut map = HashMap::with_capacity(package_count);
+
+        loop {
+            let name_len_start = offset;
+            let name_len_end = offset + size_of::<u16>();
+            let name_len =
+                u16_from_le_byte_slice_unchecked(&data[name_len_start..name_len_end]) as usize;
+            let bytecode_pos_start = name_len_end + name_len;
+
+            let name =
+                unsafe { std::str::from_utf8_unchecked(&data[name_len_end..bytecode_pos_start]) };
+
+            let bytecode_pos_end = bytecode_pos_start + size_of::<u32>();
+            let bytecode_pos =
+                u32_from_le_byte_slice_unchecked(&data[bytecode_pos_start..bytecode_pos_end])
+                    as usize;
+
+            let bytecode_size_start = bytecode_pos_end;
+            let bytecode_size_end = bytecode_size_start + size_of::<u32>();
+            let bytecode_size =
+                u32_from_le_byte_slice_unchecked(&data[bytecode_size_start..bytecode_size_end])
+                    as usize;
 
-use crate::modules::{
-    console,
-    crypto::SYSTEM_RANDOM,
-    path::{dirname, join_path, resolve_path},
+            map.insert(name.into(), bytecode_pos..bytecode_pos + bytecode_size);
+
+            offset = bytecode_size_end;
+
+            if offset >= length - 1 {
+                break;
+            }
+        }
+
+        Self { data, map }
+    }
+
+    fn has(&self, name: &str) -> bool {
+        self.map.contains_key(name)
+    }
+
+    fn get(&self, name: &str) -> Option<&[u8]> {
+        self.map.get(name).map(|range| &self.data[range.clone()])
+    }
+}
+
+/// Lazily built cache of the embedded bytecode bundle.
+///
+/// The compressed bundle is mapped from the fd named by `LLRT_MEM_FD`, or from
+/// the executable itself when that variable is unset, using the range given by
+/// `LLRT_BYTECODE_OFFSET` / `LLRT_BYTECODE_SIZE`. The last four bytes of that
+/// range hold the little-endian uncompressed size; the rest is zstd data.
+///
+/// Panics on first access if the bundle cannot be read or decompressed.
+static EMBEDDED_BYTECODE_DATA: Lazy<RwLock<BytecodeCache>> = Lazy::new(|| {
+    let init = || {
+        let now = Instant::now();
+        trace!("Loading embedded bytecode");
+        let argv_0 = env::args().next().expect("Failed to get argv0");
+        let invalid = |msg: &'static str| io::Error::new(io::ErrorKind::InvalidData, msg);
+
+        let file = if let Ok(fd_string) = env::var("LLRT_MEM_FD") {
+            let mem_fd: i32 = fd_string.parse().map_err(|_| {
+                std::io::Error::new(std::io::ErrorKind::Other, "Invalid bytecode-cache fd")
+            })?;
+            trace!("Using raw memfd bytecode cache");
+            file_from_raw_fd(mem_fd)
+        } else {
+            File::open(argv_0)
+        }?;
+
+        let offset: u64 = if let Ok(offset_string) = env::var("LLRT_BYTECODE_OFFSET") {
+            offset_string.parse().map_err(|_| {
+                std::io::Error::new(std::io::ErrorKind::Other, "Invalid bytecode-cache offset")
+            })?
+        } else {
+            0
+        };
+
+        let size: usize = if let Ok(size_string) = env::var("LLRT_BYTECODE_SIZE") {
+            size_string.parse().map_err(|_| {
+                std::io::Error::new(std::io::ErrorKind::Other, "Invalid bytecode-cache size")
+            })?
+        } else {
+            file.metadata()?.size() as usize
+        };
+
+        let mmap = unsafe { MmapOptions::new().offset(offset).len(size).map(&file)? };
+        mmap.advise(Advice::Sequential).unwrap();
+        trace!("Bytecode bundle size: {}, offset: {}", size, offset);
+
+        // The trailing size field is mandatory: fail cleanly instead of underflowing.
+        let compressed_bytes_end_index = mmap
+            .len()
+            .checked_sub(size_of::<u32>())
+            .ok_or_else(|| invalid("Bytecode bundle too small"))?;
+        let uncompressed_size =
+            u32_from_le_byte_slice_unchecked(&mmap[compressed_bytes_end_index..]) as usize;
+        trace!("Uncompressed size: {}", uncompressed_size);
+
+        let mut bytecode_bundle = Vec::with_capacity(uncompressed_size);
+        let mut decompressor = Decompressor::with_prepared_dictionary(&DECOMPRESSOR_DICT)?;
+        decompressor
+            .decompress_to_buffer(&mmap[0..compressed_bytes_end_index], &mut bytecode_bundle)?;
+        trace!("Extraction took {:?}", now.elapsed());
+        drop(mmap);
+
+        let total_file_size = bytecode_bundle.len();
+        let signature_len = BYTECODE_EMBEDDED_SIGNATURE.len();
+
+        // No trailing signature means no index: use an empty cache. `ends_with`
+        // also handles bundles shorter than the signature without panicking.
+        if !bytecode_bundle.ends_with(BYTECODE_EMBEDDED_SIGNATURE) {
+            return Ok(RwLock::new(BytecodeCache::default()));
+        }
+
+        #[derive(Debug)]
+        struct EmbeddedMeta {
+            package_count: u32,
+            bytecode_pos: u32,
+            package_index_pos: u32,
+        }
+
+        // Three u32 fields precede the signature.
+        let embedded_meta_size = 3 * size_of::<u32>();
+        let meta_and_signature_size = embedded_meta_size + signature_len;
+        let meta_start = total_file_size
+            .checked_sub(meta_and_signature_size)
+            .ok_or_else(|| invalid("Bytecode metadata missing"))?;
+        let meta = &bytecode_bundle[meta_start..total_file_size - signature_len];
+
+        // Decode each field explicitly: the bytes are little endian and sit at an
+        // arbitrary offset, so reading them through a struct pointer may be unaligned.
+        let embedded_metadata = EmbeddedMeta {
+            package_count: u32_from_le_byte_slice_unchecked(&meta[0..4]),
+            bytecode_pos: u32_from_le_byte_slice_unchecked(&meta[4..8]),
+            package_index_pos: u32_from_le_byte_slice_unchecked(&meta[8..12]),
+        };
+        trace!("Metadata: {:?}", embedded_metadata);
+
+        let bytecode_pos = embedded_metadata.bytecode_pos as usize;
+        let start_position = embedded_metadata.package_index_pos as usize;
+        let end_pos = meta_start;
+        let length = end_pos - bytecode_pos;
+
+        trace!(
+            "Loading bytecode cache of {} kB",
+            (end_pos - bytecode_pos) / 1024
+        );
+
+        let bytecode_cache = BytecodeCache::new(
+            bytecode_bundle,
+            start_position,
+            length,
+            embedded_metadata.package_count as usize,
+        );
+
+        trace!("Building cache took: {:?}", now.elapsed());
+
+        io::Result::Ok(RwLock::new(bytecode_cache))
+    };
+
+    init().unwrap()
+});
+
+use crate::{
+    bytecode::BYTECODE_EMBEDDED_SIGNATURE,
+    modules::{
+        console,
+        crypto::SYSTEM_RANDOM,
+        path::{dirname, join_path, resolve_path},
+    },
 };
 
 use crate::{
@@ -56,8 +270,10 @@ pub fn uncompressed_size(input: &[u8]) -> StdResult<(usize, &[u8]), io::Error> {
     Ok((uncompressed_size, rest))
 }
 
-pub(crate) static COMPRESSION_DICT: &[u8] =
-    include_bytes!(concat!(env!("OUT_DIR"), "/compression.dict"));
+pub(crate) static COMPRESSION_DICT: &[u8] = include_bytes!(concat!(
+    env!("OUT_DIR"),
+    "/../../../llrt_bytecode/compression.dict"
+));
 
 static DECOMPRESSOR_DICT: Lazy<DecoderDictionary> =
     Lazy::new(|| DecoderDictionary::copy(COMPRESSION_DICT));
@@ -119,7 +335,9 @@ impl Resolver for BinaryResolver {
     fn resolve(&mut self, _ctx: &Ctx, base: &str, name: &str) -> Result<String> {
         trace!("Try resolve \"{}\" from \"{}\"", name, base);
 
-        if BYTECODE_CACHE.contains_key(name) {
+        let cache = EMBEDDED_BYTECODE_DATA.read().unwrap();
+
+        if cache.has(name) {
             return Ok(name.to_string());
         }
 
@@ -149,11 +367,11 @@ impl Resolver for BinaryResolver {
 
         trace!("Normalized path: {}, key: {}", normalized_path, cache_key);
 
-        if BYTECODE_CACHE.contains_key(cache_key) {
+        if cache.has(cache_key) {
             return Ok(cache_key.to_string());
         }
 
-        if BYTECODE_CACHE.contains_key(base) {
+        if cache.has(base) {
             normalized_path = name;
             if Path::new(normalized_path).exists() {
                 return Ok(normalized_path.to_string());
@@ -239,7 +457,10 @@ impl Loader for BinaryLoader {
     fn load<'js>(&mut self, ctx: &Ctx<'js>, name: &str) -> Result<Module<'js, Declared>> {
         trace!("Loading module: {}", name);
         let ctx = ctx.clone();
-        if let Some(bytes) = BYTECODE_CACHE.get(name) {
+
+        let cache = EMBEDDED_BYTECODE_DATA.read().unwrap();
+
+        if let Some(bytes) = cache.get(name) {
             trace!("Loading embedded module: {}", name);
 
             return load_bytecode_module(ctx, name, bytes);
@@ -410,7 +631,7 @@ impl Vm {
                     init_global(&ctx)?;
                 }
                 timers::init_timers(&ctx)?;
-                init(&ctx, module_names)?;
+                let _ = init(&ctx, module_names)?;
                 Ok(())
             })()
             .catch(&ctx)
@@ -593,8 +814,13 @@ fn init(ctx: &Ctx<'_>, module_names: HashSet<&'static str>) -> Result<()> {
             } else {
                 specifier
             };
-            let import_name = if module_names.contains(specifier.as_str())
-                || BYTECODE_CACHE.contains_key(&specifier)
+
+            let cache = EMBEDDED_BYTECODE_DATA.read().unwrap();
+
+            let specifier_ref = specifier.as_str();
+
+            let import_name = if module_names.contains(specifier_ref)
+                || cache.has(specifier_ref)
                 || specifier.starts_with('/')
             {
                 specifier
@@ -605,6 +831,8 @@ fn init(ctx: &Ctx<'_>, module_names: HashSet<&'static str>) -> Result<()> {
                 join_path(vec![import_directory, specifier])
             };
 
+            drop(cache);
+
             let mut map = require_in_progress.lock().unwrap();
             if let Some(obj) = map.get(&import_name) {
                 return Ok(obj.clone().into_value());
@@ -663,7 +891,12 @@ fn init(ctx: &Ctx<'_>, module_names: HashSet<&'static str>) -> Result<()> {
         }),
     )?;
 
-    () = Module::import(ctx, "@llrt/std")?.finish()?;
+    let mut opts = EvalOptions::default();
+    opts.global = false;
+    opts.strict = false;
+    opts.promise = true;
+
+    ctx.eval_with_options(include_str!("../../bundle/@llrt/std.js"), opts)?;
 
     Ok(())
 }
diff --git a/pack b/pack
index d0beac0549..0e3f43a313 100755
--- a/pack
+++ b/pack
@@ -4,9 +4,9 @@
 
 set -e
 
-if [ "$#" -ne 2 ]; then
+if [ "$#" -lt 2 ]; then
   echo "Error: Two file arguments are required."
-  echo "Usage: $0 input_file output_file"
+  echo "Usage: $0 input_file output_file [additional_file [additional_file_dict]]"
   exit 1
 fi
 
@@ -17,6 +17,13 @@ data_size=$(wc -c < "$1")
 
 srcfile="$1"
 dstfile="$2"
+additional_file="$3"
+additional_file_dict="$4"
+# Both extra arguments are optional; the additional file is only validated when one was given.
+if [ ! -z "$additional_file" ] && [ ! -f "$additional_file" ]; then
+    echo "Error: Additional file $additional_file does not exist."
+    exit 1
+fi
 
 # Check if the bytes start with \x7fELF
 if [[ $(hexdump -n 4 -e '4/1 "%02x"' $srcfile) != "7f454c46" ]]; then
@@ -37,14 +44,43 @@ else
     exit 1
 fi
 
-
 # Create a temporary directory to store the parts
 temp_folder=$(mktemp -d)
 
-# Split the file into parts and compress each part using lz4
+# Split the file into parts and compress each part using zstd
 split -n ${num_parts} $srcfile ${temp_folder}/part_
 
-compressed_data=""
+# Write each argument to stdout as one raw byte: printf renders two hex digits, xxd -r -p decodes them.
+function to_u8le() {
+    for num in "$@"; do
+        printf "%02x" $num |  xxd -r -p
+    done
+}
+
+# Write each argument to stdout as 4 raw bytes, little-endian (sed reverses the four hex byte pairs).
+function to_u32le() {
+    for num in "$@"; do
+        local le_hex=$(printf "%08x" $num | sed 's/\(..\)\(..\)\(..\)\(..\)/\4\3\2\1/')
+        printf "$le_hex" | xxd -r -p
+    done
+}
+
+# compress SRC DST [DICT]: zstd-compress SRC into DST at level 22, using dictionary DICT when given.
+function compress(){
+    local src=$1
+    local dst=$2
+    local dict=$3
+    # -D is only added to the zstd command line when a dictionary path was supplied.
+    local dict_args=""
+
+    if [ ! -z "$dict" ]; then
+        dict_args="-D ${dict}"
+    fi
+    # $dict_args is unquoted so "-D <path>" word-splits into two arguments and vanishes when empty.
+    zstd --ultra $dict_args -22 --no-check -f "$src" -o "${dst}"
+}
+
+compressed_file="${temp_folder}/compressed"
 
 input_sizes=()
 output_sizes=()
@@ -53,9 +89,10 @@ total_output_size=0
 working_dir=$(pwd)
 
 for file in ${temp_folder}/*; do
+    touch $compressed_file
     filename=$(basename $file)
     echo "Compressing $filename..."
-    (cd $(dirname $file) && zstd --ultra -22 --no-check -f "$filename" -o "${filename}.zst")
+    (cd $(dirname $file) && compress "$filename" "${filename}.zst")
     output_size=$(wc -c < "${file}")
     input_size=$(wc -c < "${file}.zst")
     input_sizes+=($input_size)
@@ -63,44 +100,48 @@ for file in ${temp_folder}/*; do
     total_input_size=$((total_input_size + input_size))
     total_output_size=$((total_output_size + output_size))
 
-    hex_data=$(hexdump -v -e '/1 " %02x"' "${file}.zst" | sed 's/ /\\\\x/g')
-
-    compressed_data="${compressed_data}${hex_data}"
-
+    cat "${file}.zst" >> $compressed_file
     rm "${file}"
 done
 
-function little_endian_hex()
-{
-    local hex_code=""
-    local number
 
-    for number in "$@"; do
-        byte=$(printf "%08x" $number | sed -r 's/(..)(..)(..)(..)/\4\3\2\1/')
-        hex_code="${hex_code}${byte}"
-    done
+cp "$1" "${1}.bak"
 
-    # Insert \x separators between bytes
-    hex_code=$(echo "$hex_code" | sed 's/../\\x&/g')
+echo "Compiling for $target..."
 
-    echo "$hex_code"
-}
+original_size=$(du -h $srcfile | cut -f1)
 
-INPUT_SIZES_STR=$(little_endian_hex ${input_sizes[@]})
-OUTPUT_SIZES_STR=$(little_endian_hex ${output_sizes[@]})
+zig cc -target $target -Wno-null-character -std=c99 -Wall -O3 -flto -s llrt/src/main.c -o $dstfile -Ilib -Llib/${arch} -static -lzstd
 
-cat <<EOF > ${temp_folder}/data.c
-const char *data = "$(printf "\\\x%02x" $num_parts)${INPUT_SIZES_STR}${OUTPUT_SIZES_STR}$(printf "$compressed_data")";
-EOF
+dst_size=$(wc -c < "${dstfile}")
 
-cp "$1" "${1}.bak"
-cp llrt/src/main.c $temp_folder/
 
-echo "Compiling..."
+echo "Done. Appending compressed binary..."
 
-original_size=$(du -h $srcfile | cut -f1)
+to_u8le $num_parts >> "${dstfile}"
+to_u32le ${input_sizes[@]} >> "${dstfile}"
+to_u32le ${output_sizes[@]} >> "${dstfile}"
+cat $compressed_file >> "${dstfile}"
+
+if [ ! -z "$additional_file" ]; then
+  echo  "Done. Appending additional data..."
+  filename=$(basename "$additional_file")
+  additional_size=$(wc -c < "$additional_file")
+  # additional_size is the uncompressed byte count; it is logged here and written to the trailer below.
+  echo "Compressing $additional_size..."
+  # Stage the payload (and the dictionary, if one was given) in the temp folder under bare file names.
+  cp "$additional_file" "${temp_folder}/${filename}"
+  if [ ! -z "$additional_file_dict" ]; then
+    cp "$additional_file_dict" "${temp_folder}/${filename}.dict"
+  fi
+  (cd "${temp_folder}" && compress "${filename}" "${filename}.zst" ${additional_file_dict:+"${filename}.dict"})
+  # Only pass the dictionary when one was staged; otherwise zstd would fail on a missing -D file.
+  cat "${temp_folder}/${filename}.zst" >> "${dstfile}"
+  # The compressed payload is followed by its uncompressed size as a little-endian u32.
+  to_u32le $additional_size >> "${dstfile}"
+fi
 
-zig cc -target $target -Wno-null-character -std=c99 -Wall -O3 -flto -s ${temp_folder}/main.c -o $dstfile -Ilib -Llib/${arch} -static -lzstd
+to_u32le $dst_size >> "${dstfile}"
 
 new_size=$(du -h $dstfile | cut -f1)