microsoft · 0xRaduan · Dec 18, 2024 · Dec 18, 2024 · Dec 19, 2024 · Jan 9, 2025
diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml
@@ -42,6 +42,7 @@ dependencies = [
   "pathvalidate",
   "charset-normalizer",
   "openai",
+  "ebooklib",
   "azure-ai-documentintelligence",
   "azure-identity"
 ]

diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
@@ -36,6 +36,7 @@
     OutlookMsgConverter,
     ZipConverter,
     DocumentIntelligenceConverter,
+    EpubConverter,
 )
 
 from ._exceptions import (
@@ -142,6 +143,7 @@ def enable_builtins(self, **kwargs) -> None:
             self.register_converter(IpynbConverter())
             self.register_converter(PdfConverter())
             self.register_converter(OutlookMsgConverter())
+            self.register_converter(EpubConverter())
 
             # Register Document Intelligence converter at the top of the stack if endpoint is provided
             docintel_endpoint = kwargs.get("docintel_endpoint")

diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py
@@ -20,6 +20,7 @@
 from ._outlook_msg_converter import OutlookMsgConverter
 from ._zip_converter import ZipConverter
 from ._doc_intel_converter import DocumentIntelligenceConverter
+from ._epub_converter import EpubConverter
 
 __all__ = [
     "DocumentConverter",
@@ -42,4 +43,5 @@
     "OutlookMsgConverter",
     "ZipConverter",
     "DocumentIntelligenceConverter",
+    "EpubConverter",
 ]
diff --git a/packages/markitdown/src/markitdown/converters/_epub_converter.py b/packages/markitdown/src/markitdown/converters/_epub_converter.py
@@ -0,0 +1,59 @@
+from typing import Any
+
+from ebooklib import epub, ITEM_DOCUMENT
+
+from ._base import DocumentConverter, DocumentConverterResult
+from ._html_converter import HtmlConverter
+
+class EpubConverter(DocumentConverter):
+    """Converts EPUB files to Markdown: metadata first, then each document item."""
+
+    def convert(self, local_path: str, **kwargs: Any) -> DocumentConverterResult:
+        """Convert an EPUB file to Markdown.
+
+        Args:
+            local_path: Path to the EPUB file.
+            **kwargs: Only ``file_extension`` is read; it must end with
+                ``.epub`` (case-insensitive) for this converter to run.
+
+        Returns:
+            A DocumentConverterResult titled with the first DC title (None if
+            absent). Its text is the optional "Author: ..." and description
+            lines followed by every ITEM_DOCUMENT item rendered through
+            HtmlConverter, joined by blank lines. Returns None, without
+            raising, when the extension is not ``.epub``.
+        """
+        # A missing or None extension is treated like any non-EPUB extension.
+        file_ext = (kwargs.get("file_extension") or "").lower()
+        if not file_ext.endswith(".epub"):
+            return None
+
+        book = epub.read_epub(local_path)
+
+        # Look up each Dublin Core field once; every entry is indexed with
+        # [0][0] to take the first entry's value.
+        titles = book.get_metadata("DC", "title")
+        creators = book.get_metadata("DC", "creator")
+        descriptions = book.get_metadata("DC", "description")
+
+        result = DocumentConverterResult(title=titles[0][0] if titles else None)
+
+        # Metadata lines lead the output.
+        metadata_md = []
+        if creators:
+            metadata_md.append(f"Author: {creators[0][0]}")
+        if descriptions:
+            metadata_md.append(f"\n{descriptions[0][0]}")
+
+        # One HtmlConverter is reused for all items instead of one per item.
+        html_converter = HtmlConverter()
+        content_md = []
+        for item in book.get_items():
+            if item.get_type() != ITEM_DOCUMENT:
+                continue
+            html_result = html_converter._convert(item.get_content().decode("utf-8"))
+            if html_result and html_result.text_content:
+                content_md.append(html_result.text_content)
+
+        result.text_content = "\n\n".join(metadata_md + content_md)
+        return result
diff --git a/packages/markitdown/src/markitdown/converters/_markdownify.py b/packages/markitdown/src/markitdown/converters/_markdownify.py
@@ -87,5 +87,14 @@ def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
 
         return "![%s](%s%s)" % (alt, src, title_part)
 
+    def convert_em(self, el: Any, text: str, convert_as_inline: bool) -> str:
+        """Wrap emphasized text in underscores, keeping chomp()'s prefix/suffix outside."""
+        # Empty input produces no markup at all.
+        if not text:
+            return ""
+        lead, trail, body = markdownify.chomp(text)  # type: ignore
+        # If nothing is left after chomp(), emit nothing rather than "__".
+        return f"{lead}_{body}_{trail}" if body else ""
+
     def convert_soup(self, soup: Any) -> str:
         return super().convert_soup(soup)  # type: ignore
diff --git a/packages/markitdown/tests/test_files/test.epub b/packages/markitdown/tests/test_files/test.epub
diff --git a/packages/markitdown/tests/test_markitdown.py b/packages/markitdown/tests/test_markitdown.py
@@ -145,6 +145,19 @@
     "5bda1dd6",
 ]
 
+
+EPUB_TEST_STRINGS = [  # expected substrings of the converted test.epub
+    "Author: Test Author",
+    "A test EPUB document for MarkItDown testing",
+    "# Chapter 1: Test Content",
+    "This is a **test** paragraph with some formatting",
+    "* A bullet point",
+    "* Another point",
+    "# Chapter 2: More Content",
+    "_different_ style",
+    "> This is a blockquote for testing",
+]
+
 JSON_TEST_STRINGS = [
     "5b64c88c-b3c3-4510-bcb8-da0b200602d8",
     "9700dc99-6685-40b4-9a3a-5e406dcb37f3",
@@ -192,6 +205,13 @@ def test_markitdown_remote() -> None:
 def test_markitdown_local() -> None:
     markitdown = MarkItDown()
 
+    # Test EPUB processing (backslashes are stripped once before matching)
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.epub"))
+    assert result.title == "Test EPUB Document"
+    text_content = result.text_content.replace("\\", "")
+    for test_string in EPUB_TEST_STRINGS:
+        assert test_string in text_content
+
     # Test XLSX processing
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
     validate_strings(result, XLSX_TEST_STRINGS)