Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions packages/markitdown/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ dependencies = [
"pathvalidate",
"charset-normalizer",
"openai",
"ebooklib",
"azure-ai-documentintelligence",

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@

"azure-identity"
]
Expand Down
2 changes: 2 additions & 0 deletions packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
OutlookMsgConverter,
ZipConverter,
DocumentIntelligenceConverter,
EpubConverter,
)

from ._exceptions import (
Expand Down Expand Up @@ -142,6 +143,7 @@ def enable_builtins(self, **kwargs) -> None:
self.register_converter(IpynbConverter())
self.register_converter(PdfConverter())
self.register_converter(OutlookMsgConverter())
self.register_converter(EpubConverter())

# Register Document Intelligence converter at the top of the stack if endpoint is provided
docintel_endpoint = kwargs.get("docintel_endpoint")
Expand Down
2 changes: 2 additions & 0 deletions packages/markitdown/src/markitdown/converters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from ._outlook_msg_converter import OutlookMsgConverter
from ._zip_converter import ZipConverter
from ._doc_intel_converter import DocumentIntelligenceConverter
from ._epub_converter import EpubConverter

__all__ = [
"DocumentConverter",
Expand All @@ -42,4 +43,5 @@
"OutlookMsgConverter",
"ZipConverter",
"DocumentIntelligenceConverter",
"EpubConverter",
]
59 changes: 59 additions & 0 deletions packages/markitdown/src/markitdown/converters/_epub_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from typing import Any

from ebooklib import epub, ITEM_DOCUMENT

from ._base import DocumentConverter, DocumentConverterResult
from ._html_converter import HtmlConverter

class EpubConverter(DocumentConverter):
    """Converts EPUB files to Markdown.

    The output starts with the book's author and description (when the
    Dublin Core metadata provides them), followed by every document item of
    the book converted to Markdown through ``HtmlConverter``.
    """

    @staticmethod
    def _first_metadata(book: Any, name: str) -> Any:
        """Return the first Dublin Core metadata value for *name*, or None.

        ``book.get_metadata("DC", name)`` yields a sequence of entries; the
        value itself is the first element of an entry.
        """
        entries = book.get_metadata("DC", name)
        if not entries:
            return None
        return entries[0][0]

    def convert(self, local_path: str, **kwargs: Any) -> DocumentConverterResult:
        """Convert an EPUB file to Markdown.

        Args:
            local_path: Path to the EPUB file.
            **kwargs: Additional arguments. Only ``file_extension`` is read;
                it decides whether this converter handles the file.

        Returns:
            A DocumentConverterResult whose ``title`` is the book title (or
            None when the book has none) and whose ``text_content`` is the
            metadata header followed by the converted documents, separated
            by blank lines. Returns None when ``file_extension`` does not
            end with ".epub".

        Raises:
            Nothing is caught here: errors raised by ebooklib while reading
            the file, or by decoding a document as UTF-8, propagate unchanged.
        """
        # Treat a missing *or* explicitly-None extension as "not an EPUB"
        # instead of failing on None.lower().
        file_ext = (kwargs.get("file_extension") or "").lower()
        if not file_ext.endswith(".epub"):
            return None

        book = epub.read_epub(local_path)

        # Look each metadata field up once.
        result = DocumentConverterResult(title=self._first_metadata(book, "title"))
        author = self._first_metadata(book, "creator")
        description = self._first_metadata(book, "description")

        # Metadata header; empty or missing values are skipped so the output
        # never contains a literal "Author: None".
        metadata_md = []
        if author:
            metadata_md.append(f"Author: {author}")
        if description:
            metadata_md.append(f"\n{description}")

        # One HtmlConverter is enough for all documents in the book.
        html_converter = HtmlConverter()

        # NOTE(review): documents are emitted in the order ebooklib yields
        # them, which may differ from the spine (reading) order - confirm
        # whether spine ordering is wanted.
        content_md = []
        for item in book.get_items_of_type(ITEM_DOCUMENT):
            content = item.get_content().decode("utf-8")
            html_result = html_converter._convert(content)
            if html_result and html_result.text_content:
                content_md.append(html_result.text_content)

        # Combine all parts
        result.text_content = "\n\n".join(metadata_md + content_md)

        return result
9 changes: 9 additions & 0 deletions packages/markitdown/src/markitdown/converters/_markdownify.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,5 +87,14 @@ def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:

return "![%s](%s%s)" % (alt, src, title_part)

def convert_em(self, el: Any, text: str, convert_as_inline: bool) -> str:
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I noticed that the converter doesn't handle the `<em>` tag, which is used in EPUB files as far as I know.

"""Convert emphasized text to Markdown format using underscores."""
if not text:
return ''
prefix, suffix, text = markdownify.chomp(text) # type: ignore
if not text:
return ''
return '%s_%s_%s' % (prefix, text, suffix)

def convert_soup(self, soup: Any) -> str:
    """Render *soup* to Markdown by delegating to the parent class."""
    markdown = super().convert_soup(soup)  # type: ignore
    return markdown
Binary file added packages/markitdown/tests/test_files/test.epub
Binary file not shown.
20 changes: 20 additions & 0 deletions packages/markitdown/tests/test_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,19 @@
"5bda1dd6",
]


# Substrings expected in the Markdown generated from the EPUB test file.
EPUB_TEST_STRINGS = [
    # Metadata header
    "Author: Test Author",
    "A test EPUB document for MarkItDown testing",
    # First chapter: heading, bold text and bullet list
    "# Chapter 1: Test Content",
    "This is a **test** paragraph with some formatting",
    "* A bullet point",
    "* Another point",
    # Second chapter: heading, emphasis and blockquote
    "# Chapter 2: More Content",
    "_different_ style",
    "> This is a blockquote for testing",
]

JSON_TEST_STRINGS = [
"5b64c88c-b3c3-4510-bcb8-da0b200602d8",
"9700dc99-6685-40b4-9a3a-5e406dcb37f3",
Expand Down Expand Up @@ -192,6 +205,13 @@ def test_markitdown_remote() -> None:
def test_markitdown_local() -> None:
markitdown = MarkItDown()

# Test EPUB processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.epub"))
assert result.title == "Test EPUB Document"
for test_string in EPUB_TEST_STRINGS:
text_content = result.text_content.replace("\\", "")
assert test_string in text_content

# Test XLSX processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
validate_strings(result, XLSX_TEST_STRINGS)
Expand Down