Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions eng/packages/General.props
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
<PackageVersion Include="Microsoft.Extensions.VectorData.Abstractions" Version="$(MicrosoftExtensionsVectorDataAbstractionsVersion)" />
<PackageVersion Include="Microsoft.IO.RecyclableMemoryStream" Version="3.0.0" />
<PackageVersion Include="Microsoft.ML.Tokenizers" Version="$(MicrosoftMLTokenizersVersion)" />
<PackageVersion Include="ModelContextProtocol.Core" Version="0.4.0-preview.3" />
<PackageVersion Include="Newtonsoft.Json" Version="13.0.3" />
<PackageVersion Include="OllamaSharp" Version="5.1.9" />
<PackageVersion Include="OpenAI" Version="2.6.0" />
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.Collections.Generic;
using System.IO;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Shared.Diagnostics;
using ModelContextProtocol.Client;
using ModelContextProtocol.Protocol;

namespace Microsoft.Extensions.DataIngestion;

/// <summary>
/// Reads documents by converting them to Markdown using the <see href="https://github.com/microsoft/markitdown">MarkItDown</see> MCP server.
/// </summary>
public class MarkItDownMcpReader : IngestionDocumentReader
{
private readonly Uri _mcpServerUri;
private readonly McpClientOptions? _options;

/// <summary>
/// Creates a reader that converts documents by calling the MarkItDown MCP server at the given endpoint.
/// </summary>
/// <param name="mcpServerUri">Endpoint of the MarkItDown MCP server, for example <c>http://localhost:3001/mcp</c>.</param>
/// <param name="options">MCP client options passed along when the client is created; may be <see langword="null"/>.</param>
/// <exception cref="ArgumentNullException"><paramref name="mcpServerUri"/> is <see langword="null"/>.</exception>
public MarkItDownMcpReader(Uri mcpServerUri, McpClientOptions? options = null)
{
    // Options are optional and stored as-is; only the endpoint is validated.
    _options = options;
    _mcpServerUri = Throw.IfNull(mcpServerUri);
}

/// <inheritdoc/>
/// <exception cref="FileNotFoundException">The file described by <paramref name="source"/> does not exist.</exception>
public override async Task<IngestionDocument> ReadAsync(FileInfo source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default)
{
    _ = Throw.IfNull(source);
    _ = Throw.IfNullOrEmpty(identifier);

    if (!source.Exists)
    {
        throw new FileNotFoundException("The specified file does not exist.", source.FullName);
    }

    // The whole file is loaded into memory because it is sent to the server inline as a base64 data URI.
#if NET
    byte[] fileBytes = await File.ReadAllBytesAsync(source.FullName, cancellationToken).ConfigureAwait(false);
#else
    // File.ReadAllBytesAsync is not used on these targets, so the file is copied through a MemoryStream.
    // The (Stream, int, CancellationToken) overload is used so that cancellation is honored here as well;
    // the single-argument overload would silently ignore the caller's token.
    const int CopyBufferSize = 81920;

    byte[] fileBytes;
    using (FileStream fs = new(source.FullName, FileMode.Open, FileAccess.Read, FileShare.Read, 1, FileOptions.Asynchronous))
    {
        using MemoryStream ms = new();
        await fs.CopyToAsync(ms, CopyBufferSize, cancellationToken).ConfigureAwait(false);
        fileBytes = ms.ToArray();
    }
#endif

    // A null or empty mediaType is replaced by a default inside CreateDataUri.
    string dataUri = CreateDataUri(fileBytes, mediaType);

    string markdown = await ConvertToMarkdownAsync(dataUri, cancellationToken).ConfigureAwait(false);

    return MarkdownParser.Parse(markdown, identifier);
}

/// <inheritdoc/>
public override async Task<IngestionDocument> ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default)
{
    _ = Throw.IfNull(source);
    _ = Throw.IfNullOrEmpty(identifier);

    // Buffer the stream in memory; its content is sent to the server inline as a base64 data URI.
    using MemoryStream ms = new();
#if NET
    await source.CopyToAsync(ms, cancellationToken).ConfigureAwait(false);
#else
    // Use the (Stream, int, CancellationToken) overload so that cancellation is honored on these targets too;
    // the single-argument overload would silently ignore the caller's token.
    const int CopyBufferSize = 81920;
    await source.CopyToAsync(ms, CopyBufferSize, cancellationToken).ConfigureAwait(false);
#endif

    // NOTE(review): ToArray copies the buffered bytes. A reviewer suggested that, once the data URI is built
    // with DataContent, a ReadOnlyMemory<byte> over ms.GetBuffer() / ms.Length could be passed instead — TODO follow-up.
    byte[] fileBytes = ms.ToArray();

    // A null or empty mediaType is replaced by a default inside CreateDataUri.
    string dataUri = CreateDataUri(fileBytes, mediaType);

    string markdown = await ConvertToMarkdownAsync(dataUri, cancellationToken).ConfigureAwait(false);

    return MarkdownParser.Parse(markdown, identifier);
}

#pragma warning disable S3995 // URI return values should not be strings
// Builds a "data:<media type>;base64,<payload>" URI string from raw bytes.
// fileBytes: the content to embed, encoded as base64.
// mediaType: the media type to advertise; null or empty selects "application/octet-stream".
// NOTE(review): a reviewer suggested replacing this helper with DataContent from
// Microsoft.Extensions.AI.Abstractions (new DataContent(bytes, mimeType).Uri) — TODO confirm and adopt in a follow-up.
private static string CreateDataUri(byte[] fileBytes, string? mediaType)
#pragma warning restore S3995 // URI return values should not be strings
{
    // Start from the generic binary type and override it only when the caller supplied one.
    string effectiveType = "application/octet-stream";
    if (!string.IsNullOrEmpty(mediaType))
    {
        effectiveType = mediaType!;
    }

    string payload = Convert.ToBase64String(fileBytes);

    return string.Concat("data:", effectiveType, ";base64,", payload);
}

/// <summary>
/// Connects to the configured MCP server, invokes its <c>convert_to_markdown</c> tool with the supplied
/// data URI, and returns the text of the first text content block in the tool result.
/// </summary>
/// <param name="dataUri">The document content encoded as a data URI; sent as the tool's <c>uri</c> argument.</param>
/// <param name="cancellationToken">Token forwarded to client creation and to the tool call.</param>
/// <returns>The text carried by the first text content block of the tool result.</returns>
/// <exception cref="InvalidOperationException">
/// The tool result is flagged as an error, or it contains no text content block.
/// </exception>
private async Task<string> ConvertToMarkdownAsync(string dataUri, CancellationToken cancellationToken)
{
    // A new HTTP transport and client are created for every conversion and disposed when it completes.
    HttpClientTransport transport = new(new HttpClientTransportOptions
    {
        Endpoint = _mcpServerUri
    });

    await using (transport.ConfigureAwait(false))
    {
        McpClient client = await McpClient.CreateAsync(transport, _options, loggerFactory: null, cancellationToken).ConfigureAwait(false);

        await using (client.ConfigureAwait(false))
        {
            Dictionary<string, object?> parameters = new()
            {
                ["uri"] = dataUri
            };

            var result = await client.CallToolAsync("convert_to_markdown", parameters, cancellationToken: cancellationToken).ConfigureAwait(false);

            // The result is expected to be in the format: { "content": [{ "type": "text", "text": "..." }] }
            string? text = null;
            if (result.Content is not null)
            {
                foreach (var content in result.Content)
                {
                    if (content.Type == "text" && content is TextContentBlock textBlock)
                    {
                        text = textBlock.Text;
                        break;
                    }
                }
            }

            // When the tool call failed, the text block carries the error description rather than Markdown.
            // Surface it as an exception instead of handing it to the Markdown parser as document content.
            if (result.IsError is true)
            {
                throw new InvalidOperationException(
                    string.IsNullOrEmpty(text)
                        ? "Failed to convert document to markdown: the MCP server reported an error."
                        : $"Failed to convert document to markdown: {text}");
            }

            if (text is not null)
            {
                return text;
            }
        }
    }

    throw new InvalidOperationException("Failed to convert document to markdown: unexpected response format from MCP server.");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

<ItemGroup>
<PackageReference Include="Markdig.Signed" />
<PackageReference Include="ModelContextProtocol.Core" />
</ItemGroup>

</Project>
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ Or directly in the C# project file:

## Usage Examples

### Creating a MarkItDownReader for Data Ingestion
### Creating a MarkItDownReader for Data Ingestion (Local Process)

Use `MarkItDownReader` to convert documents using the MarkItDown executable installed locally:

```csharp
using Microsoft.Extensions.DataIngestion;
Expand All @@ -31,6 +33,59 @@ IngestionDocumentReader reader =
using IngestionPipeline<string> pipeline = new(reader, CreateChunker(), CreateWriter());
```

### Creating a MarkItDownMcpReader for Data Ingestion (MCP Server)

Use `MarkItDownMcpReader` to convert documents using a MarkItDown MCP server:

```csharp
using Microsoft.Extensions.DataIngestion;

// Connect to a MarkItDown MCP server (e.g., running in Docker)
IngestionDocumentReader reader =
new MarkItDownMcpReader(new Uri("http://localhost:3001/mcp"));

using IngestionPipeline<string> pipeline = new(reader, CreateChunker(), CreateWriter());
```

The MarkItDown MCP server can be run using Docker:

```bash
docker run -p 3001:3001 mcp/markitdown --http --host 0.0.0.0 --port 3001
```

Or installed via pip:

```bash
pip install markitdown-mcp
markitdown-mcp --http --host 0.0.0.0 --port 3001
```

### Integrating with Aspire

Aspire can be used for seamless integration with [MarkItDown MCP](https://github.com/microsoft/markitdown/tree/main/packages/markitdown-mcp). Sample AppHost logic:

```csharp
var builder = DistributedApplication.CreateBuilder(args);

var markitdown = builder.AddContainer("markitdown", "mcp/markitdown")
.WithArgs("--http", "--host", "0.0.0.0", "--port", "3001")
.WithHttpEndpoint(targetPort: 3001, name: "http");

var webApp = builder.AddProject("name");

webApp.WithEnvironment("MARKITDOWN_MCP_URL", markitdown.GetEndpoint("http"));

builder.Build().Run();
```

Sample Ingestion Service:

```csharp
string url = $"{Environment.GetEnvironmentVariable("MARKITDOWN_MCP_URL")}/mcp";

IngestionDocumentReader reader = new MarkItDownMcpReader(new Uri(url));
```

## Feedback & Contributing

We welcome feedback and contributions in [our GitHub repo](https://github.com/dotnet/extensions).
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,6 @@
<PackageReference Include="OpenTelemetry.Exporter.InMemory" />
</ItemGroup>

<ItemGroup Condition="'$(TargetFrameworkIdentifier)' == '.NETFramework'">
<!-- Workaround https://github.com/microsoft/semantic-kernel/issues/13316 -->
<PackageReference Include="Microsoft.Bcl.AsyncInterfaces" VersionOverride="$(MicrosoftBclAsyncInterfacesVersion)" />
</ItemGroup>

<ItemGroup>
<Compile Include="..\Microsoft.Extensions.AI.Abstractions.Tests\TestChatClient.cs" />
<Compile Include="..\Microsoft.Extensions.AI.Abstractions.Tests\TestEmbeddingGenerator.cs" />
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.IO;
using System.Threading.Tasks;
using Xunit;

namespace Microsoft.Extensions.DataIngestion.Readers.Tests;

/// <summary>
/// Argument-validation tests for <see cref="MarkItDownMcpReader"/>. None of them needs a running MCP server.
/// </summary>
public class MarkItDownMcpReaderTests
{
    [Fact]
    public void Constructor_ThrowsWhenMcpServerUriIsNull()
        => Assert.Throws<ArgumentNullException>("mcpServerUri", () => new MarkItDownMcpReader(null!));

    [Fact]
    public async Task ReadAsync_ThrowsWhenIdentifierIsNull()
    {
        MarkItDownMcpReader sut = CreateReader();

        // File overload: null and empty identifiers are both rejected.
        await Assert.ThrowsAsync<ArgumentNullException>("identifier", () => sut.ReadAsync(new FileInfo("fileName.txt"), identifier: null!));
        await Assert.ThrowsAsync<ArgumentException>("identifier", () => sut.ReadAsync(new FileInfo("fileName.txt"), identifier: string.Empty));

        // Stream overload: same expectations.
        using var emptyStream = new MemoryStream();
        await Assert.ThrowsAsync<ArgumentNullException>("identifier", () => sut.ReadAsync(emptyStream, identifier: null!, mediaType: "some"));
        await Assert.ThrowsAsync<ArgumentException>("identifier", () => sut.ReadAsync(emptyStream, identifier: string.Empty, mediaType: "some"));
    }

    [Fact]
    public async Task ReadAsync_ThrowsWhenSourceIsNull()
    {
        MarkItDownMcpReader sut = CreateReader();

        await Assert.ThrowsAsync<ArgumentNullException>("source", () => sut.ReadAsync(null!, "identifier"));
        await Assert.ThrowsAsync<ArgumentNullException>("source", () => sut.ReadAsync((Stream)null!, "identifier", "mediaType"));
    }

    [Fact]
    public async Task ReadAsync_ThrowsWhenFileDoesNotExist()
    {
        MarkItDownMcpReader sut = CreateReader();

        // A random name under the temp directory that has not been created.
        string missingPath = Path.Combine(Path.GetTempPath(), Path.GetRandomFileName());
        FileInfo missingFile = new(missingPath);

        await Assert.ThrowsAsync<FileNotFoundException>(() => sut.ReadAsync(missingFile, "identifier"));
    }

    // NOTE: Integration tests against an actual MCP server are intentionally absent. They would need
    // a running MarkItDown MCP server, which is not part of the test setup. For full integration
    // testing, use a real MCP server in a separate test environment.

    // Builds a reader pointed at a local endpoint.
    private static MarkItDownMcpReader CreateReader()
        => new(new Uri("http://localhost:3001/sse"));
}
Loading