-
Notifications
You must be signed in to change notification settings - Fork 849
Add MarkItDownMcpReader for MCP server support #7025
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
3464cb2
9ee6a70
55eddcf
f91f70b
ab68e3a
b8e70dc
dceabf6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,135 @@ | ||
| // Licensed to the .NET Foundation under one or more agreements. | ||
| // The .NET Foundation licenses this file to you under the MIT license. | ||
|
|
||
| using System; | ||
| using System.Collections.Generic; | ||
| using System.IO; | ||
| using System.Threading; | ||
| using System.Threading.Tasks; | ||
| using Microsoft.Shared.Diagnostics; | ||
| using ModelContextProtocol.Client; | ||
| using ModelContextProtocol.Protocol; | ||
|
|
||
| namespace Microsoft.Extensions.DataIngestion; | ||
|
|
||
| /// <summary> | ||
| /// Reads documents by converting them to Markdown using the <see href="https://github.com/microsoft/markitdown">MarkItDown</see> MCP server. | ||
| /// </summary> | ||
| public class MarkItDownMcpReader : IngestionDocumentReader | ||
| { | ||
| private readonly Uri _mcpServerUri; | ||
| private readonly McpClientOptions? _options; | ||
|
|
||
/// <summary>
/// Creates a <see cref="MarkItDownMcpReader"/> that talks to the MarkItDown MCP server at the given endpoint.
/// </summary>
/// <param name="mcpServerUri">Endpoint of the MarkItDown MCP server, for example <c>http://localhost:3001/mcp</c>.</param>
/// <param name="options">MCP client options used when connecting, or <see langword="null"/> to pass none.</param>
/// <exception cref="ArgumentNullException"><paramref name="mcpServerUri"/> is <see langword="null"/>.</exception>
public MarkItDownMcpReader(Uri mcpServerUri, McpClientOptions? options = null)
{
    // The options are stored as-is; only the endpoint is mandatory.
    _options = options;
    _mcpServerUri = Throw.IfNull(mcpServerUri);
}
|
|
||
/// <inheritdoc/>
/// <remarks>
/// The whole file is loaded into memory, encoded as a base64 <c>data:</c> URI and handed to the MCP server's
/// <c>convert_to_markdown</c> tool. The Markdown that comes back is parsed into the returned document.
/// When <paramref name="mediaType"/> is <see langword="null"/> or empty, <c>application/octet-stream</c> is used.
/// </remarks>
/// <exception cref="FileNotFoundException"><paramref name="source"/> does not exist.</exception>
public override async Task<IngestionDocument> ReadAsync(FileInfo source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default)
{
    _ = Throw.IfNull(source);
    _ = Throw.IfNullOrEmpty(identifier);

    if (!source.Exists)
    {
        throw new FileNotFoundException("The specified file does not exist.", source.FullName);
    }

    // Read file content as base64 data URI
#if NET
    byte[] fileBytes = await File.ReadAllBytesAsync(source.FullName, cancellationToken).ConfigureAwait(false);
#else
    // Same value Stream.CopyToAsync uses when no buffer size is given.
    const int CopyBufferSize = 81920;

    byte[] fileBytes;

    // NOTE(review): bufferSize 1 presumably keeps FileStream from adding its own buffer on top of the copy buffer - confirm.
    using (FileStream fs = new(source.FullName, FileMode.Open, FileAccess.Read, FileShare.Read, 1, FileOptions.Asynchronous))
    {
        using MemoryStream ms = new();

        // Use the overload that takes a CancellationToken so cancellation is honored on this target too;
        // the single-argument overload silently ignored the caller's token.
        await fs.CopyToAsync(ms, CopyBufferSize, cancellationToken).ConfigureAwait(false);
        fileBytes = ms.ToArray();
    }
#endif
    string dataUri = CreateDataUri(fileBytes, mediaType);

    string markdown = await ConvertToMarkdownAsync(dataUri, cancellationToken).ConfigureAwait(false);

    return MarkdownParser.Parse(markdown, identifier);
}
|
|
||
/// <inheritdoc/>
/// <remarks>
/// The stream is drained into memory from its current position, encoded as a base64 <c>data:</c> URI and handed
/// to the MCP server's <c>convert_to_markdown</c> tool. The Markdown that comes back is parsed into the
/// returned document. The stream itself is not disposed by this method.
/// </remarks>
public override async Task<IngestionDocument> ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default)
{
    _ = Throw.IfNull(source);
    _ = Throw.IfNullOrEmpty(identifier);

    // Read stream content as base64 data URI
    using MemoryStream ms = new();
#if NET
    await source.CopyToAsync(ms, cancellationToken).ConfigureAwait(false);
#else
    // Same value Stream.CopyToAsync uses when no buffer size is given.
    const int CopyBufferSize = 81920;

    // Use the overload that takes a CancellationToken so cancellation is honored on this target too;
    // the single-argument overload silently ignored the caller's token.
    await source.CopyToAsync(ms, CopyBufferSize, cancellationToken).ConfigureAwait(false);
#endif

    // TODO(review feedback): once this switches to DataContent, which accepts a ReadOnlyMemory<byte>,
    // build the memory from ms.GetBuffer() and ms.Length instead of copying the bytes with ToArray().
    byte[] fileBytes = ms.ToArray();

    string dataUri = CreateDataUri(fileBytes, mediaType);

    string markdown = await ConvertToMarkdownAsync(dataUri, cancellationToken).ConfigureAwait(false);

    return MarkdownParser.Parse(markdown, identifier);
}
|
|
||
// TODO(review feedback): Microsoft.Extensions.AI.Abstractions is already referenced (indirectly) and provides
// a DataContent type that supports data URIs. Reviewers asked for a follow-up PR that uses DataContent
// instead of building the URI string by hand here.
#pragma warning disable S3995 // URI return values should not be strings
/// <summary>
/// Builds a base64 <c>data:</c> URI from raw bytes and a media type.
/// </summary>
/// <param name="fileBytes">The content to embed in the URI.</param>
/// <param name="mediaType">The media type to declare; <c>application/octet-stream</c> is used when it is <see langword="null"/> or empty.</param>
/// <returns>A string of the form <c>data:{mediaType};base64,{content}</c>.</returns>
private static string CreateDataUri(byte[] fileBytes, string? mediaType)
#pragma warning restore S3995 // URI return values should not be strings
{
    // Fall back to the generic binary media type when the caller did not supply one.
    string effectiveMediaType = mediaType is { Length: > 0 } ? mediaType : "application/octet-stream";

    return string.Concat("data:", effectiveMediaType, ";base64,", Convert.ToBase64String(fileBytes));
}
|
|
||
/// <summary>
/// Connects to the configured MCP server, calls its <c>convert_to_markdown</c> tool with the given data URI
/// and returns the text of the first text content block in the response.
/// </summary>
/// <param name="dataUri">The document encoded as a <c>data:</c> URI; sent as the tool's <c>uri</c> argument.</param>
/// <param name="cancellationToken">Token forwarded to client creation and to the tool call.</param>
/// <returns>The Markdown text returned by the server.</returns>
/// <exception cref="InvalidOperationException">
/// The tool result is flagged as an error, or the response contains no text content block.
/// </exception>
private async Task<string> ConvertToMarkdownAsync(string dataUri, CancellationToken cancellationToken)
{
    // Create HTTP client transport for MCP. A new transport and client are created for each conversion
    // and disposed before this method returns.
    HttpClientTransport transport = new(new HttpClientTransportOptions
    {
        Endpoint = _mcpServerUri
    });

    await using (transport.ConfigureAwait(false))
    {
        // Create MCP client
        McpClient client = await McpClient.CreateAsync(transport, _options, loggerFactory: null, cancellationToken).ConfigureAwait(false);

        await using (client.ConfigureAwait(false))
        {
            // Build parameters for convert_to_markdown tool
            Dictionary<string, object?> parameters = new()
            {
                ["uri"] = dataUri
            };

            // Call the convert_to_markdown tool
            var result = await client.CallToolAsync("convert_to_markdown", parameters, cancellationToken: cancellationToken).ConfigureAwait(false);

            // Extract the first text block from the result.
            // The result is expected to be in the format: { "content": [{ "type": "text", "text": "markdown content" }] }
            string? text = null;
            if (result.Content is not null)
            {
                foreach (var content in result.Content)
                {
                    if (content is TextContentBlock { Type: "text" } textBlock)
                    {
                        text = textBlock.Text;
                        break;
                    }
                }
            }

            // A tool-level failure is reported through the result's error flag, with the failure description
            // carried in the content. Without this check that description would be returned to the caller
            // and parsed as if it were the document's Markdown.
            if (result.IsError == true)
            {
                throw new InvalidOperationException(
                    text is null
                        ? "Failed to convert document to markdown: the MCP server reported a tool error."
                        : $"Failed to convert document to markdown: the MCP server reported a tool error: {text}");
            }

            if (text is not null)
            {
                return text;
            }
        }
    }

    throw new InvalidOperationException("Failed to convert document to markdown: unexpected response format from MCP server.");
}
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,53 @@ | ||
| // Licensed to the .NET Foundation under one or more agreements. | ||
| // The .NET Foundation licenses this file to you under the MIT license. | ||
|
|
||
| using System; | ||
| using System.IO; | ||
| using System.Threading.Tasks; | ||
| using Xunit; | ||
|
|
||
| namespace Microsoft.Extensions.DataIngestion.Readers.Tests; | ||
|
|
||
| public class MarkItDownMcpReaderTests | ||
| { | ||
// The constructor must reject a missing server endpoint and name the offending parameter.
[Fact]
public void Constructor_ThrowsWhenMcpServerUriIsNull()
{
    ArgumentNullException thrown = Assert.Throws<ArgumentNullException>(() => new MarkItDownMcpReader(null!));

    Assert.Equal("mcpServerUri", thrown.ParamName);
}
|
|
||
// Both ReadAsync overloads must reject a null identifier (ArgumentNullException) and an empty one (ArgumentException).
[Fact]
public async Task ReadAsync_ThrowsWhenIdentifierIsNull()
{
    MarkItDownMcpReader reader = new(new Uri("http://localhost:3001/sse"));

    // File overload.
    await Assert.ThrowsAsync<ArgumentNullException>("identifier", () => reader.ReadAsync(new FileInfo("fileName.txt"), identifier: null!));
    await Assert.ThrowsAsync<ArgumentException>("identifier", () => reader.ReadAsync(new FileInfo("fileName.txt"), identifier: string.Empty));

    // Stream overload.
    using MemoryStream stream = new();
    await Assert.ThrowsAsync<ArgumentNullException>("identifier", () => reader.ReadAsync(stream, identifier: null!, mediaType: "some"));
    await Assert.ThrowsAsync<ArgumentException>("identifier", () => reader.ReadAsync(stream, identifier: string.Empty, mediaType: "some"));
}
|
|
||
// Both ReadAsync overloads must reject a null source and name the parameter "source".
[Fact]
public async Task ReadAsync_ThrowsWhenSourceIsNull()
{
    MarkItDownMcpReader reader = new(new Uri("http://localhost:3001/sse"));

    // File overload (two arguments, so the stream overload is not applicable).
    await Assert.ThrowsAsync<ArgumentNullException>("source", () => reader.ReadAsync(null!, "identifier"));

    // Stream overload.
    await Assert.ThrowsAsync<ArgumentNullException>("source", () => reader.ReadAsync((Stream)null!, "identifier", "mediaType"));
}
|
|
||
// A path that was never created must surface as FileNotFoundException from the file overload.
[Fact]
public async Task ReadAsync_ThrowsWhenFileDoesNotExist()
{
    MarkItDownMcpReader reader = new(new Uri("http://localhost:3001/sse"));
    string missingPath = Path.Combine(Path.GetTempPath(), Path.GetRandomFileName());
    FileInfo nonExistentFile = new(missingPath);

    await Assert.ThrowsAsync<FileNotFoundException>(() => reader.ReadAsync(nonExistentFile, "identifier"));
}
|
|
||
| // NOTE: Integration tests with an actual MCP server would go here, but they would require | ||
| // a running MarkItDown MCP server to be available, which is not part of the test setup. | ||
| // For full integration testing, use a real MCP server in a separate test environment. | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.