-
Notifications
You must be signed in to change notification settings - Fork 849
Add MarkItDownMcpReader for MCP server support #7025
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 4 commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
3464cb2
Initial plan
Copilot 9ee6a70
Add MarkItDownMcpReader for MCP server support
Copilot 55eddcf
Add unit tests for MarkItDownMcpReader
Copilot f91f70b
Fix compiler warnings in MCP reader and tests
Copilot ab68e3a
Address PR feedback: upgrade to ModelContextProtocol.Core 0.4.0-previ…
Copilot b8e70dc
Fix package downgrade: add Microsoft.Bcl.AsyncInterfaces reference fo…
Copilot dceabf6
solve the build failures
adamsitnik File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
125 changes: 125 additions & 0 deletions
125
src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/MarkItDownMcpReader.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,125 @@ | ||
| // Licensed to the .NET Foundation under one or more agreements. | ||
| // The .NET Foundation licenses this file to you under the MIT license. | ||
|
|
||
| using System; | ||
| using System.Collections.Generic; | ||
| using System.IO; | ||
| using System.Threading; | ||
| using System.Threading.Tasks; | ||
| using Microsoft.Shared.Diagnostics; | ||
| using ModelContextProtocol.Client; | ||
| using ModelContextProtocol.Protocol; | ||
|
|
||
| namespace Microsoft.Extensions.DataIngestion; | ||
|
|
||
| /// <summary> | ||
| /// Reads documents by converting them to Markdown using the <see href="https://github.com/microsoft/markitdown">MarkItDown</see> MCP server. | ||
| /// </summary> | ||
| public class MarkItDownMcpReader : IngestionDocumentReader | ||
| { | ||
| private readonly Uri _mcpServerUri; | ||
|
|
||
| /// <summary> | ||
| /// Initializes a new instance of the <see cref="MarkItDownMcpReader"/> class. | ||
| /// </summary> | ||
| /// <param name="mcpServerUri">The URI of the MarkItDown MCP server (e.g., http://localhost:3001/sse).</param> | ||
| /// <exception cref="ArgumentNullException"><paramref name="mcpServerUri"/> is <see langword="null"/>.</exception> | ||
adamsitnik marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| public MarkItDownMcpReader(Uri mcpServerUri) | ||
adamsitnik marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| { | ||
| _mcpServerUri = Throw.IfNull(mcpServerUri); | ||
| } | ||
|
|
||
| /// <inheritdoc/> | ||
| public override async Task<IngestionDocument> ReadAsync(FileInfo source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default) | ||
| { | ||
| _ = Throw.IfNull(source); | ||
| _ = Throw.IfNullOrEmpty(identifier); | ||
|
|
||
| if (!source.Exists) | ||
| { | ||
| throw new FileNotFoundException("The specified file does not exist.", source.FullName); | ||
| } | ||
|
|
||
| // Read file content as base64 data URI | ||
| #if NET | ||
| byte[] fileBytes = await File.ReadAllBytesAsync(source.FullName, cancellationToken).ConfigureAwait(false); | ||
| #else | ||
| byte[] fileBytes; | ||
| using (FileStream fs = new(source.FullName, FileMode.Open, FileAccess.Read, FileShare.Read)) | ||
adamsitnik marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| { | ||
| using MemoryStream ms = new(); | ||
| // Pass the token through so the read can be cancelled on this target as well; | ||
| // 81920 is the buffer size CopyToAsync(Stream) uses by default. | ||
| await fs.CopyToAsync(ms, 81920, cancellationToken).ConfigureAwait(false); | ||
| fileBytes = ms.ToArray(); | ||
| } | ||
| #endif | ||
| string base64Content = Convert.ToBase64String(fileBytes); | ||
| // Fall back to a generic binary media type when the caller did not supply one. | ||
| string mimeType = string.IsNullOrEmpty(mediaType) ? "application/octet-stream" : mediaType!; | ||
| string dataUri = $"data:{mimeType};base64,{base64Content}"; | ||
adamsitnik marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| string markdown = await ConvertToMarkdownAsync(dataUri, cancellationToken).ConfigureAwait(false); | ||
|
|
||
| return MarkdownParser.Parse(markdown, identifier); | ||
| } | ||
|
|
||
| /// <inheritdoc/> | ||
| public override async Task<IngestionDocument> ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default) | ||
| { | ||
| _ = Throw.IfNull(source); | ||
| _ = Throw.IfNullOrEmpty(identifier); | ||
adamsitnik marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| // Read stream content as base64 data URI | ||
| using MemoryStream ms = new(); | ||
| #if NET | ||
| await source.CopyToAsync(ms, cancellationToken).ConfigureAwait(false); | ||
| #else | ||
| // Pass the token through so the copy can be cancelled on this target as well; | ||
| // 81920 is the buffer size CopyToAsync(Stream) uses by default. | ||
| await source.CopyToAsync(ms, 81920, cancellationToken).ConfigureAwait(false); | ||
| #endif | ||
| byte[] fileBytes = ms.ToArray(); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @copilot, once you switch to using DataContent, it accepts a Readonlymemory, so rather than using ToArray, you can pass in a Readonlymemory created from MemoryStream's GetBuffer and Length |
||
| string base64Content = Convert.ToBase64String(fileBytes); | ||
| // Fall back to a generic binary media type when the caller supplied an empty one. | ||
| string mimeType = string.IsNullOrEmpty(mediaType) ? "application/octet-stream" : mediaType; | ||
| string dataUri = $"data:{mimeType};base64,{base64Content}"; | ||
|
|
||
| string markdown = await ConvertToMarkdownAsync(dataUri, cancellationToken).ConfigureAwait(false); | ||
|
|
||
| return MarkdownParser.Parse(markdown, identifier); | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Calls the <c>convert_to_markdown</c> tool on the configured MCP server and returns the first text block of its response. | ||
| /// </summary> | ||
| /// <param name="dataUri">The document content encoded as a base64 <c>data:</c> URI.</param> | ||
| /// <param name="cancellationToken">The token used to cancel client creation and the tool call.</param> | ||
| /// <returns>The markdown text returned by the server.</returns> | ||
| /// <exception cref="InvalidOperationException">The server reported a tool error, or the response contained no text content.</exception> | ||
| private async Task<string> ConvertToMarkdownAsync(string dataUri, CancellationToken cancellationToken) | ||
| { | ||
| // Create HTTP client transport for MCP | ||
| #pragma warning disable CA2007 // Consider calling ConfigureAwait on the awaited task - await using pattern | ||
| await using var transport = new HttpClientTransport(new HttpClientTransportOptions | ||
| { | ||
| Endpoint = _mcpServerUri | ||
| }); | ||
| #pragma warning restore CA2007 | ||
|
|
||
| // Create MCP client | ||
| #pragma warning disable CA2007 // Consider calling ConfigureAwait on the awaited task | ||
| await using var client = await McpClient.CreateAsync(transport, cancellationToken: cancellationToken).ConfigureAwait(false); | ||
| #pragma warning restore CA2007 // Consider calling ConfigureAwait on the awaited task | ||
adamsitnik marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| // Build parameters for convert_to_markdown tool | ||
| var parameters = new Dictionary<string, object?> | ||
| { | ||
| ["uri"] = dataUri | ||
| }; | ||
|
|
||
| // Call the convert_to_markdown tool | ||
| var result = await client.CallToolAsync("convert_to_markdown", parameters, cancellationToken: cancellationToken).ConfigureAwait(false); | ||
|
|
||
| // A failed tool call is reported through the result's error flag, with the error text carried | ||
| // in the content blocks. Reject it here so an error message is never parsed as the document. | ||
| if (result.IsError is true) | ||
| { | ||
| throw new InvalidOperationException("Failed to convert document to markdown: the MCP server reported a tool execution error."); | ||
| } | ||
|
|
||
| // Extract markdown content from result | ||
| // The result is expected to be in the format: { "content": [{ "type": "text", "text": "markdown content" }] } | ||
| if (result.Content != null && result.Content.Count > 0) | ||
| { | ||
| foreach (var content in result.Content) | ||
| { | ||
| if (content.Type == "text" && content is TextContentBlock textBlock) | ||
| { | ||
| return textBlock.Text; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| throw new InvalidOperationException("Failed to convert document to markdown: unexpected response format from MCP server."); | ||
adamsitnik marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
53 changes: 53 additions & 0 deletions
53
test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Readers/MarkItDownMcpReaderTests.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,53 @@ | ||
| // Licensed to the .NET Foundation under one or more agreements. | ||
| // The .NET Foundation licenses this file to you under the MIT license. | ||
|
|
||
| using System; | ||
| using System.IO; | ||
| using System.Threading.Tasks; | ||
| using Xunit; | ||
|
|
||
| namespace Microsoft.Extensions.DataIngestion.Readers.Tests; | ||
|
|
||
| // Argument-validation tests for MarkItDownMcpReader; none of them contact an MCP server. | ||
| public class MarkItDownMcpReaderTests | ||
| { | ||
| // The constructor rejects a null server URI. | ||
| [Fact] | ||
| public void Constructor_ThrowsWhenMcpServerUriIsNull() | ||
| { | ||
| Assert.Throws<ArgumentNullException>("mcpServerUri", () => new MarkItDownMcpReader(null!)); | ||
| } | ||
|
|
||
| // Both ReadAsync overloads reject a null or empty identifier. | ||
| [Fact] | ||
| public async Task ReadAsync_ThrowsWhenIdentifierIsNull() | ||
| { | ||
| var reader = new MarkItDownMcpReader(new Uri("http://localhost:3001/sse")); | ||
|
|
||
| await Assert.ThrowsAsync<ArgumentNullException>("identifier", async () => await reader.ReadAsync(new FileInfo("fileName.txt"), identifier: null!)); | ||
| await Assert.ThrowsAsync<ArgumentException>("identifier", async () => await reader.ReadAsync(new FileInfo("fileName.txt"), identifier: string.Empty)); | ||
|
|
||
| using MemoryStream stream = new(); | ||
| await Assert.ThrowsAsync<ArgumentNullException>("identifier", async () => await reader.ReadAsync(stream, identifier: null!, mediaType: "some")); | ||
| await Assert.ThrowsAsync<ArgumentException>("identifier", async () => await reader.ReadAsync(stream, identifier: string.Empty, mediaType: "some")); | ||
| } | ||
|
|
||
| // Both ReadAsync overloads reject a null source. | ||
| [Fact] | ||
| public async Task ReadAsync_ThrowsWhenSourceIsNull() | ||
| { | ||
| var reader = new MarkItDownMcpReader(new Uri("http://localhost:3001/sse")); | ||
|
|
||
| await Assert.ThrowsAsync<ArgumentNullException>("source", async () => await reader.ReadAsync((FileInfo)null!, "identifier")); | ||
| await Assert.ThrowsAsync<ArgumentNullException>("source", async () => await reader.ReadAsync((Stream)null!, "identifier", "mediaType")); | ||
| } | ||
|
|
||
| // The FileInfo overload reports a missing file; a random name under the temp directory is used as the path. | ||
| [Fact] | ||
| public async Task ReadAsync_ThrowsWhenFileDoesNotExist() | ||
| { | ||
| var reader = new MarkItDownMcpReader(new Uri("http://localhost:3001/sse")); | ||
| var nonExistentFile = new FileInfo(Path.Combine(Path.GetTempPath(), Path.GetRandomFileName())); | ||
|
|
||
| await Assert.ThrowsAsync<FileNotFoundException>(async () => await reader.ReadAsync(nonExistentFile, "identifier")); | ||
| } | ||
|
|
||
| // NOTE: Integration tests with an actual MCP server would go here, but they would require | ||
| // a running MarkItDown MCP server to be available, which is not part of the test setup. | ||
| // For full integration testing, use a real MCP server in a separate test environment. | ||
| } |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.