diff --git a/CHANGELOG.md b/CHANGELOG.md index 0b5d9fdcd..bf5532454 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,7 +16,10 @@ The following emojis are used to highlight certain changes: ### Added -- `gateway`: Added retrieval state tracking for timeout diagnostics. When retrieval timeouts occur, the error messages now include detailed information about which phase failed (path resolution, provider discovery, connecting, or data retrieval) and provider statistics. [#1015](https://github.com/ipfs/boxo/pull/1015) +- `gateway`: Enhanced error handling and UX for timeouts: + - Added retrieval state tracking for timeout diagnostics. When retrieval timeouts occur, the error messages now include detailed information about which phase failed (path resolution, provider discovery, connecting, or data retrieval) and provider statistics including failed peer IDs [#1015](https://github.com/ipfs/boxo/pull/1015) [#1023](https://github.com/ipfs/boxo/pull/1023) + - Added `Config.DiagnosticServiceURL` to configure a CID retrievability diagnostic service. When set, 504 Gateway Timeout errors show a "Check CID retrievability" button linking to the service with `?cid=` [#1023](https://github.com/ipfs/boxo/pull/1023) + - Improved 504 error pages with "Retry" button, diagnostic service integration, and clear indication when timeout occurs on sub-resource vs root CID [#1023](https://github.com/ipfs/boxo/pull/1023) ### Changed diff --git a/gateway/assets/assets.go b/gateway/assets/assets.go index 4d9676784..c1bd7a20d 100644 --- a/gateway/assets/assets.go +++ b/gateway/assets/assets.go @@ -106,9 +106,12 @@ type DagTemplateData struct { type ErrorTemplateData struct { GlobalData - StatusCode int - StatusText string - Error string + StatusCode int + StatusText string + Error string + DiagnosticServiceURL string + RootCID string + FailedCID string } type DirectoryTemplateData struct { diff --git a/gateway/assets/error.html b/gateway/assets/error.html index 4e3cac507..9a77515e8 100644 --- a/gateway/assets/error.html +++ b/gateway/assets/error.html @@ -32,24 +32,44 @@ {{ else if eq .StatusCode 500 }}

This gateway was unable to return the requested data due to an internal error. Please check the error below for more information.

{{ else if eq .StatusCode 502 }} -

The gateway backed was unable to fullfil your request due to an error.

+

The gateway backend was unable to fulfill your request due to an error.

{{ else if eq .StatusCode 504 }} -

The gateway backend was unable to fullfil your request due to a timeout.

+

This gateway is taking too long to fetch content from content providers. Please check the error below for more information.

+ + {{ if and .RootCID .FailedCID (ne .RootCID .FailedCID) }} +

Note: The timeout occurred while fetching a sub-resource (CID: {{ .FailedCID }}) of the root content (CID: {{ .RootCID }}).

+ {{ end }} + {{ else if eq .StatusCode 506 }} -

The gateway backend was unable to fullfil your request at this time. Try again later.

+

The gateway backend was unable to fulfill your request at this time. Try again later.

{{ end }}
{{ .Error }}
- +

How you can proceed:

+ +
+ + {{ if and (eq .StatusCode 504) .FailedCID .DiagnosticServiceURL }} + + Check CID retrievability + + {{ end }} +
diff --git a/gateway/assets/style.css b/gateway/assets/style.css index 4585d4b4d..bfcd3de9f 100644 --- a/gateway/assets/style.css +++ b/gateway/assets/style.css @@ -36,11 +36,6 @@ a:hover { text-decoration: underline; } -a:active, -a:visited { - color: #00b0e9; -} - .flex { display: flex; } @@ -235,7 +230,7 @@ section > .grid.dag > div:nth-of-type(2n+1) { .terminal { background: var(--steel-gray); color: white; - padding: .7em; + padding: 0.625em 1.25em; border-radius: var(--radius); word-wrap: break-word; white-space: break-spaces; @@ -273,3 +268,41 @@ section > .grid.dag > div:nth-of-type(2n+1) { display: none; } } + +.buttons { + margin: 1.5em 0; + display: flex; + gap: 0.75em; + flex-wrap: wrap; +} + +.button { + display: inline-block; + padding: 0.625em 1.25em; + font-size: 1em; + cursor: pointer; + border-radius: var(--radius); + text-decoration: none; + border: none; + font-family: inherit; +} + +.retry-button { + background-color: #3B8C90; + color: var(--near-white); +} + +.retry-button:hover { + background-color: var(--navy); +} + +.diagnose-button { + background-color: var(--steel-gray); + color: var(--near-white); +} + +.diagnose-button:hover { + background-color: var(--navy); + color: var(--near-white); + text-decoration: none; +} diff --git a/gateway/backend_blocks.go b/gateway/backend_blocks.go index 4a4724dbf..45b11c327 100644 --- a/gateway/backend_blocks.go +++ b/gateway/backend_blocks.go @@ -671,6 +671,8 @@ func (bb *BlocksBackend) getPathRoots(ctx context.Context, contentPath path.Immu // Update retrieval progress, if tracked in the existing context if retrievalState := retrieval.StateFromContext(ctx); retrievalState != nil { retrievalState.SetPhase(retrieval.PhasePathResolution) + // Set the root CID (first CID in the path) + retrievalState.SetRootCID(contentPath.RootCid()) } /* @@ -726,6 +728,14 @@ func (bb *BlocksBackend) getPathRoots(ctx context.Context, contentPath path.Immu } pathRoots = pathRoots[:len(pathRoots)-1] + + // Set the terminal CID after successful path resolution + if retrievalState := retrieval.StateFromContext(ctx); retrievalState != nil { + if rootCid := lastPath.RootCid(); rootCid.Defined() { + retrievalState.SetTerminalCID(rootCid) + } + } + return pathRoots, lastPath, remainder, nil } diff --git a/gateway/backend_car.go b/gateway/backend_car.go index 62f0991d3..e63696041 100644 --- a/gateway/backend_car.go +++ b/gateway/backend_car.go @@ -213,6 +213,8 @@ func resolvePathWithRootsAndBlock(ctx context.Context, p path.ImmutablePath, uni // Update retrieval progress, if tracked in the existing context if retrievalState := retrieval.StateFromContext(ctx); retrievalState != nil { retrievalState.SetPhase(retrieval.PhasePathResolution) + // Set the root CID (first CID in the path) + retrievalState.SetRootCID(p.RootCid()) } md, terminalBlk, err := resolvePathToLastWithRoots(ctx, p, unixFSLsys) if err != nil { @@ -234,6 +236,11 @@ func resolvePathWithRootsAndBlock(ctx context.Context, p path.ImmutablePath, uni } } + // Set the terminal CID after successful path resolution + if retrievalState := retrieval.StateFromContext(ctx); retrievalState != nil { + retrievalState.SetTerminalCID(terminalCid) + } + return md, terminalBlk, err } diff --git a/gateway/errors.go b/gateway/errors.go index 62b5c8734..81e9923ad 100644 --- a/gateway/errors.go +++ b/gateway/errors.go @@ -12,6 +12,7 @@ import ( "github.com/ipfs/boxo/gateway/assets" "github.com/ipfs/boxo/path" "github.com/ipfs/boxo/path/resolver" + "github.com/ipfs/boxo/retrieval" "github.com/ipfs/go-cid" ipld "github.com/ipfs/go-ipld-format" "github.com/ipld/go-ipld-prime/datamodel" @@ -186,15 +187,36 @@ func writeErrorResponse(w http.ResponseWriter, r *http.Request, c *Config, statu } if acceptsHTML { + // Extract CIDs from RetrievalState for diagnostic purposes (504 errors) + var rootCID, failedCID string + if statusCode == http.StatusGatewayTimeout && c != nil && c.DiagnosticServiceURL != "" { + if retrievalState := retrieval.StateFromContext(r.Context()); retrievalState != nil { + // Get root CID (first CID in the path) + if root := retrievalState.GetRootCID(); root.Defined() { + rootCID = root.String() + } + // Get terminal CID (CID that failed to retrieve) + if terminal := retrievalState.GetTerminalCID(); terminal.Defined() { + failedCID = terminal.String() + } else { + // If no terminal CID, use root CID as the failed CID + failedCID = rootCID + } + } + } + w.Header().Set("Content-Type", "text/html") w.WriteHeader(statusCode) err := assets.ErrorTemplate.Execute(w, assets.ErrorTemplateData{ GlobalData: assets.GlobalData{ Menu: c.Menu, }, - StatusCode: statusCode, - StatusText: http.StatusText(statusCode), - Error: message, + StatusCode: statusCode, + StatusText: http.StatusText(statusCode), + Error: message, + DiagnosticServiceURL: c.DiagnosticServiceURL, + RootCID: rootCID, + FailedCID: failedCID, }) if err != nil { fmt.Fprintf(w, "error during body generation: %v", err) diff --git a/gateway/gateway.go b/gateway/gateway.go index 26fa55eec..b8f486d34 100644 --- a/gateway/gateway.go +++ b/gateway/gateway.go @@ -4,6 +4,7 @@ import ( "context" "errors" "io" + "net/url" "strconv" "strings" "time" @@ -32,6 +33,9 @@ const ( // // See the MaxConcurrentRequests field documentation for detailed tuning guidance. DefaultMaxConcurrentRequests = 4096 + + // DefaultDiagnosticServiceURL is the default URL for CID diagnostic service + DefaultDiagnosticServiceURL = "https://check.ipfs.network" ) // Config is the configuration used when creating a new gateway handler. @@ -115,11 +119,34 @@ type Config struct { // A value of 0 disables the limit entirely (use with caution). MaxConcurrentRequests int + // DiagnosticServiceURL is the URL for a service that can be used to diagnose + // issues with CID retrievability. When the gateway returns a 504 Gateway Timeout + // error, an "Inspect retrievability of CID" button will be shown that links to + // this service with the CID as a query parameter. When set to empty string, the + // button is not shown. + DiagnosticServiceURL string + // MetricsRegistry is the Prometheus registry to use for metrics. // If nil, prometheus.DefaultRegisterer will be used. MetricsRegistry prometheus.Registerer } +// validateConfig validates and normalizes the Config, returning a modified copy. +// Invalid values are logged and set to safe defaults. +func validateConfig(c Config) Config { + // Validate DiagnosticServiceURL + if c.DiagnosticServiceURL != "" { + if _, err := url.Parse(c.DiagnosticServiceURL); err != nil { + log.Errorf("invalid DiagnosticServiceURL %q: %v, disabling diagnostic service", c.DiagnosticServiceURL, err) + c.DiagnosticServiceURL = "" + } + } + + // Future validations can be added here + + return c +} + // PublicGateway is the specification of an IPFS Public Gateway. type PublicGateway struct { // Paths is explicit list of path prefixes that should be handled by diff --git a/gateway/handler.go b/gateway/handler.go index 03ff2a2c7..bc8c5da71 100644 --- a/gateway/handler.go +++ b/gateway/handler.go @@ -68,6 +68,9 @@ type handler struct { // // [IPFS HTTP Gateway]: https://specs.ipfs.tech/http-gateways/ func NewHandler(c Config, backend IPFSBackend) http.Handler { + // Validate and normalize configuration + c = validateConfig(c) + // Get registry from config or use default reg := c.MetricsRegistry if reg == nil { diff --git a/retrieval/state.go b/retrieval/state.go index 887e02085..50ed6a15d 100644 --- a/retrieval/state.go +++ b/retrieval/state.go @@ -10,9 +10,11 @@ import ( "errors" "fmt" "slices" + "strings" "sync" "sync/atomic" + "github.com/ipfs/go-cid" "github.com/libp2p/go-libp2p/core/peer" ) @@ -23,10 +25,10 @@ type contextKey string // and StateFromContext functions are preferred. const ContextKey contextKey = "boxo-retrieval-state" -// MaxFailedProvidersToTrack limits the number of failed provider peer IDs -// that are kept for diagnostic purposes. This prevents unbounded memory growth +// MaxProvidersSampleSize limits the number of provider peer IDs (both found and failed) +// that are kept as a sample for diagnostic purposes. This prevents unbounded memory growth // while still providing useful debugging information. -const MaxFailedProvidersToTrack = 3 +const MaxProvidersSampleSize = 3 // RetrievalPhase represents the current phase of content retrieval. // Phases progress monotonically - they can only move forward, never backward. @@ -80,10 +82,20 @@ type State struct { // phase tracks the current retrieval phase (stored as int32) phase atomic.Int32 - // mu protects failedProviders slice during concurrent access + // mu protects foundProviders, failedProviders slices and CID fields during concurrent access mu sync.RWMutex - // Track specific provider failures (limited to first few for brevity) + // Sample of providers found during discovery (limited to first few for brevity) + // NOTE: This is only a sample of the first MaxProvidersSampleSize providers, not all of them + foundProviders []peer.ID + // Sample of providers that failed (limited to first few for brevity) + // NOTE: This is only a sample of the first MaxProvidersSampleSize providers, not all of them failedProviders []peer.ID + + // CIDs for diagnostic purposes + // For /ipfs/cid, both will be the same + // For /ipfs/cid/path/to/file, rootCID is 'cid' and terminalCID is the CID of 'file' + rootCID cid.Cid // First CID in the path + terminalCID cid.Cid // CID of terminating DAG entity on the path } // NewState creates a new State initialized to PhaseInitializing. The returned @@ -120,22 +132,96 @@ func (rs *State) GetPhase() RetrievalPhase { return RetrievalPhase(rs.phase.Load()) } +// appendProviders is a helper to append providers to a sample list with size limit. +// Only the first MaxProvidersSampleSize providers are kept to prevent unbounded memory growth. +// This follows the idiomatic append pattern but operates on internal state. +func (rs *State) appendProviders(list *[]peer.ID, peerIDs ...peer.ID) { + rs.mu.Lock() + defer rs.mu.Unlock() + if len(*list) >= MaxProvidersSampleSize { + return + } + remaining := MaxProvidersSampleSize - len(*list) + if len(peerIDs) > remaining { + peerIDs = peerIDs[:remaining] + } + *list = append(*list, peerIDs...) +} + +// AddFoundProvider records a provider peer ID that was discovered during provider search. +// This method is safe for concurrent use. +func (rs *State) AddFoundProvider(peerID peer.ID) { + rs.appendProviders(&rs.foundProviders, peerID) +} + // AddFailedProvider records a provider peer ID that failed to deliver the requested content. -// Only the first MaxFailedProvidersToTrack providers are kept to prevent unbounded memory growth. // This method is safe for concurrent use. func (rs *State) AddFailedProvider(peerID peer.ID) { + rs.appendProviders(&rs.failedProviders, peerID) +} + +// getProviders is a helper to get a cloned list of providers. +func (rs *State) getProviders(list []peer.ID) []peer.ID { + rs.mu.RLock() + defer rs.mu.RUnlock() + return slices.Clone(list) +} + +// GetFoundProviders returns a sample of found providers (up to MaxProvidersSampleSize). +// This is not all providers, just the first few for diagnostic purposes. +func (rs *State) GetFoundProviders() []peer.ID { + return rs.getProviders(rs.foundProviders) +} + +// GetFailedProviders returns a sample of failed providers (up to MaxProvidersSampleSize). +// This is not all providers, just the first few for diagnostic purposes. +func (rs *State) GetFailedProviders() []peer.ID { + return rs.getProviders(rs.failedProviders) +} + +// SetRootCID sets the root CID (first CID in the path). +// This method is safe for concurrent use. +func (rs *State) SetRootCID(c cid.Cid) { rs.mu.Lock() defer rs.mu.Unlock() - if len(rs.failedProviders) < MaxFailedProvidersToTrack { - rs.failedProviders = append(rs.failedProviders, peerID) - } + rs.rootCID = c } -// GetFailedProviders returns the list of failed providers -func (rs *State) GetFailedProviders() []peer.ID { +// SetTerminalCID sets the terminal CID (CID of terminating DAG entity). +// This method is safe for concurrent use. +func (rs *State) SetTerminalCID(c cid.Cid) { + rs.mu.Lock() + defer rs.mu.Unlock() + rs.terminalCID = c +} + +// GetRootCID returns the root CID (first CID in the path). +// This method is safe for concurrent use. +func (rs *State) GetRootCID() cid.Cid { + rs.mu.RLock() + defer rs.mu.RUnlock() + return rs.rootCID +} + +// GetTerminalCID returns the terminal CID (CID of terminating DAG entity). +// This method is safe for concurrent use. +func (rs *State) GetTerminalCID() cid.Cid { rs.mu.RLock() defer rs.mu.RUnlock() - return slices.Clone(rs.failedProviders) + return rs.terminalCID +} + +// formatPeerIDs converts a slice of peer IDs to a formatted string with a prefix. +// Returns empty string if the slice is empty. +func formatPeerIDs(peers []peer.ID, prefix string) string { + if len(peers) == 0 { + return "" + } + peerStrings := make([]string, len(peers)) + for i, p := range peers { + peerStrings[i] = p.String() + } + return fmt.Sprintf(", %s: %s", prefix, strings.Join(peerStrings, ", ")) } // Summary generates a human-readable summary of the retrieval state, @@ -154,22 +240,18 @@ func (rs *State) Summary() string { return fmt.Sprintf("found %d provider(s) but none could be contacted (phase: %s)", found, phase.String()) } - failedProviders := rs.GetFailedProviders() - failedPeersInfo := "" - if len(failedProviders) > 0 { - // Convert peer IDs to strings for display - peerStrings := make([]string, len(failedProviders)) - for i, p := range failedProviders { - peerStrings[i] = p.String() - } - failedPeersInfo = fmt.Sprintf(", failed peers: %v", peerStrings) - } - if connected == 0 { + // When we can't connect, show the found providers instead of failed ones + // since all attempts effectively failed + foundProviders := rs.GetFoundProviders() + peersInfo := formatPeerIDs(foundProviders, "peers") return fmt.Sprintf("found %d provider(s), attempted %d, but none were reachable (phase: %s%s)", - found, attempted, phase.String(), failedPeersInfo) + found, attempted, phase.String(), peersInfo) } + failedProviders := rs.GetFailedProviders() + failedPeersInfo := formatPeerIDs(failedProviders, "failed peers") + if len(failedProviders) > 0 { return fmt.Sprintf("found %d provider(s), connected to %d, but they did not return the requested content (phase: %s%s)", found, connected, phase.String(), failedPeersInfo) diff --git a/retrieval/state_test.go b/retrieval/state_test.go index ff771a491..02c46f512 100644 --- a/retrieval/state_test.go +++ b/retrieval/state_test.go @@ -79,6 +79,27 @@ func TestState(t *testing.T) { assert.Equal(t, int32(1), rs.ProvidersConnected.Load()) }) + t.Run("Found providers are tracked up to limit", func(t *testing.T) { + rs := NewState() + + // Create real peer IDs for testing + peerIDs := make([]peer.ID, 5) + for i := range peerIDs { + peerIDs[i] = test.RandPeerIDFatal(t) + } + + // Add more than MaxProvidersSampleSize providers + for _, peerID := range peerIDs { + rs.AddFoundProvider(peerID) + } + + foundProviders := rs.GetFoundProviders() + assert.Len(t, foundProviders, MaxProvidersSampleSize) + assert.Equal(t, peerIDs[0], foundProviders[0]) + assert.Equal(t, peerIDs[1], foundProviders[1]) + assert.Equal(t, peerIDs[2], foundProviders[2]) + }) + t.Run("Failed providers are tracked up to limit", func(t *testing.T) { rs := NewState() @@ -88,13 +109,13 @@ func TestState(t *testing.T) { peerIDs[i] = test.RandPeerIDFatal(t) } - // Add more than MaxFailedProvidersToTrack providers + // Add more than MaxProvidersSampleSize providers for _, peerID := range peerIDs { rs.AddFailedProvider(peerID) } failedProviders := rs.GetFailedProviders() - assert.Len(t, failedProviders, MaxFailedProvidersToTrack) + assert.Len(t, failedProviders, MaxProvidersSampleSize) assert.Equal(t, peerIDs[0], failedProviders[0]) assert.Equal(t, peerIDs[1], failedProviders[1]) assert.Equal(t, peerIDs[2], failedProviders[2]) @@ -121,6 +142,21 @@ func TestState(t *testing.T) { }, expectedSubstring: "found 5 provider(s) but none could be contacted", }, + { + name: "Single provider found but not reachable shows peer ID", + setup: func(rs *State) { + rs.ProvidersFound.Store(1) + rs.ProvidersAttempted.Store(1) + peerID := test.RandPeerIDFatal(t) + rs.AddFoundProvider(peerID) + rs.SetPhase(PhaseConnecting) + + // Verify the peer ID appears in the summary + summary := rs.Summary() + assert.Contains(t, summary, peerID.String()) + }, + expectedSubstring: "found 1 provider(s), attempted 1, but none were reachable (phase: connecting to providers, peers:", + }, { name: "Providers attempted but none reachable", setup: func(rs *State) { @@ -131,20 +167,20 @@ func TestState(t *testing.T) { expectedSubstring: "found 5 provider(s), attempted 3, but none were reachable", }, { - name: "Providers attempted but none reachable with failed peers", + name: "Providers attempted but none reachable with found peers", setup: func(rs *State) { rs.ProvidersFound.Store(5) rs.ProvidersAttempted.Store(3) // Store peer IDs so we can verify they appear in the message peerID1 := test.RandPeerIDFatal(t) peerID2 := test.RandPeerIDFatal(t) - rs.AddFailedProvider(peerID1) - rs.AddFailedProvider(peerID2) + rs.AddFoundProvider(peerID1) + rs.AddFoundProvider(peerID2) rs.SetPhase(PhaseConnecting) // Verify the summary includes the actual peer IDs summary := rs.Summary() - assert.Contains(t, summary, "failed peers:") + assert.Contains(t, summary, "peers:") assert.Contains(t, summary, peerID1.String()) assert.Contains(t, summary, peerID2.String()) }, @@ -356,7 +392,7 @@ func TestErrorWithState(t *testing.T) { // Check that error includes failed peers errMsg := err.Error() - assert.Contains(t, errMsg, "connection failed: retrieval: found 3 provider(s), connected to 1, but they did not return the requested content (phase: data retrieval, failed peers: [") + assert.Contains(t, errMsg, "connection failed: retrieval: found 3 provider(s), connected to 1, but they did not return the requested content (phase: data retrieval, failed peers: ") assert.Contains(t, errMsg, peerID1.String()) assert.Contains(t, errMsg, peerID2.String()) }) diff --git a/routing/providerquerymanager/providerquerymanager.go b/routing/providerquerymanager/providerquerymanager.go index 49b1aa457..ce0fb5f6b 100644 --- a/routing/providerquerymanager/providerquerymanager.go +++ b/routing/providerquerymanager/providerquerymanager.go @@ -384,6 +384,7 @@ func (pqm *ProviderQueryManager) findProviderWorker() { span.AddEvent("FoundProvider", trace.WithAttributes(attribute.Stringer("peer", p.ID))) if retrievalState != nil { retrievalState.ProvidersFound.Add(1) + retrievalState.AddFoundProvider(p.ID) retrievalState.SetPhase(retrieval.PhaseConnecting) retrievalState.ProvidersAttempted.Add(1) }