Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 67 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

32 changes: 31 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,9 @@ Options:
Do not show progress bar.
This is recommended for non-interactive shells (e.g. for continuous integration)

--host-stats
Show per-host statistics at the end of the run

--extensions <EXTENSIONS>
Test the specified file extensions for URIs when checking files locally.

Expand All @@ -385,7 +388,12 @@ Options:
--default-extension <EXTENSION>
Default file extension to treat files without extensions as having.

This is useful for files without extensions or with unknown extensions. The extension will be used to determine the file type for processing. Examples: --default-extension md, --default-extension html
This is useful for files without extensions or with unknown extensions.
The extension will be used to determine the file type for processing.

Examples:
--default-extension md
--default-extension html

--cache
Use request cache stored on disk at `.lycheecache`
Expand Down Expand Up @@ -447,6 +455,28 @@ Options:

[default: 128]

--host-concurrency <HOST_CONCURRENCY>
Default maximum concurrent requests per host (default: 10)

This limits how many requests can be sent simultaneously to the same
host (domain/subdomain). This helps prevent overwhelming servers and
getting rate-limited. Each host is handled independently.

Examples:
--host-concurrency 5 # Conservative for slow APIs
--host-concurrency 20 # Aggressive for fast APIs

--request-interval <REQUEST_INTERVAL>
Minimum interval between requests to the same host (default: 100ms)

Sets a baseline delay between consecutive requests to prevent
hammering servers. The adaptive algorithm may increase this based
on server responses (rate limits, errors).

Examples:
--request-interval 50ms # Fast for robust APIs
--request-interval 1s # Conservative for rate-limited APIs

-T, --threads <THREADS>
Number of threads to utilize. Defaults to number of cores available to the system

Expand Down
35 changes: 34 additions & 1 deletion lychee-bin/src/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@ use crate::options::{Config, HeaderMapExt};
use crate::parse::{parse_duration_secs, parse_remaps};
use anyhow::{Context, Result};
use http::{HeaderMap, StatusCode};
use lychee_lib::{Client, ClientBuilder};
use lychee_lib::{
Client, ClientBuilder,
ratelimit::{HostPool, RateLimitConfig},
};
use regex::RegexSet;
use reqwest_cookie_store::CookieStoreMutex;
use std::sync::Arc;
Expand All @@ -28,6 +31,35 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc<CookieStoreMutex>>) -

let headers = HeaderMap::from_header_pairs(&cfg.header)?;

// Create combined headers for HostPool (includes User-Agent + custom headers)
let mut combined_headers = headers.clone();
combined_headers.insert(
http::header::USER_AGENT,
cfg.user_agent
.parse()
.context("Invalid User-Agent header")?,
);

// Create HostPool for rate limiting - always enabled for HTTP requests
let rate_limit_config =
RateLimitConfig::from_options(cfg.host_concurrency, cfg.request_interval);
let cache_max_age = if cfg.cache { 3600 } else { 0 }; // 1 hour if caching enabled, disabled otherwise

let mut host_pool = HostPool::new(
rate_limit_config,
cfg.hosts.clone(),
cfg.max_concurrency,
cache_max_age,
combined_headers,
cfg.max_redirects,
Some(timeout),
cfg.insecure,
);

if let Some(cookie_jar) = cookie_jar {
host_pool = host_pool.with_cookie_jar(cookie_jar.clone());
}

ClientBuilder::builder()
.remaps(remaps)
.base(cfg.base_url.clone())
Expand Down Expand Up @@ -55,6 +87,7 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc<CookieStoreMutex>>) -
.include_fragments(cfg.include_fragments)
.fallback_extensions(cfg.fallback_extensions.clone())
.index_files(cfg.index_files.clone())
.host_pool(Some(host_pool))
.build()
.client()
.context("Failed to create request client")
Expand Down
30 changes: 21 additions & 9 deletions lychee-bin/src/commands/check.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ use super::CommandParams;

pub(crate) async fn check<S>(
params: CommandParams<S>,
) -> Result<(ResponseStats, Arc<Cache>, ExitCode)>
) -> Result<(ResponseStats, Arc<Cache>, ExitCode, Client)>
where
S: futures::Stream<Item = Result<Request>>,
{
Expand All @@ -47,6 +47,7 @@ where
let cache_ref = params.cache.clone();

let client = params.client;
let client_for_return = client.clone();
let cache = params.cache;
let cache_exclude_status = params.cfg.cache_exclude_status.into_set();
let accept = params.cfg.accept.into();
Expand Down Expand Up @@ -120,7 +121,7 @@ where
} else {
ExitCode::LinkCheckFailure
};
Ok((stats, cache_ref, code))
Ok((stats, cache_ref, code, client_for_return))
}

async fn suggest_archived_links(
Expand Down Expand Up @@ -287,6 +288,8 @@ async fn handle(
accept: HashSet<u16>,
) -> Response {
let uri = request.uri.clone();

// First check the persistent disk-based cache
if let Some(v) = cache.get(&uri) {
// Found a cached request
// Overwrite cache status in case the URI is excluded in the
Expand All @@ -300,18 +303,27 @@ async fn handle(
// code.
Status::from_cache_status(v.value().status, &accept)
};

// Track cache hit in the per-host stats (only for network URIs)
if !uri.is_file() {
if let Err(e) = client.record_cache_hit(&uri) {
log::debug!("Failed to record cache hit for {uri}: {e}");
}
}

return Response::new(uri.clone(), status, request.source);
}

// Request was not cached; run a normal check
// Cache miss - track it and run a normal check (only for network URIs)
if !uri.is_file() {
if let Err(e) = client.record_cache_miss(&uri) {
log::debug!("Failed to record cache miss for {uri}: {e}");
}
}

let response = check_url(client, request).await;

// - Never cache filesystem access as it is fast already so caching has no
// benefit.
// - Skip caching unsupported URLs as they might be supported in a
// future run.
// - Skip caching excluded links; they might not be excluded in the next run.
// - Skip caching links for which the status code has been explicitly excluded from the cache.
// Apply the same caching rules as before
let status = response.status();
if ignore_cache(&uri, status, &cache_exclude_status) {
return response;
Expand Down
81 changes: 81 additions & 0 deletions lychee-bin/src/formatters/host_stats/compact.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
use anyhow::Result;
use std::{
collections::HashMap,
fmt::{self, Display},
};

use crate::formatters::color::{DIM, NORMAL, color};
use lychee_lib::ratelimit::HostStats;

use super::HostStatsFormatter;

/// Private wrapper around the per-host statistics map.
///
/// It exists so the compact report can be produced through a `Display`
/// implementation (and therefore `to_string()`).
struct CompactHostStats {
    // Statistics per host; the map key is treated as the hostname when rendering.
    host_stats: HashMap<String, HostStats>,
}

/// Renders the per-host statistics as a compact, column-aligned report.
///
/// Output layout:
/// 1. a blank line,
/// 2. the heading `📊 Per-host Statistics`,
/// 3. a dimmed separator of 60 `─` characters,
/// 4. one row per host, in the order returned by `super::sort_host_stats`.
///
/// Nothing at all is written when the map is empty.
impl Display for CompactHostStats {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // An empty map produces no output (not even the heading).
        if self.host_stats.is_empty() {
            return Ok(());
        }

        writeln!(f)?;
        writeln!(f, "📊 Per-host Statistics")?;

        let separator = "─".repeat(60);
        color!(f, DIM, "{}", separator)?;
        writeln!(f)?;

        let sorted_hosts = super::sort_host_stats(&self.host_stats);

        // Size the hostname column from the longest hostname so that every
        // row lines up: longest name plus two cells of padding, but never
        // narrower than 10 cells.
        let max_hostname_len = sorted_hosts
            .iter()
            .map(|(hostname, _)| hostname.len())
            .max()
            .unwrap_or(0);
        let hostname_width = (max_hostname_len + 2).max(10);

        for (hostname, stats) in sorted_hosts {
            // Hosts without a median request time are shown as "N/A".
            // NOTE(review): if `median_request_time` yields a `std::time::Duration`,
            // `as_millis()` is an integer and the `.0` precision is a no-op — confirm.
            let median_time = stats
                .median_request_time()
                .map_or_else(|| "N/A".to_string(), |d| format!("{:.0}ms", d.as_millis()));

            // Scaled by 100 for display as a percentage
            // (presumably `cache_hit_rate` returns a 0.0..=1.0 fraction — confirm).
            let cache_hit_rate = stats.cache_hit_rate() * 100.0;

            color!(
                f,
                NORMAL,
                "{:<width$} │ {:>6} reqs │ {:>6.1}% success │ {:>8} median │ {:>6.1}% cached",
                hostname,
                stats.total_requests,
                stats.success_rate() * 100.0,
                median_time,
                cache_hit_rate,
                width = hostname_width
            )?;
            writeln!(f)?;
        }

        Ok(())
    }
}

/// Formatter that renders per-host statistics in the compact, one-row-per-host layout.
///
/// The type carries no state, so it is zero-sized and trivially copyable.
#[derive(Debug, Clone, Copy, Default)]
pub(crate) struct Compact;

impl Compact {
    /// Creates a new compact host-stats formatter.
    ///
    /// Equivalent to `Compact::default()`; kept as a `const fn` so it can be
    /// used in constant contexts.
    pub(crate) const fn new() -> Self {
        Self
    }
}

impl HostStatsFormatter for Compact {
    /// Formats `host_stats` as the compact per-host report.
    ///
    /// Returns `Ok(None)` when there are no hosts to report, otherwise
    /// `Ok(Some(report))` with the rendered text.
    fn format(&self, host_stats: HashMap<String, HostStats>) -> Result<Option<String>> {
        // Only build (and render) the wrapper when there is something to show.
        let rendered =
            (!host_stats.is_empty()).then(|| CompactHostStats { host_stats }.to_string());
        Ok(rendered)
    }
}
Loading