From 0005c64132d427e657f19920416c26b8c3e563d5 Mon Sep 17 00:00:00 2001 From: Matthias Date: Fri, 10 Jan 2025 22:42:21 +0100 Subject: [PATCH 1/8] Fix relative links and anchors --- Cargo.lock | 14 +- fixtures/absolute_paths/index.html | 8 + fixtures/encoded_fragments/about.html | 7 + fixtures/encoded_fragments/index.html | 7 + fixtures/html_fragments/page.html | 9 + fixtures/html_fragments/section.html | 7 + fixtures/relative_fragments/index.html | 11 ++ .../docs/releases/v9_7/index.html | 11 ++ fixtures/remote_fragments.html | 4 + fixtures/root_dir_fragments/about.html | 10 + fixtures/root_dir_fragments/index.html | 13 ++ lychee-bin/src/client.rs | 1 - lychee-bin/src/commands/check.rs | 59 ++++-- lychee-bin/src/options.rs | 2 +- lychee-bin/src/stats.rs | 2 +- lychee-bin/tests/cli.rs | 131 ++++++++++++- lychee-lib/src/checker/file.rs | 129 ++++++++----- lychee-lib/src/checker/website.rs | 52 ++++- lychee-lib/src/client.rs | 15 +- lychee-lib/src/collector.rs | 66 ++++--- lychee-lib/src/extract/html/html5gum.rs | 8 +- lychee-lib/src/test_utils.rs | 10 - lychee-lib/src/types/base.rs | 86 +++++---- lychee-lib/src/types/error.rs | 22 ++- lychee-lib/src/utils/fragment_checker.rs | 17 +- lychee-lib/src/utils/request.rs | 181 ++++++++++-------- 26 files changed, 619 insertions(+), 263 deletions(-) create mode 100644 fixtures/absolute_paths/index.html create mode 100644 fixtures/encoded_fragments/about.html create mode 100644 fixtures/encoded_fragments/index.html create mode 100644 fixtures/html_fragments/page.html create mode 100644 fixtures/html_fragments/section.html create mode 100644 fixtures/relative_fragments/index.html create mode 100644 fixtures/relative_paths/docs/releases/v9_7/index.html create mode 100644 fixtures/remote_fragments.html create mode 100644 fixtures/root_dir_fragments/about.html create mode 100644 fixtures/root_dir_fragments/index.html diff --git a/Cargo.lock b/Cargo.lock index 0af5cd228c..b62580fb73 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 4 +version = 3 [[package]] name = "addr2line" @@ -2540,7 +2540,7 @@ dependencies = [ "serde_with", "shellexpand", "tempfile", - "thiserror 2.0.7", + "thiserror 2.0.8", "tokio", "toml", "typed-builder", @@ -4255,11 +4255,11 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.7" +version = "2.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93605438cbd668185516ab499d589afb7ee1859ea3d5fc8f6b0755e1c7443767" +checksum = "08f5383f3e0071702bf93ab5ee99b52d26936be9dedd9413067cbdcddcb6141a" dependencies = [ - "thiserror-impl 2.0.7", + "thiserror-impl 2.0.8", ] [[package]] @@ -4275,9 +4275,9 @@ dependencies = [ [[package]] name = "thiserror-impl" -version = "2.0.7" +version = "2.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1d8749b4531af2117677a5fcd12b1348a3fe2b81e36e61ffeac5c4aa3273e36" +checksum = "f2f357fcec90b3caef6623a099691be676d033b40a058ac95d2a6ade6fa0c943" dependencies = [ "proc-macro2", "quote", diff --git a/fixtures/absolute_paths/index.html b/fixtures/absolute_paths/index.html new file mode 100644 index 0000000000..bdc90fd156 --- /dev/null +++ b/fixtures/absolute_paths/index.html @@ -0,0 +1,8 @@ + + + + About + Products + Contact + + diff --git a/fixtures/encoded_fragments/about.html b/fixtures/encoded_fragments/about.html new file mode 100644 index 0000000000..6d8a7041c9 --- /dev/null +++ b/fixtures/encoded_fragments/about.html @@ -0,0 +1,7 @@ + + + +

About Page

+ + + diff --git a/fixtures/encoded_fragments/index.html b/fixtures/encoded_fragments/index.html new file mode 100644 index 0000000000..903cf3e961 --- /dev/null +++ b/fixtures/encoded_fragments/index.html @@ -0,0 +1,7 @@ + + + + About + + + diff --git a/fixtures/html_fragments/page.html b/fixtures/html_fragments/page.html new file mode 100644 index 0000000000..f537b0059d --- /dev/null +++ b/fixtures/html_fragments/page.html @@ -0,0 +1,9 @@ + + + +

Existing Section

+ Missing Link + Valid Link + Invalid Link + + diff --git a/fixtures/html_fragments/section.html b/fixtures/html_fragments/section.html new file mode 100644 index 0000000000..4a50ee348d --- /dev/null +++ b/fixtures/html_fragments/section.html @@ -0,0 +1,7 @@ + + + +

Existing Heading

+

Test content

+ + diff --git a/fixtures/relative_fragments/index.html b/fixtures/relative_fragments/index.html new file mode 100644 index 0000000000..716c6cc747 --- /dev/null +++ b/fixtures/relative_fragments/index.html @@ -0,0 +1,11 @@ + + + + Test Page + + + Important Section + Different Disks + + + diff --git a/fixtures/relative_paths/docs/releases/v9_7/index.html b/fixtures/relative_paths/docs/releases/v9_7/index.html new file mode 100644 index 0000000000..f0505d51ba --- /dev/null +++ b/fixtures/relative_paths/docs/releases/v9_7/index.html @@ -0,0 +1,11 @@ + + + + VPN + Webserver + Releases + Current + System Config + Maintenance + + diff --git a/fixtures/remote_fragments.html b/fixtures/remote_fragments.html new file mode 100644 index 0000000000..85895442f3 --- /dev/null +++ b/fixtures/remote_fragments.html @@ -0,0 +1,4 @@ +lychee-repo +lychee-docs diff --git a/fixtures/root_dir_fragments/about.html b/fixtures/root_dir_fragments/about.html new file mode 100644 index 0000000000..90af9f9d8b --- /dev/null +++ b/fixtures/root_dir_fragments/about.html @@ -0,0 +1,10 @@ + + + + About + + +

Introduction

+

This is the about page.

+ + diff --git a/fixtures/root_dir_fragments/index.html b/fixtures/root_dir_fragments/index.html new file mode 100644 index 0000000000..6c057b192c --- /dev/null +++ b/fixtures/root_dir_fragments/index.html @@ -0,0 +1,13 @@ + + + + Index + + +

Index

+ Local Link +
Local Section
+ + About + + diff --git a/lychee-bin/src/client.rs b/lychee-bin/src/client.rs index d1b982dc26..794ef33cac 100644 --- a/lychee-bin/src/client.rs +++ b/lychee-bin/src/client.rs @@ -55,7 +55,6 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc>) - ClientBuilder::builder() .remaps(remaps) - .base(cfg.base.clone()) .includes(includes) .excludes(excludes) .exclude_all_private(cfg.exclude_all_private) diff --git a/lychee-bin/src/commands/check.rs b/lychee-bin/src/commands/check.rs index 5c0614b179..1a4702bb6e 100644 --- a/lychee-bin/src/commands/check.rs +++ b/lychee-bin/src/commands/check.rs @@ -168,15 +168,14 @@ where { tokio::pin!(requests); while let Some(request) = requests.next().await { - let request = request?; + let request = request; if let Some(pb) = &bar { pb.inc_length(1); - pb.set_message(request.to_string()); + if let Ok(request) = &request { + pb.set_message(request.to_string()); + } }; - send_req - .send(Ok(request)) - .await - .expect("Cannot send request"); + send_req.send(request).await.expect("Cannot send request"); } Ok(()) } @@ -228,20 +227,40 @@ async fn request_channel_task( ReceiverStream::new(recv_req), max_concurrency, |request: Result| async { - let request = request.expect("cannot read request"); - let response = handle( - &client, - cache.clone(), - cache_exclude_status.clone(), - request, - accept.clone(), - ) - .await; - - send_resp - .send(response) - .await - .expect("cannot send response to queue"); + if let Ok(request) = request { + let response = handle( + &client, + cache.clone(), + cache_exclude_status.clone(), + request, + accept.clone(), + ) + .await; + send_resp + .send(response) + .await + .expect("cannot send response to queue"); + }; + + // let response = match request { + // Ok(req) => { + // handle( + // &client, + // cache.clone(), + // cache_exclude_status.clone(), + // req, + // accept.clone(), + // ) + // .await + // } + // Err(e) => { + // log::error!("Error reading request: {}", e); + // } + + // send_resp + // .send(response) + // .await + // .expect("cannot send response to queue"); }, ) .await; diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index bcb70da09c..5e7de7fe28 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -441,7 +441,7 @@ separated list of accepted status codes. This example will accept 200, 201, /// Base URL or website root directory to check relative URLs /// e.g. or `/path/to/public` - #[arg(short, long, value_parser= parse_base)] + #[arg(short, long, value_parser = parse_base)] #[serde(default)] pub(crate) base: Option, diff --git a/lychee-bin/src/stats.rs b/lychee-bin/src/stats.rs index fc5839902d..e5804e5608 100644 --- a/lychee-bin/src/stats.rs +++ b/lychee-bin/src/stats.rs @@ -109,7 +109,7 @@ impl ResponseStats { #[inline] /// Check if the entire run was successful pub(crate) const fn is_success(&self) -> bool { - self.total == self.successful + self.excludes + self.unsupported + self.total == self.successful + self.excludes + self.unsupported && self.errors == 0 } #[inline] diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 5479d37ae8..db3d1fd2a5 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -125,6 +125,8 @@ mod cli { .arg("compact") .arg("--mode") .arg("color") + .arg("--max-retries") + .arg("0") .arg(test_path) .env("FORCE_COLOR", "1") .assert() @@ -383,7 +385,7 @@ mod cli { let dir = fixtures_path().join("resolve_paths"); cmd.arg("--offline") - .arg("--base") + .arg("--root-dir") .arg(&dir) .arg(dir.join("index.html")) .env_clear() @@ -1940,4 +1942,131 @@ mod cli { Ok(()) } + + #[test] + fn test_root_dir_with_fragments() { + let mut cmd = main_command(); + let dir = fixtures_path().join("root_dir_fragments"); + + cmd.arg("--root-dir") + .arg(&dir) + .arg("--include-fragments") + .arg(dir.join("index.html")) + .arg("--verbose") + .assert() + .success() + .stdout(contains("2 Total")) + .stdout(contains("2 OK")) + .stdout(contains("0 Errors")) + .stderr(contains(format!( + "{}/about.html#introduction", + dir.display() + ))) + .stderr(contains(format!( + "{}/index.html#local-section", + dir.display() + ))); + } + + #[test] + fn test_relative_fragments_regression() { + let mut cmd = main_command(); + let dir = fixtures_path().join("relative_fragments"); + cmd.arg("--include-fragments") + .arg(dir.join("index.html")) + .assert() + .failure() + .stdout(contains("fixtures/relative_fragments/index.html#unterschiedliche-platten | Cannot find fragment")) + .stdout(contains("fixtures/relative_fragments/index.html#scrubs-wichtig | Cannot find fragment")) + .stdout(contains("2 Total")) + .stdout(contains("0 OK")) + .stdout(contains("2 Errors")); + } + + #[tokio::test] + async fn test_remote_fragments() { + let mock_server = mock_response!( + r##" + + + lychee-repo + lychee-docs + + + "## + ); + + let mut cmd = main_command(); + cmd.arg("--include-fragments") + .arg(mock_server.uri()) + .assert() + .failure() + .stdout(contains("2 Total")) + .stdout(contains("0 OK")) + .stdout(contains("2 Errors")) + .stdout(contains("https://github.com/lycheeverse/lychee#non-existent-anchor | Fragment not found: non-existent-anchor")) + .stdout(contains("https://lychee.cli.rs/#missing-section | Fragment not found: missing-section")); + } + + // #[test] + // fn test_relative_path_resolution() { + // let mut cmd = main_command(); + // let dir = fixtures_path().join("relative_paths"); + // cmd.arg("--base") + // .arg(&dir) + // .arg(dir.join("docs/releases/v9_7/index.html")) + // .assert() + // .success() + // .stderr(predicates::str::is_empty()) + // .stdout(contains("6 Total")) + // .stdout(contains("6 OK")) + // .stdout(contains("0 Errors")); + // } + + // #[test] + // fn test_base_url_absolute_paths() { + // let mut cmd = main_command(); + // let dir = fixtures_path().join("absolute_paths"); + // cmd.arg("--base") + // .arg("https://example.com") + // .arg("--dump") // Add dump flag to avoid actual requests + // .arg(dir.join("index.html")) + // .assert() + // .success() + // .stdout(contains("https://example.com/about")) + // .stdout(contains("https://example.com/products")) + // .stdout(contains("https://example.com/contact")) + // .stderr(predicates::str::is_empty()); + // } + + // #[test] + // fn test_html_fragment_detection() { + // let mut cmd = main_command(); + // let dir = fixtures_path().join("html_fragments"); + // cmd.arg("--include-fragments") + // .arg(dir.join("page.html")) + // .assert() + // .failure() + // .stdout(contains("section.html#non-existent-heading")) + // .stdout(contains("page.html#missing-section")) + // .stdout(contains("Cannot find fragment")) + // .stdout(contains("3 Total")) + // .stdout(contains("1 OK")) + // .stdout(contains("2 Errors")); + // } + + // #[test] + // fn test_encoded_fragments() { + // let mut cmd = main_command(); + // let dir = fixtures_path().join("encoded_fragments"); + // cmd.arg("--include-fragments") + // .arg(dir.join("index.html")) + // .assert() + // .failure() + // .stdout(contains("Cannot convert path '/about.html#about' to a URI")) + // .stdout(contains("#about")) + // .stdout(contains("1 Total")) + // .stdout(contains("0 OK")) + // .stdout(contains("1 Error")); + // } } diff --git a/lychee-lib/src/checker/file.rs b/lychee-lib/src/checker/file.rs index 0060bfbde0..5bec5b1e61 100644 --- a/lychee-lib/src/checker/file.rs +++ b/lychee-lib/src/checker/file.rs @@ -2,7 +2,7 @@ use http::StatusCode; use log::warn; use std::path::{Path, PathBuf}; -use crate::{utils::fragment_checker::FragmentChecker, Base, ErrorKind, Status, Uri}; +use crate::{utils::fragment_checker::FragmentChecker, ErrorKind, Result, Status, Uri}; /// A utility for checking the existence and validity of file-based URIs. /// @@ -11,8 +11,6 @@ use crate::{utils::fragment_checker::FragmentChecker, Base, ErrorKind, Status, U /// and optional fragment checking for HTML files. #[derive(Debug, Clone)] pub(crate) struct FileChecker { - /// Base path or URL used for resolving relative paths. - base: Option, /// List of file extensions to try if the original path doesn't exist. fallback_extensions: Vec, /// Whether to check for the existence of fragments (e.g., `#section-id`) in HTML files. @@ -29,13 +27,8 @@ impl FileChecker { /// * `base` - Optional base path or URL for resolving relative paths. /// * `fallback_extensions` - List of extensions to try if the original file is not found. /// * `include_fragments` - Whether to check for fragment existence in HTML files. - pub(crate) fn new( - base: Option, - fallback_extensions: Vec, - include_fragments: bool, - ) -> Self { + pub(crate) fn new(fallback_extensions: Vec, include_fragments: bool) -> Self { Self { - base, fallback_extensions, include_fragments, fragment_checker: FragmentChecker::new(), @@ -55,43 +48,20 @@ impl FileChecker { /// /// Returns a `Status` indicating the result of the check. pub(crate) async fn check(&self, uri: &Uri) -> Status { - let Ok(path) = uri.url.to_file_path() else { - return ErrorKind::InvalidFilePath(uri.clone()).into(); + let path = match uri_to_path(uri) { + Ok(path) => path, + Err(err) => return err.into(), }; - - let resolved_path = self.resolve_path(&path); - self.check_path(&resolved_path, uri).await + self.check_path(&path, uri).await } - /// Resolves the given path using the base path, if one is set. - /// - /// # Arguments + /// Checks if the given path exists /// - /// * `path` - The path to resolve. + /// Two special cases are considered: /// - /// # Returns - /// - /// Returns the resolved path as a `PathBuf`. - fn resolve_path(&self, path: &Path) -> PathBuf { - if let Some(Base::Local(base_path)) = &self.base { - if path.is_absolute() { - let absolute_base_path = if base_path.is_relative() { - std::env::current_dir().unwrap_or_default().join(base_path) - } else { - base_path.clone() - }; - - let stripped = path.strip_prefix("/").unwrap_or(path); - absolute_base_path.join(stripped) - } else { - base_path.join(path) - } - } else { - path.to_path_buf() - } - } - - /// Checks if the given path exists and performs additional checks if necessary. + /// - If the path exists, it will check for a fragment if enabled. + /// - If the path does not exist, it will try to find the file with + /// fallback extensions. /// /// # Arguments /// @@ -103,13 +73,14 @@ impl FileChecker { /// Returns a `Status` indicating the result of the check. async fn check_path(&self, path: &Path, uri: &Uri) -> Status { if path.exists() { - return self.check_existing_path(path, uri).await; + return self.check_fragment_if_enabled(path, uri).await; } self.check_with_fallback_extensions(path, uri).await } - /// Checks an existing path, optionally verifying fragments for HTML files. + /// Checks for the existence of a fragment in a file if fragment checking is + /// enabled. /// /// # Arguments /// @@ -118,8 +89,8 @@ impl FileChecker { /// /// # Returns /// - /// Returns a `Status` indicating the result of the check. - async fn check_existing_path(&self, path: &Path, uri: &Uri) -> Status { + /// Returns a `Status` indicating whether the fragment check was successful. + async fn check_fragment_if_enabled(&self, path: &Path, uri: &Uri) -> Status { if self.include_fragments { self.check_fragment(path, uri).await } else { @@ -127,7 +98,8 @@ impl FileChecker { } } - /// Attempts to find a file by trying different extensions specified in `fallback_extensions`. + /// Attempts to find a file by trying different extensions specified in + /// `fallback_extensions`. /// /// # Arguments /// @@ -142,14 +114,14 @@ impl FileChecker { // If the path already has an extension, try it first if path_buf.extension().is_some() && path_buf.exists() { - return self.check_existing_path(&path_buf, uri).await; + return self.check_fragment_if_enabled(&path_buf, uri).await; } // Try fallback extensions for ext in &self.fallback_extensions { path_buf.set_extension(ext); if path_buf.exists() { - return self.check_existing_path(&path_buf, uri).await; + return self.check_fragment_if_enabled(&path_buf, uri).await; } } @@ -177,3 +149,64 @@ impl FileChecker { } } } + +/// Converts the given URI to a file path. +fn uri_to_path(uri: &Uri) -> Result { + // Make a clone without the fragment for file path checking + let mut url_without_fragment = uri.url.clone(); + url_without_fragment.set_fragment(None); + + let Ok(path) = url_without_fragment.to_file_path() else { + return Err(ErrorKind::InvalidFilePath(uri.clone())); + }; + + Ok(path) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_uri_to_path() { + let cases = vec![ + ( + "file:///path/to/file", + PathBuf::from("/path/to/file"), + "file URI with absolute path", + ), + ( + "file:///path/to/file#fragment", + PathBuf::from("/path/to/file"), + "file URI with absolute path and fragment", + ), + ( + "file:///path/to/file%20with%20spaces", + PathBuf::from("/path/to/file with spaces"), + "file URI with absolute path and spaces", + ), + ( + "file:/path/to/file", + PathBuf::from("/path/to/file"), + "file URI with absolute path (no slashes)", + ), + ( + "file:/path/to/file#fragment", + PathBuf::from("/path/to/file"), + "file URI with absolute path and fragment (no slashes)", + ), + ( + "file:/path/to/file%20with%20spaces", + PathBuf::from("/path/to/file with spaces"), + "file URI with absolute path and spaces (no slashes)", + ), + ]; + + for (uri, expected, name) in cases { + let url = url::Url::parse(uri).unwrap(); + let uri = Uri::from(url); + let path = uri_to_path(&uri).unwrap(); + assert_eq!(path, expected, "{}", name); + } + } +} diff --git a/lychee-lib/src/checker/website.rs b/lychee-lib/src/checker/website.rs index 62cadaff11..5a57fbb016 100644 --- a/lychee-lib/src/checker/website.rs +++ b/lychee-lib/src/checker/website.rs @@ -19,6 +19,10 @@ pub(crate) struct WebsiteChecker { /// The HTTP client used for requests. reqwest_client: reqwest::Client, + /// Whether to check for fragments (e.g. `#foo`) in the response body + /// (if any). + include_fragment: bool, + /// GitHub client used for requests. github_client: Option, @@ -50,6 +54,7 @@ impl WebsiteChecker { retry_wait_time: Duration, max_retries: u64, reqwest_client: reqwest::Client, + include_fragment: bool, accepted: Option>, github_client: Option, require_https: bool, @@ -58,6 +63,7 @@ impl WebsiteChecker { Self { method, reqwest_client, + include_fragment, github_client, plugin_request_chain, max_retries, @@ -88,11 +94,55 @@ impl WebsiteChecker { /// Check a URI using [reqwest](https://github.com/seanmonstar/reqwest). async fn check_default(&self, request: Request) -> Status { match self.reqwest_client.execute(request).await { - Ok(ref response) => Status::new(response, self.accepted.clone()), + Ok(response) => { + if !self.include_fragment { + return Status::new(&response, self.accepted.clone()); + } + + let uri = response.url(); + let fragment = match uri.fragment() { + Some(fragment) => fragment.to_string(), + None => return Status::new(&response, self.accepted.clone()), + }; + + // Create the status object first, because the response + // gets consumed when reading the body + let success_status = Status::new(&response, self.accepted.clone()); + + match response.text().await { + Ok(body) if body.contains(&fragment) => success_status, + _ => Status::Error(ErrorKind::FragmentNotFound(fragment)), + } + } Err(e) => e.into(), } } + // async fn check_default(&self, request: Request) -> Status { + // let response = self.reqwest_client.execute(request).await; + + // match response { + // Ok(ref response) if self.include_fragment => { + // let uri = response.url(); + // let fragment = match uri.fragment() { + // Some(fragment) => fragment.to_string(), + // None => return Status::new(response, self.accepted.clone()), + // }; + + // // We have a fragment, so we need to check if the fragment is present + // // in the response body + + // if !body.contains(&fragment) { + // return Status::Error(ErrorKind::FragmentNotFound(fragment)); + // } else { + // return Status::new(response, self.accepted.clone()); + // } + // } + // Ok(ref response) => Status::new(response, self.accepted.clone()), + // Err(e) => e.into(), + // } + // } + /// Checks the given URI of a website. /// /// # Errors diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index f9566f06e4..8e06d84f58 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -34,7 +34,7 @@ use crate::{ filter::{Excludes, Filter, Includes}, remap::Remaps, utils::fragment_checker::FragmentChecker, - Base, BasicAuthCredentials, ErrorKind, Request, Response, Result, Status, Uri, + BasicAuthCredentials, ErrorKind, Request, Response, Result, Status, Uri, }; /// Default number of redirects before a request is deemed as failed, 5. @@ -242,12 +242,6 @@ pub struct ClientBuilder { /// Response timeout per request in seconds. timeout: Option, - /// Base for resolving paths. - /// - /// E.g. if the base is `/home/user/` and the path is `file.txt`, the - /// resolved path would be `/home/user/file.txt`. - base: Option, - /// Initial time between retries of failed requests. /// /// Defaults to [`DEFAULT_RETRY_WAIT_TIME_SECS`]. @@ -388,6 +382,7 @@ impl ClientBuilder { self.retry_wait_time, self.max_retries, reqwest_client, + self.include_fragments, self.accepted, github_client, self.require_https, @@ -399,11 +394,7 @@ impl ClientBuilder { filter, email_checker: MailChecker::new(), website_checker, - file_checker: FileChecker::new( - self.base, - self.fallback_extensions, - self.include_fragments, - ), + file_checker: FileChecker::new(self.fallback_extensions, self.include_fragments), fragment_checker: FragmentChecker::new(), }) } diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 7eb4c8c82b..03c376879f 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -159,7 +159,7 @@ impl Collector { base.as_ref(), basic_auth_extractor.as_ref(), ); - Result::Ok(stream::iter(requests.into_iter().map(Ok))) + Result::Ok(stream::iter(requests)) } }) .try_flatten() @@ -176,7 +176,7 @@ mod tests { use super::*; use crate::{ mock_server, - test_utils::{load_fixture, mail, path, website}, + test_utils::{load_fixture, mail, website}, types::{FileType, Input, InputSource}, Result, Uri, }; @@ -526,32 +526,38 @@ mod tests { assert_eq!(links, expected_links); } - #[tokio::test] - async fn test_file_path_with_base() { - let base = Base::try_from("/path/to/root").unwrap(); - assert_eq!(base, Base::Local("/path/to/root".into())); - - let input = Input { - source: InputSource::String( - r#" - Index - About - Another - "# - .into(), - ), - file_type_hint: Some(FileType::Html), - excluded_paths: None, - }; - - let links = collect(vec![input], None, Some(base)).await.ok().unwrap(); - - let expected_links = HashSet::from_iter([ - path("/path/to/root/index.html"), - path("/path/to/root/about.html"), - path("/another.html"), - ]); - - assert_eq!(links, expected_links); - } + // #[tokio::test] + // async fn test_file_path_with_root_dir() { + // let root_dir = tempfile::tempdir().unwrap(); + + // let input = Input { + // source: InputSource::String( + // r#" + // Index + // About + // Another + // "# + // .into(), + // ), + // file_type_hint: Some(FileType::Html), + // excluded_paths: None, + // }; + + // let links = collect(vec![input], Some(&root_dir), None).await.ok().unwrap(); + + // assert!(links.len() == 3); + + // // Verify all URLs use file:// scheme + // for link in &links { + // assert_eq!(link.scheme(), "file", "Expected file:// URL but got {link}"); + // } + + // let expected_links = HashSet::from_iter([ + // path("/path/to/root/index.html"), + // path("/path/to/root/about.html"), + // path("/another.html"), + // ]); + + // assert_eq!(links, expected_links); + // } } diff --git a/lychee-lib/src/extract/html/html5gum.rs b/lychee-lib/src/extract/html/html5gum.rs index 7fc03f7c18..1174767a97 100644 --- a/lychee-lib/src/extract/html/html5gum.rs +++ b/lychee-lib/src/extract/html/html5gum.rs @@ -343,9 +343,9 @@ impl Emitter for &mut LinkExtractor { } /// Extract unparsed URL strings from an HTML string. -pub(crate) fn extract_html(buf: &str, include_verbatim: bool) -> Vec { +pub(crate) fn extract_html(html_str: &str, include_verbatim: bool) -> Vec { let mut extractor = LinkExtractor::new(include_verbatim); - let mut tokenizer = Tokenizer::new_with_emitter(buf, &mut extractor); + let mut tokenizer = Tokenizer::new_with_emitter(html_str, &mut extractor); assert!(tokenizer.next().is_none()); extractor .links @@ -355,9 +355,9 @@ pub(crate) fn extract_html(buf: &str, include_verbatim: bool) -> Vec { } /// Extract fragments from id attributes within a HTML string. -pub(crate) fn extract_html_fragments(buf: &str) -> HashSet { +pub(crate) fn extract_html_fragments(html_str: &str) -> HashSet { let mut extractor = LinkExtractor::new(true); - let mut tokenizer = Tokenizer::new_with_emitter(buf, &mut extractor); + let mut tokenizer = Tokenizer::new_with_emitter(html_str, &mut extractor); assert!(tokenizer.next().is_none()); extractor.fragments } diff --git a/lychee-lib/src/test_utils.rs b/lychee-lib/src/test_utils.rs index ff7be8e8c6..99fc6aa2de 100644 --- a/lychee-lib/src/test_utils.rs +++ b/lychee-lib/src/test_utils.rs @@ -39,16 +39,6 @@ pub(crate) fn website(url: &str) -> Uri { Uri::from(Url::parse(url).expect("Expected valid Website URI")) } -/// Helper method to convert a `std::path::Path `into a URI with the `file` scheme -/// -/// # Panic -/// -/// This panics if the given path is not absolute, so it should only be used for -/// testing -pub(crate) fn path>(path: P) -> Uri { - Uri::from(Url::from_file_path(path.as_ref()).expect("Expected valid File URI")) -} - /// Creates a mail URI from a string pub(crate) fn mail(address: &str) -> Uri { if address.starts_with("mailto:") { diff --git a/lychee-lib/src/types/base.rs b/lychee-lib/src/types/base.rs index 4c68900c18..62692316d9 100644 --- a/lychee-lib/src/types/base.rs +++ b/lychee-lib/src/types/base.rs @@ -1,33 +1,28 @@ use reqwest::Url; use serde::{Deserialize, Serialize}; -use std::{convert::TryFrom, path::PathBuf}; +use std::convert::TryFrom; use crate::{ErrorKind, InputSource}; -/// When encountering links without a full domain in a document, +/// When encountering links without a domain in a document, /// the base determines where this resource can be found. -/// Both, local and remote targets are supported. +/// +/// This can be only be a remote URL. +/// For local paths, use `root-dir` #[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone)] #[allow(variant_size_differences)] #[serde(try_from = "String")] -pub enum Base { - /// Local file path pointing to root directory - Local(PathBuf), - /// Remote URL pointing to a website homepage - Remote(Url), -} +pub struct Base(Url); impl Base { /// Join link with base url #[must_use] pub(crate) fn join(&self, link: &str) -> Option { - match self { - Self::Remote(url) => url.join(link).ok(), - Self::Local(path) => { - let full_path = path.join(link); - Url::from_file_path(full_path).ok() - } + // Ensure the link doesn't contain a scheme + if Url::parse(link).is_ok() { + return None; } + self.0.join(link).ok() } pub(crate) fn from_source(source: &InputSource) -> Option { @@ -40,7 +35,7 @@ impl Base { base_url.set_fragment(None); // We keep the username and password intact - Some(Base::Remote(*base_url)) + Some(Base(*base_url)) } // other inputs do not have a URL to extract a base _ => None, @@ -52,16 +47,16 @@ impl TryFrom<&str> for Base { type Error = ErrorKind; fn try_from(value: &str) -> Result { - if let Ok(url) = Url::parse(value) { - if url.cannot_be_a_base() { - return Err(ErrorKind::InvalidBase( - value.to_string(), - "The given URL cannot be a base".to_string(), - )); - } - return Ok(Self::Remote(url)); + let url = Url::parse(value).map_err(|_| { + ErrorKind::InvalidBase(value.to_string(), "The given URL is invalid".to_string()) + })?; + if url.cannot_be_a_base() { + return Err(ErrorKind::InvalidBase( + value.to_string(), + "The given URL cannot be a base".to_string(), + )); } - Ok(Self::Local(PathBuf::from(value))) + return Ok(Self(url)); } } @@ -82,10 +77,7 @@ mod test_base { #[test] fn test_valid_remote() -> Result<()> { let base = Base::try_from("https://endler.dev")?; - assert_eq!( - base, - Base::Remote(Url::parse("https://endler.dev").unwrap()) - ); + assert_eq!(base, Base(Url::parse("https://endler.dev").unwrap())); Ok(()) } @@ -95,22 +87,15 @@ mod test_base { } #[test] - fn test_valid_local_path_string_as_base() -> Result<()> { + fn test_local_path_string_is_invalid_base() -> Result<()> { let cases = vec!["/tmp/lychee", "/tmp/lychee/", "tmp/lychee/"]; for case in cases { - assert_eq!(Base::try_from(case)?, Base::Local(PathBuf::from(case))); + assert!(Base::try_from(case).is_err()); } Ok(()) } - #[test] - fn test_valid_local() -> Result<()> { - let dir = tempfile::tempdir().unwrap(); - Base::try_from(dir.as_ref().to_str().unwrap())?; - Ok(()) - } - #[test] fn test_get_base_from_url() { for (url, expected) in [ @@ -126,8 +111,31 @@ mod test_base { let url = Url::parse(url).unwrap(); let source = InputSource::RemoteUrl(Box::new(url.clone())); let base = Base::from_source(&source); - let expected = Base::Remote(Url::parse(expected).unwrap()); + let expected = Base(Url::parse(expected).unwrap()); assert_eq!(base, Some(expected)); } } + + #[test] + fn test_join_remote() { + let base = Base(Url::parse("https://example.com").unwrap()); + let url = base.join("foo/bar").unwrap(); + assert_eq!(url, Url::parse("https://example.com/foo/bar").unwrap()); + } + + #[test] + fn test_join_query_params() { + let base = Base(Url::parse("https://example.com").unwrap()); + let url = base.join("foo/bar?query=something").unwrap(); + assert_eq!( + url, + Url::parse("https://example.com/foo/bar?query=something").unwrap() + ); + } + + #[test] + fn test_join_data_invalid() { + let base = Base(Url::parse("https://example.com").unwrap()); + assert!(base.join("data:text/plain,Hello?World#").is_none()); + } } diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index cbcfefe5e9..795312a1e3 100644 --- a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -17,9 +17,11 @@ pub enum ErrorKind { /// Network error while handling request #[error("Network error")] NetworkRequest(#[source] reqwest::Error), + /// Cannot read the body of the received response #[error("Error reading response body: {0}")] ReadResponseBody(#[source] reqwest::Error), + /// The network client required for making requests cannot be created #[error("Error creating request client: {0}")] BuildRequestClient(#[source] reqwest::Error), @@ -82,12 +84,12 @@ pub enum ErrorKind { #[error("Header could not be parsed.")] InvalidHeader(#[from] http::header::InvalidHeaderValue), - /// The given string can not be parsed into a valid base URL or base directory - #[error("Error with base dir `{0}` : {1}")] + /// The given string can not be parsed into a valid base URL + #[error("Error with base URL `{0}` : {1}")] InvalidBase(String, String), /// Cannot join the given text with the base URL - #[error("Cannot join '{0}' with the base URL")] + #[error("Cannot join '{0}' with base URL")] InvalidBaseJoin(String), /// Cannot convert the given path to a URI @@ -161,6 +163,10 @@ pub enum ErrorKind { /// Status code selector parse error #[error("Status code range error")] StatusCodeSelectorError(#[from] StatusCodeSelectorError), + + /// The given fragment does not exist in the URL + #[error("Fragment not found: {0}")] + FragmentNotFound(String), } impl ErrorKind { @@ -276,7 +282,14 @@ impl PartialEq for ErrorKind { (Self::InvalidBase(b1, e1), Self::InvalidBase(b2, e2)) => b1 == b2 && e1 == e2, (Self::InvalidUrlRemap(r1), Self::InvalidUrlRemap(r2)) => r1 == r2, (Self::EmptyUrl, Self::EmptyUrl) => true, - + (Self::InvalidBaseJoin(s1), Self::InvalidBaseJoin(s2)) => s1 == s2, + (Self::InvalidPathToUri(s1), Self::InvalidPathToUri(s2)) => s1 == s2, + (Self::RootDirMustBeAbsolute(s1), Self::RootDirMustBeAbsolute(s2)) => s1 == s2, + (Self::UnsupportedUriType(s1), Self::UnsupportedUriType(s2)) => s1 == s2, + (Self::StatusCodeSelectorError(e1), Self::StatusCodeSelectorError(e2)) => { + e1.to_string() == e2.to_string() + } + (Self::FragmentNotFound(s1), Self::FragmentNotFound(s2)) => s1 == s2, _ => false, } } @@ -329,6 +342,7 @@ impl Hash for ErrorKind { Self::BasicAuthExtractorError(e) => e.to_string().hash(state), Self::Cookies(e) => e.to_string().hash(state), Self::StatusCodeSelectorError(e) => e.to_string().hash(state), + Self::FragmentNotFound(s) => s.hash(state), } } } diff --git a/lychee-lib/src/utils/fragment_checker.rs b/lychee-lib/src/utils/fragment_checker.rs index 1cc6c772c8..dc9461020f 100644 --- a/lychee-lib/src/utils/fragment_checker.rs +++ b/lychee-lib/src/utils/fragment_checker.rs @@ -16,8 +16,8 @@ use url::Url; /// Holds a cache of fragments for a given URL. /// /// Fragments, also known as anchors, are used to link to a specific -/// part of a page. For example, the URL `https://example.com#foo` -/// will link to the element with the `id` of `foo`. +/// part of a document. For example, the URL `https://example.com#foo` +/// will link to the element with the `id` of `foo` in the document. /// /// This cache is used to avoid having to re-parse the same file /// multiple times when checking if a given URL contains a fragment. @@ -39,16 +39,16 @@ impl FragmentChecker { /// Checks if the given path contains the given fragment. /// - /// Returns false, if there is a fragment in the link and the path is to a - /// Markdown file, which doesn't contain the given fragment. + /// Returns false, if there is a fragment in the link and the path points to + /// a file, which doesn't contain said fragment. /// /// In all other cases, returns true. - pub(crate) async fn check(&self, path: &Path, url: &Url) -> Result { - let Some(fragment) = url.fragment() else { + pub(crate) async fn check(&self, path: &Path, url_with_fragment: &Url) -> Result { + let Some(fragment) = url_with_fragment.fragment() else { return Ok(true); }; let mut fragment_decoded = percent_decode_str(fragment).decode_utf8()?; - let url_without_frag = Self::remove_fragment(url.clone()); + let url_without_frag = Self::remove_fragment(url_with_fragment.clone()); let file_type = FileType::from(path); let extractor = match file_type { @@ -75,6 +75,9 @@ impl FragmentChecker { } } + /// Removes the fragment from the given URL. + /// + /// Sets the fragment to `None` and returns the URL as a string. fn remove_fragment(mut url: Url) -> String { url.set_fragment(None); url.into() diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index ca66f7d857..3a02de6446 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -1,4 +1,3 @@ -use log::warn; use percent_encoding::percent_decode_str; use reqwest::Url; use std::{ @@ -112,50 +111,7 @@ fn create_uri_from_file_path( }) } -/// Truncate the source in case it gets too long -/// -/// This is only needed for string inputs. -/// For other inputs, the source is simply a "label" (an enum variant). -// TODO: This would not be necessary if we used `Cow` for the source. -fn truncate_source(source: &InputSource) -> InputSource { - const MAX_TRUNCATED_STR_LEN: usize = 100; - - match source { - InputSource::String(s) => { - InputSource::String(s.chars().take(MAX_TRUNCATED_STR_LEN).collect()) - } - other => other.clone(), - } -} - -/// Create requests out of the collected URLs. -/// Only keeps "valid" URLs. This filters out anchors for example. -/// -/// If a URLs is ignored (because of the current settings), -/// it will not be added to the `HashSet`. -pub(crate) fn create( - uris: Vec, - source: &InputSource, - root_dir: Option<&PathBuf>, - base: Option<&Base>, - extractor: Option<&BasicAuthExtractor>, -) -> HashSet { - let base = base.cloned().or_else(|| Base::from_source(source)); - - uris.into_iter() - .filter_map(|raw_uri| { - match create_request(&raw_uri, source, root_dir, base.as_ref(), extractor) { - Ok(request) => Some(request), - Err(e) => { - warn!("Error creating request: {:?}", e); - None - } - } - }) - .collect() -} - -/// Create a URI from a path +/// Create a URL from a path /// /// `src_path` is the path of the source file. /// `dest_path` is the path being linked to. @@ -174,7 +130,6 @@ fn resolve_and_create_url( let (dest_path, fragment) = url::remove_get_params_and_separate_fragment(dest_path); // Decode the destination path to avoid double-encoding - // This addresses the issue mentioned in the original comment about double-encoding let decoded_dest = percent_decode_str(dest_path).decode_utf8()?; let Ok(Some(resolved_path)) = path::resolve( @@ -193,6 +148,40 @@ fn resolve_and_create_url( Ok(url) } +/// Truncate the source in case it gets too long +/// +/// This is only needed for string inputs. +/// For other inputs, the source is simply a "label" (an enum variant). +// TODO: This would not be necessary if we used `Cow` for the source. +fn truncate_source(source: &InputSource) -> InputSource { + const MAX_TRUNCATED_STR_LEN: usize = 100; + + match source { + InputSource::String(s) => { + InputSource::String(s.chars().take(MAX_TRUNCATED_STR_LEN).collect()) + } + other => other.clone(), + } +} + +/// Create requests out of the collected URLs. +/// Only keeps "valid" URLs. This filters out anchors for example. +/// +/// If a URLs is ignored (because of the current settings), +/// it will not be added to the `HashSet`. +pub(crate) fn create( + uris: Vec, + source: &InputSource, + root_dir: Option<&PathBuf>, + base: Option<&Base>, + extractor: Option<&BasicAuthExtractor>, +) -> HashSet> { + let base = base.cloned().or_else(|| Base::from_source(source)); + uris.into_iter() + .map(|raw_uri| create_request(&raw_uri, source, root_dir, base.as_ref(), extractor)) + .collect() +} + fn prepend_root_dir_if_absolute_local_link(text: &str, root_dir: Option<&PathBuf>) -> String { if text.starts_with('/') { if let Some(path) = root_dir { @@ -232,7 +221,8 @@ mod tests { assert_eq!(requests.len(), 1); assert!(requests .iter() - .any(|r| r.uri.url.as_str() == "https://example.com/path/relative.html")); + .any(|r| r.as_ref().unwrap().uri.url.as_str() + == "https://example.com/path/relative.html")); } #[test] @@ -246,7 +236,7 @@ mod tests { assert_eq!(requests.len(), 1); assert!(requests .iter() - .any(|r| r.uri.url.as_str() == "https://another.com/page")); + .any(|r| r.as_ref().unwrap().uri.url.as_str() == "https://another.com/page")); } #[test] @@ -260,7 +250,7 @@ mod tests { assert_eq!(requests.len(), 1); assert!(requests .iter() - .any(|r| r.uri.url.as_str() == "https://example.com/root-relative")); + .any(|r| r.as_ref().unwrap().uri.url.as_str() == "https://example.com/root-relative")); } #[test] @@ -274,7 +264,7 @@ mod tests { assert_eq!(requests.len(), 1); assert!(requests .iter() - .any(|r| r.uri.url.as_str() == "https://example.com/parent")); + .any(|r| r.as_ref().unwrap().uri.url.as_str() == "https://example.com/parent")); } #[test] @@ -286,9 +276,8 @@ mod tests { let requests = create(uris, &source, None, Some(&base), None); assert_eq!(requests.len(), 1); - assert!(requests - .iter() - .any(|r| r.uri.url.as_str() == "https://example.com/path/page.html#fragment")); + assert!(requests.iter().any(|r| r.as_ref().unwrap().uri.url.as_str() + == "https://example.com/path/page.html#fragment")); } #[test] @@ -302,7 +291,7 @@ mod tests { assert_eq!(requests.len(), 1); assert!(requests .iter() - .any(|r| r.uri.url.as_str() == "file:///some/relative.html")); + .any(|r| r.as_ref().unwrap().uri.url.as_str() == "file:///some/relative.html")); } #[test] @@ -316,7 +305,7 @@ mod tests { assert_eq!(requests.len(), 1); assert!(requests .iter() - .any(|r| r.uri.url.as_str() == "https://another.com/page")); + .any(|r| r.as_ref().unwrap().uri.url.as_str() == "https://another.com/page")); } #[test] @@ -330,7 +319,7 @@ mod tests { assert_eq!(requests.len(), 1); assert!(requests .iter() - .any(|r| r.uri.url.as_str() == "file:///tmp/lychee/root-relative")); + .any(|r| r.as_ref().unwrap().uri.url.as_str() == "file:///tmp/lychee/root-relative")); } #[test] @@ -344,7 +333,7 @@ mod tests { assert_eq!(requests.len(), 1); assert!(requests .iter() - .any(|r| r.uri.url.as_str() == "file:///parent")); + .any(|r| r.as_ref().unwrap().uri.url.as_str() == "file:///parent")); } #[test] @@ -358,7 +347,7 @@ mod tests { assert_eq!(requests.len(), 1); assert!(requests .iter() - .any(|r| r.uri.url.as_str() == "file:///some/page.html#fragment")); + .any(|r| r.as_ref().unwrap().uri.url.as_str() == "file:///some/page.html#fragment")); } #[test] @@ -373,7 +362,8 @@ mod tests { assert_eq!(requests.len(), 1); assert!(requests .iter() - .any(|r| r.uri.url.as_str() == "https://example.com/path/relative.html")); + .any(|r| r.as_ref().unwrap().uri.url.as_str() + == "https://example.com/path/relative.html")); } #[test] @@ -388,7 +378,7 @@ mod tests { assert_eq!(requests.len(), 1); assert!(requests .iter() - .any(|r| r.uri.url.as_str() == "https://another.com/page")); + .any(|r| r.as_ref().unwrap().uri.url.as_str() == "https://another.com/page")); } #[test] @@ -401,9 +391,8 @@ mod tests { let requests = create(uris, &source, Some(&root_dir), Some(&base), None); assert_eq!(requests.len(), 1); - assert!(requests - .iter() - .any(|r| r.uri.url.as_str() == "https://example.com/tmp/lychee/root-relative")); + assert!(requests.iter().any(|r| r.as_ref().unwrap().uri.url.as_str() + == "https://example.com/tmp/lychee/root-relative")); } #[test] @@ -418,7 +407,7 @@ mod tests { assert_eq!(requests.len(), 1); assert!(requests .iter() - .any(|r| r.uri.url.as_str() == "https://example.com/parent")); + .any(|r| r.as_ref().unwrap().uri.url.as_str() == "https://example.com/parent")); } #[test] @@ -431,9 +420,8 @@ mod tests { let requests = create(uris, &source, Some(&root_dir), Some(&base), None); assert_eq!(requests.len(), 1); - assert!(requests - .iter() - .any(|r| r.uri.url.as_str() == "https://example.com/path/page.html#fragment")); + assert!(requests.iter().any(|r| r.as_ref().unwrap().uri.url.as_str() + == "https://example.com/path/page.html#fragment")); } #[test] @@ -446,19 +434,19 @@ mod tests { assert_eq!(requests.len(), 1); assert!(requests .iter() - .any(|r| r.uri.url.as_str() == "https://example.com/page")); + .any(|r| r.as_ref().unwrap().uri.url.as_str() == "https://example.com/page")); } #[test] fn test_create_request_from_relative_file_path() { - let base = Base::Local(PathBuf::from("/tmp/lychee")); + let root_dir = PathBuf::from("/tmp/lychee"); let input_source = InputSource::FsPath(PathBuf::from("page.html")); let actual = create_request( &RawUri::from("file.html"), &input_source, + Some(&root_dir), None, - Some(&base), None, ) .unwrap(); @@ -479,15 +467,15 @@ mod tests { #[test] fn test_create_request_from_absolute_file_path() { - let base = Base::Local(PathBuf::from("/tmp/lychee")); + let root_dir = PathBuf::from("/tmp/lychee"); let input_source = InputSource::FsPath(PathBuf::from("/tmp/lychee/page.html")); - // Use an absolute path that's outside the base directory + // Use an absolute path that's outside the root directory let actual = create_request( &RawUri::from("/usr/local/share/doc/example.html"), &input_source, + Some(&root_dir), None, - Some(&base), None, ) .unwrap(); @@ -508,22 +496,22 @@ mod tests { #[test] fn test_parse_relative_path_into_uri() { - let base = Base::Local(PathBuf::from("/tmp/lychee")); + let root_dir = PathBuf::from("/tmp/lychee"); let source = InputSource::String(String::new()); let raw_uri = RawUri::from("relative.html"); - let uri = try_parse_into_uri(&raw_uri, &source, None, Some(&base)).unwrap(); + let uri = try_parse_into_uri(&raw_uri, &source, Some(&root_dir), None).unwrap(); assert_eq!(uri.url.as_str(), "file:///tmp/lychee/relative.html"); } #[test] fn test_parse_absolute_path_into_uri() { - let base = Base::Local(PathBuf::from("/tmp/lychee")); + let root_dir = PathBuf::from("/tmp/lychee"); let source = InputSource::String(String::new()); let raw_uri = RawUri::from("absolute.html"); - let uri = try_parse_into_uri(&raw_uri, &source, None, Some(&base)).unwrap(); + let uri = try_parse_into_uri(&raw_uri, &source, Some(&root_dir), None).unwrap(); assert_eq!(uri.url.as_str(), "file:///tmp/lychee/absolute.html"); } @@ -557,4 +545,43 @@ mod tests { let result = prepend_root_dir_if_absolute_local_link(text, None); assert_eq!(result, "relative/path"); } + + #[test] + fn test_parse_url_with_anchor() { + let base = Base::try_from("https://example.com/path/page.html").unwrap(); + let source = InputSource::String(String::new()); + + let raw_uri = RawUri::from("#fragment"); + let uri = try_parse_into_uri(&raw_uri, &source, None, Some(&base)).unwrap(); + + assert_eq!( + uri.url.as_str(), + "https://example.com/path/page.html#fragment" + ); + } + + #[test] + fn test_parse_url_to_different_page_with_anchor() { + let base = Base::try_from("https://example.com/path/page.html").unwrap(); + let source = InputSource::String(String::new()); + + let raw_uri = RawUri::from("other-page.html#fragment"); + let uri = try_parse_into_uri(&raw_uri, &source, None, Some(&base)).unwrap(); + + assert_eq!( + uri.url.as_str(), + "https://example.com/path/other-page.html#fragment" + ); + } + + #[test] + fn test_parse_url_from_path_with_anchor() { + let root_dir = PathBuf::from("/tmp/lychee"); + let source = InputSource::FsPath(PathBuf::from("/some/page.html")); + + let raw_uri = RawUri::from("#fragment"); + let uri = try_parse_into_uri(&raw_uri, &source, Some(&root_dir), None).unwrap(); + + assert_eq!(uri.url.as_str(), "file:///some/page.html#fragment"); + } } From f1f2e9557955007a22cee5745b2ad137710b5d3f Mon Sep 17 00:00:00 2001 From: Matthias Date: Tue, 14 Jan 2025 01:48:00 +0100 Subject: [PATCH 2/8] Update and fix some test cases. Still WIP --- fixtures/encoded_fragments/about.html | 7 - fixtures/encoded_fragments/index.html | 7 - fixtures/relative_paths/releases/Releases | 0 fixtures/relative_paths/releases/v9_7/Current | 0 .../{docs => }/releases/v9_7/index.html | 6 +- lychee-bin/tests/cli.rs | 139 ++++++++---------- lychee-lib/src/utils/request.rs | 114 +++++++++++--- 7 files changed, 151 insertions(+), 122 deletions(-) delete mode 100644 fixtures/encoded_fragments/about.html delete mode 100644 fixtures/encoded_fragments/index.html create mode 100644 fixtures/relative_paths/releases/Releases create mode 100644 fixtures/relative_paths/releases/v9_7/Current rename fixtures/relative_paths/{docs => }/releases/v9_7/index.html (63%) diff --git a/fixtures/encoded_fragments/about.html b/fixtures/encoded_fragments/about.html deleted file mode 100644 index 6d8a7041c9..0000000000 --- a/fixtures/encoded_fragments/about.html +++ /dev/null @@ -1,7 +0,0 @@ - - - -

About Page

- - - diff --git a/fixtures/encoded_fragments/index.html b/fixtures/encoded_fragments/index.html deleted file mode 100644 index 903cf3e961..0000000000 --- a/fixtures/encoded_fragments/index.html +++ /dev/null @@ -1,7 +0,0 @@ - - - - About - - - diff --git a/fixtures/relative_paths/releases/Releases b/fixtures/relative_paths/releases/Releases new file mode 100644 index 0000000000..e69de29bb2 diff --git a/fixtures/relative_paths/releases/v9_7/Current b/fixtures/relative_paths/releases/v9_7/Current new file mode 100644 index 0000000000..e69de29bb2 diff --git a/fixtures/relative_paths/docs/releases/v9_7/index.html b/fixtures/relative_paths/releases/v9_7/index.html similarity index 63% rename from fixtures/relative_paths/docs/releases/v9_7/index.html rename to fixtures/relative_paths/releases/v9_7/index.html index f0505d51ba..3aa6f02a23 100644 --- a/fixtures/relative_paths/docs/releases/v9_7/index.html +++ b/fixtures/relative_paths/releases/v9_7/index.html @@ -1,11 +1,9 @@ - VPN - Webserver - Releases Current - System Config + Releases Maintenance + VPN diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index db3d1fd2a5..224adc59dd 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -413,24 +413,6 @@ mod cli { .stdout(contains("2 Errors")); } - #[test] - fn test_resolve_paths_from_root_dir_and_base_url() { - let mut cmd = main_command(); - let dir = fixtures_path(); - - cmd.arg("--offline") - .arg("--root-dir") - .arg("/resolve_paths") - .arg("--base") - .arg(&dir) - .arg(dir.join("resolve_paths").join("index.html")) - .env_clear() - .assert() - .success() - .stdout(contains("3 Total")) - .stdout(contains("3 OK")); - } - #[test] fn test_youtube_quirk() { let url = "https://www.youtube.com/watch?v=NlKuICiT470&list=PLbWDhxwM_45mPVToqaIZNbZeIzFchsKKQ&index=7"; @@ -1812,7 +1794,6 @@ mod cli { fn test_fragments() { let mut cmd = main_command(); let input = fixtures_path().join("fragments"); - cmd.arg("--verbose") .arg("--include-fragments") .arg(input) @@ -1836,9 +1817,20 @@ mod cli { .stderr(contains( "fixtures/fragments/file1.md#kebab-case-fragment-1", )) + .stderr(contains( + "fixtures/fragments/file1.md#f%C3%BCnf-s%C3%BC%C3%9Fe-%C3%A4pfel", + )) + .stderr(contains("fixtures/fragments/file1.md#IGNORE-CASING")) + .stderr(contains("fixtures/fragments/file1.md#explicit-fragment")) + .stderr(contains( + "fixtures/fragments/file.html#Upper-%C3%84%C3%96%C3%B6", + )) + .stderr(contains( + "fixtures/fragments/file.html#tangent%3A-kustomize", + )) + .stderr(contains("fixtures/fragments/file.html#in-THE-begiNNing")) .stdout(contains("21 Total")) .stdout(contains("17 OK")) - // 4 failures because of missing fragments .stdout(contains("4 Errors")); } @@ -2008,65 +2000,50 @@ mod cli { .stdout(contains("https://lychee.cli.rs/#missing-section | Fragment not found: missing-section")); } - // #[test] - // fn test_relative_path_resolution() { - // let mut cmd = main_command(); - // let dir = fixtures_path().join("relative_paths"); - // cmd.arg("--base") - // .arg(&dir) - // .arg(dir.join("docs/releases/v9_7/index.html")) - // .assert() - // .success() - // .stderr(predicates::str::is_empty()) - // .stdout(contains("6 Total")) - // .stdout(contains("6 OK")) - // .stdout(contains("0 Errors")); - // } - - // #[test] - // fn test_base_url_absolute_paths() { - // let mut cmd = main_command(); - // let dir = fixtures_path().join("absolute_paths"); - // cmd.arg("--base") - // .arg("https://example.com") - // .arg("--dump") // Add dump flag to avoid actual requests - // .arg(dir.join("index.html")) - // .assert() - // .success() - // .stdout(contains("https://example.com/about")) - // .stdout(contains("https://example.com/products")) - // .stdout(contains("https://example.com/contact")) - // .stderr(predicates::str::is_empty()); - // } - - // #[test] - // fn test_html_fragment_detection() { - // let mut cmd = main_command(); - // let dir = fixtures_path().join("html_fragments"); - // cmd.arg("--include-fragments") - // .arg(dir.join("page.html")) - // .assert() - // .failure() - // .stdout(contains("section.html#non-existent-heading")) - // .stdout(contains("page.html#missing-section")) - // .stdout(contains("Cannot find fragment")) - // .stdout(contains("3 Total")) - // .stdout(contains("1 OK")) - // .stdout(contains("2 Errors")); - // } - - // #[test] - // fn test_encoded_fragments() { - // let mut cmd = main_command(); - // let dir = fixtures_path().join("encoded_fragments"); - // cmd.arg("--include-fragments") - // .arg(dir.join("index.html")) - // .assert() - // .failure() - // .stdout(contains("Cannot convert path '/about.html#about' to a URI")) - // .stdout(contains("#about")) - // .stdout(contains("1 Total")) - // .stdout(contains("0 OK")) - // .stdout(contains("1 Error")); - // } + #[test] + fn test_relative_path_resolution() { + let mut cmd = main_command(); + let dir = fixtures_path().join("relative_paths"); + cmd.arg("--root-dir") + .arg(&dir) + .arg(dir.join("releases/v9_7/index.html")) + .assert() + .success() + .stderr(predicates::str::is_empty()) + .stdout(contains("4 Total")) + .stdout(contains("4 OK")) + .stdout(contains("0 Errors")); + } + + #[test] + fn test_base_url_absolute_paths() { + let mut cmd = main_command(); + let dir = fixtures_path().join("absolute_paths"); + cmd.arg("--base") + .arg("https://example.com") + .arg("--dump") // Add dump flag to avoid actual requests + .arg(dir.join("index.html")) + .assert() + .success() + .stdout(contains("https://example.com/about")) + .stdout(contains("https://example.com/products")) + .stdout(contains("https://example.com/contact")) + .stderr(predicates::str::is_empty()); + } + + #[test] + fn test_html_fragment_detection() { + let mut cmd = main_command(); + let dir = fixtures_path().join("html_fragments"); + cmd.arg("--include-fragments") + .arg(dir.join("page.html")) + .assert() + .failure() + .stdout(contains("section.html#non-existent-heading")) + .stdout(contains("page.html#missing-section")) + .stdout(contains("Cannot find fragment")) + .stdout(contains("3 Total")) + .stdout(contains("1 OK")) + .stdout(contains("2 Errors")); + } } diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index 3a02de6446..7b63039143 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -54,23 +54,78 @@ fn try_parse_into_uri( root_dir: Option<&PathBuf>, base: Option<&Base>, ) -> Result { - let text = prepend_root_dir_if_absolute_local_link(&raw_uri.text, root_dir); - let uri = match Uri::try_from(raw_uri.clone()) { - Ok(uri) => uri, - Err(_) => match base { - Some(base_url) => match base_url.join(&text) { - Some(url) => Uri { url }, - None => return Err(ErrorKind::InvalidBaseJoin(text.clone())), - }, - None => match source { - InputSource::FsPath(root) => { - create_uri_from_file_path(root, &text, root_dir.is_none())? + // First try direct URI parsing (handles explicit URLs) + if let Ok(uri) = Uri::try_from(raw_uri.clone()) { + return Ok(uri); + } + + let text = raw_uri.text.clone(); + + // Base URL takes precedence - all paths become URLs + if let Some(base_url) = base { + // For absolute paths with root_dir, insert root_dir after base + if text.starts_with('/') && root_dir.is_some() { + let root_path = root_dir.unwrap().to_string_lossy(); + let combined = format!("{}{}", root_path, text); + return base_url + .join(&combined) + .map(|url| Uri { url }) + .ok_or_else(|| ErrorKind::InvalidBaseJoin(combined)); + } + // Otherwise directly join with base + return base_url + .join(&text) + .map(|url| Uri { url }) + .ok_or_else(|| ErrorKind::InvalidBaseJoin(text)); + } + + // No base URL - handle as filesystem paths + match source { + InputSource::FsPath(source_path) => { + let target_path = if text.starts_with('/') && root_dir.is_some() { + // Absolute paths: resolve via root_dir + let mut path = root_dir.unwrap().clone(); + path.push(&text[1..]); + path + } else { + // If text is just a fragment, we need to append it to the source path + if is_anchor(&text) { + return Url::from_file_path(&source_path) + .map(|url| Uri { url }) + .map_err(|_| ErrorKind::InvalidUrlFromPath(source_path.clone())) + .map(|mut uri| { + uri.url.set_fragment(Some(&text[1..])); + uri + }); } - _ => return Err(ErrorKind::UnsupportedUriType(text)), - }, - }, - }; - Ok(uri) + + // Relative paths: resolve relative to source + match path::resolve( + source_path, + &PathBuf::from(&text), + false, // don't ignore absolute local links since we handled that case already + ) { + Ok(Some(resolved)) => resolved, + _ => return Err(ErrorKind::InvalidPathToUri(text)), + } + }; + + Url::from_file_path(&target_path) + .map(|url| Uri { url }) + .map_err(|_| ErrorKind::InvalidUrlFromPath(target_path)) + } + InputSource::String(s) => Err(ErrorKind::UnsupportedUriType(s.clone())), + InputSource::RemoteUrl(url) => { + let base_url = Url::parse(url.as_str()).map_err(|e| { + ErrorKind::ParseUrl(e, format!("Could not parse base URL: {}", url)) + })?; + base_url + .join(&text) + .map(|url| Uri { url }) + .map_err(|_| ErrorKind::InvalidBaseJoin(text)) + } + _ => Err(ErrorKind::UnsupportedUriType(text)), + } } // Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs @@ -331,9 +386,22 @@ mod tests { let requests = create(uris, &source, Some(&root_dir), None, None); assert_eq!(requests.len(), 1); - assert!(requests - .iter() - .any(|r| r.as_ref().unwrap().uri.url.as_str() == "file:///parent")); + // assert!(requests + // .iter() + // .any(|r| r.as_ref().unwrap().uri.url.as_str() == "file:///parent")); + + assert_eq!( + requests + .iter() + .next() + .unwrap() + .as_ref() + .unwrap() + .uri + .url + .as_str(), + "file:///parent", + ); } #[test] @@ -467,12 +535,12 @@ mod tests { #[test] fn test_create_request_from_absolute_file_path() { - let root_dir = PathBuf::from("/tmp/lychee"); - let input_source = InputSource::FsPath(PathBuf::from("/tmp/lychee/page.html")); + let root_dir = PathBuf::from("/foo/bar"); + let input_source = InputSource::FsPath(PathBuf::from("/foo/bar/page.html")); // Use an absolute path that's outside the root directory let actual = create_request( - &RawUri::from("/usr/local/share/doc/example.html"), + &RawUri::from("/baz/example.html"), &input_source, Some(&root_dir), None, @@ -484,7 +552,7 @@ mod tests { actual, Request::new( Uri { - url: Url::from_file_path("/usr/local/share/doc/example.html").unwrap() + url: Url::from_file_path("/foo/bar/baz/example.html").unwrap() }, input_source, None, From 0749d77d2ebe7c43db1c5dc6d3356c0708f55bff Mon Sep 17 00:00:00 2001 From: Matthias Date: Tue, 14 Jan 2025 01:55:07 +0100 Subject: [PATCH 3/8] fix remaining tests; cleanup --- lychee-lib/src/utils/request.rs | 63 ++++++++++++--------------------- 1 file changed, 22 insertions(+), 41 deletions(-) diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index 7b63039143..4a7994e7ac 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -90,7 +90,7 @@ fn try_parse_into_uri( } else { // If text is just a fragment, we need to append it to the source path if is_anchor(&text) { - return Url::from_file_path(&source_path) + return Url::from_file_path(source_path) .map(|url| Uri { url }) .map_err(|_| ErrorKind::InvalidUrlFromPath(source_path.clone())) .map(|mut uri| { @@ -99,12 +99,15 @@ fn try_parse_into_uri( }); } - // Relative paths: resolve relative to source - match path::resolve( - source_path, - &PathBuf::from(&text), - false, // don't ignore absolute local links since we handled that case already - ) { + // If source_path is relative and we have a root_dir, + // we need to resolve both source_path and text relative to root_dir + let resolved_source = if !source_path.is_absolute() && root_dir.is_some() { + root_dir.unwrap().join(source_path) + } else { + source_path.to_path_buf() + }; + + match path::resolve(&resolved_source, &PathBuf::from(&text), false) { Ok(Some(resolved)) => resolved, _ => return Err(ErrorKind::InvalidPathToUri(text)), } @@ -114,7 +117,18 @@ fn try_parse_into_uri( .map(|url| Uri { url }) .map_err(|_| ErrorKind::InvalidUrlFromPath(target_path)) } - InputSource::String(s) => Err(ErrorKind::UnsupportedUriType(s.clone())), + InputSource::String(s) => { + // If we have a root_dir, we can still resolve paths against it + // even for string sources + if let Some(root) = root_dir { + let target_path = root.join(&text); + return Url::from_file_path(&target_path) + .map(|url| Uri { url }) + .map_err(|_| ErrorKind::InvalidUrlFromPath(target_path)); + } + // Otherwise, we can't resolve the path + Err(ErrorKind::UnsupportedUriType(s.clone())) + } InputSource::RemoteUrl(url) => { let base_url = Url::parse(url.as_str()).map_err(|e| { ErrorKind::ParseUrl(e, format!("Could not parse base URL: {}", url)) @@ -133,39 +147,6 @@ pub(crate) fn is_anchor(text: &str) -> bool { text.starts_with('#') } -/// Create a URI from a file path -/// -/// # Errors -/// -/// - If the link text is an anchor and the file name cannot be extracted from the file path. -/// - If the path cannot be resolved. -/// - If the resolved path cannot be converted to a URL. -fn create_uri_from_file_path( - file_path: &Path, - link_text: &str, - ignore_absolute_local_links: bool, -) -> Result { - let target_path = if is_anchor(link_text) { - // For anchors, we need to append the anchor to the file name. - let file_name = file_path - .file_name() - .and_then(|name| name.to_str()) - .ok_or_else(|| ErrorKind::InvalidFile(file_path.to_path_buf()))?; - - format!("{file_name}{link_text}") - } else { - link_text.to_string() - }; - let Ok(constructed_url) = - resolve_and_create_url(file_path, &target_path, ignore_absolute_local_links) - else { - return Err(ErrorKind::InvalidPathToUri(target_path)); - }; - Ok(Uri { - url: constructed_url, - }) -} - /// Create a URL from a path /// /// `src_path` is the path of the source file. From 4e20fb3540fc01aa066e3eafa943021d7f5267dc Mon Sep 17 00:00:00 2001 From: Matthias Date: Tue, 14 Jan 2025 02:24:20 +0100 Subject: [PATCH 4/8] lint --- lychee-lib/src/checker/file.rs | 6 +- lychee-lib/src/types/base.rs | 5 +- lychee-lib/src/utils/request.rs | 139 +++++--------------------------- lychee-lib/src/utils/url.rs | 67 --------------- 4 files changed, 23 insertions(+), 194 deletions(-) diff --git a/lychee-lib/src/checker/file.rs b/lychee-lib/src/checker/file.rs index 5bec5b1e61..fb5bc3e600 100644 --- a/lychee-lib/src/checker/file.rs +++ b/lychee-lib/src/checker/file.rs @@ -203,10 +203,10 @@ mod tests { ]; for (uri, expected, name) in cases { - let url = url::Url::parse(uri).unwrap(); - let uri = Uri::from(url); + let raw_url = url::Url::parse(uri).unwrap(); + let uri = Uri::from(raw_url); let path = uri_to_path(&uri).unwrap(); - assert_eq!(path, expected, "{}", name); + assert_eq!(path, expected, "{name}"); } } } diff --git a/lychee-lib/src/types/base.rs b/lychee-lib/src/types/base.rs index 62692316d9..966201abc8 100644 --- a/lychee-lib/src/types/base.rs +++ b/lychee-lib/src/types/base.rs @@ -56,7 +56,7 @@ impl TryFrom<&str> for Base { "The given URL cannot be a base".to_string(), )); } - return Ok(Self(url)); + Ok(Self(url)) } } @@ -87,13 +87,12 @@ mod test_base { } #[test] - fn test_local_path_string_is_invalid_base() -> Result<()> { + fn test_local_path_string_is_invalid_base() { let cases = vec!["/tmp/lychee", "/tmp/lychee/", "tmp/lychee/"]; for case in cases { assert!(Base::try_from(case).is_err()); } - Ok(()) } #[test] diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index 4a7994e7ac..0dcdbfc743 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -1,14 +1,10 @@ -use percent_encoding::percent_decode_str; use reqwest::Url; -use std::{ - collections::HashSet, - path::{Path, PathBuf}, -}; +use std::path::PathBuf; use crate::{ basic_auth::BasicAuthExtractor, types::{uri::raw::RawUri, InputSource}, - utils::{path, url}, + utils::path, Base, BasicAuthCredentials, ErrorKind, Request, Result, Uri, }; @@ -66,7 +62,7 @@ fn try_parse_into_uri( // For absolute paths with root_dir, insert root_dir after base if text.starts_with('/') && root_dir.is_some() { let root_path = root_dir.unwrap().to_string_lossy(); - let combined = format!("{}{}", root_path, text); + let combined = format!("{root_path}{text}"); return base_url .join(&combined) .map(|url| Uri { url }) @@ -92,7 +88,7 @@ fn try_parse_into_uri( if is_anchor(&text) { return Url::from_file_path(source_path) .map(|url| Uri { url }) - .map_err(|_| ErrorKind::InvalidUrlFromPath(source_path.clone())) + .map_err(|()| ErrorKind::InvalidUrlFromPath(source_path.clone())) .map(|mut uri| { uri.url.set_fragment(Some(&text[1..])); uri @@ -101,10 +97,13 @@ fn try_parse_into_uri( // If source_path is relative and we have a root_dir, // we need to resolve both source_path and text relative to root_dir - let resolved_source = if !source_path.is_absolute() && root_dir.is_some() { - root_dir.unwrap().join(source_path) + let resolved_source = if source_path.is_absolute() { + source_path.clone() } else { - source_path.to_path_buf() + match root_dir { + Some(dir) => dir.join(source_path), + None => source_path.clone(), + } }; match path::resolve(&resolved_source, &PathBuf::from(&text), false) { @@ -115,7 +114,7 @@ fn try_parse_into_uri( Url::from_file_path(&target_path) .map(|url| Uri { url }) - .map_err(|_| ErrorKind::InvalidUrlFromPath(target_path)) + .map_err(|()| ErrorKind::InvalidUrlFromPath(target_path)) } InputSource::String(s) => { // If we have a root_dir, we can still resolve paths against it @@ -124,15 +123,14 @@ fn try_parse_into_uri( let target_path = root.join(&text); return Url::from_file_path(&target_path) .map(|url| Uri { url }) - .map_err(|_| ErrorKind::InvalidUrlFromPath(target_path)); + .map_err(|()| ErrorKind::InvalidUrlFromPath(target_path)); } // Otherwise, we can't resolve the path Err(ErrorKind::UnsupportedUriType(s.clone())) } InputSource::RemoteUrl(url) => { - let base_url = Url::parse(url.as_str()).map_err(|e| { - ErrorKind::ParseUrl(e, format!("Could not parse base URL: {}", url)) - })?; + let base_url = Url::parse(url.as_str()) + .map_err(|e| ErrorKind::ParseUrl(e, format!("Could not parse base URL: {url}")))?; base_url .join(&text) .map(|url| Uri { url }) @@ -147,43 +145,6 @@ pub(crate) fn is_anchor(text: &str) -> bool { text.starts_with('#') } -/// Create a URL from a path -/// -/// `src_path` is the path of the source file. -/// `dest_path` is the path being linked to. -/// The optional `base_uri` specifies the base URI to resolve the destination path against. -/// -/// # Errors -/// -/// - If the percent-decoded destination path cannot be decoded as UTF-8. -/// - The path cannot be resolved -/// - The resolved path cannot be converted to a URL. -fn resolve_and_create_url( - src_path: &Path, - dest_path: &str, - ignore_absolute_local_links: bool, -) -> Result { - let (dest_path, fragment) = url::remove_get_params_and_separate_fragment(dest_path); - - // Decode the destination path to avoid double-encoding - let decoded_dest = percent_decode_str(dest_path).decode_utf8()?; - - let Ok(Some(resolved_path)) = path::resolve( - src_path, - &PathBuf::from(&*decoded_dest), - ignore_absolute_local_links, - ) else { - return Err(ErrorKind::InvalidPathToUri(decoded_dest.to_string())); - }; - - let Ok(mut url) = Url::from_file_path(&resolved_path) else { - return Err(ErrorKind::InvalidUrlFromPath(resolved_path.clone())); - }; - - url.set_fragment(fragment); - Ok(url) -} - /// Truncate the source in case it gets too long /// /// This is only needed for string inputs. @@ -204,31 +165,20 @@ fn truncate_source(source: &InputSource) -> InputSource { /// Only keeps "valid" URLs. This filters out anchors for example. /// /// If a URLs is ignored (because of the current settings), -/// it will not be added to the `HashSet`. +/// it will not be added to the `Vector`. pub(crate) fn create( uris: Vec, source: &InputSource, root_dir: Option<&PathBuf>, base: Option<&Base>, extractor: Option<&BasicAuthExtractor>, -) -> HashSet> { +) -> Vec> { let base = base.cloned().or_else(|| Base::from_source(source)); uris.into_iter() .map(|raw_uri| create_request(&raw_uri, source, root_dir, base.as_ref(), extractor)) .collect() } -fn prepend_root_dir_if_absolute_local_link(text: &str, root_dir: Option<&PathBuf>) -> String { - if text.starts_with('/') { - if let Some(path) = root_dir { - if let Some(path_str) = path.to_str() { - return format!("{path_str}{text}"); - } - } - } - text.to_string() -} - #[cfg(test)] mod tests { use super::*; @@ -239,13 +189,6 @@ mod tests { assert!(!is_anchor("notan#anchor")); } - #[test] - fn test_create_uri_from_path() { - let result = - resolve_and_create_url(&PathBuf::from("/README.md"), "test+encoding", true).unwrap(); - assert_eq!(result.as_str(), "file:///test+encoding"); - } - #[test] fn test_relative_url_resolution() { let base = Base::try_from("https://example.com/path/page.html").unwrap(); @@ -362,27 +305,11 @@ mod tests { fn test_parent_directory_url_resolution_from_root_dir() { let root_dir = PathBuf::from("/tmp/lychee"); let source = InputSource::FsPath(PathBuf::from("/some/page.html")); - let uris = vec![RawUri::from("../parent")]; let requests = create(uris, &source, Some(&root_dir), None, None); - assert_eq!(requests.len(), 1); - // assert!(requests - // .iter() - // .any(|r| r.as_ref().unwrap().uri.url.as_str() == "file:///parent")); - - assert_eq!( - requests - .iter() - .next() - .unwrap() - .as_ref() - .unwrap() - .uri - .url - .as_str(), - "file:///parent", - ); + let url = &requests[0].as_ref().unwrap().uri.url; + assert_eq!(url.as_str(), "file:///parent"); } #[test] @@ -565,36 +492,6 @@ mod tests { assert_eq!(uri.url.as_str(), "file:///tmp/lychee/absolute.html"); } - #[test] - fn test_prepend_with_absolute_local_link_and_root_dir() { - let text = "/absolute/path"; - let root_dir = PathBuf::from("/root"); - let result = prepend_root_dir_if_absolute_local_link(text, Some(&root_dir)); - assert_eq!(result, "/root/absolute/path"); - } - - #[test] - fn test_prepend_with_absolute_local_link_and_no_root_dir() { - let text = "/absolute/path"; - let result = prepend_root_dir_if_absolute_local_link(text, None); - assert_eq!(result, "/absolute/path"); - } - - #[test] - fn test_prepend_with_relative_link_and_root_dir() { - let text = "relative/path"; - let root_dir = PathBuf::from("/root"); - let result = prepend_root_dir_if_absolute_local_link(text, Some(&root_dir)); - assert_eq!(result, "relative/path"); - } - - #[test] - fn test_prepend_with_relative_link_and_no_root_dir() { - let text = "relative/path"; - let result = prepend_root_dir_if_absolute_local_link(text, None); - assert_eq!(result, "relative/path"); - } - #[test] fn test_parse_url_with_anchor() { let base = Base::try_from("https://example.com/path/page.html").unwrap(); diff --git a/lychee-lib/src/utils/url.rs b/lychee-lib/src/utils/url.rs index c27f0a0202..ea1c04d678 100644 --- a/lychee-lib/src/utils/url.rs +++ b/lychee-lib/src/utils/url.rs @@ -4,74 +4,7 @@ use once_cell::sync::Lazy; static LINK_FINDER: Lazy = Lazy::new(LinkFinder::new); -/// Remove all GET parameters from a URL and separates out the fragment. -/// The link is not a URL but a String as it may not have a base domain. -pub(crate) fn remove_get_params_and_separate_fragment(url: &str) -> (&str, Option<&str>) { - let (path, frag) = match url.split_once('#') { - Some((path, fragment)) => (path, Some(fragment)), - None => (url, None), - }; - let path = match path.split_once('?') { - Some((path_without_params, _params)) => path_without_params, - None => path, - }; - (path, frag) -} - // Use `LinkFinder` to offload the raw link searching in plaintext pub(crate) fn find_links(input: &str) -> impl Iterator { LINK_FINDER.links(input) } - -#[cfg(test)] -mod test_fs_tree { - use super::*; - - #[test] - fn test_remove_get_params_and_fragment() { - assert_eq!(remove_get_params_and_separate_fragment("/"), ("/", None)); - assert_eq!( - remove_get_params_and_separate_fragment("index.html?foo=bar"), - ("index.html", None) - ); - assert_eq!( - remove_get_params_and_separate_fragment("/index.html?foo=bar"), - ("/index.html", None) - ); - assert_eq!( - remove_get_params_and_separate_fragment("/index.html?foo=bar&baz=zorx?bla=blub"), - ("/index.html", None) - ); - assert_eq!( - remove_get_params_and_separate_fragment("https://example.com/index.html?foo=bar"), - ("https://example.com/index.html", None) - ); - assert_eq!( - remove_get_params_and_separate_fragment("test.png?foo=bar"), - ("test.png", None) - ); - - assert_eq!( - remove_get_params_and_separate_fragment("https://example.com/index.html#anchor"), - ("https://example.com/index.html", Some("anchor")) - ); - assert_eq!( - remove_get_params_and_separate_fragment( - "https://example.com/index.html?foo=bar#anchor" - ), - ("https://example.com/index.html", Some("anchor")) - ); - assert_eq!( - remove_get_params_and_separate_fragment("test.png?foo=bar#anchor"), - ("test.png", Some("anchor")) - ); - assert_eq!( - remove_get_params_and_separate_fragment("test.png#anchor?anchor!?"), - ("test.png", Some("anchor?anchor!?")) - ); - assert_eq!( - remove_get_params_and_separate_fragment("test.png?foo=bar#anchor?anchor!"), - ("test.png", Some("anchor?anchor!")) - ); - } -} From bb319bdf5c4c4f6d2191ba4c7392cecd7626214d Mon Sep 17 00:00:00 2001 From: Matthias Date: Tue, 14 Jan 2025 02:39:22 +0100 Subject: [PATCH 5/8] fix lints --- lychee-lib/src/lib.rs | 4 ++++ lychee-lib/src/utils/request.rs | 14 +++++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/lychee-lib/src/lib.rs b/lychee-lib/src/lib.rs index 023278c034..13e5a21df8 100644 --- a/lychee-lib/src/lib.rs +++ b/lychee-lib/src/lib.rs @@ -46,6 +46,10 @@ #![deny(anonymous_parameters, macro_use_extern_crate)] #![deny(missing_docs)] #![allow(clippy::module_name_repetitions)] +// We use a `HashSet` for deduplicating requests, a `Request` is mutable, so we +// need to allow mutable hash keys. The alternative would be to use a `Vec` and +// do the deduplication on the call-site. +#![allow(clippy::mutable_key_type)] #[cfg(doctest)] doc_comment::doctest!("../../README.md"); diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index 0dcdbfc743..9b5d8b9478 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -1,5 +1,5 @@ use reqwest::Url; -use std::path::PathBuf; +use std::{collections::HashSet, path::PathBuf}; use crate::{ basic_auth::BasicAuthExtractor, @@ -172,7 +172,7 @@ pub(crate) fn create( root_dir: Option<&PathBuf>, base: Option<&Base>, extractor: Option<&BasicAuthExtractor>, -) -> Vec> { +) -> HashSet> { let base = base.cloned().or_else(|| Base::from_source(source)); uris.into_iter() .map(|raw_uri| create_request(&raw_uri, source, root_dir, base.as_ref(), extractor)) @@ -308,7 +308,15 @@ mod tests { let uris = vec![RawUri::from("../parent")]; let requests = create(uris, &source, Some(&root_dir), None, None); - let url = &requests[0].as_ref().unwrap().uri.url; + let url = requests + .iter() + .next() + .unwrap() + .as_ref() + .unwrap() + .uri + .url + .clone(); assert_eq!(url.as_str(), "file:///parent"); } From 4ebb0ceddebe4df19c710d10a03ad706566b9346 Mon Sep 17 00:00:00 2001 From: Matthias Date: Tue, 21 Jan 2025 01:24:27 +0100 Subject: [PATCH 6/8] progress --- fixtures/fragments/file.html | 2 +- lychee-bin/tests/cli.rs | 57 +++++++++++++++++++----- lychee-lib/src/collector.rs | 11 +++-- lychee-lib/src/types/error.rs | 6 --- lychee-lib/src/utils/fragment_checker.rs | 4 +- 5 files changed, 55 insertions(+), 25 deletions(-) diff --git a/fixtures/fragments/file.html b/fixtures/fragments/file.html index 5ff181f737..9a7cad1f66 100644 --- a/fixtures/fragments/file.html +++ b/fixtures/fragments/file.html @@ -24,4 +24,4 @@ doesn't exist
- + \ No newline at end of file diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 224adc59dd..f571bc4f28 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -1794,6 +1794,7 @@ mod cli { fn test_fragments() { let mut cmd = main_command(); let input = fixtures_path().join("fragments"); + cmd.arg("--verbose") .arg("--include-fragments") .arg(input) @@ -1817,23 +1818,55 @@ mod cli { .stderr(contains( "fixtures/fragments/file1.md#kebab-case-fragment-1", )) - .stderr(contains( - "fixtures/fragments/file1.md#f%C3%BCnf-s%C3%BC%C3%9Fe-%C3%A4pfel", - )) - .stderr(contains("fixtures/fragments/file1.md#IGNORE-CASING")) - .stderr(contains("fixtures/fragments/file1.md#explicit-fragment")) - .stderr(contains( - "fixtures/fragments/file.html#Upper-%C3%84%C3%96%C3%B6", - )) - .stderr(contains( - "fixtures/fragments/file.html#tangent%3A-kustomize", - )) - .stderr(contains("fixtures/fragments/file.html#in-THE-begiNNing")) .stdout(contains("21 Total")) .stdout(contains("17 OK")) + // 4 failures because of missing fragments .stdout(contains("4 Errors")); } + // fn test_fragments() { + // let mut cmd = main_command(); + // let input = fixtures_path().join("fragments"); + // cmd.arg("--verbose") + // .arg("--include-fragments") + // .arg(input) + // .assert() + // .failure() + // .stderr(contains("fixtures/fragments/file1.md#fragment-1")) + // .stderr(contains("fixtures/fragments/file1.md#fragment-2")) + // .stderr(contains("fixtures/fragments/file1.md#code-heading")) + // .stderr(contains("fixtures/fragments/file2.md#custom-id")) + // .stderr(contains("fixtures/fragments/file1.md#missing-fragment")) + // .stderr(contains("fixtures/fragments/file2.md#fragment-1")) + // .stderr(contains("fixtures/fragments/file1.md#kebab-case-fragment")) + // .stderr(contains( + // "fixtures/fragments/file1.md#lets-wear-a-hat-%C3%AAtre", + // )) + // .stderr(contains("fixtures/fragments/file2.md#missing-fragment")) + // .stderr(contains("fixtures/fragments/empty_file#fragment")) + // .stderr(contains("fixtures/fragments/file.html#a-word")) + // .stderr(contains("fixtures/fragments/file.html#in-the-beginning")) + // .stderr(contains("fixtures/fragments/file.html#in-the-end")) + // .stderr(contains( + // "fixtures/fragments/file1.md#kebab-case-fragment-1", + // )) + // .stderr(contains( + // "fixtures/fragments/file1.md#f%C3%BCnf-s%C3%BC%C3%9Fe-%C3%A4pfel", + // )) + // .stderr(contains("fixtures/fragments/file1.md#IGNORE-CASING")) + // .stderr(contains("fixtures/fragments/file1.md#explicit-fragment")) + // .stderr(contains( + // "fixtures/fragments/file.html#Upper-%C3%84%C3%96%C3%B6", + // )) + // .stderr(contains( + // "fixtures/fragments/file.html#tangent%3A-kustomize", + // )) + // .stderr(contains("fixtures/fragments/file.html#in-THE-begiNNing")) + // .stdout(contains("21 Total")) + // .stdout(contains("17 OK")) + // .stdout(contains("4 Errors")); + // } + #[test] fn test_fallback_extensions() { let mut cmd = main_command(); diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 03c376879f..da94308406 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -1,4 +1,3 @@ -use crate::ErrorKind; use crate::InputSource; use crate::{ basic_auth::BasicAuthExtractor, extract::Extractor, types::uri::raw::RawUri, utils::request, @@ -51,8 +50,14 @@ impl Collector { pub fn new(root_dir: Option, base: Option) -> Result { if let Some(root_dir) = &root_dir { if root_dir.is_relative() { - return Err(ErrorKind::RootDirMustBeAbsolute(root_dir.clone())); - } + // The root directory must be an absolute path + // Canonicalize the path relative to the current working directory + let root_dir = std::env::current_dir()?.join(root_dir); + root_dir.canonicalize()?; + root_dir + } else { + root_dir.clone() + }; } Ok(Collector { basic_auth_extractor: None, diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index 795312a1e3..6211695402 100644 --- a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -96,10 +96,6 @@ pub enum ErrorKind { #[error("Cannot convert path '{0}' to a URI")] InvalidPathToUri(String), - /// Root dir must be an absolute path - #[error("Root dir must be an absolute path: '{0}'")] - RootDirMustBeAbsolute(PathBuf), - /// The given URI type is not supported #[error("Unsupported URI type: '{0}'")] UnsupportedUriType(String), @@ -284,7 +280,6 @@ impl PartialEq for ErrorKind { (Self::EmptyUrl, Self::EmptyUrl) => true, (Self::InvalidBaseJoin(s1), Self::InvalidBaseJoin(s2)) => s1 == s2, (Self::InvalidPathToUri(s1), Self::InvalidPathToUri(s2)) => s1 == s2, - (Self::RootDirMustBeAbsolute(s1), Self::RootDirMustBeAbsolute(s2)) => s1 == s2, (Self::UnsupportedUriType(s1), Self::UnsupportedUriType(s2)) => s1 == s2, (Self::StatusCodeSelectorError(e1), Self::StatusCodeSelectorError(e2)) => { e1.to_string() == e2.to_string() @@ -327,7 +322,6 @@ impl Hash for ErrorKind { Self::InvalidBase(base, e) => (base, e).hash(state), Self::InvalidBaseJoin(s) => s.hash(state), Self::InvalidPathToUri(s) => s.hash(state), - Self::RootDirMustBeAbsolute(s) => s.hash(state), Self::UnsupportedUriType(s) => s.hash(state), Self::InvalidUrlRemap(remap) => (remap).hash(state), Self::InvalidHeader(e) => e.to_string().hash(state), diff --git a/lychee-lib/src/utils/fragment_checker.rs b/lychee-lib/src/utils/fragment_checker.rs index dc9461020f..a89a80d8f8 100644 --- a/lychee-lib/src/utils/fragment_checker.rs +++ b/lychee-lib/src/utils/fragment_checker.rs @@ -56,9 +56,7 @@ impl FragmentChecker { FileType::Html => extract_html_fragments, FileType::Plaintext => return Ok(true), }; - if file_type == FileType::Markdown { - fragment_decoded = fragment_decoded.to_lowercase().into(); - } + fragment_decoded = fragment_decoded.to_lowercase().into(); match self.cache.lock().await.entry(url_without_frag) { Entry::Vacant(entry) => { let content = fs::read_to_string(path).await?; From b46aaec28c5640d0569153d1d4eefbe06863c728 Mon Sep 17 00:00:00 2001 From: Matthias Date: Tue, 21 Jan 2025 02:38:28 +0100 Subject: [PATCH 7/8] Type-safe root dir --- lychee-bin/src/options.rs | 9 ++-- lychee-bin/src/parse.rs | 6 ++- lychee-lib/src/collector.rs | 21 ++------ lychee-lib/src/lib.rs | 2 +- lychee-lib/src/types/mail.rs | 1 - lychee-lib/src/types/mod.rs | 2 + lychee-lib/src/types/root_dir.rs | 84 ++++++++++++++++++++++++++++++++ lychee-lib/src/utils/request.rs | 47 +++++++++--------- 8 files changed, 124 insertions(+), 48 deletions(-) create mode 100644 lychee-lib/src/types/root_dir.rs diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index 5e7de7fe28..400e7689a4 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -1,13 +1,12 @@ use crate::archive::Archive; -use crate::parse::parse_base; +use crate::parse::{parse_base, parse_root_dir}; use crate::verbosity::Verbosity; use anyhow::{anyhow, Context, Error, Result}; use clap::builder::PossibleValuesParser; use clap::{arg, builder::TypedValueParser, Parser}; use const_format::{concatcp, formatcp}; use lychee_lib::{ - Base, BasicAuthSelector, Input, StatusCodeExcluder, StatusCodeSelector, DEFAULT_MAX_REDIRECTS, - DEFAULT_MAX_RETRIES, DEFAULT_RETRY_WAIT_TIME_SECS, DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT, + Base, BasicAuthSelector, Input, RootDir, StatusCodeExcluder, StatusCodeSelector, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES, DEFAULT_RETRY_WAIT_TIME_SECS, DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT }; use secrecy::{ExposeSecret, SecretString}; use serde::Deserialize; @@ -447,9 +446,9 @@ separated list of accepted status codes. This example will accept 200, 201, /// Root path to use when checking absolute local links, /// must be an absolute path - #[arg(long)] + #[arg(long, value_parser = parse_root_dir)] #[serde(default)] - pub(crate) root_dir: Option, + pub(crate) root_dir: Option, /// Basic authentication support. E.g. `http://example.com username:password` #[arg(long)] diff --git a/lychee-bin/src/parse.rs b/lychee-bin/src/parse.rs index b5a4cca732..716bcb48cf 100644 --- a/lychee-bin/src/parse.rs +++ b/lychee-bin/src/parse.rs @@ -1,6 +1,6 @@ use anyhow::{anyhow, Context, Result}; use headers::{HeaderMap, HeaderName}; -use lychee_lib::{remap::Remaps, Base}; +use lychee_lib::{remap::Remaps, Base, RootDir}; use std::time::Duration; /// Split a single HTTP header into a (key, value) tuple @@ -40,6 +40,10 @@ pub(crate) fn parse_base(src: &str) -> Result { Base::try_from(src) } +pub(crate) fn parse_root_dir(root_dir: &str) -> Result { + RootDir::new(root_dir) +} + #[cfg(test)] mod tests { diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index da94308406..1a44ead525 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -1,3 +1,4 @@ +use crate::types::RootDir; use crate::InputSource; use crate::{ basic_auth::BasicAuthExtractor, extract::Extractor, types::uri::raw::RawUri, utils::request, @@ -9,7 +10,6 @@ use futures::{ StreamExt, }; use par_stream::ParStreamExt; -use std::path::PathBuf; /// Collector keeps the state of link collection /// It drives the link extraction from inputs @@ -22,7 +22,7 @@ pub struct Collector { skip_hidden: bool, include_verbatim: bool, use_html5ever: bool, - root_dir: Option, + root_dir: Option, base: Option, } @@ -47,18 +47,7 @@ impl Collector { /// # Errors /// /// Returns an `Err` if the `root_dir` is not an absolute path - pub fn new(root_dir: Option, base: Option) -> Result { - if let Some(root_dir) = &root_dir { - if root_dir.is_relative() { - // The root directory must be an absolute path - // Canonicalize the path relative to the current working directory - let root_dir = std::env::current_dir()?.join(root_dir); - root_dir.canonicalize()?; - root_dir - } else { - root_dir.clone() - }; - } + pub fn new(root_dir: Option, base: Option) -> Result { Ok(Collector { basic_auth_extractor: None, skip_missing_inputs: false, @@ -189,7 +178,7 @@ mod tests { // Helper function to run the collector on the given inputs async fn collect( inputs: Vec, - root_dir: Option, + root_dir: Option, base: Option, ) -> Result> { let responses = Collector::new(root_dir, base)?.collect_links(inputs); @@ -199,7 +188,7 @@ mod tests { // Helper function for collecting verbatim links async fn collect_verbatim( inputs: Vec, - root_dir: Option, + root_dir: Option, base: Option, ) -> Result> { let responses = Collector::new(root_dir, base)? diff --git a/lychee-lib/src/lib.rs b/lychee-lib/src/lib.rs index 13e5a21df8..9a0da2541d 100644 --- a/lychee-lib/src/lib.rs +++ b/lychee-lib/src/lib.rs @@ -102,7 +102,7 @@ pub use crate::{ types::{ uri::valid::Uri, AcceptRange, AcceptRangeError, Base, BasicAuthCredentials, BasicAuthSelector, CacheStatus, CookieJar, ErrorKind, FileType, Input, InputContent, - InputSource, Request, Response, ResponseBody, Result, Status, StatusCodeExcluder, + InputSource, Request, Response, ResponseBody, Result, RootDir, Status, StatusCodeExcluder, StatusCodeSelector, }, }; diff --git a/lychee-lib/src/types/mail.rs b/lychee-lib/src/types/mail.rs index 7ef30bfc89..c338d510d0 100644 --- a/lychee-lib/src/types/mail.rs +++ b/lychee-lib/src/types/mail.rs @@ -1,5 +1,4 @@ #![cfg(all(feature = "email-check", feature = "native-tls"))] - use check_if_email_exists::{CheckEmailOutput, Reachable}; /// A crude way to extract error details from the mail output. diff --git a/lychee-lib/src/types/mod.rs b/lychee-lib/src/types/mod.rs index 5bb933fe0b..a607ecb52a 100644 --- a/lychee-lib/src/types/mod.rs +++ b/lychee-lib/src/types/mod.rs @@ -11,6 +11,7 @@ mod input; pub(crate) mod mail; mod request; mod response; +mod root_dir; mod status; mod status_code; pub(crate) mod uri; @@ -25,6 +26,7 @@ pub use file::FileType; pub use input::{Input, InputContent, InputSource}; pub use request::Request; pub use response::{Response, ResponseBody}; +pub use root_dir::RootDir; pub use status::Status; pub use status_code::*; diff --git a/lychee-lib/src/types/root_dir.rs b/lychee-lib/src/types/root_dir.rs new file mode 100644 index 0000000000..72a840dea4 --- /dev/null +++ b/lychee-lib/src/types/root_dir.rs @@ -0,0 +1,84 @@ +use crate::Result; +use std::{ + fmt::{Display, Formatter}, + ops::Deref, + path::PathBuf, +}; + +use serde::{Deserialize, Serialize}; + +/// The canonical root directory during document processing. +/// +/// This is used to resolve relative paths in a document, +/// similar to how a web server resolves relative URLs. +/// +/// Similar mechanisms exist in: +/// - Apache's `DocumentRoot` +/// - Nginx's `root` directive +/// +/// The root directory can be only be a local path. +/// Paths must be absolute or are expected to be relative to the current working +/// directory. +/// +/// For resolving remote URLs instead, see [`Base`]. +#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone)] +pub struct RootDir(PathBuf); + +impl RootDir { + /// Creates a new root directory + /// + /// Root directories must be absolute paths. + /// If the given path is relative, it will be resolved relative to the current working directory + /// and then canonicalized to resolve symbolic links. + /// + /// # Examples + /// + /// ``` + /// use lychee_lib::types::RootDir; + /// + /// let root_dir = RootDir::new("/path/to/root").unwrap(); + /// ``` + /// + /// # Errors + /// + /// Returns an error if the current working directory cannot be determined + /// or the path can not be canonicalized. + /// + /// Amongst other reasons, this can happen if: + /// * `path` does not exist. + /// * A non-final component in path is not a directory. + pub fn new>(path: P) -> Result { + let path = path.into(); + let root_dir = if path.is_relative() { + // The root directory must be an absolute path + // Canonicalize the path relative to the current working directory + let root_dir = std::env::current_dir()?.join(path); + root_dir.canonicalize()?; + root_dir + } else { + path.clone() + }; + + Ok(Self(root_dir)) + } +} + +impl Display for RootDir { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0.display()) + } +} + +impl Deref for RootDir { + type Target = PathBuf; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl From for PathBuf { + fn from(root_dir: RootDir) -> Self { + root_dir.0 + } +} diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index 9b5d8b9478..d3990e9863 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -3,7 +3,7 @@ use std::{collections::HashSet, path::PathBuf}; use crate::{ basic_auth::BasicAuthExtractor, - types::{uri::raw::RawUri, InputSource}, + types::{uri::raw::RawUri, InputSource, RootDir}, utils::path, Base, BasicAuthCredentials, ErrorKind, Request, Result, Uri, }; @@ -20,7 +20,7 @@ fn extract_credentials( fn create_request( raw_uri: &RawUri, source: &InputSource, - root_dir: Option<&PathBuf>, + root_dir: Option<&RootDir>, base: Option<&Base>, extractor: Option<&BasicAuthExtractor>, ) -> Result { @@ -47,7 +47,7 @@ fn create_request( fn try_parse_into_uri( raw_uri: &RawUri, source: &InputSource, - root_dir: Option<&PathBuf>, + root_dir: Option<&RootDir>, base: Option<&Base>, ) -> Result { // First try direct URI parsing (handles explicit URLs) @@ -61,8 +61,8 @@ fn try_parse_into_uri( if let Some(base_url) = base { // For absolute paths with root_dir, insert root_dir after base if text.starts_with('/') && root_dir.is_some() { - let root_path = root_dir.unwrap().to_string_lossy(); - let combined = format!("{root_path}{text}"); + let root_dir = root_dir.unwrap(); + let combined = format!("{root_dir}{text}"); return base_url .join(&combined) .map(|url| Uri { url }) @@ -80,9 +80,8 @@ fn try_parse_into_uri( InputSource::FsPath(source_path) => { let target_path = if text.starts_with('/') && root_dir.is_some() { // Absolute paths: resolve via root_dir - let mut path = root_dir.unwrap().clone(); - path.push(&text[1..]); - path + let root = root_dir.unwrap(); + root.join(&text[1..]) } else { // If text is just a fragment, we need to append it to the source path if is_anchor(&text) { @@ -169,7 +168,7 @@ fn truncate_source(source: &InputSource) -> InputSource { pub(crate) fn create( uris: Vec, source: &InputSource, - root_dir: Option<&PathBuf>, + root_dir: Option<&RootDir>, base: Option<&Base>, extractor: Option<&BasicAuthExtractor>, ) -> HashSet> { @@ -261,7 +260,7 @@ mod tests { #[test] fn test_relative_url_resolution_from_root_dir() { - let root_dir = PathBuf::from("/tmp/lychee"); + let root_dir = RootDir::new("/tmp/lychee").unwrap(); let source = InputSource::FsPath(PathBuf::from("/some/page.html")); let uris = vec![RawUri::from("relative.html")]; @@ -275,7 +274,7 @@ mod tests { #[test] fn test_absolute_url_resolution_from_root_dir() { - let root_dir = PathBuf::from("/tmp/lychee"); + let root_dir = RootDir::new("/tmp/lychee").unwrap(); let source = InputSource::FsPath(PathBuf::from("/some/page.html")); let uris = vec![RawUri::from("https://another.com/page")]; @@ -289,7 +288,7 @@ mod tests { #[test] fn test_root_relative_url_resolution_from_root_dir() { - let root_dir = PathBuf::from("/tmp/lychee"); + let root_dir = RootDir::new("/tmp/lychee").unwrap(); let source = InputSource::FsPath(PathBuf::from("/some/page.html")); let uris = vec![RawUri::from("/root-relative")]; @@ -303,7 +302,7 @@ mod tests { #[test] fn test_parent_directory_url_resolution_from_root_dir() { - let root_dir = PathBuf::from("/tmp/lychee"); + let root_dir = RootDir::new("/tmp/lychee").unwrap(); let source = InputSource::FsPath(PathBuf::from("/some/page.html")); let uris = vec![RawUri::from("../parent")]; let requests = create(uris, &source, Some(&root_dir), None, None); @@ -322,7 +321,7 @@ mod tests { #[test] fn test_fragment_url_resolution_from_root_dir() { - let root_dir = PathBuf::from("/tmp/lychee"); + let root_dir = RootDir::new("/tmp/lychee").unwrap(); let source = InputSource::FsPath(PathBuf::from("/some/page.html")); let uris = vec![RawUri::from("#fragment")]; @@ -336,7 +335,7 @@ mod tests { #[test] fn test_relative_url_resolution_from_root_dir_and_base_url() { - let root_dir = PathBuf::from("/tmp/lychee"); + let root_dir = RootDir::new("/tmp/lychee").unwrap(); let base = Base::try_from("https://example.com/path/page.html").unwrap(); let source = InputSource::FsPath(PathBuf::from("/some/page.html")); @@ -352,7 +351,7 @@ mod tests { #[test] fn test_absolute_url_resolution_from_root_dir_and_base_url() { - let root_dir = PathBuf::from("/tmp/lychee"); + let root_dir = RootDir::new("/tmp/lychee").unwrap(); let base = Base::try_from("https://example.com/path/page.html").unwrap(); let source = InputSource::FsPath(PathBuf::from("/some/page.html")); @@ -367,7 +366,7 @@ mod tests { #[test] fn test_root_relative_url_resolution_from_root_dir_and_base_url() { - let root_dir = PathBuf::from("/tmp/lychee"); + let root_dir = RootDir::new("/tmp/lychee").unwrap(); let base = Base::try_from("https://example.com/path/page.html").unwrap(); let source = InputSource::FsPath(PathBuf::from("/some/page.html")); @@ -381,7 +380,7 @@ mod tests { #[test] fn test_parent_directory_url_resolution_from_root_dir_and_base_url() { - let root_dir = PathBuf::from("/tmp/lychee"); + let root_dir = RootDir::new("/tmp/lychee").unwrap(); let base = Base::try_from("https://example.com/path/page.html").unwrap(); let source = InputSource::FsPath(PathBuf::from("/some/page.html")); @@ -396,7 +395,7 @@ mod tests { #[test] fn test_fragment_url_resolution_from_root_dir_and_base_url() { - let root_dir = PathBuf::from("/tmp/lychee"); + let root_dir = RootDir::new("/tmp/lychee").unwrap(); let base = Base::try_from("https://example.com/path/page.html").unwrap(); let source = InputSource::FsPath(PathBuf::from("/some/page.html")); @@ -423,7 +422,7 @@ mod tests { #[test] fn test_create_request_from_relative_file_path() { - let root_dir = PathBuf::from("/tmp/lychee"); + let root_dir = RootDir::new("/tmp/lychee").unwrap(); let input_source = InputSource::FsPath(PathBuf::from("page.html")); let actual = create_request( @@ -451,7 +450,7 @@ mod tests { #[test] fn test_create_request_from_absolute_file_path() { - let root_dir = PathBuf::from("/foo/bar"); + let root_dir = RootDir::new("/foo/bar").unwrap(); let input_source = InputSource::FsPath(PathBuf::from("/foo/bar/page.html")); // Use an absolute path that's outside the root directory @@ -480,7 +479,7 @@ mod tests { #[test] fn test_parse_relative_path_into_uri() { - let root_dir = PathBuf::from("/tmp/lychee"); + let root_dir = RootDir::new("/tmp/lychee").unwrap(); let source = InputSource::String(String::new()); let raw_uri = RawUri::from("relative.html"); @@ -491,7 +490,7 @@ mod tests { #[test] fn test_parse_absolute_path_into_uri() { - let root_dir = PathBuf::from("/tmp/lychee"); + let root_dir = RootDir::new("/tmp/lychee").unwrap(); let source = InputSource::String(String::new()); let raw_uri = RawUri::from("absolute.html"); @@ -530,7 +529,7 @@ mod tests { #[test] fn test_parse_url_from_path_with_anchor() { - let root_dir = PathBuf::from("/tmp/lychee"); + let root_dir = RootDir::new("/tmp/lychee").unwrap(); let source = InputSource::FsPath(PathBuf::from("/some/page.html")); let raw_uri = RawUri::from("#fragment"); From 874037649ac2828fc40bb76546674689f2b8cc08 Mon Sep 17 00:00:00 2001 From: Matthias Date: Thu, 6 Feb 2025 14:28:30 +0100 Subject: [PATCH 8/8] work on relative links fixes --- README.md | 2 +- lychee-bin/src/commands/dump.rs | 35 ++++++++++++++++++++---- lychee-bin/src/main.rs | 32 +++++++++++++++++++--- lychee-bin/src/options.rs | 7 ++--- lychee-bin/src/parse.rs | 6 ++--- lychee-lib/src/types/error.rs | 7 +++++ lychee-lib/src/types/input.rs | 14 ++++++++++ lychee-lib/src/types/root_dir.rs | 46 ++++++++++++++++++++++++++++++-- lychee-lib/src/utils/path.rs | 1 + lychee-lib/src/utils/request.rs | 11 +++++--- 10 files changed, 141 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index d2b6a2d896..23422f6a8c 100644 --- a/README.md +++ b/README.md @@ -481,7 +481,7 @@ Options: Base URL or website root directory to check relative URLs e.g. or `/path/to/public` --root-dir - Root path to use when checking absolute local links, must be an absolute path + Root directory to use when checking absolute local links --basic-auth Basic authentication support. E.g. `http://example.com username:password` diff --git a/lychee-bin/src/commands/dump.rs b/lychee-bin/src/commands/dump.rs index 572c592203..506ca8057d 100644 --- a/lychee-bin/src/commands/dump.rs +++ b/lychee-bin/src/commands/dump.rs @@ -34,39 +34,64 @@ pub(crate) async fn dump(params: CommandParams) -> Result where S: futures::Stream>, { + println!("Starting dump function"); + println!("About to get requests from params"); let requests = params.requests; + println!("Got requests stream: {:?}", std::any::type_name::()); tokio::pin!(requests); + println!("Pinned requests stream"); if let Some(out_file) = ¶ms.cfg.output { + println!("Creating output file: {:?}", out_file); fs::File::create(out_file)?; } let mut writer = create_writer(params.cfg.output)?; + println!("Writer created successfully"); + println!("About to await first request"); + + match requests.next().await { + Some(Ok(req)) => { + println!( + "Got valid request: source={:?}, uri={:?}", + req.source, req.uri + ); + } + Some(Err(e)) => { + println!("Got error from stream: {:?}", e); + } + None => { + println!("Stream returned None immediately"); + } + } while let Some(request) = requests.next().await { + println!("Processing new request"); let mut request = request?; - // Apply URI remappings (if any) + println!("Original URI: {:?}", request.uri); params.client.remap(&mut request.uri)?; + println!("Remapped URI: {:?}", request.uri); let excluded = params.client.is_excluded(&request.uri); + println!("Request excluded: {}", excluded); if excluded && params.cfg.verbose.log_level() < log::Level::Info { + println!("Skipping excluded request"); continue; } if let Err(e) = write(&mut writer, &request, ¶ms.cfg.verbose, excluded) { - // Avoid panic on broken pipe. - // See https://github.com/rust-lang/rust/issues/46016 - // This can occur when piping the output of lychee - // to another program like `grep`. + println!("Write error occurred: {:?}", e); if e.kind() != io::ErrorKind::BrokenPipe { error!("{e}"); return Ok(ExitCode::UnexpectedFailure); } + println!("Broken pipe detected, continuing"); } } + println!("Dump completed successfully"); Ok(ExitCode::Success) } diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index 521a9b8eef..d35404f2b2 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -75,9 +75,9 @@ use openssl_sys as _; // required for vendored-openssl feature use options::LYCHEE_CONFIG_FILE; use ring as _; // required for apple silicon -use lychee_lib::BasicAuthExtractor; -use lychee_lib::Collector; use lychee_lib::CookieJar; +use lychee_lib::{BasicAuthExtractor, RootDir}; +use lychee_lib::{Collector, Input, InputSource}; mod archive; mod cache; @@ -287,8 +287,9 @@ fn underlying_io_error_kind(error: &Error) -> Option { /// Run lychee on the given inputs async fn run(opts: &LycheeOptions) -> Result { let inputs = opts.inputs()?; + let root_dir = set_root_dir(&inputs, &opts.config)?; - let mut collector = Collector::new(opts.config.root_dir.clone(), opts.config.base.clone())? + let mut collector = Collector::new(root_dir, opts.config.base.clone())? .skip_missing_inputs(opts.config.skip_missing) .skip_hidden(!opts.config.hidden) .skip_ignored(!opts.config.no_ignore) @@ -387,3 +388,28 @@ async fn run(opts: &LycheeOptions) -> Result { Ok(exit_code as i32) } + +/// Set the root directory based on the passed configuration +/// as well as the current working directory if no root directory is set +/// and we have exactly one input, which is a directory. +/// +/// In all other cases, set the root directory to `None`. +fn set_root_dir(inputs: &[Input], config: &Config) -> Result> { + if let Some(root_dir) = &config.root_dir { + Ok(Some(root_dir.clone())) + } else { + if inputs.len() == 1 { + let input = &inputs[0]; + if input.is_dir() { + match input.source { + InputSource::FsPath(ref path) => Ok(Some(RootDir::new(path)?)), + _ => bail!("Cannot set root directory for input: {:?}", input), + } + } else { + Ok(None) + } + } else { + Ok(None) + } + } +} diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index 400e7689a4..ebbf3d993e 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -168,6 +168,7 @@ macro_rules! fold_in { #[command(version, about)] pub(crate) struct LycheeOptions { /// The inputs (where to get links to check from). + /// /// These can be: files (e.g. `README.md`), file globs (e.g. `"~/git/*/README.md"`), /// remote URLs (e.g. `https://example.com/README.md`) or standard input (`-`). /// NOTE: Use `--` to separate inputs from options that allow multiple arguments. @@ -179,13 +180,14 @@ pub(crate) struct LycheeOptions { #[arg(help = HELP_MSG_CONFIG_FILE)] pub(crate) config_file: Option, + /// The parsed configuration #[clap(flatten)] pub(crate) config: Config, } impl LycheeOptions { /// Get parsed inputs from options. - // This depends on the config, which is why a method is required (we could + // This depends on the parsed config, which is why a method is required (we could // accept a `Vec` in `LycheeOptions` and do the conversion there, but // we wouldn't get access to `glob_ignore_case`. pub(crate) fn inputs(&self) -> Result> { @@ -444,8 +446,7 @@ separated list of accepted status codes. This example will accept 200, 201, #[serde(default)] pub(crate) base: Option, - /// Root path to use when checking absolute local links, - /// must be an absolute path + /// Root directory to use when checking absolute local links #[arg(long, value_parser = parse_root_dir)] #[serde(default)] pub(crate) root_dir: Option, diff --git a/lychee-bin/src/parse.rs b/lychee-bin/src/parse.rs index 716bcb48cf..8b42c1d720 100644 --- a/lychee-bin/src/parse.rs +++ b/lychee-bin/src/parse.rs @@ -36,12 +36,12 @@ pub(crate) fn parse_remaps(remaps: &[String]) -> Result { .context("Remaps must be of the form ' ' (separated by whitespace)") } -pub(crate) fn parse_base(src: &str) -> Result { - Base::try_from(src) +pub(crate) fn parse_base(base: &str) -> Result { + Base::try_from(base) } pub(crate) fn parse_root_dir(root_dir: &str) -> Result { - RootDir::new(root_dir) + RootDir::try_from(root_dir) } #[cfg(test)] diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index 6211695402..e98391c1f7 100644 --- a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -84,6 +84,11 @@ pub enum ErrorKind { #[error("Header could not be parsed.")] InvalidHeader(#[from] http::header::InvalidHeaderValue), + /// The given path is not a valid root directory + /// (e.g. it does not exist or is not a directory) + #[error("Invalid root directory. Must exist and must be a directory: {0}")] + InvalidRootDir(String), + /// The given string can not be parsed into a valid base URL #[error("Error with base URL `{0}` : {1}")] InvalidBase(String, String), @@ -275,6 +280,7 @@ impl PartialEq for ErrorKind { (Self::InvalidFilePath(u1), Self::InvalidFilePath(u2)) => u1 == u2, (Self::InvalidFragment(u1), Self::InvalidFragment(u2)) => u1 == u2, (Self::InvalidUrlFromPath(p1), Self::InvalidUrlFromPath(p2)) => p1 == p2, + (Self::InvalidRootDir(s1), Self::InvalidRootDir(s2)) => s1 == s2, (Self::InvalidBase(b1, e1), Self::InvalidBase(b2, e2)) => b1 == b2 && e1 == e2, (Self::InvalidUrlRemap(r1), Self::InvalidUrlRemap(r2)) => r1 == r2, (Self::EmptyUrl, Self::EmptyUrl) => true, @@ -319,6 +325,7 @@ impl Hash for ErrorKind { Self::InvalidFragment(u) => u.hash(state), Self::UnreachableEmailAddress(u, ..) => u.hash(state), Self::InsecureURL(u, ..) => u.hash(state), + Self::InvalidRootDir(s) => s.hash(state), Self::InvalidBase(base, e) => (base, e).hash(state), Self::InvalidBaseJoin(s) => s.hash(state), Self::InvalidPathToUri(s) => s.hash(state), diff --git a/lychee-lib/src/types/input.rs b/lychee-lib/src/types/input.rs index c32be7feb8..ed27398e87 100644 --- a/lychee-lib/src/types/input.rs +++ b/lychee-lib/src/types/input.rs @@ -309,6 +309,7 @@ impl Input { } } + /// Retrieve the contents from the input by making a network request async fn url_contents(url: &Url) -> Result { // Assume HTML for default paths let file_type = if url.path().is_empty() || url.path() == "/" { @@ -329,6 +330,8 @@ impl Input { Ok(input_content) } + /// Retrieve the contents from the input by expanding the glob pattern and + /// reading from the filesystem fn glob_contents( &self, pattern: &str, @@ -372,6 +375,7 @@ impl Input { } /// Get the input content of a given path + /// /// # Errors /// /// Will return `Err` if file contents can't be read @@ -391,6 +395,7 @@ impl Input { Ok(input_content) } + /// Get the input content from stdin async fn stdin_content(file_type_hint: Option) -> Result { let mut content = String::new(); let mut stdin = stdin(); @@ -405,9 +410,18 @@ impl Input { Ok(input_content) } + /// Get the input content from a raw string fn string_content(s: &str, file_type_hint: Option) -> InputContent { InputContent::from_string(s, file_type_hint.unwrap_or_default()) } + + /// Check if the given input is a directory + pub fn is_dir(&self) -> bool { + match &self.source { + InputSource::FsPath(path) => path.is_dir(), + _ => false, + } + } } /// Function for path exclusion tests diff --git a/lychee-lib/src/types/root_dir.rs b/lychee-lib/src/types/root_dir.rs index 72a840dea4..a83b233cbf 100644 --- a/lychee-lib/src/types/root_dir.rs +++ b/lychee-lib/src/types/root_dir.rs @@ -7,6 +7,8 @@ use std::{ use serde::{Deserialize, Serialize}; +use super::{ErrorKind, Input, InputSource}; + /// The canonical root directory during document processing. /// /// This is used to resolve relative paths in a document, @@ -52,13 +54,17 @@ impl RootDir { let root_dir = if path.is_relative() { // The root directory must be an absolute path // Canonicalize the path relative to the current working directory - let root_dir = std::env::current_dir()?.join(path); - root_dir.canonicalize()?; + let root_dir = std::env::current_dir()?.join(&path); + root_dir + .canonicalize() + .map_err(|_| ErrorKind::InvalidRootDir(path.display().to_string()))?; root_dir } else { path.clone() }; + println!("Set root directory to: {}", root_dir.display()); + Ok(Self(root_dir)) } } @@ -82,3 +88,39 @@ impl From for PathBuf { root_dir.0 } } + +impl TryFrom for RootDir { + type Error = ErrorKind; + + fn try_from(input: Input) -> std::result::Result { + let source = input.source; + match source { + InputSource::FsPath(path) => { + if path.is_dir() { + Ok(Self::new(path)?) + } else { + Err(ErrorKind::InvalidRootDir(path.display().to_string())) + } + } + _ => Err(ErrorKind::InvalidRootDir( + "Root directory must be a local path".to_string(), + )), + } + } +} + +impl TryFrom<&Input> for RootDir { + type Error = ErrorKind; + + fn try_from(input: &Input) -> std::result::Result { + RootDir::try_from(input.clone()) + } +} + +impl TryFrom<&str> for RootDir { + type Error = ErrorKind; + + fn try_from(value: &str) -> std::result::Result { + RootDir::new(value) + } +} diff --git a/lychee-lib/src/utils/path.rs b/lychee-lib/src/utils/path.rs index daa4f7fde6..964e90cebe 100644 --- a/lychee-lib/src/utils/path.rs +++ b/lychee-lib/src/utils/path.rs @@ -32,6 +32,7 @@ pub(crate) fn resolve( dst: &PathBuf, ignore_absolute_local_links: bool, ) -> Result> { + println!("Resolving {} from {}", dst.display(), src.display()); let resolved = match dst { relative if dst.is_relative() => { // Find `dst` in the parent directory of `src` diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index d3990e9863..0b3241ea6d 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -78,11 +78,14 @@ fn try_parse_into_uri( // No base URL - handle as filesystem paths match source { InputSource::FsPath(source_path) => { + println!("Starting path resolution for: {:?}", source_path); let target_path = if text.starts_with('/') && root_dir.is_some() { - // Absolute paths: resolve via root_dir - let root = root_dir.unwrap(); - root.join(&text[1..]) + println!("Pre-root dir resolution: {:?}", text); + let result = root_dir.unwrap().join(&text[1..]); + println!("Post-root dir resolution: {:?}", result); + result } else { + println!("Resolving relative path"); // If text is just a fragment, we need to append it to the source path if is_anchor(&text) { return Url::from_file_path(source_path) @@ -111,6 +114,8 @@ fn try_parse_into_uri( } }; + println!("Final path: {:?}", target_path); + Url::from_file_path(&target_path) .map(|url| Uri { url }) .map_err(|()| ErrorKind::InvalidUrlFromPath(target_path))