diff --git a/.github/workflows/check-pr-links.yml b/.github/workflows/check-pr-links.yml
index 98afdfe..e41bbc8 100644
--- a/.github/workflows/check-pr-links.yml
+++ b/.github/workflows/check-pr-links.yml
@@ -9,59 +9,23 @@ jobs:
linkChecker:
runs-on: ubuntu-latest
steps:
- - name: Clone repository
- uses: actions/checkout@v5
- with:
- fetch-depth: 0
-
- - name: Setup Node.js
- uses: actions/setup-node@v6
- with:
- node-version: "20"
+ - uses: actions/checkout@v5
- - name: Setup pnpm
- uses: pnpm/action-setup@v4
+ - name: Build site
+ uses: withastro/action@v5
with:
- version: latest
-
- - name: Check out master branch
- run: git checkout master
-
- - name: Install dependencies for master
- run: pnpm install --frozen-lockfile
-
- - name: Build site from master
- run: pnpm build
-
- - name: Dump all links from master
- id: dump_links_from_master
- uses: lycheeverse/lychee-action@v2
- with:
- args: '--dump --root-dir ${{ github.workspace }}/dist --exclude-all-private dist'
- output: ./links-master.txt
-
- - name: Stash untracked files
- run: git stash push --include-untracked
-
- - name: Check out feature branch
- run: git fetch origin ${{ github.ref }} && git checkout FETCH_HEAD
-
- - name: Apply stashed changes
- run: git stash pop || true
-
- - name: Install dependencies for feature branch
- run: pnpm install --frozen-lockfile
-
- - name: Build site from feature branch
- run: pnpm build
-
- - name: Append links-master.txt to .lycheeignore
- run: cat links-master.txt >> .lycheeignore
+ package-manager: pnpm@latest
- - name: Check links in PR changes
+ - name: Check links
uses: lycheeverse/lychee-action@v2
with:
- args: '--root-dir ${{ github.workspace }}/dist --exclude-all-private dist'
+ # Remap links to the live site onto the local build output, because pages added in this PR are not deployed yet (not yet on master)
+ args: |
+ --root-dir $PWD/dist
+ --exclude-all-private
+ --remap 'https://lychee\.cli\.rs/(.*)/ file://'$PWD'/dist/$1/index.html'
+ dist/
+ src/
fail: true
- name: Suggestions
diff --git a/.lycheeignore b/.lycheeignore
index 1438bff..3646e85 100644
--- a/.lycheeignore
+++ b/.lycheeignore
@@ -1,8 +1,17 @@
-https://api.reacher.email/v0/check_email
file:///home/user/website/
^https://www/$
^https://web/$
-# 404 page returns a 404, d'oh
-https://lychee.cli.rs/404/
-# Errors with "Too Many Requests"
+
+# URL is used with POST
+https://api.reacher.email/v0/check_email
+
+# The 404 page is built to dist/404.html, but the remap rewrites its URL to dist/404/index.html, which does not exist
+dist/404/index.html$
+
+# Code examples in base-url.mdx which don't exist
+/docs/about.php$
+/docs/recipes/guide.php$
+
+# Websites with aggressive rate limiting / bot detection
https://www.nongnu.org/atool
+https://builtwith.com/
diff --git a/astro.config.mjs b/astro.config.mjs
index 4bd10da..99e78a4 100644
--- a/astro.config.mjs
+++ b/astro.config.mjs
@@ -42,6 +42,7 @@ export default defineConfig({
"guides/config",
"guides/cli",
"guides/output",
+ "guides/preprocessing",
],
},
{
diff --git a/src/content/docs/guides/getting-started.mdx b/src/content/docs/guides/getting-started.mdx
index aad6f15..5d5eef9 100644
--- a/src/content/docs/guides/getting-started.mdx
+++ b/src/content/docs/guides/getting-started.mdx
@@ -23,7 +23,7 @@ You can install Lychee using various package managers.
-
+
@@ -206,24 +206,11 @@ In this command, we ignore the case when globbing, so it matches
- `~/projects/rust_game_/README`
- `~/projects/python_script_/Readme.markdown`
-### Check Links From Epub File
+### Check other file formats
-If you have [atool](https://www.nongnu.org/atool) installed, you can check links inside `.epub` files as well!
-
-```bash
-acat -F zip {file.epub} "_.xhtml" "_.html" | lychee -
-```
-
-:::caution[Attention]
-lychee parses other file formats as plaintext and extracts links using [linkify](https://github.com/robinst/linkify).
-This generally works well if there are no format- or encoding
-specifics, but in case you need dedicated support for a new file format, please
-consider [creating an issue](https://github.com/lycheeverse/lychee/issues).
-:::
-
-[atool]: https://www.nongnu.org/atool
-[linkify]: https://github.com/robinst/linkify
-[issue]: https://github.com/lycheeverse/lychee/issues
+By preprocessing files, you can check links in
+file formats that lychee does not support natively.
+See [file preprocessing](/guides/preprocessing).
## GitHub Action
diff --git a/src/content/docs/guides/preprocessing.md b/src/content/docs/guides/preprocessing.md
new file mode 100644
index 0000000..2cf873a
--- /dev/null
+++ b/src/content/docs/guides/preprocessing.md
@@ -0,0 +1,69 @@
+---
+title: File preprocessing
+---
+
+Out of the box, lychee supports HTML, Markdown, and plain text formats.
+More precisely, HTML files are parsed as HTML5 using the [html5ever] parser.
+Markdown files are treated as [CommonMark] and parsed with [pulldown-cmark].
+
+For any other file format lychee falls back to a "plain text" mode.
+This means that [linkify] attempts to extract URLs on a best-effort basis.
+If invalid UTF-8 characters are encountered, the input file is skipped,
+because it is assumed that the file is in a binary format lychee cannot understand.
+
+lychee allows file preprocessing with the `--preprocess` flag.
+For each input file the command specified with `--preprocess` is invoked instead of reading the input file directly.
+The following examples show how to preprocess common file formats.
+In most cases it's necessary to create a helper script for preprocessing,
+as no parameters can be supplied from the CLI directly.
+
+```bash
+lychee files/* --preprocess ./preprocess.sh
+```
+
+The referenced `preprocess.sh` script could look like this:
+
+```bash
+#!/usr/bin/env bash
+
+case "$1" in
+*.pdf)
+ exec pdftohtml -i -s -stdout "$1"
+ # Alternatives:
+ # exec pdftotext "$1" -
+ # exec pdftk "$1" output - uncompress | grep -aPo '/URI *\(\K[^)]*'
+ ;;
+*.odt|*.docx|*.epub|*.ipynb)
+ exec pandoc "$1" --to=html --wrap=none --markdown-headings=atx
+ ;;
+*.odp|*.pptx|*.ods|*.xlsx)
+ # libreoffice can't print to stdout unfortunately
+ libreoffice --headless --convert-to html "$1" --outdir /tmp
+ file=$(basename "$1")
+ file="/tmp/${file%.*}.html"
+ sed '/
-Guide
-About
-External
+Guide
+About
+Absolute
-Guide
-About
-External`} lang={fileLang}
+Guide
+About
+Absolute`} lang={fileLang}
title="Link Resolution Example" />
## Common Use Cases