diff --git a/.github/workflows/check-pr-links.yml b/.github/workflows/check-pr-links.yml
index 98afdfe..e41bbc8 100644
--- a/.github/workflows/check-pr-links.yml
+++ b/.github/workflows/check-pr-links.yml
@@ -9,59 +9,23 @@ jobs:
   linkChecker:
     runs-on: ubuntu-latest
     steps:
-      - name: Clone repository
-        uses: actions/checkout@v5
-        with:
-          fetch-depth: 0
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "20"
+      - uses: actions/checkout@v5
 
-      - name: Setup pnpm
-        uses: pnpm/action-setup@v4
+      - name: Build site
+        uses: withastro/action@v5
         with:
-          version: latest
-
-      - name: Check out master branch
-        run: git checkout master
-
-      - name: Install dependencies for master
-        run: pnpm install --frozen-lockfile
-
-      - name: Build site from master
-        run: pnpm build
-
-      - name: Dump all links from master
-        id: dump_links_from_master
-        uses: lycheeverse/lychee-action@v2
-        with:
-          args: '--dump --root-dir ${{ github.workspace }}/dist --exclude-all-private dist'
-          output: ./links-master.txt
-
-      - name: Stash untracked files
-        run: git stash push --include-untracked
-
-      - name: Check out feature branch
-        run: git fetch origin ${{ github.ref }} && git checkout FETCH_HEAD
-
-      - name: Apply stashed changes
-        run: git stash pop || true
-
-      - name: Install dependencies for feature branch
-        run: pnpm install --frozen-lockfile
-
-      - name: Build site from feature branch
-        run: pnpm build
-
-      - name: Append links-master.txt to .lycheeignore
-        run: cat links-master.txt >> .lycheeignore
+          package-manager: pnpm@latest
 
-      - name: Check links in PR changes
+      - name: Check links
         uses: lycheeverse/lychee-action@v2
         with:
-          args: '--root-dir ${{ github.workspace }}/dist --exclude-all-private dist'
+          # Remap live URLs to build directory because the links are potentially not live (not yet on master)
+          args: |
+            --root-dir $PWD/dist
+            --exclude-all-private
+            --remap 'https://lychee\.cli\.rs/(.*)/ file://'$PWD'/dist/$1/index.html'
+            dist/
+            src/
           fail: true
 
       - name: Suggestions
diff 
--git a/.lycheeignore b/.lycheeignore index 1438bff..3646e85 100644 --- a/.lycheeignore +++ b/.lycheeignore @@ -1,8 +1,17 @@ -https://api.reacher.email/v0/check_email file:///home/user/website/ ^https://www/$ ^https://web/$ -# 404 page returns a 404, d'oh -https://lychee.cli.rs/404/ -# Errors with "Too Many Requests" + +# API endpoint that is meant to be called with POST, not followed as a link +https://api.reacher.email/v0/check_email + +# 404 page is built to dist/404.html, but the remap rewrites its URL to dist/404/index.html, which doesn't exist +dist/404/index.html$ + +# Example paths from the code samples in base-url.mdx; these pages don't exist +/docs/about.php$ +/docs/recipes/guide.php$ + +# Websites with aggressive rate limiting / bot detection https://www.nongnu.org/atool +https://builtwith.com/ diff --git a/astro.config.mjs b/astro.config.mjs index 4bd10da..99e78a4 100644 --- a/astro.config.mjs +++ b/astro.config.mjs @@ -42,6 +42,7 @@ export default defineConfig({ "guides/config", "guides/cli", "guides/output", + "guides/preprocessing", ], }, { diff --git a/src/content/docs/guides/getting-started.mdx b/src/content/docs/guides/getting-started.mdx index aad6f15..5d5eef9 100644 --- a/src/content/docs/guides/getting-started.mdx +++ b/src/content/docs/guides/getting-started.mdx @@ -23,7 +23,7 @@ You can install Lychee using various package managers. - + @@ -206,24 +206,11 @@ In this command, we ignore the case when globbing, so it matches - `~/projects/rust_game_/README` - `~/projects/python_script_/Readme.markdown` -### Check Links From Epub File +### Check other file formats -If you have [atool](https://www.nongnu.org/atool) installed, you can check links inside `.epub` files as well! - -```bash -acat -F zip {file.epub} "_.xhtml" "_.html" | lychee - -``` - -:::caution[Attention] -lychee parses other file formats as plaintext and extracts links using [linkify](https://github.com/robinst/linkify). 
-This generally works well if there are no format- or encoding -specifics, but in case you need dedicated support for a new file format, please -consider [creating an issue](https://github.com/lycheeverse/lychee/issues). -::: - -[atool]: https://www.nongnu.org/atool -[linkify]: https://github.com/robinst/linkify -[issue]: https://github.com/lycheeverse/lychee/issues +By preprocessing files, it is possible to check links in +files that aren't officially supported by lychee. +See [file preprocessing](/guides/preprocessing). ## GitHub Action diff --git a/src/content/docs/guides/preprocessing.md b/src/content/docs/guides/preprocessing.md new file mode 100644 index 0000000..2cf873a --- /dev/null +++ b/src/content/docs/guides/preprocessing.md @@ -0,0 +1,69 @@ +--- +title: File preprocessing +--- + +Out of the box, lychee supports HTML, Markdown and plain text formats. +More precisely, HTML files are parsed as HTML5 with the use of the [html5ever] parser. +Markdown files are treated as [CommonMark] with the use of [pulldown-cmark]. + +For any other file format, lychee falls back to a "plain text" mode. +This means that [linkify] attempts to extract URLs on a best-effort basis. +If invalid UTF-8 characters are encountered, the input file is skipped, +because it is assumed that the file is in a binary format lychee cannot understand. + +lychee allows file preprocessing with the `--preprocess` flag. +For each input file the command specified with `--preprocess` is invoked instead of reading the input file directly. +The following examples show how to preprocess common file formats. +In most cases it's necessary to create a helper script for preprocessing, +as no parameters can be supplied from the CLI directly. 
+ +```bash +lychee files/* --preprocess ./preprocess.sh +``` + +The referenced `preprocess.sh` script could look like this: + +```bash +#!/usr/bin/env bash + +case "$1" in +*.pdf) + exec pdftohtml -i -s -stdout "$1" + # Alternatives: + # exec pdftotext "$1" - + # exec pdftk "$1" output - uncompress | grep -aPo '/URI *\(\K[^)]*' + ;; +*.odt|*.docx|*.epub|*.ipynb) + exec pandoc "$1" --to=html --wrap=none --markdown-headings=atx + ;; +*.odp|*.pptx|*.ods|*.xlsx) + # libreoffice can't print to stdout unfortunately + libreoffice --headless --convert-to html "$1" --outdir /tmp + file=$(basename "$1") + file="/tmp/${file%.*}.html" + sed '/ -Guide -About -External +Guide +About +Absolute -Guide -About -External`} lang={fileLang} +Guide +About +Absolute`} lang={fileLang} title="Link Resolution Example" /> ## Common Use Cases