From 329f85b09018f5cf55000a58786163561e037bd2 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Sun, 6 Jul 2025 19:01:41 +0100 Subject: [PATCH 1/7] Fix repository URL in package.json (#622) --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index b941f247..a5dc45a8 100644 --- a/package.json +++ b/package.json @@ -23,7 +23,7 @@ "license": "BSD-3-Clause", "repository": { "type": "git", - "url": "git://github.com/kpdecker/jsdiff.git" + "url": "https://github.com/kpdecker/jsdiff.git" }, "engines": { "node": ">=0.3.1" From cf44a37359be33feeea2a23938be7f410b1e397b Mon Sep 17 00:00:00 2001 From: Braden Kopenkoskey Date: Tue, 29 Jul 2025 11:10:01 -0400 Subject: [PATCH 2/7] fix: export ArrayChange type (#626) * fix: export ArrayChange type * Make ArrayChange take a type parameter like in the old DefinitelyTyped definitions --------- Co-authored-by: Mark Amery --- src/index.ts | 2 ++ src/types.ts | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/index.ts b/src/index.ts index edec16dd..a21c29b5 100644 --- a/src/index.ts +++ b/src/index.ts @@ -47,6 +47,7 @@ import {convertChangesToXML} from './convert/xml.js'; import type { ChangeObject, Change, + ArrayChange, DiffArraysOptionsAbortable, DiffArraysOptionsNonabortable, DiffCharsOptionsAbortable, @@ -102,6 +103,7 @@ export { export type { ChangeObject, Change, + ArrayChange, DiffArraysOptionsAbortable, DiffArraysOptionsNonabortable, DiffCharsOptionsAbortable, diff --git a/src/types.ts b/src/types.ts index b8167fd5..1ae11370 100644 --- a/src/types.ts +++ b/src/types.ts @@ -23,7 +23,7 @@ export interface ChangeObject { // explicitly reference by name in their own code, so keeping its name consistent is valuable even // though the names of many other types are inconsistent with the old DefinitelyTyped names. 
export type Change = ChangeObject<string>; -export type ArrayChange = ChangeObject<any[]>; +export type ArrayChange<T> = ChangeObject<T[]>; export interface CommonDiffOptions { /** From 6edb453f93a314c0f98e78cbd8495458a56a1d4f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 6 Aug 2025 17:07:18 +0000 Subject: [PATCH 3/7] Bump tmp from 0.2.3 to 0.2.4 Bumps [tmp](https://github.com/raszi/node-tmp) from 0.2.3 to 0.2.4. - [Changelog](https://github.com/raszi/node-tmp/blob/master/CHANGELOG.md) - [Commits](https://github.com/raszi/node-tmp/compare/v0.2.3...v0.2.4) --- updated-dependencies: - dependency-name: tmp dependency-version: 0.2.4 dependency-type: indirect ... Signed-off-by: dependabot[bot] --- yarn.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/yarn.lock b/yarn.lock index 5664b5b4..c1917d51 100644 --- a/yarn.lock +++ b/yarn.lock @@ -5231,9 +5231,9 @@ thunky@^1.0.2: integrity sha512-eHY7nBftgThBqOyHGVN+l8gF0BucP09fMo0oO/Lb0w1OF80dJv+lDVpXG60WMQvkcxAkNybKsrEIE3ZtKGmPrA== tmp@^0.2.1: - version "0.2.3" - resolved "https://registry.yarnpkg.com/tmp/-/tmp-0.2.3.tgz#eb783cc22bc1e8bebd0671476d46ea4eb32a79ae" - integrity sha512-nZD7m9iCPC5g0pYmcaxogYKggSfLsdxl8of3Q/oIbqCqLLIO9IAF0GWjX1z9NZRHPiXv8Wex4yDCaZsgEw0Y8w== + version "0.2.4" + resolved "https://registry.yarnpkg.com/tmp/-/tmp-0.2.4.tgz#c6db987a2ccc97f812f17137b36af2b6521b0d13" + integrity sha512-UdiSoX6ypifLmrfQ/XfiawN6hkjSBpCjhKxxZcWlUUmoXLaCKQU0bx4HF/tdDK2uzRuchf1txGvrWBzYREssoQ== to-regex-range@^5.0.1: version "5.0.1" From da071fef795a49c00b199530223b61afe1668382 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Mon, 6 Oct 2025 12:17:29 +0100 Subject: [PATCH 4/7] Fix diffWords crashing when used with an Intl.Segmenter on a text with consecutive newlines (#631) * Add test for broken case reported in https://github.com/kpdecker/jsdiff/issues/630 * Fix the bug * Add release notes --- release-notes.md | 4 ++++ src/diff/word.ts | 19 
++++++++++++++++++- test/diff/word.js | 12 ++++++++++++ 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/release-notes.md b/release-notes.md index 15f4a6b1..9640342f 100644 --- a/release-notes.md +++ b/release-notes.md @@ -1,5 +1,9 @@ # Release Notes +## Future 8.0.3 release + +- [#631](https://github.com/kpdecker/jsdiff/pull/631) - **fix support for using an `Intl.Segmenter` with `diffWords`**. This has been almost completely broken since the feature was added in v6.0.0, since it would outright crash on any text that featured two consecutive newlines between a pair of words (a very common case). + ## 8.0.2 - [#616](https://github.com/kpdecker/jsdiff/pull/616) **Restored compatibility of `diffSentences` with old Safari versions.** This was broken in 8.0.0 by the introduction of a regex with a [lookbehind assertion](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Lookbehind_assertion); these weren't supported in Safari prior to version 16.4. diff --git a/src/diff/word.ts b/src/diff/word.ts index 922ac746..9e9ded2a 100644 --- a/src/diff/word.ts +++ b/src/diff/word.ts @@ -67,7 +67,24 @@ class WordDiff extends Diff { if (segmenter.resolvedOptions().granularity != 'word') { throw new Error('The segmenter passed must have a granularity of "word"'); } - parts = Array.from(segmenter.segment(value), segment => segment.segment); + // We want `parts` to be an array whose elements alternate between being + // pure whitespace and being pure non-whitespace. This is ALMOST what the + // segments returned by a word-based Intl.Segmenter already look like, + // and therefore we can ALMOST get what we want by simply doing... + // parts = Array.from(segmenter.segment(value), segment => segment.segment); + // ... but not QUITE, because there's one annoying special case: every + // newline character gets its own segment, instead of sharing a segment + // with other surrounding whitespace. 
We therefore need to manually merge + // consecutive segments of whitespace into a single part: + parts = []; + for (const segmentObj of Array.from(segmenter.segment(value))) { + const segment = segmentObj.segment; + if (parts.length && (/\s/).test(parts[parts.length - 1]) && (/\s/).test(segment)) { + parts[parts.length - 1] += segment; + } else { + parts.push(segment); + } + } } else { parts = value.match(tokenizeIncludingWhitespace) || []; } diff --git a/test/diff/word.js b/test/diff/word.js index f255f321..c636da9f 100644 --- a/test/diff/word.js +++ b/test/diff/word.js @@ -284,6 +284,18 @@ describe('WordDiff', function() { diffWords('foo', 'bar', {intlSegmenter: segmenter}); }).to['throw']('The segmenter passed must have a granularity of "word"'); }); + + it("doesn't blow up when using an Intl.Segmenter on a text with a double newline", () => { + // Regression test for https://github.com/kpdecker/jsdiff/issues/630 + const englishSegmenter = new Intl.Segmenter('en', {granularity: 'word'}); + expect(convertChangesToXML(diffWords( + 'A\n\nX', + 'B\n\nX', + {intlSegmenter: englishSegmenter} + ))).to.equal( + '<del>A</del><ins>B</ins>\n\nX' + ); + }); }); describe('#diffWordsWithSpace', function() { From 93fb6331def3e1f69934bbb95394777d2c14194c Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Mon, 6 Oct 2025 12:23:21 +0100 Subject: [PATCH 5/7] Let Corepack add a packageManager property to package.json (#632) --- package.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/package.json b/package.json index a5dc45a8..fef94f4f 100644 --- a/package.json +++ b/package.json @@ -127,5 +127,6 @@ "lines": 100, "functions": 100, "statements": 100 - } + }, + "packageManager": "yarn@1.22.22+sha1.ac34549e6aa8e7ead463a7407e1c7390f61a6610" } From 3e1774afcfadc806baa58ee780923e4d4097319a Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Mon, 6 Oct 2025 12:24:22 +0100 Subject: [PATCH 6/7] Fix a comment typo (#633) --- tsconfig.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/tsconfig.json b/tsconfig.json index 87a5c8d0..87fea28c 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -22,7 +22,7 @@ // specifically for "a library author" who wants to ensure their code works "under all // possible library consumer compilation settings" - i.e. a person who is essentially 100% // guaranteed to be running both an ESM and CJS build. So how can the recommendation to - // turn this setting on possible be even remotely sane? Beats me. ¯\_(ツ)_/¯ + // turn this setting on possibly be even remotely sane? Beats me. ¯\_(ツ)_/¯ // I've done the best I can by using the @typescript-eslint/consistent-type-imports and // @typescript-eslint/consistent-type-exports ESLint rules to enforce SOME of what this // setting would have enforced, though I dunno if I'm enforcing the bits that motivated the From ad6dc1728e52e4124abcbf906072eaeaa9e63aea Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Wed, 8 Oct 2025 16:55:23 +0100 Subject: [PATCH 7/7] Fix some bugs in the diffWords regex (and errors & ambiguities in the comment above it) (#635) --- release-notes.md | 1 + src/diff/word.ts | 36 ++++++++++---------- test/diff/word.js | 83 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 103 insertions(+), 17 deletions(-) diff --git a/release-notes.md b/release-notes.md index 9640342f..2bb2108f 100644 --- a/release-notes.md +++ b/release-notes.md @@ -3,6 +3,7 @@ ## Future 8.0.3 release - [#631](https://github.com/kpdecker/jsdiff/pull/631) - **fix support for using an `Intl.Segmenter` with `diffWords`**. This has been almost completely broken since the feature was added in v6.0.0, since it would outright crash on any text that featured two consecutive newlines between a pair of words (a very common case). +- [#635](https://github.com/kpdecker/jsdiff/pull/635) - **small tweaks to tokenization behaviour of `diffWords`** when used *without* an `Intl.Segmenter`. 
Specifically, the soft hyphen (U+00AD) is no longer considered to be a word break, and the multiplication and division signs (`×` and `÷`) are now treated as punctuation instead of as letters / word characters. ## 8.0.2 diff --git a/src/diff/word.ts b/src/diff/word.ts index 9e9ded2a..5d19abcf 100644 --- a/src/diff/word.ts +++ b/src/diff/word.ts @@ -4,23 +4,25 @@ import { longestCommonPrefix, longestCommonSuffix, replacePrefix, replaceSuffix, // Based on https://en.wikipedia.org/wiki/Latin_script_in_Unicode // -// Ranges and exceptions: -// Latin-1 Supplement, 0080–00FF -// - U+00D7 × Multiplication sign -// - U+00F7 ÷ Division sign -// Latin Extended-A, 0100–017F -// Latin Extended-B, 0180–024F -// IPA Extensions, 0250–02AF -// Spacing Modifier Letters, 02B0–02FF -// - U+02C7 ˇ ˇ Caron -// - U+02D8 ˘ ˘ Breve -// - U+02D9 ˙ ˙ Dot Above -// - U+02DA ˚ ˚ Ring Above -// - U+02DB ˛ ˛ Ogonek -// - U+02DC ˜ ˜ Small Tilde -// - U+02DD ˝ ˝ Double Acute Accent -// Latin Extended Additional, 1E00–1EFF -const extendedWordChars = 'a-zA-Z0-9_\\u{C0}-\\u{FF}\\u{D8}-\\u{F6}\\u{F8}-\\u{2C6}\\u{2C8}-\\u{2D7}\\u{2DE}-\\u{2FF}\\u{1E00}-\\u{1EFF}'; +// Chars/ranges counted as "word" characters by this regex are as follows: +// +// + U+00AD Soft hyphen +// + 00C0–00FF (letters with diacritics from the Latin-1 Supplement), except: +// - U+00D7 × Multiplication sign +// - U+00F7 ÷ Division sign +// + Latin Extended-A, 0100–017F +// + Latin Extended-B, 0180–024F +// + IPA Extensions, 0250–02AF +// + Spacing Modifier Letters, 02B0–02FF, except: +// - U+02C7 ˇ ˇ Caron +// - U+02D8 ˘ ˘ Breve +// - U+02D9 ˙ ˙ Dot Above +// - U+02DA ˚ ˚ Ring Above +// - U+02DB ˛ ˛ Ogonek +// - U+02DC ˜ ˜ Small Tilde +// - U+02DD ˝ ˝ Double Acute Accent +// + Latin Extended Additional, 1E00–1EFF +const extendedWordChars = 'a-zA-Z0-9_\\u{AD}\\u{C0}-\\u{D6}\\u{D8}-\\u{F6}\\u{F8}-\\u{2C6}\\u{2C8}-\\u{2D7}\\u{2DE}-\\u{2FF}\\u{1E00}-\\u{1EFF}'; // Each token is one of the following: // - A punctuation mark plus the 
surrounding whitespace diff --git a/test/diff/word.js b/test/diff/word.js index c636da9f..5afbdbf1 100644 --- a/test/diff/word.js +++ b/test/diff/word.js @@ -59,6 +59,89 @@ describe('WordDiff', function() { '.' ]); }); + + // Test for various behaviours discussed at + // https://github.com/kpdecker/jsdiff/issues/634#issuecomment-3381707327 + // In particular we are testing that: + // 1. single code points representing accented characters (most of range + // U+00C0 thru U+00FF) are treated as word characters + // 2. soft hyphens are treated as part of the word they appear in + // 3. the multiplication and division signs are punctuation + // 4. currency signs are punctuation + // 5. section symbol is punctuation + // 6. reserved trademark symbol is punctuation + // 7. fractions are punctuation + // The behaviour being tested for in points 4 thru 7 above is of debatable + // correctness; it is not totally obvious whether we SHOULD treat those + // things as punctuation characters or as word characters. Nonetheless, we + // have this test to help document the current behaviour. + it('should handle the 0080-00FF range the way we expect', () => { + expect( + wordDiff.tokenize( + 'My daugh\u00adter, Am\u00E9lie, is 1½ years old and works for ' + + 'Google® for £6 per hour (equivalently £6÷60=£0.10 per minute, or ' + + '£6×8=£48 per day), in violation of § 123 of the Child Labour Act.' + ) + ).to.deep.equal([ + 'My ', + ' daugh\u00adter', + ', ', + ' Am\u00E9lie', + ', ', + ' is ', + ' 1', + '½ ', + ' years ', + ' old ', + ' and ', + ' works ', + ' for ', + ' Google', + '® ', + ' for ', + ' £', + '6 ', + ' per ', + ' hour ', + ' (', + 'equivalently ', + ' £', + '6', + '÷', + '60', + '=', + '£', + '0', + '.', + '10 ', + ' per ', + ' minute', + ', ', + ' or ', + ' £', + '6', + '×', + '8', + '=', + '£', + '48 ', + ' per ', + ' day', + ')', + ', ', + ' in ', + ' violation ', + ' of ', + ' § ', + ' 123 ', + ' of ', + ' the ', + ' Child ', + ' Labour ', + ' Act', + '.' 
+ ]); + }); }); describe('#diffWords', function() {