kpdecker · ExplodingCabbage · Feb 19, 2024 · Feb 19, 2024 · Feb 19, 2024 · Feb 19, 2024
diff --git a/src/diff/word.js b/src/diff/word.js
@@ -19,35 +19,32 @@ import {generateOptions} from '../util/params';
 //  - U+02DC  ˜ &#732;  Small Tilde
 //  - U+02DD  ˝ &#733;  Double Acute Accent
 // Latin Extended Additional, 1E00–1EFF
-const extendedWordChars = /^[a-zA-Z\u{C0}-\u{FF}\u{D8}-\u{F6}\u{F8}-\u{2C6}\u{2C8}-\u{2D7}\u{2DE}-\u{2FF}\u{1E00}-\u{1EFF}]+$/u;
+const extendedWordChars = 'a-zA-Z0-9_\\u{C0}-\\u{FF}\\u{D8}-\\u{F6}\\u{F8}-\\u{2C6}\\u{2C8}-\\u{2D7}\\u{2DE}-\\u{2FF}\\u{1E00}-\\u{1EFF}';
 
-const reWhitespace = /\S/;
+// A token is any of the following (alternatives are tried in this order):
+// * A newline (with or without a preceding carriage return)
+// * A run of word characters (ASCII letters, digits and underscore, as \b used to treat them, plus extended Latin letters)
+// * A run of whitespace other than \r and \n
+// * A single character that doesn't belong to any of the above categories (and is therefore considered punctuation)
+const tokenizeRegex = new RegExp(`\\r?\\n|[${extendedWordChars}]+|[^\\S\\r\\n]+|[^${extendedWordChars}]`, 'ug');
 
 export const wordDiff = new Diff();
 wordDiff.equals = function(left, right, options) {
   if (options.ignoreCase) {
     left = left.toLowerCase();
     right = right.toLowerCase();
   }
-  return left === right || (options.ignoreWhitespace && !reWhitespace.test(left) && !reWhitespace.test(right));
+  // Cheap checks first: the whitespace regexes only run when they can matter.
+  if (left === right) { return true; }
+  if (!options.ignoreWhitespace) { return false; }
+  // The empty string counts as whitespace PURELY so that buildValues can ask
+  // whether a whitespace token is ignored: '' is never a real token (removeEmpty
+  // removes it), but buildValues tests for ignored tokens by comparing with ''.
+  const isBlank = (token) => token === '' || (/^\s+$/).test(token);
+  return isBlank(left) && isBlank(right);
 };
 wordDiff.tokenize = function(value) {
-  // All whitespace symbols except newline group into one token, each newline - in separate token
-  let tokens = value.split(/([^\S\r\n]+|[()[\]{}'"\r\n]|\b)/);
-
-  // Join the boundary splits that we do not consider to be boundaries. This is primarily the extended Latin character set.
-  for (let i = 0; i < tokens.length - 1; i++) {
-    // If we have an empty string in the next field and we have only word chars before and after, merge
-    if (!tokens[i + 1] && tokens[i + 2]
-          && extendedWordChars.test(tokens[i])
-          && extendedWordChars.test(tokens[i + 2])) {
-      tokens[i] += tokens[i + 2];
-      tokens.splice(i + 1, 2);
-      i--;
-    }
-  }
-
-  return tokens;
+  return value.match(tokenizeRegex) || []; // match() yields null, not [], when nothing matches (e.g. value is '')
 };
 
 export function diffWords(oldStr, newStr, options) {

diff --git a/test/diff/word.js b/test/diff/word.js
@@ -4,6 +4,50 @@ import {convertChangesToXML} from '../../lib/convert/xml';
 import {expect} from 'chai';
 
 describe('WordDiff', function() {
+  describe('#tokenize', function() {
+    it('should give words, punctuation marks, newlines, and runs of whitespace their own token', function() {
+      expect(
+        wordDiff.tokenize(
+          'foo bar baz jurídica wir üben    bla\t\t \txyzáxyz  \n\n\n  animá-los\r\n\r\n(wibbly wobbly)().'
+        )
+      ).to.deep.equal([
+        'foo',
+        ' ',
+        'bar',
+        ' ',
+        'baz',
+        ' ',
+        'jurídica', // accented letters stay inside the word
+        ' ',
+        'wir',
+        ' ',
+        'üben',
+        '    ', // a run of spaces is a single token
+        'bla',
+        '\t\t \t', // mixed tabs and spaces also form a single token
+        'xyzáxyz',
+        '  ',
+        '\n', // each newline is its own token, never merged into a run
+        '\n',
+        '\n',
+        '  ',
+        'animá', // the hyphen is punctuation, so it splits the word in three
+        '-',
+        'los',
+        '\r\n', // CRLF is kept together as one newline token
+        '\r\n',
+        '(',
+        'wibbly',
+        ' ',
+        'wobbly',
+        ')', // adjacent punctuation marks are separate one-character tokens
+        '(',
+        ')',
+        '.'
+      ]);
+    });
+  });
+
   describe('#diffWords', function() {
     it('should diff whitespace', function() {
       const diffResult = diffWords('New Value', 'New  ValueMoreData');
@@ -61,11 +105,6 @@ describe('WordDiff', function() {
       expect(convertChangesToXML(diffResult)).to.equal('<del>New</del><ins>Value</ins> Value New <del>Value</del><ins>New</ins>');
     });
 
-    it('should token unicode characters safely', function() {
-      expect(wordDiff.removeEmpty(wordDiff.tokenize('jurídica'))).to.eql(['jurídica']);
-      expect(wordDiff.removeEmpty(wordDiff.tokenize('wir üben'))).to.eql(['wir', ' ', 'üben']);
-    });
-
     it('should include count with identity cases', function() {
       expect(diffWords('foo', 'foo')).to.eql([{value: 'foo', count: 1, removed: false, added: false}]);
       expect(diffWords('foo bar', 'foo bar')).to.eql([{value: 'foo bar', count: 3, removed: false, added: false}]);