diff --git a/src/diff/word.js b/src/diff/word.js index 4ed3df26..1319d93a 100644 --- a/src/diff/word.js +++ b/src/diff/word.js @@ -19,9 +19,14 @@ import {generateOptions} from '../util/params'; // - U+02DC ˜ ˜ Small Tilde // - U+02DD ˝ ˝ Double Acute Accent // Latin Extended Additional, 1E00–1EFF -const extendedWordChars = /^[a-zA-Z\u{C0}-\u{FF}\u{D8}-\u{F6}\u{F8}-\u{2C6}\u{2C8}-\u{2D7}\u{2DE}-\u{2FF}\u{1E00}-\u{1EFF}]+$/u; +const extendedWordChars = 'a-zA-Z\\u{C0}-\\u{FF}\\u{D8}-\\u{F6}\\u{F8}-\\u{2C6}\\u{2C8}-\\u{2D7}\\u{2DE}-\\u{2FF}\\u{1E00}-\\u{1EFF}'; -const reWhitespace = /\S/; +// A token is any of the following: +// * A newline (with or without a carriage return) +// * A run of word characters +// * A run of whitespace +// * A single character that doesn't belong to any of the above categories (and is therefore considered punctuation) +const tokenizeRegex = new RegExp(`\\r?\\n|[${extendedWordChars}]+|[^\\S\\r\\n]+|[^${extendedWordChars}]`, 'ug'); export const wordDiff = new Diff(); wordDiff.equals = function(left, right, options) { @@ -29,25 +34,17 @@ wordDiff.equals = function(left, right, options) { left = left.toLowerCase(); right = right.toLowerCase(); } - return left === right || (options.ignoreWhitespace && !reWhitespace.test(left) && !reWhitespace.test(right)); + // The comparisons to the empty string are needed PURELY to signal to + // buildValues that the whitespace token should be ignored. The empty string + // will never be a token (removeEmpty removes it) but buildValues uses empty + // string comparisons to test for ignored tokens and we need to handle that + // query here. + const leftIsWhitespace = (left === '' || (/^\s+$/).test(left)); + const rightIsWhitespace = (right === '' || (/^\s+$/).test(right)); + return left === right || (options.ignoreWhitespace && leftIsWhitespace && rightIsWhitespace); }; wordDiff.tokenize = function(value) { - // All whitespace symbols except newline group into one token, each newline - in separate token - let tokens = value.split(/([^\S\r\n]+|[()[\]{}'"\r\n]|\b)/); - - // Join the boundary splits that we do not consider to be boundaries. This is primarily the extended Latin character set. - for (let i = 0; i < tokens.length - 1; i++) { - // If we have an empty string in the next field and we have only word chars before and after, merge - if (!tokens[i + 1] && tokens[i + 2] - && extendedWordChars.test(tokens[i]) - && extendedWordChars.test(tokens[i + 2])) { - tokens[i] += tokens[i + 2]; - tokens.splice(i + 1, 2); - i--; - } - } - - return tokens; + return value.match(tokenizeRegex) || []; }; export function diffWords(oldStr, newStr, options) { diff --git a/test/diff/word.js b/test/diff/word.js index 4deb84c4..59ab19ea 100644 --- a/test/diff/word.js +++ b/test/diff/word.js @@ -4,6 +4,50 @@ import {convertChangesToXML} from '../../lib/convert/xml'; import {expect} from 'chai'; describe('WordDiff', function() { + describe('#tokenize', function() { + it('should give words, punctuation marks, newlines, and runs of whitespace their own token', function() { + expect( + wordDiff.tokenize( + 'foo bar baz jurídica wir üben bla\t\t \txyzáxyz \n\n\n animá-los\r\n\r\n(wibbly wobbly)().' + ) + ).to.deep.equal([ + 'foo', + ' ', + 'bar', + ' ', + 'baz', + ' ', + 'jurídica', + ' ', + 'wir', + ' ', + 'üben', + ' ', + 'bla', + '\t\t \t', + 'xyzáxyz', + ' ', + '\n', + '\n', + '\n', + ' ', + 'animá', + '-', + 'los', + '\r\n', + '\r\n', + '(', + 'wibbly', + ' ', + 'wobbly', + ')', + '(', + ')', + '.' + ]); + }); + }); + describe('#diffWords', function() { it('should diff whitespace', function() { const diffResult = diffWords('New Value', 'New ValueMoreData'); @@ -61,11 +105,6 @@ describe('WordDiff', function() { expect(convertChangesToXML(diffResult)).to.equal('NewValue Value New ValueNew'); }); - it('should token unicode characters safely', function() { - expect(wordDiff.removeEmpty(wordDiff.tokenize('jurídica'))).to.eql(['jurídica']); - expect(wordDiff.removeEmpty(wordDiff.tokenize('wir üben'))).to.eql(['wir', ' ', 'üben']); - }); - it('should include count with identity cases', function() { expect(diffWords('foo', 'foo')).to.eql([{value: 'foo', count: 1, removed: false, added: false}]); expect(diffWords('foo bar', 'foo bar')).to.eql([{value: 'foo bar', count: 3, removed: false, added: false}]);