Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 16 additions & 19 deletions src/diff/word.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,35 +19,32 @@ import {generateOptions} from '../util/params';
// - U+02DC ˜ ˜ Small Tilde
// - U+02DD ˝ ˝ Double Acute Accent
// Latin Extended Additional, 1E00–1EFF
// NOTE(review): this span looks like a rendered diff with the removed and the
// added line interleaved: `extendedWordChars` is declared twice, first as a
// regex and then as a string. Presumably only one of the two exists in the
// real file - confirm before relying on either form.
//
// Regex form: tests whether a whole string (anchored with ^...$) consists of
// one or more of the listed word characters; the `u` flag enables \u{...}.
const extendedWordChars = /^[a-zA-Z\u{C0}-\u{FF}\u{D8}-\u{F6}\u{F8}-\u{2C6}\u{2C8}-\u{2D7}\u{2DE}-\u{2FF}\u{1E00}-\u{1EFF}]+$/u;
// String form: the same character ranges written as regex source text (hence
// the doubled backslashes), without brackets or anchors, so it can be
// interpolated into a character class such as `[${extendedWordChars}]`.
const extendedWordChars = 'a-zA-Z\\u{C0}-\\u{FF}\\u{D8}-\\u{F6}\\u{F8}-\\u{2C6}\\u{2C8}-\\u{2D7}\\u{2DE}-\\u{2FF}\\u{1E00}-\\u{1EFF}';

// Despite its name, this matches any string that contains at least one
// NON-whitespace character; callers negate the test to detect blank tokens.
const reWhitespace = /\S/;
// A token is any of the following:
// * A newline (with or without a carriage return)
// * A run of word characters
// * A run of whitespace
// * A single character that doesn't belong to any of the above categories (and is therefore considered punctuation)
//
// The alternatives are tried in that order at each position, so "\r\n" is
// consumed as a single newline token. The whitespace alternative excludes
// both "\r" and "\n", so a "\r" that is not followed by "\n" can only match
// the last (single character) alternative.
// Flags: `u` is required for the \u{...} escapes in extendedWordChars; `g`
// makes String.prototype.match return every match rather than just the first.
const tokenizeRegex = new RegExp(`\\r?\\n|[${extendedWordChars}]+|[^\\S\\r\\n]+|[^${extendedWordChars}]`, 'ug');

// Diff instance used for word-level comparison; its `equals` and `tokenize`
// methods are assigned below.
export const wordDiff = new Diff();
// Decides whether two tokens count as equal.
//  - options.ignoreCase: both tokens are lower-cased before comparing.
//  - options.ignoreWhitespace: two tokens that are each empty or made up only
//    of whitespace count as equal even if their text differs.
// Returns a truthy value when the tokens are considered equal.
//
// NOTE(review): this span looks like a rendered diff with removed and added
// lines interleaved. The first `return` is presumably the old line and the
// code after it its replacement - confirm against the real file. As written
// here, everything after the first `return` is unreachable. Both forms treat
// the same tokens as blank: "no non-whitespace character" is the same test as
// "empty, or whitespace from start to end".
wordDiff.equals = function(left, right, options) {
if (options.ignoreCase) {
left = left.toLowerCase();
right = right.toLowerCase();
}
return left === right || (options.ignoreWhitespace && !reWhitespace.test(left) && !reWhitespace.test(right));
// The comparisons to the empty string are needed PURELY to signal to
// buildValues that the whitespace token should be ignored. The empty string
// will never be a token (removeEmpty removes it) but buildValues uses empty
// string comparisons to test for ignored tokens and we need to handle that
// query here.
const leftIsWhitespace = (left === '' || (/^\s+$/).test(left));
const rightIsWhitespace = (right === '' || (/^\s+$/).test(right));
return left === right || (options.ignoreWhitespace && leftIsWhitespace && rightIsWhitespace);
};
// Splits `value` (a string) into an array of tokens for word-level diffing.
//
// NOTE(review): this span looks like a rendered diff with removed and added
// lines interleaved. The split-and-merge code down to `return tokens;` is
// presumably the old implementation and the final `return value.match(...)`
// its replacement - confirm against the real file. As written here, the
// second `return` can never be reached.
wordDiff.tokenize = function(value) {
// Runs of whitespace other than CR/LF are grouped into a single token; each
// CR, LF, bracket or quote character becomes its own token; the string is
// also split at every word boundary (\b). The capturing group keeps the
// separators in the result, and the split can produce empty strings.
let tokens = value.split(/([^\S\r\n]+|[()[\]{}'"\r\n]|\b)/);

// Join the boundary splits that we do not consider to be boundaries. This is primarily the extended Latin character set.
for (let i = 0; i < tokens.length - 1; i++) {
// If we have an empty string in the next field and we have only word chars before and after, merge
if (!tokens[i + 1] && tokens[i + 2]
&& extendedWordChars.test(tokens[i])
&& extendedWordChars.test(tokens[i + 2])) {
tokens[i] += tokens[i + 2];
tokens.splice(i + 1, 2);
// Step back so the merged token is checked again against what now follows it.
i--;
}
}

return tokens;
// Each regex match becomes one token. `match` returns null when nothing
// matches (e.g. for an empty string), hence the empty-array fallback.
return value.match(tokenizeRegex) || [];
};

export function diffWords(oldStr, newStr, options) {
Expand Down
49 changes: 44 additions & 5 deletions test/diff/word.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,50 @@ import {convertChangesToXML} from '../../lib/convert/xml';
import {expect} from 'chai';

describe('WordDiff', function() {
describe('#tokenize', function() {
  // One input that mixes plain and accented words, single spaces, a tab/space
  // run, bare and CRLF newlines, a hyphen, brackets and a full stop.
  it('should give words, punctuation marks, newlines, and runs of whitespace their own token', function() {
    const input = 'foo bar baz jurídica wir üben bla\t\t \txyzáxyz \n\n\n animá-los\r\n\r\n(wibbly wobbly)().';

    // Expected tokens, grouped by the part of the input they come from.
    const expectedTokens = [
      'foo', ' ', 'bar', ' ', 'baz', ' ',
      'jurídica', ' ', 'wir', ' ', 'üben', ' ',
      'bla', '\t\t \t', 'xyzáxyz', ' ',
      '\n', '\n', '\n', ' ',
      'animá', '-', 'los',
      '\r\n', '\r\n',
      '(', 'wibbly', ' ', 'wobbly', ')',
      '(', ')', '.'
    ];

    expect(wordDiff.tokenize(input)).to.deep.equal(expectedTokens);
  });
});

describe('#diffWords', function() {
it('should diff whitespace', function() {
const diffResult = diffWords('New Value', 'New ValueMoreData');
Expand Down Expand Up @@ -61,11 +105,6 @@ describe('WordDiff', function() {
expect(convertChangesToXML(diffResult)).to.equal('<del>New</del><ins>Value</ins> Value New <del>Value</del><ins>New</ins>');
});

it('should token unicode characters safely', function() {
  // Tokenize the text, then pass the result through removeEmpty before comparing.
  const tokensOf = function(text) {
    return wordDiff.removeEmpty(wordDiff.tokenize(text));
  };

  expect(tokensOf('jurídica')).to.eql(['jurídica']);
  expect(tokensOf('wir üben')).to.eql(['wir', ' ', 'üben']);
});

it('should include count with identity cases', function() {
expect(diffWords('foo', 'foo')).to.eql([{value: 'foo', count: 1, removed: false, added: false}]);
expect(diffWords('foo bar', 'foo bar')).to.eql([{value: 'foo bar', count: 3, removed: false, added: false}]);
Expand Down