mongodb
diff --git a/jstests/aggregation/expressions/trim.js b/jstests/aggregation/expressions/trim.js
Lines changed: 45 additions & 6 deletions
diff --git a/jstests/libs/sbe_assert_error_override.js b/jstests/libs/sbe_assert_error_override.js
Lines changed: 4 additions & 0 deletions
diff --git a/src/mongo/db/SConscript b/src/mongo/db/SConscript
Lines changed: 1 addition & 0 deletions
diff --git a/src/mongo/db/exec/sbe/SConscript b/src/mongo/db/exec/sbe/SConscript
Lines changed: 1 addition & 0 deletions
diff --git a/src/mongo/db/exec/sbe/expressions/expression.cpp b/src/mongo/db/exec/sbe/expressions/expression.cpp
Lines changed: 3 additions & 0 deletions
diff --git a/src/mongo/db/exec/sbe/vm/vm.cpp b/src/mongo/db/exec/sbe/vm/vm.cpp
Lines changed: 33 additions & 0 deletions
diff --git a/src/mongo/db/exec/sbe/vm/vm.h b/src/mongo/db/exec/sbe/vm/vm.h
Lines changed: 6 additions & 0 deletions
diff --git a/src/mongo/db/pipeline/expression.cpp b/src/mongo/db/pipeline/expression.cpp
Lines changed: 12 additions & 128 deletions
@@ -5,6 +5,7 @@
 "use strict";
 load("jstests/aggregation/extras/utils.js");  // For assertErrorCode, testExpression and
                                               // testExpressionWithCollation.
+load("jstests/libs/sbe_assert_error_override.js");
 
 const coll = db.trim_expressions;
 
@@ -81,10 +82,48 @@ assert.eq(
         {_id: 4, proof: null},
     ]);
 
-// Test that errors are reported correctly.
-assertErrorCode(coll, [{$project: {x: {$trim: " x "}}}], 50696);
-assertErrorCode(coll, [{$project: {x: {$trim: {input: 4}}}}], 50699);
-assertErrorCode(coll, [{$project: {x: {$trim: {input: {$add: [4, 2]}}}}}], 50699);
-assertErrorCode(coll, [{$project: {x: {$trim: {input: "$_id"}}}}], 50699);
-assertErrorCode(coll, [{$project: {x: {$trim: {input: " x ", chars: "$_id"}}}}], 50700);
+// Semantically the same as the tests above, but with a non-constant input for 'chars'.
+coll.drop();
+assert.commandWorked(coll.insert([
+    {_id: 0, proof: "Left as an exercise for the reader∎", extra: "∎"},
+    {_id: 1, proof: "∎∃ proof∎", extra: "∎"},  // the leading "∎" is expected to survive $rtrim
+    {
+        _id: 2,
+        proof: "Just view the problem as a continuous DAG whose elements are taylor series∎",
+        extra: "∎"
+    },
+    {_id: 3, proof: null},  // null 'proof', no 'extra' field
+    {_id: 4},  // both 'proof' and 'extra' are missing
+]));
+assert.eq(
+    coll.aggregate(
+            [{$sort: {_id: 1}}, {$project: {proof: {$rtrim: {input: "$proof", chars: "$extra"}}}}])  // 'chars' is read from each document's 'extra' field
+        .toArray(),
+    [
+        {_id: 0, proof: "Left as an exercise for the reader"},
+        {_id: 1, proof: "∎∃ proof"},
+        {
+            _id: 2,
+            proof: "Just view the problem as a continuous DAG whose elements are taylor series"
+        },
+        {_id: 3, proof: null},  // null input is expected to produce null
+        {_id: 4, proof: null},  // missing input is expected to produce null
+    ]);
+
+coll.drop();
+assert.commandWorked(coll.insert([
+    {_id: 0, nonObject: " x "},  // supplies a string value for "$nonObject"
+    {_id: 1, constantNum: 4},  // supplies a numeric value for "$constantNum"
+]));
+
+// Test that errors are reported correctly (for all of $trim, $ltrim, $rtrim).
+for (const op of ["$trim", "$ltrim", "$rtrim"]) {
+    assertErrorCode(coll, [{$project: {x: {[op]: {}}}}], 50695);  // spec is an empty object
+    assertErrorCode(coll, [{$project: {x: {[op]: "$nonObject"}}}], 50696);  // spec is a field-path string rather than an object
+    assertErrorCode(coll, [{$project: {x: {[op]: {input: "$constantNum"}}}}], 50699);  // 'input' is the number 4 on document 1
+    assertErrorCode(
+        coll, [{$project: {x: {[op]: {input: {$add: ["$constantNum", "$constantNum"]}}}}}], 50699);  // 'input' is a computed $add expression over 'constantNum'
+    assertErrorCode(coll, [{$project: {x: {[op]: {input: "$_id"}}}}], 50699);  // 'input' is the numeric _id
+    assertErrorCode(coll, [{$project: {x: {[op]: {input: "$nonObject", chars: "$_id"}}}}], 50700);  // string 'input' (document 0) with numeric 'chars'
+}
 }());
@@ -163,6 +163,10 @@ const equivalentErrorCodesList = [
     [5787903, 7548606],
     [5787908, 7548606],
     [ErrorCodes.BadValue, 4938500],
+    [50700, 5156303],
+    [50699, 5156302],
+    [50697, 5156304],
+    [50698, 5156305],
     [5155800, 34473],
     [5155801, 34470],
 ];
 
@@ -1709,6 +1709,7 @@ env.Library(
     LIBDEPS=[
         '$BUILD_DIR/mongo/bson/util/bson_extract',
         '$BUILD_DIR/mongo/crypto/fle_crypto',
+        '$BUILD_DIR/mongo/db/query/str_trim_utils',
         '$BUILD_DIR/mongo/scripting/scripting',
         '$BUILD_DIR/mongo/scripting/scripting_common',
         '$BUILD_DIR/mongo/util/pcre_util',
 
@@ -66,6 +66,7 @@ sbeEnv.Library(
     ],
     LIBDEPS_PRIVATE=[
         '$BUILD_DIR/mongo/db/bson/dotted_path_support',
+        '$BUILD_DIR/mongo/db/query/str_trim_utils',
         '$BUILD_DIR/mongo/db/sorter/sorter_idl',
         '$BUILD_DIR/mongo/db/sorter/sorter_stats',
     ],
 
@@ -692,6 +692,9 @@ static stdx::unordered_map<std::string, BuiltinFn> kBuiltinFunctions = {
     {"strLenBytes", BuiltinFn{[](size_t n) { return n == 1; }, vm::Builtin::strLenBytes, false}},
     {"toLower", BuiltinFn{[](size_t n) { return n == 1; }, vm::Builtin::toLower, false}},
     {"toUpper", BuiltinFn{[](size_t n) { return n == 1; }, vm::Builtin::toUpper, false}},
+    {"trim", BuiltinFn{[](size_t n) { return n == 2; }, vm::Builtin::trim, false}},
+    {"ltrim", BuiltinFn{[](size_t n) { return n == 2; }, vm::Builtin::ltrim, false}},
+    {"rtrim", BuiltinFn{[](size_t n) { return n == 2; }, vm::Builtin::rtrim, false}},
     {"coerceToBool", BuiltinFn{[](size_t n) { return n == 1; }, vm::Builtin::coerceToBool, false}},
     {"coerceToString",
      BuiltinFn{[](size_t n) { return n == 1; }, vm::Builtin::coerceToString, false}},
 
@@ -86,6 +86,7 @@
 #include "mongo/db/query/collation/collation_index_key.h"
 #include "mongo/db/query/datetime/date_time_support.h"
 #include "mongo/db/query/query_knobs_gen.h"
+#include "mongo/db/query/str_trim_utils.h"
 #include "mongo/db/storage/column_store.h"
 #include "mongo/db/storage/key_string.h"
 #include "mongo/logv2/log.h"
@@ -4036,6 +4037,26 @@ FastTuple<bool, value::TypeTags, value::Value> ByteCode::builtinConcatArrays(Ari
     return {true, resTag, resVal};
 }
 
+FastTuple<bool, value::TypeTags, value::Value> ByteCode::builtinTrim(ArityType arity,  // VM builtin shared by trim/ltrim/rtrim; 'arity' is not read here
+                                                                     bool trimLeft,  // forwarded unchanged to str_trim_utils::doTrim
+                                                                     bool trimRight) {  // forwarded unchanged to str_trim_utils::doTrim
+    auto [ownedChars, tagChars, valChars] = getFromStack(1);  // stack slot 1 holds the 'chars' argument
+    auto [ownedInput, tagInput, valInput] = getFromStack(0);  // stack slot 0 holds the 'input' argument
+
+    if (!value::isString(tagInput)) {  // a non-string input returns Nothing instead of raising an error
+        return {false, value::TypeTags::Nothing, 0};
+    }
+
+    auto replacementChars = !value::isNullish(tagChars)  // despite the name, this is the set of code points to strip
+        ? str_trim_utils::extractCodePointsFromChars(value::getStringView(tagChars, valChars))  // NOTE(review): 'tagChars' is not checked with isString here; presumably the caller guarantees it - confirm
+        : str_trim_utils::kDefaultTrimWhitespaceChars;  // nullish 'chars' selects the default whitespace set
+    auto inputString = value::getStringView(tagInput, valInput);
+
+    auto [strTag, strValue] = sbe::value::makeNewString(  // the trimmed result is placed in a newly made string value
+        str_trim_utils::doTrim(inputString, replacementChars, trimLeft, trimRight));
+    return {true, strTag, strValue};  // presumably 'true' marks the returned value as owned by the caller - confirm
+}
+
 FastTuple<bool, value::TypeTags, value::Value> ByteCode::builtinAggConcatArraysCapped(
     ArityType arity) {
     auto [ownArr, tagArr, valArr] = getFromStack(0);
@@ -6935,6 +6956,12 @@ FastTuple<bool, value::TypeTags, value::Value> ByteCode::dispatchBuiltin(Builtin
             return builtinToUpper(arity);
         case Builtin::toLower:
             return builtinToLower(arity);
+        case Builtin::trim:
+            return builtinTrim(arity, true, true);
+        case Builtin::ltrim:
+            return builtinTrim(arity, true, false);
+        case Builtin::rtrim:
+            return builtinTrim(arity, false, true);
         case Builtin::coerceToBool:
             return builtinCoerceToBool(arity);
         case Builtin::coerceToString:
@@ -7260,6 +7287,12 @@ std::string builtinToString(Builtin b) {
             return "toUpper";
         case Builtin::toLower:
             return "toLower";
+        case Builtin::trim:
+            return "trim";
+        case Builtin::ltrim:
+            return "ltrim";
+        case Builtin::rtrim:
+            return "rtrim";
         case Builtin::coerceToBool:
             return "coerceToBool";
         case Builtin::coerceToString:
 
@@ -696,6 +696,9 @@ enum class Builtin : uint8_t {
     coerceToString,
     concat,
     concatArrays,
+    trim,
+    ltrim,
+    rtrim,
 
     // Agg function to concatenate arrays, failing when the accumulator reaches a specified size.
     aggConcatArraysCapped,
@@ -1664,6 +1667,9 @@ class ByteCode {
     FastTuple<bool, value::TypeTags, value::Value> builtinRound(ArityType arity);
     FastTuple<bool, value::TypeTags, value::Value> builtinConcat(ArityType arity);
     FastTuple<bool, value::TypeTags, value::Value> builtinConcatArrays(ArityType arity);
+    FastTuple<bool, value::TypeTags, value::Value> builtinTrim(ArityType arity,
+                                                               bool trimLeft,
+                                                               bool trimRight);
     FastTuple<bool, value::TypeTags, value::Value> builtinAggConcatArraysCapped(ArityType arity);
     FastTuple<bool, value::TypeTags, value::Value> builtinAggSetUnion(ArityType arity);
     FastTuple<bool, value::TypeTags, value::Value> builtinAggSetUnionCapped(ArityType arity);
 
@@ -81,6 +81,8 @@
 #include "mongo/db/query/collation/collator_interface.h"
 #include "mongo/db/query/datetime/date_time_support.h"
 #include "mongo/db/query/query_knobs_gen.h"
+#include "mongo/db/query/sort_pattern.h"
+#include "mongo/db/query/str_trim_utils.h"
 #include "mongo/db/query/util/make_data_structure.h"
 #include "mongo/db/record_id.h"
 #include "mongo/db/stats/counters.h"
@@ -6022,74 +6024,6 @@ intrusive_ptr<Expression> ExpressionTrim::parse(ExpressionContext* const expCtx,
     return new ExpressionTrim(expCtx, trimType, name, input, characters);
 }
 
-namespace {
-const std::vector<StringData> kDefaultTrimWhitespaceChars = {
-    "\0"_sd,      // Null character. Avoid using "\u0000" syntax to work around a gcc bug:
-                  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53690.
-    "\u0020"_sd,  // Space
-    "\u0009"_sd,  // Horizontal tab
-    "\u000A"_sd,  // Line feed/new line
-    "\u000B"_sd,  // Vertical tab
-    "\u000C"_sd,  // Form feed
-    "\u000D"_sd,  // Horizontal tab
-    "\u00A0"_sd,  // Non-breaking space
-    "\u1680"_sd,  // Ogham space mark
-    "\u2000"_sd,  // En quad
-    "\u2001"_sd,  // Em quad
-    "\u2002"_sd,  // En space
-    "\u2003"_sd,  // Em space
-    "\u2004"_sd,  // Three-per-em space
-    "\u2005"_sd,  // Four-per-em space
-    "\u2006"_sd,  // Six-per-em space
-    "\u2007"_sd,  // Figure space
-    "\u2008"_sd,  // Punctuation space
-    "\u2009"_sd,  // Thin space
-    "\u200A"_sd   // Hair space
-};
-
-/**
- * Assuming 'charByte' is the beginning of a UTF-8 code point, returns the number of bytes that
- * should be used to represent the code point. Said another way, computes how many continuation
- * bytes are expected to be present after 'charByte' in a UTF-8 encoded string.
- */
-inline size_t numberOfBytesForCodePoint(char charByte) {
-    if ((charByte & 0b11111000) == 0b11110000) {
-        return 4;
-    } else if ((charByte & 0b11110000) == 0b11100000) {
-        return 3;
-    } else if ((charByte & 0b11100000) == 0b11000000) {
-        return 2;
-    } else {
-        return 1;
-    }
-}
-
-/**
- * Returns a vector with one entry per code point to trim, or throws an exception if 'utf8String'
- * contains invalid UTF-8.
- */
-std::vector<StringData> extractCodePointsFromChars(StringData utf8String,
-                                                   StringData expressionName) {
-    std::vector<StringData> codePoints;
-    std::size_t i = 0;
-    while (i < utf8String.size()) {
-        uassert(50698,
-                str::stream() << "Failed to parse \"chars\" argument to " << expressionName
-                              << ": Detected invalid UTF-8. Got continuation byte when expecting "
-                                 "the start of a new code point.",
-                !str::isUTF8ContinuationByte(utf8String[i]));
-        codePoints.push_back(utf8String.substr(i, numberOfBytesForCodePoint(utf8String[i])));
-        i += numberOfBytesForCodePoint(utf8String[i]);
-    }
-    uassert(50697,
-            str::stream()
-                << "Failed to parse \"chars\" argument to " << expressionName
-                << ": Detected invalid UTF-8. Missing expected continuation byte at end of string.",
-            i <= utf8String.size());
-    return codePoints;
-}
-}  // namespace
-
 Value ExpressionTrim::evaluate(const Document& root, Variables* variables) const {
     auto unvalidatedInput = _children[_kInput]->evaluate(root, variables);
     if (unvalidatedInput.nullish()) {
@@ -6103,7 +6037,11 @@ Value ExpressionTrim::evaluate(const Document& root, Variables* variables) const
     const StringData input(unvalidatedInput.getStringData());
 
     if (!_children[_kCharacters]) {
-        return Value(doTrim(input, kDefaultTrimWhitespaceChars));
+        return Value(
+            str_trim_utils::doTrim(input,
+                                   str_trim_utils::kDefaultTrimWhitespaceChars,
+                                   _trimType == TrimType::kBoth || _trimType == TrimType::kLeft,
+                                   _trimType == TrimType::kBoth || _trimType == TrimType::kRight));
     }
     auto unvalidatedUserChars = _children[_kCharacters]->evaluate(root, variables);
     if (unvalidatedUserChars.nullish()) {
@@ -6115,65 +6053,11 @@ Value ExpressionTrim::evaluate(const Document& root, Variables* variables) const
                           << typeName(unvalidatedUserChars.getType()) << ") instead.",
             unvalidatedUserChars.getType() == BSONType::String);
 
-    return Value(
-        doTrim(input, extractCodePointsFromChars(unvalidatedUserChars.getStringData(), _name)));
-}
-
-bool ExpressionTrim::codePointMatchesAtIndex(const StringData& input,
-                                             std::size_t indexOfInput,
-                                             const StringData& testCP) {
-    for (size_t i = 0; i < testCP.size(); ++i) {
-        if (indexOfInput + i >= input.size() || input[indexOfInput + i] != testCP[i]) {
-            return false;
-        }
-    }
-    return true;
-};
-
-StringData ExpressionTrim::trimFromLeft(StringData input, const std::vector<StringData>& trimCPs) {
-    std::size_t bytesTrimmedFromLeft = 0u;
-    while (bytesTrimmedFromLeft < input.size()) {
-        // Look for any matching code point to trim.
-        auto matchingCP = std::find_if(trimCPs.begin(), trimCPs.end(), [&](auto& testCP) {
-            return codePointMatchesAtIndex(input, bytesTrimmedFromLeft, testCP);
-        });
-        if (matchingCP == trimCPs.end()) {
-            // Nothing to trim, stop here.
-            break;
-        }
-        bytesTrimmedFromLeft += matchingCP->size();
-    }
-    return input.substr(bytesTrimmedFromLeft);
-}
-
-StringData ExpressionTrim::trimFromRight(StringData input, const std::vector<StringData>& trimCPs) {
-    std::size_t bytesTrimmedFromRight = 0u;
-    while (bytesTrimmedFromRight < input.size()) {
-        std::size_t indexToTrimFrom = input.size() - bytesTrimmedFromRight;
-        auto matchingCP = std::find_if(trimCPs.begin(), trimCPs.end(), [&](auto& testCP) {
-            if (indexToTrimFrom < testCP.size()) {
-                // We've gone off the left of the string.
-                return false;
-            }
-            return codePointMatchesAtIndex(input, indexToTrimFrom - testCP.size(), testCP);
-        });
-        if (matchingCP == trimCPs.end()) {
-            // Nothing to trim, stop here.
-            break;
-        }
-        bytesTrimmedFromRight += matchingCP->size();
-    }
-    return input.substr(0, input.size() - bytesTrimmedFromRight);
-}
-
-StringData ExpressionTrim::doTrim(StringData input, const std::vector<StringData>& trimCPs) const {
-    if (_trimType == TrimType::kBoth || _trimType == TrimType::kLeft) {
-        input = trimFromLeft(input, trimCPs);
-    }
-    if (_trimType == TrimType::kBoth || _trimType == TrimType::kRight) {
-        input = trimFromRight(input, trimCPs);
-    }
-    return input;
+    return Value(str_trim_utils::doTrim(
+        input,
+        str_trim_utils::extractCodePointsFromChars(unvalidatedUserChars.getStringData()),
+        _trimType == TrimType::kBoth || _trimType == TrimType::kLeft,
+        _trimType == TrimType::kBoth || _trimType == TrimType::kRight));
 }
 
 boost::intrusive_ptr<Expression> ExpressionTrim::optimize() {