8181#include " mongo/db/query/collation/collator_interface.h"
8282#include " mongo/db/query/datetime/date_time_support.h"
8383#include " mongo/db/query/query_knobs_gen.h"
84+ #include " mongo/db/query/sort_pattern.h"
85+ #include " mongo/db/query/str_trim_utils.h"
8486#include " mongo/db/query/util/make_data_structure.h"
8587#include " mongo/db/record_id.h"
8688#include " mongo/db/stats/counters.h"
@@ -6022,74 +6024,6 @@ intrusive_ptr<Expression> ExpressionTrim::parse(ExpressionContext* const expCtx,
60226024 return new ExpressionTrim (expCtx, trimType, name, input, characters);
60236025}
60246026
6025- namespace {
// Code points that ExpressionTrim::evaluate() hands to doTrim() when the expression has no
// "chars" argument. Each entry is a single code point written as a StringData literal.
6026- const std::vector<StringData> kDefaultTrimWhitespaceChars = {
6027- " \0 " _sd, // Null character. Avoid using "\u0000" syntax to work around a gcc bug:
6028- // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53690.
6029- " \u0020 " _sd, // Space
6030- " \u0009 " _sd, // Horizontal tab
6031- " \u000A " _sd, // Line feed/new line
6032- " \u000B " _sd, // Vertical tab
6033- " \u000C " _sd, // Form feed
6034- " \u000D " _sd, // Carriage return
6035- " \u00A0 " _sd, // Non-breaking space
6036- " \u1680 " _sd, // Ogham space mark
6037- " \u2000 " _sd, // En quad
6038- " \u2001 " _sd, // Em quad
6039- " \u2002 " _sd, // En space
6040- " \u2003 " _sd, // Em space
6041- " \u2004 " _sd, // Three-per-em space
6042- " \u2005 " _sd, // Four-per-em space
6043- " \u2006 " _sd, // Six-per-em space
6044- " \u2007 " _sd, // Figure space
6045- " \u2008 " _sd, // Punctuation space
6046- " \u2009 " _sd, // Thin space
6047- " \u200A " _sd // Hair space
6048- };
6049-
/**
 * Assuming 'charByte' is the first byte of a UTF-8 encoded code point, returns the total number
 * of bytes (the lead byte plus its continuation bytes) that the encoding of that code point
 * occupies.
 *
 * The lead byte is classified by its high-order bits:
 *   11110xxx -> 4 bytes
 *   1110xxxx -> 3 bytes
 *   110xxxxx -> 2 bytes
 *   anything else -> 1 byte
 *
 * No validation happens here: a byte that matches none of the multi-byte patterns (including a
 * continuation byte of the form 10xxxxxx) is reported as a 1-byte sequence, so callers must
 * reject malformed input themselves.
 */
constexpr std::size_t numberOfBytesForCodePoint(char charByte) noexcept {
    // Classify an unsigned copy so the masks below never depend on whether plain 'char' is
    // signed on the target platform.
    const auto leadByte = static_cast<unsigned char>(charByte);

    if ((leadByte & 0b11111000) == 0b11110000) {
        return 4;
    }
    if ((leadByte & 0b11110000) == 0b11100000) {
        return 3;
    }
    if ((leadByte & 0b11100000) == 0b11000000) {
        return 2;
    }
    return 1;
}
6066-
6067- /* *
6068- * Returns a vector with one entry per code point to trim, or throws an exception if 'utf8String'
6069- * contains invalid UTF-8.
 *
 * 'utf8String' is walked front to back; every entry of the result is produced by
 * utf8String.substr() over the bytes of one code point. 'expressionName' is only used to build
 * the error messages.
 *
 * Two conditions are rejected via uassert:
 *   - 50698: a continuation byte appears where the first byte of a code point is expected.
 *   - 50697: the last lead byte announces more bytes than remain in the string.
 *
 * NOTE(review): only the first byte of each code point is inspected; the bytes after a lead
 * byte are not checked to be continuation bytes. Confirm whether callers rely on stricter
 * validation elsewhere.
6070- */
6071- std::vector<StringData> extractCodePointsFromChars (StringData utf8String,
6072- StringData expressionName) {
6073- std::vector<StringData> codePoints;
// Byte offset of the code point currently being extracted.
6074- std::size_t i = 0 ;
6075- while (i < utf8String.size ()) {
// The byte at 'i' must begin a new code point, so it may not be a continuation byte.
6076- uassert (50698 ,
6077- str::stream () << " Failed to parse \" chars\" argument to " << expressionName
6078- << " : Detected invalid UTF-8. Got continuation byte when expecting "
6079- " the start of a new code point." ,
6080- !str::isUTF8ContinuationByte (utf8String[i]));
// Record the code point, then advance by the length its lead byte announces.
// NOTE(review): presumably substr() clamps the length at the end of the string when the final
// sequence is truncated - confirm against StringData::substr.
6081- codePoints.push_back (utf8String.substr (i, numberOfBytesForCodePoint (utf8String[i])));
6082- i += numberOfBytesForCodePoint (utf8String[i]);
6083- }
// The loop only exits with i >= size(); i > size() means the final lead byte promised more bytes
// than the string actually holds.
6084- uassert (50697 ,
6085- str::stream ()
6086- << " Failed to parse \" chars\" argument to " << expressionName
6087- << " : Detected invalid UTF-8. Missing expected continuation byte at end of string." ,
6088- i <= utf8String.size ());
6089- return codePoints;
6090- }
6091- } // namespace
6092-
60936027Value ExpressionTrim::evaluate (const Document& root, Variables* variables) const {
60946028 auto unvalidatedInput = _children[_kInput]->evaluate (root, variables);
60956029 if (unvalidatedInput.nullish ()) {
@@ -6103,7 +6037,11 @@ Value ExpressionTrim::evaluate(const Document& root, Variables* variables) const
61036037 const StringData input (unvalidatedInput.getStringData ());
61046038
61056039 if (!_children[_kCharacters]) {
6106- return Value (doTrim (input, kDefaultTrimWhitespaceChars ));
6040+ return Value (
6041+ str_trim_utils::doTrim (input,
6042+ str_trim_utils::kDefaultTrimWhitespaceChars ,
6043+ _trimType == TrimType::kBoth || _trimType == TrimType::kLeft ,
6044+ _trimType == TrimType::kBoth || _trimType == TrimType::kRight ));
61076045 }
61086046 auto unvalidatedUserChars = _children[_kCharacters]->evaluate (root, variables);
61096047 if (unvalidatedUserChars.nullish ()) {
@@ -6115,65 +6053,11 @@ Value ExpressionTrim::evaluate(const Document& root, Variables* variables) const
61156053 << typeName (unvalidatedUserChars.getType ()) << " ) instead." ,
61166054 unvalidatedUserChars.getType () == BSONType::String);
61176055
6118- return Value (
6119- doTrim (input, extractCodePointsFromChars (unvalidatedUserChars.getStringData (), _name)));
6120- }
6121-
6122- bool ExpressionTrim::codePointMatchesAtIndex (const StringData& input,
6123- std::size_t indexOfInput,
6124- const StringData& testCP) {
6125- for (size_t i = 0 ; i < testCP.size (); ++i) {
6126- if (indexOfInput + i >= input.size () || input[indexOfInput + i] != testCP[i]) {
6127- return false ;
6128- }
6129- }
6130- return true ;
6131- };
6132-
6133- StringData ExpressionTrim::trimFromLeft (StringData input, const std::vector<StringData>& trimCPs) {
6134- std::size_t bytesTrimmedFromLeft = 0u ;
6135- while (bytesTrimmedFromLeft < input.size ()) {
6136- // Look for any matching code point to trim.
6137- auto matchingCP = std::find_if (trimCPs.begin (), trimCPs.end (), [&](auto & testCP) {
6138- return codePointMatchesAtIndex (input, bytesTrimmedFromLeft, testCP);
6139- });
6140- if (matchingCP == trimCPs.end ()) {
6141- // Nothing to trim, stop here.
6142- break ;
6143- }
6144- bytesTrimmedFromLeft += matchingCP->size ();
6145- }
6146- return input.substr (bytesTrimmedFromLeft);
6147- }
6148-
6149- StringData ExpressionTrim::trimFromRight (StringData input, const std::vector<StringData>& trimCPs) {
6150- std::size_t bytesTrimmedFromRight = 0u ;
6151- while (bytesTrimmedFromRight < input.size ()) {
6152- std::size_t indexToTrimFrom = input.size () - bytesTrimmedFromRight;
6153- auto matchingCP = std::find_if (trimCPs.begin (), trimCPs.end (), [&](auto & testCP) {
6154- if (indexToTrimFrom < testCP.size ()) {
6155- // We've gone off the left of the string.
6156- return false ;
6157- }
6158- return codePointMatchesAtIndex (input, indexToTrimFrom - testCP.size (), testCP);
6159- });
6160- if (matchingCP == trimCPs.end ()) {
6161- // Nothing to trim, stop here.
6162- break ;
6163- }
6164- bytesTrimmedFromRight += matchingCP->size ();
6165- }
6166- return input.substr (0 , input.size () - bytesTrimmedFromRight);
6167- }
6168-
6169- StringData ExpressionTrim::doTrim (StringData input, const std::vector<StringData>& trimCPs) const {
6170- if (_trimType == TrimType::kBoth || _trimType == TrimType::kLeft ) {
6171- input = trimFromLeft (input, trimCPs);
6172- }
6173- if (_trimType == TrimType::kBoth || _trimType == TrimType::kRight ) {
6174- input = trimFromRight (input, trimCPs);
6175- }
6176- return input;
6056+ return Value (str_trim_utils::doTrim (
6057+ input,
6058+ str_trim_utils::extractCodePointsFromChars (unvalidatedUserChars.getStringData ()),
6059+ _trimType == TrimType::kBoth || _trimType == TrimType::kLeft ,
6060+ _trimType == TrimType::kBoth || _trimType == TrimType::kRight ));
61776061}
61786062
61796063boost::intrusive_ptr<Expression> ExpressionTrim::optimize () {
0 commit comments