forked from rapidfuzz/rapidfuzz-cpp
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.txx
More file actions
232 lines (194 loc) · 6.58 KB
/
utils.txx
File metadata and controls
232 lines (194 loc) · 6.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
/* SPDX-License-Identifier: MIT */
/* Copyright © 2020 Max Bachmann */
#include <algorithm>
#include <array>
#include <cctype>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <cwctype>
#include <iterator>
#include <string>
#include <utility>
#include "details/unicode.hpp"
namespace rapidfuzz {
/**
 * @brief Element-wise equality of two string views whose character types may differ.
 *
 * @param x first view
 * @param y second view
 * @return true when both views have the same length and no pair of characters
 *         at the same index compares unequal, false otherwise
 */
template <typename CharT1, typename CharT2>
bool string_view_eq(basic_string_view<CharT1> x, basic_string_view<CharT2> y)
{
    // views of different length can never be equal
    if (x.size() != y.size()) {
        return false;
    }

    std::size_t pos = 0;
    while (pos < x.size()) {
        if (x[pos] != y[pos]) {
            return false;
        }
        ++pos;
    }
    return true;
}
/**
 * @brief Partitions the words of two sentences into shared and unshared groups.
 *
 * `dedupe()` is called on both inputs first. Each word of `a` is then looked
 * up (using string_view_eq) among the words of `b` that have not been matched
 * yet: on a hit the word is recorded as common and the matching entry is
 * removed from b's remainder, otherwise the word is recorded as only in `a`.
 *
 * @param a words of the first sentence (taken by value, modified locally)
 * @param b words of the second sentence (taken by value, modified locally)
 * @return DecomposedSet initialised with, in this order: the words only found
 *         in `a`, the words only found in `b`, the words found in both
 */
template <typename CharT1, typename CharT2>
DecomposedSet<CharT1, CharT2, CharT1> utils::set_decomposition(SplittedSentenceView<CharT1> a,
                                                               SplittedSentenceView<CharT2> b)
{
    a.dedupe();
    b.dedupe();

    string_view_vec<CharT1> common_words;
    string_view_vec<CharT1> only_in_a;
    // starts as every word of b; words matched below are removed again
    string_view_vec<CharT2> only_in_b = b.words();

    for (const auto& word_a : a.words()) {
        const auto equals_word_a = [word_a](basic_string_view<CharT2> word_b) {
            return string_view_eq(word_a, word_b);
        };
        auto match = std::find_if(only_in_b.begin(), only_in_b.end(), equals_word_a);

        if (match == only_in_b.end()) {
            only_in_a.push_back(word_a);
            continue;
        }

        only_in_b.erase(match);
        common_words.push_back(word_a);
    }

    return {only_in_a, only_in_b, common_words};
}
/**
 * @brief Applies a minimum-score threshold to a result.
 *
 * @param result       score to check
 * @param score_cutoff threshold the score has to reach
 * @return `result` when it is greater than or equal to `score_cutoff`, otherwise 0
 */
inline percent utils::result_cutoff(const double result, const percent score_cutoff)
{
    if (result >= score_cutoff) {
        return result;
    }
    return 0;
}
/**
 * @brief Converts a distance into a similarity score and applies the cutoff.
 *
 * The score is `100 - 100 * dist / lensum`, so it is 100 for `dist == 0` and
 * 0 for `dist == lensum`. The result is passed through result_cutoff.
 *
 * @param dist         distance between the two sequences
 * @param lensum       value the distance is normalised by
 *                     (NOTE(review): presumably the combined length of both
 *                     sequences - confirm against the callers)
 * @param score_cutoff minimum score; lower scores are reported as 0
 * @return the score, or 0 when it is below `score_cutoff`
 *
 * @note `lensum == 0` is not guarded here: the floating point division then
 *       yields NaN or infinity, so callers should handle empty input before
 *       calling this function.
 */
// `inline` matches result_cutoff and keeps this non-template definition from
// violating the one-definition rule when this file is included from more than
// one translation unit.
inline percent utils::norm_distance(std::size_t dist, std::size_t lensum, percent score_cutoff)
{
    percent ratio = 100.0 - 100 * static_cast<double>(dist) / static_cast<double>(lensum);
    return result_cutoff(ratio, score_cutoff);
}
/**
 * @brief Checks whether a value is zero within a tolerance.
 *
 * @param a         value to test
 * @param tolerance largest absolute value that still counts as zero
 * @return true when `std::fabs(a)` is less than or equal to `tolerance`
 */
template <typename T>
bool utils::is_zero(T a, T tolerance)
{
    const auto magnitude = std::fabs(a);
    return magnitude <= tolerance;
}
/**
 * @brief Builds a string view by forwarding the argument to the view's constructor.
 *
 * NOTE(review): the unnamed third template parameter is presumably the
 * constraint selecting this overload - see the declaration.
 *
 * @param str object a basic_string_view<CharT> can be constructed from
 * @return view constructed from `str`
 */
template <typename Sentence, typename CharT, typename>
basic_string_view<CharT> utils::to_string_view(Sentence&& str)
{
    using view_type = basic_string_view<CharT>;
    return view_type(std::forward<Sentence>(str));
}
/**
 * @brief Builds a string view from any object exposing `data()` and `size()`.
 *
 * The view refers to the memory returned by `str.data()`; nothing is copied.
 *
 * @param str source object
 * @return view over `str.size()` characters starting at `str.data()`
 */
template <typename Sentence, typename CharT, typename>
basic_string_view<CharT> utils::to_string_view(const Sentence& str)
{
    using view_type = basic_string_view<CharT>;
    return view_type(str.data(), str.size());
}
/**
 * @brief Builds a std::basic_string by forwarding the argument to the string's constructor.
 *
 * NOTE(review): the unnamed third template parameter is presumably the
 * constraint selecting this overload - see the declaration.
 *
 * @param str object a std::basic_string<CharT> can be constructed from
 * @return string constructed from `str`
 */
template <typename Sentence, typename CharT, typename>
std::basic_string<CharT> utils::to_string(Sentence&& str)
{
    using string_type = std::basic_string<CharT>;
    return string_type(std::forward<Sentence>(str));
}
/**
 * @brief Copies the characters of any object exposing `data()` and `size()` into a string.
 *
 * @param str source object
 * @return string holding `str.size()` characters copied from `str.data()`
 */
template <typename Sentence, typename CharT, typename>
std::basic_string<CharT> utils::to_string(const Sentence& str)
{
    using string_type = std::basic_string<CharT>;
    return string_type(str.data(), str.size());
}
/**
 * @brief Finds the first position at which two ranges differ.
 *
 * Both ranges are bounded, so the scan stops as soon as either range is
 * exhausted or a pair of elements does not compare equal.
 *
 * @param first1 begin of the first range
 * @param last1  end of the first range
 * @param first2 begin of the second range
 * @param last2  end of the second range
 * @return pair of iterators, one into each range, pointing at the position
 *         where the scan stopped
 */
template <typename InputIterator1, typename InputIterator2>
std::pair<InputIterator1, InputIterator2>
utils::mismatch(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2,
                InputIterator2 last2)
{
    using iterator_pair = std::pair<InputIterator1, InputIterator2>;

    while (first1 != last1 && first2 != last2) {
        if (!(*first1 == *first2)) {
            break;
        }
        ++first1;
        ++first2;
    }
    return iterator_pair(first1, first2);
}
/**
 * Removes the common prefix of two string views
 *
 * Both views are shortened in place from the front.
 *
 * @param a first view, modified
 * @param b second view, modified
 * @return number of characters removed from the front of each view
 */
template <typename CharT1, typename CharT2>
std::size_t utils::remove_common_prefix(basic_string_view<CharT1>& a, basic_string_view<CharT2>& b)
{
    const auto first_difference = utils::mismatch(a.begin(), a.end(), b.begin(), b.end()).first;
    // the distance from begin() to an iterator of the same view is never negative
    const auto prefix_len = static_cast<std::size_t>(std::distance(a.begin(), first_difference));

    a.remove_prefix(prefix_len);
    b.remove_prefix(prefix_len);
    return prefix_len;
}
/**
 * Removes the common suffix of two string views
 *
 * Both views are shortened in place from the back. The comparison walks the
 * views through their reverse iterators.
 *
 * @param a first view, modified
 * @param b second view, modified
 * @return number of characters removed from the back of each view
 */
template <typename CharT1, typename CharT2>
std::size_t utils::remove_common_suffix(basic_string_view<CharT1>& a, basic_string_view<CharT2>& b)
{
    const auto first_difference = utils::mismatch(a.rbegin(), a.rend(), b.rbegin(), b.rend()).first;
    // the distance from rbegin() to an iterator of the same view is never negative
    const auto suffix_len = static_cast<std::size_t>(std::distance(a.rbegin(), first_difference));

    a.remove_suffix(suffix_len);
    b.remove_suffix(suffix_len);
    return suffix_len;
}
/**
 * Removes the common affix (prefix and suffix) of two string views
 *
 * The prefix is stripped before the suffix is measured, so characters that
 * were already removed as prefix are not counted again as suffix.
 *
 * @param a first view, modified
 * @param b second view, modified
 * @return StringAffix initialised with the removed prefix length followed by
 *         the removed suffix length
 */
template <typename CharT1, typename CharT2>
StringAffix utils::remove_common_affix(basic_string_view<CharT1>& a, basic_string_view<CharT2>& b)
{
    const std::size_t prefix_len = remove_common_prefix(a, b);
    const std::size_t suffix_len = remove_common_suffix(a, b);
    return StringAffix{prefix_len, suffix_len};
}
/**
 * @brief Estimates how many characters are not shared between two sequences.
 *
 * Every character is assigned to one of 32 buckets by its value modulo 32.
 * Characters of `s1` increment their bucket, characters of `s2` decrement it,
 * and the result is the sum of the absolute bucket values. Different
 * characters that land in the same bucket cancel each other out, so the
 * result can be smaller than the true number of uncommon characters.
 *
 * @param s1 first sequence of characters
 * @param s2 second sequence of characters
 * @return sum of the absolute per-bucket frequency differences
 */
template <typename Sentence1, typename Sentence2>
std::size_t utils::count_uncommon_chars(const Sentence1& s1, const Sentence2& s2)
{
    constexpr std::size_t bucket_count = 32;
    std::array<signed int, bucket_count> char_freq{};

    // Convert to an unsigned type before taking the remainder: for a negative
    // character value (e.g. a non-ASCII byte where `char` is signed) `ch % 32`
    // is negative and would index outside of the array. Non-negative values
    // map to the same bucket as before.
    for (const auto& ch : s1) {
        ++char_freq[static_cast<std::size_t>(ch) % bucket_count];
    }
    for (const auto& ch : s2) {
        --char_freq[static_cast<std::size_t>(ch) % bucket_count];
    }

    std::size_t count = 0;
    for (const auto& freq : char_freq) {
        count += static_cast<std::size_t>(std::abs(freq));
    }
    return count;
}
/**
 * @brief Splits a sentence into words and returns them sorted.
 *
 * The sentence is cut at every character for which Unicode::is_space<CharT>
 * returns true. Empty words (produced by leading, trailing or consecutive
 * separators) are skipped. The remaining words are sorted with std::sort.
 *
 * The words are views built from pointers into the character data obtained
 * through to_string_view; no characters are copied.
 *
 * @param sentence sentence to split
 * @return SplittedSentenceView constructed from the sorted words
 */
template <typename Sentence, typename CharT>
SplittedSentenceView<CharT> utils::sorted_split(Sentence&& sentence)
{
    auto s = to_string_view(std::forward<Sentence>(sentence));
    string_view_vec<CharT> splitted;

    auto first = s.data();
    const auto last = first + s.size();

    while (first != last) {
        const auto word_end = std::find_if(first, last, Unicode::is_space<CharT>);
        if (first != word_end) {
            splitted.emplace_back(first, static_cast<std::size_t>(word_end - first));
        }

        // Stop before stepping over the separator when the end was reached:
        // advancing to `last + 1` would form a pointer beyond one-past-the-end.
        if (word_end == last) {
            break;
        }
        first = word_end + 1;
    }

    std::sort(splitted.begin(), splitted.end());
    return SplittedSentenceView<CharT>(splitted);
}
/**
 * @brief Converts the characters 'A'..'Z' of a string to lower case in place.
 *
 * All other characters are left unchanged.
 *
 * @param s string to modify
 */
template <typename CharT>
void utils::lower_case(std::basic_string<CharT>& s)
{
    // TODO: handle other characters like Ä <-> ä (maybe check how this is
    // implemented in cpython)
    for (auto& ch : s) {
        if (ch >= 'A' && ch <= 'Z') {
            // 32 is the offset added to an upper case letter to obtain its lower case form
            ch = static_cast<CharT>(ch + 32);
        }
    }
}
/**
 * @brief Replaces every ASCII character that is not a letter or digit with a space, in place.
 *
 * Replaced are the values 0x00..'/', ':'..'@', '['..'`' and '{'..0x7F (DEL),
 * i.e. punctuation, control characters and whitespace. Values outside of
 * the ASCII range are kept as they are.
 *
 * @param s string to modify
 */
template <typename CharT>
void utils::replace_non_alnum(std::basic_string<CharT>& s)
{
    std::replace_if(
        s.begin(), s.end(),
        [](CharT ch) {
            const int ascii = static_cast<int>(ch);
            // Negative values are not ASCII (e.g. bytes >= 0x80 where `char`
            // is signed). Without the lower bound they would satisfy
            // `ascii <= '/'` and be replaced only on platforms with a signed
            // character type.
            return ascii >= 0 &&
                   (ascii <= '/' || (ascii >= ':' && ascii <= '@') ||
                    (ascii >= '[' && ascii <= '`') || (ascii >= '{' && ascii <= 0x7F) /* DEL */);
        },
        static_cast<CharT>(' '));
}
/**
 * @brief Default preprocessing applied to a sentence.
 *
 * Steps, in this order:
 *  1. construct a string from the argument
 *  2. replace_non_alnum on that string
 *  3. strip leading and trailing SPACE characters
 *  4. lower_case on the result
 *
 * @param s sentence a std::basic_string<CharT> can be constructed from
 * @return the processed copy
 */
template <typename Sentence, typename CharT, typename>
std::basic_string<CharT> utils::default_process(Sentence&& s)
{
    using string_type = std::basic_string<CharT>;

    string_type str(std::forward<Sentence>(s));
    replace_non_alnum(str);

    // only SPACE has to be trimmed, since all other space characters are
    // already replaced with SPACE
    const CharT space = static_cast<CharT>(' ');

    // leading spaces; erases everything when the string holds nothing but spaces
    str.erase(0, str.find_first_not_of(space));

    // trailing spaces
    const auto last_kept = str.find_last_not_of(space);
    if (last_kept == string_type::npos) {
        str.clear();
    }
    else {
        str.erase(last_kept + 1);
    }

    lower_case(str);
    return str;
}
/**
 * @brief Default preprocessing for any object exposing `data()` and `size()`.
 *
 * Copies the characters into a std::basic_string<CharT> and hands that string
 * to default_process.
 *
 * @param s source object
 * @return the processed string
 */
template <typename Sentence, typename CharT, typename>
std::basic_string<CharT> utils::default_process(Sentence s)
{
    using string_type = std::basic_string<CharT>;
    return default_process(string_type(s.data(), s.size()));
}
} // namespace rapidfuzz