only split by newlines

RobinMalfait · thecrypticace · RobinMalfait · commit 8fe397717d11 · 2024-12-02T17:44:40.000+01:00
To reduce the overhead of the Extractor itself, we can chunk the work by
lines (splitting on `\n` and `\r`) instead of by every whitespace-separated chunk.

This seems to reduce the overall cost even further!

Co-authored-by: Jordan Pittman <jordan@cryptica.me>
diff --git a/crates/oxide/src/lib.rs b/crates/oxide/src/lib.rs
@@ -456,7 +456,7 @@ fn read_all_files(changed_content: Vec<ChangedContent>) -> Vec<Vec<u8>> {
 fn parse_all_blobs(blobs: Vec<Vec<u8>>) -> Vec<String> {
     let mut result: Vec<_> = blobs
         .par_iter()
-        .flat_map(|blob| blob.par_split(|x| x.is_ascii_whitespace()))
+        .flat_map(|blob| blob.par_split(|x| matches!(x, b'\n' | b'\r')))
         .map(|blob| Extractor::unique(blob, Default::default()))
         .reduce(Default::default, |mut a, b| {
             a.extend(b);