-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-21289][SQL][ML] Supports custom line separator for all text-based datasources #18581
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
fb589a3
c4282bf
e7983d6
16cc0c2
b006d65
5ce9895
bc65e6b
a240973
265dd48
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -185,46 +185,54 @@ class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { | |
| } | ||
| } | ||
|
|
||
| test("SPARK-21289: Support line separator") { | ||
| import testImplicits._ | ||
|
|
||
| val data = """1 1:1.0 3:2.0 5:3.0|0|0 2:4.0 4:5.0 6:6.0""" | ||
| val lineSep = "|" | ||
| Seq(data, s"$data$lineSep").foreach { lines => | ||
| val path0 = new File(tempDir.getCanonicalPath, "write0") | ||
| val path1 = new File(tempDir.getCanonicalPath, "write1") | ||
| try { | ||
| // Read | ||
| java.nio.file.Files.write(path0.toPath, lines.getBytes(StandardCharsets.UTF_8)) | ||
| val df = spark.read | ||
| .option("lineSep", lineSep) | ||
| .format("libsvm") | ||
| .load(path0.getAbsolutePath) | ||
|
|
||
| assert(df.columns(0) == "label") | ||
| assert(df.columns(1) == "features") | ||
| val row1 = df.first() | ||
| assert(row1.getDouble(0) == 1.0) | ||
| val v = row1.getAs[SparseVector](1) | ||
| assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) | ||
|
|
||
| // Write | ||
| df.coalesce(1).write.option("lineSep", "^").format("libsvm").save(path1.getAbsolutePath) | ||
| val partFile = Utils.recursiveList(path1).filter(f => f.getName.startsWith("part-")).head | ||
| val readBack = new String( | ||
| java.nio.file.Files.readAllBytes(partFile.toPath), StandardCharsets.UTF_8) | ||
| assert(readBack === "1.0 1:1.0 3:2.0 5:3.0^0.0^0.0 2:4.0 4:5.0 6:6.0^") | ||
|
|
||
| // Roundtrip | ||
| val readBackDF = spark.read | ||
| .option("lineSep", "^") | ||
| .format("libsvm") | ||
| .load(path1.getAbsolutePath) | ||
| assert(df.collect().toSet === readBackDF.collect().toSet) | ||
| } finally { | ||
| Utils.deleteRecursively(path0) | ||
| Utils.deleteRecursively(path1) | ||
|
|
||
| def testLineSeparator(lineSep: String): Unit = { | ||
| test(s"SPARK-21289: Support line separator - lineSep: '$lineSep'") { | ||
| val data = Seq( | ||
| "1.0 1:1.0 3:2.0 5:3.0", "0.0", "0.0", "0.0 2:4.0 4:5.0 6:6.0").mkString(lineSep) | ||
| val dataWithTrailingLineSep = s"$data$lineSep" | ||
|
|
||
| Seq(data, dataWithTrailingLineSep).foreach { lines => | ||
| val path0 = new File(tempDir.getCanonicalPath, "write0") | ||
| val path1 = new File(tempDir.getCanonicalPath, "write1") | ||
| try { | ||
| // Read | ||
| java.nio.file.Files.write(path0.toPath, lines.getBytes(StandardCharsets.UTF_8)) | ||
| val df = spark.read | ||
| .option("lineSep", lineSep) | ||
| .format("libsvm") | ||
| .load(path0.getAbsolutePath) | ||
|
|
||
| assert(df.columns(0) == "label") | ||
| assert(df.columns(1) == "features") | ||
| val row1 = df.first() | ||
| assert(row1.getDouble(0) == 1.0) | ||
| val v = row1.getAs[SparseVector](1) | ||
| assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) | ||
|
||
|
|
||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So here you only test the first line ?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not just test the first line?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The following test only include checking
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here we change how to deal with each line in iteration. I think both comparing single line or repeated multiple lines are fine. I think many tests here already test only first line?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK, let me update it. It's easy to change anyway. |
||
| // Write | ||
| df.coalesce(1) | ||
| .write.option("lineSep", lineSep).format("libsvm").save(path1.getAbsolutePath) | ||
| val partFile = Utils.recursiveList(path1).filter(f => f.getName.startsWith("part-")).head | ||
| val readBack = new String( | ||
| java.nio.file.Files.readAllBytes(partFile.toPath), StandardCharsets.UTF_8) | ||
| assert(readBack === dataWithTrailingLineSep) | ||
|
|
||
| // Roundtrip | ||
| val readBackDF = spark.read | ||
| .option("lineSep", lineSep) | ||
| .format("libsvm") | ||
| .load(path1.getAbsolutePath) | ||
| assert(df.collect().toSet === readBackDF.collect().toSet) | ||
| } finally { | ||
| Utils.deleteRecursively(path0) | ||
| Utils.deleteRecursively(path1) | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| Seq("123!!@", "^", "&@").foreach { lineSep => | ||
| testLineSeparator(lineSep) | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why not import this? java.nio.file.Files

Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To differentiate it from Google's
Files explicitly above. Not a big deal.